diff --git a/.github/ISSUE_TEMPLATE/documentation-content.md b/.github/ISSUE_TEMPLATE/documentation-content.md deleted file mode 100644 index bf6a1e5b3f0e..000000000000 --- a/.github/ISSUE_TEMPLATE/documentation-content.md +++ /dev/null @@ -1,9 +0,0 @@ ---- -name: "\U0001F4DADocumentation Content" -about: Report an issue related to the documentation content on https://pytorch.org - ---- - -## 📚 Documentation - -(Add a clear and concise description of what the documentation content issue is. A link to any relevant https://pytorch.org page is helpful if you have one.) diff --git a/.github/ISSUE_TEMPLATE/website-issue.md b/.github/ISSUE_TEMPLATE/website-issue.md deleted file mode 100644 index efc8fef24039..000000000000 --- a/.github/ISSUE_TEMPLATE/website-issue.md +++ /dev/null @@ -1,36 +0,0 @@ ---- -name: "\U0001F54B Website Issue" -about: Report an issue with the https://pytorch.org website itself - ---- - -## 🕋 Website - - - -## To Reproduce - -Steps to reproduce the behavior (if applicable): - -1. Go to '...' -2. Click on '....' -3. Scroll down to '....' -4. See error - -## Expected behavior - - - -## Screenshots - - - -## Desktop (please complete the following information): - - - OS: [e.g. iOS] - - Browser [e.g. chrome, safari] - - Version [e.g. 22] - -## Additional context - - diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml deleted file mode 100644 index 23a8f5ffbad9..000000000000 --- a/.github/workflows/build.yml +++ /dev/null @@ -1,41 +0,0 @@ -name: Build - -on: - push: - branches: - - site - workflow_dispatch: - -jobs: - tests: - uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main - secrets: inherit - with: - runner: linux.12xlarge - repository: pytorch/pytorch.github.io - docker-image: cimg/ruby:2.7-node - secrets-env: PYTORCHBOT_TOKEN - script: | - git config --global --add safe.directory /__w/pytorch.github.io/pytorch.github.io - set -euxo pipefail - - ## Bundle Install - cd - mkdir .bundle - bundle config path '~/vendor/bundle' - git clone https://github.com/pytorch/pytorch.github.io.git - cd pytorch.github.io - bundle install - - ## Yarn Install - yarn install --cache-folder ~/.cache/yarn - - ## Notedown Install - sudo apt update && sudo apt install python3-pip && sudo -H pip3 install pyrsistent==0.16 notedown pyyaml -Iv nbformat==5.7 - - ## Configure Bot - git config --global user.email "facebook-circleci-bot@users.noreply.github.com" - git config --global user.name "Website Deployment Script" - - ## Build Jekyll site and push to master - ./scripts/deploy-site.sh build diff --git a/.github/workflows/update-quick-start-module.yml b/.github/workflows/update-quick-start-module.yml deleted file mode 100644 index 7d070eb7ff8a..000000000000 --- a/.github/workflows/update-quick-start-module.yml +++ /dev/null @@ -1,112 +0,0 @@ -name: Update quick start module -on: - schedule: - # At 18:30 pm UTC (1:30 pm EST) - - cron: "30 18 * * *" - pull_request: - paths: - - .github/workflows/update-quick-start-module.yml - - scripts/gen_quick_start_module.py - - _includes/quick-start-module.js - - _includes/quick_start_local.html - push: - branches: - site - paths: - - .github/workflows/update-quick-start-module.yml - - scripts/gen_quick_start_module.py - - _includes/quick-start-module.js - - _includes/quick_start_local.html - workflow_dispatch: - -jobs: - linux-nightly-matrix: - uses: pytorch/test-infra/.github/workflows/generate_binary_build_matrix.yml@main - with: - package-type: all - os: linux - channel: "nightly" - windows-nightly-matrix: - uses: 
pytorch/test-infra/.github/workflows/generate_binary_build_matrix.yml@main - with: - package-type: all - os: windows - channel: "nightly" - macos-arm64-nightly-matrix: - uses: pytorch/test-infra/.github/workflows/generate_binary_build_matrix.yml@main - with: - package-type: all - os: macos-arm64 - channel: "nightly" - linux-release-matrix: - needs: [linux-nightly-matrix] - uses: pytorch/test-infra/.github/workflows/generate_binary_build_matrix.yml@main - with: - package-type: all - os: linux - channel: "release" - windows-release-matrix: - needs: [windows-nightly-matrix] - uses: pytorch/test-infra/.github/workflows/generate_binary_build_matrix.yml@main - with: - package-type: all - os: windows - channel: "release" - macos-arm64-release-matrix: - needs: [macos-arm64-nightly-matrix] - uses: pytorch/test-infra/.github/workflows/generate_binary_build_matrix.yml@main - with: - package-type: all - os: macos-arm64 - channel: "release" - - update-quick-start: - needs: [linux-nightly-matrix, windows-nightly-matrix, macos-arm64-nightly-matrix, - linux-release-matrix, windows-release-matrix, macos-arm64-release-matrix] - runs-on: "ubuntu-latest" - environment: pytorchbot-env - steps: - - name: Checkout pytorch.github.io - uses: actions/checkout@v2 - - name: Setup Python - uses: actions/setup-python@v2 - with: - python-version: 3.9 - architecture: x64 - - name: Create json file - shell: bash - env: - LINUX_NIGHTLY_MATRIX: ${{ needs.linux-nightly-matrix.outputs.matrix }} - WINDOWS_NIGHTLY_MATRIX: ${{ needs.windows-nightly-matrix.outputs.matrix }} - MACOS_NIGHTLY_MATRIX: ${{ needs.macos-arm64-nightly-matrix.outputs.matrix }} - LINUX_RELEASE_MATRIX: ${{ needs.linux-release-matrix.outputs.matrix }} - WINDOWS_RELEASE_MATRIX: ${{ needs.windows-release-matrix.outputs.matrix }} - MACOS_RELEASE_MATRIX: ${{ needs.macos-arm64-release-matrix.outputs.matrix }} - run: | - set -ex - printf '%s\n' "$LINUX_NIGHTLY_MATRIX" > linux_nightly_matrix.json - printf '%s\n' "$WINDOWS_NIGHTLY_MATRIX" > windows_nightly_matrix.json - printf '%s\n' "$MACOS_NIGHTLY_MATRIX" > macos_nightly_matrix.json - printf '%s\n' "$LINUX_RELEASE_MATRIX" > linux_release_matrix.json - printf '%s\n' "$WINDOWS_RELEASE_MATRIX" > windows_release_matrix.json - printf '%s\n' "$MACOS_RELEASE_MATRIX" > macos_release_matrix.json - python3 ./scripts/gen_quick_start_module.py --autogenerate > assets/quick-start-module.js - rm *_matrix.json - - name: Create Issue if failed - uses: dacbd/create-issue-action@main - if: ${{ failure() }} # only run when this job is failed. - with: - title: Updating quick start module failed - token: ${{secrets.PYTORCHBOT_TOKEN}} - assignees: ${{github.actor}} - labels: bug - body: Updating quick start module failed, please fix update quick start module - - name: Create Pull Request - uses: peter-evans/create-pull-request@v3 - with: - token: ${{ secrets.PYTORCHBOT_TOKEN }} - commit-message: Modify published_versions.json, releases.json and quick-start-module.js - title: '[Getting Started Page] Modify published_versions.json, releases.json and quick-start-module.js' - body: > - This PR is auto-generated. 
It updates Getting Started page - labels: automated pr diff --git a/.github/workflows/validate-quick-start-module.yml b/.github/workflows/validate-quick-start-module.yml deleted file mode 100644 index 2813be181d01..000000000000 --- a/.github/workflows/validate-quick-start-module.yml +++ /dev/null @@ -1,31 +0,0 @@ -name: Validate quick start module -on: - pull_request: - branches: - site - paths: - - published_versions.json - - assets/quick-start-module.js - - .github/workflows/validate-quick-start-module.yml - push: - branches: - site - paths: - - published_versions.json - - assets/quick-start-module.js - - .github/workflows/validate-quick-start-module.yml - workflow_dispatch: - -jobs: - validate-nightly-binaries: - uses: pytorch/test-infra/.github/workflows/validate-binaries.yml@main - with: - os: all - channel: "nightly" - validate-release-binaries: - if: always() - uses: pytorch/test-infra/.github/workflows/validate-binaries.yml@main - needs: validate-nightly-binaries - with: - os: all - channel: "release" diff --git a/.gitignore b/.gitignore deleted file mode 100644 index 90a766742d95..000000000000 --- a/.gitignore +++ /dev/null @@ -1,18 +0,0 @@ -.DS_Store -node_modules -yarn-error.log -/vendor -# These are NOT autogenerated. Check in files as necessary. -!docs/stable/_static/js/vendor/ -!docs/master/_static/js/vendor/ -.bundle -.sass_cache -_site - -.idea/ - -.jekyll-metadata - -.vscode/ - -.netlify/ diff --git a/.gitmodules b/.gitmodules deleted file mode 100644 index df1b3b80f323..000000000000 --- a/.gitmodules +++ /dev/null @@ -1,3 +0,0 @@ -[submodule "_hub"] - path = _hub - url = https://github.com/pytorch/hub.git diff --git a/.nvmrc b/.nvmrc deleted file mode 100644 index 834eb3fa85bf..000000000000 --- a/.nvmrc +++ /dev/null @@ -1 +0,0 @@ -9.8.0 diff --git a/.ruby-version b/.ruby-version deleted file mode 100644 index 6a81b4c83794..000000000000 --- a/.ruby-version +++ /dev/null @@ -1 +0,0 @@ -2.7.8 diff --git a/2017/05/11/Internals.html b/2017/05/11/Internals.html new file mode 100644 index 000000000000..a535db31eec8 --- /dev/null +++ b/2017/05/11/Internals.html @@ -0,0 +1,11 @@ + + + + Redirecting… + + + + +

+ Click here if you are not redirected.
diff --git a/2017/06/27/Internals2.html b/2017/06/27/Internals2.html
new file mode 100644
index 000000000000..d5b7eb345060
--- /dev/null
+++ b/2017/06/27/Internals2.html
@@ -0,0 +1,11 @@
+ Redirecting…
+ Click here if you are not redirected.
diff --git a/2018/01/19/a-year-in.html b/2018/01/19/a-year-in.html
new file mode 100644
index 000000000000..4980385057d9
--- /dev/null
+++ b/2018/01/19/a-year-in.html
@@ -0,0 +1,11 @@
+ Redirecting…
+ Click here if you are not redirected.
diff --git a/2018/03/05/tensor-comprehensions.html b/2018/03/05/tensor-comprehensions.html
new file mode 100644
index 000000000000..736fa2110f1d
--- /dev/null
+++ b/2018/03/05/tensor-comprehensions.html
@@ -0,0 +1,11 @@
+ Redirecting…
+ Click here if you are not redirected.
diff --git a/2018/04/22/0_4_0-migration-guide.html b/2018/04/22/0_4_0-migration-guide.html
new file mode 100644
index 000000000000..635d247ff5ee
--- /dev/null
+++ b/2018/04/22/0_4_0-migration-guide.html
@@ -0,0 +1,11 @@
+ Redirecting…
+ Click here if you are not redirected.
diff --git a/2018/05/02/road-to-1.0.html b/2018/05/02/road-to-1.0.html
new file mode 100644
index 000000000000..78e399bc998a
--- /dev/null
+++ b/2018/05/02/road-to-1.0.html
@@ -0,0 +1,11 @@
+ Redirecting…
+ Click here if you are not redirected.
diff --git a/2019/04/29/road-to-1.0.html b/2019/04/29/road-to-1.0.html
new file mode 100644
index 000000000000..b19638439684
--- /dev/null
+++ b/2019/04/29/road-to-1.0.html
@@ -0,0 +1,11 @@
+ Redirecting…
+ Click here if you are not redirected.
diff --git a/2019/05/08/model-serving-in-pyorch.html b/2019/05/08/model-serving-in-pyorch.html
new file mode 100644
index 000000000000..a578a77ac542
--- /dev/null
+++ b/2019/05/08/model-serving-in-pyorch.html
@@ -0,0 +1,11 @@
+ Redirecting…
+ Click here if you are not redirected.
diff --git a/2019/05/23/torchvision03.html b/2019/05/23/torchvision03.html
new file mode 100644
index 000000000000..8547da0cdc64
--- /dev/null
+++ b/2019/05/23/torchvision03.html
@@ -0,0 +1,11 @@
+ Redirecting…
+ Click here if you are not redirected.
diff --git a/2019/06/10/pytorch_hub.html b/2019/06/10/pytorch_hub.html
new file mode 100644
index 000000000000..de517a02dd78
--- /dev/null
+++ b/2019/06/10/pytorch_hub.html
@@ -0,0 +1,11 @@
+ Redirecting…
+ Click here if you are not redirected.
diff --git a/2019/07/23/mapillary-research.html b/2019/07/23/mapillary-research.html
new file mode 100644
index 000000000000..90b4512c4c08
--- /dev/null
+++ b/2019/07/23/mapillary-research.html
@@ -0,0 +1,11 @@
+ Redirecting…
+ Click here if you are not redirected.
diff --git a/2019/08/06/pytorch_aug2019_releases.html b/2019/08/06/pytorch_aug2019_releases.html
new file mode 100644
index 000000000000..29dddfd8c52c
--- /dev/null
+++ b/2019/08/06/pytorch_aug2019_releases.html
@@ -0,0 +1,11 @@
+ Redirecting…
+ Click here if you are not redirected.
diff --git a/404.html b/404.html
index dccbfd661ab7..1d1727fd5d38 100644
--- a/404.html
+++ b/404.html
@@ -1,11 +1,307 @@
----
-title: Oops!
-id: 404
-permalink: /404.html
-layout: general
----
+ Oops! | PyTorch
+ Oops!
@@ -80,4 +376,310 @@
 Click here to go back to the main page.
-
\ No newline at end of file
+ Docs
+ Access comprehensive developer documentation for PyTorch
+ View Docs
+ Tutorials
+ Get in-depth tutorials for beginners and advanced developers
+ View Tutorials
+ Resources
+ Find development resources and get your questions answered
+ View Resources
+ + + + + + + + + + + + + + + + + + + + + + + diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 000000000000..90e93bd32f19 --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,31 @@ +# Contributing to hub +We want to make contributing to this project as easy and transparent as +possible. + +## Pull Requests +We actively welcome your pull requests. + +1. Fork the repo and create your branch from `master`. +2. If you've added code that should be tested, add tests. +3. If you've changed APIs, update the documentation. +4. Ensure the test suite passes. +5. Make sure your code lints. +6. If you haven't already, complete the Contributor License Agreement ("CLA"). + +## Contributor License Agreement ("CLA") +In order to accept your pull request, we need you to submit a CLA. You only need +to do this once to work on any of Facebook's open source projects. + +Complete your CLA here: + +## Issues +We use GitHub issues to track public bugs. Please ensure your description is +clear and has sufficient instructions to be able to reproduce the issue. + +Facebook has a [bounty program](https://www.facebook.com/whitehat/) for the safe +disclosure of security bugs. In those cases, please go through the process +outlined on that page and do not file a public issue. + +## License +By contributing to hub, you agree that your contributions will be licensed +under the LICENSE file in the root directory of this source tree. \ No newline at end of file diff --git a/Gemfile b/Gemfile deleted file mode 100644 index 166b9e4fa750..000000000000 --- a/Gemfile +++ /dev/null @@ -1,9 +0,0 @@ -source "https://rubygems.org" -ruby "2.7.8" - -group :jekyll_plugins do - gem "github-pages" - gem "jekyll-paginate-v2" - gem 'jekyll-autoprefixer' - gem 'jekyll-feed' -end diff --git a/Gemfile.lock b/Gemfile.lock deleted file mode 100644 index f0011c355d87..000000000000 --- a/Gemfile.lock +++ /dev/null @@ -1,277 +0,0 @@ -GEM - remote: https://rubygems.org/ - specs: - activesupport (6.0.6.1) - concurrent-ruby (~> 1.0, >= 1.0.2) - i18n (>= 0.7, < 2) - minitest (~> 5.1) - tzinfo (~> 1.1) - zeitwerk (~> 2.2, >= 2.2.2) - addressable (2.8.0) - public_suffix (>= 2.0.2, < 5.0) - autoprefixer-rails (9.8.6.5) - execjs - coffee-script (2.4.1) - coffee-script-source - execjs - coffee-script-source (1.11.1) - colorator (1.1.0) - commonmarker (0.17.13) - ruby-enum (~> 0.5) - concurrent-ruby (1.2.0) - dnsruby (1.61.5) - simpleidn (~> 0.1) - em-websocket (0.5.2) - eventmachine (>= 0.12.9) - http_parser.rb (~> 0.6.0) - ethon (0.12.0) - ffi (>= 1.3.0) - eventmachine (1.2.7) - execjs (2.7.0) - faraday (1.3.0) - faraday-net_http (~> 1.0) - multipart-post (>= 1.2, < 3) - ruby2_keywords - faraday-net_http (1.0.1) - ffi (1.15.0) - forwardable-extended (2.6.0) - gemoji (3.0.1) - github-pages (214) - github-pages-health-check (= 1.17.0) - jekyll (= 3.9.0) - jekyll-avatar (= 0.7.0) - jekyll-coffeescript (= 1.1.1) - jekyll-commonmark-ghpages (= 0.1.6) - jekyll-default-layout (= 0.1.4) - jekyll-feed (= 0.15.1) - jekyll-gist (= 1.5.0) - jekyll-github-metadata (= 2.13.0) - jekyll-mentions (= 1.6.0) - jekyll-optional-front-matter (= 0.3.2) - jekyll-paginate (= 1.1.0) - jekyll-readme-index (= 0.3.0) - jekyll-redirect-from (= 0.16.0) - jekyll-relative-links (= 0.6.1) - jekyll-remote-theme (= 0.4.3) - jekyll-sass-converter (= 1.5.2) - jekyll-seo-tag (= 2.7.1) - jekyll-sitemap (= 1.4.0) - jekyll-swiss (= 1.0.0) - jekyll-theme-architect (= 0.1.1) - jekyll-theme-cayman (= 0.1.1) - jekyll-theme-dinky (= 0.1.1) - jekyll-theme-hacker (= 0.1.2) - 
jekyll-theme-leap-day (= 0.1.1) - jekyll-theme-merlot (= 0.1.1) - jekyll-theme-midnight (= 0.1.1) - jekyll-theme-minimal (= 0.1.1) - jekyll-theme-modernist (= 0.1.1) - jekyll-theme-primer (= 0.5.4) - jekyll-theme-slate (= 0.1.1) - jekyll-theme-tactile (= 0.1.1) - jekyll-theme-time-machine (= 0.1.1) - jekyll-titles-from-headings (= 0.5.3) - jemoji (= 0.12.0) - kramdown (= 2.3.1) - kramdown-parser-gfm (= 1.1.0) - liquid (= 4.0.3) - mercenary (~> 0.3) - minima (= 2.5.1) - nokogiri (>= 1.10.4, < 2.0) - rouge (= 3.26.0) - terminal-table (~> 1.4) - github-pages-health-check (1.17.0) - addressable (~> 2.3) - dnsruby (~> 1.60) - octokit (~> 4.0) - public_suffix (>= 2.0.2, < 5.0) - typhoeus (~> 1.3) - html-pipeline (2.14.0) - activesupport (>= 2) - nokogiri (>= 1.4) - http_parser.rb (0.6.0) - i18n (0.9.5) - concurrent-ruby (~> 1.0) - jekyll (3.9.0) - addressable (~> 2.4) - colorator (~> 1.0) - em-websocket (~> 0.5) - i18n (~> 0.7) - jekyll-sass-converter (~> 1.0) - jekyll-watch (~> 2.0) - kramdown (>= 1.17, < 3) - liquid (~> 4.0) - mercenary (~> 0.3.3) - pathutil (~> 0.9) - rouge (>= 1.7, < 4) - safe_yaml (~> 1.0) - jekyll-autoprefixer (1.0.2) - autoprefixer-rails (~> 9.3) - jekyll-avatar (0.7.0) - jekyll (>= 3.0, < 5.0) - jekyll-coffeescript (1.1.1) - coffee-script (~> 2.2) - coffee-script-source (~> 1.11.1) - jekyll-commonmark (1.3.1) - commonmarker (~> 0.14) - jekyll (>= 3.7, < 5.0) - jekyll-commonmark-ghpages (0.1.6) - commonmarker (~> 0.17.6) - jekyll-commonmark (~> 1.2) - rouge (>= 2.0, < 4.0) - jekyll-default-layout (0.1.4) - jekyll (~> 3.0) - jekyll-feed (0.15.1) - jekyll (>= 3.7, < 5.0) - jekyll-gist (1.5.0) - octokit (~> 4.2) - jekyll-github-metadata (2.13.0) - jekyll (>= 3.4, < 5.0) - octokit (~> 4.0, != 4.4.0) - jekyll-mentions (1.6.0) - html-pipeline (~> 2.3) - jekyll (>= 3.7, < 5.0) - jekyll-optional-front-matter (0.3.2) - jekyll (>= 3.0, < 5.0) - jekyll-paginate (1.1.0) - jekyll-paginate-v2 (3.0.0) - jekyll (>= 3.0, < 5.0) - jekyll-readme-index (0.3.0) - jekyll (>= 3.0, < 5.0) - jekyll-redirect-from (0.16.0) - jekyll (>= 3.3, < 5.0) - jekyll-relative-links (0.6.1) - jekyll (>= 3.3, < 5.0) - jekyll-remote-theme (0.4.3) - addressable (~> 2.0) - jekyll (>= 3.5, < 5.0) - jekyll-sass-converter (>= 1.0, <= 3.0.0, != 2.0.0) - rubyzip (>= 1.3.0, < 3.0) - jekyll-sass-converter (1.5.2) - sass (~> 3.4) - jekyll-seo-tag (2.7.1) - jekyll (>= 3.8, < 5.0) - jekyll-sitemap (1.4.0) - jekyll (>= 3.7, < 5.0) - jekyll-swiss (1.0.0) - jekyll-theme-architect (0.1.1) - jekyll (~> 3.5) - jekyll-seo-tag (~> 2.0) - jekyll-theme-cayman (0.1.1) - jekyll (~> 3.5) - jekyll-seo-tag (~> 2.0) - jekyll-theme-dinky (0.1.1) - jekyll (~> 3.5) - jekyll-seo-tag (~> 2.0) - jekyll-theme-hacker (0.1.2) - jekyll (> 3.5, < 5.0) - jekyll-seo-tag (~> 2.0) - jekyll-theme-leap-day (0.1.1) - jekyll (~> 3.5) - jekyll-seo-tag (~> 2.0) - jekyll-theme-merlot (0.1.1) - jekyll (~> 3.5) - jekyll-seo-tag (~> 2.0) - jekyll-theme-midnight (0.1.1) - jekyll (~> 3.5) - jekyll-seo-tag (~> 2.0) - jekyll-theme-minimal (0.1.1) - jekyll (~> 3.5) - jekyll-seo-tag (~> 2.0) - jekyll-theme-modernist (0.1.1) - jekyll (~> 3.5) - jekyll-seo-tag (~> 2.0) - jekyll-theme-primer (0.5.4) - jekyll (> 3.5, < 5.0) - jekyll-github-metadata (~> 2.9) - jekyll-seo-tag (~> 2.0) - jekyll-theme-slate (0.1.1) - jekyll (~> 3.5) - jekyll-seo-tag (~> 2.0) - jekyll-theme-tactile (0.1.1) - jekyll (~> 3.5) - jekyll-seo-tag (~> 2.0) - jekyll-theme-time-machine (0.1.1) - jekyll (~> 3.5) - jekyll-seo-tag (~> 2.0) - jekyll-titles-from-headings (0.5.3) - jekyll (>= 3.3, < 5.0) - 
jekyll-watch (2.2.1) - listen (~> 3.0) - jemoji (0.12.0) - gemoji (~> 3.0) - html-pipeline (~> 2.2) - jekyll (>= 3.0, < 5.0) - kramdown (2.3.1) - rexml - kramdown-parser-gfm (1.1.0) - kramdown (~> 2.0) - liquid (4.0.3) - listen (3.5.1) - rb-fsevent (~> 0.10, >= 0.10.3) - rb-inotify (~> 0.9, >= 0.9.10) - mercenary (0.3.6) - mini_portile2 (2.8.1) - minima (2.5.1) - jekyll (>= 3.5, < 5.0) - jekyll-feed (~> 0.9) - jekyll-seo-tag (~> 2.1) - minitest (5.17.0) - multipart-post (2.1.1) - nokogiri (1.14.3) - mini_portile2 (~> 2.8.0) - racc (~> 1.4) - octokit (4.20.0) - faraday (>= 0.9) - sawyer (~> 0.8.0, >= 0.5.3) - pathutil (0.16.2) - forwardable-extended (~> 2.6) - public_suffix (4.0.6) - racc (1.6.2) - rb-fsevent (0.10.4) - rb-inotify (0.10.1) - ffi (~> 1.0) - rexml (3.2.5) - rouge (3.26.0) - ruby-enum (0.9.0) - i18n - ruby2_keywords (0.0.4) - rubyzip (2.3.0) - safe_yaml (1.0.5) - sass (3.7.4) - sass-listen (~> 4.0.0) - sass-listen (4.0.0) - rb-fsevent (~> 0.9, >= 0.9.4) - rb-inotify (~> 0.9, >= 0.9.7) - sawyer (0.8.2) - addressable (>= 2.3.5) - faraday (> 0.8, < 2.0) - simpleidn (0.2.1) - unf (~> 0.1.4) - terminal-table (1.8.0) - unicode-display_width (~> 1.1, >= 1.1.1) - thread_safe (0.3.6) - typhoeus (1.4.0) - ethon (>= 0.9.0) - tzinfo (1.2.11) - thread_safe (~> 0.1) - unf (0.1.4) - unf_ext - unf_ext (0.0.7.7) - unicode-display_width (1.7.0) - zeitwerk (2.6.7) - -PLATFORMS - ruby - -DEPENDENCIES - github-pages - jekyll-autoprefixer - jekyll-feed - jekyll-paginate-v2 - -RUBY VERSION - ruby 2.7.8p225 - -BUNDLED WITH - 1.17.2 diff --git a/Makefile b/Makefile deleted file mode 100644 index dcbcbf191848..000000000000 --- a/Makefile +++ /dev/null @@ -1,43 +0,0 @@ -SHELL := /bin/bash -BUNDLE := bundle -YARN := yarn -VENDOR_DIR = assets/vendor/ -JEKYLL := $(BUNDLE) exec jekyll - -PROJECT_DEPS := Gemfile package.json - -.PHONY: all clean install update - -all : serve - -check: - $(JEKYLL) doctor - $(HTMLPROOF) --check-html \ - --http-status-ignore 999 \ - --internal-domains localhost:4000 \ - --assume-extension \ - _site - -install: $(PROJECT_DEPS) - $(BUNDLE) install --path vendor/bundler - $(YARN) install - -update: $(PROJECT_DEPS) - $(BUNDLE) update - $(YARN) upgrade - -include-yarn-deps: - mkdir -p $(VENDOR_DIR) - cp node_modules/jquery/dist/jquery.min.js $(VENDOR_DIR) - cp node_modules/popper.js/dist/umd/popper.min.js $(VENDOR_DIR) - cp node_modules/bootstrap/dist/js/bootstrap.min.js $(VENDOR_DIR) - cp node_modules/anchor-js/anchor.min.js $(VENDOR_DIR) - -build: install include-yarn-deps - $(JEKYLL) build --config _config.yml - -serve: install include-yarn-deps - JEKYLL_ENV=development $(JEKYLL) serve --incremental --config _config.yml - -build_deploy: include-yarn-deps - JEKYLL_ENV=production $(JEKYLL) build diff --git a/README.md b/README.md deleted file mode 100644 index f3cad9ccd536..000000000000 --- a/README.md +++ /dev/null @@ -1,87 +0,0 @@ -# pytorch.org site - -[https://pytorch.org](https://pytorch.org) - -A static website built in [Jekyll](https://jekyllrb.com/) and [Bootstrap](https://getbootstrap.com/) for [PyTorch](https://pytorch.org/), and its tutorials and documentation. 
- -## Prerequisites - -Install the following packages before attempting to setup the project: - -- [rbenv](https://github.com/rbenv/rbenv) -- [ruby-build](https://github.com/rbenv/ruby-build) -- [nvm](https://github.com/creationix/nvm) - -On OSX, you can use: - -``` -brew install rbenv ruby-build nvm -``` - -## Setup - -#### Install required Ruby version: - -``` -#### You only need to run these commands if you are missing the needed Ruby version. - -rbenv install `cat .ruby-version` -gem install bundler -v 1.16.3 -rbenv rehash - -#### - -bundle install -rbenv rehash -``` - -#### Install required Node version - -``` -nvm install -nvm use -``` - -#### Install Yarn - -``` -brew install yarn --ignore-dependencies -yarn install -``` - -## Local Development - -To run the website locally for development: - -``` -make serve -``` - -Then navigate to [localhost:4000](localhost:4000). - -Note the `serve` task is contained in a `Makefile` in the root directory. We are using `make` as an alternative to the standard `jekyll serve` as we want to run `yarn`, which is not included in Jekyll by default. - -### Building the Static Site - -To build the static website from source: - -``` -make build -``` - -This will build the static site at `./_site`. This directory is not tracked in git. - -## Deployments - -The website is hosted on [Github Pages](https://pages.github.com/) at [https://pytorch.org](https://pytorch.org). - -To deploy changes, merge your latest code into the `site` branch. A build will be automatically built and committed to the `master` branch via a CircleCI job. - -To view the status of the build visit [https://circleci.com/gh/pytorch/pytorch.github.io](https://circleci.com/gh/pytorch/pytorch.github.io). - -## Contributing to PyTorch Documentation and Tutorials -* You can find information about contributing to PyTorch documentation in the -PyTorch repo [README.md](https://github.com/pytorch/pytorch/blob/master/README.md) file. -* Information about contributing to PyTorch Tutorials can be found in the -tutorials [README.md](https://github.com/pytorch/tutorials/blob/master/README.md). -* Additional contribution information can be found in [PyTorch CONTRIBUTING.md](https://github.com/pytorch/pytorch/blob/master/CONTRIBUTING.md). 
diff --git a/_board_info/advanced-micro-devices.md b/_board_info/advanced-micro-devices.md deleted file mode 100644 index 202696a7a13a..000000000000 --- a/_board_info/advanced-micro-devices.md +++ /dev/null @@ -1,9 +0,0 @@ ---- -title: AMD -summary: '' -link: https://amd.com -image: /assets/images/members/amd-logo.svg -class: pytorch-resource -order: 1 -featured-home: true ---- diff --git a/_board_info/arm.md b/_board_info/arm.md deleted file mode 100644 index 588b4984a914..000000000000 --- a/_board_info/arm.md +++ /dev/null @@ -1,9 +0,0 @@ ---- -title: arm -summary: '' -link: https://www.arm.com/ -image: /assets/images/members/arm-logo.svg -class: pytorch-resource -order: 2 -featured-home: true ---- diff --git a/_board_info/aws.md b/_board_info/aws.md deleted file mode 100644 index 65b823f8cf13..000000000000 --- a/_board_info/aws.md +++ /dev/null @@ -1,9 +0,0 @@ ---- -title: Amazon -summary: '' -link: https://aws.amazon.com -image: /assets/images/members/aws-logo.svg -class: pytorch-resource -order: 2 -featured-home: true ---- diff --git a/_board_info/google-cloud.md b/_board_info/google-cloud.md deleted file mode 100644 index 10fc9de6d738..000000000000 --- a/_board_info/google-cloud.md +++ /dev/null @@ -1,9 +0,0 @@ ---- -title: Google Cloud -summary: '' -link: https://cloud.google.com/gcp -image: /assets/images/members/google-cloud-logo.svg -class: pytorch-resource -order: 3 -featured-home: true ---- diff --git a/_board_info/huawei.md b/_board_info/huawei.md deleted file mode 100644 index 583d74b0957a..000000000000 --- a/_board_info/huawei.md +++ /dev/null @@ -1,9 +0,0 @@ ---- -title: Huawei -summary: '' -link: https://www.huawei.com/ -image: /assets/images/members/huawei-logo.svg -class: pytorch-resource -order: 5 -featured-home: true ---- diff --git a/_board_info/hugging-face.md b/_board_info/hugging-face.md deleted file mode 100644 index 298c08670cf7..000000000000 --- a/_board_info/hugging-face.md +++ /dev/null @@ -1,9 +0,0 @@ ---- -title: Hugging Face -summary: '' -link: https://huggingface.co/ -image: /assets/images/members/hf-logo.svg -class: pytorch-resource -order: 5 -featured-home: true ---- diff --git a/_board_info/ibm.md b/_board_info/ibm.md deleted file mode 100644 index fa3875006c57..000000000000 --- a/_board_info/ibm.md +++ /dev/null @@ -1,9 +0,0 @@ ---- -title: IBM -summary: '' -link: https://www.ibm.com/ -image: /assets/images/members/ibm-logo.svg -class: pytorch-resource -order: 6 -featured-home: true ---- diff --git a/_board_info/intel.md b/_board_info/intel.md deleted file mode 100644 index c4f29dbdf4f7..000000000000 --- a/_board_info/intel.md +++ /dev/null @@ -1,9 +0,0 @@ ---- -title: Intel -summary: '' -link: https://www.intel.com/ -image: /assets/images/intel-new-logo.svg -class: pytorch-resource -order: 7 -featured-home: true ---- diff --git a/_board_info/lightning.md b/_board_info/lightning.md deleted file mode 100644 index 037d8f797de9..000000000000 --- a/_board_info/lightning.md +++ /dev/null @@ -1,9 +0,0 @@ ---- -title: Lightning AI -summary: '' -link: https://lightning.ai/ -image: /assets/images/members/lightning-logo.png -class: pytorch-resource -order: 8 -featured-home: true ---- diff --git a/_board_info/meta.md b/_board_info/meta.md deleted file mode 100644 index aac6580d8413..000000000000 --- a/_board_info/meta.md +++ /dev/null @@ -1,9 +0,0 @@ ---- -title: Meta -summary: '' -link: https://meta.com -image: /assets/images/members/meta-logo.svg -class: pytorch-resource -order: 9 -featured-home: true ---- diff --git a/_board_info/microsoft-corporation.md 
b/_board_info/microsoft-corporation.md deleted file mode 100644 index 7536306b31e2..000000000000 --- a/_board_info/microsoft-corporation.md +++ /dev/null @@ -1,9 +0,0 @@ ---- -title: Microsoft -summary: '' -link: https://azure.microsoft.com -image: /assets/images/members/microsoft-azure-logo.svg -class: pytorch-resource -order: 10 -featured-home: true ---- diff --git a/_board_info/nvidia-corporation.md b/_board_info/nvidia-corporation.md deleted file mode 100644 index fbb018bb9acf..000000000000 --- a/_board_info/nvidia-corporation.md +++ /dev/null @@ -1,9 +0,0 @@ ---- -title: Nvidia -summary: '' -link: https://www.nvidia.com/en-us/ai-data-science/ -image: /assets/images/members/nvidia-logo.svg -class: pytorch-resource -order: 11 -featured-home: true ---- diff --git a/_case_studies/amazon-ads.md b/_case_studies/amazon-ads.md deleted file mode 100644 index 7515e5205a55..000000000000 --- a/_case_studies/amazon-ads.md +++ /dev/null @@ -1,10 +0,0 @@ ---- -layout: blog_detail -title: Amazon Ads -logo: assets/images/amazon-ads-logo.png -featured-home: true -order: 1 -link: /blog/amazon-ads-case-study/ ---- - -Reduce inference costs by 71% and drive scale out using PyTorch, TorchServe, and AWS Inferentia. diff --git a/_case_studies/salesforce.md b/_case_studies/salesforce.md deleted file mode 100644 index 9e0f7713b3b6..000000000000 --- a/_case_studies/salesforce.md +++ /dev/null @@ -1,10 +0,0 @@ ---- -layout: blog_detail -title: Salesforce -logo: assets/images/salesforce.png -featured-home: true -order: 2 -link: ---- - -Pushing the state of the art in NLP and Multi-task learning. diff --git a/_case_studies/stanford-university.md b/_case_studies/stanford-university.md deleted file mode 100644 index 7629ee10a74b..000000000000 --- a/_case_studies/stanford-university.md +++ /dev/null @@ -1,10 +0,0 @@ ---- -layout: blog_detail -title: Stanford University -logo: assets/images/stanford-university.png -featured-home: true -order: 3 -link: ---- - -Using PyTorch's flexibility to efficiently research new algorithmic approaches. diff --git a/_community_blog/3d-rotations-and-spatial-transformations-made-easy-with-roma-356a495a20c4.md b/_community_blog/3d-rotations-and-spatial-transformations-made-easy-with-roma-356a495a20c4.md deleted file mode 100644 index 9806ff0aa783..000000000000 --- a/_community_blog/3d-rotations-and-spatial-transformations-made-easy-with-roma-356a495a20c4.md +++ /dev/null @@ -1,8 +0,0 @@ ---- -title: '3D rotations and spatial transformations made easy with RoMa' -author: Romain Brégier -ext_url: https://medium.com/pytorch/3d-rotations-and-spatial-transformations-made-easy-with-roma-356a495a20c4 -date: Jan 25, 2024 ---- - -Struggling with quaternions, rotation vectors, right-hand rules and all these stuffs? Try RoMa: an easy-to-to-use, stable and efficient library to deal with rotations and spatial transformations in PyTorch. \ No newline at end of file diff --git a/_community_blog/bringing-the-pytorch-community-together.md b/_community_blog/bringing-the-pytorch-community-together.md deleted file mode 100644 index 5825c9993e4f..000000000000 --- a/_community_blog/bringing-the-pytorch-community-together.md +++ /dev/null @@ -1,8 +0,0 @@ ---- -title: 'Bringing the PyTorch Community Together' -author: Team PyTorch -ext_url: /blog/bringing-the-pytorch-community-together/ -date: January 22, 2025 ---- - -As we step into a new year, it’s a great moment to reflect on the incredible community events that made 2024 a memorable year for the PyTorch Foundation. 
Global meetups, events, and conferences brought the community together to learn, connect, and grow. Here’s a quick recap of the year’s highlights and what to expect in 2025. \ No newline at end of file diff --git a/_community_blog/colossal-llama-2-low-cost-and-high-quality-domain-specific-llm-solution-using-llama-and-26d2e4b9fd92.md b/_community_blog/colossal-llama-2-low-cost-and-high-quality-domain-specific-llm-solution-using-llama-and-26d2e4b9fd92.md deleted file mode 100644 index 16aef370be85..000000000000 --- a/_community_blog/colossal-llama-2-low-cost-and-high-quality-domain-specific-llm-solution-using-llama-and-26d2e4b9fd92.md +++ /dev/null @@ -1,8 +0,0 @@ ---- -title: 'Colossal-LLaMA-2: Low Cost and High-quality Domain-specific LLM Solution Using LLaMA and Colossal-AI' -author: Yang You -ext_url: https://medium.com/pytorch/colossal-llama-2-low-cost-and-high-quality-domain-specific-llm-solution-using-llama-and-26d2e4b9fd92 -date: Jan 29, 2024 ---- - -The most prominent distinction between LLaMA-1 and LLaMA-2 lies in the incorporation of higher-quality corpora, a pivotal factor contributing to significant performance enhancements in LLaMA-2. This, coupled with its commercial availability, extends the potential for creative applications of large models within the open-source community. \ No newline at end of file diff --git a/_community_blog/datathon-2025.md b/_community_blog/datathon-2025.md deleted file mode 100644 index 754406c063a0..000000000000 --- a/_community_blog/datathon-2025.md +++ /dev/null @@ -1,8 +0,0 @@ ---- -title: "Solve Real-Word AI Challenges with PyTorch at Datathon 2025: DataOrbit" -author: "Aakash Senthilnathan" -ext_url: /blog/datathon-2025/ -date: Feb 12, 2025 ---- - -**We’re excited to have PyTorch sponsor [Datathon 2025: DataOrbit](https://dataorbit-2025.devpost.com/)**, a place where students can collaborate with a team to solve problems using real-world datasets! This event, hosted by Data Science UCSB in collaboration with Gaucho Sports Analytics and ACM@UCSB, will take place on **February 22–23rd, 2025 at UC Santa Barbara**, with the incredible opportunity to present your project to a panel of corporate and faculty judges – **including the executive director of Pytorch!** – for a chance to win prizes up to $3000. \ No newline at end of file diff --git a/_community_blog/distributed-training-with-pytorch-and-azure-ml-898429139098.md b/_community_blog/distributed-training-with-pytorch-and-azure-ml-898429139098.md deleted file mode 100644 index 373e67b1611b..000000000000 --- a/_community_blog/distributed-training-with-pytorch-and-azure-ml-898429139098.md +++ /dev/null @@ -1,8 +0,0 @@ ---- -title: 'Distributed training with PyTorch and Azure ML' -author: Beatriz Stollnitz -ext_url: https://medium.com/pytorch/distributed-training-with-pytorch-and-azure-ml-898429139098 -date: Jan 6, 2023 ---- - -Suppose you have a very large PyTorch model, and you’ve already tried many common tricks to speed up training: you optimized your code, you moved training to the cloud and selected a fast GPU VM, you installed software packages that improve training performance (for example, by using the ACPT curated environment on Azure ML). And yet, you still wish your model could train faster. Maybe it’s time to give distributed training a try! Continue reading to learn the simplest way to do distributed training with PyTorch and Azure ML. 
\ No newline at end of file diff --git a/_community_blog/doctr-joins-pytorch-ecosystem.md b/_community_blog/doctr-joins-pytorch-ecosystem.md deleted file mode 100644 index e0b3331438c7..000000000000 --- a/_community_blog/doctr-joins-pytorch-ecosystem.md +++ /dev/null @@ -1,8 +0,0 @@ ---- -title: "docTR joins PyTorch Ecosystem: From Pixels to Data, Building a Recognition Pipeline with PyTorch and docTR" -author: Olivier Dulcy & Sebastian Olivera, Mindee -ext_url: /blog/doctr-joins-pytorch-ecosystem/ -date: Dec 18, 2024 ---- - -We’re thrilled to announce that the docTR project has been integrated into the PyTorch ecosystem! This integration ensures that docTR aligns with PyTorch’s standards and practices, giving developers a reliable, community-backed solution for powerful OCR workflows. \ No newline at end of file diff --git a/_community_blog/enhancing-deep-learning.md b/_community_blog/enhancing-deep-learning.md deleted file mode 100644 index 8c5cfd93846a..000000000000 --- a/_community_blog/enhancing-deep-learning.md +++ /dev/null @@ -1,8 +0,0 @@ ---- -title: 'Enhancing Deep Learning Workflows: PyTorch Ecosystem Tools' -author: Team PyTorch -ext_url: /blog/enhancing-deep-learning/ -date: May 12, 2024 ---- - -Welcome to the thriving PyTorch ecosystem, where a wealth of tools and libraries await, purpose-built to elevate your experience in deep learning as a developer or researcher. The Ecosystem Tools pages host many projects from experts spanning academia, industry, application development, and machine learning. diff --git a/_community_blog/exploring-scientific-machine-learning-pipelines-through-the-simulai-toolkit-9fda42d6c6a0.md b/_community_blog/exploring-scientific-machine-learning-pipelines-through-the-simulai-toolkit-9fda42d6c6a0.md deleted file mode 100644 index d2aef826b5b2..000000000000 --- a/_community_blog/exploring-scientific-machine-learning-pipelines-through-the-simulai-toolkit-9fda42d6c6a0.md +++ /dev/null @@ -1,8 +0,0 @@ ---- -title: 'Exploring scientific machine learning pipelines through the SimulAI toolkit' -author: Joao Lucas de Sousa Almeida -ext_url: https://medium.com/pytorch/exploring-scientific-machine-learning-pipelines-through-the-simulai-toolkit-9fda42d6c6a0 -date: Feb 15, 2024 ---- - -SciML, short for Scientific Machine Learning, encompasses work that merges quantitative sciences with machine learning. It has gained significant traction over the past decade, driven by the widespread availability of specialized hardware (such as GPUs and TPUs) and datasets. Additionally, it has been propelled by the overarching influence of the machine learning wave, now ingrained in the zeitgeist of our times. In this context, we’d like to introduce SimulAI, an open-source toolkit under the Apache 2.0 license. SimulAI is designed to be user-friendly, providing a high-level Python interface for managing scientific machine learning pipelines. This article aims to showcase its current workflow and utility in constructing scientific experiments. We encourage feedback and potential contributions from the interested community, with plans to delve into more advanced topics in future articles. 
\ No newline at end of file diff --git a/_community_blog/how-activation-checkpointing-enables-scaling-up-training-deep-learning-models-7a93ae01ff2d.md b/_community_blog/how-activation-checkpointing-enables-scaling-up-training-deep-learning-models-7a93ae01ff2d.md deleted file mode 100644 index b11544d045ef..000000000000 --- a/_community_blog/how-activation-checkpointing-enables-scaling-up-training-deep-learning-models-7a93ae01ff2d.md +++ /dev/null @@ -1,8 +0,0 @@ ---- -title: 'How Activation Checkpointing enables scaling up training deep learning models' -author: PyTorch -ext_url: https://medium.com/pytorch/how-activation-checkpointing-enables-scaling-up-training-deep-learning-models-7a93ae01ff2d -date: Nov 9, 2023 ---- - -Activation checkpointing is a technique used for reducing the memory footprint at the cost of more compute. It utilizes the simple observation that we can avoid saving intermediate tensors necessary for backward computation if we just recompute them on demand instead. \ No newline at end of file diff --git a/_community_blog/how-fashable-achieves-soa-realistic-ai-generated-images-using-pytorch-and-azure-machine-learning-2313c4cf5f44.md b/_community_blog/how-fashable-achieves-soa-realistic-ai-generated-images-using-pytorch-and-azure-machine-learning-2313c4cf5f44.md deleted file mode 100644 index 61e8b2c9c8f9..000000000000 --- a/_community_blog/how-fashable-achieves-soa-realistic-ai-generated-images-using-pytorch-and-azure-machine-learning-2313c4cf5f44.md +++ /dev/null @@ -1,9 +0,0 @@ ---- -title: 'How FASHABLE achieves SoA realistic AI generated images using PyTorch and Azure Machine Learning' -author: Orlando Ribas Fernandes -ext_url: https://medium.com/pytorch/how-fashable-achieves-soa-realistic-ai-generated-images-using-pytorch-and-azure-machine-learning-2313c4cf5f44 -date: Feb 10, 2023 ---- - -Fashable is a company born at XNFY Lab (a joint initiative with Microsoft). The company’s main goal is to revolutionize the world of fashion with ethical Artificial Intelligence (AI) technologies built on PyTorch framework. Fashable is focused on developing AI models that generates synthetic contents for the global fashion industry. The Fashion industry has been criticized in recent years because it generates a lot of waste and is responsible for up to 10% of global carbon dioxide output. Fashable has stepped up to address this issue by introducing multiple AI solutions that generates realistic personalized consumer garments without actually producing them to help in reducing carbon footprint. This will help the fashion brands make informed decisions without investing in experimental products and also reducing the industry’s carbon footprint globally. Hence, in Fashable, our IP models utilize modern approaches, such as Generative Adversarial Networks (GANs), best seller analysis, custom dataset creation, and so on to resolve such problems. - diff --git a/_community_blog/introducing-depyf.md b/_community_blog/introducing-depyf.md deleted file mode 100644 index e245ea20c9a8..000000000000 --- a/_community_blog/introducing-depyf.md +++ /dev/null @@ -1,8 +0,0 @@ ---- -title: 'Introducing depyf: mastering torch.compile with ease' -author: PyTorch -ext_url: /blog/introducing-depyf/ -date: May 11, 2024 ---- - -We are thrilled to introduce `depyf`, a new project to the PyTorch ecosystem designed to help users understand, learn, and adapt to `torch.compile`! 
diff --git a/_community_blog/introducing-torchopt-a-high-performance-differentiable-optimization-library-for-pytorch-37c4c0ef6ae1.md b/_community_blog/introducing-torchopt-a-high-performance-differentiable-optimization-library-for-pytorch-37c4c0ef6ae1.md deleted file mode 100644 index 39123b2e7c55..000000000000 --- a/_community_blog/introducing-torchopt-a-high-performance-differentiable-optimization-library-for-pytorch-37c4c0ef6ae1.md +++ /dev/null @@ -1,8 +0,0 @@ ---- -title: 'Introducing TorchOpt: A High-Performance Differentiable Optimization Library for PyTorch' -author: Benjamin Liu -ext_url: https://medium.com/pytorch/introducing-torchopt-a-high-performance-differentiable-optimization-library-for-pytorch-37c4c0ef6ae1 -date: Jun 29, 2023 ---- - -Explore TorchOpt, a PyTorch-based library that revolutionizes differentiable optimization with its unified programming abstraction, high-performance distributed execution runtime, and support for various differentiation modes.” \ No newline at end of file diff --git a/_community_blog/latest-colossal-ai-boasts-novel-automatic-parallelism-and-offers-savings-up-to-46x-for-stable-1453b48f3f02.md b/_community_blog/latest-colossal-ai-boasts-novel-automatic-parallelism-and-offers-savings-up-to-46x-for-stable-1453b48f3f02.md deleted file mode 100644 index 2f1eb493e2ab..000000000000 --- a/_community_blog/latest-colossal-ai-boasts-novel-automatic-parallelism-and-offers-savings-up-to-46x-for-stable-1453b48f3f02.md +++ /dev/null @@ -1,8 +0,0 @@ ---- -title: 'Latest Colossal-AI boasts novel automatic parallelism and offers savings up to 46x for Stable Diffusion 2' -author: Yang You -ext_url: https://medium.com/pytorch/latest-colossal-ai-boasts-novel-automatic-parallelism-and-offers-savings-up-to-46x-for-stable-1453b48f3f02 -date: Jan 31, 2023 ---- - -As a new PyTorch Ecosystem Partner, we at HPC-AI Tech look forward to working with the PyTorch community to advance AI technologies through our open source project, Colossal-AI. We are excited to join forces with the PyTorch community in this effort. \ No newline at end of file diff --git a/_community_blog/mlops-workflow.md b/_community_blog/mlops-workflow.md deleted file mode 100644 index 8fe3890d5d79..000000000000 --- a/_community_blog/mlops-workflow.md +++ /dev/null @@ -1,8 +0,0 @@ ---- -title: "MLOps Workflow Simplified for PyTorch with Arm and GitHub Collaboration" -author: Eric Sondhi, Arm -ext_url: /blog/mlops-workflow/ -date: Jan 15, 2025 ---- - -PyTorch is one of the most widely used and most powerful deep learning frameworks for training and deploying complex neural networks. It has never been easier to train and deploy AI applications, and low-cost, high-performance, energy-efficient hardware, tools, and technology for creating optimized workflows are more accessible than ever. But data science, machine learning, and devops can be deep topics unto themselves, and it can be overwhelming for developers with one specialty to see how they all come together in the real world, or even to know where to get started. \ No newline at end of file diff --git a/_community_blog/optimize-llms.md b/_community_blog/optimize-llms.md deleted file mode 100644 index e0ecb819ac05..000000000000 --- a/_community_blog/optimize-llms.md +++ /dev/null @@ -1,8 +0,0 @@ ---- -title: "Optimize LLMs for Efficiency & Sustainability" -ext_url: /blog/optimize-llms/ -date: Feb 19, 2025 -author: "Zach Lasiuk, Arm" ---- - -The rapid growth of large language model (LLM) applications is linked to rapid growth in energy demand. 
According to the International Energy Agency (IEA), data center electricity consumption is projected to roughly double by 2026 primarily driven by AI. This is due to the energy-intensive training requirements for massive LLMs – however, the increase in AI Inferencing workloads also plays a role. For example, compared with traditional search queries, a single AI inference can consume about [10x more energy](https://www.weforum.org/stories/2024/07/generative-ai-energy-emissions/). diff --git a/_community_blog/profiling-pytorch-language-models-with-octoml-profile-eda7ece6b7bd.md b/_community_blog/profiling-pytorch-language-models-with-octoml-profile-eda7ece6b7bd.md deleted file mode 100644 index 44aa1ba1ba4b..000000000000 --- a/_community_blog/profiling-pytorch-language-models-with-octoml-profile-eda7ece6b7bd.md +++ /dev/null @@ -1,8 +0,0 @@ ---- -title: 'Profiling PyTorch language models with octoml-profile' -author: Team Octo -ext_url: https://medium.com/pytorch/profiling-pytorch-language-models-with-octoml-profile-eda7ece6b7bd -date: Apr 4, 2023 ---- - -The recent launch of PyTorch 2.0 makes it clear that the community is heavily investing in a compiler-powered future for machine learning. The new OctoML Profiler can help any user realize the full potential of these shifts in the ML landscape. \ No newline at end of file diff --git a/_community_blog/pt-fedora-os-communities.md b/_community_blog/pt-fedora-os-communities.md deleted file mode 100644 index ec37d275c4a5..000000000000 --- a/_community_blog/pt-fedora-os-communities.md +++ /dev/null @@ -1,9 +0,0 @@ ---- -title: "Powering AI with PyTorch, Fedora, and Open Source Communities" -author: Sudhir Dharanendraiah -ext_url: /blog/pt-fedora-os-communities/ -date: Mar 7, 2025 ---- - -At [DevConf.IN 2025](https://www.devconf.info/in/) in Pune, I had the opportunity to host a **[PyTorch Meetup](https://pretalx.devconf.info/devconf-in-2025/talk/W3YURM/)** on February 28th. The session, titled "**Powering AI with PyTorch, Fedora, and Open Source Communities**" was aimed at introducing PyTorch to students and professionals, explaining why **PyTorch+Fedora** form an ideal AI development platform. The other key aspect I covered was collaboration between open source communities. - diff --git a/_community_blog/pypose-a-library-for-robot-learning-with-physics-based-optimization-861bc0bb92f1.md b/_community_blog/pypose-a-library-for-robot-learning-with-physics-based-optimization-861bc0bb92f1.md deleted file mode 100644 index 620bd7ac465d..000000000000 --- a/_community_blog/pypose-a-library-for-robot-learning-with-physics-based-optimization-861bc0bb92f1.md +++ /dev/null @@ -1,8 +0,0 @@ ---- -title: 'PyPose: A Library for Robot Learning with Physics-based Optimization' -author: PyPose -ext_url: https://medium.com/pytorch/pypose-a-library-for-robot-learning-with-physics-based-optimization-861bc0bb92f1 -date: Dec 6, 2023 ---- - -We are excited to share our new open-source library PyPose. It is a PyTorch-based robotics-oriented library that provides a set of tools and algorithms for connecting deep learning with physics-based optimization. 
\ No newline at end of file diff --git a/_community_blog/pytorch-at-gtc.md b/_community_blog/pytorch-at-gtc.md deleted file mode 100644 index da3632fa17fe..000000000000 --- a/_community_blog/pytorch-at-gtc.md +++ /dev/null @@ -1,8 +0,0 @@ ---- -title: "PyTorch at GTC 2025" -author: "Team PyTorch at NVIDIA" -ext_url: /blog/pytorch-at-gtc/ -date: Mar 16, 2025 ---- - -[GTC](https://www.nvidia.com/gtc/) is coming back to San Jose on March 17–21, 2025. Join PyTorch Foundation members Arm, AWS, Google Cloud, IBM, Lightning AI, Meta, Microsoft Azure, Snowflake, and thousands of developers as we celebrate PyTorch. Together learn how AI & accelerated computing are helping humanity solve our most complex challenges. diff --git a/_community_blog/pytorch-shanghai-notes.md b/_community_blog/pytorch-shanghai-notes.md deleted file mode 100644 index 3c63c4af2ea8..000000000000 --- a/_community_blog/pytorch-shanghai-notes.md +++ /dev/null @@ -1,42 +0,0 @@ ---- -layout: blog_detail -title: "PyTorch Shanghai Meetup Notes" -ext_url: /blog/pytorch-shanghai-notes/ -date: Sep 8, 2024 ---- - -We are honored to successfully host the PyTorch Shanghai Meetup on August 15, 2024. This Meetup has received great attention from the industry. We invited senior PyTorch developers from Intel and Huawei as guest speakers, who shared their valuable experience and the latest technical trends. In addition, this event also attracted PyTorch enthusiasts from many technology companies and well-known universities. A total of more than 40 participants gathered together to discuss and exchange the latest applications and technological advances of PyTorch. - -This Meetup not only strengthened the connection between PyTorch community members, but also provided a platform for local AI technology enthusiasts to learn, communicate and grow. We look forward to the next gathering to continue to promote the development of PyTorch technology in the local area. - -## 1. PyTorch Foundation Updates - -![man instructing students](/assets/images/pytorch-shanghai-notes/fg2.jpg){:style="width:100%"} - -PyTorch Board member Fred Li shared the latest updates in the PyTorch community, He reviewed the development history of the PyTorch community, explained in detail the growth path of community developers, encouraged everyone to delve deeper into technology, and introduced the upcoming PyTorch Conference 2024 related matters. - -## 2. Intel’s Journey with PyTorch Democratizing AI with ubiquitous hardware and open software - -PyTorch CPU module maintainer Jiong Gong shared 6-year technical contributions from Intel to PyTorch and its ecosystem, explored the remarkable advancements that Intel has made in both software and hardware democratizing AI, ensuring accessibility, and optimizing performance across a diverse range of Intel hardware platforms. - -![man instructing students](/assets/images/pytorch-shanghai-notes/fg3.jpg){:style="width:100%"} - -## 3. Exploring Multi-Backend Support in PyTorch Ecosystem: A Case Study of Ascend - -![man instructing students](/assets/images/pytorch-shanghai-notes/fg4.jpg){:style="width:100%"} - -Fengchun Hua, a PyTorch contributor from Huawei, took Huawei Ascend NPU as an example to demonstrate the latest achievements in multi-backend support for PyTorch applications. He introduced the hardware features of Huawei Ascend NPU and the infrastructure of CANN (Compute Architecture for Neural Networks), and explained the key achievements and innovations in native support work. 
He also shared the current challenges and the next work plan. - -Yuanhao Ji, another PyTorch contributor from Huawei, then introduced the Autoload Device Extension proposal, explained its implementation details and value in improving the scalability of PyTorch, and introduced the latest work progress of the PyTorch Chinese community. - -## 4. Intel XPU Backend for Inductor - -![man instructing students](/assets/images/pytorch-shanghai-notes/fg5.jpg){:style="width:100%"} - -Eikan is a PyTorch contributor from Intel. He focuses on torch.compile stack for both Intel CPU and GPU. In this session, Eikan presented Intel's efforts on torch.compile for Intel GPUs. He provided updates on the current status of Intel GPUs within PyTorch, covering both functionality and performance aspects. Additionally, Eikan used Intel GPU as a case study to demonstrate how to integrate a new backend into the Inductor using Triton. - -## 5. PyTorch PrivateUse1 Evolution Approaches and Insights - -![man instructing students](/assets/images/pytorch-shanghai-notes/fg6.jpg){:style="width:100%"} - -Jiawei Li, a PyTorch collaborator from Huawei, introduced PyTorch's Dispatch mechanism and emphasized the limitations of DIspatchKey. He took Huawei Ascend NPU as an example to share the best practices of the PyTorch PrivateUse1 mechanism. He mentioned that while using the PrivateUse1 mechanism, Huawei also submitted many improvements and bug fixes for the mechanism to the PyTorch community. He also mentioned that due to the lack of upstream CI support for out-of-tree devices, changes in upstream code may affect their stability and quality, and this insight was recognized by everyone. diff --git a/_community_blog/sglang-joins-pytorch.md b/_community_blog/sglang-joins-pytorch.md deleted file mode 100644 index 6a05a4714873..000000000000 --- a/_community_blog/sglang-joins-pytorch.md +++ /dev/null @@ -1,8 +0,0 @@ ---- -title: "SGLang Joins PyTorch Ecosystem: Efficient LLM Serving Engine" -author: "SGLang Team" -ext_url: /blog/sglang-joins-pytorch/ -date: Mar 19, 2025 ---- - -We’re thrilled to announce that the SGLang project has been integrated into the PyTorch ecosystem! This integration ensures that SGLang aligns with PyTorch’s standards and practices, providing developers with a reliable and community-supported framework for fast and flexible serving of LLMs. \ No newline at end of file diff --git a/_community_blog/torch-compile-explained-ae0def293084.md b/_community_blog/torch-compile-explained-ae0def293084.md deleted file mode 100644 index 1e13b9508f02..000000000000 --- a/_community_blog/torch-compile-explained-ae0def293084.md +++ /dev/null @@ -1,8 +0,0 @@ ---- -title: 'torch.compile, explained' -author: Kaichao You -ext_url: https://medium.com/pytorch/torch-compile-explained-ae0def293084 -date: Oct 26, 2023 ---- - -Have you ever felt overwhelmed by the complexities of torch.compile? Diving into its workings can feel like black magic, with bytecode and Python internal details that many users fail to understand, hindering them from understanding and debugging torch.compile. 
diff --git a/_community_blog/torchdistill-a-modular-configuration-driven-framework-for-reproducible-deep-learning-and-9e0ecabf2815.md b/_community_blog/torchdistill-a-modular-configuration-driven-framework-for-reproducible-deep-learning-and-9e0ecabf2815.md deleted file mode 100644 index 4e38934fb80b..000000000000 --- a/_community_blog/torchdistill-a-modular-configuration-driven-framework-for-reproducible-deep-learning-and-9e0ecabf2815.md +++ /dev/null @@ -1,8 +0,0 @@ ---- -title: 'torchdistill — a modular, configuration-driven framework for reproducible deep learning and knowledge distillation experiments' -author: Yoshitomo Matsubara -ext_url: https://medium.com/pytorch/torchdistill-a-modular-configuration-driven-framework-for-reproducible-deep-learning-and-9e0ecabf2815 -date: Jan 4, 2024 ---- - -This article summarizes key features and concepts of torchdistill (v1.0.0). Refer to the official documentation for its APIs and research projects. \ No newline at end of file diff --git a/_community_blog/unveiling-the-power-of-semi-supervised-learning-the-unified-semi-supervised-learning-benchmark-849f42bbc32a.md b/_community_blog/unveiling-the-power-of-semi-supervised-learning-the-unified-semi-supervised-learning-benchmark-849f42bbc32a.md deleted file mode 100644 index 1f1591a91477..000000000000 --- a/_community_blog/unveiling-the-power-of-semi-supervised-learning-the-unified-semi-supervised-learning-benchmark-849f42bbc32a.md +++ /dev/null @@ -1,8 +0,0 @@ ---- -title: 'Unveiling the Power of Semi-Supervised Learning: The Unified Semi-Supervised Learning Benchmark' -author: Jindong Wang -ext_url: https://medium.com/pytorch/unveiling-the-power-of-semi-supervised-learning-the-unified-semi-supervised-learning-benchmark-849f42bbc32a -date: Jul 6, 2023 ---- - -Machine Learning models thrive on high-quality, fully-annotated data. The traditional supervised learning approach typically requires data on the scale of millions, or even billions, to train large foundational models. However, obtaining such a vast amount of labeled data is often tedious and labor-intensive. As an alternative, semi-supervised learning (SSL) aims to enhance model generalization with only a fraction of labeled data, complemented by a considerable amount of unlabeled data. This blog introduces USB — the Unified Semi-Supervised Learning Framework and Benchmark, covering multi-modalities and various SSL scenarios. \ No newline at end of file diff --git a/_community_blog/vllm-joins-pytorch.md b/_community_blog/vllm-joins-pytorch.md deleted file mode 100644 index fcdba719232c..000000000000 --- a/_community_blog/vllm-joins-pytorch.md +++ /dev/null @@ -1,10 +0,0 @@ ---- -title: "vLLM Joins PyTorch Ecosystem: Easy, Fast, and Cheap LLM Serving for Everyone" -author: vLLM Team -ext_url: /blog/vllm-joins-pytorch/ -date: Dec 9, 2024 ---- - -We’re thrilled to announce that the [vLLM project](https://github.com/vllm-project/vllm) has become a PyTorch ecosystem project, and joined the PyTorch ecosystem family! - -Running large language models (LLMs) is both resource-intensive and complex, especially as these models scale to hundreds of billions of parameters. That’s where vLLM comes in — a high-throughput, memory-efficient inference and serving engine designed for LLMs. 
diff --git a/_community_blog/zeus.md b/_community_blog/zeus.md deleted file mode 100644 index 24299fb494b0..000000000000 --- a/_community_blog/zeus.md +++ /dev/null @@ -1,8 +0,0 @@ ---- -title: 'Deep Learning Energy Measurement and Optimization' -author: Jae-Won Chung -ext_url: /blog/zeus/ -date: May 11, 2024 ---- - -[Zeus](https://github.com/ml-energy/zeus) is an open-source toolbox for measuring and optimizing the energy consumption of deep learning workloads. Our goal is to make energy optimization based on accurate measurements as easy as possible for diverse deep learning workloads and setups by offering composable tools with minimal assumptions. diff --git a/_community_stories/1.md b/_community_stories/1.md deleted file mode 100644 index 267bd361773f..000000000000 --- a/_community_stories/1.md +++ /dev/null @@ -1,7 +0,0 @@ ---- -title: 'How Outreach Productionizes PyTorch-based Hugging Face Transformers for NLP' -ext_url: https://www.databricks.com/blog/2021/05/14/how-outreach-productionizes-pytorch-based-hugging-face-transformers-for-nlp.html -date: May 14, 2021 -tags: ["Advertising & Marketing"] ---- -At Outreach, a leading sales engagement platform, our data science team is a driving force behind our innovative product portfolio largely driven by deep learning and AI. We recently announced enhancements to the Outreach Insights feature, which is powered by the proprietary Buyer Sentiment deep learning model developed by the Outreach Data Science team. This model allows sales teams to deepen their understanding of customer sentiment through the analysis of email reply content, moving from just counting the reply rate to classification of the replier’s intent. \ No newline at end of file diff --git a/_community_stories/10.md b/_community_stories/10.md deleted file mode 100644 index b7ee0b245571..000000000000 --- a/_community_stories/10.md +++ /dev/null @@ -1,7 +0,0 @@ ---- -title: 'Solliance makes headlines with cryptocurrency news analysis platform powered by Azure Machine Learning, PyTorch' -ext_url: https://medium.com/pytorch/solliance-makes-headlines-with-cryptocurrency-news-analysis-platform-powered-by-azure-machine-52a2a290fefb -date: Mar 14, 2022 -tags: ["Finance"] ---- -Solliance delivers cutting-edge solutions that fill gaps across a wide variety of industries. Through its recent collaboration with Baseline, Solliance revolutionizes the cryptocurrency trading experience, extracting news insights from more than 150,000 global sources in near real time. To manage Baseline workloads, Solliance brought Microsoft Azure Machine Learning and PyTorch together for maximum processing power and deep learning capabilities. The result: investors can get under the headlines and see which specific news metrics are moving the volatile crypto market to make more informed trading decisions, while Baseline can release new features in weeks instead of months. \ No newline at end of file diff --git a/_community_stories/11.md b/_community_stories/11.md deleted file mode 100644 index 96138278f774..000000000000 --- a/_community_stories/11.md +++ /dev/null @@ -1,7 +0,0 @@ ---- -title: 'Create a Wine Recommender Using NLP on AWS' -ext_url: https://www.capitalone.com/tech/machine-learning/create-wine-recommender-using-nlp/ -date: March 2, 2022 -tags: ["Finance"] ---- -In this tutorial, we’ll build a simple machine learning pipeline using a BERT word embedding model and the Nearest Neighbor algorithm to recommend wines based on user inputted preferences. 
To create and power this recommendation engine, we’ll leverage AWS’s SageMaker platform, which provides a fully managed way for us to train and deploy our service. \ No newline at end of file diff --git a/_community_stories/12.md b/_community_stories/12.md deleted file mode 100644 index 56f6b2ab93ed..000000000000 --- a/_community_stories/12.md +++ /dev/null @@ -1,7 +0,0 @@ ---- -title: 'Crayon boosts speed, accuracy of healthcare auditing process using Azure Machine Learning and PyTorch' -ext_url: https://www.microsoft.com/en/customers/story/1503427278296945327-crayon-partner-professional-services-azure -date: June 28, 2022 -tags: ["Healthcare"] ---- -Healthcare providers need to be able to verify that they’re maintaining the highest operating safety and efficacy standards. Those standards are set by a national accreditation organization whose surveyors, often healthcare professionals themselves, regularly visit facilities and document situations that might need to be corrected or brought back in line with the latest rules and policies. That assessment and accreditation process generates a huge amount of data, and even the most experienced surveyors struggle to keep ahead of the ongoing development of thousands of policy rules that might be relevant in any particular scenario. Vaagan and his team took on the task of fixing the issue by building a machine learning solution that could ingest text from those reports and return a top ten list of the latest associated rules with unprecedented accuracy. They used Azure technology, development tools, and services to bring that solution to fruition. Crayon customers report clear time savings with the new healthcare solution. Just as important, the solution provides consistent responses that aren’t subject to the vagaries of individual interpretation or potentially out-of-date data. \ No newline at end of file diff --git a/_community_stories/13.md b/_community_stories/13.md deleted file mode 100644 index 0e7b6371eaf1..000000000000 --- a/_community_stories/13.md +++ /dev/null @@ -1,7 +0,0 @@ ---- -title: 'Extracting value from siloed healthcare data using federated learning with Azure Machine Learning' -ext_url: https://www.microsoft.com/en/customers/story/1587521717158304168-microsoft-partner-professional-services-azure -date: December 30, 2022 -tags: ["Healthcare"] ---- -Sensitive information such as healthcare data is often siloed within health organization boundaries. This has posed a challenge to machine learning models used by the health and life sciences industry that require data for training purposes. To improve patient care and accelerate health industry progression, the Microsoft Health & Life Sciences AI group used a federated learning setup to train their biomedical natural language processing service, Text Analytics for Health, while preserving the trust boundaries of siloed data. The federated learning framework was built using Microsoft Azure Machine Learning and open-source technologies to help organizations analyze siloed data and build new applications without compromising data privacy. 
\ No newline at end of file diff --git a/_community_stories/14.md b/_community_stories/14.md deleted file mode 100644 index 23f3a2bbc3f8..000000000000 --- a/_community_stories/14.md +++ /dev/null @@ -1,7 +0,0 @@ ---- -title: 'HippoScreen Improves AI Performance by 2.4x with oneAPI Tools' -ext_url: https://www.intel.com/content/www/us/en/developer/articles/case-study/hipposcreen-boosts-ai-performance-2-4x-with-oneapi.html -date: Feb 21, 2023 -tags: ["Healthcare"] ---- -The Taiwan-based neurotechnology startup used tools and frameworks in the Intel® oneAPI Base and AI Analytics Toolkits to improve the efficiency and build times of deep-learning models used in its Brain Waves AI system. As a result, HippoScreen is able to broaden the system’s applications to a wider range of psychiatric conditions and diseases. \ No newline at end of file diff --git a/_community_stories/16.md b/_community_stories/16.md deleted file mode 100644 index 0bee1f4ac29a..000000000000 --- a/_community_stories/16.md +++ /dev/null @@ -1,7 +0,0 @@ ---- -title: "Disney's Creative Genome by Miquel Farré" -ext_url: https://www.youtube.com/watch?v=KuDxEhHk2Rk -date: Apr 27, 2021 -tags: ["Media & Entertainment"] ---- -Miquel Farré is a senior technology manager at Disney, taking the lead on projects at the intersection of video technology, machine learning and web applications. Metadata that drives content searchability is most often indexed at the title level, with limited governance and high ambiguity; at best, keyword metadata has been added to a title as a layer of enrichment. \ No newline at end of file diff --git a/_community_stories/17.md b/_community_stories/17.md deleted file mode 100644 index 3669cda5942f..000000000000 --- a/_community_stories/17.md +++ /dev/null @@ -1,7 +0,0 @@ ---- -title: 'How Disney uses PyTorch for animated character recognition' -ext_url: https://medium.com/pytorch/how-disney-uses-pytorch-for-animated-character-recognition-a1722a182627 -date: Jul 16, 2020 -tags: ["Media & Entertainment"] ---- -The long and incremental evolution of the media industry, from a traditional broadcast and home video model, to a more mixed model with increasingly digitally-accessible content, has accelerated the use of machine learning and artificial intelligence (AI). Advancing the implementation of these technologies is critical for a company like Disney that has produced nearly a century of content, as it allows for new consumer experiences and enables new applications for illustrators and writers to create the highest-quality content. \ No newline at end of file diff --git a/_community_stories/18.md b/_community_stories/18.md deleted file mode 100644 index 87dc0045b4ec..000000000000 --- a/_community_stories/18.md +++ /dev/null @@ -1,7 +0,0 @@ ---- -title: 'Machine Learning at Tubi: Powering Free Movies, TV and News for All' -ext_url: https://medium.com/pytorch/machine-learning-at-tubi-powering-free-movies-tv-and-news-for-all-51499643018e -date: Feb 25, 2021 -tags: ["Media & Entertainment"] ---- -In this blog series, our aim is to highlight the nuances of Machine Learning in Tubi’s Ad-based Video on Demand (AVOD) space as practiced at Tubi. Machine Learning helps solve myriad problems involving recommendations, content understanding and ads. We extensively use PyTorch for several of these use cases as it provides us the flexibility, computational speed and ease of implementation to train large scale deep neural networks using GPUs.
\ No newline at end of file diff --git a/_community_stories/19.md b/_community_stories/19.md deleted file mode 100644 index 1c26fc2f71a2..000000000000 --- a/_community_stories/19.md +++ /dev/null @@ -1,7 +0,0 @@ ---- -title: 'How Pixar uses AI and GANs to create high-resolution content' -ext_url: https://venturebeat.com/business/how-pixar-uses-ai-and-gans-to-create-high-resolution-content/ -date: July 17, 2020 -tags: ["Media & Entertainment"] ---- -As digital animators continue to push the boundaries of technology and creativity, the technical teams that support them are turning to artificial intelligence and machine learning to deliver the tools they need. That’s the case at Pixar, where the company has made new machine learning breakthroughs it hopes will both improve quality and reduce costs. \ No newline at end of file diff --git a/_community_stories/2.md b/_community_stories/2.md deleted file mode 100644 index 424e66e6fcac..000000000000 --- a/_community_stories/2.md +++ /dev/null @@ -1,7 +0,0 @@ ---- -title: 'Amazon Ads Uses PyTorch and AWS Inferentia to Scale Models for Ads Processing' -ext_url: /blog/amazon-ads-case-study/ -date: February 24, 2022 -tags: ["Advertising & Marketing", "Retail"] ---- -Amazon Ads uses PyTorch, TorchServe, and AWS Inferentia to reduce inference costs by 71% and drive scale out. Amazon Ads helps companies build their brand and connect with shoppers through ads shown both within and beyond Amazon’s store, including websites, apps, and streaming TV content in more than 15 countries. Businesses and brands of all sizes, including registered sellers, vendors, book vendors, Kindle Direct Publishing (KDP) authors, app developers, and agencies can upload their own ad creatives, which can include images, video, audio, and, of course, products sold on Amazon. \ No newline at end of file diff --git a/_community_stories/20.md b/_community_stories/20.md deleted file mode 100644 index c5ad56b5e728..000000000000 --- a/_community_stories/20.md +++ /dev/null @@ -1,7 +0,0 @@ ---- -title: 'Running BERT model inference on AWS Inf1: From model compilation to speed comparison' -ext_url: https://note.com/asahi_ictrad/n/nf5195eb53b88 -date: November 21, 2021 -tags: ["Media & Entertainment"] ---- -In this tech blog, we will compare the speed and cost of Inferentia, GPU, and CPU for a BERT sequence labeling example. We also provide a helpful tutorial on the steps for model compilation and inference on Inf1 instances. \ No newline at end of file diff --git a/_community_stories/21.md b/_community_stories/21.md deleted file mode 100644 index ede721b4241e..000000000000 --- a/_community_stories/21.md +++ /dev/null @@ -1,7 +0,0 @@ ---- -title: 'Ambient Clinical Intelligence: Generating Medical Reports with PyTorch' -ext_url: /blog/ambient-clinical-intelligence-generating-medical-reports-with-pytorch/ -date: May 12, 2022 -tags: ["Medical"] ---- -Complete and accurate clinical documentation is an essential tool for tracking patient care. It allows for treatment plans to be shared among care teams to aid in continuity of care and ensures a transparent and effective process for reimbursement. 
\ No newline at end of file diff --git a/_community_stories/22.md b/_community_stories/22.md deleted file mode 100644 index 24683262ecfd..000000000000 --- a/_community_stories/22.md +++ /dev/null @@ -1,7 +0,0 @@ ---- -title: 'AstraZeneca is using PyTorch-powered algorithms to discover new drugs' -ext_url: https://www.zdnet.com/article/astrazeneca-is-using-pytorch-powered-algorithms-to-discover-new-drugs/ -date: Sept. 30, 2020 -tags: ["Medical"] ---- -Since it launched in 2017, Facebook's machine-learning framework PyTorch has been put to good use, with applications ranging from powering Elon Musk's autonomous cars to driving robot-farming projects. Now pharmaceutical firm AstraZeneca has revealed how its in-house team of engineers are tapping PyTorch too, and for equally as important endeavors: to simplify and speed up drug discovery. \ No newline at end of file diff --git a/_community_stories/23.md b/_community_stories/23.md deleted file mode 100644 index ffda0ce4b314..000000000000 --- a/_community_stories/23.md +++ /dev/null @@ -1,7 +0,0 @@ ---- -title: 'Deploying huggingface‘s BERT to production with pytorch/serve' -ext_url: https://medium.com/analytics-vidhya/deploy-huggingface-s-bert-to-production-with-pytorch-serve-27b068026d18 -date: Apr 25, 2020 -tags: ["Medical"] ---- -TL;DR: pytorch/serve is a new awesome framework to serve torch models in production. This story teaches you how to use it for huggingface/transformers models like BERT. \ No newline at end of file diff --git a/_community_stories/24.md b/_community_stories/24.md deleted file mode 100644 index fb33da259dd6..000000000000 --- a/_community_stories/24.md +++ /dev/null @@ -1,7 +0,0 @@ ---- -title: 'How AI is Helping Vets to Help our Pets' -ext_url: https://medium.com/pytorch/how-ai-is-helping-vets-to-help-our-pets-e6e3d58c052e -date: Sep 7, 2021 -tags: ["Medical"] ---- -1 in 4 dogs, and 1 in 5 cats, will develop cancer at some point in their lives. Pets today have a better chance of being successfully treated than ever, thanks to advances in early recognition, diagnosis and treatment. \ No newline at end of file diff --git a/_community_stories/25.md b/_community_stories/25.md deleted file mode 100644 index 5b2905604d25..000000000000 --- a/_community_stories/25.md +++ /dev/null @@ -1,7 +0,0 @@ ---- -title: 'How theator Built a Continuous Training Framework To Scale up Its Surgical Intelligence Platform' -ext_url: https://medium.com/pytorch/how-theator-built-a-continuous-training-framework-to-scale-up-its-surgical-intelligence-platform-b5135e3229fd -date: Dec 17, 2020 -tags: ["Medical"] ---- -Performing surgery is largely about decision making. As Dr. Frank Spencer put it in 1978, “A skillfully performed operation is about 75% decision making and 25% dexterity”. Five decades later, and the surgical field is finally — albeit gradually — implementing advances in data science and AI to enhance surgeons’ ability to make the best decisions in the operating room. That’s where theator comes in: the company is re-imagining surgery with a Surgical Intelligence platform that leverages highly advanced AI, specifically machine learning and computer vision technology, to analyze every step, event, milestone, and critical junction of surgical procedures — significantly boosting surgeons’ overall performance. 
\ No newline at end of file diff --git a/_community_stories/26.md b/_community_stories/26.md deleted file mode 100644 index 63397a1af6dc..000000000000 --- a/_community_stories/26.md +++ /dev/null @@ -1,7 +0,0 @@ ---- -title: 'Speeding up drug discovery with advanced machine learning' -ext_url: https://medium.com/pytorch/speeding-up-drug-discovery-with-advanced-machine-learning-b17d59e0daa6 -date: Sep 30, 2020 -tags: ["Medical"] ---- -Whatever our job title happens to be at AstraZeneca, we’re seekers. I’m part of the Biological Insights Knowledge Graph (BIKG) team. We help scientists comb through massive amounts of data in our quest to find the information we need to help us deliver life-changing medicines. \ No newline at end of file diff --git a/_community_stories/27.md b/_community_stories/27.md deleted file mode 100644 index d612e75e5724..000000000000 --- a/_community_stories/27.md +++ /dev/null @@ -1,7 +0,0 @@ ---- -title: 'Using PyTorch to streamline machine-learning projects' -ext_url: https://www.zdnet.com/article/using-pytorch-to-streamline-machine-learning-projects/ -date: Dec. 17, 2020 -tags: ["Medical"] ---- -For many surgeons, the possibility of going back into the operating room to review the actions they carried out on a patient could provide invaluable medical insights. \ No newline at end of file diff --git a/_community_stories/28.md b/_community_stories/28.md deleted file mode 100644 index a77212f18930..000000000000 --- a/_community_stories/28.md +++ /dev/null @@ -1,7 +0,0 @@ ---- -title: 'Run inference at scale for OpenFold, a PyTorch-based protein folding ML model, using Amazon EKS' -ext_url: https://aws.amazon.com/blogs/machine-learning/run-inference-at-scale-for-openfold-a-pytorch-based-protein-folding-ml-model-using-amazon-eks/ -date: Oct. 25, 2022 -tags: ["Medical"] ---- -In drug discovery, understanding the 3D structure of proteins is key to assessing the ability of a drug to bind to it, directly impacting its efficacy. Predicting the 3D protein form, however, is very complex, challenging, expensive, and time consuming, and can take years when using traditional methods such as X-ray diffraction. Applying machine learning (ML) to predict these structures can significantly accelerate the time to predict protein structures—from years to hours. Several high-profile research teams have released algorithms such as AlphaFold2 (AF2), RoseTTAFold, and others. These algorithms were recognized by Science magazine as the 2021 Breakthrough of the Year. \ No newline at end of file diff --git a/_community_stories/29.md b/_community_stories/29.md deleted file mode 100644 index a6ac02477809..000000000000 --- a/_community_stories/29.md +++ /dev/null @@ -1,7 +0,0 @@ ---- -title: 'Optimize Protein Folding Costs with OpenFold on AWS Batch' -ext_url: https://aws.amazon.com/blogs/hpc/optimize-protein-folding-costs-with-openfold-on-aws-batch/ -date: Oct. 4, 2022 -tags: ["Medical"] ---- -Knowing the physical structure of proteins is an important part of the drug discovery process. Machine learning (ML) algorithms like AlphaFold v2.0 significantly reduce the cost and time needed to generate usable protein structures. These projects have also inspired development of AI-driven workflows for de novo protein design and protein-ligand interaction analysis. 
\ No newline at end of file diff --git a/_community_stories/3.md b/_community_stories/3.md deleted file mode 100644 index 99394598af83..000000000000 --- a/_community_stories/3.md +++ /dev/null @@ -1,9 +0,0 @@ ---- -title: 'NASA and IBM to Speed AI Creation with New Foundation Models' -ext_url: https://thenewstack.io/nasa-and-ibm-to-speed-ai-creation-with-new-foundation-models/ -date: February 2, 2023 -tags: ["Aerospace"] ---- -NASA and IBM are working together to create foundation models based on NASA’s data sets — including geospatial data — with the goal of accelerating the creation of AI models. - -Foundation models are trained on large, broad data sets, then used to train other AI models by using targeted and smaller datasets. Foundation models can be used for different tasks and can apply information about one situation to another. One real-world example of a foundation model at work is ChatGPT, which was built with the foundation model GPT-3. \ No newline at end of file diff --git a/_community_stories/30.md b/_community_stories/30.md deleted file mode 100644 index 1a723fb9bc9a..000000000000 --- a/_community_stories/30.md +++ /dev/null @@ -1,7 +0,0 @@ ---- -title: 'How Datarock is using PyTorch for more intelligent mining decision making' -ext_url: https://medium.com/pytorch/how-datarock-is-using-pytorch-for-more-intelligent-decision-making-d5d1694ba170 -date: Jun 9, 2020 -tags: ["Mining"] ---- -The mining industry is currently going through a digital revolution as it looks for new and innovative ways to explore and extract mineral resources. This has largely been driven by a need to reduce costs in a competitive global industry that’s experiencing declining ore grades and fewer new discoveries. \ No newline at end of file diff --git a/_community_stories/32.md b/_community_stories/32.md deleted file mode 100644 index b58f986c159a..000000000000 --- a/_community_stories/32.md +++ /dev/null @@ -1,7 +0,0 @@ ---- -title: 'How Trigo built a scalable AI development & deployment pipeline for Frictionless Retail' -ext_url: https://medium.com/pytorch/how-trigo-built-a-scalable-ai-development-deployment-pipeline-for-frictionless-retail-b583d25d0dd -date: Jun 16, 2020 -tags: ["Retail"] ---- -Trigo is a provider of AI & computer vision based checkout-free systems for the retail market, enabling frictionless checkout and a range of other in-store operational and marketing solutions such as predictive inventory management, security and fraud prevention, pricing optimization and event-driven marketing. \ No newline at end of file diff --git a/_community_stories/33.md b/_community_stories/33.md deleted file mode 100644 index 423906b5bc15..000000000000 --- a/_community_stories/33.md +++ /dev/null @@ -1,7 +0,0 @@ ---- -title: 'How We Built: An Early-Stage Recommender System' -ext_url: https://www.onepeloton.com/press/articles/designing-an-early-stage-recommender-system -date: October 18, 2021 -tags: ["Retail"] ---- -Personalization is ubiquitous on most platforms today. Supercharged by connectivity, and scaled by machine learning, most experiences on the internet are tailored to our personal tastes. Peloton classes offer a diversity of instructors, languages, fitness disciplines, durations and intensity. Each Member has specific fitness goals, schedule, fitness equipment, and level of skill or strength. This diversity of content and individuality of Member needs at massive scale creates the opportunity for a recommender system to create a personalized experience on the Peloton platform.
\ No newline at end of file diff --git a/_community_stories/34.md b/_community_stories/34.md deleted file mode 100644 index 8fc6ba0c4738..000000000000 --- a/_community_stories/34.md +++ /dev/null @@ -1,7 +0,0 @@ ---- -title: 'Automated Background Removal in E-commerce Fashion Image Processing Using PyTorch on Databricks' -ext_url: https://www.databricks.com/blog/2021/04/29/automated-background-removal-in-e-commerce-fashion-image-processing-using-pytorch-on-databricks.html -date: April 29, 2021 -tags: ["Retail"] ---- -Wehkamp is one of the biggest e-commerce companies in the Netherlands, with more than 500,000 daily visitors on their website. A wide variety of products offered on the Wehkamp site aims to meet its customers’ many needs. An important aspect of any customer visit on an e-commerce website is a qualitative and accurate visual experience of the products. At a large scale, this is no easy task, with thousands of product photos processed in a local photo studio. \ No newline at end of file diff --git a/_community_stories/35.md b/_community_stories/35.md deleted file mode 100644 index c572513c77ea..000000000000 --- a/_community_stories/35.md +++ /dev/null @@ -1,8 +0,0 @@ ---- -title: 'Search Model Serving Using PyTorch and TorchServe' -ext_url: https://medium.com/walmartglobaltech/search-model-serving-using-pytorch-and-torchserve-6caf9d1c5f4d -date: Jan 23, 2023 -tags: ["Retail"] ---- -Walmart Search has embarked on the journey of adopting Deep Learning in the search ecosystem to improve search relevance. For our pilot use case, we served the computationally intensive Bert Base model at runtime with an objective to achieve low latency and high throughput. - diff --git a/_community_stories/36.md b/_community_stories/36.md deleted file mode 100644 index 5a2b3e7c9737..000000000000 --- a/_community_stories/36.md +++ /dev/null @@ -1,8 +0,0 @@ ---- -title: 'How We Used AWS Inferentia to Boost PyTorch NLP Model Performance by 4.9x for the Autodesk Ava Chatbot' -ext_url: https://medium.com/pytorch/how-we-used-aws-inferentia-to-boost-pytorch-nlp-model-performance-by-4-9x-9f79f5314ca8 -date: Apr 7, 2021 -tags: ["Technology"] ---- -Autodesk is a multinational software company with world-renowned products in areas such as Architecture, Engineering, & Construction, Manufacturing, and Media & Entertainment. Amongst Autodesk’s best-known products are AutoCAD, Revit, Maya, and Fusion 360. The company has millions of customers around the world, and many of them have need for support to make best use of their products. - diff --git a/_community_stories/37.md b/_community_stories/37.md deleted file mode 100644 index a7e6e376a9e0..000000000000 --- a/_community_stories/37.md +++ /dev/null @@ -1,7 +0,0 @@ ---- -title: 'Bentley Systems creates breakthrough framework, drastically speeds up AI development with Azure Machine Learning' -ext_url: https://www.microsoft.com/en/customers/story/1480221307332639219-bentley-systems-partner-professional-services-azure-machine-learning -date: March 16, 2022 -tags: ["Technology"] ---- -Software innovator Bentley Systems offers a broad portfolio of solutions to help the organizations that design, build, and operate the world’s infrastructure assets. The company uses machine learning in its flagship product to read disparate paper-based asset data and transform it into consolidated digital data. To speed up and formalize this process, Bentley created a machine learning operations framework using Microsoft Azure Machine Learning and PyTorch. 
Developers’ speed and job satisfaction have shot up since they began using this stable, reproducible framework, which easily gets their code into the cloud, accelerating delivery by three to five times and significantly increasing efficiency. \ No newline at end of file diff --git a/_community_stories/38.md b/_community_stories/38.md deleted file mode 100644 index e76ae4a1164e..000000000000 --- a/_community_stories/38.md +++ /dev/null @@ -1,7 +0,0 @@ ---- -title: 'PyTorch Community Voices' -ext_url: https://www.youtube.com/watch?v=LBOIxA5sg2A -date: Jun 2, 2021 -tags: ["Technology"] ---- -Join us for an interview with star PyTorch community members Alexander O’Connor and Binghui Ouyang from AutoDesk as we learn how they used PyTorch and AWS Inferentia to deploy production-scale models in chatbot intent classification. \ No newline at end of file diff --git a/_community_stories/39.md b/_community_stories/39.md deleted file mode 100644 index d7771ef6c0a6..000000000000 --- a/_community_stories/39.md +++ /dev/null @@ -1,7 +0,0 @@ ---- -title: 'How PyTorch is bringing the power of AI to computers and smartphones' -ext_url: https://ai.meta.com/blog/pytorch-ai-smartphones-computers/ -date: December 2, 2022 -tags: ["Technology"] ---- -Many of the experiences people enjoy on Facebook and Instagram are powered by artificial intelligence (AI). A number of them, like Assistant, Avatars, and AR effects, cannot be powered by server-side AI due to latency, network bandwidth, and other constraints. Running AI on-device —that is, directly on a phone, tablet, or even a pair of smart glasses — offers huge advantages over constantly sending data back to a server. It’s faster, and it creates a privacy-enhancing experience for people who use our platforms. However, on-device AI presents new challenges, since it requires coping with devices that have a small battery, far less powerful processors, and less memory than a server in a data center. \ No newline at end of file diff --git a/_community_stories/4.md b/_community_stories/4.md deleted file mode 100644 index 90f2c15de2ec..000000000000 --- a/_community_stories/4.md +++ /dev/null @@ -1,8 +0,0 @@ ---- -title: 'AI for AG: Production machine learning for agriculture' -ext_url: https://medium.com/pytorch/ai-for-ag-production-machine-learning-for-agriculture-e8cfdb9849a1 -date: Aug 6, 2020 -tags: ["Agriculture"] ---- -How did farming affect your day today? If you live in a city, you might feel disconnected from the farms and fields that produce your food. Agriculture is a core piece of our lives, but we often take it for granted. - diff --git a/_community_stories/40.md b/_community_stories/40.md deleted file mode 100644 index 0c45ff732658..000000000000 --- a/_community_stories/40.md +++ /dev/null @@ -1,7 +0,0 @@ ---- -title: 'Axon offers technology boost for public safety with in-car Automated License Plate Recognition on Azure' -ext_url: https://www.microsoft.com/en/customers/story/1610624764549732009-axon-partner-professional-services-azure -date: March 09, 2023 -tags: ["Technology"] ---- -Axon, a technology leader in public safety, developed AI technology to add cutting-edge license plate recognition capabilities to its in-car camera products, which now can identify plates for vehicles of interest and provide law enforcement with proactive notifications and alerts. Axon AI scientists and engineers chose Microsoft Azure infrastructure as a scalable, cost-efficient, and feature-rich environment where they can develop and test AI models. 
With Azure compute, storage, PyTorch, and machine learning resources, Axon can easily take advantage of the latest software and hardware technology to develop best-in-class AI solutions for its customers. \ No newline at end of file diff --git a/_community_stories/41.md b/_community_stories/41.md deleted file mode 100644 index bd1d083e7577..000000000000 --- a/_community_stories/41.md +++ /dev/null @@ -1,7 +0,0 @@ ---- -title: 'ML Model Server Resource Saving - Transition From High-Cost GPUs to Intel CPUs and oneAPI powered Software with performance' -ext_url: /blog/ml-model-server-resource-saving/ -date: October 11, 2023 -tags: ["Technology"] ---- -Here, we will be sharing our experience of moving AI workloads from our GPU servers to our Intel CPU servers without any performance or quality degradation, saving annual costs of approximately 340 thousand U.S. dollars (refer to the Conclusion) in the process. \ No newline at end of file diff --git a/_community_stories/42.md b/_community_stories/42.md deleted file mode 100644 index 21fb9616f644..000000000000 --- a/_community_stories/42.md +++ /dev/null @@ -1,7 +0,0 @@ ---- -title: 'Dialogue Assistance for Customer Service at Airbnb' -ext_url: https://www.youtube.com/watch?v=jtVUV0Gzxp0&t=730s -date: Aug 20, 2019 -tags: ["Technology"] ---- -Businesses are using PyTorch, an open source machine learning framework, to seamlessly build, train, and deploy AI models in production across their products and services. Hear how industry leaders leverage PyTorch to help power everything from ubiquitous productivity software used across the world to advances in medicine for fighting cancer. \ No newline at end of file diff --git a/_community_stories/43.md b/_community_stories/43.md deleted file mode 100644 index a51d7765b881..000000000000 --- a/_community_stories/43.md +++ /dev/null @@ -1,7 +0,0 @@ ---- -title: 'Using deep learning and PyTorch to power next gen aircraft at Caltech' -ext_url: https://www.youtube.com/watch?v=se206WBk2dM -date: Nov 14, 2019 -tags: ["Research", "Aerospace"] ---- -Learn how Caltech’s Center for Autonomous Systems and Technologies (CAST) uses PyTorch to build deep learning systems that can understand the aerodynamics of how aircraft interact with the ground to enable much smoother and safer landings. \ No newline at end of file diff --git a/_community_stories/44.md b/_community_stories/44.md deleted file mode 100644 index 4ab96977bba0..000000000000 --- a/_community_stories/44.md +++ /dev/null @@ -1,7 +0,0 @@ ---- -title: 'Deepset achieves a 3.9x speedup and 12.8x cost reduction for training NLP models by working with AWS and NVIDIA' -ext_url: https://aws.amazon.com/blogs/machine-learning/deepset-achieves-a-3-9x-speedup-and-12-8x-cost-reduction-for-training-nlp-models-by-working-with-aws-and-nvidia/ -date: Jan 27, 2021 -tags: ["Research", "NLP"] ---- -At deepset, we’re building the next-level search engine for business documents. Our core product, Haystack, is an open-source framework that enables developers to utilize the latest NLP models for semantic search and question answering at scale. Our software as a service (SaaS) platform, Haystack Hub, is used by developers from various industries, including finance, legal, and automotive, to find answers in all kinds of text documents. You can use these answers to improve the search experience, cover the long-tail of chat bot queries, extract structured data from documents, or automate invoicing processes.
\ No newline at end of file diff --git a/_community_stories/45.md b/_community_stories/45.md deleted file mode 100644 index 6ad0704a27e1..000000000000 --- a/_community_stories/45.md +++ /dev/null @@ -1,7 +0,0 @@ ---- -title: 'PyTorch at Dolby Labs' -ext_url: https://www.youtube.com/watch?v=K5hD0et_wUc&list=PL_lsbAsL_o2BY-RrqVDKDcywKnuUTp-f3&index=20 -date: Nov 6, 2019 -tags: ["Research", "NLP"] ---- -Hear how Dolby Labs is using PyTorch to develop deep learning for audio, and learn about the challenges that audio AI presents and the breakthroughs and applications they’ve built at Dolby to push the field forward. \ No newline at end of file diff --git a/_community_stories/46.md b/_community_stories/46.md deleted file mode 100644 index d7562ccc49bb..000000000000 --- a/_community_stories/46.md +++ /dev/null @@ -1,7 +0,0 @@ ---- -title: 'Using a Grapheme to Phoneme Model in Cisco’s Webex Assistant' -ext_url: https://blogs.cisco.com/developer/graphemephoneme01 -date: September 7, 2021 -tags: ["Research", "NLP"] ---- -Grapheme to Phoneme (G2P) is a function that generates pronunciations (phonemes) for words based on their written form (graphemes). It has an important role in automatic speech recognition systems, natural language processing, and text-to-speech engines. In Cisco’s Webex Assistant, we use G2P modelling to assist in resolving person names from voice. See here for further details of various techniques we use to build robust voice assistants. \ No newline at end of file diff --git a/_community_stories/47.md b/_community_stories/47.md deleted file mode 100644 index c479e32d0c4d..000000000000 --- a/_community_stories/47.md +++ /dev/null @@ -1,7 +0,0 @@ ---- -title: 'AI21 Labs Trains 178-Billion-Parameter Language Model Using Amazon EC2 P4d Instances, PyTorch' -ext_url: https://aws.amazon.com/solutions/case-studies/AI21-case-study-p4d/ -date: June 7, 2021 -tags: ["Research", "NLP"] ---- -AI21 Labs uses machine learning to develop language models focused on understanding meaning, and in 2021 it set a goal to train the recently released Jurassic-1 Jumbo, an autoregressive language model with 178 billion parameters. Developers who register for beta testing will get access to Jurassic-1 Jumbo and can immediately start to customize the model for their use case. The software startup wanted to train the model efficiently, so it looked to Amazon Web Services (AWS) and built a solution using Amazon Elastic Compute Cloud (Amazon EC2), a web service that provides secure, resizable compute capacity in the cloud. Choosing Amazon EC2 gave the company control over the training process, including node allocation. \ No newline at end of file diff --git a/_community_stories/48.md b/_community_stories/48.md deleted file mode 100644 index 147c55460932..000000000000 --- a/_community_stories/48.md +++ /dev/null @@ -1,7 +0,0 @@ ---- -title: 'The Why and How of Scaling Large Language Models' -ext_url: https://www.youtube.com/watch?v=qscouq3lo0s -date: Jan 4, 2022 -tags: ["Research", "NLP"] ---- -Anthropic is an AI safety and research company that’s working to build reliable, interpretable, and steerable AI systems. Over the past decade, the amount of compute used for the largest training runs has increased at an exponential pace. We've also seen in many domains that larger models are able to attain better performance following precise scaling laws. The compute needed to train these models can only be attained using many coordinated machines that are communicating data between them. 
In this talk, Nicholas Joseph (Technical Staff, Anthropic) goes through why and how they can scale up training runs to use these machines efficiently. \ No newline at end of file diff --git a/_community_stories/49.md b/_community_stories/49.md deleted file mode 100644 index 8dac0320ec2f..000000000000 --- a/_community_stories/49.md +++ /dev/null @@ -1,7 +0,0 @@ ---- -title: 'University of Pécs enables text and speech processing in Hungarian, builds the BERT-large model with just 1,000 euro with Azure' -ext_url: https://www.microsoft.com/en/customers/story/1402696956382669362-university-of-pecs-higher-education-azure-en-hungary -date: August 10, 2021 -tags: ["Research", "NLP"] ---- -Everyone prefers to use their mother tongue when communicating with chat agents and other automated services. However, for languages like Hungarian—spoken by only 15 million people—the market size will often be viewed as too small for large companies to create software, tools or applications that can process Hungarian text as input. Recognizing this need, the Applied Data Science and Artificial Intelligence team from University of Pécs decided to step up. Using Microsoft AI Solutions and ONNX Runtime solutions, it built and trained its own BERT-large model in native Hungarian in under 200 hours and at a total build cost of 1,000 euros. \ No newline at end of file diff --git a/_community_stories/5.md b/_community_stories/5.md deleted file mode 100644 index b0006022eece..000000000000 --- a/_community_stories/5.md +++ /dev/null @@ -1,7 +0,0 @@ ---- -title: 'Using PyTorch for Monocular Depth Estimation Webinar' -ext_url: https://www.youtube.com/watch?v=xf2QgioY370 -date: Sep 27, 2024 -tags: ["Research"] ---- -In this webinar, Bob Chesebrough of Intel guides you through the steps he took to create a clipped image with background clutter removed from the image. He accomplished this using monocular depth estimation with PyTorch. This could potentially be used to automate structure from motion and other image-related tasks where you want to highlight or focus on a single portion of an image, particularly for identifying parts of the image that were closest to the camera. Specifically, he used depth estimation on a couple of images that he took at a natural history museum to capture just the dinosaur in the foreground, eliminating the background murals, lights, and building structure. The cool thing about this algorithm is that it creates a depth estimate from a single image! (A rough sketch of this single-image depth workflow appears below, after the next entry.) \ No newline at end of file diff --git a/_community_stories/50.md b/_community_stories/50.md deleted file mode 100644 index 9f1014e46b5d..000000000000 --- a/_community_stories/50.md +++ /dev/null @@ -1,7 +0,0 @@ ---- -title: 'Mapillary Research: Seamless Scene Segmentation and In-Place Activated BatchNorm' -ext_url: /blog/mapillary-research/ -date: July 23, 2019 -tags: ["Research"] ---- -With roads in developed countries like the US changing up to 15% annually, Mapillary addresses a growing demand for keeping maps updated by combining images from any camera into a 3D visualization of the world. Mapillary’s independent and collaborative approach enables anyone to collect, share, and use street-level images for improving maps, developing cities, and advancing the automotive industry.
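Following up on the "Using PyTorch for Monocular Depth Estimation Webinar" entry above, here is a minimal sketch of that kind of single-image depth workflow. It assumes the publicly available MiDaS model from torch.hub, an illustrative image path, and an arbitrary foreground threshold; it is not the code used in the webinar.

```python
import numpy as np
import torch
from PIL import Image

# Assumption: the MiDaS small model and its transforms published on torch.hub.
model = torch.hub.load("intel-isl/MiDaS", "MiDaS_small")
transform = torch.hub.load("intel-isl/MiDaS", "transforms").small_transform
model.eval()

img = np.array(Image.open("museum_photo.jpg").convert("RGB"))  # illustrative path

with torch.no_grad():
    pred = model(transform(img))                     # relative inverse depth, shape (1, H', W')
    depth = torch.nn.functional.interpolate(
        pred.unsqueeze(1), size=img.shape[:2], mode="bicubic", align_corners=False
    ).squeeze().numpy()

# MiDaS outputs inverse relative depth, so larger values are closer to the camera.
# Keep only the closest pixels (e.g. the dinosaur) using an arbitrary percentile cutoff.
mask = depth > np.percentile(depth, 80)
clipped = img * mask[..., None]
Image.fromarray(clipped.astype(np.uint8)).save("foreground_only.jpg")
```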
\ No newline at end of file diff --git a/_community_stories/51.md b/_community_stories/51.md deleted file mode 100644 index 2b9e820aa47a..000000000000 --- a/_community_stories/51.md +++ /dev/null @@ -1,7 +0,0 @@ ---- -title: 'How 3DFY.ai Built a Multi-Cloud, Distributed Training Platform Over Spot Instances with TorchElastic and Kubernetes' -ext_url: https://medium.com/pytorch/how-3dfy-ai-built-a-multi-cloud-distributed-training-platform-over-spot-instances-with-44be40936361 -date: Jun 17, 2021 -tags: ["Research"] ---- -Deep Learning development is becoming more and more about minimizing the time from idea to trained model. To shorten this lead time, researchers need access to a training environment that supports running multiple experiments concurrently, each utilizing several GPUs. \ No newline at end of file diff --git a/_community_stories/52.md b/_community_stories/52.md deleted file mode 100644 index 4d249134c9ea..000000000000 --- a/_community_stories/52.md +++ /dev/null @@ -1,7 +0,0 @@ ---- -title: 'SearchSage: Learning Search Query Representations at Pinterest' -ext_url: https://medium.com/pinterest-engineering/searchsage-learning-search-query-representations-at-pinterest-654f2bb887fc -date: Nov 9, 2021 -tags: ["Research"] ---- -Pinterest surfaces billions of ideas to people every day, and the neural modeling of embeddings for content, users, and search queries is key in the constant improvement of these machine learning-powered recommendations. Good embeddings — representations of discrete entities as vectors of numbers — enable fast candidate generation and are strong signals to models that classify, retrieve and rank relevant content. \ No newline at end of file diff --git a/_community_stories/53.md b/_community_stories/53.md deleted file mode 100644 index 7929cd8495db..000000000000 --- a/_community_stories/53.md +++ /dev/null @@ -1,7 +0,0 @@ ---- -title: 'IBM Research: Bringing massive AI models to any cloud' -ext_url: https://research.ibm.com/blog/ibm-pytorch-cloud-ai-ethernet -date: Nov 17, 2022 -tags: ["Research"] ---- -The field of AI is in the middle of a revolution. In recent years, AI models have made images, songs, or even websites out of simple text prompts. These types of models with billions of parameters, called foundation models, can, with little fine-tuning, be repurposed from one task to another, removing the countless hours of training, labelling, and refitting otherwise needed to take on a new task. \ No newline at end of file diff --git a/_community_stories/54.md b/_community_stories/54.md deleted file mode 100644 index a6e2e0b4a958..000000000000 --- a/_community_stories/54.md +++ /dev/null @@ -1,7 +0,0 @@ ---- -title: 'ChemicalX: A Deep Learning Library for Drug Pair Scoring' -ext_url: https://arxiv.org/abs/2202.05240 -date: Feb 10, 2022 -tags: ["Research", "Healthcare"] ---- -In this paper, we introduce ChemicalX, a PyTorch-based deep learning library designed for providing a range of state of the art models to solve the drug pair scoring task. The primary objective of the library is to make deep drug pair scoring models accessible to machine learning researchers and practitioners in a streamlined fashion. The design of ChemicalX reuses existing high level model training utilities, geometric deep learning, and deep chemistry layers from the PyTorch ecosystem. Our system provides neural network layers, custom pair scoring architectures, data loaders, and batch iterators for end users.
We showcase these features with example code snippets and case studies to highlight the characteristics of ChemicalX. A range of experiments on real-world drug-drug interaction, polypharmacy side effect, and combination synergy prediction tasks demonstrate that the models available in ChemicalX are effective at solving the pair scoring task. Finally, we show that ChemicalX could be used to train and score machine learning models on large drug pair datasets with hundreds of thousands of compounds on commodity hardware. \ No newline at end of file diff --git a/_community_stories/55.md b/_community_stories/55.md deleted file mode 100644 index 103aa76737c0..000000000000 --- a/_community_stories/55.md +++ /dev/null @@ -1,7 +0,0 @@ ---- -title: 'Graph Convolutional Operators in the PyTorch JIT' -ext_url: https://www.youtube.com/watch?v=4swsvOLzL_A&list=PL_lsbAsL_o2BSe3eS4spnodObBa3RL08E&index=3 -date: Dec 2, 2020 -tags: ["Research", "Science"] ---- -In this talk, scientist Lindsey Gray and Ph.D. student Matthias Fey co-examine how the challenges of High Energy Particle Physics are driving the need for more efficient research and development pipelines in neural network development. In particular, they look at the additions made to PyTorch Geometric, which allow Graph Neural Network models to be compiled by the PyTorch JIT, significantly easing the process of deploying such networks at scale. \ No newline at end of file diff --git a/_community_stories/56.md b/_community_stories/56.md deleted file mode 100644 index 8a07059e38db..000000000000 --- a/_community_stories/56.md +++ /dev/null @@ -1,8 +0,0 @@ ---- -title: 'How Intel Uses PyTorch to Empower Generative AI through Intel Arc GPUs' -ext_url: /blog/how-intel-uses-pytorch-to-empower-generative-ai-through-intel-arc-gpus/ -date: Jan 24, 2025 -tags: ["Gaming"] ---- -Intel has long been at the forefront of technological innovation, and its recent venture into Generative AI (GenAI) solutions is no exception. With the rise of AI-powered gaming experiences, Intel sought to deliver an accessible and intuitive GenAI inferencing solution tailored for AI PCs powered by Intel’s latest GPUs. By leveraging PyTorch as the backbone for development efforts, Intel successfully launched AI Playground, an open source application that showcases advanced GenAI workloads. - diff --git a/_community_stories/57.md b/_community_stories/57.md deleted file mode 100644 index 7e717dfd000b..000000000000 --- a/_community_stories/57.md +++ /dev/null @@ -1,8 +0,0 @@ ---- -title: 'How IBM Research Uses PyTorch and TerraTorch to Make Geospatial Computer Vision Accessible for Everyone' -ext_url: /blog/how-ibm-uses-pt-terratorch/ -date: May 1, 2025 -tags: ["Computer Vision"] ---- - -Geospatial computer vision is essential for understanding our planet — from monitoring deforestation to tracking urban development and analyzing the impacts of climate change. However, the coding and deep learning skills required to apply AI models to satellite imagery and earth observation data have traditionally been a major barrier for many practitioners.
diff --git a/_community_stories/6.md b/_community_stories/6.md deleted file mode 100644 index b218ca839725..000000000000 --- a/_community_stories/6.md +++ /dev/null @@ -1,7 +0,0 @@ ---- -title: 'How Wadhwani AI Uses PyTorch To Empower Cotton Farmers' -ext_url: https://medium.com/pytorch/how-wadhwani-ai-uses-pytorch-to-empower-cotton-farmers-14397f4c9f2b -date: Oct 22, 2020 -tags: ["Agriculture"] ---- -Cotton is a major fibre crop, cultivated in over 80 countries, with nearly 100 million families across the world relying on cotton farming for their livelihood. With such importance placed on many farmers’ crops, cotton’s particular vulnerability to pest infestations has been troubling to many. However, pest infestation is also one of the most significant and preventable problems that farmers face, with 55% of all pesticide usage in India being devoted to cotton farming. \ No newline at end of file diff --git a/_community_stories/7.md b/_community_stories/7.md deleted file mode 100644 index 7103bf45be6c..000000000000 --- a/_community_stories/7.md +++ /dev/null @@ -1,7 +0,0 @@ ---- -title: 'How Lyft Uses PyTorch to Power Machine Learning for Their Self-Driving Cars' -ext_url: https://medium.com/pytorch/how-lyft-uses-pytorch-to-power-machine-learning-for-their-self-driving-cars-80642bc2d0ae -date: Oct 7, 2020 -tags: ["Autonomous Driving"] ---- -Lyft’s mission is to improve people’s lives with the world’s best transportation. We believe in a future where self-driving cars make transportation safer and more accessible for everyone. That’s why Level 5, Lyft’s self-driving division, is developing a complete autonomous system for the Lyft network to provide riders access to the benefits of this technology. However, this is an incredibly complex task. \ No newline at end of file diff --git a/_community_stories/8.md b/_community_stories/8.md deleted file mode 100644 index f23672204a07..000000000000 --- a/_community_stories/8.md +++ /dev/null @@ -1,7 +0,0 @@ ---- -title: 'Wayve’s AV2.0 builds a brighter future with Azure Machine Learning and PyTorch' -ext_url: https://www.microsoft.com/en/customers/story/1415185921593450824-wayve-partner-professional-services-azure-machine-learning -date: May 25, 2022 -tags: ["Autonomous Driving"] ---- -Wayve wants to accelerate and scale autonomous vehicle (AV) development by using vision-based machine learning for rapid prototyping and quick iteration. So, it developed a platform that uses the open-source machine learning framework PyTorch with Microsoft Azure Machine Learning to gather, manage, and process millions of hours of driving data per year—petabytes of data—consisting of images, GPS data, and data from other sensors. Wayve now has the scalable capacity to build and iterate driving models for complex urban environments, adjust models more nimbly, and adapt to new environments more readily. \ No newline at end of file diff --git a/_community_stories/9.md b/_community_stories/9.md deleted file mode 100644 index 0d208d53d26d..000000000000 --- a/_community_stories/9.md +++ /dev/null @@ -1,8 +0,0 @@ ---- -title: 'AI Helps Duolingo Personalize Language Learning' -ext_url: https://aws.amazon.com/machine-learning/customers/innovators/duolingo/ -date: May 25, 2024 -tags: ["Education"] ---- -Learning a foreign language was probably one of your goals last year. And the year before, and the year before that. Like gym memberships, our best intentions often don’t survive very long.
Aside from the time required to achieve proficiency with a new language, most people struggle with traditional approaches to learning. Even many web-based language tools can be monotonous and cumbersome. - diff --git a/_config.yml b/_config.yml index 8048f57000ff..4b319e0d4e19 100644 --- a/_config.yml +++ b/_config.yml @@ -1,131 +1 @@ -# Site settings -title: "PyTorch Website" -author: "Facebook" -default_author: Team PyTorch -description: "Scientific Computing..." -latest_version: 1.0 -timezone: America/Los_Angeles -url: "https://pytorch.org" -baseurl: "" -plugins: - - jekyll-paginate-v2 - - jekyll-redirect-from - - jekyll-autoprefixer - - jekyll-feed -sass: - load_paths: - - _sass - - node_modules -exclude: - [ - vendor, - node_modules, - README.md, - Gemfile, - Gemdile.lock, - yarn.lock, - yarn-error.log, - package.json, - Makefile, - scripts, - _hub/docs/template.md, - ] -include: - [ - _static, - _images, - _modules, - _sources, - _asserts.html, - _comparison.html, - _creation.html, - _dynamo.html, - _inductor.html, - _lowrank.html, - _script.html, - _serialization.html, - _symbolic_trace.html, - _tensor_str.html, - _trace.html, - _utils.html, - ] -keep_files: [vendor/assets, docs/master/_static/js/vendor/] -github: [metadata] -external_urls: - github: https://github.com/pytorch/pytorch - github_issues: https://github.com/pytorch/pytorch/issues - hub_issues: https://github.com/pytorch/hub/issues - contributing: https://github.com/pytorch/pytorch/blob/master/CONTRIBUTING.md - hub_template: https://github.com/pytorch/hub/blob/master/docs/template.md - twitter: https://twitter.com/pytorch - facebook: https://www.facebook.com/pytorch - slack: https://join.slack.com/t/pytorch/shared_invite/zt-2j2la612p-miUinTTaxXczKOJw48poHA - wechat: https://pytorch.org/wechat - discuss: https://discuss.pytorch.org - contributor_forum: https://dev-discuss.pytorch.org/ - tutorials: https://pytorch.org/tutorials - previous_pytorch_versions: https://pytorch.org/previous-versions/ - udacity_courses: https://pytorch.org - security: https://github.com/pytorch/pytorch/security/policy - youtube: https://www.youtube.com/pytorch - spotify: https://open.spotify.com/show/6UzHKeiy368jKfQMKKvJY5 - apple: https://podcasts.apple.com/us/podcast/pytorch-developer-podcast/id1566080008 - google: https://www.google.com/podcasts?feed=aHR0cHM6Ly9mZWVkcy5zaW1wbGVjYXN0LmNvbS9PQjVGa0lsOA%3D%3D - amazon: https://music.amazon.com/podcasts/7a4e6f0e-26c2-49e9-a478-41bd244197d0/PyTorch-Developer-Podcast? 
- linkedIn: https://www.linkedin.com/company/pytorch -livereload: true -markdown: kramdown -highlighter: rouge -collections: - get_started: - output: true - ecosystem: - output: true - permalink: /ecosystem/:path/ - hub: - output: true - permalink: /hub/:title/ - community_stories: - output: true - permalink: /community-stories/:path/ - style_guide: - output: false - posts: - output: true - permalink: /blog/:title/ - resources: - output: false - features: - output: false - courses: - output: false - mobile: - output: true - news: - output: true - past_issues: - output: true - events: - output: true - future: true - case_studies: - output: true - board_info: - output: true - community_blog: - output: true - videos: - output: true - -pagination: - enabled: true - per_page: 8 - permalink: "/:num/" - title: ":title | :num of :max" - limit: 0 - sort_field: "date" - sort_reverse: true - trail: - before: 2 - after: 2 -# google_site_verification: eOAFtDphTbbm4OPKva2d3Z0Z_2bBxWMGdkD0IRQ6VeA +include: [_static, _images, _modules, _sources, _asserts.html, _creation.html, _comparison.html, _lowrank.html, _script.html, _diagnostic.html, _dynamo.html, _serialization.html, _type_utils, _tensor_str.html, _trace.html, _utils.html, _internal, _C, _distributed_autograd.html, _distributed_c10d.html, _distributed_rpc.html, _fft.html, _linalg.html, _monitor.html, _nested.html, _nn.html, _profiler.html, _sparse.html, _special.html, __config__.html, _dynamo, _lobpcg.html, _jit_internal.html, _numeric_suite.html, _numeric_suite_fx.html, _sanitizer.html, _symbolic_trace.html, _async.html, _freeze.html, _fuser.html, _type_utils.html, _utils ] diff --git a/_courses/course-1.md b/_courses/course-1.md deleted file mode 100644 index d637355ed0fe..000000000000 --- a/_courses/course-1.md +++ /dev/null @@ -1,8 +0,0 @@ ---- -title: Course 1 -summary: Lorem ipsum dolor sit amet, consectetur adipiscing elit -thumbnail: http://via.placeholder.com/560x360/ffffff/d8d8d8 -link: https://pytorch.org -order: 1 ---- - diff --git a/_courses/course-2.md b/_courses/course-2.md deleted file mode 100644 index bcbf94fbddd6..000000000000 --- a/_courses/course-2.md +++ /dev/null @@ -1,7 +0,0 @@ ---- -title: Course 2 -summary: Lorem ipsum dolor sit amet, consectetur adipiscing elit -thumbnail: http://via.placeholder.com/560x360/ffffff/d8d8d8 -link: https://pytorch.org -order: 2 ---- diff --git a/_courses/course-3.md b/_courses/course-3.md deleted file mode 100644 index 35bddf9e3cd0..000000000000 --- a/_courses/course-3.md +++ /dev/null @@ -1,7 +0,0 @@ ---- -title: Course 3 -summary: Lorem ipsum dolor sit amet, consectetur adipiscing elit -thumbnail: http://via.placeholder.com/560x360/ffffff/d8d8d8 -link: https://pytorch.org -order: 3 ---- diff --git a/_courses/course-4.md b/_courses/course-4.md deleted file mode 100644 index 7bab968a5b34..000000000000 --- a/_courses/course-4.md +++ /dev/null @@ -1,7 +0,0 @@ ---- -title: Course 4 -summary: Lorem ipsum dolor sit amet, consectetur adipiscing elit -thumbnail: http://via.placeholder.com/560x360/ffffff/d8d8d8 -link: https://pytorch.org -order: 4 ---- diff --git a/_data/ecosystem/ptc/2022/posters.yaml b/_data/ecosystem/ptc/2022/posters.yaml deleted file mode 100644 index 8c6e3fdf1cb9..000000000000 --- a/_data/ecosystem/ptc/2022/posters.yaml +++ /dev/null @@ -1,461 +0,0 @@ -- authors: - - Dinkar Juyal - - Syed Asher Javed - - Harshith Padigela - - Limin Yu - - Aaditya Prakash - - Logan Kilpatrick - - Anand Sampat - - PathAI - categories: - - COMPUTER VISION - description: "PathAI is a Boston based 
company focussed on improving patient care using AI powered pathology. We heavily use PyTorch for building our ML systems, specifically training and deploying models on large gigapixel pathology images. In this case study, we highlight our use of PyTorch to build, experiment and deploy Additive Multiple Instance Learning (MIL) models. Additive MIL is a novel MIL technique built using PyTorch Lightning which allows end-to-end learning from millions of pixels while providing granular interpretability of spatial heatmaps. These models allow for the exact computation of the extent to which each smaller region in the gigapixel-sized image contributes to the final model prediction. This enables class-wise excitatory and inhibitory contributions to be visualized on top of the pathology image. This informs the practitioners of model failures and guides the pathologists to areas of interest. All this is made possible due to PyTorch's rapid research-to-prototype-to-deployment iteration cycle." - link: - poster_link: https://pytorch.s3.amazonaws.com/posters/ptc2022/A01.pdf - section: F8 - thumbnail_link: https://pytorch.org/assets/images/ptc2022/A01-thumb.png - title: "Enabling State-of-the-art Interpretability for Medical Imaging Using PyTorch" - -- authors: - - Erik Hagendorn - categories: - - LIBRARIES - description: "TorchUnmix is a library which aims to provide automatic stain unmixing and augmentation for histopathology whole slide images. Separation of histochemical stains (unmixing) is performed by orthonormal transformation of the RGB pixel data from predefined light absorption coefficients called stain vectors [1]. Precomputed publicly available stain vector definitions are often used, but inter-laboratory variation due to the histology and/or image acquisition process is common, yielding suboptimal unmixing results. Classical stain vector estimation methods rely on abundant distribution of stains, making them less practical for sparser distributions as observed from immunohistochemical stains. Geis et al. proposed a method based on k-means clustering of pixel values in the hue-saturation-density color space to determine optimal stain vectors which has been used in this work [2]. While stain vectors may be used for quantification of individual stains, TorchUnmix also provides functionalities to perform stain augmentation. Stain augmentation is a method used during the training process of deep learning models to improve generalization by unmixing the image, stochastically modifying the individual stains, and then compositing the stains into the final augmented image [3]. To our knowledge, no other libraries fully implement the above methods in PyTorch, utilizing GPU-acceleration. Additionally, TorchUnmix has extended all calculations used to perform the automatic stain unmixing and augmentation to operate on batches of images, drastically accelerating execution performance speeds in comparison to other libraries." - link: - poster_link: https://pytorch.s3.amazonaws.com/posters/ptc2022/B01.pdf - section: F8 - thumbnail_link: https://pytorch.org/assets/images/ptc2022/B01-thumb.png - title: "TorchUnmix: Automatic Stain Unmixing and Augmentation for Histopathology Images in PyTorch" - -- authors: - - Kai Fricke - - Balaji Veeramani - categories: - - LIBRARIES - description: "Scaling machine learning is hard: Cloud platform solutions like SageMaker can limit flexibility, but a custom distributed framework is often too hard to implement. 
In effect, ML engineers struggle to scale their workloads from local prototyping to the cloud. \n The Ray AI Runtime ('Ray AIR') is an integrated collection of machine learning libraries built around distributed computing framework Ray. It provides an easy to use interface for scalable data processing, training, tuning, batch prediction, and online serving. Adapting existing PyTorch training loops to Ray AIR's PyTorch integration needs as little as 10 lines of code changes. And scaling from local development to the cloud needs no code changes at all." - link: - poster_link: https://pytorch.s3.amazonaws.com/posters/ptc2022/B02.pdf - section: F8 - thumbnail_link: https://pytorch.org/assets/images/ptc2022/B02-thumb.png - title: "Scalable Training and Inference With Ray AIR" - -- authors: - - Jan Hückelheim - categories: - - LIBRARIES - description: "Mixed Mode autodiff combines back-propagation and forward differentiation. Both modes have pros and cons: Back-propagation is efficient for scalar functions with many trainable parameters. Back-propagation uses memory for intermediate results, requires data flow reversal, scales poorly for many output variables. Forward differentiation is straightforward to implement, memory-efficient, and easy to vectorize/parallelize or port to new hardware. Forward mode scales poorly with large number of trainable parameters. AutoMAD makes it possible to combine both modes. Use forward differentiation for some layers, while using back-prop for others." - link: - poster_link: https://pytorch.s3.amazonaws.com/posters/ptc2022/B03.pdf - section: F8 - thumbnail_link: https://pytorch.org/assets/images/ptc2022/B03-thumb.png - title: "AutoMAD: Mixed Mode Autodiff for PyTorch Models" - -- authors: - - Daniel Haziza - - Francisco Massa - - Jeremy Reizenstein - - Patrick Labatut - - Diana Liskovich - categories: - - LIBRARIES - description: "We present xFormers, a toolbox to accelerate research on Transformers. It contains efficient components, like an exact memory-efficient multi-head attention that can accelerate trainings 2x while using a fraction of the memory. xFormers components are also customizable and can be combined together to build variations of Transformers. Our hope is to enable the next generation of research based on Transformers." - link: - poster_link: https://pytorch.s3.amazonaws.com/posters/ptc2022/B04.pdf - section: F8 - thumbnail_link: https://pytorch.org/assets/images/ptc2022/B04-thumb.png - title: "xFormers: Building Blocks for Efficient Transformers" - -- authors: - - Max Balandat - categories: - - LIBRARIES - description: "linear_operator (https://github.com/cornellius-gp/linear_operator) is a library for structured linear algebra built on PyTorch. It provides a LinearOperator class that represents a tensor that is never instantiated but is instead accessed through operations like matrix multiplication, solves, decompositions, and indexing. These objects use custom linear algebra operations that can exploit particular matrix structure (e.g. diagonal, block-diagonal, triangular, Kronecker, etc.) in computations in order to achieve substantial (many orders of magnitude) improvements in time and memory complexity. Moreover, many efficient linear algebra operations (e.g. solves, decompositions, indexing, etc.) can be automatically generated from the LinearOperator's matmul function. This makes it extremely easy to compose or implement custom LinearOperators. 
\n The key aspect that makes linear_operator easy to use in PyTorch code is its integration with the `__torch_function__` interface - Common linear algebra operations (such as matrix multiplication, solve, SVD) are mapped to the respective torch functions (`__matmul__`, `torch.linalg.solve`, `torch.linalg.svd`), so that LinearOperator objects can be used as drop-in replacements for dense tensors even in existing code. LinearOperator operations themselves may return LinearOperator objects, automatically keeping track of algebraic structure after each computation. As a result, users never need to reason about what efficient linear algebra routines to use (so long as the input elements defined by the user encode known input structure)." - link: - poster_link: https://pytorch.s3.amazonaws.com/posters/ptc2022/B05.pdf - section: F8 - thumbnail_link: https://pytorch.org/assets/images/ptc2022/B05-thumb.png - title: "linear_operator - Structured Linear Algebra in PyTorch" - -- authors: - - Justin Zhao - categories: - - LIBRARIES - description: "Ludwig is a declarative machine learning framework that makes it easy to define and compare machine learning pipelines using a simple and flexible data-driven configuration system. The minimal configuration declares the input and output features with their respective data types. Users can specify additional parameters to preprocess, encode, and decode features, load from pre-trained models, compose the internal model architecture, set training parameters, or run hyperparameter optimization. Ludwig will build an end-to-end machine learning pipeline automatically, using whatever is explicitly specified in the configuration, while falling back to smart defaults for any parameters that are not. Scientists, engineers, and researchers use Ludwig to explore state-of-the-art model architectures, run hyperparameter search, and scale up to larger than available memory datasets and multi-node clusters, on a variety of problems using structured and unstructured features. Ludwig has 8.5K+ stars on Github and is built on top of PyTorch, Horovod, and Ray." - link: - poster_link: https://pytorch.s3.amazonaws.com/posters/ptc2022/B06.pdf - section: F8 - thumbnail_link: https://pytorch.org/assets/images/ptc2022/B06-thumb.png - title: "Declarative Machine Learning with Ludwig: End-to-end Machine Learning Pipelines Using Simple and Flexible Data-driven Configurations" - -- authors: - - Christian Puhrsch - categories: - - LIBRARIES - description: "This poster presents an overview of available and ongoing developments related to sparse memory formats, masked computation, and support for collections of variably shaped data. In particular it contains a case study of block sparse memory formats, MaskedTensor, and NestedTensor." - link: - poster_link: https://pytorch.s3.amazonaws.com/posters/ptc2022/B07.pdf - section: F8 - thumbnail_link: https://pytorch.org/assets/images/ptc2022/B07-thumb.png - title: "Generalized Shapes: Block Sparsity, MaskedTensor, NestedTensor" - -- authors: - - Sang Keun Choe - - categories: - - LIBRARIES - description: "Betty is a simple, scalable and modular library for generalized meta-learning (GML) and multilevel optimization (MLO), built upon PyTorch, that allows a unified programming interface for a number of GML/MLO applications including few-shot learning, hyperparameter optimization, neural architecture search, data reweighting, and many more. 
The internal autodiff mechanism and the software design of Betty are developed by the novel interpretation of GML/MLO as a dataflow graph." - link: - poster_link: https://pytorch.s3.amazonaws.com/posters/ptc2022/B08.pdf - section: F8 - thumbnail_link: https://pytorch.org/assets/images/ptc2022/B08-thumb.png - title: "Betty: An Automatic Differentiation Library for Generalized Meta Learning" - -- authors: - - Samantha Andow - - Richard Zhou - - Horace He - - Animesh Jain - categories: - - LIBRARIES - description: "Inspired by Google JAX, functorch is a library in PyTorch that offers composable vmap (vectorization) and autodiff transforms (grad, vjp, jvp). Since its first release alongside PyTorch 1.11, combining these transforms has helped users develop and explore new techniques that were previously tricky to write in PyTorch, like Neural Tangent Kernels and non-linear optimizations (see Theseus, also from PyTorch). This will go through some basic usages and highlight some research that leverages functorch." - link: - poster_link: https://pytorch.s3.amazonaws.com/posters/ptc2022/B09.pdf - section: F8 - thumbnail_link: https://pytorch.org/assets/images/ptc2022/B09-thumb.png - title: "Functorch: Composable Function Transforms in PyTorch" - -- authors: - - Patrick Stiller - - Jeyhun Rustamov - - Friedrich Bethke - - Maksim Zhdanov - - Raj Sutarya - - Mahnoor Tanveer - - Karan Shah - - Richard Pausch - - Sunna Torge - - Alexander Debus - - Attila Cangi - - Peter Steinbach - - Michael Bussmann - - Nico Hoffmann - categories: - - LIBRARIES - description: "Our open-source Neural Solvers framework provides data-free ML-based solvers for the study and analysis of phenomena in natural sciences built on top of PyTorch. We were the first to show that certain quantum systems modeled by the 2d Schrödinger equation can be accurately solved while retaining strong scaling. We also developed a novel neural network architecture, GatedPINN [1], introducing adaptable domain decomposition into the training of Physics-informed Neural Networks based on the Mixture-of-Experts paradigm. Distributed large-scale training of our GatedPINN is facilitated by Horovod, resulting in excellent GPU utilization, making Neural Solvers ready for the upcoming exascale era. Upcoming projects involve higher-dimensional problems such as 3d laser systems and coupled models to study the Vlasov-Maxwell system. Further experiments on novel very scalable compute hardware pave the way for applying high-fidelity Neural Solvers to real-world applications such as Inverse Scattering Problems." - link: - poster_link: https://pytorch.s3.amazonaws.com/posters/ptc2022/B10.pdf - section: F8 - thumbnail_link: https://pytorch.org/assets/images/ptc2022/B10-thumb.png - title: "Large-Scale Neural Solvers for Partial Differential Equations" - -- authors: - - Haoqi Fan - categories: - - LIBRARIES - description: "PyTorchVideo is the deep learning library for video understanding research in PyTorch. \n" - link: - poster_link: https://pytorch.s3.amazonaws.com/posters/ptc2022/B11.pdf - section: F8 - thumbnail_link: https://pytorch.org/assets/images/ptc2022/B11-thumb.png - title: "PyTorch Video: A Deep Learning Library for Video Understanding" - -- authors: - - Zhihan Fang - categories: - - LIBRARIES - description: "Federated Learning with Differential Privacy has witnessed an increased adoption as one of the most promising ways to train machine learning models while preserving user privacy. 
Existing models in Meta around people attributes are mostly built on traditional centralized machine learning methods. Recently, due to the increasing concerns about user privacy internally and externally, Machine Learning teams at Meta are experiencing either signal loss or restriction on applying new features in models to further improve model performance. In this paper, we are introducing a generic framework we built for preparing and generating models for federated learning. The model preparation process is to utilize traditional machine learning to understand model structure and hyperparameters for the target problems including training, inference, evaluations. It also requires a simulation process to train the target model structure and understand the simulated environment on the server side to tune FL specific hyperparameters. \n The model generation process is to generate device compatible models, which can be used directly on users’ devices for federated learning. We applied the FL framework on our on-device models, and integrated with device signals to improve user experience and protect user privacy." - link: - poster_link: https://pytorch.s3.amazonaws.com/posters/ptc2022/B12.pdf - section: F8 - thumbnail_link: https://pytorch.org/assets/images/ptc2022/B12-thumb.png - title: "Model Preparation Federated Learning and Device Computation" - -- authors: - - Jose Gallego-Posada - - Juan Camilo Ramirez - categories: - - LIBRARIES - description: "Cooper (https://github.com/cooper-org/cooper) is a general-purpose, deep learning-first constrained optimization library in PyTorch. Cooper is (almost!) seamlessly integrated with PyTorch and preserves the usual loss backward step workflow. If you are already familiar with PyTorch, using Cooper will be a breeze! \n This library aims to encourage and facilitate the study of constrained optimization problems in deep learning. Cooper focuses on non-convex constrained optimization problems for which the loss or constraints are not necessarily “nicely behaved” or “theoretically tractable”. Moreover, Cooper has been designed to play nicely with mini-batched/stochastic estimates for the objective and constraint functions. \n Cooper implements several popular constrained optimization protocols so you can focus on your project, while we handle the nitty-gritty behind the scenes." - link: https://github.com/cooper-org/cooper - poster_link: https://pytorch.s3.amazonaws.com/posters/ptc2022/B13.pdf - section: F8 - thumbnail_link: https://pytorch.org/assets/images/ptc2022/B13-thumb.png - title: "Constrained Optimization in PyTorch With Cooper" - -- authors: - - Wanchao Liang - - Junjie Wang - categories: - - LIBRARIES - description: "This talk will introduce 2-dimensional parallelism with PyTorch (Data Parallelism + Tensor Parallelism) using Distributed Tensor, a fundamental distributed primitive offered by PyTorch Distributed that empowers Tensor Parallelism. We have proven that using FSDP + Tensor Parallelism together could enable us to train large models like Transformer, and increase training performance. We offer end to end training techniques that enable you to train models in 2-D parallelism fashion, and checkpoint save/load models in a distributed manner." 
- link: - poster_link: https://pytorch.s3.amazonaws.com/posters/ptc2022/B14.pdf - section: F8 - thumbnail_link: https://pytorch.org/assets/images/ptc2022/B14-thumb.png - title: "Two Dimensional Parallelism Using Distributed Tensors" - -- authors: - - Manu Joseph - categories: - - LIBRARIES - description: "In spite of showing unreasonable effectiveness in modalities like text and image, Deep Learning has always lagged Gradient Boosting in tabular data- both in popularity and performance. But recently there have been newer models created specifically for tabular data, which is pushing the performance bar. Popularity is still a challenge, however, because there is no easy, ready-to-use library like Sci-Kit Learn for deep learning. PyTorch Tabular aims to change that by being an easy-to-use and flexible framework which makes using SOTA model architectures in tabular data as easy as Sci-Kit Learn." - link: - poster_link: https://pytorch.s3.amazonaws.com/posters/ptc2022/B15.pdf - section: F8 - thumbnail_link: https://pytorch.org/assets/images/ptc2022/B15-thumb.png - title: "PyTorch Tabular: A Framework for Deep Learning with Tabular Data" - -- authors: - - Michael Gschwind - - Christian Puhrsch - - Driss Guessous - - Rui Zhu - - Daniel Haziza - - Francisco Massa - categories: - - LIBRARIES - description: "We introduce Better Transformer, the PyTorch project to accelerate Transformers for inference and training with out-of-the-box enablement by implementing the Better Transformer ‘fastpath’. Fastpath accelerates many of the most commonly executed functions in Transformer models. Starting with PyTorch 1.13, the PyTorch Core API is implemented with accelerated operations to deliver up to 2x-4x speedups on many Transformer models, such as BERT and XLM-R. Accelerated operations are based on (1) operator and kernel fusion and (2) exploiting sparsity created by variable sequence-length NLP batches. In addition to improving MultiHeadAttention with fastpath, the model also includes sparsity support for MultiHeadAttention and TransformerEncoder modules to take advantage of variable sequence-length information with Nested Tensors for NLP models. \n At present, we enable torchtext and Hugging Face domain libraries with Better Transformer, delivering significant speedups for text, image, and audio models. Starting with the next release, PyTorch core will include even faster fused kernels and training support. You can preview these features today with PyTorch Nightlies, the nightly preview builds of the upcoming PyTorch release." - link: - poster_link: https://pytorch.s3.amazonaws.com/posters/ptc2022/B17.pdf - section: F8 - thumbnail_link: https://pytorch.org/assets/images/ptc2022/B17-thumb.png - title: "Better Transformer: Accelerating Transformer Inference in PyTorch" - -- authors: - - Ke Wen - - Pavel Belevich - - Anjali Sridhar - categories: - - LIBRARIES - description: "PiPPy is a library that provides automated pipeline parallelism for PyTorch models. With compiler techniques, PiPPy splits a model into pipeline stages without requiring model changes. PiPPy also provides a distributed runtime that distributes the split stages to multiple devices and hosts and orchestrates micro-batch execution in an overlapped fashion. We demonstrate application of PiPPy to Hugging Face models achieving 3x speedup on cloud platforms." 
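For the Better Transformer entry above, here is a minimal sketch of the out-of-the-box fastpath (assuming PyTorch 1.13+; the model size, sequence length, and padding mask are arbitrary placeholders rather than the configurations benchmarked in the poster):

```python
import torch
import torch.nn as nn

# Fastpath conditions: batch_first layers, eval mode, and inference (no autograd).
layer = nn.TransformerEncoderLayer(d_model=512, nhead=8, batch_first=True)
encoder = nn.TransformerEncoder(layer, num_layers=6, enable_nested_tensor=True)
encoder.eval()

src = torch.rand(32, 128, 512)                        # (batch, seq, feature)
padding_mask = torch.zeros(32, 128, dtype=torch.bool)
padding_mask[:, 100:] = True                          # mark padded positions

with torch.inference_mode():
    out = encoder(src, src_key_padding_mask=padding_mask)
print(out.shape)  # torch.Size([32, 128, 512])
```

When these conditions are not met, execution silently falls back to the standard implementation, so the sketch can be dropped into existing inference code without behavioral changes.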
- link: - poster_link: https://pytorch.s3.amazonaws.com/posters/ptc2022/B18.pdf - section: F8 - thumbnail_link: https://pytorch.org/assets/images/ptc2022/B18-thumb.png - title: "PiPPy: Automated Pipeline Parallelism for PyTorch" - -- authors: - - Keita Watanabe - categories: - - OPTIMIZATION - description: "In this session we will go through step-by-step how to conduct the inference process of machine learning models using Inferentia. In addition, we compare the inference performance with GPU and discuss the cost advantage. In the later part of the session, we will also cover model deployment on Kubernetes." - link: - poster_link: https://pytorch.s3.amazonaws.com/posters/ptc2022/C01.pdf - section: F8 - thumbnail_link: https://pytorch.org/assets/images/ptc2022/C01-thumb.png - title: "Practical Guide on PyTorch Inference Using AWS Inferentia" - -- authors: - - Mingfei Ma - - - categories: - - OPTIMIZATION - description: "Accelerating PyG CPU performance with faster sparse aggregation.\nPyG is a library built upon PyTorch to easily write and train Graph Neural Networks, which heavily relies on the mechanism of Message Passing for information aggregation. We have optimized critical bottlenecks of Message Passing from PyTorch, including: 1. Scatter Reduce: maps to classic PyG use case when the EdgeIndex is stored in COO memory format. 2. SpMM Reduce: maps to the usage case when the EdgeIndex is stored in CSR memory format." - link: - poster_link: https://pytorch.s3.amazonaws.com/posters/ptc2022/C02.pdf - section: F8 - thumbnail_link: https://pytorch.org/assets/images/ptc2022/C02-thumb.png - title: "PyG Performance Optimization for CPU" - -- authors: - - Jerry Zhang - categories: - - OPTIMIZATION - description: "Currently, PyTorch Architecture Optimization (torch.ao) offers two quantization flow tools: eager mode quantization (beta) and fx graph mode quantization (prototype). With PyTorch 2.0 coming up, we are going to redesign quantization on top of the PyTorch 2.0 export path, this talk will introduce our plans for supporting quantization in PyTorch 2.0 export path, its main advantages over the previous tools, and how modeling developers and backend developers will be interacting with this flow." - link: - poster_link: https://pytorch.s3.amazonaws.com/posters/ptc2022/C03.pdf - section: F8 - thumbnail_link: https://pytorch.org/assets/images/ptc2022/C03-thumb.png - title: "Quantization in PyTorch 2.0 Export" - -- authors: - - Naren Dasan - - Dheeraj Peri - - Bo Wang - - Apurba Bose - - George Stefanakis - - Nick Comly - - Wei Wei - - Shirong Wu - - Yinghai Lu - categories: - - OPTIMIZATION - description: "Torch-TensorRT is an open-source compiler targeting NVIDIA GPUs for high-performance deep-learning inference in PyTorch. It combines the usability of PyTorch with the performance of TensorRT allowing for easy optimization of inference workloads on NVIDIA GPUs. Torch-TensorRT supports all classes of optimizations in TensorRT including reduced mixed precision down to INT8, through simple Python & C++ APIs designed to work directly from PyTorch. Torch-TensorRT outputs standard PyTorch modules as well as the TorchScript format to allow for a completely self-contained, portable, & static module with TensorRT engines embedded. 
We present recent improvements to Torch-TensorRT including the new FX frontend which allows developers to use a full Python workflow for optimizing models and extend Torch-TensorRT in Python, the unified Torch-TensorRT Runtime which enables hybrid FX + TorchScript workflows and discuss future work for the project." - link: - poster_link: https://pytorch.s3.amazonaws.com/posters/ptc2022/C04.pdf - section: F8 - thumbnail_link: https://pytorch.org/assets/images/ptc2022/C04-thumb.png - title: "Torch-TensorRT: A Compiler for Accelerating PyTorch Inference Using TensorRT" - -- authors: - - Sanchit Jain - categories: - - OPTIMIZATION - description: "The open-source oneDNN Graph library extends oneDNN with a flexible graph API to maximize the optimization opportunities for generating efficient code on AI hardware (currently x86-64 CPUs, but GPU support is on the way). It automatically identifies the graph partitions to be accelerated via fusion. Its fusion patterns entail fusing compute-intensive operations such as convolution, matmul and their neighbor operations for both inference and training use cases. Since PyTorch 1.12, oneDNN Graph has been supported as an experimental feature to speed up inference with Float32 datatype on x86-64 CPUs. Support for inference with oneDNN Graph using BFloat16 datatype exists in the PyTorch master branch, and hence also in nightly PyTorch releases. Intel Extension for PyTorch is an open-source library that builds on top of PyTorch, and can be thought of as a 'staging-ground' for optimizations in PyTorch from Intel. It leverages oneDNN Graph for inference with int8 datatype. This poster presents reproducible results with PyTorch’s TorchBench benchmarking suite to demonstrate the inference speedup achieved with PyTorch & oneDNN Graph using Float32, BFloat16 & int8 datatypes." - link: - poster_link: https://pytorch.s3.amazonaws.com/posters/ptc2022/G01.pdf - section: F8 - thumbnail_link: https://pytorch.org/assets/images/ptc2022/G01-thumb.png - title: "Accelerating Inference with PyTorch by Leveraging Graph Fusions With oneDNN Graph" - -- authors: - - Alban Desmaison - categories: - - OTHER - description: "This poster presents the new extension points that the PyTorch team has designed to allow users to extend PyTorch from Python. We will cover an introduction to Tensor Subclassing, Modes and torch library. We will briefly describe each extension point and talk through examples such as memory profiling, logging used operators, quantization and custom sparse kernel all in less than 100 LOC. We will also introduce the new ways you can add new devices and author kernels without the need to modify PyTorch directly." - link: - poster_link: https://pytorch.s3.amazonaws.com/posters/ptc2022/D01.pdf - section: F8 - thumbnail_link: https://pytorch.org/assets/images/ptc2022/D01-thumb.png - title: "Back to Python: Extending PyTorch Without Touching C++" - -- authors: - - Brian Hirsh - categories: - - OTHER - description: "Functionalization is a way to remove mutations from arbitrary PyTorch programs sent to downstream compilers. The PyTorch 2.0 stack is all about capturing graphs of PyTorch operations and sending them off to a compiler to get better performance. PyTorch programs can mutate and alias state, making them unfriendly to compilers. Functionalization is a technique to take a program full of PyTorch operators, including mutable and aliasing operators, and remove all mutations from the program while preserving semantics." 
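As a conceptual illustration of the functionalization entry above, the following hand-written before/after pair shows what removing mutation while preserving semantics means; it is not the actual compiler pass, and `with_mutation`/`functionalized` are hypothetical toy functions:

```python
import torch

# A small program that mutates through an alias (a view).
def with_mutation(x):
    y = x.view(-1)   # y aliases x
    y.add_(1)        # in-place update, also visible through x
    return x * 2

# A mutation-free rewrite with the same semantics: the update becomes
# out-of-place and the aliased view is re-derived from the new value.
def functionalized(x):
    y = x.reshape(-1) + 1
    x = y.view(x.shape)
    return x * 2

a = torch.zeros(2, 3)
assert torch.equal(with_mutation(a.clone()), functionalized(a))
```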
- link: - poster_link: https://pytorch.s3.amazonaws.com/posters/ptc2022/D02.pdf - section: F8 - thumbnail_link: https://pytorch.org/assets/images/ptc2022/D02-thumb.png - title: "Functionalization in PyTorch" - -- authors: - - Pankaj Takawale - - Dagshayani Kamalaharan - - Zbigniew Gasiorek - - Rahul Sharnagat - categories: - - OTHER - description: "Walmart Search has embarked on the journey of adopting Deep Learning in the Search ecosystem for improving Search relevance in various parts. As our pilot use case, we wanted to serve the computationally intensive Bert Base model at runtime with an objective to achieve low latency and high throughput. We had JVM hosted web applications loading and serving multiple models. The experimental models were being loaded onto the same applications. These models are large in size and computation is expensive. \n We were facing the following limitations with this approach: Refreshing model with the latest version or adding new experimental model would need application deployment. Increased memory pressure on a single application. Slow startup time due to loading multiple ML models during startup. Concurrency was not beneficial due to limited CPU (Metrics on concurrent model prediction vs sequential)." - link: - poster_link: https://pytorch.s3.amazonaws.com/posters/ptc2022/D03.pdf - section: F8 - thumbnail_link: https://pytorch.org/assets/images/ptc2022/D03-thumb.png - title: "Walmart Search: Serving Models at a Scale on TorchServe" - -- authors: - - Joe Doliner - - Jimmy Whitaker - categories: - - PRODUCTION - description: "TorchX is incredibly useful for developing PyTorch applications quickly. But when it comes to deployment, nothing is easy. With docker development, Kubernetes, and customer schedulers, there’s a lot to learn. In this talk, we’ll discuss how organizations can deploy to production, why TorchX is a great system for this, and lessons we learned so you can avoid hitting them too." - link: - poster_link: https://pytorch.s3.amazonaws.com/posters/ptc2022/E01.pdf - section: F8 - thumbnail_link: https://pytorch.org/assets/images/ptc2022/E01-thumb.png - title: "TorchX: From Local Development to Kubernetes and Back" - -- authors: - - Shauheen Zahirazami - - Jack Cao - - Blake Hechtman - - Alex Wertheim - - Ronghang Hu - categories: - - PRODUCTION - description: "PyTorch/XLA enables PyTorch users to run their models on XLA devices including Google's Cloud TPUs. The latest improvements in PyTorch/XLA enables training PyTorch models using FSDP to train very large models. In this work we present benchmarks and Hardware Flops Utilization of training HuggingFace GPT-2 on Cloud TPU v4." - link: - poster_link: https://pytorch.s3.amazonaws.com/posters/ptc2022/E02.pdf - section: F8 - thumbnail_link: https://pytorch.org/assets/images/ptc2022/E02-thumb.png - title: "Training at Scale Using Fully Sharded Data Parallel (FSDP) with PyTorch/XLA" - -- authors: - - Rohan Varma - - Andrew Gu - categories: - - PRODUCTION - description: "This talk dives into recent advances in PyTorch Fully Sharded Data Parallel (FSDP) that have enabled better throughput, memory savings, and extensibility. These improvements have unblocked using FSDP for models of different modalities and for varying model and data sizes. We will share best practices to apply these features to specific use cases such as XLMR, FLAVA, ViT, DHEN, and GPT3-style models." 
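A minimal sketch of the FSDP wrapping pattern referenced in the FSDP entries above, assuming a `torchrun` launch on one or more GPUs; the toy model, sizes, and hyperparameters are placeholders, not the production setups discussed in the posters:

```python
import torch
import torch.distributed as dist
from torch.distributed.fsdp import FullyShardedDataParallel as FSDP

def main():
    # expects the environment variables set by `torchrun --nproc_per_node=<gpus>`
    dist.init_process_group("nccl")
    torch.cuda.set_device(dist.get_rank() % torch.cuda.device_count())

    # toy stand-in for a large stack of transformer blocks
    model = torch.nn.Sequential(*[torch.nn.Linear(1024, 1024) for _ in range(8)]).cuda()
    model = FSDP(model)  # parameters, gradients and optimizer state are sharded across ranks

    optim = torch.optim.AdamW(model.parameters(), lr=1e-4)
    for _ in range(10):
        x = torch.randn(32, 1024, device="cuda")
        loss = model(x).pow(2).mean()
        loss.backward()
        optim.step()
        optim.zero_grad()

if __name__ == "__main__":
    main()
```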
- link: - poster_link: https://pytorch.s3.amazonaws.com/posters/ptc2022/E03.pdf - section: F8 - thumbnail_link: https://pytorch.org/assets/images/ptc2022/E03-thumb.png - title: "FSDP Production Readiness" - -- authors: - - Erwin Huizenga - - Nikita Namjoshi - categories: - - PRODUCTION - description: "TorchX is a universal job launcher for PyTorch applications that helps ML practitioners speed up iteration time and support end-to-end production. In this talk, we show you how to build and run TorchX components as a pipeline using the Kubeflow Pipelines (KFP) DSL. We go into detail on how to use KFP and TorchX to build components and how to use the KFP DSL to orchestrate and run ML workflows." - link: - poster_link: https://pytorch.s3.amazonaws.com/posters/ptc2022/E04.pdf - section: F8 - thumbnail_link: https://pytorch.org/assets/images/ptc2022/E04-thumb.png - title: "Orchestrating PyTorch Workflows With Kubeflow Pipelines and TorchX" - -- authors: - - Shauheen Zahirazami - - James Rubin - - Mehdi Amini - - Thea Lamkin - - Eugene Burmako - - Navid Khajouei - categories: - - PRODUCTION - description: "ML development is often stymied by incompatibilities between frameworks and hardware, forcing developers to compromise on technologies when building ML solutions. OpenXLA is a community-led and open-source ecosystem of ML compiler and infrastructure projects being co-developed by AI/ML leaders including Alibaba, Amazon Web Services, AMD, Arm, Apple, Google, Intel, Meta, NVIDIA, and more. It will address this challenge by letting ML developers build their models on leading frameworks and execute them with high performance across any hardware backend. This flexibility will let developers make the right choice for their project, rather than being locked into decisions by closed systems. Our community will start by collaboratively evolving the XLA compiler and StableHLO, a portable ML compute operation set that makes frameworks easier to deploy across different hardware options." - link: - poster_link: https://pytorch.s3.amazonaws.com/posters/ptc2022/H01.pdf - section: F8 - thumbnail_link: https://pytorch.org/assets/images/ptc2022/H01-thumb.png - title: "A Community-led and OSS Ecosystem of ML Compiler and Infrastructure Projects" - -- authors: - - Mao Lin - - Keren Zhou - - Penfei Su - categories: - - TOOLS - description: "The limited GPU memory resources can often hinder the performance of GPU-accelerated applications. While PyTorch’s Caching Allocator aims to minimize the number of expensive memory allocations and deallocations and maximize the efficient utilization of GPU memory resources, our study of common deep learning models revealed significant memory fragmentation problems. In some cases, up to 50% of GPU memory is wasted. To better understand the root causes of memory fragmentation, we developed a tool that visualizes GPU memory usage in two ways: the allocator view and the block view. The allocator view presents memory usage with each allocation or deallocation event, and the block view shows the changes in specific memory blocks over time. Our analysis revealed the considerable potential to save GPU memory, which would relieve the bottleneck of limited resources. By employing strategies such as swapping, activation recomputation, and memory defragmentation, we were able to reduce GPU memory waste significantly."
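A rough way to observe the reserved-versus-allocated gap that the GPU memory poster above visualizes, using only standard `torch.cuda` introspection calls; this is a crude proxy for fragmentation, not the poster's allocator-view/block-view tool:

```python
import torch

def fragmentation_report(device: int = 0) -> None:
    """Print memory the caching allocator holds but that no live tensor is using."""
    allocated = torch.cuda.memory_allocated(device)
    reserved = torch.cuda.memory_reserved(device)
    wasted = reserved - allocated
    print(f"allocated:         {allocated / 2**20:8.1f} MiB")
    print(f"reserved:          {reserved / 2**20:8.1f} MiB")
    print(f"cached-but-unused: {wasted / 2**20:8.1f} MiB "
          f"({100 * wasted / max(reserved, 1):.1f}% of reserved)")
    # Per-segment detail, loosely similar in spirit to a "block view":
    for seg in torch.cuda.memory_snapshot()[:3]:
        print(seg["device"], seg["total_size"], seg["allocated_size"])

x = torch.randn(4096, 4096, device="cuda")
del x  # the freed block stays cached by the allocator
fragmentation_report()
```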
- link: - poster_link: https://pytorch.s3.amazonaws.com/posters/ptc2022/F01.pdf - section: F8 - thumbnail_link: https://pytorch.org/assets/images/ptc2022/F01-thumb.png - title: "Squeezing GPU Memory Usage in PyTorch" - -- authors: - - Mohamed Masoud - - Farfalla Hu - - Sergey Plis - categories: - - TOOLS - description: "In brainchop project, we bring high fidelity pre-trained deep learning models for volumetric analysis of structural magnetic resonance imaging (MRI) right to the browsers of scientists and clinicians with no requirement on their technical skills in setting up AI-solutions. All of this in an extensible open-source framework. Our tool is the first front-end MRI segmentation tool on the web that supports full brain volumetric processing in a single pass inside a browser. This property is powered by our lightweight and reliable deep learning model Meshnet that enables volumetric processing of the entire brain at once, which leads to increased accuracy with modest computational requirements. High-quality client-side processing solves the privacy problem, as the data does not need to leave the client. Moreover, browser-based implementation is able to take advantage of available hardware acceleration regardless of the brand or architecture.\n GitHub: https://github.com/neuroneural/brainchop" - link: https://github.com/neuroneural/brainchop - poster_link: https://pytorch.s3.amazonaws.com/posters/ptc2022/F02.pdf - section: F8 - thumbnail_link: https://pytorch.org/assets/images/ptc2022/F02-thumb.png - title: "'Brainchop': In Browser MRI Volumetric Segmentation and Rendering" - -- authors: - - Xu Zhao - - Will Constable - - David Berard - - Taylor Robie - - Eric Han - - Adnan Aziz - categories: - - TOOLS - description: "Holding the line of performance is challenging for ML frameworks like PyTorch. The existing AI benchmarks like MLPerf are end-to-end, therefore require large volumes of datasets, at-scale GPU clusters, and long benchmarking time. We develop TorchBench, a novel AI benchmark suite which highlights with minimal data inputs, single GPU, and milliseconds-per-test latencies. TorchBench is now deployed as part of the PyTorch nightly release process, guarding performance/correctness regressions and testing experimental PyTorch features on SOTA machine learning models." - link: - poster_link: https://pytorch.s3.amazonaws.com/posters/ptc2022/F03.pdf - section: F8 - thumbnail_link: https://pytorch.org/assets/images/ptc2022/F03-thumb.png - title: "TorchBench: Quantifying PyTorch Performance During the Development Loop" - -- authors: - - Gustaf Ahdritz - - Sachin Kadyan - - Will Gerecke - - Luna Xia - - Nazim Bouatta - - Mohammed AlQuraishi - categories: - - TOOLS - description: "OpenFold, developed by Columbia University, is an open-source protein structure prediction model implemented with PyTorch. The goal of OpenFold is to verify that AlphaFold 2 — DeepMind's protein structure prediction model — can be reproduced from scratch and beyond that, make components of the system available to like-minded researchers and academics so they can build on top of it. During this research, Weights & Biases was used to accelerate OpenFold’s reproduction of AlphaFold 2. The collaborative nature of W&B allowed for insights to scale from a single researcher to the entire team and helped solve the reproducibility challenge in ML." 
- link: - poster_link: https://pytorch.s3.amazonaws.com/posters/ptc2022/F04.pdf - section: F8 - thumbnail_link: https://pytorch.org/assets/images/ptc2022/F04-thumb.png - title: "Democratizing AI for Biology With OpenFold" diff --git a/_data/ecosystem/ptdd/2021/posters.yaml b/_data/ecosystem/ptdd/2021/posters.yaml deleted file mode 100644 index a26264ec07f4..000000000000 --- a/_data/ecosystem/ptdd/2021/posters.yaml +++ /dev/null @@ -1,719 +0,0 @@ -- authors: - - Brian Hu - - Paul Tunison - - Elim Schenck - - Roddy Collins - - Anthony Hoogs - categories: - - MEDICAL & HEALTHCARE, RESPONSIBLE AI - description: "Despite significant progress in the past few years, machine learning-based systems are still often viewed as “black boxes,” which lack the ability to explain their output decisions to human users. Explainable artificial intelligence (XAI) attempts to help end-users understand and appropriately trust machine learning-based systems. One commonly used technique involves saliency maps, which are a form of visual explanation that reveals what an algorithm pays attention to during its decision process. We introduce the xaitk-saliency python package, an open-source, explainable AI framework and toolkit for visual saliency algorithm interfaces and implementations, built for analytics and autonomy applications. The framework is modular and easily extendable, with support for several image understanding tasks, including image classification, image similarity, and object detection. We have also recently added support for the autonomy domain, by creating saliency maps for pixel-based deep reinforcement-learning agents in environments such as ATARI. Several example notebooks are included that demo the current capabilities of the toolkit. xaitk-saliency will be of broad interest to anyone who wants to deploy AI capabilities in operational settings and needs to validate, characterize and trust AI performance across a wide range of real-world conditions and application areas using saliency maps. To learn more, please visit: https://github.com/XAITK/xaitk-saliency." - link: https://github.com/XAITK/xaitk-saliency - poster_link: https://s3.amazonaws.com/assets.pytorch.org/ptdd2021/posters/F8.png - section: F8 - thumbnail_link: https://s3.amazonaws.com/assets.pytorch.org/ptdd2021/posters/F8-thumb.png - title: "xaitk-saliency: Saliency built for analytics and autonomy applications" -# - authors: -# - Ali Hatamizadeh -# - Yucheng Tang -# - Vishwesh Nath -# - Dong Yang -# - Holger Roth -# - Bennett Landman -# - Daguang Xu -# categories: -# - MEDICAL & HEALTHCARE, RESPONSIBLE AI -# description: "A novel transformer-based architecture, dubbed UNETR, for semantic segmentation of volumetric medical images by reformulating this task as a 1D sequence-to-sequence prediction problem. Using a transformer encoder increases the model's ability to learn long-range dependencies and effectively captures global contextual representation at multiple scales. The effectiveness of UNETR has been validated on different volumetric segmentation tasks in CT and MRI modalities. UNETR achieves new state-of-the-art performance in both Standard and Free Competitions on the BTCV leaderboard for the multi-organ segmentation and outperforms competing approaches for brain tumor and spleen segmentation on the MSD dataset. 
UNETR has shown the potential to effectively learn the critical anatomical relationships represented in medical images" -# link: -# poster_link: https://s3.amazonaws.com/assets.pytorch.org/ptdd2021/posters/F7.png -# section: F7 -# thumbnail_link: https://s3.amazonaws.com/assets.pytorch.org/ptdd2021/posters/F7-thumb.png -# title: "UNETR: Transformers for 3D Medical Image Segmentation" -- authors: - - Laila Rasmy - - Ziqian Xie - - Bingyu Mao - - Khush Patel - - Wanheng Zhang - - Degui Zhi - categories: - - MEDICAL & HEALTHCARE, RESPONSIBLE AI - description: "CovRNN is a collection of recurrent neural network (RNN)-based models to predict COVID-19 patients' outcomes, using their available electronic health record (EHR) data on admission, without the need for specific feature selection or missing data imputation. CovRNN is designed to predict three outcomes: in-hospital mortality, need for mechanical ventilation, and long length of stay (LOS >7 days). Predictions are made for time-to-event risk scores (survival prediction) and all-time risk scores (binary prediction). Our models were trained and validated using heterogeneous and de-identified data of 247,960 COVID-19 patients from 87 healthcare systems, derived from the Cerner® Real-World Dataset (CRWD) and 36,140 de-identified patients' data derived from the Optum® de-identified COVID-19 Electronic Health Record v. 1015 dataset (2007 - 2020). CovRNN shows higher performance than do traditional models. It achieved an area under the receiver operating characteristic (AUROC) of 93% for mortality and mechanical ventilation predictions on the CRWD test set (vs. 91.5% and 90% for light gradient boost machine (LGBM) and logistic regression (LR), respectively) and 86.5% for prediction of LOS > 7 days (vs. 81.7% and 80% for LGBM and LR, respectively). For survival prediction, CovRNN achieved a C-index of 86% for mortality and 92.6% for mechanical ventilation. External validation confirmed AUROCs in similar ranges. https://www.medrxiv.org/content/10.1101/2021.09.27.2126" - link: https://github.com/ZhiGroup/CovRNN - poster_link: https://s3.amazonaws.com/assets.pytorch.org/ptdd2021/posters/F6.png - section: F6 - thumbnail_link: https://s3.amazonaws.com/assets.pytorch.org/ptdd2021/posters/F6-thumb.png - title: "CovRNN—A collection of recurrent neural network models for predicting outcomes of COVID-19 patients using their EHR data" -- authors: - - Sanzhar Askaruly - - Nurbolat Aimakov - - Alisher Iskakov - - Hyewon Cho - - Yujin Ahn - - Myeong Hoon Choi - - Hyunmo Yang - - Woonggyu Jung - categories: - - MEDICAL & HEALTHCARE, RESPONSIBLE AI - description: "Deep learning has transformed many aspects of industrial pipelines recently. Scientists involved in biomedical imaging research are also benefiting from the power of AI to tackle complex challenges. Although the academic community has widely accepted image processing tools, such as scikit-image and ImageJ, there is still a need for a tool which integrates deep learning into biomedical image analysis. We propose a minimal but convenient Python package based on PyTorch with common deep learning models, extended by flexible trainers and medical datasets. In this work, we also share a theoretical dive in the form of a course as well as minimal tutorials to run Android applications, containing models trained with Farabio."
- link: https://github.com/tuttelikz/farabio - poster_link: https://s3.amazonaws.com/assets.pytorch.org/ptdd2021/posters/F5.png - section: F5 - thumbnail_link: https://s3.amazonaws.com/assets.pytorch.org/ptdd2021/posters/F5-thumb.png - title: "Farabio - Deep learning for Biomedical Imaging" -- authors: - - Fernando Pérez-García - - Rachel Sparks - - Sébastien Ourselin - categories: - - MEDICAL & HEALTHCARE, RESPONSIBLE AI - description: "Processing of medical images such as MRI or CT presents different challenges compared to RGB images typically used in computer vision: a lack of labels for large datasets, high computational costs, and the need of metadata to describe the physical properties of voxels. Data augmentation is used to artificially increase the size of the training datasets. Training with image patches decreases the need for computational power. Spatial metadata needs to be carefully taken into account in order to ensure a correct alignment and orientation of volumes. We present TorchIO, an open-source Python library to enable efficient loading, preprocessing, augmentation and patch-based sampling of medical images for deep learning. TorchIO follows the style of PyTorch and integrates standard medical image processing libraries to efficiently process images during training of neural networks. TorchIO transforms can be easily composed, reproduced, traced and extended. We provide multiple generic preprocessing and augmentation operations as well as simulation of MRI-specific artifacts.TorchIO was developed to help researchers standardize medical image processing pipelines and allow them to focus on the deep learning experiments. It encourages good open-science practices, as it supports experiment reproducibility and is version-controlled so that the software can be cited precisely. Due to its modularity, the library is compatible with other frameworks for deep learning with medical images." - link: https://github.com/fepegar/torchio/ - poster_link: https://s3.amazonaws.com/assets.pytorch.org/ptdd2021/posters/F4.png - section: F4 - thumbnail_link: https://s3.amazonaws.com/assets.pytorch.org/ptdd2021/posters/F4-thumb.png - title: "TorchIO: Pre-processing & Augmentation of Medical Images for Deep Learning Applications" -- authors: - - Michael Zephyr - - Prerna Dogra - - Richard Brown - - Wenqi Li - - Eric Kerfoot - categories: - - MEDICAL & HEALTHCARE, RESPONSIBLE AI - description: "Healthcare image analysis for both radiology and pathology is increasingly being addressed with deep-learning-based solutions. These applications have specific requirements to support various imaging modalities like MR, CT, ultrasound, digital pathology, etc. It is a substantial effort for researchers in the field to develop custom functionalities to handle these requirements. Consequently, there has been duplication of effort, and as a result, researchers have incompatible tools, which makes it hard to collaborate. MONAI stands for Medical Open Network for AI. Its mission is to accelerate the development of healthcare imaging solutions by providing domain-specialized building blocks and a common foundation for the community to converge in a native PyTorch paradigm." 
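A small TorchIO sketch in the spirit of the TorchIO entry above, with a synthetic volume standing in for a real MRI scan; the transform choices and parameters are arbitrary examples:

```python
import torch
import torchio as tio

# a synthetic 3D volume in (channels, depth, height, width) layout
volume = tio.ScalarImage(tensor=torch.rand(1, 64, 64, 64))
subject = tio.Subject(mri=volume)

transform = tio.Compose([
    tio.RescaleIntensity(out_min_max=(0, 1)),  # preprocessing
    tio.RandomAffine(degrees=10),              # spatial augmentation
    tio.RandomNoise(std=0.05),                 # intensity augmentation
])

augmented = transform(subject)
print(augmented["mri"].data.shape)  # torch.Size([1, 64, 64, 64])
```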
- link: https://monai.io/ - poster_link: https://s3.amazonaws.com/assets.pytorch.org/ptdd2021/posters/F3.png - section: F3 - thumbnail_link: https://s3.amazonaws.com/assets.pytorch.org/ptdd2021/posters/F3-thumb.png - title: "MONAI: A Domain Specialized Library for Healthcare Imaging" -- authors: - - Sahar Karimi - - Beliz Gokkaya - - Audrey Flower - - Ehsan Emamjomeh-Zadeh - - Adly Templeton - - Ilknur Kaynar Kabul - - Erik Meijer - categories: - - MEDICAL & HEALTHCARE, RESPONSIBLE AI - description: "We are presenting a framework for building Bayesian Neural Networks (BNN). One of the critical use cases of BNNs is uncertainty quantification of ML predictions in deep learning models. Uncertainty quantification leads to more robust and reliable ML systems that are often employed to prevent catastrophic outcomes of overconfident predictions especially in sensitive applications such as integrity, medical imaging and treatments, self driving cars, etc.. Our framework provides tools to build BNN models, estimate the uncertainty of their predictions, and transform existing models into their BNN counterparts. We discuss the building blocks and API of our framework along with a few examples and future directions." - link: - poster_link: https://s3.amazonaws.com/assets.pytorch.org/ptdd2021/posters/F2.png - section: F2 - thumbnail_link: https://s3.amazonaws.com/assets.pytorch.org/ptdd2021/posters/F2-thumb.png - title: "A Framework for Bayesian Neural Networks" -# - authors: -# - Pranav Bhamidipati -# - Guruprasad Raghavan -# - Matt Thomson -# categories: -# - MEDICAL & HEALTHCARE, RESPONSIBLE AI -# description: "Biological tissues reliably grow into precise, functional structures from simple starting states during development. Throughout the developmental process, the energy of a tissue changes depending on its natural resistance to deformations such as stretching, bending, shearing, and torsion. In this paper, we represent tissue structures as shapes and develop a mathematical framework using PyTorch's autograd functionality and TorchVision to discover paths on the tissue shape manifold to minimize the total energy during development. We find that paths discovered by gradient descent and the geodesic algorithm outperform naive shape interpolation in energetic terms and resemble strategies observed in development. Broadly, these tools built on PyTorch frameworks can be used to understand and compare shape transformations in biology and propose optimal strategies for synthetic tissue engineering." -# link: -# poster_link: https://s3.amazonaws.com/assets.pytorch.org/ptdd2021/posters/F1.png -# section: F1 -# thumbnail_link: https://s3.amazonaws.com/assets.pytorch.org/ptdd2021/posters/F1-thumb.png -# title: "Traversing Geodesics to Grow Biological Structures" -# - authors: -# - Zhengyang Feng -# - Shaohua Guo -# categories: -# - AUDIO, IMAGE & VIDEO, VISION -# description: "PytorchAutoDrive is an open-source codebase to facilitate autonomous driving research, which focuses on autonomous driving perception tasks. Based on PyTorch and TorchVision, it provides a unified level of tricks for fair evaluation of different methods, beginner-friendly codes, visualization tools, and benchmarking of model speed/flops count. Currently, with PyTorch DDP and AMP, fast training of semantic segmentation and lane detection tasks are supported on 7 datasets, with 9 re-implemented methods. 
With help from the PyTorch developer community, we will support more methods and functionals in the future: https://github.com/voldemortX/pytorch-auto-drive" -# link: -# poster_link: https://s3.amazonaws.com/assets.pytorch.org/ptdd2021/posters/E10.png -# section: E10 -# thumbnail_link: https://s3.amazonaws.com/assets.pytorch.org/ptdd2021/posters/E10-thumb.png -# title: "PytorchAutoDrive: Toolkit & Fair Benchmark for Autonomous Driving Research" -- authors: - - Philip Meier - - torchvision team - - torchdata team - categories: - - AUDIO, IMAGE & VIDEO, VISION - description: "torchvision provides a lot of image and video datasets as well as transformations for research and prototyping. In fact, the very first release of torchvision in 2016 was all about these two submodules. Since their inception, their extent has grown organically and become hard to maintain and sometimes also hard to use. Over the years we have gathered a lot of user feedback and decided to revamp the datasets and transforms. This poster will showcase the current state of the rework and compare it to the hopefully soon-to-be legacy API." - link: https://pytorchvideo.org/ - poster_link: https://s3.amazonaws.com/assets.pytorch.org/ptdd2021/posters/E9-thumb.png - section: E9 - thumbnail_link: https://s3.amazonaws.com/assets.pytorch.org/ptdd2021/posters/E9-thumb.png - title: "Revamp of torchvision datasets and transforms" -- authors: - - Wenwei Zhang - - Han Lyu - - Kai Chen - categories: - - AUDIO, IMAGE & VIDEO, VISION - description: "OpenMMLab builds open-source toolboxes for computer vision. It aims to 1) provide high-quality codebases to reduce the difficulties in algorithm reimplementation; 2) create efficient deployment toolchains targeting a variety of inference engines and devices; 3) build a solid foundation for the community to bridge the gap between academic research and industrial applications. Based on PyTorch, OpenMMLab develops MMCV to provide unified abstract interfaces and common utils, which serve as a foundation of the whole system. Since the initial release in October 2018, OpenMMLab has released 15+ toolboxes covering different research areas. It has implemented 200+ algorithms and released 1800+ pre-trained models. With tighter collaboration with the community, OpenMMLab will open source more toolboxes and full-stack toolchains in the future." - link: openmmlab.com - poster_link: https://s3.amazonaws.com/assets.pytorch.org/ptdd2021/posters/E8.png - section: E8 - thumbnail_link: https://s3.amazonaws.com/assets.pytorch.org/ptdd2021/posters/E8-thumb.png - title: "OpenMMLab: Open-Source Toolboxes for Artificial Intelligence" -# - authors: -# - Ayse Ayyuce Demirbas -# categories: -# - AUDIO, IMAGE & VIDEO, VISION -# description: "A Generative Adversarial Network (GAN) is a powerful architecture to generate realistic images. In this work, we generate new lung adenocarcinoma tissue images with a GAN using the Lung and Colon Cancer Histopathological Images dataset. Additionally, we propose two convolutional neural network models for the generator and discriminator."
-# link: -# poster_link: https://s3.amazonaws.com/assets.pytorch.org/ptdd2021/posters/E7.png -# section: E7 -# thumbnail_link: https://s3.amazonaws.com/assets.pytorch.org/ptdd2021/posters/E7-thumb.png -# title: "Generation of Synthetic Lung Cancer Histopathological Images using Generative Adversarial Networks" -# - authors: -# - Haoqi Fan* -# - Tullie Murrell* -# - Heng Wang† -# - Kalyan Vasudev Alwala† -# - Yanghao Li† -# - Yilei Li† -# - Bo Xiong† -# - Nikhila Ravi -# - Meng Li -# - Haichuan Yang -# - Jitendra Malik -# - Ross Girshick -# - Matt Feiszli -# - Aaron Adcock‡ -# - Wan-Yen Lo‡ -# - Christoph Feichtenhofer‡ -# categories: -# - AUDIO, IMAGE & VIDEO, VISION -# description: "We introduce PyTorchVideo, an open-source deep-learning library that provides a rich set of modular, efficient, and reproducible components for a variety of video understanding tasks, including classification, detection, self-supervised learning, and low-level processing. The library covers a full stack of video understanding tools including multimodal data loading, transformations, and models that reproduce state-of-the-art performance. PyTorchVideo further supports hardware acceleration that enables real-time inference on mobile devices. The library is based on PyTorch and can be used by any training framework; for example, PyTorchLightning, PySlowFast, or Classy Vision. PyTorchVideo is available at https://pytorchvideo.org/." -# link: https://pytorchvideo.org/ -# poster_link: https://s3.amazonaws.com/assets.pytorch.org/ptdd2021/posters/E6.png -# section: E6 -# thumbnail_link: https://s3.amazonaws.com/assets.pytorch.org/ptdd2021/posters/E6-thumb.png -# title: "PyTorchVideo - A Deep Learning Library for Video Understanding" -- authors: - - Siddha Ganju - - Sayak Paul - categories: - - AUDIO, IMAGE & VIDEO, VISION - description: "Floods wreak havoc throughout the world, causing billions of dollars in damages, and uprooting communities, ecosystems and economies. Aligning flood extent mapping with local topography can provide a plan-of-action that the disaster response team can consider. Thus, remote flood level estimation via satellites like Sentinel-1 can prove to be remedial. The Emerging Techniques in Computational Intelligence (ETCI) competition on Flood Detection tasked participants with predicting flooded pixels after training with synthetic aperture radar (SAR) images in a supervised setting. We use a cyclical approach involving three stages: (1) training an ensemble model of multiple UNet architectures with the available high- and low-confidence labeled data and generating pseudo labels (low-confidence labels) on the entire unlabeled test dataset; (2) filtering the generated labels to keep only the high-quality ones; and (3) combining the kept labels with the previously available high-confidence labeled dataset. This assimilated dataset is used for the next round of training ensemble models. This cyclical process is repeated until the performance improvement plateaus. Additionally, we post-process our results with Conditional Random Fields. Our approach sets the second-highest score on the public hold-out test leaderboard for the ETCI competition with 0.7654 IoU. To the best of our knowledge, this is one of the first works to try out semi-supervised learning to improve flood segmentation models."
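A generic sketch of the pseudo-label generation and filtering step described in the flood segmentation entry above; the ensemble `models`, the unlabeled loader, and the thresholds are stand-ins, and the actual competition pipeline differs in detail:

```python
import torch

@torch.no_grad()
def make_pseudo_labels(models, unlabeled_loader, threshold=0.9):
    """Average an ensemble's per-pixel flood probabilities over unlabeled SAR tiles
    and keep only confidently flooded / non-flooded pixels as pseudo labels."""
    kept = []
    for tiles in unlabeled_loader:
        probs = torch.stack([m(tiles).sigmoid() for m in models]).mean(dim=0)
        confident = (probs > threshold) | (probs < 1 - threshold)
        labels = (probs > 0.5).float()
        kept.append((tiles, labels, confident))  # `confident` marks trusted pixels
    return kept
```

The returned tiles and masked labels can then be merged with the original high-confidence training set for the next training round, as the entry describes.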
- link: https://github.com/sidgan/ETCI-2021-Competition-on-FLood-Detection - poster_link: https://s3.amazonaws.com/assets.pytorch.org/ptdd2021/posters/E5.png - section: E5 - thumbnail_link: https://s3.amazonaws.com/assets.pytorch.org/ptdd2021/posters/E5-thumb.png - title: "Flood Segmentation on Sentinel-1 SAR Imagery with Semi-Supervised Learning" -# - authors: -# - Kevin Zakka -# - Andy Zeng -# - Pete Florence -# - Jonathan Tompson -# - Jeannette Bohg -# - Debidatta Dwibedi -# categories: -# - AUDIO, IMAGE & VIDEO, VISION -# description: "We investigate the visual cross-embodiment imitation setting, in which agents learn policies from videos of other agents (such as humans) demonstrating the same task, but with stark differences in their embodiments -- end-effector shape, actions, etc. In this work, we demonstrate that it is possible to automatically discover and learn vision-based reward functions from cross-embodiment demonstration videos that are robust to these differences. Specifically, we present a self-supervised method for Cross-embodiment Inverse Reinforcement Learning (XIRL) that leverages temporal cycle-consistency constraints to learn deep visual embeddings that capture task progression from offline videos of demonstrations across multiple expert agents, each performing the same task differently due to embodiment differences. We show empirically that if the embeddings are aware of task progress, simply taking the negative distance between the current state and goal state in the learned embedding space is useful as a reward for training policies with reinforcement learning. We find our learned reward function not only works for embodiments seen during training, but also generalizes to entirely new embodiments. Additionally, when transferring real-world human demonstrations to a simulated robot, we find that XIRL is more sample efficient than current best methods." -# link: https://x-irl.github.io/ -# poster_link: https://s3.amazonaws.com/assets.pytorch.org/ptdd2021/posters/E4.png -# section: E4 -# thumbnail_link: https://s3.amazonaws.com/assets.pytorch.org/ptdd2021/posters/E4-thumb.png -# title: "XIRL: Cross-embodiment Inverse Reinforcement Learning" -- authors: - - Xiaoyu Liu - - James Wagner - - Roy Fejgin - - Joan Serra - - Santiago Pascual - - Cong Zhou - - Jordi Pons - - Vivek Kumar - categories: - - AUDIO, IMAGE & VIDEO, VISION - description: "Speech enhancement is a fundamental audio processing task that has experienced a radical change with the advent of deep learning technologies. We will overview the main characteristics of the task and the key principles of existing deep learning solutions. We will be presenting the past and present work done by our group with the overall goal of delivering the best possible intelligibility and sound quality. Finally, we will provide our view on the future of speech enhancement and show how our current long-term research aligns with such a view." - link: - poster_link: https://s3.amazonaws.com/assets.pytorch.org/ptdd2021/posters/E3.png - section: E3 - thumbnail_link: https://s3.amazonaws.com/assets.pytorch.org/ptdd2021/posters/E3-thumb.png - title: "Real time Speech Enhancement" -- authors: - - Edgar Riba - - Dmytro Mishkin - - Jian Shi - - Luis Ferraz - categories: - - AUDIO, IMAGE & VIDEO, VISION - description: "Kornia is a differentiable library that allows classical computer vision to be integrated into deep learning models. It consists of a set of routines and differentiable modules to solve generic computer vision problems. 
At its core, the package uses PyTorch as its main backend both for efficiency and to take advantage of the reverse-mode auto-differentiation to define and compute the gradient of complex functions." - link: https://kornia.github.io// - poster_link: https://s3.amazonaws.com/assets.pytorch.org/ptdd2021/posters/E2.png - section: E2 - thumbnail_link: https://s3.amazonaws.com/assets.pytorch.org/ptdd2021/posters/E2-thumb.png - title: "Kornia AI: Low Level Computer Vision for AI" -- authors: - - Daniel Neimark - - Omri Bar - - Maya Zohar - - Dotan Asselmann - categories: - - AUDIO, IMAGE & VIDEO, VISION - description: "This paper presents VTN, a transformer-based framework for video recognition. Inspired by recent developments in vision transformers, we ditch the standard approach in video action recognition that relies on 3D ConvNets and introduce a method that classifies actions by attending to the entire video sequence information. Our approach is generic and builds on top of any given 2D spatial network. In terms of wall runtime, it trains 16.1× faster and runs 5.1× faster during inference while maintaining competitive accuracy compared to other state-of-the-art methods. It enables whole video analysis, via a single end-to-end pass, while requiring 1.5× fewer GFLOPs. We report competitive results on Kinetics-400 and present an ablation study of VTN properties and the trade-off between accuracy and inference speed. We hope our approach will serve as a new baseline and start a fresh line of research in the video recognition domain. Code and models are available at: https://github.com/bomri/SlowFast/blob/master/projects/vtn/README.md . See paper: https://arxiv.org/abs/2102.00719" - link: https://github.com/bomri/SlowFast/blob/master/projects/vtn/README.md - poster_link: https://s3.amazonaws.com/assets.pytorch.org/ptdd2021/posters/E1.png - section: E1 - thumbnail_link: https://s3.amazonaws.com/assets.pytorch.org/ptdd2021/posters/E1-thumb.png - title: "Video Transformer Network" -- authors: - - Dr. Ehsan Saboori - - Dr. Sudhakar Sah - - MohammadHossein AskariHemmat Saad Ashfaq - - Alex Hoffman - - Olivier Mastropietro - - Davis Sawyer - categories: - - PERFORMANCE, PRODUCTION & DEPLOYMENT - description: "The emergence of Deep Neural Networks (DNNs) on embedded and low-end devices holds tremendous potential to expand the adoption of AI technologies to wider audiences. However, making DNNs applicable for inference on such devices using techniques such as quantization and model compression, while maintaining model accuracy, remains a challenge for production deployment. Furthermore, there is a lack of inference engines available in any AI framework to run such low precision networks. Our work presents a novel inference engine and model compression framework that automatically enables PyTorch developers to quantize and run their deep learning models at 2bit and 1bit precision, making them faster, smaller and more energy-efficient in production. DLRT empowers PyTorch developers to unlock advanced AI on low-power CPUs, starting with ARM CPUs and MCUs. This work allows AI researchers and practitioners to achieve 10x faster inference and near-GPU level performance on a fraction of the power and cost." 
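A hedged sketch for the Kornia poster (E2) above of what "differentiable classical computer vision" means in practice; `gaussian_blur2d` is a documented kornia.filters op, while the tensor sizes and the toy loss are purely illustrative.

```python
# Hedged sketch for the Kornia poster (E2) above: a classical vision op used as a
# differentiable PyTorch operation. gaussian_blur2d is a documented kornia.filters op;
# the image size and the toy loss are illustrative only.
import torch
import kornia

img = torch.rand(1, 3, 64, 64, requires_grad=True)      # (B, C, H, W) image batch
blurred = kornia.filters.gaussian_blur2d(img, kernel_size=(5, 5), sigma=(1.5, 1.5))
loss = blurred.mean()                                    # any downstream objective
loss.backward()                                          # gradients flow through the vision op
print(img.grad.shape)                                    # torch.Size([1, 3, 64, 64])
```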
- link: https://github.com/deeplite - poster_link: https://s3.amazonaws.com/assets.pytorch.org/ptdd2021/posters/D7.png - section: D7 - thumbnail_link: https://s3.amazonaws.com/assets.pytorch.org/ptdd2021/posters/D7-thumb.png - title: "DLRT: Ultra Low-Bit Precision Inference Engine for PyTorch on CPU" -- authors: - Adway Dhillo - Nidhin Pattaniyil - categories: - PERFORMANCE, PRODUCTION & DEPLOYMENT - description: "This poster is for a data scientist or ML engineer looking to productionize their PyTorch models. It will cover post-training steps that should be taken to optimize the model, such as quantization and TorchScript. It will also walk the user through packaging and serving the model with Facebook’s TorchServe, and cover the benefits of script mode and the PyTorch JIT. Benefits of TorchServe: high-performance serving, multi-model serving, model versioning for A/B testing, server-side batching, and support for pre- and post-processing" - link: https://pytorch.org/serve/ - poster_link: https://s3.amazonaws.com/assets.pytorch.org/ptdd2021/posters/D6.png - section: D6 - thumbnail_link: https://s3.amazonaws.com/assets.pytorch.org/ptdd2021/posters/D6-thumb.png - title: "Serving PyTorch Models in Production at Walmart Search" -- authors: - Shengyi Huang - Rousslan Fernand Julien Dossa - Chang Ye - Jeff Braga - categories: - PERFORMANCE, PRODUCTION & DEPLOYMENT - description: "CleanRL is an open-source library that provides high-quality single-file implementations of Deep Reinforcement Learning algorithms. It provides a simpler yet scalable development experience by having a straightforward codebase and integrating production tools to help interact and scale experiments. In CleanRL, we put all details of an algorithm into a single file, making these performance-relevant details easier to recognize. Additionally, an experiment tracking feature is available to help log metrics, hyperparameters, videos of an agent's gameplay, dependencies, and more to the cloud. Despite succinct implementations, we have also designed tools to help scale, at one point orchestrating experiments on more than 2000 machines simultaneously via Docker and cloud providers. The source code can be found at https://github.com/vwxyzjn/cleanrl." - link: https://github.com/vwxyzjn/cleanrl/ - poster_link: https://s3.amazonaws.com/assets.pytorch.org/ptdd2021/posters/D5.png - section: D5 - thumbnail_link: https://s3.amazonaws.com/assets.pytorch.org/ptdd2021/posters/D5-thumb.png - title: "CleanRL: high-quality single file implementation of Deep Reinforcement Learning algorithms with research-friendly features" -- authors: - Nidhin Pattaniyil - Reshama Shaikh - categories: - PERFORMANCE, PRODUCTION & DEPLOYMENT - description: "As technology improves, so does the use of deep learning models. Additionally, since the time spent on mobile devices is greater than on desktop, the demand for applications running natively on mobile devices is also high. This demo will go through a complete example of training a deep learning vision classifier on the Food-101 dataset using PyTorch. We then deploy it on web and mobile using TorchServe and PyTorch Mobile."
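A hedged sketch of the post-training steps named in the TorchServe posters (D6/D4) above, dynamic quantization plus TorchScript export; the model choice and file name are placeholders, and the saved file would then be packaged with torch-model-archiver.

```python
# Hedged sketch of the post-training steps named in the TorchServe posters (D6/D4)
# above: dynamic quantization plus TorchScript export. The model and file name are
# placeholders; the saved file can then be packaged with torch-model-archiver.
import torch
import torchvision

model = torchvision.models.resnet18().eval()

# Post-training dynamic quantization of the Linear layers (int8 weights)
quantized = torch.quantization.quantize_dynamic(
    model, {torch.nn.Linear}, dtype=torch.qint8
)

# Compile to TorchScript and serialize for serving
scripted = torch.jit.script(quantized)
scripted.save("model_quantized.pt")
```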
- link: https://github.com/npatta01/pytorch-food - poster_link: https://s3.amazonaws.com/assets.pytorch.org/ptdd2021/posters/D4.png - section: D4 - thumbnail_link: https://s3.amazonaws.com/assets.pytorch.org/ptdd2021/posters/D4-thumb.png - title: "Deploying a Food Classifier on PyTorch Mobile" -# - authors: -# - James McCaffrey -# - Ricky Loynd -# - Amanda Minnich -# - Bryan Xia -# categories: -# - PERFORMANCE, PRODUCTION & DEPLOYMENT -# description: "Anomaly detection using deep neural autoencoder reconstruction error is a well-known and effective technique. Reconstruction error anomaly detection compares X and reconstructed X, and when they differ greatly, X is likely anomalous in some way. Recent research has explored an evolution of the autoencoder reconstruction error technique, called variational autoencoder (VAE) reconstruction probability. Briefly, a source data item X generates an internal (u1, v1) mean and standard deviation (equivalently, variance or log-variance) which define a probability distribution P. The P(u1, v1) distribution determines a Q(u2, v2) distribution which is sampled to generate a reconstructed X. The VAE reconstruction probability technique determines how likely it is that the source item X came from the Q(u2, v2) distribution, and if the likelihood is small, X is tagged as anomalous. Experiments on synthetic datasets suggest that the autoencoder reconstruction error and VAE reconstruction probability techniques identify different types of anomalous data items." -# link: -# poster_link: https://s3.amazonaws.com/assets.pytorch.org/ptdd2021/posters/D3.png -# section: D3 -# thumbnail_link: https://s3.amazonaws.com/assets.pytorch.org/ptdd2021/posters/D3-thumb.png -# title: "Variational Autoencoder Reconstruction Probability Anomaly Detection" -- authors: - - Naren Dasan - - Nick Comly - - Dheeraj Peri - - Anurag Dixit - - Abhiram Iyer - - Bo Wang - - Arvind Sridhar - - Boris Fomitchev - - Josh Park - categories: - - PERFORMANCE, PRODUCTION & DEPLOYMENT - description: "Learn how to accelerate PyTorch inference, from framework, for model deployment. The PyTorch integration for TensorRT makes the performance of TensorRT's GPU optimizations available in PyTorch for any model. We will walk you through how with 3 lines of code you can go from a trained model to optimized TensorRT-embedded TorchScript, ready to deploy to a production environment." - link: https://github.com/NVIDIA/Torch-TensorRT/ - poster_link: https://s3.amazonaws.com/assets.pytorch.org/ptdd2021/posters/D2.png - section: D2 - thumbnail_link: https://s3.amazonaws.com/assets.pytorch.org/ptdd2021/posters/D2-thumb.png - title: "Torch-TensorRT: Accelerating Inference Performance Directly from PyTorch using TensorRT" -- authors: - - Jean Kossaifi - categories: - - PERFORMANCE, PRODUCTION & DEPLOYMENT - description: "Most of the data in modern machine learning (e.g. fMRI, videos, etc) is inherently multi-dimensional and leveraging that structure is crucial for good performance. Tensor methods are the natural way to achieve this and can improve deep learning and enable i) large compression ratios through a reduction of the number of parameters, ii) computational speedups, iii) improved performance and iv) better robustness. The TensorLy project provides the tools to manipulate tensors, including tensor algebra, regression and decomposition. 
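A sketch of the short compile flow described in the Torch-TensorRT poster (D2) above; the `torch_tensorrt` package and API names follow current public releases and are an assumption here, as are the input shape and precision.

```python
# Hedged sketch of the "few lines of code" flow from the Torch-TensorRT poster (D2)
# above; package/API names follow current torch_tensorrt releases and are assumed,
# as are the toy model, input shape and precision.
import torch
import torch_tensorrt

model = torch.nn.Sequential(torch.nn.Conv2d(3, 8, 3), torch.nn.ReLU()).eval().cuda()

trt_module = torch_tensorrt.compile(
    model,
    inputs=[torch_tensorrt.Input((1, 3, 224, 224))],  # static input shape
    enabled_precisions={torch.half},                  # allow FP16 kernels
)
torch.jit.save(trt_module, "trt_module.ts")           # TensorRT-embedded TorchScript
```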
TensorLy-Torch builds on top of this and enables tensor-based deep learning by providing out-of-the-box tensor based PyTorch layers that can be readily combined with any deep neural network architecture and takes care of things such as initialization and tensor dropout." - link: http://tensorly.org/quantum - poster_link: https://s3.amazonaws.com/assets.pytorch.org/ptdd2021/posters/D1.png - section: D1 - thumbnail_link: https://s3.amazonaws.com/assets.pytorch.org/ptdd2021/posters/D1-thumb.png - title: "Tensorized Deep Learning with TensorLy-Torch" -- authors: - - Sergey Kolesnikov - categories: - - EXTENDING PYTORCH, APIs, PARALLEL & DISTRIBUTED TRAINING - description: "Catalyst is a PyTorch framework for Deep Learning Research and Development. It focuses on reproducibility, rapid experimentation, and codebase reuse so you can create something new rather than write yet another train loop." - link: https://catalyst-team.com/ - poster_link: https://s3.amazonaws.com/assets.pytorch.org/ptdd2021/posters/C12.png - section: C12 - thumbnail_link: https://s3.amazonaws.com/assets.pytorch.org/ptdd2021/posters/C12-thumb.png - title: "Catalyst-Accelerated Deep Learning R&D" -- authors: - - Amog Kamsetty - - Richard Liaw - - Will Drevo - - Michael Galarnyk - categories: - - EXTENDING PYTORCH, APIs, PARALLEL & DISTRIBUTED TRAINING - description: "PyTorch Lightning is a library that provides a high-level interface for PyTorch which helps you organize your code and reduce boilerplate. By abstracting away engineering code, it makes deep learning experiments easier to reproduce and improves developer productivity. PyTorch Lightning also includes plugins to easily parallelize your training across multiple GPUs. This parallel training, however, depends on a critical assumption: that you already have your GPU(s) set up and networked together in an efficient way for training. While you may have a managed cluster like SLURM for multi-node training on the cloud, setting up the cluster and its configuration is no easy task. Ray Lightning was created with this problem in mind to make it easy to leverage multi-node training without needing extensive infrastructure expertise. It is a simple and free plugin for PyTorch Lightning with a number of benefits like simple setup, easy scale up, seamless creation of multi-node clusters on AWS/Azure/GCP via the Ray Cluster Launcher, and an integration with Ray Tune for large-scale distributed hyperparameter search and state of the art algorithms" - link: https://github.com/ray-project/ray_lightning - poster_link: https://s3.amazonaws.com/assets.pytorch.org/ptdd2021/posters/C11.png - section: C11 - thumbnail_link: https://s3.amazonaws.com/assets.pytorch.org/ptdd2021/posters/C11-thumb.png - title: "Ray Lightning: Easy Multi-node PyTorch Lightning training" -- authors: - - Jin Howe Teo - - Way Yen Chen - - Najib Ninaba - - Choo Heng Chong Mark - categories: - - EXTENDING PYTORCH, APIs, PARALLEL & DISTRIBUTED TRAINING - description: "Data sits as the centerpiece of any machine learning endeavour, yet in many real-world projects, a single party’s data is often insufficient and needs to be augmented with data from other sources. This is unfortunately easier said than done, as there are many innate concerns (be it regulatory, ethical, commercial etc.) stopping parties from exchanging data. Fortunately, there exists an emerging privacy-preserving machine learning technology called Federated Learning. 
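For the Catalyst poster (C12) above, a minimal train-loop replacement following the library's documented `SupervisedRunner` pattern; the toy model and data are placeholders, not from the poster.

```python
# Hedged sketch for the Catalyst poster (C12) above: the documented SupervisedRunner
# pattern replaces the hand-written train loop; the toy model/data are placeholders.
import torch
from torch.utils.data import DataLoader, TensorDataset
from catalyst import dl

model = torch.nn.Linear(16, 2)
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
loaders = {
    "train": DataLoader(
        TensorDataset(torch.randn(64, 16), torch.randint(0, 2, (64,))), batch_size=8
    ),
}

runner = dl.SupervisedRunner(
    input_key="features", output_key="logits", target_key="targets", loss_key="loss"
)
runner.train(
    model=model,
    criterion=criterion,
    optimizer=optimizer,
    loaders=loaders,
    num_epochs=2,     # no hand-written training loop
)
```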
It enables multiple parties holding local data to collaboratively train machine learning models without actually exchanging their data with one another, hence preserving the confidentiality of different parties’ local data. Today, we will be showcasing Synergos, a distributed platform built here at AI Singapore to facilitate the adoption of Federated Learning. Specifically, it strives to make the complex mechanisms involved in any federated endeavour simple, accessible and sustainable." - link: https://github.com/aimakerspace/synergos_simulator - poster_link: https://s3.amazonaws.com/assets.pytorch.org/ptdd2021/posters/C10.png - section: C10 - thumbnail_link: https://s3.amazonaws.com/assets.pytorch.org/ptdd2021/posters/C10-thumb.png - title: "Supercharge your Federated Learning with Synergos" -- authors: - Aurick Qiao - Omkar Pangarkar - Richard Fan - categories: - EXTENDING PYTORCH, APIs, PARALLEL & DISTRIBUTED TRAINING - description: "AdaptDL is an open source framework and scheduling algorithm that directly optimizes cluster-wide training performance and resource utilization. By elastically re-scaling jobs, co-adapting batch sizes and learning rates, and avoiding network interference, AdaptDL improves shared-cluster training compared with alternative schedulers. AdaptDL can automatically determine the optimal number of resources given a job’s needs. It will efficiently add or remove resources dynamically to ensure the highest level of performance. The AdaptDL scheduler will automatically figure out the most efficient number of GPUs to allocate to your job, based on its scalability. When the cluster load is low, your job can dynamically expand to take advantage of more GPUs. AdaptDL offers an easy-to-use API to make existing PyTorch training code elastic with adaptive batch sizes and learning rates. We have also ported AdaptDL to Ray/Tune, which can automatically scale the trials of an experiment and can be used to schedule stand-alone PyTorch training jobs on the cloud in a cost-effective way." - link: https://github.com/petuum/adaptdl - poster_link: https://s3.amazonaws.com/assets.pytorch.org/ptdd2021/posters/C9.png - section: C9 - thumbnail_link: https://s3.amazonaws.com/assets.pytorch.org/ptdd2021/posters/C9-thumb.png - title: "AdaptDL: An Open-Source Resource-Adaptive Deep Learning Training and Scheduling Framework" -- authors: - Vasiliy Kuznetsov - James Reed - Jerry Zhang - categories: - EXTENDING PYTORCH, APIs, PARALLEL & DISTRIBUTED TRAINING - description: "Describes a prototype PyTorch workflow to perform quantization syntax transforms in Eager mode with: * no model changes needed (compared to Eager mode which requires manual quant/dequant insertion and fusion) * almost no model syntax restrictions (compared to FX graph mode which requires symbolic traceability)" - link: https://pytorch.org/docs/stable/quantization.html - poster_link: https://s3.amazonaws.com/assets.pytorch.org/ptdd2021/posters/C8.png - section: C8 - thumbnail_link: https://s3.amazonaws.com/assets.pytorch.org/ptdd2021/posters/C8-thumb.png - title: "Define-by-run quantization" -- authors: - Charles Hernandez - Vasiliy Kuznetsov - Haixin Liu - categories: - EXTENDING PYTORCH, APIs, PARALLEL & DISTRIBUTED TRAINING - description: "A quantized model can go wrong when it doesn't satisfy the accuracy we expect. Debugging the accuracy issue of quantization is not easy and time consuming.
The Fx Numeric Suite Core APIs allow users to better diagnose the source of their quantization error for both statically and dynamically quantized models. This poster gives an overview of the core APIs and techniques available to users through the Fx Numeric Suite, and how they can use them to improve quantization performance." - link: - poster_link: https://s3.amazonaws.com/assets.pytorch.org/ptdd2021/posters/C7.png - section: C7 - thumbnail_link: https://s3.amazonaws.com/assets.pytorch.org/ptdd2021/posters/C7-thumb.png - title: "Fx Numeric Suite Core APIs" -- authors: - J.K. Eshraghian - M. Ward - E.O. Neftci - G. Lenz - X. Wang - G. Dwivedi - M. Bennamoun - D.S. Jeong - W.D. Lu - categories: - EXTENDING PYTORCH, APIs, PARALLEL & DISTRIBUTED TRAINING - description: "The brain is the perfect place to look for inspiration to develop more efficient neural networks. One of the main differences with modern deep learning is that the brain encodes and processes information as spikes rather than continuous activations. Combining the training methods intended for neural networks with the sparse, spiking activity inspired by biological neurons has shown the potential to improve the power efficiency of training and inference by several orders of magnitude. snnTorch is a Python package for performing gradient-based learning with spiking neural networks. It extends the capabilities of PyTorch, taking advantage of its GPU accelerated tensor computation and applying it to networks of event-driven spiking neurons. snnTorch is designed to be intuitively used with PyTorch, as though each spiking neuron were simply another activation in a sequence of layers. It is therefore agnostic to fully-connected layers, convolutional layers, residual connections, etc. The classical challenges that have faced the neuromorphic engineering community, such as the non-differentiability of spikes, the dead neuron problem, and vanishing gradients in backpropagation-through-time, are effectively solved in snnTorch and enable the user to focus on building applications that leverage sparsity and event-driven data streams." - link: https://snntorch.readthedocs.io/en/latest/ - poster_link: https://s3.amazonaws.com/assets.pytorch.org/ptdd2021/posters/C6.png - section: C6 - thumbnail_link: https://s3.amazonaws.com/assets.pytorch.org/ptdd2021/posters/C6-thumb.png - title: "snnTorch: Training spiking neural networks using gradient-based optimization" -- authors: - Daniel Falbel - categories: - EXTENDING PYTORCH, APIs, PARALLEL & DISTRIBUTED TRAINING - description: "Last year, the PyTorch for the R language project was released, allowing R users to benefit from PyTorch's speed and flexibility. Since then, a growing community of contributors has been improving the torch for R interface, building research and products on top of it, and using it to teach deep learning methods. In this poster we showcase the past and current developments in the PyTorch for R project, as well as our plans for the future."
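For the snnTorch poster (C6) above, a small sketch of the "spiking neuron as just another layer" idea using snnTorch's documented `Leaky` neuron; layer sizes and the number of time steps are arbitrary.

```python
# Sketch of the "spiking neuron as just another layer" idea from the snnTorch poster
# (C6) above, using the documented snn.Leaky neuron; sizes/time steps are arbitrary.
import torch
import torch.nn as nn
import snntorch as snn

fc = nn.Linear(784, 10)
lif = snn.Leaky(beta=0.9)        # leaky integrate-and-fire neuron; beta = membrane decay

mem = lif.init_leaky()           # initialize membrane potential
spk_rec = []
x = torch.rand(25, 784)          # 25 time steps of input
for step in range(x.size(0)):
    cur = fc(x[step].unsqueeze(0))
    spk, mem = lif(cur, mem)     # surrogate gradients keep this differentiable
    spk_rec.append(spk)

print(torch.stack(spk_rec).shape)  # (time, batch, neurons)
```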
- link: https://torch.mlverse.org/ - poster_link: https://s3.amazonaws.com/assets.pytorch.org/ptdd2021/posters/C5.png - section: C5 - thumbnail_link: https://s3.amazonaws.com/assets.pytorch.org/ptdd2021/posters/C5-thumb.png - title: "PyTorch for R" -- authors: - - Laurent Mazare - categories: - - EXTENDING PYTORCH, APIs, PARALLEL & DISTRIBUTED TRAINING - description: "The main front-end for using PyTorch is its Python API, however LibTorch provides a lower level C++ API to manipulate tensors, perform automatic differentiation, etc. ocaml-torch and tch-rs are two open-source projects providing wrappers for this C++ API respectively in OCaml and Rust. Users can then write OCaml and Rust code to create new models, perform inference and training, and benefit from the guarantees provided by strongly typed programming languages and functional programming. They can also use TorchScript to leverage existing Python models. The libraries provide various examples, ranging from the main computer vision models to a minimalist GPT implementation. - The main challenges for these bindings are to provide idiomatic APIs adapted to the languages specificities; to automatically generate most of the bindings code as there are thousands of C++ functions to expose; and to interact properly with the memory models for each language." - link: https://github.com/laurentMazare/ocaml-torch - poster_link: https://s3.amazonaws.com/assets.pytorch.org/ptdd2021/posters/C4.png - section: C4 - thumbnail_link: https://s3.amazonaws.com/assets.pytorch.org/ptdd2021/posters/C4-thumb.png - title: "ocaml-torch and tch-rs: writing and using PyTorch models using OCaml or Rust" -- authors: - - Ari Bornstein - categories: - - EXTENDING PYTORCH, APIs, PARALLEL & DISTRIBUTED TRAINING - description: "Flash is a high-level deep learning framework for fast prototyping, baselining, finetuning and solving deep learning problems. It features a set of tasks for you to use for inference and finetuning out of the box, and an easy to implement API to customize every step of the process for full flexibility. Flash is built for beginners with a simple API that requires very little deep learning background, and for data scientists, Kagglers, applied ML practitioners and deep learning researchers that want a quick way to get a deep learning baseline with advanced features PyTorch Lightning offers. Flash enables you to easily configure and run complex AI recipes for over 15 tasks across 7 data domains" - link: https://github.com/PyTorchLightning - poster_link: https://s3.amazonaws.com/assets.pytorch.org/ptdd2021/posters/C3.png - section: C3 - thumbnail_link: https://s3.amazonaws.com/assets.pytorch.org/ptdd2021/posters/C3-thumb.png - title: "PyTorch Lightning Flash - Your PyTorch AI Factory" -- authors: - - Victor Fomin - - Taras Savchyn - - Priyansi - categories: - - EXTENDING PYTORCH, APIs, PARALLEL & DISTRIBUTED TRAINING - description: "PyTorch-Ignite is a high-level library to help with training and evaluating neural networks in PyTorch flexibly and transparently. PyTorch-Ignite is designed to be at the crossroads of high-level Plug & Play features and under-the-hood expansion possibilities. The tool aims to improve the deep learning community's technical skills by promoting best practices where things are not hidden behind a divine tool that does everything, but remain within the reach of users. 
PyTorch-Ignite differs from other similar tools by allowing users to compose their applications without being focused on a super multi-purpose object, but rather on weakly coupled components allowing advanced customization." - link: https://pytorch-ignite.ai/ecosystem/ - poster_link: https://s3.amazonaws.com/assets.pytorch.org/ptdd2021/posters/C2.png - section: C2 - thumbnail_link: https://s3.amazonaws.com/assets.pytorch.org/ptdd2021/posters/C2-thumb.png - title: "PyTorch-Ignite: Training and evaluating neural networks flexibly and transparently" -- authors: - - Albert Jimenez - - Mohamed Akrout - categories: - - EXTENDING PYTORCH, APIs, PARALLEL & DISTRIBUTED TRAINING - description: "Backpropagation is the default algorithm for training deep neural networks due to its simplicity, efficiency and high convergence rate. However, its requirements make it impossible to be implemented in a human brain. In recent years, more biologically plausible learning methods have been proposed. Some of these methods can match backpropagation accuracy, and simultaneously provide other extra benefits such as faster training on specialized hardware (e.g., ASICs) or higher robustness against adversarial attacks. While the interest in the field is growing, there is a necessity for open-source libraries and toolkits to foster research and benchmark algorithms. In this poster, we present BioTorch, a software framework to create, train, and benchmark biologically motivated neural networks. In addition, we investigate the performance of several feedback alignment methods proposed in the literature, thereby unveiling the importance of the forward and backward weight initialization and optimizer choice. Finally, we provide a novel robustness study of these methods against state-of-the-art white and black-box adversarial attacks." - link: https://github.com/jsalbert/biotorch - poster_link: https://s3.amazonaws.com/assets.pytorch.org/ptdd2021/posters/C1.png - section: C1 - thumbnail_link: https://s3.amazonaws.com/assets.pytorch.org/ptdd2021/posters/C1-thumb.png - title: "Benchmarking the Accuracy and Robustness of Feedback Alignment Methods" -- authors: - - Ludovic Denoyer - - Alfredo de la Fuente - - Song Duong - - Jean-Baptiste Gaya - - Pierre-Alexandre Kamienny - - Daniel H. Thompson - categories: - - ML Ops, MODELS, MODEL OPTIMIZATION & INTERPRETABILITY - description: "salina is a lightweight library extending PyTorch modules for the development of sequential decision models. It can be used for Reinforcement Learning (including model-based with differentiable environments, multi-agent RL, ...), but also in a supervised/unsupervised learning settings (for instance for NLP, Computer Vision, etc..)." - link: https://github.com/facebookresearch/salina - poster_link: https://s3.amazonaws.com/assets.pytorch.org/ptdd2021/posters/B7.png - section: B7 - thumbnail_link: https://s3.amazonaws.com/assets.pytorch.org/ptdd2021/posters/B7-thumb.png - title: "Salina: Easy programming of Sequential Decision Learning and Reinforcement Learning Models in pytorch" -- authors: - - Zafar Takhirov - - Karen Zhou - - Raghuraman Krishnamoorthi - categories: - - ML Ops, MODELS, MODEL OPTIMIZATION & INTERPRETABILITY - description: "Two new toolflows for model pruning are introduced: Sparsifier and Pruner, which enable unstructured and structured pruning of the model weights respectively. The toolflow can be combined with other optimization techniques, such as quantization to achieve even higher levels of model compression. 
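A minimal sketch for the PyTorch-Ignite poster (C2) above, wiring a trainer and an evaluator with Ignite's documented helper factories; the toy model and data are placeholders.

```python
# Minimal sketch for the PyTorch-Ignite poster (C2) above, using the documented
# helper factories and event system; the toy model and data are placeholders.
import torch
from torch.utils.data import DataLoader, TensorDataset
from ignite.engine import Events, create_supervised_trainer, create_supervised_evaluator
from ignite.metrics import Accuracy

model = torch.nn.Linear(16, 2)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
criterion = torch.nn.CrossEntropyLoss()
data = TensorDataset(torch.randn(64, 16), torch.randint(0, 2, (64,)))
train_loader = val_loader = DataLoader(data, batch_size=8)

trainer = create_supervised_trainer(model, optimizer, criterion)
evaluator = create_supervised_evaluator(model, metrics={"accuracy": Accuracy()})

@trainer.on(Events.EPOCH_COMPLETED)            # loosely coupled event handler
def run_validation(engine):
    evaluator.run(val_loader)
    print(engine.state.epoch, evaluator.state.metrics["accuracy"])

trainer.run(train_loader, max_epochs=2)
```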
In addition to that, the \"Pruner\" toolflow can also be used for \"shape propagation\", where the physical structure of the model is modified after structured pruning (in FX graph mode only). This poster gives a high-level overview of the prototype API, a usage example, and the currently supported sparse quantized kernels, and provides a brief overview of future plans." - link: https://github.com/pytorch/pytorch - poster_link: https://s3.amazonaws.com/assets.pytorch.org/ptdd2021/posters/B6.png - section: B6 - thumbnail_link: https://s3.amazonaws.com/assets.pytorch.org/ptdd2021/posters/B6-thumb.png - title: "Structured and Unstructured Pruning Workflow in PyTorch" -- authors: - François-Guillaume Fernandez - categories: - ML Ops, MODELS, MODEL OPTIMIZATION & INTERPRETABILITY - description: "One of the core inconveniences of deep learning is its limited interpretability, which remains obscure for most non-basic convolutional models. Their performance is granted by optimization processes that have high degrees of freedom and no constraints on explainability. Fortunately, modern frameworks' mechanisms grant access to the information flow in their components, which has paved the way to building intuition around result interpretability in CNN models. The main contributions of the author are described as follows: - building a flexible framework for class activation computation - providing high-quality implementations of most popular methods - making these methods usable by entry-level users as well as researchers - The open-source project is available here: https://github.com/frgfm/torch-cam" - link: https://github.com/frgfm/torch-cam - poster_link: https://s3.amazonaws.com/assets.pytorch.org/ptdd2021/posters/B5.png - section: B5 - thumbnail_link: https://s3.amazonaws.com/assets.pytorch.org/ptdd2021/posters/B5-thumb.png - title: "Torch-CAM: class activation explorer" -- authors: - Nikolaos Zioulis - categories: - ML Ops, MODELS, MODEL OPTIMIZATION & INTERPRETABILITY - description: moai is a PyTorch-based AI Model Development Kit (MDK) that seeks to improve data-driven model workflows, design and understanding. It relies on hydra for handling configuration and lightning for handling infrastructure. As a kit, it offers a set of actions to `train` or `evaluate` models, with each action consuming configuration files. Apart from the definition of the model, data, training scheme, optimizer, visualization and logging, these configuration files additionally use named tensors to define tensor processing graphs. These are created by chaining various building blocks called monads, which are functional units or otherwise single responsibility modules. Monad parameters and input/output tensors are defined in the configuration file, allowing for the entire model to be summarized into a single file. This opens up novel functionalities like querying for inter-model differences using the `diff` action, or aggregating the results of multiple models using the `plot` action, which uses hiplot to compare models in various ways. moai facilitates high quality reproduction (using the `reprod` action), as apart from automatically handling all boilerplate related to it, it standardizes the process of developing modules/monads and implicitly logs all hyperparameters. Even though no code is required, moai exploits Python's flexibility to allow developers to integrate their own code into its engine from external projects, vastly increasing their productivity.
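For context on the pruning poster (B6) above, a sketch using the stable `torch.nn.utils.prune` utilities rather than the prototype Sparsifier/Pruner toolflow the poster describes; it only illustrates the unstructured vs. structured distinction.

```python
# Context for the B6 pruning poster above: a sketch with the stable
# torch.nn.utils.prune utilities (NOT the prototype Sparsifier/Pruner toolflow the
# poster describes), illustrating unstructured vs. structured weight pruning.
import torch
import torch.nn.utils.prune as prune

linear = torch.nn.Linear(64, 64)
conv = torch.nn.Conv2d(16, 32, 3)

prune.l1_unstructured(linear, name="weight", amount=0.5)           # 50% of individual weights
prune.ln_structured(conv, name="weight", amount=0.25, n=2, dim=0)  # 25% of output channels

print(float((linear.weight == 0).float().mean()))   # sparsity of ~0.5
prune.remove(linear, "weight")                       # make the pruning permanent
```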
- link: https://github.com/ai-in-motion/moai - poster_link: https://s3.amazonaws.com/assets.pytorch.org/ptdd2021/posters/B4.png - section: B4 - thumbnail_link: https://s3.amazonaws.com/assets.pytorch.org/ptdd2021/posters/B4-thumb.png - title: "moai: A Model Development Kit to Accelerate Data-driven Workflows" -- authors: - Vaibhav Singh - Rajesh Thallam - Jordan Totten - Karl Weinmeister - categories: - ML Ops, MODELS, MODEL OPTIMIZATION & INTERPRETABILITY - description: Machine Learning Operationalization has rapidly evolved in the last few years with a growing set of tools for each phase of development. From experimentation to automated model analysis and deployment, each of these tools offers some unique capabilities. In this work we survey a slice of these tools and demonstrate an opinionated example of an end-to-end CI/CD pipeline for PyTorch model development and deployment using the Vertex AI SDK. The goal of this session is to aid an informed conversation on the choices available to PyTorch industry practitioners who are looking to operationalize their ML models, and to researchers who are simply trying to organize their experiments. Although our implementation example will make tool choices at various stages, we will be focused on ML design patterns that are applicable to a wide variety of commercial and open-source offerings. - link: https://github.com/GoogleCloudPlatform/vertex-ai-samples - poster_link: https://s3.amazonaws.com/assets.pytorch.org/ptdd2021/posters/B3.png - section: B3 - thumbnail_link: https://s3.amazonaws.com/assets.pytorch.org/ptdd2021/posters/B3-thumb.png - title: "Building Production ML Pipelines for PyTorch Models" -- authors: - George Hosu - Patricio Cerda-Mardini - Natasha Seelam - Jorge Torres - categories: - ML Ops, MODELS, MODEL OPTIMIZATION & INTERPRETABILITY - description: Nearly 64% of companies take over a month to a year to deploy a single machine learning (ML) model into production [1]. Many of these companies cite key challenges integrating with complex ML frameworks as a root cause [1], as there is still a gap between where data lives, how models are trained, and how downstream applications access predictions from models [1, 2]. MindsDB is a PyTorch-based ML platform that aims to solve fundamental MLOps challenges by abstracting ML models as “virtual tables”, allowing models to be queried in the same natural way users work with data in databases. As data is diverse and varied, we recently developed an open-source declarative syntax, named “JSON-AI”, to allow others to customize ML model internals without changing source code. We believe that the key elements of the data science (DS)/ML pipeline, namely data pre-processing/cleaning, feature engineering, and model-building [2], should be automated in a robust, reliable, and reproducible manner with simplicity. JSON-AI gives you refined control of each of these steps, and enables users to bring custom routines into their ML pipeline. In our poster, we will show how a user interfaces with JSON-AI to bring original approaches to each of the aforementioned parts of the DS/ML pipeline, along with control over analysis and explainability tools. [1] Algorithmia (2021).
2021 state of enterprise machine learning [2] “How Much Automation Does a Data Scientist Want?” ArXiV (2021) - link: https://github.com/mindsdb/mindsdb/ - poster_link: https://s3.amazonaws.com/assets.pytorch.org/ptdd2021/posters/B2.png - section: B2 - thumbnail_link: https://s3.amazonaws.com/assets.pytorch.org/ptdd2021/posters/B2-thumb.png - title: "Customizing MLOps pipelines with JSON-AI: a declarative syntax to streamline ML in the database" -# - authors: -# - Moses Gurtmann -# - Erez Schnaiderl -# categories: -# - ML Ops, MODELS, MODEL OPTIMIZATION & INTERPRETABILITY -# description: "Both from sanity considerations and the productivity perspective, Data Scientists, ML engineers, Graduate students, and other research-facing roles are all starting to adopt best-practices from production-grade MLOps. However, most toolchains come with a hefty price of extra code and maintenance, which reduces the actual time available for R&D. We will show an alternative approach using ClearML, the open-source MLOps solution. In this “best-practices” poster, we will overview the “must-haves” of R&D-MLOPs: Orchestration, Automation, and Reproducibility. These enable easy remote execution through magically reproducible setups and even custom, reusable, bottom-up pipelines. We will take a single example and schematically transform it from the “as downloaded from GitHub” stage to a fully-fledged, scalable, version-controlled, parameterizable R&D pipeline. We will measure the number of changes needed to the codebase and provide evidence of real low-cost integration. All code, logs, and metrics will be available as supporting information" -# link: -# poster_link: https://s3.amazonaws.com/assets.pytorch.org/ptdd2021/posters/B1.png -# section: B1 -# thumbnail_link: https://s3.amazonaws.com/assets.pytorch.org/ptdd2021/posters/B1-thumb.png -# title: "The Fundamentals of MLOps for R&D: Orchestration, Automation, Reproducibility" -- authors: - - Robin Lobel - categories: - - ACCELERATORS, TOOLS, LIBRARY, DATA - description: TorchStudio is an open-source, full-featured IDE for PyTorch. It aims to simplify the creation, training and iterations of AI models. It can load, analyze and explore datasets from the TorchVision or TorchAudio categories, or custom datasets with any format and number of inputs and outputs. TorchVision, TorchAudio or custom models can then be loaded or written from scratch, debugged, visualized as a graph, and trained using local hardware, a distant server or GPUs in the cloud. Trainings can then be compared in the dashboard with several analyzing tools to help you identify the best performing set of models and hyper parameters and export it as TorchScript or ONNX files. TorchStudio is also highly customizable, with 90% of its functionalities accessible as open source scripts and independent modules, to fit as many AI scenario as possible. 
- link: https://torchstudio.ai/ - poster_link: https://s3.amazonaws.com/assets.pytorch.org/ptdd2021/posters/A10.png - section: A10 - thumbnail_link: https://s3.amazonaws.com/assets.pytorch.org/ptdd2021/posters/A10-thumb.png - title: "TorchStudio, a full featured IDE for PyTorch" -- authors: - - Mark Saroufim - - Hamid Shojanazeri - - Patrick Hu - - Geeta Chauhan - - Jing Xu - - Jianan Gu - - Jiong Gong - - Ashok Emani - - Eikan Wang - - Min Jean Cho - - Fan Zhao - categories: - - ACCELERATORS, TOOLS, LIBRARY, DATA - description: "Accelerate TorchServe with Intel® Extension for PyTorch: Intel is collaborating with Meta to take advantage of performance boosting from Intel® Extension for PyTorch* from TorchServe, so that users can easily deploy their PyTorch models with out of the box satisfying performance. With these SW advancements, we demonstrated ease-of-use IPEX user-facing API, and we also showcased speed-up with Intel® Extension for PyTorch* FP32 inference with the stock PyTorch and speed-up with Intel® Extension for PyTorch* INT8 inference with the stock PyTorch." - link: www.intel.com/Performanceindex - poster_link: https://s3.amazonaws.com/assets.pytorch.org/ptdd2021/posters/A9.png - section: A9 - thumbnail_link: https://s3.amazonaws.com/assets.pytorch.org/ptdd2021/posters/A9-thumb.png - title: "Accelerate TorchServe with Intel Extension for PyTorch" -# - authors: -# - Isaac Godfried -# - Anton Polishko -# categories: -# - ACCELERATORS, TOOLS, LIBRARY, DATA -# description: Flow Forecast is a multi-purpose open-source deep learning for time series forecasting, classification, and anomaly detection framework built in PyTorch. Flow Forecast utilizes modular code design, unit/integration tests, model/prediction visualizations, and native cloud provider integration in order to allow researchers to rapidly experiment with new model architectures, benchmark their results on popular datasets and reproduce their results. Simultaneously it aids industry data scientists to deploy models to production, periodically retrain models, and explain model decisions to stakeholders through easy to use APIs, out of the box interpretability methods (e.g. SHAP), and model deployment support. Flow Forecast supports a broad variety of deep time series models such as LSTMs, GRUs, Transformers, and GNNs. It also features easy multitask learning support and loaders to help with geo-spatial-temporal data. -# link: https://github.com/AIStream-Peelout/flow-forecast -# poster_link: https://s3.amazonaws.com/assets.pytorch.org/ptdd2021/posters/A8.png -# section: A8 -# thumbnail_link: https://s3.amazonaws.com/assets.pytorch.org/ptdd2021/posters/A8-thumb.png -# title: "Flow Forecast: A deep learning for time series forecasting , classification , and anomaly detection framework" -# - authors: -# - Patrick Kidger -# categories: -# - ACCELERATORS, TOOLS, LIBRARY, DATA -# description: "Turn this: -# ``` -# def batch_outer_product(x: torch.Tensor, y: torch.Tensor) -> torch.Tensor: -# # x has shape (batch, x_channels) -# # y has shape (batch, y_channels) -# # return has shape (batch, x_channels, y_channels) - -# return x.unsqueeze(-1) * y.unsqueeze(-2) -# ``` -# into this: -# ``` -# def batch_outer_product(x: TensorType[\"\"batch\"\", \"\"x_channels\"\"], -# y: TensorType[\"\"batch\"\", \"\"y_channels\"\"] -# ) -> TensorType[\"\"batch\"\", \"\"x_channels\"\", \"\"y_channels\"\"]: - -# return x.unsqueeze(-1) * y.unsqueeze(-2) -# ``` -# with programmatic checking that the shape (dtype, ...) specification is met! 
- -# Bye-bye bugs -- say hello to enforced, clear documentation of PyTorch code. -# torchtyping may be used instead of littering code with comments like `# x has shape (batch, hidden_state)` or statements like `assert x.shape == y.shape`, just to keep track of what shape/dtype/etc everything is." -# link: https://github.com/patrick-kidger/torchtyping -# poster_link: https://s3.amazonaws.com/assets.pytorch.org/ptdd2021/posters/A7.png -# section: A7 -# thumbnail_link: https://s3.amazonaws.com/assets.pytorch.org/ptdd2021/posters/A7-thumb.png -# title: "TorchTyping: rich type annotations of shape, dtype, etc…" -# - authors: -# - Yashl Kanungo -# - Sumit Negi -# categories: -# - ACCELERATORS, TOOLS, LIBRARY, DATA -# description: "Amazon Ads helps companies build their brand and connect with shoppers, through ads shown both within and beyond Amazon’s store, including websites, apps, and streaming TV content in more than 15 countries. Businesses or brands of all sizes including registered sellers, vendors, book vendors, Kindle Direct Publishing (KDP) authors, app developers, and agencies on Amazon marketplaces can upload their own ad creatives, which can include images, video, audio, and of course products sold on Amazon. For our text ad processing, we deploy PyTorch based BERT models globally on AWS Inferentia based Inf1 instances. By moving to Inferentia from GPUs, we were able to lower our cost by 69% with comparable performance." -# link: https://aws.amazon.com/blogs/aws/scaling-ad-verification-with-machine-learning-and-aws-inferentia/ -# poster_link: https://s3.amazonaws.com/assets.pytorch.org/ptdd2021/posters/A6.png -# section: A6 -# thumbnail_link: https://s3.amazonaws.com/assets.pytorch.org/ptdd2021/posters/A6-thumb.png -# title: Scaling Ad Classification with Machine Learning on PyTorch and AWS Inferentia -# - authors: -# - Bharath Ramsundar -# categories: -# - ACCELERATORS, TOOLS, LIBRARY, DATA -# description: "DeepChem uses PyTorch to implement a number of scientific deep learning models for use in modeling proteins, small molecules, materials and physical simulations. DeepChem aims to become a powerful domain specific language for scientific applications that leverages PyTorch to provide a solid base for our models." -# link: -# poster_link: https://s3.amazonaws.com/assets.pytorch.org/ptdd2021/posters/A5.png -# section: A5 -# thumbnail_link: https://s3.amazonaws.com/assets.pytorch.org/ptdd2021/posters/A5-thumb.png -# title: "DeppChem: A Toolbox for AI driven Science" -- authors: - - Clement Fuji Tsang - - Jean-Francois Lafleche - - Charles Loop - - Masha Shugrina - - Towaki Takikawa - - Jiehan Wang - categories: - - ACCELERATORS, TOOLS, LIBRARY, DATA - description: "NVIDIA Kaolin is a suite of tools for accelerating 3D Deep Learning research. The Kaolin library provides a PyTorch API for working with a variety of 3D representations and includes a growing collection of GPU-optimized operations such as modular differentiable rendering, fast conversions between representations, loss functions, data loading, 3D checkpoints and more. The library also contains a lightweight 3D visualizer Dash3D and can work with an Omniverse companion app for dataset/checkpoint visualization and synthetic data generation." 
- link: - poster_link: https://s3.amazonaws.com/assets.pytorch.org/ptdd2021/posters/A3.png - section: A3 - thumbnail_link: https://s3.amazonaws.com/assets.pytorch.org/ptdd2021/posters/A3-thumb.png - title: Kaolin Library -- authors: - Jack Cao - Milad Mohammadi - Zak Stone - Vaibhav Singh - Calvin Pelletier - Shauheen Zahirazami - categories: - ACCELERATORS, TOOLS, LIBRARY, DATA - description: "PyTorch / XLA offers PyTorch users the ability to train their models on XLA devices including Cloud TPUs. This compiled path often makes it possible to utilize creative optimizations and achieve top performance on target XLA devices. With the introduction of Cloud TPU VMs, users have direct access to TPU host machines and therefore a great level of flexibility. In addition, TPU VMs make debugging easier and reduce data transfer overheads. Google has also recently announced the availability of Cloud TPU v4 Pods, which are exaflop-scale supercomputers for machine learning. Cloud TPU v4 Pods offer a whole new level of performance for large-scale PyTorch / XLA training of ML models." - link: - poster_link: https://s3.amazonaws.com/assets.pytorch.org/ptdd2021/posters/A2.png - section: A2 - thumbnail_link: https://s3.amazonaws.com/assets.pytorch.org/ptdd2021/posters/A2-thumb.png - title: Accelerate PyTorch training with Cloud TPUs -- authors: - Antonio Kim - Behzad Abghari - Chris Oliver - Cynthia Liu - Mark Browning - Vishal Subbiah - Kamran Jafari - Emad Barsoum - Jessica Liu - Sean Lie - categories: - ACCELERATORS, TOOLS, LIBRARY, DATA - description: "The Cerebras Wafer Scale Engine (WSE) is the largest processor ever built, dedicated to accelerating deep learning models for training and inference. A single chip in a single CS-2 system provides the compute power of a cluster of GPUs but acts as a single processor, making it also much simpler to use. We present the current PyTorch backend architecture for the Cerebras CS-2 and how we go all the way from PyTorch to laying out the model graph on the wafer. Additionally, we will discuss the advantages of training on Cerebras hardware and its unique capabilities." - link: https://cerebras.net - poster_link: https://s3.amazonaws.com/assets.pytorch.org/ptdd2021/posters/A1.png - section: A1 - thumbnail_link: https://s3.amazonaws.com/assets.pytorch.org/ptdd2021/posters/A1-thumb.png - title: Accelerating PyTorch on the largest chip ever built (WSE) \ No newline at end of file diff --git a/_data/ecosystem/pted/2021/posters.yaml b/_data/ecosystem/pted/2021/posters.yaml deleted file mode 100644 index 5f5f22524a3a..000000000000 --- a/_data/ecosystem/pted/2021/posters.yaml +++ /dev/null @@ -1,1853 +0,0 @@ -- authors: - - Josh Izaac - - Thomas Bromley - categories: - - Platform, Ops & Tools - description: - PennyLane allows you to train quantum circuits just like neural networks! - This poster showcases how PennyLane can be interfaced with PyTorch to enable training - of quantum and hybrid machine learning models. The outputs of a quantum circuit - are provided as a Torch tensor with a defined gradient. We highlight how this - functionality can be used to explore new paradigms in machine learning, including - the use of hybrid models for transfer learning.
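For the Cloud TPU poster (A2) above, a sketch of the basic single-device PyTorch / XLA pattern using the documented `torch_xla` API; the model, data and loop length are placeholders.

```python
# Sketch of the single-device PyTorch / XLA pattern for the Cloud TPU poster (A2)
# above, using the documented torch_xla API; model/data/loop are placeholders.
import torch
import torch_xla.core.xla_model as xm

device = xm.xla_device()                        # a Cloud TPU core (or other XLA device)
model = torch.nn.Linear(16, 2).to(device)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
loss_fn = torch.nn.CrossEntropyLoss()

for _ in range(10):
    data = torch.randn(8, 16).to(device)
    target = torch.randint(0, 2, (8,)).to(device)
    optimizer.zero_grad()
    loss = loss_fn(model(data), target)
    loss.backward()
    xm.optimizer_step(optimizer, barrier=True)  # steps the optimizer and triggers XLA execution
```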
- link: http://pennylane.ai - poster_link: https://s3.amazonaws.com/assets.pytorch.org/pted2021/posters/K1.png - section: K1 - thumbnail_link: https://s3.amazonaws.com/assets.pytorch.org/pted2021/posters/thumb-K1.png - title: Bring quantum machine learning to PyTorch with PennyLane -- authors: - - Jeffrey Mew - categories: - - Compiler & Transform & Production - description: - "Visual Studio Code, a free cross-platform lightweight code editor,\ - \ has become the most popular among Python developers for both web and machine\ - \ learning projects. We will be walking you through an end-to-end PyTorch project\ - \ to showcase how much VS Code has to offer PyTorch developers to boost their\ - \ productivity.\n \nFirstly, get your PyTorch project quickly up and running with\ - \ VS Code's environment/dependency management and built-in Jupyter Notebook support.\ - \ Secondly, breeze through coding with help from our AI-powered IntelliSense.\ - \ When it's time to run your code, use the built-in TensorBoard integration to\ - \ monitor your training along with the integrated PyTorch profiler to analyze\ - \ and debug your code. Once you're ready for the cloud, VS Code has Azure service\ - \ integration to allow you to scale your model training and deployment.\n \nCombining the power of the code editor with easy access to\ - \ the Azure services, VS Code can be the one-stop shop for any developer looking\ - \ to build machine learning models with PyTorch." - link: https://pytorch.org/blog/introducing-pytorch-profiler-the-new-and-improved-performance-tool/ - poster_link: https://s3.amazonaws.com/assets.pytorch.org/pted2021/posters/A4.png - section: A4 - thumbnail_link: https://s3.amazonaws.com/assets.pytorch.org/pted2021/posters/thumb-A4.png - title: PyTorch development in VS Code -- authors: - - Yanan Cao - - Harry Kim - - Jason Ansel - categories: - - Compiler & Transform & Production - description: - TorchScript is the bridge from PyTorch's flexible eager mode to - a more deterministic and performant graph mode suitable for production deployment. - As part of the PyTorch 1.9 release, TorchScript will launch a few features that we'd - like to share with you early, including a) a new formal language specification - that defines the exact subset of Python/PyTorch features supported in TorchScript; - b) Profile-Directed Typing that reduces the burden of converting a loosely-typed - eager model into a strictly-typed TorchScript model; c) a TorchScript profiler - that can shed light on the performance characteristics of a TorchScript model. We are - constantly making improvements to make TorchScript easier to use and more performant. - link: http://fb.me/torchscript - poster_link: https://s3.amazonaws.com/assets.pytorch.org/pted2021/posters/A5.png - section: A5 - thumbnail_link: https://s3.amazonaws.com/assets.pytorch.org/pted2021/posters/thumb-A5.png - title: Upcoming features in TorchScript -- authors: - - Alessandro Pappalardo - categories: - - Compiler & Transform & Production - description: - Brevitas is an open-source PyTorch library for quantization-aware training. - Thanks to its flexible design at multiple levels of abstraction, Brevitas generalizes - the typical uniform affine quantization paradigm adopted in the deep learning - community under a common set of unified APIs.
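For the TorchScript poster (A5) above, the long-standing `torch.jit.script` entry point on a toy module; this shows only the basic eager-to-graph-mode bridge, not the 1.9 features the poster lists.

```python
# The long-standing TorchScript entry point, for the A5 poster above; this shows only
# the basic eager-to-graph-mode bridge, not the PyTorch 1.9 features the poster lists.
import torch

class MyModule(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.linear = torch.nn.Linear(4, 4)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # Data-dependent control flow is captured by the compiler, unlike tracing
        if x.sum() > 0:
            return self.linear(x)
        return -x

scripted = torch.jit.script(MyModule())   # eager module -> TorchScript
print(scripted.code)                      # inspect the generated TorchScript
scripted.save("my_module.pt")             # deployable artifact for non-Python runtimes
```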
Brevitas provides a platform to - both ML practitioners and researchers to either apply built-in state-of-the-art - techniques in training for reduced-precision inference, or to implement novel - quantization-aware training algorithms. Users can target supported inference toolchains, - such as onnxruntime, TVM, Vitis AI, FINN or PyTorch itself, or experiment with - hypothetical target hardware platforms. In particular, when combined with the - flexibility of Xilinx FPGAs through the FINN toolchain, Brevitas supports the - co-design of novel hardware building blocks in a machine-learning driven fashion. - Within Xilinx, Brevitas has been adopted by various research projects concerning - quantized neural networks, as well as in large scale deployments targeting custom - programmable logic accelerators. - link: https://github.com/Xilinx/brevitas/ - section: B4 - title: Quantization-Aware Training with Brevitas -- authors: - - Jerry Zhang - - Vasiliy Kuznetsov - - Raghuraman Krishnamoorthi - categories: - - Compiler & Transform & Production - description: - Quantization is a common model optimization technique to speedup runtime - of a model by upto 4x, with a possible slight loss of accuracy. Currently, PyTorch - support Eager Mode Quantization. FX Graph Mode Quantization improves upon Eager - Mode Quantization by adding support for functionals and automating the quantization - process. To use FX Graph Mode Quantization, one might need to refactor the model - to make the model compatible with FX Graph Mode Quantization (symbolically traceable - with torch.fx). - link: https://pytorch.org/docs/master/quantization.html#prototype-fx-graph-mode-quantization - poster_link: https://s3.amazonaws.com/assets.pytorch.org/pted2021/posters/B5.png - section: B5 - thumbnail_link: https://s3.amazonaws.com/assets.pytorch.org/pted2021/posters/thumb-B5.png - title: "PyTorch Quantization: FX Graph Mode Quantization" -- authors: - - Fabio Nonato - categories: - - Compiler & Transform & Production - description: - " Deep learning models can have game-changing impact on machine learning\ - \ applications. However, deploying and managing deep learning models in production\ - \ is complex and requires considerable engineering effort - from building custom\ - \ inferencing APIs and scaling prediction services, to securing applications,\ - \ while still leveraging the latest ML frameworks and hardware technology. Amazon\ - \ EC2 Inf1 instances powered by AWS Inferentia deliver the highest performance\ - \ and lowest cost machine learning inference in the cloud. Developers can deploy\ - \ their deep-learning models to Inf1 instances using the AWS Neuron SDK that is\ - \ natively integrated with PyTorch.\n \nAttend this poster session to learn how\ - \ you can optimize and accelerate the deployment of your deep learning models\ - \ in production using Inf1 instances and TorchServe containers. You will learn\ - \ how to deploy TorchScript models on Inf1 and optimize your models with minimal\ - \ code changes with features such as NeuronCore Groups and NeuronCore Pipeline,\ - \ to meet your throughput and latency requirements. You can directly integrate\ - \ these model level optimizations into the inference endpoint using TorchServe.\n\ - \ \nWe will also deep dive into how we optimized performance of a natural language\ - \ processing endpoint and showcase the workflow for deploying the optimized model\ - \ using TorchServe containers on Amazon ECS." 
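For the FX Graph Mode Quantization poster (B5) above, a sketch of the prototype flow roughly as documented around PyTorch 1.9; the API has since moved under `torch.ao.quantization` and gained an `example_inputs` argument, so treat the exact imports as an assumption.

```python
# Hedged sketch of the prototype FX graph mode quantization flow from the B5 poster
# above, roughly as documented around PyTorch 1.9 (later versions moved this under
# torch.ao.quantization and changed prepare_fx's signature).
import torch
from torch.quantization import get_default_qconfig
from torch.quantization.quantize_fx import prepare_fx, convert_fx

model = torch.nn.Sequential(torch.nn.Conv2d(3, 8, 3), torch.nn.ReLU()).eval()
qconfig_dict = {"": get_default_qconfig("fbgemm")}

prepared = prepare_fx(model, qconfig_dict)    # symbolically trace and insert observers
prepared(torch.randn(1, 3, 32, 32))           # calibrate with representative data
quantized = convert_fx(prepared)              # produce the int8 model
print(quantized)
```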
- link: https://bit.ly/3mQVowk - poster_link: https://s3.amazonaws.com/assets.pytorch.org/pted2021/posters/C4.png - section: C4 - thumbnail_link: https://s3.amazonaws.com/assets.pytorch.org/pted2021/posters/thumb-C4.png - title: - Accelerate deployment of deep learning models in production with Amazon EC2 - Inf1 and TorchServe containers -- authors: - - James Reed - - Zachary DeVito - - Ansley Ussery - - Horace He - - Michael Suo - categories: - - Compiler & Transform & Production - description: - "FX is a toolkit for writing Python-to-Python transforms over PyTorch\ - \ code.\nFX consists of three parts:\n> Symbolic Tracing \u2013 a method to extract\ - \ a representation of the program by running it with \"proxy\" values.\n> Graph-based\ - \ Transformations \u2013 FX provides an easy-to-use Python-based Graph API for\ - \ manipulating the code.\n> Python code generation \u2013 FX generates valid Python\ - \ code from graphs and turns that code into executable Python `nn.Module` instances." - link: https://pytorch.org/docs/stable/fx.html - poster_link: https://s3.amazonaws.com/assets.pytorch.org/pted2021/posters/C5.png - section: C5 - thumbnail_link: https://s3.amazonaws.com/assets.pytorch.org/pted2021/posters/thumb-C5.png - title: Torch.fx -- authors: - - Abhijit Khobare - - Murali Akula - - Tijmen Blankevoort - - Harshita Mangal - - Frank Mayer - - Sangeetha Marshathalli Siddegowda - - Chirag Patel - - Vinay Garg - - Markus Nagel - categories: - - Compiler & Transform & Production - description: - "AI is revolutionizing industries, products, and core capabilities - by delivering dramatically enhanced experiences. However, the deep neural networks - of today use too much memory, compute, and energy. To make AI truly ubiquitous, - it needs to run on the end device within a tight power and thermal budget. Quantization - and compression help address these issues. In this tutorial, we'll discuss: - - The existing quantization and compression challenges - - Our research in novel quantization and compression techniques to overcome these - challenges - - How developers and researchers can implement these techniques through the AI Model - Efficiency Toolkit" - link: "" - poster_link: https://s3.amazonaws.com/assets.pytorch.org/pted2021/posters/D4.png - section: D4 - thumbnail_link: https://s3.amazonaws.com/assets.pytorch.org/pted2021/posters/thumb-D4.png - title: AI Model Efficiency Toolkit (AIMET) -- authors: - - Natasha Seelam - - Patricio Cerda-Mardini - - Cosmo Jenytin - - Jorge Torres - categories: - - Database & AI Accelerators - description: - 'Pytorch enables building models with complex inputs and outputs, including - time-series data, text and audiovisual data. However, such models require expertise - and time to build, often spent on tedious tasks like cleaning the data or transforming - it into a format that is expected by the models. - - - Thus, pre-trained models are often used as-is when a researcher wants to experiment - only with a specific facet of a problem. See, as examples, FastAI''s work into - optimizers, schedulers, and gradual training through pre-trained residual models, - or NLP projects with Hugging Face models as their backbone. - - - We think that, for many of these problems, we can automatically generate a "good - enough" model and data-processing pipeline from just the raw data and the endpoint. - To address this situation, we are developing MindsDB, an open-source, PyTorch-based - ML platform that works inside databases via SQL commands. 
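For the Torch.fx poster (C5) above, the three FX steps it names, symbolic tracing, graph manipulation, and code generation, on a toy module; the relu-to-gelu swap is purely illustrative.

```python
# The three FX steps named in the Torch.fx poster (C5) above: symbolic tracing,
# a graph-based transform, and Python code generation. The relu-to-gelu swap is
# purely illustrative.
import torch
import torch.fx

class M(torch.nn.Module):
    def forward(self, x):
        return torch.relu(x) + 1.0

gm = torch.fx.symbolic_trace(M())            # 1) symbolic tracing with proxy values
for node in gm.graph.nodes:                  # 2) graph-based transformation
    if node.op == "call_function" and node.target is torch.relu:
        node.target = torch.nn.functional.gelu
gm.recompile()                               # 3) regenerate valid Python from the graph
print(gm.code)
print(gm(torch.randn(2, 3)))
```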
It is built with a modular - approach, and in this talk we are going to focus on Lightwood, the stand-alone - core component that performs machine learning automation on top of the PyTorch - framework. - - - Lightwood automates model building into 5 stages: (1) classifying each feature - into a "data type", (2) running statistical analyses on each column of a dataset, - (3) fitting multiple models to normalize, tokenize, and generate embeddings for - each feature, (4) deploying the embeddings to fit a final estimator, and (5) running - an analysis on the final ensemble to evaluate it and generate a confidence model. - It can generate quick "baseline" models to benchmark performance for any custom - encoder representation of a data type and can also serve as scaffolding for investigating - new hypotheses (architectures, optimizers, loss-functions, hyperparameters, etc). - - - We aim to present our benchmarks covering wide swaths of problem types and illustrate - how Lightwood can be useful for researchers and engineers through a hands-on demo.' - link: https://mindsdb.com - poster_link: https://s3.amazonaws.com/assets.pytorch.org/pted2021/posters/H8.png - section: H8 - thumbnail_link: https://s3.amazonaws.com/assets.pytorch.org/pted2021/posters/thumb-H8.png - title: - "Pytorch via SQL commands: A flexible, modular AutoML framework that democratizes - ML for database users" -- authors: - - "Sam Partee " - - Alessandro Rigazzi - - Mathew Ellis - - Benjamin Rob - categories: - - Database & AI Accelerators - description: - SmartSim is an open source library dedicated to enabling online analysis - and Machine Learning (ML) for traditional High Performance Computing (HPC) simulations. - Clients are provided in common HPC simulation languages, C/C++/Fortran, that enable - simulations to perform inference requests in parallel on large HPC systems. SmartSim - utilizes the Redis ecosystem to host and serve PyTorch models alongside simulations. - We present a use case of SmartSim where a global ocean simulation, used in climate - modeling, is augmented with a PyTorch model to resolve quantities of eddy kinetic - energy within the simulation. - link: https://github.com/CrayLabs/SmartSim - poster_link: https://s3.amazonaws.com/assets.pytorch.org/pted2021/posters/J8.png - section: J8 - thumbnail_link: https://s3.amazonaws.com/assets.pytorch.org/pted2021/posters/thumb-J8.png - title: PyTorch on Supercomputers Simulations and AI at Scale with SmartSim -- authors: - - Patricio Cerda-Mardini - - Natasha Seelam - categories: - - Database & AI Accelerators - description: - 'Many domains leverage the extraordinary predictive performance of - machine learning algorithms. However, there is an increasing need for transparency - of these models in order to justify deploying them in applied settings. Developing - trustworthy models is a great challenge, as they are usually optimized for accuracy, - relegating the fit between the true and predicted distributions to the background - [1]. This concept of obtaining predicted probability estimates that match the - true likelihood is also known as calibration. - - - Contemporary ML models generally exhibit poor calibration. There are several methods - that aim at producing calibrated ML models [2, 3]. Inductive conformal prediction - (ICP) is a simple yet powerful framework to achieve this, offering strong guarantees - about the error rates of any machine learning model [4]. 
ICP provides confidence - scores and turns any point prediction into a prediction region through nonconformity - measures, which indicate the degree of inherent strangeness a data point presents - when compared to a calibration data split. - - - In this work, we discuss the integration of ICP with MindsDB --an open source - AutoML framework-- successfully replacing its existing quantile loss approach - for confidence estimation capabilities. - - Our contribution is threefold. First, we present a study on the effect of a "self-aware" - neural network normalizer in the width of predicted region sizes (also known as - efficiency) when compared to an unnormalized baseline. Our benchmarks consider - results for over 30 datasets of varied domains with both categorical and numerical - targets. Second, we propose an algorithm to dynamically determine the confidence - level based on a target size for the predicted region, effectively prioritizing - efficiency over a minimum error rate. Finally, we showcase the results of a nonconformity - measure specifically tailored for small datasets. - - - References: - - [1] Guo, C., Pleiss, G., Sun, Y., & Weinberger, K.Q. (2017). On Calibration of - Modern Neural Networks. ArXiv, abs/1706.04599. - - [2] Naeini, M., Cooper, G., & Hauskrecht, M. (2015). Obtaining Well Calibrated - Probabilities Using Bayesian Binning. Proceedings of the AAAI Conference on Artificial - Intelligence. AAAI Conference on Artificial Intelligence, 2015, 2901-2907 . - - [3] Maddox, W., Garipov, T., Izmailov, P., Vetrov, D., & Wilson, A. (2019). A - Simple Baseline for Bayesian Uncertainty in Deep Learning. NeurIPS. - - [4] Papadopoulos, H., Vovk, V., & Gammerman, A. (2007). Conformal Prediction with - Neural Networks. 19th IEEE International Conference on Tools with Artificial Intelligence - (ICTAI 2007), 2, 388-395.' - link: https://mindsdb.com - poster_link: https://s3.amazonaws.com/assets.pytorch.org/pted2021/posters/I8.png - section: I8 - thumbnail_link: https://s3.amazonaws.com/assets.pytorch.org/pted2021/posters/thumb-I8.png - title: Model agnostic confidence estimation with conformal predictors for AutoML -- authors: - - Derek Bouius - categories: - - Database & AI Accelerators - description: - AMD Instinct GPUs are enabled with the upstream PyTorch repository - via the ROCm open software platform. Now users can also easily download the installable - Python package, built from the upstream PyTorch repository and hosted on pytorch.org. - Notably, it includes support for distributed training across multiple GPUs and - supports accelerated mixed precision training. AMD also provides hardware support - for the PyTorch community build to help develop and maintain new features. This - poster will highlight some of the work that has gone into enabling PyTorch support. - link: https://www.amd.com/rocm - poster_link: https://s3.amazonaws.com/assets.pytorch.org/pted2021/posters/K8.png - section: K8 - thumbnail_link: https://s3.amazonaws.com/assets.pytorch.org/pted2021/posters/thumb-K8.png - title: - "Enabling PyTorch on AMD Instinct\u2122 GPUs with the AMD ROCm\u2122 Open\ - \ Software Platform" -- authors: - - DeepSpeed Team Microsoft Corporation - categories: - - Distributed Training - description: - "In the poster (and a talk during the breakout session), we will present - three aspects of DeepSpeed (https://github.com/microsoft/DeepSpeed), a deep learning - optimization library based on PyTorch framework: 1) How we overcome the GPU memory - barrier by ZeRO-powered data parallelism. 
2) How we overcome the network bandwidth - barrier by 1-bit Adam and 1-bit Lamb compressed optimization algorithms. 3) How - we overcome the usability barrier by integration with Azure ML, HuggingFace, and - PyTorch Lightning." - link: "" - poster_link: https://s3.amazonaws.com/assets.pytorch.org/pted2021/posters/E1.png - section: E1 - thumbnail_link: https://s3.amazonaws.com/assets.pytorch.org/pted2021/posters/thumb-E1.png - title: "DeepSpeed: Shattering barriers of deep learning speed & scale" -- authors: - - Stephanie Kirmer - - Hugo Shi - categories: - - Distributed Training - description: - We have developed a library that helps simplify the task of multi-machine - parallel training for PyTorch models, bringing together the power of PyTorch DDP - with Dask for parallelism on GPUs. Our poster describes the library and its core - function, and demonstrates how the multi-machine training process works in practice. - link: https://github.com/saturncloud/dask-pytorch-ddp - section: E2 - title: - "Dask PyTorch DDP: A new library bringing Dask parallelization to PyTorch - training" -- authors: - - Vignesh Gopakumar - categories: - - Distributed Training - description: - Solving PDEs using Neural Networks are often ardently laborious as - it requires training towards a well-defined solution, i.e. global minima for a - network architecture - objective function combination. For a family of complex - PDEs, Physics Informed neural networks won't offer much in comparison to traditional - numerical methods as their global minima becomes more and more intractable. We - propose a modified approach that hinges on continual and parametrised learning - that can create more general PINNs that can solve for a variety of PDE scenarios - rather than solving for a well-defined case. We believe that this brings Neural - Network based PDE solvers in comparison to numerical solvers. - link: "" - poster_link: https://s3.amazonaws.com/assets.pytorch.org/pted2021/posters/E3.png - section: E3 - thumbnail_link: https://s3.amazonaws.com/assets.pytorch.org/pted2021/posters/thumb-E3.png - title: Optimising Physics Informed Neural Networks. -- authors: - - Mandeep Baines - - Shruti Bhosale - - Vittorio Caggiano - - Benjamin Lefaudeux - - Vitaliy Liptchinsky - - Naman Goyal - - Siddhardth Goyal - - Myle Ott - - Sam Sheifer - - Anjali Sridhar - - Min Xu - categories: - - Distributed Training - description: - 'FairScale is a library that extends basic PyTorch capabilities while - adding new SOTA techniques for high performance and large scale training on one - or multiple machines. FairScale makes available the latest distributed training - techniques in the form of composable modules and easy to use APIs. - - - Machine Learning (ML) training at scale traditionally means data parallelism to - reduce training time by using multiple devices to train on larger batch size. - Nevertheless, with the recent increase of ML models sizes data parallelism is - no longer enough to satisfy all "scaling" needs. FairScale provides several options - to overcome some of the limitations to scale. - - - For scaling training that is bottlenecked by memory (optimizer state, intermediate - activations, parameters), FairScale provides APIs that have implemented optimizer, - gradient and parameter sharding. This will allow users to train large models using - devices in a more memory efficient manner. 
- - - To overcome the memory required for large models FairScale provides various flavors - of pipeline and model parallelism, MOE (Mixture Of Experts) layer, and Offload - models. Those methods allow performing computation on only shards of the models - across multiple devices with micro batches of data to maximize device efficiency. - - - FairScale also provides modules to aid users to scale batch size effectively without - changing their existing learning rate hyperparameter - AdaScale - and save memory - with checkpoint activation of intermediate layers. - - - FairScale has also been integrated into PyTorch Lightning, HuggingFace, FairSeq, - VISSL, and MMF to enable users of those frameworks to take advantage of its features.' - link: "" - poster_link: https://s3.amazonaws.com/assets.pytorch.org/pted2021/posters/F1.png - section: F1 - thumbnail_link: https://s3.amazonaws.com/assets.pytorch.org/pted2021/posters/thumb-F1.png - title: - FairScale-A general purpose modular PyTorch library for high performance - and large scale training -- authors: - - Aurick Qiao - - Sang Keun Choe - - Suhas Jayaram Subramanya - - Willie Neiswanger - - Qirong Ho - - Hao Zhang - - Gregory R. Ganger - - Eric P. Xing - categories: - - Distributed Training - description: - "AdaptDL is an open source framework and scheduling algorithm that - directly optimizes cluster-wide training performance and resource utilization. - By elastically re-scaling jobs, co-adapting batch sizes and learning rates, and - avoiding network interference, AdaptDL improves shared-cluster training compared - with alternative schedulers. AdaptDL can automatically determine the optimal number - of resources given a job's need. It will efficiently add or remove resources - dynamically to ensure the highest-level performance. The AdaptDL scheduler will - automatically figure out the most efficient number of GPUs to allocate to your - job, based on its scalability. When the cluster load is low, your job can dynamically - expand to take advantage of more GPUs. AdaptDL offers an easy-to-use API to make - existing PyTorch training code elastic with adaptive batch sizes and learning - rates. - - Showcase: Distributed training and Data Loading" - link: "" - poster_link: https://s3.amazonaws.com/assets.pytorch.org/pted2021/posters/F2.png - section: F2 - thumbnail_link: https://s3.amazonaws.com/assets.pytorch.org/pted2021/posters/thumb-F2.png - title: - "AdaptDL: An Open-Source Resource-Adaptive Deep Learning Training/Scheduling - Framework" -- authors: - - Natalie Kershaw - categories: - - Distributed Training - description: - "As deep learning models, especially transformer models, get bigger - and bigger, reducing training time becomes both a financial and environmental - imperative. ONNX Runtime can accelerate large-scale distributed training of PyTorch - transformer models with a one-line code change (in addition to import statements - ;-)) Adding in the DeepSpeed library improves training speed even more. - - - With the new ORTModule API, you wrap an existing torch.nn.Module, and have us - automatically: export the model as an ONNX computation graph; compile and optimize - it with ONNX Runtime; and integrate it into your existing training script. - - - In this poster, we demonstrate how to fine-tune a popular HuggingFace model and - show the performance improvement on a multi-GPU cluster in the Azure Machine - Learning cloud service."
- link: https://aka.ms/pytorchort - section: G1 - title: - "Accelerate PyTorch large model training with ONNX Runtime: just add one - line of code!" -- authors: - - Jack Cao - - Daniel Sohn - - Zak Stone - - Shauheen Zahirazami - categories: - - Distributed Training - description: - PyTorch / XLA enables users to train PyTorch models on XLA devices - including Cloud TPUs. Cloud TPU VMs now provide direct access to TPU host machines - and hence offer much greater flexibility in addition to making debugging easier - and reducing data transfer overheads. PyTorch / XLA has now full support for this - new architecture. A new profiling tool has also been developed to enable better - profiling of PyTorch / XLA. These improvements not only make it much easier to - develop models but also reduce the cost of large-scale PyTorch / XLA training - runs on Cloud TPUs. - link: http://goo.gle/pt-xla-tpuvm-signup - poster_link: https://s3.amazonaws.com/assets.pytorch.org/pted2021/posters/G2.png - section: G2 - thumbnail_link: https://s3.amazonaws.com/assets.pytorch.org/pted2021/posters/thumb-G2.png - title: PyTorch/XLA with new Cloud TPU VMs and Profiler -- authors: - - Ari Bornstein - categories: - - Frontend & Experiment Manager - description: - PyTorch Lightning reduces the engineering boilerplate and resources - required to implement state-of-the-art AI. Organizing PyTorch code with Lightning - enables seamless training on multiple-GPUs, TPUs, CPUs, and the use of difficult - to implement best practices such as model sharding, 16-bit precision, and more, - without any code changes. In this poster, we will use practical Lightning examples - to demonstrate how to train Deep Learning models with less boilerplate. - link: https://www.pytorchlightning.ai/ - poster_link: https://s3.amazonaws.com/assets.pytorch.org/pted2021/posters/E4.png - section: E4 - thumbnail_link: https://s3.amazonaws.com/assets.pytorch.org/pted2021/posters/thumb-E4.png - title: "PyTorch Lightning: Deep Learning without the Boilerplate" -- authors: - - Jiong Gong - - Nikita Shustrov - - Eikan Wang - - Jianhui Li - - Vitaly Fedyunin - categories: - - Frontend & Experiment Manager - description: - "Intel and Facebook collaborated to enable BF16, a first-class data\ - \ type in PyTorch, and a data type that are accelerated natively with the 3rd\ - \ Gen Intel\xAE Xeon\xAE scalable processors. This poster introduces the latest\ - \ SW advancements added in Intel Extension for PyTorch (IPEX) on top of PyTorch\ - \ and the oneAPI DNN library for ease-of-use and high-performance BF16 DL compute\ - \ on CPU. With these SW advancements, we demonstrated ease-of-use IPEX user-facing\ - \ API, and we also showcased 1.55X-2.42X speed-up with IPEX BF16 training over\ - \ FP32 with the stock PyTorch and 1.40X-4.26X speed-up with IPEX BF16 inference\ - \ over FP32 with the stock PyTorch." - link: https://github.com/intel/intel-extension-for-pytorch - poster_link: https://s3.amazonaws.com/assets.pytorch.org/pted2021/posters/E5.png - section: E5 - thumbnail_link: https://s3.amazonaws.com/assets.pytorch.org/pted2021/posters/thumb-E5.png - title: Accelerate PyTorch with IPEX and oneDNN using Intel BF16 Technology -- authors: - - Robin Lobel - categories: - - Frontend & Experiment Manager - description: - TorchStudio is a standalone software based on PyTorch and LibTorch. - It aims to simplify the creation, training and iterations of PyTorch models. It - runs locally on Windows, Ubuntu and macOS. 
It can load, analyze and explore PyTorch - datasets from the TorchVision or TorchAudio categories, or custom datasets with - any number of inputs and outputs. PyTorch models can then be loaded and written - from scratch, analyzed, and trained using local hardware. Trainings can be run - simultaneously and compared to identify the best performing models, which can then - be exported as trained TorchScript or ONNX models. - link: https://torchstudio.ai/ - poster_link: https://s3.amazonaws.com/assets.pytorch.org/pted2021/posters/F4.png - section: F4 - thumbnail_link: https://s3.amazonaws.com/assets.pytorch.org/pted2021/posters/thumb-F4.png - title: TorchStudio, a machine learning studio software based on PyTorch -- authors: - - Jieru Hu - - "Omry Yadan " - categories: - - Frontend & Experiment Manager - description: - "Hydra is an open source framework for configuring and launching research - Python applications. Key features: - Compose and override your config dynamically - to get the perfect config for each run - Run on remote clusters like SLURM and - AWS without code changes - Perform basic grid search and hyperparameter optimization - without code changes - Command line tab completion for your dynamic config. And - more." - link: "" - poster_link: https://s3.amazonaws.com/assets.pytorch.org/pted2021/posters/F5.png - section: F5 - thumbnail_link: https://s3.amazonaws.com/assets.pytorch.org/pted2021/posters/thumb-F5.png - title: Hydra Framework -- authors: - - Victor Fomin - - Sylvain Desroziers - - Taras Savchyn - categories: - - Frontend & Experiment Manager - description: - This poster intends to give a brief but illustrative overview of what - PyTorch-Ignite can offer for Deep Learning enthusiasts, professionals and researchers. - Following the same philosophy as PyTorch, PyTorch-Ignite aims to keep it simple, - flexible and extensible but performant and scalable. Throughout this poster, we - will introduce the basic concepts of PyTorch-Ignite, its API and features it offers. - We also assume that the reader is familiar with PyTorch. - link: "" - poster_link: https://s3.amazonaws.com/assets.pytorch.org/pted2021/posters/G4.png - section: G4 - thumbnail_link: https://s3.amazonaws.com/assets.pytorch.org/pted2021/posters/thumb-G4.png - title: "PyTorch-Ignite: training common things easy and the hard things possible" -- authors: - - Sanzhar Askaruly - - Nurbolat Aimakov - - Alisher Iskakov - - Hyewon Cho - categories: - - Medical & Healthcare - description: - Deep learning has transformed many aspects of industrial pipelines - recently. Scientists involved in biomedical imaging research are also benefiting - from the power of AI to tackle complex challenges. Although the academic community - has widely accepted image processing tools such as scikit-image and ImageJ, there - is still a need for a tool which integrates deep learning into biomedical image - analysis. We propose a minimal, but convenient Python package based on PyTorch - with common deep learning models, extended by flexible trainers and medical datasets.
- link: https://github.com/tuttelikz/farabio - poster_link: https://s3.amazonaws.com/assets.pytorch.org/pted2021/posters/H4.png - section: H4 - thumbnail_link: https://s3.amazonaws.com/assets.pytorch.org/pted2021/posters/thumb-H4.png - title: Farabio - Deep Learning Toolkit for Biomedical Imaging -- authors: - - Michael Zephyr - - Prerna Dogra Richard Brown - - Wenqi Li - - Eric Kerfoot - categories: - - Medical & Healthcare - description: - "Healthcare image analysis for both radiology and pathology is increasingly\ - \ being addressed with deep-learning-based solutions. These applications have\ - \ specific requirements to support various imaging modalities like MR, CT, ultrasound,\ - \ digital pathology, etc. It is a substantial effort for researchers in the field\ - \ to develop custom functionalities to handle these requirements. Consequently,\ - \ there has been duplication of effort, and as a result, researchers have incompatible\ - \ tools, which makes it hard to collaborate.\n \nMONAI stands for Medical Open\ - \ Network for AI. Its mission is to accelerate the development of healthcare imaging\ - \ solutions by providing domain-specialized building blocks and a common foundation\ - \ for the community to converge in a native PyTorch paradigm." - link: https://monai.io/ - poster_link: https://s3.amazonaws.com/assets.pytorch.org/pted2021/posters/H5.png - section: H5 - thumbnail_link: https://s3.amazonaws.com/assets.pytorch.org/pted2021/posters/thumb-H5.png - title: "MONAI: A Domain Specialized Library for Healthcare Imaging" -- authors: - - Shai Brown - - Daniel Neimark - - Maya Zohar - - Omri Bar - - Dotan Asselmann - categories: - - Medical & Healthcare - description: - "Theator is re-imagining surgery with a Surgical Intelligence platform\ - \ that leverages highly advanced AI, specifically machine learning and computer\ - \ vision technology, to analyze every step, event, milestone, and critical junction\ - \ of surgical procedures.\n\nOur platform analyzes lengthy surgical procedure\ - \ videos and extracts meaningful information, providing surgeons with highlight\ - \ reels of key moments in an operation, enhanced by annotations.\n\nAs the team\ - \ expanded, we realized that we were spending too much time manually running model\ - \ training and focusing on DevOps tasks and not enough time dedicated to core\ - \ research.\n\nTo face this, we build an automation framework composed of multiple\ - \ training pipelines using PyTorch and ClearML. Our framework automates and manages\ - \ our entire process, from model development to deployment to continuous training\ - \ for model improvement.\n\nNew data is now immediately processed and fed directly\ - \ into training pipelines \u2013 speeding up workflow, minimizing human error,\ - \ and freeing up our research team for more important tasks. Thus, enabling us\ - \ to scale our ML operation and deliver better models for our end users." 
- link: "" - poster_link: https://s3.amazonaws.com/assets.pytorch.org/pted2021/posters/I4.png - section: I4 - thumbnail_link: https://s3.amazonaws.com/assets.pytorch.org/pted2021/posters/thumb-I4.png - title: - How theator Built a Continuous Training Framework to Scale Up Its Surgical - Intelligence Platform -- authors: - - Cebere Bogdan - - Cebere Tudor - - Manolache Andrei - - Horia Paul-Ion - categories: - - Medical & Healthcare - description: - We present Q&Aid, a conversation agent that relies on a series of machine - learning models to filter, label, and answer medical questions based on a provided - image and text inputs. Q&Aid is simplifying the hospital logic backend by standardizing - it to a Health Intel Provider (HIP). A HIP is a collection of models trained on - local data that receives text and visual input, afterward filtering, labeling, - and feeding the data to the right models and generating at the end output for - the aggregator. Any hospital is identified as a HIP holding custom models and - labeling based on its knowledge. The hospitals are training and fine-tuning their - models, such as a Visual Question Answering (VQA) model, on private data (e.g. - brain anomaly segmentation). We aggregate all of the tasks that the hospitals - can provide into a single chat app, offering the results to the user. When the - chat ends, the transcript is forwarded to each hospital, a doctor being in charge - of the final decision. - link: https://qrgo.page.link/d1fQk - poster_link: https://s3.amazonaws.com/assets.pytorch.org/pted2021/posters/I5.png - section: I5 - thumbnail_link: https://s3.amazonaws.com/assets.pytorch.org/pted2021/posters/thumb-I5.png - title: "Q&Aid: A Conversation Agent Powered by PyTorch" -- authors: - - Jaden Hong - - Kevin Tran - - Tyler Lee - - Paul Lee - - Freddie Cha - - Louis Jung - - Dr. Jung Kyung Hong - - Dr. In-Young Yoon - - David Lee - categories: - - Medical & Healthcare - description: - "Sleep disorders and insomnia are now regarded as a worldwide problem.\ - \ Roughly 62% of adults worldwide feel that they don't sleep well. However, sleep\ - \ is difficult to track so it's not easy to get suitable treatment to improve\ - \ your sleep quality. Currently, the PSG (Polysomnography) is the only way to\ - \ evaluate the sleep quality accurately but it's expensive and often inaccurate\ - \ due to the first night effect. \n\nWe propose a multi-signal sleep stage classifier\ - \ for contactless sleep tracking: Sleepbot. By automating the manual PSG reading\ - \ and providing explainable analysis, Sleepbot opens a new possibility to apply\ - \ sleep staging AI in both home and hospital. With sound recorded by a smartphone\ - \ app and RF-sensed signal measured by Asleep's non-contact sleep tracker, Sleepbot\ - \ provides a clinical level of sleep stage classification. \n\nSleepbot achieved\ - \ 85.5 % accuracy in 5-class (Wake, N1, N2, N3, Rem) using PSG signals measured\ - \ from 3,700 subjects and 77 % accuracy in 3-class (Wake, Sleep, REM) classification\ - \ using only sound data measured from 1,2000 subjects." 
- link: "" - poster_link: https://s3.amazonaws.com/assets.pytorch.org/pted2021/posters/J4.png - section: J4 - thumbnail_link: https://s3.amazonaws.com/assets.pytorch.org/pted2021/posters/thumb-J4.png - title: "Sleepbot: Multi-signal Sleep Stage Classifier AI for hospital and home" -- authors: - - Akshay Agrawal - - Alnur Ali - - Stephen Boyd - categories: - - Medical & Healthcare - description: - "We present a unifying framework for the vector embedding problem: - given a set of items and some known relationships between them, we seek a representation - of the items by vectors, possibly subject to some constraints (e.g., requiring - the vectors to have zero mean and identity covariance). We want the vectors associated - with similar items to be near each other, and vectors associated with dissimilar - items to not be near, measured in Euclidean distance. We formalize this by introducing - distortion functions, defined for some pairs of the items. Our goal is to choose - an embedding that minimizes the total distortion, subject to the constraints. - We call this the minimum-distortion embedding (MDE) problem. The MDE framework - generalizes many well-known embedding methods, such as PCA, the Laplacian eigenmap, - multidimensional scaling, UMAP, and others, and also includes new types of embeddings. - - - Our accompanying software library, PyMDE, makes it easy for users to specify and - approximately solve MDE problems, enabling experimentation with well-known and - custom embeddings alike. By making use of automatic differentiation and hardware - acceleration via PyTorch, we are able to scale to very large embedding problems. - We will showcase examples of embedding real datasets, including an academic co-authorship - network, single-cell mRNA transcriptomes, US census data, and population genetics." - link: "" - section: J5 - title: "PyMDE: Minimum-Distortion Embedding" -- authors: - - "Fernando P\xE9rez-Garc\xEDa" - - Rachel Sparks - - "S\xE9bastien Ourselin" - categories: - - Medical & Healthcare - description: - "Processing of medical images such as MRI or CT presents unique challenges - compared to RGB images typically used in computer vision. These include a lack - of labels for large datasets, high computational costs, and metadata to describe - the physical properties of voxels. Data augmentation is used to artificially increase - the size of the training datasets. Training with image patches decreases the need - for computational power. Spatial metadata needs to be carefully taken into account - in order to ensure a correct alignment of volumes. - - - We present TorchIO, an open-source Python library to enable efficient loading, - preprocessing, augmentation and patch-based sampling of medical images for deep - learning. TorchIO follows the style of PyTorch and integrates standard medical - image processing libraries to efficiently process images during training of neural - networks. TorchIO transforms can be composed, reproduced, traced and extended. - We provide multiple generic preprocessing and augmentation operations as well - as simulation of MRI-specific artifacts. - - - TorchIO was developed to help researchers standardize medical image processing - pipelines and allow them to focus on the deep learning experiments. It encourages - open science, as it supports reproducibility and is version controlled so that - the software can be cited precisely. Due to its modularity, the library is compatible - with other frameworks for deep learning with medical images." 
- link: "" - poster_link: https://s3.amazonaws.com/assets.pytorch.org/pted2021/posters/K4.png - section: K4 - thumbnail_link: https://s3.amazonaws.com/assets.pytorch.org/pted2021/posters/thumb-K4.png - title: - "TorchIO: Pre-Processing & Augmentation of Medical Images for Deep Learning - Applications" -- authors: - - Laila Rasmy - - Ziqian Xie - - Degui Zhi - categories: - - Medical & Healthcare - description: - With the extensive use of electronic records and the availability of - historical patient information, predictive models that can help identify patients - at risk based on their history at an early stage can be a valuable adjunct to - clinician judgment. Deep learning models can better predict patients' outcomes - by consuming their medical history regardless of the length and the complexity - of such data. We used our Pytorch_EHR framework to train a model that can predict - COVID-19 patients' health outcomes on admission. We used the Cerner Real-world - COVID-19 (Q2) cohort which included information for 117,496 COVID patients from - 62 health systems. We used a cohort of 55,068 patients and defined our outcomes, - including mortality, intubation, and hospitalization longer than 3 days, as binary - outcomes. We fed the model all diagnoses, medications, laboratory results, - and other clinical event information available before or on their first COVID-19 - encounter admission date. We kept the data preprocessing at a minimum for convenience - and practicality, relying on the embedding layer that learns feature representations - from the large training set. Our model showed improved performance compared to - other baseline machine learning models like logistic regression (LR). Our model - showed AUROCs of 89.5%, 90.6%, and 84.3% for in-hospital mortality, - intubation, and hospitalization for more than 3 days, respectively, versus LR, which - showed 82.8%, 83.2%, and 76.8%. - link: https://github.com/ZhiGroup/pytorch_ehr - poster_link: https://s3.amazonaws.com/assets.pytorch.org/pted2021/posters/K5.png - section: K5 - thumbnail_link: https://s3.amazonaws.com/assets.pytorch.org/pted2021/posters/thumb-K5.png - title: Deep Learning Based Model to Predict COVID-19 Patients' Outcomes on Admission -- authors: - - Binghui Ouyang - - "Alexander O\u2019Connor " - categories: - - NLP & Multimodal, RL & Time Series - description: - "While Transformers have brought unprecedented improvements in the\ - \ accuracy and ease of developing NLP applications, their deployment remains challenging\ - \ due to the large size of the models and their computational complexity. \n Indeed,\ - \ until recently it has been a widespread misconception that hosting high-performance\ - \ transformer-based models was prohibitively expensive and technically challenging.\ - \ Fortunately, recent advances in both the PyTorch ecosystem and in custom hardware\ - \ for inference have created a world where models can be deployed in a cost-effective,\ - \ scalable way, without the need for complex engineering.\n\nIn this presentation,\ - \ we will discuss the use of PyTorch and AWS Inferentia to deploy production-scale\ - \ models in chatbot intent classification - a particularly relevant and demanding\ - \ scenario.
\n\nAutodesk deploys a number of transformer based models to solve\ - \ customer support issues across our channels, and our ability to provide a flexible,\ - \ high-quality machine learning solution is supported by leveraging cutting-edge\ - \ technology such as transformer based classification. Our chatbot, AVA, responds\ - \ to tens of thousands of customer interactions monthly, and we are evolving our\ - \ architecture to be supported by customer inference.\n\nWe will discuss our experience\ - \ of piloting transformer-based intent models, and present a workflow for going\ - \ from data to deployment for similar projects." - link: "" - poster_link: https://s3.amazonaws.com/assets.pytorch.org/pted2021/posters/A1.png - section: A1 - thumbnail_link: https://s3.amazonaws.com/assets.pytorch.org/pted2021/posters/thumb-A1.png - title: " Rolling out Transformers with TorchScript and Inferentia" -- authors: - - Kashif Rasul - categories: - - NLP & Multimodal, RL & Time Series - description: - PyTorchTS is a PyTorch based Probabilistic Time Series forecasting - framework that comes with state of the art univariate and multivariate models. - link: https://github.com/zalandoresearch/pytorch-ts - poster_link: https://s3.amazonaws.com/assets.pytorch.org/pted2021/posters/A2.png - section: A2 - thumbnail_link: https://s3.amazonaws.com/assets.pytorch.org/pted2021/posters/thumb-A2.png - title: "PyTorchTS: PyTorch Probabilistic Time Series Forecasting Framework" -- authors: - - Sasha Sheng - - Amanpreet Singh - categories: - - NLP & Multimodal, RL & Time Series - description: - MMF is designed from ground up to let you focus on what matters -- - your model -- by providing boilerplate code for distributed training, common datasets - and state-of-the-art pretrained baselines out-of-the-box. MMF is built on top - of PyTorch that brings all of its power in your hands. MMF is not strongly opinionated. - So you can use all of your PyTorch knowledge here. MMF is created to be easily - extensible and composable. Through our modular design, you can use specific components - from MMF that you care about. Our configuration system allows MMF to easily adapt - to your needs. - link: "" - poster_link: https://s3.amazonaws.com/assets.pytorch.org/pted2021/posters/A3.png - section: A3 - thumbnail_link: https://s3.amazonaws.com/assets.pytorch.org/pted2021/posters/thumb-A3.png - title: "MMF: A modular framework for multimodal research" -- authors: - - Dirk Groeneveld - - Akshita Bhagia - - Pete Walsh - - Michael Schmitz - categories: - - NLP & Multimodal, RL & Time Series - description: - An Apache 2.0 NLP research library, built on PyTorch, for developing - state-of-the-art deep learning models on a wide variety of linguistic tasks. - link: https://github.com/allenai/allennlp - section: B1 - title: "AllenNLP: An NLP research library for developing state-of-the-art models" -- authors: - - John Trenkle - - Jaya Kawale & Tubi ML team - categories: - - NLP & Multimodal, RL & Time Series - description: - "Tubi is one of the leading platforms providing free high-quality streaming\ - \ movies and TV shows to a worldwide audience. We embrace a data-driven approach\ - \ and leverage advanced machine learning techniques using PyTorch to enhance our\ - \ platform and business in any way we can. The Three Pillars of AVOD are the\ - \ guiding principle for our work. 
The Pillars are \nContent: all the titles we\ - \ maintain in our library\nAudience: everyone who watches titles on Tubi\nAdvertising:\ - \ ads shown to viewers on behalf of brands\n\nIn this poster, we'll focus on the\ - \ Content aspect with more details for the various use cases especially Content\ - \ Understanding. Content is an important pillar of Tubi since to be successful,\ - \ we need to look at existing titles and beyond what we already have and attempt\ - \ to understand all of the titles out in the wild and how they could benefit our\ - \ platform in some fashion. Content Understanding revolves around digesting a\ - \ rich collection of 1st- and 3rd-party data in structured (metadata) and unstructured\ - \ (text) forms and developing representations that capture the essence of those\ - \ Titles. With the analogy of linear algebra, we can say we are attempting to\ - \ project Title vectors from the universe to our tubiverse with as much fidelity\ - \ as possible in order to ascertain potential value for each target use case.\ - \ We will describe several techniques to understand content better using Pytorch." - link: "" - poster_link: https://s3.amazonaws.com/assets.pytorch.org/pted2021/posters/B2.png - section: B2 - thumbnail_link: https://s3.amazonaws.com/assets.pytorch.org/pted2021/posters/thumb-B2.png - title: "Project Spock at Tubi: Understanding Content using Deep Learning for NLP" -- authors: - - Benoit Steiner - - Chris Cummins - - Horace He - - Hugh Leather - categories: - - NLP & Multimodal, RL & Time Series - description: - "As the usage of machine learning techniques is becoming ubiquitous,\ - \ the efficient execution of neural networks is crucial to many applications.\ - \ Frameworks, such as Halide and TVM, separate the algorithmic representation\ - \ of\nthe deep learning model from the schedule that determines its implementation.\ - \ Finding good schedules, however, remains extremely challenging. Auto-tuning\ - \ methods, which search the space of valid schedules and execute each candidate\ - \ on the hardware, identify some of the best performing schedules, but the search\ - \ can take hours, hampering the productivity of deep learning practitioners. What\ - \ is needed is a method that achieves a similar performance without extensive\ - \ search, delivering the needed efficiency quickly.\n\nUsing PyTorch, we model\ - \ the scheduling process as a sequence of optimization choices, and implement\ - \ a new technique to accurately predict the expected performance of a partial\ - \ schedule using a LSTM over carefully engineered features that describe each\ - \ DNN operator and their current scheduling choices. Leveraging these predictions\ - \ we are able to make these optimization decisions greedily and, without any executions\ - \ on the target hardware, rapidly identify an efficient schedule.\nThis techniques\ - \ enables to find schedules that improve the execution performance of deep neural\ - \ networks by 2.6\xD7 over Halide and 1.5\xD7 over TVM. Moreover, our technique\ - \ completes in seconds instead of hours, making it possible to include it as\ - \ a new backend for PyTorch itself." 
- link: http://facebook.ai - poster_link: https://s3.amazonaws.com/assets.pytorch.org/pted2021/posters/B3.png - section: B3 - thumbnail_link: https://s3.amazonaws.com/assets.pytorch.org/pted2021/posters/thumb-B3.png - title: RL Based Performance Optimization of Deep Neural Networks -- authors: - - Zhenghong Liu - categories: - - NLP & Multimodal, RL & Time Series - description: - Forte is an open-source toolkit for building Natural Language Processing - workflows via assembling state-of-the-art NLP and ML technologies. This toolkit - features composable pipeline, cross-task interaction, adaptable data-model interfaces. - The highly composable design allows users to build complex NLP pipelines of a - wide range of tasks including document retrieval, information extraction, and - text generation by combining existing toolkits or customized PyTorch models. The - cross-task interaction ability allows developers to utilize the results from individual - tasks to make informed decisions. The data-model interface helps developers to - focus on building reusable PyTorch models by abstracting out domain and preprocessing - details. We show that Forte can be used to build complex pipelines, and the resulting - pipeline can be easily adapted to different domains and tasks with small changes - in the code. - link: https://github.com/asyml/forte - poster_link: https://s3.amazonaws.com/assets.pytorch.org/pted2021/posters/C1.png - section: C1 - thumbnail_link: https://s3.amazonaws.com/assets.pytorch.org/pted2021/posters/thumb-C1.png - title: A Data-Centric Framework for Composable NLP -- authors: - - Shagun Sodhani - - Amy Zhang - - Ludovic Denoyer - - Pierre-Alexandre Kamienny - - Olivier Delalleau - categories: - - NLP & Multimodal, RL & Time Series - description: - "The two key components in a multi-task RL codebase are (i) Multi-task - RL algorithms and (ii) Multi-task RL environments. We develop open-source libraries - for both components. [MTRL](https://github.com/facebookresearch/mtrl) provides - components to implement multi-task RL algorithms, and [MTEnv](https://github.com/facebookresearch/mtenv) - is a library to interface with existing multi-task RL environments and create - new ones. - - - MTRL has two building blocks: (i) single task policy and (ii) components to augment - the single-task policy for multi-task setup. The ideal workflow is to start with - a base policy and add multi-task components as they seem fit. MTRL enables algorithms - like GradNorm, Distral, HiPBMDP, PCGrad, Soft Modularization, etc. - - - MTEnv is an effort to standardize multi-task RL environments and provide better - benchmarks. We extend the Gym API to support multiple tasks, with two guiding - principles: (i) Make minimal changes to the Gym Interface (which the community - is very familiar with) and (ii) Make it easy to port existing environments to - MTEnv. Additionally, we provide a collection of commonly used multi-task RL environments - (Acrobot, Cartpole, Multitask variant of DeepMind Control Suite, Meta-World, Multi-armed - Bandit, etc.). The RL practitioner can combine its own environments with the MTEnv - wrappers to add multi-task support with a small code change. - - - MTRL and MTEnv are used in several ongoing/published works at FAIR." 
- link: http://qr.w69b.com/g/tGZSFw33G - poster_link: https://s3.amazonaws.com/assets.pytorch.org/pted2021/posters/C2.png - section: C2 - thumbnail_link: https://s3.amazonaws.com/assets.pytorch.org/pted2021/posters/thumb-C2.png - title: Environments and Baselines for Multitask Reinforcement Learning -- authors: - - Lysandre Debut - - Sylvain Gugger - - "Quentin Lhoest\_" - categories: - - NLP & Multimodal, RL & Time Series - description: - "Transfer learning has become the norm to get state-of-the-art results - in NLP. Hugging Face provides you with tools to help you on every step along the - way: - - - - A free git-based shared hub with more than 7,500 PyTorch checkpoints, and more - than 800 NLP datasets. - - - The 🤗 Datasets library, to easily download the dataset, manipulate it and prepare - it. - - - The 🤗 Tokenizers library, that provides ultra-fast tokenizers backed by Rust, - and converts text into PyTorch tensors. - - - The 🤗 Transformers library, providing more than 45 PyTorch implementations of - Transformer architectures as simple nn.Module as well as a training API. - - - The 🤗 Accelerate library, a non-intrusive API that allows you to run your raw - training loop on any distributed setup. - - - The pipeline is then simply a six-step process: select a pretrained model from - the hub, handle the data with Datasets, tokenize the text with Tokenizers, load - the model with Transformers, train it with the Trainer or your own loop powered - by Accelerate, before sharing your results with the community on the hub." - link: https://huggingface.co/models - poster_link: https://s3.amazonaws.com/assets.pytorch.org/pted2021/posters/C3.png - section: C3 - thumbnail_link: https://s3.amazonaws.com/assets.pytorch.org/pted2021/posters/thumb-C3.png - title: The Hugging Face Ecosystem -- authors: - - Manuel Pariente - - Samuele Cornell - - Jonas Haag - - Joris Cosentino - - Michel Olvera - - "Fabian-Robert St\xF6ter" - - Efthymios Tzinis - categories: - - NLP & Multimodal, RL & Time Series - description: - Asteroid is an audio source separation toolkit built with PyTorch and - PyTorch-Lightning. Inspired by the most successful neural source separation systems, - it provides all neural building blocks required to build such a system. To improve - reproducibility, recipes on common audio source separation datasets are provided, - including all the steps from data download/preparation through training to evaluation - as well as many current state-of-the-art DNN models. Asteroid exposes all levels - of granularity to the user from simple layers to complete ready-to-use models. - Our pretrained models are hosted on the asteroid-models community in Zenodo and - on the Huggingface model Hub.
Loading and using pretrained models is trivial and - sharing them is also made easy with asteroid's CLI. - link: https://asteroid-team.github.io/ - poster_link: https://s3.amazonaws.com/assets.pytorch.org/pted2021/posters/D1.png - section: D1 - thumbnail_link: https://s3.amazonaws.com/assets.pytorch.org/pted2021/posters/thumb-D1.png - title: "\_Asteroid: the Pytorch-based Audio Source Separation Toolkit for Researchers" -- authors: - - Ludovic Denoyer - - Danielle Rothermel - - Xavier Martinet - categories: - - NLP & Multimodal, RL & Time Series - description: - RLStructures is a lightweight Python library that provides simple APIs - as well as data structures that make as few assumptions as possible about the - structure of your agent or your task, while allowing for transparently executing - multiple policies on multiple environments in parallel (incl. multiple GPUs). - It thus facilitates the implementation of RL algorithms while avoiding complex - abstractions. - link: "" - poster_link: https://s3.amazonaws.com/assets.pytorch.org/pted2021/posters/D2.png - section: D2 - thumbnail_link: https://s3.amazonaws.com/assets.pytorch.org/pted2021/posters/thumb-D2.png - title: "rlstructures: A Lightweight Python Library for Reinforcement Learning Research" -- authors: - - Luis Pineda - - Brandon Amos - - Amy Zhang - - Nathan O. Lambert - - Roberto Calandra - categories: - - NLP & Multimodal, RL & Time Series - description: - Model-based reinforcement learning (MBRL) is an active area of research - with enormous potential. In contrast to model-free RL, MBRL algorithms solve tasks - by learning a predictive model of the task dynamics, and use this model to predict - the future and facilitate decision making. Many researchers have argued that MBRL - can result in lower sample complexity, better generalization, as well as safer - and more interpretable decisions. However, despite the surge in popularity and - great potential of MBRL, there is currently no widely accepted library for facilitating - research in this area. Since MBRL methods often involve the interplay of complex - components such as probabilistic ensembles, latent variable models, planning algorithms, - and even model-free methods, the lack of such a library raises the entry bar to - the field and slows down research efforts. In this work we aim to solve this problem - by introducing MBRL-Lib, a modular PyTorch toolbox specifically designed for facilitating - research on model-based reinforcement learning. MBRL-Lib provides interchangeable - options for training dynamics models and running planning algorithms, which can - then be used in a mix and match fashion to create novel MBRL methods. The library - also provides a set of utility functions to run common MBRL tasks, as well as a set - of diagnostic tools to identify potential issues while training dynamics models - and control algorithms.
- link: https://github.com/facebookresearch/mbrl-lib - poster_link: https://s3.amazonaws.com/assets.pytorch.org/pted2021/posters/D3.png - section: D3 - thumbnail_link: https://s3.amazonaws.com/assets.pytorch.org/pted2021/posters/thumb-D3.png - title: "MBRL-Lib: a PyTorch toolbox for model-based reinforcement learning research" -- authors: - - Geeta Chauhan - - Gisle Dankel - - Elena Neroslavaskaya - categories: - - Performance & Profiler - description: - Analyzing and improving large-scale deep learning model performance - is an ongoing challenge that continues to grow in importance as the model sizes - increase. Microsoft and Facebook collaborated to create a native PyTorch performance - debugging tool called PyTorch Profiler. The profiler builds on the PyTorch autograd - profiler foundation, adds a new high fidelity GPU profiling engine, and out-of-the-box - bottleneck analysis tool in Tensorboard. New Profiler delivers the simplest experience - available to date where users can profile their models without installing any - additional packages and see results immediately in Tensorboard. Until today, beginner - users of PyTorch may not have attempted to profile their models due to the task - complexity. With the new bottleneck analysis tool, they will find profiling easy - and accessible. Experienced users will be delighted by the detailed trace views - which illustrate GPU kernel execution events and their relationship to the PyTorch - operations. Come learn how to profile your PyTorch models using this new delightfully - simple tool. - link: https://pytorch.org/blog/introducing-pytorch-profiler-the-new-and-improved-performance-tool - poster_link: https://s3.amazonaws.com/assets.pytorch.org/pted2021/posters/H6.png - section: H6 - thumbnail_link: https://s3.amazonaws.com/assets.pytorch.org/pted2021/posters/thumb-H6.png - title: Introducing New PyTorch Profiler -- authors: - - Naren Dasan - categories: - - Performance & Profiler - description: - For experimentation and the development of machine learning models, - few tools are as approachable as PyTorch. However, when moving from research to - production, some of the features that make PyTorch great for development make - it hard to deploy. With the introduction of TorchScript, PyTorch has solid tooling - for addressing some of the problems of deploying PyTorch models. TorchScript removes - the dependency on Python and produces portable, self contained, static representations - of code and weights. But in addition to portability, users also look to optimize - performance in deployment. When deploying on NVIDIA GPUs, TensorRT, NVIDIA's deep - learning optimizer, provides the capability to maximize performance of workloads - by tuning the execution of models for specific target hardware. TensorRT also - provides tooling for conducting further optimization through mixed and reduced - precision execution and post training quantization (PTQ). We present TRTorch, - a compiler for PyTorch and TorchScript targeting NVIDIA GPUs, which combines the - usability of PyTorch with the performance of TensorRT and allows users to fully - optimize their inference workloads without leaving the PyTorch ecosystem. It also - simplifies conducting complex optimizations like PTQ by leveraging common PyTorch - tooling. TRTorch can be used directly from PyTorch as a TorchScript Backend, embedded - in an application or used from the command line to easily increase the performance - of inference applications. 
- link: https://nvidia.github.io/TRTorch/ - poster_link: https://s3.amazonaws.com/assets.pytorch.org/pted2021/posters/I6.png - section: I6 - thumbnail_link: https://s3.amazonaws.com/assets.pytorch.org/pted2021/posters/thumb-I6.png - title: "TRTorch: A Compiler for TorchScript Targeting NVIDIA GPUs with TensorRT" -- authors: - - Charles H. Martin - categories: - - Performance & Profiler - description: - "WeightWatcher (WW) is an open-source diagnostic tool for analyzing\ - \ Deep Neural Networks (DNN), without needing access to training or even test\ - \ data. It can be used to: analyze pre/trained PyTorch models; \ninspect models\ - \ that are difficult to train; gauge improvements in model performance; predict\ - \ test accuracies across different models; and detect potential problems when\ - \ compressing or fine-tuning pretrained models.\n\nWeightWatcher is based on theoretical\ - \ research (done jointly with UC Berkeley) into \"Why Deep Learning Works\"\ - , using ideas from Random Matrix Theory (RMT), Statistical Mechanics, and Strongly\ - \ Correlated Systems." - link: "" - poster_link: https://s3.amazonaws.com/assets.pytorch.org/pted2021/posters/J6.png - section: J6 - thumbnail_link: https://s3.amazonaws.com/assets.pytorch.org/pted2021/posters/thumb-J6.png - title: "WeightWatcher: A Diagnostic Tool for DNNs" -- authors: - - Mario Lezcano-Casado - categories: - - Performance & Profiler - description: - '"This poster presents the ""parametrizations"" feature that will be - added to PyTorch in 1.9.0. - - This feature allows for a simple implementation of methods like pruning, weight_normalization - or spectral_normalization. - - More generally, it implements a way to have ""computed parameters"". This means - that we replace a parameter `weight` in a layer with `f(weight)`, where `f` is - an arbitrary module. In other words, after putting a parametrization `f` on `layer.weight`, - `layer.weight` will return `f(weight)`. - - They implement a caching system, so that the value `f(weight)` is computed just - once during the forward pass. - - A module that implements a parametrisation may also have a `right_inverse` method. - If this method is present, it is possible to assign to a parametrised tensor. - This is useful when initialising a parametrised tensor. - - This feature can be seen as a first step towards invertible modules. In particular, - it may also help make distributions first-class citizens in PyTorch. - - Parametrisations also allow for a simple implementation of constrained optimisation. - From this perspective, a parametrisation maps an unconstrained tensor to a constrained - space such as the space of orthogonal matrices, SPD matrices, low-rank matrices... - This approach is implemented in the library GeoTorch (https://github.com/Lezcano/geotorch/)."' - link: "" - section: K6 - title: Constrained Optimization in PyTorch 1.9 Through Parametrizations -- authors: - - Richard Liaw - - Kai Fricke - - Amog Kamsetty - - Michael Galarnyk - categories: - - Platforms & Ops & Tools - description: - Ray is a popular framework for distributed Python that can be paired - with PyTorch to rapidly scale machine learning applications. Ray contains a large - ecosystem of applications and libraries that leverage and integrate with Pytorch. - This includes Ray Tune, a Python library for experiment execution and hyperparameter - tuning at any scale; RLlib, a state-of-the-art library for reinforcement learning; - and Ray Serve, a library for scalable model serving.
Together, Ray and Pytorch - are becoming the core foundation for the next generation of production machine - learning platforms. - link: https://ray.io/ - poster_link: https://s3.amazonaws.com/assets.pytorch.org/pted2021/posters/H1.png - section: H1 - thumbnail_link: https://s3.amazonaws.com/assets.pytorch.org/pted2021/posters/thumb-H1.png - title: Distributed Pytorch with Ray -- authors: - - Vincenzo Lomonaco - - Lorenzo Pellegrini Andrea Cossu - - Antonio Carta - - Gabriele Graffieti - categories: - - Platforms & Ops & Tools - description: - Learning continually from non-stationary data stream is a long sought - goal of machine learning research. Recently, we have witnessed a renewed and fast-growing - interest in Continual Learning, especially within the deep learning community. - However, algorithmic solutions are often difficult to re-implement, evaluate and - port across different settings, where even results on standard benchmarks are - hard to reproduce. In this work, we propose an open-source, end-to-end library - for continual learning based on PyTorch that may provide a shared and collaborative - code-base for fast prototyping, training and reproducible evaluation of continual - learning algorithms. - link: https://avalanche.continualai.org - poster_link: https://s3.amazonaws.com/assets.pytorch.org/pted2021/posters/H2.png - section: H2 - thumbnail_link: https://s3.amazonaws.com/assets.pytorch.org/pted2021/posters/thumb-H2.png - title: "Avalanche: an End-to-End Library for Continual Learning based on PyTorch" -- authors: - - Hong Xu - categories: - - Platforms & Ops & Tools - description: - IBM Z is a hardware product line for mission-critical applications, - such as finance and health applications. It employs its own CPU architecture, - which PyTorch does not officially support. In this poster, we discuss why it is - important to support PyTorch on Z. Then, we show our prebuilt minimal PyTorch - package for IBM Z. Finally, we demonstrate our continuing commitment to make more - PyTorch features available on IBM Z. - link: https://codait.github.io/pytorch-on-z - poster_link: https://s3.amazonaws.com/assets.pytorch.org/pted2021/posters/H3.png - section: H3 - thumbnail_link: https://s3.amazonaws.com/assets.pytorch.org/pted2021/posters/thumb-H3.png - title: PyTorch on IBM Z and LinuxONE (s390x) -- authors: - - Dr. Ariel Biller - categories: - - Platforms & Ops & Tools - description: - "Both from sanity considerations and the productivity perspective,\ - \ Data Scientists, ML engineers, Graduate students, and other research-facing\ - \ roles are all starting to adopt best-practices from production-grade MLOps.\n\ - \nHowever, most toolchains come with a hefty price of extra code and maintenance,\ - \ which reduces the actual time available for R&D. We will show an alternative\ - \ approach using ClearML, the open-source MLOps solution.\n\nIn this \"best-practices\"\ - \ poster, we will overview the \"must-haves\" of R&D-MLOPs: \nOrchestration, Automation,\ - \ and Reproducibility. These enable easy remote execution through magically reproducible\ - \ setups and even custom, reusable, bottom-up pipelines.\n\nWe will take a single\ - \ example and schematically transform it from the \"as downloaded from GitHub\"\ - \ stage to a fully-fledged, scalable, version-controlled, parameterizable R&D\ - \ pipeline. We will measure the number of changes needed to the codebase and provide\ - \ evidence of real low-cost integration. 
All code, logs, and metrics will be available\ - \ as supporting information." - link: "" - poster_link: https://s3.amazonaws.com/assets.pytorch.org/pted2021/posters/I1.png - section: I1 - thumbnail_link: https://s3.amazonaws.com/assets.pytorch.org/pted2021/posters/thumb-I1.png - title: "The Fundamentals of MLOps for R&D: Orchestration, Automation, Reproducibility" -- authors: - - Masashi Sode - - Akihiko Fukuchi - - Yoki Yabe - - Yasufumi Nakata - categories: - - Platforms & Ops & Tools - description: - "Is your machine learning model fair enough to be used in your system?\ - \ What if a recruiting AI discriminates on gender and race? What if the accuracy\ - \ of medical AI depends on a person's annual income or on the GDP of the country\ - \ where it is used? Today's AI has the potential to cause such problems. In recent\ - \ years, fairness in machine learning has received increasing attention. If current\ - \ machine learning models used for decision making may cause unfair discrimination,\ - \ developing a fair machine learning model is an important goal in many areas,\ - \ such as medicine, employment, and politics. Despite the importance of this goal\ - \ to society, as of 2020, there was no PyTorch\xB9 project incorporating fairness\ - \ into a machine learning model. To solve this problem, we created FairTorch at\ - \ the PyTorch Summer Hackathon 2020.\n\nFairTorch provides a tool to mitigate\ - \ the unfairness of machine learning models. A unique feature of our tool is that\ - \ it allows you to add a fairness constraint to your model by adding only a few\ - \ lines of code, using the fairness criteria provided in the library." - link: https://github.com/wbawakate/fairtorch - poster_link: https://s3.amazonaws.com/assets.pytorch.org/pted2021/posters/I2.png - section: I2 - thumbnail_link: https://s3.amazonaws.com/assets.pytorch.org/pted2021/posters/thumb-I2.png - title: "FairTorch: Aspiring to Mitigate the Unfairness of Machine Learning Models" -- authors: - - Thomas Viehmann - - Luca Antiga - categories: - - Platforms & Ops & Tools - description: - "When machine learning models are deployed to solve a given task, a - crucial question is whether they are actually able to perform as expected. TorchDrift - addresses one aspect of the answer, namely drift detection, or whether the information - flowing through our models - either probed at the input, output or somewhere in-between - - is still consistent with the one it was trained and evaluated on. In a nutshell, - TorchDrift is designed to be plugged into PyTorch models and check whether they - are operating within spec. - - TorchDrift's principles apply PyTorch's motto _from research to production_ - to drift detection: We provide a library of methods that canbe used as baselines - or building blocks for drift detection research, as well as provide practitioners - deploying PyTorch models in production with up-to-date methods and educational - material for building the necessary statistical background. Here we introduce - TorchDrift with an example illustrating the underlying two-sample tests. We show - how TorchDrift can be integrated in high-performance runtimes such as TorchServe - or RedisAI, to enable drift detection in real-world applications thanks to the - PyTorch JIT." 
- link: https://torchdrift.org/ - poster_link: https://s3.amazonaws.com/assets.pytorch.org/pted2021/posters/I3.png - section: I3 - thumbnail_link: https://s3.amazonaws.com/assets.pytorch.org/pted2021/posters/thumb-I3.png - title: "TorchDrift: Drift Detection for PyTorch" -- authors: - - Quincy Chen - - Arjun Bhargava - - Sudeep Pillai - - Marcus Pan - - Chao Fang - - Chris Ochoa - - Adrien Gaidon - - Kuan-Hui Lee - - Wolfram Burgard - categories: - - Platforms & Ops & Tools - description: - "Modern machine learning for autonomous vehicles requires a fundamentally\ - \ different infrastructure and production lifecycle from their standard software\ - \ continuous-integration/continuous-deployment counterparts. At Toyota Research\ - \ Institute (TRI), we have developed \u200BOuroboros\u200B - a modern ML platform\ - \ that supports the end-to-end lifecycle of all ML models delivered to TRI's autonomous\ - \ vehicle fleets. We envision that all ML models delivered to our fleet undergo\ - \ a systematic and rigorous treatment. Ouroboros delivers several essential features\ - \ including:\na. ML dataset governance and infrastructure-as-code\u200B that ensures\ - \ the traceability, reproducibility, standardization, and fairness for all ML\ - \ datasets and models procedurally generated and delivered to the TRI fleet.\n\ - b. Unified ML dataset and model management:\u200B A unified and streamlined workflow\ - \ for ML dataset curation, label management, and model development that supports\ - \ several key ML models delivered to the TRI fleet today.\nc. A Large-scale Multi-task,\ - \ Multi-modal Dataset for Automated Driving\u200B that supports the development\ - \ of various models today, including 3D object detection, 2D object detection,\ - \ 2D BeVFlow, Panoptic Segmentation;\nd. Orchestrated ML workflows\u200B to stand\ - \ up scalable ML applications such as push-button re-training solutions, ML CI/CD\ - \ pipelines, Dataset Curation workflows, and Auto-labelling pipelines, leveraging\ - \ the most up-to-date cloud tools available, along their lifecycles, ensuring\ - \ strong governance on building reusable, reproducible, robust, traceable, and\ - \ fair ML models for the production driving setting. By following the best MLOps\ - \ practices, we expect our platform to lay the foundation for continuous life-long\ - \ learning in our autonomous vehicle fleets and accelerate the transition from\ - \ research to production." - link: https://github.com/TRI-ML - poster_link: https://s3.amazonaws.com/assets.pytorch.org/pted2021/posters/J1.png - section: J1 - thumbnail_link: https://s3.amazonaws.com/assets.pytorch.org/pted2021/posters/thumb-J1.png - title: "Ouroboros: MLOps for Automated Driving" -- authors: - - Yujian He - categories: - - Platforms & Ops & Tools - description: - carefree-learn makes PyTorch accessible to people who are familiar - with machine learning but not necessarily PyTorch. By having already implemented - all the pre-processing and post-processing under the hood, users can focus on - implementing the core machine learning algorithms / models with PyTorch and test - them on various datasets. By having designed the whole structure carefully, users - can easily customize every block in the whole pipeline, and can also 'combine' - the implemented blocks to 'construct' new models without effort. 
By having carefully - made abstractions, users can adapt it to their specific downstream tasks, such - as quantitative trading (in fact I've already implemented one for my company and - it works pretty well XD). carefree-learn handles distributed training carefully, - so users can either run multiple tasks at the same time, or run a huge model with - DDP in one line of code. carefree-learn also integrates with mlflow and supports - exporting to ONNX, which means it is ready for production to some extent. - link: "" - poster_link: https://s3.amazonaws.com/assets.pytorch.org/pted2021/posters/J2.png - section: J2 - thumbnail_link: https://s3.amazonaws.com/assets.pytorch.org/pted2021/posters/thumb-J2.png - title: "carefree-learn: Tabular Datasets \u2764\uFE0F PyTorch" -- authors: - - Wenwei Zhang - categories: - - Platforms & Ops & Tools - description: - "The OpenMMLab project builds open-source toolboxes for Artificial Intelligence - (AI). It aims to 1) provide high-quality codebases to reduce the difficulties - in algorithm reimplementation; 2) provide a complete research platform to accelerate - research production; and 3) shorten the gap between research production and - industrial applications. Based on PyTorch, OpenMMLab develops MMCV to provide - unified abstract training APIs and common utils, which serves as a foundation - of 15+ toolboxes and 40+ datasets. - - - Since the initial release in October 2018, OpenMMLab has released 15+ toolboxes - that cover 10+ directions, implement 100+ algorithms, and contain 1000+ pre-trained - models. With a tighter collaboration with the community, OpenMMLab will release - more toolboxes with more flexible and easy-to-use training frameworks in the future." - link: https://openmmlab.com/ - poster_link: https://s3.amazonaws.com/assets.pytorch.org/pted2021/posters/J3.png - section: J3 - thumbnail_link: https://s3.amazonaws.com/assets.pytorch.org/pted2021/posters/thumb-J3.png - title: "OpenMMLab: An Open-Source Algorithm Platform for Computer Vision" -- authors: - - Sergey Kolesnikov - categories: - - Platforms & Ops & Tools - description: - "For the last three years, Catalyst-Team and collaborators have been\ - \ working on Catalyst\u200A - a high-level PyTorch framework for Deep Learning Research\ - \ and Development. It focuses on reproducibility, rapid experimentation, and codebase\ - \ reuse so you can create something new rather than write yet another train loop.\ - \ You get metrics, model checkpointing, advanced logging, and distributed training\ - \ support without the boilerplate and low-level bugs." - link: https://catalyst-team.com - poster_link: https://s3.amazonaws.com/assets.pytorch.org/pted2021/posters/K2.png - section: K2 - thumbnail_link: https://s3.amazonaws.com/assets.pytorch.org/pted2021/posters/thumb-K2.png - title: "Catalyst \u2013 Accelerated deep learning R&D" -- authors: - - Anton Obukhov - categories: - - Platforms & Ops & Tools - description: - "Evaluation of generative models such as GANs is an important part\ - \ of deep learning research. In 2D image generation, three approaches became\ - \ widespread: Inception Score, Fr\xE9chet Inception Distance, and Kernel Inception\ - \ Distance. Despite having a clear mathematical and algorithmic description, these\ - \ metrics were initially implemented in TensorFlow and inherited a few properties\ - \ of the framework itself, such as a specific implementation of the interpolation\ - \ function. 
These design decisions were effectively baked into the evaluation\ - \ protocol and became an inherent part of the specification of the metrics. As\ - \ a result, researchers wishing to compare against state of the art in generative\ - \ modeling are forced to perform an evaluation using the original metric authors'\ - \ codebases. Reimplementations of metrics in PyTorch and other frameworks exist,\ - \ but they do not provide a proper level of fidelity, thus making them unsuitable\ - \ for reporting results and comparing them to other methods. This software aims\ - \ to provide epsilon-exact implementations of the said metrics in PyTorch and\ - \ remove inconveniences associated with generative model evaluation and development.\ - \ All the evaluation pipeline steps are correctly tested, with relative errors\ - \ and sources of remaining non-determinism summarized in sections below.\nTLDR;\ - \ fast and reliable GAN evaluation in PyTorch" - link: https://github.com/toshas/torch-fidelity - poster_link: https://s3.amazonaws.com/assets.pytorch.org/pted2021/posters/K3.png - section: K3 - thumbnail_link: https://s3.amazonaws.com/assets.pytorch.org/pted2021/posters/thumb-K3.png - title: High-fidelity performance metrics for generative models in PyTorch -- authors: - - Jona Raphael (jona@skytruth.org) - - Ben Eggleston - - Ryan Covington - - Tatianna Evanisko - - John Amos - categories: - - Vision - description: - "Operational oil discharges from ships, also known as \"bilge dumping,\"\ - \ have been identified as a major source of petroleum products entering our oceans,\ - \ cumulatively exceeding the largest oil spills, such as the Exxon Valdez and\ - \ Deepwater Horizon spills, even when considered over short time spans. However,\ - \ we still don't have a good estimate of\n\u25CF How much oil is being discharged;\n\ - \u25CF Where the discharge is happening;\n\u25CF Who the responsible vessels are.\n\ - This makes it difficult to prevent and effectively respond to oil pollution that\ - \ can damage our marine and coastal environments and economies that depend on\ - \ them.\n\nIn this poster we will share SkyTruth's recent work to address these\ - \ gaps using machine learning tools to detect oil pollution events and identify\ - \ the responsible vessels when possible. We use a convolutional neural network\ - \ (CNN) in a ResNet-34 architecture to perform pixel segmentation on all incoming\ - \ Sentinel-1 synthetic aperture radar (SAR) imagery to classify slicks. Despite\ - \ the satellites' incomplete oceanic coverage, we have been detecting an average\ - \ of 135 vessel slicks per month, and have identified several geographic hotspots\ - \ where oily discharges are occurring regularly. For the images that capture a\ - \ vessel in the act of discharging oil, we rely on an Automatic Identification\ - \ System (AIS) database to extract details about the ships, including vessel type\ - \ and flag state. We will share our experience\n\u25CF Making sufficient training\ - \ data from inherently sparse satellite image datasets;\n\u25CF Building a computer\ - \ vision model using PyTorch and fastai;\n\u25CF Fully automating the process\ - \ in the Amazon Web Services (AWS) cloud.\nThe application has been running continuously\ - \ since August 2020, has processed over 380,000 Sentinel-1 images, and has populated\ - \ a database with more than 1100 high-confidence slicks from vessels. 
We will\ - \ be discussing preliminary results from this dataset and remaining challenges\ - \ to be overcome.\nLearn more at https://skytruth.org/bilge-dumping/" - link: "" - poster_link: https://s3.amazonaws.com/assets.pytorch.org/pted2021/posters/A6.png - section: A6 - thumbnail_link: https://s3.amazonaws.com/assets.pytorch.org/pted2021/posters/thumb-A6.png - title: Using Satellite Imagery to Identify Oceanic Oil Pollution -- authors: - - Tanishq Abraham - categories: - - Vision - description: - Unpaired image-to-image translation algorithms have been used for various - computer vision tasks like style transfer and domain adaptation. Such algorithms - are highly attractive because they alleviate the need for the collection of paired - datasets. In this poster, we demonstrate UPIT, a novel fastai/PyTorch package - (built with nbdev) for unpaired image-to-image translation. It implements various - state-of-the-art unpaired image-to-image translation algorithms such as CycleGAN, - DualGAN, UNIT, and more. It enables simple training and inference on unpaired - datasets. It also comes with implementations of commonly used metrics like FID, - KID, and LPIPS. It also comes with Weights-and-Biases integration for easy experiment - tracking. Since it is built on top of fastai and PyTorch, it comes with support - for mixed-precision and multi-GPU training. It is highly flexible, and custom - dataset types, models, and metrics can be used as well. With UPIT, training and - applying unpaired image-to-image translation only takes a few lines of code. - link: https://github.com/tmabraham/UPIT - poster_link: https://s3.amazonaws.com/assets.pytorch.org/pted2021/posters/A7.png - section: A7 - thumbnail_link: https://s3.amazonaws.com/assets.pytorch.org/pted2021/posters/thumb-A7.png - title: "UPIT: A fastai Package for Unpaired Image-to-Image Translation" -- authors: - - Aaron Adcock - - Bo Xiong - - Christoph Feichtenhofer - - Haoqi Fan - - Heng Wang - - Kalyan Vasudev Alwala - - Matt Feiszli - - Tullie Murrell - - Wan-Yen Lo - - Yanghao Li - - Yilei Li - - "Zhicheng Yan" - categories: - - Vision - description: - PyTorchVideo is the new Facebook AI deep learning library for video - understanding research. It contains a variety of state-of-the-art pretrained video - models, datasets, augmentations, and tools for video understanding. PyTorchVideo provides - efficient video components and supports accelerated inference on mobile devices. - link: https://pytorchvideo.org/ - poster_link: https://s3.amazonaws.com/assets.pytorch.org/pted2021/posters/A8.png - section: A8 - thumbnail_link: https://s3.amazonaws.com/assets.pytorch.org/pted2021/posters/thumb-A8.png - title: "PyTorchVideo: A Deep Learning Library for Video Understanding" -- authors: - - A. Speiser - - "L-R. M\xFCller" - - P. Hoess - - U. Matti - - C. J. Obara - - J. H. Macke - - J. Ries - - S. C. Turaga - categories: - - Vision - description: - Single-molecule localization microscopy (SMLM) has had remarkable success - in imaging cellular structures with nanometer resolution, but the need for activating - only single isolated emitters limits imaging speed and labeling density. Here, - we overcome this major limitation using deep learning. We developed DECODE, a - computational tool that can localize single emitters at high density in 3D with - the highest accuracy for a large range of imaging modalities and conditions. 
In - a public software benchmark competition, it outperformed all other fitters on - 12 out of 12 data-sets when comparing both detection accuracy and localization - error, often by a substantial margin. DECODE allowed us to take live-cell SMLM - data with reduced light exposure in just 3 seconds and to image microtubules at - ultra-high labeling density. Packaged for simple installation and use, DECODE - will enable many labs to reduce imaging times and increase localization density - in SMLM. - link: http://github.com/turagalab/decode - section: B6 - title: - Deep Learning Enables Fast and Dense Single-Molecule Localization with High - Accuracy -- authors: - - "Abraham S\xE1nchez" - - Guillermo Mendoza - - "E. Ulises Moya-S\xE1nchez" - categories: - - Vision - description: - "We draw inspiration from the cortical area V1. We try to mimic its - main processing properties by means of: quaternion local phase/orientation to - compute line and edge detection in a specific direction. We analyze how this - layer is robust, thanks to its geometry, to large illumination and brightness changes." - link: https://gitlab.com/ab.sanchezperez/pytorch-monogenic - poster_link: https://s3.amazonaws.com/assets.pytorch.org/pted2021/posters/B7.png - section: B7 - thumbnail_link: https://s3.amazonaws.com/assets.pytorch.org/pted2021/posters/thumb-B7.png - title: A Robust PyTorch Trainable Entry Convnet Layer in Fourier Domain -- authors: - - "Fran\xE7ois-Guillaume Fernandez" - - Mateo Lostanlen - - Sebastien Elmaleh - - Bruno Lenzi - - Felix Veith - - and 15+ more contributors - categories: - - Vision - description: - '"PyroNear is a non-profit organization composed solely of volunteers - that was created in late 2019. Our core belief is that recent technological developments - can support the cohabitation between mankind & its natural habitat. We strive - towards high-performing, accessible & affordable tech-solutions for protection - against natural hazards. More specifically, our first efforts are focused on wildfire - protection by increasing the coverage of automatic detection systems. - - - Our ongoing initiative has now gathered dozens of volunteers to deliver the following - main contributions: - - - Computer Vision: compiling open-source models and datasets (soon to be published) - for vision tasks related to wildfire detection - - - Edge Computing: developing an affordable physical prototype running our PyTorch - model on a Raspberry Pi - - - End-to-end detection workflow: building a responsible end-to-end system for - large scale detection and alert management (API, front-end monitoring platform) - - - Deployment: working with French firefighter departments to gather field knowledge - and conduct a test phase over the upcoming European summer."' 
- link: https://github.com/pyronear - poster_link: https://s3.amazonaws.com/assets.pytorch.org/pted2021/posters/B8.png - section: B8 - thumbnail_link: https://s3.amazonaws.com/assets.pytorch.org/pted2021/posters/thumb-B8.png - title: "PyroNear: Embedded Deep Learning for Early Wildfire Detection" -- authors: - - Nikhila Ravi - - Jeremy Reizenstein - - David Novotny - - Justin Johnson - - Georgia Gkioxari - - Roman Shapovalov - - Patrick Labatut - - Wan-Yen Lo - categories: - - Vision - description: - "PyTorch3D is a modular and optimized library for 3D Deep Learning - with PyTorch. It includes support for: data structures for heterogeneous batching - of 3D data (Meshes, Point clouds and Volumes), optimized 3D operators and loss - functions (with custom CUDA kernels), a modular differentiable rendering API for - Meshes, Point clouds and Implicit functions, as well as several other tools for - 3D Deep Learning." - link: https://arxiv.org/abs/2007.08501 - poster_link: https://s3.amazonaws.com/assets.pytorch.org/pted2021/posters/C6.png - section: C6 - thumbnail_link: https://s3.amazonaws.com/assets.pytorch.org/pted2021/posters/thumb-C6.png - title: "PyTorch3D: Fast, Flexible, 3D Deep Learning" -- authors: - - E. Riba - - J. Shi - - D. Mishkin - - L. Ferraz - - A. Nicolao - categories: - - Vision - description: - This work presents Kornia, an open source computer vision library built - upon a set of differentiable routines and modules that aims to solve generic computer - vision problems. The package uses PyTorch as its main backend, not only for efficiency - but also to take advantage of the reverse auto-differentiation engine to define - and compute the gradient of complex functions. Inspired by OpenCV, Kornia is composed - of a set of modules containing operators that can be integrated into neural networks - to train models to perform a wide range of operations including image transformations, camera - calibration, epipolar geometry, and low-level image processing techniques, such - as filtering and edge detection that operate directly on high-dimensional tensor - representations on graphics processing units, generating faster systems. Examples - of classical vision problems implemented using our framework are provided, including - a benchmark comparing to existing vision libraries. - link: http://www.kornia.org - poster_link: https://s3.amazonaws.com/assets.pytorch.org/pted2021/posters/C7.png - section: C7 - thumbnail_link: https://s3.amazonaws.com/assets.pytorch.org/pted2021/posters/thumb-C7.png - title: "Kornia: an Open Source Differentiable Computer Vision Library for PyTorch" -- authors: - - Thomas George - categories: - - Vision - description: - Fisher Information Matrices (FIM) and Neural Tangent Kernels (NTK) - are useful tools in a number of diverse applications related to neural networks. - Yet these theoretical tools are often difficult to implement using current libraries - for practical size networks, given that they require per-example gradients, and - a large amount of memory since they scale as the number of parameters (for the - FIM) or the number of examples x cardinality of the output space (for the NTK). - NNGeometry is a PyTorch library that offers a high-level API for computing various - linear algebra operations such as matrix-vector products, trace, Frobenius norm, - and so on, where the matrix is either the FIM or the NTK, leveraging recent advances - in approximating these matrices. 
- link: https://github.com/tfjgeorge/nngeometry/ - poster_link: https://s3.amazonaws.com/assets.pytorch.org/pted2021/posters/C8.png - section: C8 - thumbnail_link: https://s3.amazonaws.com/assets.pytorch.org/pted2021/posters/thumb-C8.png - title: - "NNGeometry: Easy and Fast Fisher Information Matrices and Neural Tangent - Kernels in PyTorch" -- authors: - - "B\xE9gaint J." - - "Racap\xE9 F." - - Feltman S. - - Pushparaja A. - categories: - - Vision - description: - CompressAI is a PyTorch library that provides custom operations, layers, - modules and tools to research, develop and evaluate end-to-end image and video - compression codecs. In particular, CompressAI includes pre-trained models and - evaluation tools to compare learned methods with traditional codecs. State-of-the-art - end-to-end compression models have been reimplemented in PyTorch and trained from - scratch, reproducing published results and allowing further research in the domain. - link: "" - poster_link: https://s3.amazonaws.com/assets.pytorch.org/pted2021/posters/D6.png - section: D6 - thumbnail_link: https://s3.amazonaws.com/assets.pytorch.org/pted2021/posters/thumb-D6.png - title: "CompressAI: a research library and evaluation platform for end-to-end compression" -- authors: - - Philip Meier - - Volker Lohweg - categories: - - Vision - description: - "The seminal work of Gatys, Ecker, and Bethge gave birth to the field\ - \ of _Neural Style Transfer_ (NST) in 2016. An NST describes the merger between\ - \ the content and artistic style of two arbitrary images. This idea is nothing\ - \ new in the field of Non-photorealistic rendering (NPR). What distinguishes NST\ - \ from traditional NPR approaches is its generality: an NST only needs a single\ - \ arbitrary content and style image as input and thus \"makes -- for the first\ - \ time -- a generalized style transfer practicable\". Besides peripheral tasks,\ - \ an NST at its core is the definition of an optimization criterion called _perceptual\ - \ loss_, which estimates the perceptual quality of the stylized image. Usually\ - \ the perceptual loss comprises a deep neural network that needs to supply encodings\ - \ of images from various depths. \n\n`pystiche` is a library for NST written in\ - \ Python and built upon PyTorch. It provides modular and efficient implementations\ - \ for commonly used perceptual losses as well as neural net architectures. This\ - \ enables users to mix current state-of-the-art techniques with new ideas with\ - \ ease. This poster will showcase the core concepts of `pystiche` that will enable\ - \ other researchers as well as laypersons to get an NST running in minutes." - link: https://github.com/pmeier/pystiche - poster_link: https://s3.amazonaws.com/assets.pytorch.org/pted2021/posters/D7.png - section: D7 - thumbnail_link: https://s3.amazonaws.com/assets.pytorch.org/pted2021/posters/thumb-D7.png - title: "pystiche: A Framework for Neural Style Transfer" -- authors: - - Siddhish Thakur - categories: - - Vision - description: - " Deep Learning (DL) has greatly highlighted the potential impact of - optimized machine learning in both the scientific - - and clinical communities. The advent of open-source DL libraries from major industrial - entities, such as TensorFlow - - (Google), PyTorch (Facebook), further contributes to the promise of DL for the democratization - of computational analytics. 
However, increased technical and specialized background - is required to develop DL algorithms, and the variability of implementation details - hinders their reproducibility. Towards lowering the barrier and making the mechanism - of DL development, training, and inference more stable, reproducible, and scalable, - without requiring an extensive technical background, this manuscript proposes - the Generally Nuanced Deep Learning Framework (GaNDLF). With built-in support - for k-fold cross-validation, data augmentation, multiple modalities and output - classes, and multi-GPU training, as well as the ability to work with both radiographic - and histologic imaging, GaNDLF aims to provide an end-to-end solution for all - DL-related tasks, to tackle problems in medical imaging and provide a robust application - framework for deployment in clinical workflows. - - - Keywords: Deep Learning, Framework, Segmentation, Regression, Classification, - Cross-validation, Data - - augmentation, Deployment, Clinical, Workflows" - link: "" - poster_link: https://s3.amazonaws.com/assets.pytorch.org/pted2021/posters/D8.png - section: D8 - thumbnail_link: https://s3.amazonaws.com/assets.pytorch.org/pted2021/posters/thumb-D8.png - title: - " GaNDLF \u2013 A Generally Nuanced Deep Learning Framework for Clinical\ - \ Imaging Workflows" diff --git a/_devel/formatter.py b/_devel/formatter.py deleted file mode 100644 index 9624af4a9e67..000000000000 --- a/_devel/formatter.py +++ /dev/null @@ -1,64 +0,0 @@ -""" -Usage: cat pytorch_vision_vgg.md | python formatter.py | notedown >pytorch_vision_vgg.ipynb -""" -import sys -import yaml - -header = [] -markdown = [] -header_read = False -with open('/dev/stdin', 'r') as input, open('/dev/stdout', 'w') as output: - for line in input: - if line.startswith('---'): - header_read = not header_read - continue - if header_read == True: - header += [line] - else: - markdown += [line] - - header = yaml.load(''.join(header), Loader=yaml.BaseLoader) - if header is None: - # This assumes the markdown document has a yaml header - # but some documents, like the README.md do not - # Don't bother rendering them - exit() - - images = [] - try: - if header['featured_image_1'] != 'no-image': - images.append(header['featured_image_1']) - if header['featured_image_2'] != 'no-image': - images.append(header['featured_image_2']) - except: - pass - - pre = [] - - if 'accelerator' in header.keys(): - acc = header['accelerator'] - if acc == 'cuda': - note = ['### This notebook requires a GPU runtime to run.\n', - '### Please select the menu option "Runtime" -> "Change runtime type", select "Hardware Accelerator" -> "GPU" and click "SAVE"\n\n', - '----------------------------------------------------------------------\n\n'] - pre += note - elif acc == 'cuda-optional': - note = ['### This notebook is optionally accelerated with a GPU runtime.\n', - '### If you would like to use this acceleration, please select the menu option "Runtime" -> "Change runtime type", select "Hardware Accelerator" -> "GPU" and click "SAVE"\n\n', - '----------------------------------------------------------------------\n\n'] - pre += note - - pre += ['# ' + header['title'] + '\n\n'] - pre += ['*Author: ' + header['author'] + '*' + '\n\n'] - pre += ['**' + header['summary'] + '**' + '\n\n'] - - if len(images) == 2: - pre += ['_ | _\n'] - pre += ['- | -\n'] - pre += ['![alt](https://pytorch.org/assets/images/{}) | ' - '![alt](https://pytorch.org/assets/images/{})\n\n'.format(*images)] - elif len(images) == 1: - pre += 
['![alt](https://pytorch.org/assets/images/{})\n\n'.format(*images)] - - markdown = pre + markdown - output.write(''.join(markdown)) diff --git a/_devel/update_hub_submodule.sh b/_devel/update_hub_submodule.sh deleted file mode 100755 index 8e35a6b6f02d..000000000000 --- a/_devel/update_hub_submodule.sh +++ /dev/null @@ -1,16 +0,0 @@ -set -ex -pushd _hub -git pull https://github.com/pytorch/hub -popd -cp _hub/images/* assets/images/ - -python3 -c 'import notedown' || pip3 install notedown -python3 -c 'import yaml' || pip3 install pyyaml -mkdir -p assets/hub/ - -pushd _hub -find . -maxdepth 1 -name "*.md" | grep -v "README" | cut -f2- -d"/" | - while read file; do - cat "$file" | python3 ../_devel/formatter.py | notedown >"../assets/hub/${file%.md}.ipynb"; - done -popd diff --git a/_ecosystem/Captum b/_ecosystem/Captum deleted file mode 100644 index 2da478fe4963..000000000000 --- a/_ecosystem/Captum +++ /dev/null @@ -1,12 +0,0 @@ ---- -layout: ecosystem_detail -title: Captum -summary: Captum (“comprehension” in Latin) is an open source, extensible library for model interpretability built on PyTorch. -link: https://captum.ai/ -order: 4 -summary-home: Captum (“comprehension” in Latin) is an open source, extensible library for model interpretability built on PyTorch. -featured-home: true -redirect_to: https://captum.ai/ -github-id: pytorch/captum -date-added: 10/18/19 ---- diff --git a/_ecosystem/Flair b/_ecosystem/Flair deleted file mode 100644 index 4d76513c939a..000000000000 --- a/_ecosystem/Flair +++ /dev/null @@ -1,10 +0,0 @@ ---- -layout: ecosystem_detail -title: Flair -summary: Flair is a very simple framework for state-of-the-art natural language processing (NLP). -link: https://github.com/flairNLP/flair -order: 6 -redirect_to: https://github.com/flairNLP/flair -github-id: flairNLP/flair -date-added: 12/30/19 ---- diff --git a/_ecosystem/Forte b/_ecosystem/Forte deleted file mode 100644 index 040bd5bb4a65..000000000000 --- a/_ecosystem/Forte +++ /dev/null @@ -1,10 +0,0 @@ ---- -layout: ecosystem_detail -title: forte -summary: Forte is a toolkit for building NLP pipelines featuring composable components, convenient data interfaces, and cross-task interaction. -link: https://github.com/asyml/forte -summary-home: Forte is a toolkit for building NLP pipelines featuring composable components, convenient data interfaces, and cross-task interaction. -featured-home: false -github-id: asyml/forte -date-added: 07/19/21 ---- diff --git a/_ecosystem/Ignite b/_ecosystem/Ignite deleted file mode 100644 index faf5f4792ef8..000000000000 --- a/_ecosystem/Ignite +++ /dev/null @@ -1,10 +0,0 @@ ---- -layout: ecosystem_detail -title: Ignite -summary: Ignite is a high-level library for training neural networks in PyTorch. It helps with writing compact, but full-featured training loops. -link: https://github.com/pytorch/ignite -order: 10 -redirect_to: https://github.com/pytorch/ignite -github-id: pytorch/ignite -date-added: 7/14/19 ---- diff --git a/_ecosystem/OpenMMLab b/_ecosystem/OpenMMLab deleted file mode 100644 index 8f3a1f047e65..000000000000 --- a/_ecosystem/OpenMMLab +++ /dev/null @@ -1,10 +0,0 @@ ---- -layout: ecosystem_detail -title: OpenMMLab -summary: OpenMMLab covers a wide range of computer vision research topics including classification, detection, segmentation, and super-resolution. -link: https://github.com/open-mmlab -summary-home: OpenMMLab covers a wide range of computer vision research topics including classification, detection, segmentation, and super-resolution. 
-featured-home: false -github-id: open-mmlab -date-added: 06/27/21 ---- diff --git a/_ecosystem/accelerate b/_ecosystem/accelerate deleted file mode 100644 index ab0316743384..000000000000 --- a/_ecosystem/accelerate +++ /dev/null @@ -1,10 +0,0 @@ ---- -layout: ecosystem_detail -title: accelerate -summary: 🚀 A simple way to train and use PyTorch models with multi-GPU, TPU, mixed-precision -link: https://huggingface.co/docs/accelerate -summary-home: 🚀 A simple way to train and use PyTorch models with multi-GPU, TPU, mixed-precision -featured-home: false -github-id: huggingface/accelerate -date-added: 09/13/21 ---- diff --git a/_ecosystem/adaptdl b/_ecosystem/adaptdl deleted file mode 100644 index e48192595d6f..000000000000 --- a/_ecosystem/adaptdl +++ /dev/null @@ -1,10 +0,0 @@ ---- -layout: ecosystem_detail -title: AdaptDL -summary: AdaptDL is a resource-adaptive deep learning training and scheduling framework. -link: https://github.com/petuum/adaptdl -summary-home: AdaptDL is a resource-adaptive deep learning training and scheduling framework. -featured-home: false -github-id: petuum/adaptdl -date-added: 2/5/21 ---- \ No newline at end of file diff --git a/_ecosystem/advertorch.md b/_ecosystem/advertorch.md deleted file mode 100644 index fc35a8dacb2a..000000000000 --- a/_ecosystem/advertorch.md +++ /dev/null @@ -1,13 +0,0 @@ ---- -layout: ecosystem_detail -title: AdverTorch -summary: A toolbox for adversarial robustness research. It contains modules for generating adversarial examples and defending against attacks. -link: https://github.com/BorealisAI/advertorch -order: 1 -summary-home: A toolbox for adversarial robustness research. It contains modules for generating adversarial examples and defending against attacks. -featured-home: false -redirect_to: https://github.com/BorealisAI/advertorch -github-id: BorealisAI/advertorch -date-added: 6/14/19 ---- - diff --git a/_ecosystem/albumentations b/_ecosystem/albumentations deleted file mode 100644 index a91565536538..000000000000 --- a/_ecosystem/albumentations +++ /dev/null @@ -1,10 +0,0 @@ ---- -layout: ecosystem_detail -title: Albumentations -summary: Fast and extensible image augmentation library for different CV tasks like classification, segmentation, object detection and pose estimation. -link: https://github.com/albu/albumentations -summary-home: Fast and extensible image augmentation library for different CV tasks like classification, segmentation, object detection and pose estimation. -featured-home: false -github-id: albumentations-team/albumentations -date-added: 10/28/19 ---- diff --git a/_ecosystem/allennlp.md b/_ecosystem/allennlp.md deleted file mode 100644 index 37f1fab01642..000000000000 --- a/_ecosystem/allennlp.md +++ /dev/null @@ -1,13 +0,0 @@ ---- -layout: ecosystem_detail -title: AllenNLP -summary: AllenNLP is an open-source research library built on PyTorch for designing and evaluating deep learning models for NLP. -link: https://allennlp.org/ -order: 2 -summary-home: AllenNLP is an open-source research library built on PyTorch for designing and evaluating deep learning models for NLP. 
-featured-home: false -redirect_to: https://allennlp.org/ -github-id: allenai/allennlp -date-added: 6/14/19 ---- - diff --git a/_ecosystem/avalanche b/_ecosystem/avalanche deleted file mode 100644 index 76e0fe500add..000000000000 --- a/_ecosystem/avalanche +++ /dev/null @@ -1,10 +0,0 @@ ---- -layout: ecosystem_detail -title: avalanche -summary: "Avalanche: an End-to-End Library for Continual Learning" -link: http://avalanche.continualai.org -summary-home: "Avalanche: an End-to-End Library for Continual Learning" -featured-home: false -github-id: ContinualAI/avalanche -date-added: 02/23/22 ---- diff --git a/_ecosystem/baal b/_ecosystem/baal deleted file mode 100644 index c10b4b841d4c..000000000000 --- a/_ecosystem/baal +++ /dev/null @@ -1,10 +0,0 @@ ---- -layout: ecosystem_detail -title: baal -summary: baal (bayesian active learning) aims to implement active learning using metrics of uncertainty derived from approximations of bayesian posteriors in neural networks. -link: https://baal.readthedocs.io/en/latest/ -summary-home: baal (bayesian active learning) aims to implement active learning using metrics of uncertainty derived from approximations of bayesian posteriors in neural networks. -featured-home: false -github-id: ElementAI/baal -date-added: 3/19/20 ---- diff --git a/_ecosystem/botorch b/_ecosystem/botorch deleted file mode 100644 index 2f83838be2d8..000000000000 --- a/_ecosystem/botorch +++ /dev/null @@ -1,10 +0,0 @@ ---- -layout: ecosystem_detail -title: BoTorch -summary: BoTorch is a library for Bayesian Optimization. It provides a modular, extensible interface for composing Bayesian optimization primitives. -link: https://botorch.org/ -order: 3 -redirect_to: https://botorch.org/ -github-id: pytorch/botorch -date-added: 6/14/19 ---- diff --git a/_ecosystem/catalyst b/_ecosystem/catalyst deleted file mode 100644 index 10ad675939f4..000000000000 --- a/_ecosystem/catalyst +++ /dev/null @@ -1,10 +0,0 @@ ---- -layout: ecosystem_detail -title: Catalyst -summary: Catalyst helps you write compact, but full-featured deep learning and reinforcement learning pipelines with a few lines of code. -link: https://github.com/catalyst-team/catalyst -summary-home: Catalyst helps you write compact, but full-featured deep learning and reinforcement learning pipelines with a few lines of code. 
-featured-home: false -github-id: catalyst-team/catalyst -date-added: 10/28/19 ---- diff --git a/_ecosystem/clinicadl b/_ecosystem/clinicadl deleted file mode 100644 index 8b0707e58d49..000000000000 --- a/_ecosystem/clinicadl +++ /dev/null @@ -1,10 +0,0 @@ ---- -layout: ecosystem_detail -title: ClinicaDL -summary: Framework for reproducible classification of Alzheimer's Disease -link: https://clinicadl.readthedocs.io/ -summary-home: Framework for reproducible classification of Alzheimer's Disease -featured-home: false -github-id: aramis-lab/AD-DL -date-added: 05/07/21 ---- diff --git a/_ecosystem/colossal b/_ecosystem/colossal deleted file mode 100644 index a3425c26d1fb..000000000000 --- a/_ecosystem/colossal +++ /dev/null @@ -1,10 +0,0 @@ ---- -layout: ecosystem_detail -title: ColossalAI -summary: Colossal-AI is a Unified Deep Learning System for Big Model Era -link: https://www.colossalai.org/ -summary-home: Colossal-AI is a Unified Deep Learning System for Big Model Era -featured-home: false -github-id: hpcaitech/ColossalAI -date-added: 01/04/23 ---- diff --git a/_ecosystem/colossal-llama-2 b/_ecosystem/colossal-llama-2 deleted file mode 100644 index ab2751f4292b..000000000000 --- a/_ecosystem/colossal-llama-2 +++ /dev/null @@ -1,10 +0,0 @@ ---- -layout: ecosystem_detail -title: Colossal-LLaMA-2 -summary: A complete and open-sourced solution for injecting domain-specific knowledge into pre-trained LLM. -link: https://github.com/hpcaitech/ColossalAI/tree/main/applications/Colossal-LLaMA-2 -summary-home: A complete and open-sourced solution for injecting domain-specific knowledge into pre-trained LLM. -featured-home: false -github-id: hpcaitech/ColossalAI -date-added: 1/24/24 ---- diff --git a/_ecosystem/composer b/_ecosystem/composer deleted file mode 100644 index d61e18974197..000000000000 --- a/_ecosystem/composer +++ /dev/null @@ -1,10 +0,0 @@ ---- -layout: ecosystem_detail -title: composer -summary: library of algorithms to speed up neural network training -link: https://github.com/mosaicml/composer -summary-home: library of algorithms to speed up neural network training -featured-home: false -github-id: mosaicml/composer -date-added: 03/28/22 ---- diff --git a/_ecosystem/crypten b/_ecosystem/crypten deleted file mode 100644 index c3a5420cfcaa..000000000000 --- a/_ecosystem/crypten +++ /dev/null @@ -1,12 +0,0 @@ ---- -layout: ecosystem_detail -title: CrypTen -summary: CrypTen is a framework for Privacy Preserving ML. Its goal is to make secure computing techniques accessible to ML practitioners. -link: https://github.com/facebookresearch/CrypTen -order: 5 -summary-home: CrypTen is a framework for Privacy Preserving ML. Its goal is to make secure computing techniques accessible to ML practitioners. -featured-home: false -redirect_to: https://github.com/facebookresearch/CrypTen -github-id: facebookresearch/CrypTen -date-added: 10/18/19 ---- diff --git a/_ecosystem/deepspeed b/_ecosystem/deepspeed deleted file mode 100644 index a2f81619f877..000000000000 --- a/_ecosystem/deepspeed +++ /dev/null @@ -1,10 +0,0 @@ ---- -layout: ecosystem_detail -title: DeepSpeed -summary: DeepSpeed is a deep learning optimization library that makes distributed training easy, efficient, and effective. -link: https://www.deepspeed.ai/ -summary-home: DeepSpeed is a deep learning optimization library that makes distributed training easy, efficient, and effective. 
-featured-home: false -github-id: microsoft/DeepSpeed -date-added: 11/13/20 ---- diff --git a/_ecosystem/depyf b/_ecosystem/depyf deleted file mode 100644 index f8a9a7d2543c..000000000000 --- a/_ecosystem/depyf +++ /dev/null @@ -1,10 +0,0 @@ ---- -layout: ecosystem_detail -title: depyf -summary: depyf is a tool to help users understand and adapt to PyTorch compiler torch.compile. -link: https://github.com/thuml/depyf -summary-home: depyf is a tool to help users understand and adapt to PyTorch compiler torch.compile. -featured-home: false -github-id: thuml/depyf -date-added: 1/24/24 ---- diff --git a/_ecosystem/detectron2 b/_ecosystem/detectron2 deleted file mode 100644 index 2cf4899ae13e..000000000000 --- a/_ecosystem/detectron2 +++ /dev/null @@ -1,10 +0,0 @@ ---- -layout: ecosystem_detail -title: Detectron2 -summary: Detectron2 is FAIR's next-generation platform for object detection and segmentation. -link: https://github.com/facebookresearch/detectron2 -summary-home: Detectron2 is FAIR's next-generation platform for object detection and segmentation. -featured-home: false -github-id: facebookresearch/detectron2 -date-added: 3/27/20 ---- diff --git a/_ecosystem/determined b/_ecosystem/determined deleted file mode 100644 index 67e7e8b467c1..000000000000 --- a/_ecosystem/determined +++ /dev/null @@ -1,10 +0,0 @@ ---- -layout: ecosystem_detail -title: Determined -summary: Determined is a platform that helps deep learning teams train models more quickly, easily share GPU resources, and effectively collaborate. -link: https://github.com/determined-ai/determined -summary-home: Determined is a platform that helps deep learning teams train models more quickly, easily share GPU resources, and effectively collaborate. -featured-home: false -github-id: determined-ai/determined -date-added: 9/8/20 ---- diff --git a/_ecosystem/dgl b/_ecosystem/dgl deleted file mode 100644 index 902ba360312b..000000000000 --- a/_ecosystem/dgl +++ /dev/null @@ -1,10 +0,0 @@ ---- -layout: ecosystem_detail -title: DGL -summary: Deep Graph Library (DGL) is a Python package built for easy implementation of graph neural network model family, on top of PyTorch and other frameworks. -link: https://www.dgl.ai -summary-home: Deep Graph Library (DGL) is a Python package built for easy implementation of graph neural network model family, on top of PyTorch and other frameworks. -featured-home: false -github-id: dmlc/dgl -date-added: 3/3/20 ---- diff --git a/_ecosystem/diffusers b/_ecosystem/diffusers deleted file mode 100644 index 882922f60ed9..000000000000 --- a/_ecosystem/diffusers +++ /dev/null @@ -1,10 +0,0 @@ ---- -layout: ecosystem_detail -title: Diffusers -summary: Diffusers provides pretrained diffusion models across multiple modalities, such as vision and audio, and serves as a modular toolbox for inference and training of diffusion models. -link: https://huggingface.co/docs/diffusers -summary-home: Diffusers provides pretrained diffusion models across multiple modalities, such as vision and audio, and serves as a modular toolbox for inference and training of diffusion models. -featured-home: false -github-id: huggingface/diffusers -date-added: 6/1/23 ---- diff --git a/_ecosystem/doctr b/_ecosystem/doctr deleted file mode 100644 index 91ed5d688557..000000000000 --- a/_ecosystem/doctr +++ /dev/null @@ -1,10 +0,0 @@ ---- -layout: ecosystem_detail -title: docTR -summary: docTR (Document Text Recognition) - a seamless, high-performing & accessible library for OCR-related tasks powered by Deep Learning. 
-link: https://github.com/mindee/doctr -summary-home: docTR (Document Text Recognition) - a seamless, high-performing & accessible library for OCR-related tasks powered by Deep Learning. -featured-home: false -github-id: mindee/doctr -date-added: 12/3/24 ---- diff --git a/_ecosystem/einops b/_ecosystem/einops deleted file mode 100644 index 62d776040b6e..000000000000 --- a/_ecosystem/einops +++ /dev/null @@ -1,10 +0,0 @@ ---- -layout: ecosystem_detail -title: einops -summary: Flexible and powerful tensor operations for readable and reliable code. -link: https://github.com/arogozhnikov/einops -summary-home: Flexible and powerful tensor operations for readable and reliable code. -featured-home: false -github-id: arogozhnikov/einops -date-added: 1/20/21 ---- diff --git a/_ecosystem/ensemble-pytorch b/_ecosystem/ensemble-pytorch deleted file mode 100644 index 1ff8367ba575..000000000000 --- a/_ecosystem/ensemble-pytorch +++ /dev/null @@ -1,10 +0,0 @@ ---- -layout: ecosystem_detail -title: Ensemble-Pytorch -summary: A unified ensemble framework for PyTorch to improve the performance and robustness of your deep learning model. -link: https://ensemble-pytorch.readthedocs.io -summary-home: A unified ensemble framework for PyTorch to improve the performance and robustness of your deep learning model. -featured-home: false -github-id: TorchEnsemble-Community/Ensemble-Pytorch -date-added: 06/02/21 ---- diff --git a/_ecosystem/fairscale b/_ecosystem/fairscale deleted file mode 100644 index 87d8e468721b..000000000000 --- a/_ecosystem/fairscale +++ /dev/null @@ -1,10 +0,0 @@ ---- -layout: ecosystem_detail -title: FairScale -summary: FairScale is a PyTorch extension library for high performance and large scale training on one or multiple machines/nodes. -link: https://github.com/facebookresearch/fairscale -summary-home: FairScale is a PyTorch extension library for high performance and large scale training on one or multiple machines/nodes. -featured-home: false -github-id: facebookresearch/fairscale -date-added: 1/22/21 ---- diff --git a/_ecosystem/fastai b/_ecosystem/fastai deleted file mode 100644 index 0ecaaf9017ba..000000000000 --- a/_ecosystem/fastai +++ /dev/null @@ -1,13 +0,0 @@ ---- -layout: ecosystem_detail -title: fastai -summary: fastai is a library that simplifies training fast and accurate neural nets using modern best practices. -link: https://docs.fast.ai -order: 5 -summary-home: fastai is a library that simplifies training fast and accurate neural nets using modern best practices. 
-featured-home: false -redirect_to: https://docs.fast.ai -github-id: fastai/fastai -date-added: 7/14/19 ---- - diff --git a/_ecosystem/flower b/_ecosystem/flower deleted file mode 100644 index 8ef1bdbcad1e..000000000000 --- a/_ecosystem/flower +++ /dev/null @@ -1,10 +0,0 @@ ---- -layout: ecosystem_detail -title: Flower -summary: Flower - A Friendly Federated Learning Framework -link: https://flower.dev -summary-home: Flower - A Friendly Federated Learning Framework -featured-home: false -github-id: adap/flower -date-added: 01/05/22 ---- diff --git a/_ecosystem/fusemedml b/_ecosystem/fusemedml deleted file mode 100644 index ab588de504b0..000000000000 --- a/_ecosystem/fusemedml +++ /dev/null @@ -1,10 +0,0 @@ ---- -layout: ecosystem_detail -title: FuseMedML -summary: FuseMedML is a python framework accelerating ML based discovery in the medical field by encouraging code reuse -link: https://github.com/BiomedSciAI/fuse-med-ml -summary-home: FuseMedML is a python framework accelerating ML based discovery in the medical field by encouraging code reuse -featured-home: false -github-id: BiomedSciAI/fuse-med-ml -date-added: 02/16/23 ---- diff --git a/_ecosystem/gandlf b/_ecosystem/gandlf deleted file mode 100644 index 39acd653939a..000000000000 --- a/_ecosystem/gandlf +++ /dev/null @@ -1,10 +0,0 @@ ---- -layout: ecosystem_detail -title: GaNDLF -summary: A generalizable application framework for segmentation, regression, and classification using PyTorch -link: https://mlcommons.github.io/GaNDLF/ -summary-home: A generalizable application framework for segmentation, regression, and classification using PyTorch -featured-home: false -github-id: mlcommons/GaNDLF -date-added: 05/07/21 ---- diff --git a/_ecosystem/glow.md b/_ecosystem/glow.md deleted file mode 100644 index 56503644e0ae..000000000000 --- a/_ecosystem/glow.md +++ /dev/null @@ -1,13 +0,0 @@ ---- -layout: ecosystem_detail -title: Glow -summary: Glow is a ML compiler that accelerates the performance of deep learning frameworks on different hardware platforms. -link: https://github.com/pytorch/glow -order: 7 -summary-home: Glow is a ML compiler that accelerates the performance of deep learning frameworks on different hardware platforms. -featured-home: false -logo-class: tool -redirect_to: https://github.com/pytorch/glow -github-id: pytorch/glow -date-added: 3/27/20 ---- diff --git a/_ecosystem/gpytorch.md b/_ecosystem/gpytorch.md deleted file mode 100644 index 1082637bfe51..000000000000 --- a/_ecosystem/gpytorch.md +++ /dev/null @@ -1,10 +0,0 @@ ---- -layout: ecosystem_detail -title: GPyTorch -summary: GPyTorch is a Gaussian process library implemented using PyTorch, designed for creating scalable, flexible Gaussian process models. -link: https://cornellius-gp.github.io/ -order: 8 -redirect_to: https://cornellius-gp.github.io/ -github-id: cornellius-gp/gpytorch -date-added: 7/14/19 ---- diff --git a/_ecosystem/higher b/_ecosystem/higher deleted file mode 100644 index c9129977d96a..000000000000 --- a/_ecosystem/higher +++ /dev/null @@ -1,10 +0,0 @@ ---- -layout: ecosystem_detail -title: higher -summary: higher is a library which facilitates the implementation of arbitrarily complex gradient-based meta-learning algorithms and nested optimisation loops with near-vanilla PyTorch. -link: https://github.com/facebookresearch/higher -summary-home: higher is a library which facilitates the implementation of arbitrarily complex gradient-based meta-learning algorithms and nested optimisation loops with near-vanilla PyTorch. 
-featured-home: false -github-id: facebookresearch/higher -date-added: 5/21/20 ---- diff --git a/_ecosystem/horovod b/_ecosystem/horovod deleted file mode 100644 index 763062c8c12c..000000000000 --- a/_ecosystem/horovod +++ /dev/null @@ -1,13 +0,0 @@ ---- -layout: ecosystem_detail -title: Horovod -summary: Horovod is a distributed training library for deep learning frameworks. Horovod aims to make distributed DL fast and easy to use. -link: http://horovod.ai -order: 9 -summary-home: Horovod is a distributed training library for deep learning frameworks. Horovod aims to make distributed DL fast and easy to use. -featured-home: false -redirect_to: http://horovod.ai -github-id: horovod/horovod -date-added: 7/14/19 ---- - diff --git a/_ecosystem/hummingbird b/_ecosystem/hummingbird deleted file mode 100644 index c68544d17378..000000000000 --- a/_ecosystem/hummingbird +++ /dev/null @@ -1,10 +0,0 @@ ---- -layout: ecosystem_detail -title: Hummingbird -summary: Hummingbird compiles trained ML models into tensor computation for faster inference. -link: https://github.com/microsoft/hummingbird -summary-home: Hummingbird compiles trained ML models into tensor computation for faster inference. -featured-home: false -github-id: microsoft/hummingbird -date-added: 6/17/20 ---- diff --git a/_ecosystem/hydra b/_ecosystem/hydra deleted file mode 100644 index e130e2d98c48..000000000000 --- a/_ecosystem/hydra +++ /dev/null @@ -1,10 +0,0 @@ ---- -layout: ecosystem_detail -title: Hydra -summary: A framework for elegantly configuring complex applications. -link: https://hydra.cc/ -summary-home: A framework for elegantly configuring complex applications. -featured-home: false -github-id: facebookresearch/hydra -date-added: 1/6/20 ---- diff --git a/_ecosystem/inc b/_ecosystem/inc deleted file mode 100644 index be46670b1ea7..000000000000 --- a/_ecosystem/inc +++ /dev/null @@ -1,10 +0,0 @@ ---- -layout: ecosystem_detail -title: neural-compressor -summary: Intel® Neural Compressor provides unified APIs for network compression technologies for faster inference -link: https://intel.github.io/neural-compressor/ -summary-home: Intel® Neural Compressor provides unified APIs for network compression technologies for faster inference -featured-home: false -github-id: intel/neural-compressor -date-added: 03/28/22 ---- diff --git a/_ecosystem/ipex b/_ecosystem/ipex deleted file mode 100644 index 9b04cd5faa37..000000000000 --- a/_ecosystem/ipex +++ /dev/null @@ -1,10 +0,0 @@ ---- -layout: ecosystem_detail -title: intel-extension-for-pytorch -summary: A Python package for improving PyTorch performance on Intel platforms -link: https://intel.github.io/intel-extension-for-pytorch/ -summary-home: A Python package for improving PyTorch performance on Intel platforms -featured-home: false -github-id: intel/intel-extension-for-pytorch -date-added: 02/16/22 ---- diff --git a/_ecosystem/ivy b/_ecosystem/ivy deleted file mode 100644 index bd51388a497c..000000000000 --- a/_ecosystem/ivy +++ /dev/null @@ -1,10 +0,0 @@ ---- -layout: ecosystem_detail -title: ivy -summary: The Unified Machine Learning Framework -link: https://lets-unify.ai -summary-home: The Unified Machine Learning Framework -featured-home: false -github-id: unifyai/ivy -date-added: 02/23/22 ---- diff --git a/_ecosystem/joeynmt b/_ecosystem/joeynmt deleted file mode 100644 index 1758bdda543b..000000000000 --- a/_ecosystem/joeynmt +++ /dev/null @@ -1,10 +0,0 @@ ---- -layout: ecosystem_detail -title: joeynmt -summary: Minimalist Neural Machine Translation toolkit for 
educational purposes -link: https://joeynmt.readthedocs.io/en/latest/ -summary-home: Minimalist Neural Machine Translation toolkit for educational purposes -featured-home: false -github-id: joeynmt/joeynmt -date-added: 05/07/21 ---- diff --git a/_ecosystem/kornia b/_ecosystem/kornia deleted file mode 100644 index 18bc91095281..000000000000 --- a/_ecosystem/kornia +++ /dev/null @@ -1,10 +0,0 @@ ---- -layout: ecosystem_detail -title: Kornia -summary: Kornia is a differentiable computer vision library that consists of a set of routines and differentiable modules to solve generic CV problems. -link: https://kornia.github.io/ -summary-home: Kornia is a differentiable computer vision library that consists of a set of routines and differentiable modules to solve generic CV problems. -featured-home: false -github-id: kornia/kornia -date-added: 10/29/19 ---- diff --git a/_ecosystem/l5kit b/_ecosystem/l5kit deleted file mode 100644 index f76ac00d8739..000000000000 --- a/_ecosystem/l5kit +++ /dev/null @@ -1,10 +0,0 @@ ---- -layout: ecosystem_detail -title: L5Kit -summary: ML Prediction, Planning and Simulation for Self-Driving built on PyTorch. -link: https://github.com/lyft/l5kit -summary-home: ML Prediction, Planning and Simulation for Self-Driving built on PyTorch. -featured-home: false -github-id: lyft/l5kit -date-added: 1/10/21 ---- diff --git a/_ecosystem/lightly b/_ecosystem/lightly deleted file mode 100644 index 6bb024568c19..000000000000 --- a/_ecosystem/lightly +++ /dev/null @@ -1,10 +0,0 @@ ---- -layout: ecosystem_detail -title: Lightly -summary: Lightly is a computer vision framework for self-supervised learning. -link: https://github.com/lightly-ai/lightly -summary-home: Lightly is a computer vision framework for self-supervised learning. -featured-home: false -github-id: lightly-ai/lightly -date-added: 08/23/21 ---- diff --git a/_ecosystem/ludwig b/_ecosystem/ludwig deleted file mode 100644 index db30bab6e0b5..000000000000 --- a/_ecosystem/ludwig +++ /dev/null @@ -1,10 +0,0 @@ ---- -layout: ecosystem_detail -title: ludwig -summary: Data-centric declarative deep learning framework -link: http://ludwig.ai -summary-home: Data-centric declarative deep learning framework -featured-home: false -github-id: ludwig-ai/ludwig -date-added: 05/20/22 ---- diff --git a/_ecosystem/mmf b/_ecosystem/mmf deleted file mode 100644 index f24d9a683f6b..000000000000 --- a/_ecosystem/mmf +++ /dev/null @@ -1,10 +0,0 @@ ---- -layout: ecosystem_detail -title: MMF -summary: A modular framework for vision & language multimodal research from Facebook AI Research (FAIR). -link: https://mmf.sh/ -summary-home: A modular framework for vision & language multimodal research from Facebook AI Research (FAIR). -featured-home: false -github-id: facebookresearch/mmf -date-added: 6/11/20 ---- diff --git a/_ecosystem/monai b/_ecosystem/monai deleted file mode 100644 index 0228b06bc3ad..000000000000 --- a/_ecosystem/monai +++ /dev/null @@ -1,10 +0,0 @@ ---- -layout: ecosystem_detail -title: MONAI -summary: MONAI provides domain-optimized foundational capabilities for developing healthcare imaging training workflows. -link: https://monai.io -summary-home: MONAI provides domain-optimized foundational capabilities for developing healthcare imaging training workflows. 
-featured-home: false -github-id: Project-MONAI/MONAI -date-added: 5/1/20 ---- diff --git a/_ecosystem/nemo b/_ecosystem/nemo deleted file mode 100644 index 3cfb69d31c09..000000000000 --- a/_ecosystem/nemo +++ /dev/null @@ -1,10 +0,0 @@ ---- -layout: ecosystem_detail -title: NeMo -summary: "NeMo: a toolkit for conversational AI." -link: https://github.com/NVIDIA/NeMo -summary-home: "NeMo: a toolkit for conversational AI" -featured-home: false -github-id: NVIDIA/NeMo -date-added: 6/16/20 ---- diff --git a/_ecosystem/octoml b/_ecosystem/octoml deleted file mode 100644 index 43ce44ca1589..000000000000 --- a/_ecosystem/octoml +++ /dev/null @@ -1,10 +0,0 @@ ---- -layout: ecosystem_detail -title: OctoML Profile -summary: octoml-profile is a python library and cloud service designed to provide a simple experience for assessing and optimizing the performance of PyTorch models. -link: https://github.com/octoml/octoml-profile -summary-home: octoml-profile is a python library and cloud service designed to provide a simple experience for assessing and optimizing the performance of PyTorch models. -featured-home: false -github-id: octoml/octoml-profile -date-added: 6/1/23 ---- diff --git a/_ecosystem/onnxrt b/_ecosystem/onnxrt deleted file mode 100644 index f2e6e688442e..000000000000 --- a/_ecosystem/onnxrt +++ /dev/null @@ -1,10 +0,0 @@ ---- -layout: ecosystem_detail -title: ONNX Runtime -summary: ONNX Runtime is a cross-platform inferencing and training accelerator. -link: https://github.com/microsoft/onnxruntime -summary-home: ONNX Runtime is a cross-platform inferencing and training accelerator. -featured-home: false -github-id: microsoft/onnxruntime -date-added: 2/1/21 ---- diff --git a/_ecosystem/opacus b/_ecosystem/opacus deleted file mode 100644 index 91671d612731..000000000000 --- a/_ecosystem/opacus +++ /dev/null @@ -1,10 +0,0 @@ ---- -layout: ecosystem_detail -title: Opacus -summary: Train PyTorch models with Differential Privacy -link: https://opacus.ai/ -summary-home: Train PyTorch models with Differential Privacy -featured-home: false -github-id: pytorch/opacus -date-added: 10/29/20 ---- diff --git a/_ecosystem/opencompass b/_ecosystem/opencompass deleted file mode 100644 index a55a4ef31f61..000000000000 --- a/_ecosystem/opencompass +++ /dev/null @@ -1,10 +0,0 @@ ---- -layout: ecosystem_detail -title: OpenCompass -summary: OpenCompass is an LLM evaluation platform, supporting a wide range of models (Llama3, Mistral, InternLM2,GPT-4,LLaMa2, Qwen,GLM, Claude, etc) over 100+ datasets. -link: https://github.com/open-compass/opencompass -summary-home: OpenCompass is an LLM evaluation platform, supporting a wide range of models (Llama3, Mistral, InternLM2,GPT-4,LLaMa2, Qwen,GLM, Claude, etc) over 100+ datasets. -featured-home: false -github-id: open-compass/opencompass -date-added: 12/18/24 ---- diff --git a/_ecosystem/optuna b/_ecosystem/optuna deleted file mode 100644 index beff1d69bcd5..000000000000 --- a/_ecosystem/optuna +++ /dev/null @@ -1,10 +0,0 @@ ---- -layout: ecosystem_detail -title: Optuna -summary: An open source hyperparameter optimization framework to automate hyperparameter search. -link: https://optuna.org/ -summary-home: An open source hyperparameter optimization framework to automate hyperparameter search. 
-featured-home: false -github-id: optuna/optuna -date-added: 4/6/20 ---- diff --git a/_ecosystem/padl b/_ecosystem/padl deleted file mode 100644 index ac3f5818a67b..000000000000 --- a/_ecosystem/padl +++ /dev/null @@ -1,10 +0,0 @@ ---- -layout: ecosystem_detail -title: padl -summary: Pipeline Abstractions for Deep Learning in PyTorch -link: https://lf1-io.github.io/padl/ -summary-home: Pipeline Abstractions for Deep Learning in PyTorch -featured-home: false -github-id: lf1-io/padl -date-added: 03/28/22 ---- diff --git a/_ecosystem/parlai b/_ecosystem/parlai deleted file mode 100644 index cac2924c3ad8..000000000000 --- a/_ecosystem/parlai +++ /dev/null @@ -1,12 +0,0 @@ ---- -layout: ecosystem_detail -title: ParlAI -summary: ParlAI is a unified platform for sharing, training, and evaluating dialog models across many tasks. -link: http://parl.ai/ -order: 11 -summary-home: ParlAI is a unified platform for sharing, training, and evaluating dialog models across many tasks. -featured-home: false -redirect_to: http://parl.ai/ -github-id: facebookresearch/ParlAI -date-added: 7/14/19 ---- diff --git a/_ecosystem/pennylane b/_ecosystem/pennylane deleted file mode 100644 index eb4ba2e24643..000000000000 --- a/_ecosystem/pennylane +++ /dev/null @@ -1,12 +0,0 @@ ---- -layout: ecosystem_detail -title: PennyLane -summary: PennyLane is a library for quantum ML, automatic differentiation, and optimization of hybrid quantum-classical computations. -link: https://pennylane.ai/ -order: 12 -summary-home: PennyLane is a library for quantum ML, automatic differentiation, and optimization of hybrid quantum-classical computations. -featured-home: false -redirect_to: https://pennylane.ai/ -github-id: PennyLaneAI/pennylane -date-added: 7/14/19 ---- diff --git a/_ecosystem/pfrl b/_ecosystem/pfrl deleted file mode 100644 index 8b33e14229b9..000000000000 --- a/_ecosystem/pfrl +++ /dev/null @@ -1,10 +0,0 @@ ---- -layout: ecosystem_detail -title: PFRL -summary: PFRL is a deep reinforcement learning library that implements various state-of-the-art deep reinforcement learning algorithms in Python using PyTorch. -link: https://github.com/pfnet/pfrl -summary-home: PFRL is a deep reinforcement learning library that implements various state-of-the-art deep reinforcement learning algorithms in Python using PyTorch. -featured-home: false -github-id: pfnet/pfrl -date-added: 8/6/20 ---- diff --git a/_ecosystem/polyaxon b/_ecosystem/polyaxon deleted file mode 100644 index 54757d85205d..000000000000 --- a/_ecosystem/polyaxon +++ /dev/null @@ -1,10 +0,0 @@ ---- -layout: ecosystem_detail -title: Polyaxon -summary: Polyaxon is a platform for building, training, and monitoring large-scale deep learning applications. -link: https://github.com/polyaxon/polyaxon -summary-home: Polyaxon is a platform for building, training, and monitoring large-scale deep learning applications. -featured-home: false -github-id: polyaxon/polyaxon -date-added: 9/17/20 ---- diff --git a/_ecosystem/pomegranate b/_ecosystem/pomegranate deleted file mode 100644 index 984b55851cfc..000000000000 --- a/_ecosystem/pomegranate +++ /dev/null @@ -1,10 +0,0 @@ ---- -layout: ecosystem_detail -title: pomegranate -summary: pomegranate is a library of probabilistic models that is built in a modular manner and treats all models as the probability distributions that they are. -link: https://pomegranate.readthedocs.io/en/latest/ -summary-home: pomegranate is a library of probabilistic models that is built in a modular manner and treats all models as the probability distributions that they are.
-featured-home: false -github-id: jmschrei/pomegranate -date-added: 6/1/23 ---- diff --git a/_ecosystem/poptorch b/_ecosystem/poptorch deleted file mode 100644 index 15295d77db91..000000000000 --- a/_ecosystem/poptorch +++ /dev/null @@ -1,10 +0,0 @@ ---- -layout: ecosystem_detail -title: PopTorch -summary: The PopTorch interface library is a simple wrapper for running PyTorch programs directly on Graphcore IPUs. -link: https://docs.graphcore.ai/projects/poptorch-user-guide/en/latest/ -summary-home: The PopTorch interface library is a simple wrapper for running PyTorch programs directly on Graphcore IPUs. -featured-home: false -github-id: graphcore/poptorch -date-added: 3/23/21 ---- diff --git a/_ecosystem/poutyne b/_ecosystem/poutyne deleted file mode 100644 index 9ac823ab7ccb..000000000000 --- a/_ecosystem/poutyne +++ /dev/null @@ -1,10 +0,0 @@ ---- -layout: ecosystem_detail -title: Poutyne -summary: Poutyne is a Keras-like framework for PyTorch and handles much of the boilerplate code needed to train neural networks. -link: https://poutyne.org/ -summary-home: Poutyne is a Keras-like framework for PyTorch and handles much of the boilerplate code needed to train neural networks. -featured-home: false -github-id: GRAAL-Research/poutyne -date-added: 2/13/20 ---- diff --git a/_ecosystem/pykale b/_ecosystem/pykale deleted file mode 100644 index dd8338c18d77..000000000000 --- a/_ecosystem/pykale +++ /dev/null @@ -1,10 +0,0 @@ ---- -layout: ecosystem_detail -title: PyKale -summary: PyKale is a PyTorch library for multimodal learning and transfer learning with deep learning and dimensionality reduction on graphs, images, texts, and videos. -link: https://github.com/pykale/pykale -summary-home: PyKale is a PyTorch library for multimodal learning and transfer learning with deep learning and dimensionality reduction on graphs, images, texts, and videos. -featured-home: false -github-id: pykale/pykale -date-added: 09/09/21 ---- diff --git a/_ecosystem/pypose b/_ecosystem/pypose deleted file mode 100644 index d4af561ece42..000000000000 --- a/_ecosystem/pypose +++ /dev/null @@ -1,10 +0,0 @@ ---- -layout: ecosystem_detail -title: PyPose -summary: PyPose is a robotics-oriented, PyTorch-based library that combines deep perceptual models with physics-based optimization techniques, so that users can focus on their novel applications. -link: https://pypose.org -summary-home: PyPose is a robotics-oriented, PyTorch-based library that combines deep perceptual models with physics-based optimization techniques, so that users can focus on their novel applications. -featured-home: false -github-id: pypose/pypose -date-added: 6/1/23 ---- diff --git a/_ecosystem/pypots b/_ecosystem/pypots deleted file mode 100644 index b728914b16c8..000000000000 --- a/_ecosystem/pypots +++ /dev/null @@ -1,10 +0,0 @@ ---- -layout: ecosystem_detail -title: PyPOTS -summary: A Python toolbox for data mining on Partially-Observed Time Series (POTS) that helps engineers focus more on the core problems rather than on the missing parts in their data. -link: https://github.com/WenjieDu/PyPOTS -summary-home: A Python toolbox for data mining on Partially-Observed Time Series (POTS) that helps engineers focus more on the core problems rather than on the missing parts in their data.
-featured-home: false -github-id: WenjieDu/PyPOTS -date-added: 6/28/23 ---- diff --git a/_ecosystem/pyro.md b/_ecosystem/pyro.md deleted file mode 100644 index 2b5bf4b99329..000000000000 --- a/_ecosystem/pyro.md +++ /dev/null @@ -1,10 +0,0 @@ ---- -layout: ecosystem_detail -title: Pyro -summary: Pyro is a universal probabilistic programming language (PPL) written in Python and supported by PyTorch on the backend. -link: http://pyro.ai/ -order: 13 -redirect_to: http://pyro.ai/ -github-id: pyro-ppl/pyro -date-added: 7/14/19 ---- diff --git a/_ecosystem/pystiche b/_ecosystem/pystiche deleted file mode 100644 index 1474285b15ac..000000000000 --- a/_ecosystem/pystiche +++ /dev/null @@ -1,10 +0,0 @@ ---- -layout: ecosystem_detail -title: pystiche -summary: pystiche is a framework for Neural Style Transfer (NST) built upon PyTorch. -link: https://github.com/pystiche/pystiche -summary-home: pystiche is a framework for Neural Style Transfer (NST) built upon PyTorch. -featured-home: false -github-id: pystiche/pystiche -date-added: 5/7/21 ---- diff --git a/_ecosystem/pysyft b/_ecosystem/pysyft deleted file mode 100644 index 08ca1b429ed0..000000000000 --- a/_ecosystem/pysyft +++ /dev/null @@ -1,10 +0,0 @@ ---- -layout: ecosystem_detail -title: PySyft -summary: PySyft is a Python library for encrypted, privacy preserving deep learning. -link: https://github.com/OpenMined/PySyft -order: 14 -redirect_to: https://github.com/OpenMined/PySyft -github-id: OpenMined/PySyft -date-added: 7/14/19 ---- diff --git a/_ecosystem/pytorch-geometric b/_ecosystem/pytorch-geometric deleted file mode 100644 index 6dba5ab31a44..000000000000 --- a/_ecosystem/pytorch-geometric +++ /dev/null @@ -1,12 +0,0 @@ ---- -layout: ecosystem_detail -title: PyTorch Geometric -summary: PyTorch Geometric is a library for deep learning on irregular input data such as graphs, point clouds, and manifolds. -link: https://github.com/pyg-team/pytorch_geometric/ -order: 15 -summary-home: PyTorch Geometric is a library for deep learning on irregular input data such as graphs, point clouds, and manifolds. -featured-home: true -redirect_to: https://github.com/pyg-team/pytorch_geometric/ -github-id: pyg-team/pytorch_geometric -date-added: 7/14/19 ---- diff --git a/_ecosystem/pytorch-lightning b/_ecosystem/pytorch-lightning deleted file mode 100644 index c15dae5edfa4..000000000000 --- a/_ecosystem/pytorch-lightning +++ /dev/null @@ -1,12 +0,0 @@ ---- -layout: ecosystem_detail -title: PyTorch Lightning -summary: PyTorch Lightning is a Keras-like ML library for PyTorch. It leaves core training and validation logic to you and automates the rest. -link: https://github.com/williamFalcon/pytorch-lightning -order: 16 -summary-home: PyTorch Lightning is a Keras-like ML library for PyTorch. It leaves core training and validation logic to you and automates the rest. -featured-home: false -redirect_to: https://github.com/williamFalcon/pytorch-lightning -github-id: PyTorchLightning/pytorch-lightning -date-added: 8/14/19 ---- diff --git a/_ecosystem/pytorch-metric-learning b/_ecosystem/pytorch-metric-learning deleted file mode 100644 index 22007f4276d5..000000000000 --- a/_ecosystem/pytorch-metric-learning +++ /dev/null @@ -1,10 +0,0 @@ ---- -layout: ecosystem_detail -title: PyTorch Metric Learning -summary: The easiest way to use deep metric learning in your application. Modular, flexible, and extensible. -link: https://github.com/KevinMusgrave/pytorch-metric-learning -summary-home: The easiest way to use deep metric learning in your application. 
Modular, flexible, and extensible. -featured-home: false -github-id: KevinMusgrave/pytorch-metric-learning -date-added: 1/20/21 ---- diff --git a/_ecosystem/pytorch-nlp b/_ecosystem/pytorch-nlp deleted file mode 100644 index c56435317f77..000000000000 --- a/_ecosystem/pytorch-nlp +++ /dev/null @@ -1,10 +0,0 @@ ---- -layout: ecosystem_detail -title: PyTorch-NLP -summary: Basic Utilities for PyTorch Natural Language Processing (NLP). -link: https://pytorchnlp.readthedocs.io -summary-home: Basic Utilities for PyTorch Natural Language Processing (NLP). -featured-home: false -github-id: PetrochukM/PyTorch-NLP -date-added: 4/6/20 ---- diff --git a/_ecosystem/pytorch3d b/_ecosystem/pytorch3d deleted file mode 100644 index fbba2cbcb86f..000000000000 --- a/_ecosystem/pytorch3d +++ /dev/null @@ -1,10 +0,0 @@ ---- -layout: ecosystem_detail -title: PyTorch3D -summary: PyTorch3D provides efficient, reusable components for 3D Computer Vision research with PyTorch. -link: https://pytorch3d.org/ -summary-home: PyTorch3D provides efficient, reusable components for 3D Computer Vision research with PyTorch. -featured-home: false -github-id: facebookresearch/pytorch3d -date-added: 3/27/20 ---- diff --git a/_ecosystem/pytorch_geometric_temporal b/_ecosystem/pytorch_geometric_temporal deleted file mode 100644 index 89c13df5b3d5..000000000000 --- a/_ecosystem/pytorch_geometric_temporal +++ /dev/null @@ -1,10 +0,0 @@ ---- -layout: ecosystem_detail -title: PyTorch Geometric Temporal -summary: PyTorch Geometric Temporal is a temporal (dynamic) extension library for PyTorch Geometric. -link: https://github.com/benedekrozemberczki/pytorch_geometric_temporal -summary-home: PyTorch Geometric Temporal is a temporal (dynamic) extension library for PyTorch Geometric. -featured-home: false -github-id: benedekrozemberczki/pytorch_geometric_temporal -date-added: 4/11/21 ---- diff --git a/_ecosystem/pytorchfi b/_ecosystem/pytorchfi deleted file mode 100644 index afa174e5ade0..000000000000 --- a/_ecosystem/pytorchfi +++ /dev/null @@ -1,10 +0,0 @@ ---- -layout: ecosystem_detail -title: pytorchfi -summary: A runtime fault injection tool for PyTorch. -link: https://github.com/pytorchfi/pytorchfi -summary-home: A runtime fault injection tool for PyTorch. -featured-home: false -github-id: pytorchfi/pytorchfi -date-added: 09/08/21 ---- diff --git a/_ecosystem/pytorchvideo b/_ecosystem/pytorchvideo deleted file mode 100644 index 15d778bdc257..000000000000 --- a/_ecosystem/pytorchvideo +++ /dev/null @@ -1,10 +0,0 @@ ---- -layout: ecosystem_detail -title: PyTorchVideo -summary: A deep learning library for video understanding research. Hosts various video-focused models, datasets, training pipelines and more. -link: https://pytorchvideo.org/ -summary-home: A deep learning library for video understanding research. Hosts various video-focused models, datasets, training pipelines and more. -featured-home: false -github-id: facebookresearch/pytorchvideo -date-added: 08/15/21 ---- diff --git a/_ecosystem/rastervision b/_ecosystem/rastervision deleted file mode 100644 index 88ef1358de2e..000000000000 --- a/_ecosystem/rastervision +++ /dev/null @@ -1,10 +0,0 @@ ---- -layout: ecosystem_detail -title: raster-vision -summary: An open source framework for deep learning on satellite and aerial imagery. -link: https://docs.rastervision.io -summary-home: An open source framework for deep learning on satellite and aerial imagery. 
-featured-home: false -github-id: azavea/raster-vision -date-added: 05/07/21 ---- diff --git a/_ecosystem/ray b/_ecosystem/ray deleted file mode 100644 index aab4600a0da4..000000000000 --- a/_ecosystem/ray +++ /dev/null @@ -1,10 +0,0 @@ ---- -layout: ecosystem_detail -title: Ray -summary: Ray is a fast and simple framework for building and running distributed applications. -link: https://github.com/ray-project/ray -summary-home: Ray is a fast and simple framework for building and running distributed applications. -featured-home: false -github-id: ray-project/ray -date-added: 8/20/20 ---- diff --git a/_ecosystem/renate b/_ecosystem/renate deleted file mode 100644 index 46308be32fae..000000000000 --- a/_ecosystem/renate +++ /dev/null @@ -1,10 +0,0 @@ ---- -layout: ecosystem_detail -title: Renate -summary: Renate is a library providing tools for re-training pytorch models over time as new data becomes available. -link: https://renate.readthedocs.io/en/latest/ -summary-home: Renate is a library providing tools for re-training pytorch models over time as new data becomes available. -featured-home: false -github-id: awslabs/renate -date-added: 6/1/23 ---- diff --git a/_ecosystem/roma b/_ecosystem/roma deleted file mode 100644 index c1af32c70cbf..000000000000 --- a/_ecosystem/roma +++ /dev/null @@ -1,9 +0,0 @@ ---- -layout: ecosystem_detail -title: RoMa -summary: RoMa is a standalone library to handle rotation representations with PyTorch (rotation matrices, quaternions, rotation vectors, etc). It aims for robustness, ease-of-use, and efficiency. -link: https://github.com/naver/roma -order: 10 -redirect_to: https://github.com/naver/roma -date-added: 9/21/23 ---- diff --git a/_ecosystem/simulai b/_ecosystem/simulai deleted file mode 100644 index 8076cfe9e122..000000000000 --- a/_ecosystem/simulai +++ /dev/null @@ -1,10 +0,0 @@ ---- -layout: ecosystem_detail -title: SimulAI -summary: SimulAI is basically a toolkit with pipelines for physics-informed machine learning. -link: https://github.com/IBM/simulai -summary-home: SimulAI is basically a toolkit with pipelines for physics-informed machine learning. -featured-home: false -github-id: IBM/simulai -date-added: 1/24/24 ---- diff --git a/_ecosystem/skorch b/_ecosystem/skorch deleted file mode 100644 index 24746439398d..000000000000 --- a/_ecosystem/skorch +++ /dev/null @@ -1,12 +0,0 @@ ---- -layout: ecosystem_detail -title: skorch -summary: skorch is a high-level library for PyTorch that provides full scikit-learn compatibility. -link: https://github.com/skorch-dev/skorch -order: 17 -summary-home: skorch is a high-level library for PyTorch that provides full scikit-learn compatibility. -featured-home: true -redirect_to: https://github.com/skorch-dev/skorch -github-id: skorch-dev/skorch -date-added: 8/14/19 ---- diff --git a/_ecosystem/stable-baselines3 b/_ecosystem/stable-baselines3 deleted file mode 100644 index 81d3ebab042f..000000000000 --- a/_ecosystem/stable-baselines3 +++ /dev/null @@ -1,10 +0,0 @@ ---- -layout: ecosystem_detail -title: Stable Baselines3 -summary: Stable Baselines3 (SB3) is a set of reliable implementations of reinforcement learning algorithms in PyTorch. -link: https://github.com/DLR-RM/stable-baselines3 -summary-home: Stable Baselines3 (SB3) is a set of reliable implementations of reinforcement learning algorithms. 
-featured-home: false -github-id: DLR-RM/stable-baselines3 -date-added: 3/29/21 ---- diff --git a/_ecosystem/stoke b/_ecosystem/stoke deleted file mode 100644 index 773b1e32e5f1..000000000000 --- a/_ecosystem/stoke +++ /dev/null @@ -1,10 +0,0 @@ ---- -layout: ecosystem_detail -title: stoke -summary: A lightweight declarative PyTorch wrapper for context switching between devices, distributed modes, mixed-precision, and PyTorch extensions. -link: https://fidelity.github.io/stoke/ -summary-home: A lightweight declarative PyTorch wrapper for context switching between devices, distributed modes, mixed-precision, and PyTorch extensions. -featured-home: false -github-id: fidelity/stoke -date-added: 09/08/21 ---- diff --git a/_ecosystem/substra b/_ecosystem/substra deleted file mode 100644 index 59e194c5976d..000000000000 --- a/_ecosystem/substra +++ /dev/null @@ -1,10 +0,0 @@ ---- -layout: ecosystem_detail -title: Substra -summary: Substra is a federated learning Python library to run federated learning experiments at scale on real distributed data. -link: https://github.com/Substra -summary-home: Substra is a federated learning Python library to run federated learning experiments at scale on real distributed data. -featured-home: false -github-id: substra -date-added: 6/28/23 ---- diff --git a/_ecosystem/tensorly.md b/_ecosystem/tensorly.md deleted file mode 100644 index 0e7f36ceeb34..000000000000 --- a/_ecosystem/tensorly.md +++ /dev/null @@ -1,10 +0,0 @@ ---- -layout: ecosystem_detail -title: TensorLy -summary: TensorLy is a high level API for tensor methods and deep tensorized neural networks in Python that aims to make tensor learning simple. -link: http://tensorly.org/stable/home.html -order: 18 -redirect_to: http://tensorly.org/stable/home.html -github-id: tensorly/tensorly -date-added: 8/14/19 ---- diff --git a/_ecosystem/textbrewer b/_ecosystem/textbrewer deleted file mode 100644 index 88bad3356eea..000000000000 --- a/_ecosystem/textbrewer +++ /dev/null @@ -1,10 +0,0 @@ ---- -layout: ecosystem_detail -title: TextBrewer -summary: A PyTorch-based knowledge distillation toolkit for natural language processing -link: http://textbrewer.hfl-rc.com -summary-home: A PyTorch-based knowledge distillation toolkit for natural language processing -featured-home: false -github-id: airaria/TextBrewer -date-added: 06/02/21 ---- diff --git a/_ecosystem/tiatoolbox b/_ecosystem/tiatoolbox deleted file mode 100644 index d61918d21523..000000000000 --- a/_ecosystem/tiatoolbox +++ /dev/null @@ -1,10 +0,0 @@ ---- -layout: ecosystem_detail -title: TIAToolbox -summary: TIAToolbox provides an easy-to-use API where researchers can use, adapt and create models for CPath. -link: https://github.com/TissueImageAnalytics/tiatoolbox -summary-home: TIAToolbox provides an easy-to-use API where researchers can use, adapt and create models for CPath. -featured-home: false -github-id: TissueImageAnalytics/tiatoolbox -date-added: 6/1/23 ---- diff --git a/_ecosystem/torchdistill b/_ecosystem/torchdistill deleted file mode 100644 index 6224cf04e847..000000000000 --- a/_ecosystem/torchdistill +++ /dev/null @@ -1,10 +0,0 @@ ---- -layout: ecosystem_detail -title: torchdistill -summary: torchdistill is a coding-free framework built on PyTorch for reproducible deep learning and knowledge distillation studies. -link: https://github.com/yoshitomo-matsubara/torchdistill -summary-home: torchdistill is a coding-free framework built on PyTorch for reproducible deep learning and knowledge distillation studies. 
-featured-home: false -github-id: yoshitomo-matsubara/torchdistill -date-added: 12/05/23 ---- diff --git a/_ecosystem/torchdrift b/_ecosystem/torchdrift deleted file mode 100644 index 714e5015e972..000000000000 --- a/_ecosystem/torchdrift +++ /dev/null @@ -1,10 +0,0 @@ ---- -layout: ecosystem_detail -title: TorchDrift -summary: TorchDrift is a data and concept drift library for PyTorch. It lets you monitor your PyTorch models to see if they operate within spec. -link: https://torchdrift.org -summary-home: TorchDrift is a data and concept drift library for PyTorch. It lets you monitor your PyTorch models to see if they operate within spec. -featured-home: false -github-id: TorchDrift/TorchDrift -date-added: 3/31/21 ---- diff --git a/_ecosystem/torchdrug b/_ecosystem/torchdrug deleted file mode 100644 index 840c10f7efbd..000000000000 --- a/_ecosystem/torchdrug +++ /dev/null @@ -1,11 +0,0 @@ ---- -layout: ecosystem_detail -title: torchdrug -summary: A powerful and flexible machine learning platform for drug discovery. -link: https://torchdrug.ai/ -summary-home: A powerful and flexible machine learning platform for drug discovery. -featured-home: false -github-id: DeepGraphLearning/torchdrug -date-added: 08/19/21 ---- - diff --git a/_ecosystem/torchgeo b/_ecosystem/torchgeo deleted file mode 100644 index 32caf26b9ae2..000000000000 --- a/_ecosystem/torchgeo +++ /dev/null @@ -1,10 +0,0 @@ ---- -layout: ecosystem_detail -title: torchgeo -summary: Datasets, transforms, and models for geospatial data -link: https://github.com/microsoft/torchgeo -summary-home: Datasets, transforms, and models for geospatial data -featured-home: false -github-id: microsoft/torchgeo -date-added: 01/05/22 ---- diff --git a/_ecosystem/torchio b/_ecosystem/torchio deleted file mode 100644 index e7caff8c2ccc..000000000000 --- a/_ecosystem/torchio +++ /dev/null @@ -1,10 +0,0 @@ ---- -layout: ecosystem_detail -title: TorchIO -summary: TorchIO is a set of tools to efficiently read, preprocess, sample, augment, and write 3D medical images in deep learning applications written in PyTorch. -link: https://github.com/fepegar/torchio -summary-home: TorchIO is a set of tools to efficiently read, preprocess, sample, augment, and write 3D medical images in deep learning applications written in PyTorch. -featured-home: false -github-id: fepegar/torchio -date-added: 12/06/20 ---- diff --git a/_ecosystem/torchmetrics b/_ecosystem/torchmetrics deleted file mode 100644 index 4a82d19a12d6..000000000000 --- a/_ecosystem/torchmetrics +++ /dev/null @@ -1,10 +0,0 @@ ---- -layout: ecosystem_detail -title: TorchMetrics -summary: Machine learning metrics for distributed, scalable PyTorch applications. -link: https://github.com/PyTorchLightning/metrics -summary-home: Machine learning metrics for distributed, scalable PyTorch applications. -featured-home: false -github-id: PyTorchLightning/metrics -date-added: 06/22/21 ---- diff --git a/_ecosystem/torchopt b/_ecosystem/torchopt deleted file mode 100644 index 7e1500228210..000000000000 --- a/_ecosystem/torchopt +++ /dev/null @@ -1,10 +0,0 @@ ---- -layout: ecosystem_detail -title: TorchOpt -summary: TorchOpt is a PyTorch-based library for efficient differentiable optimization. -link: https://torchopt.readthedocs.io/en/latest/# -summary-home: TorchOpt is a PyTorch-based library for efficient differentiable optimization. 
-featured-home: false -github-id: metaopt/TorchOpt -date-added: 6/1/23 ---- diff --git a/_ecosystem/torchpoints3d b/_ecosystem/torchpoints3d deleted file mode 100644 index 95c9a39efca8..000000000000 --- a/_ecosystem/torchpoints3d +++ /dev/null @@ -1,10 +0,0 @@ ---- -layout: ecosystem_detail -title: PyTorch-Points3d -summary: A PyTorch framework for deep learning on point clouds. -link: https://torch-points3d.readthedocs.io/en/latest/ -summary-home: A PyTorch framework for deep learning on point clouds. -featured-home: false -github-id: nicolas-chaulet/torch-points3d -date-added: 5/20/20 ---- diff --git a/_ecosystem/torchquantum b/_ecosystem/torchquantum deleted file mode 100644 index aff150a369e6..000000000000 --- a/_ecosystem/torchquantum +++ /dev/null @@ -1,10 +0,0 @@ ---- -layout: ecosystem_detail -title: TorchQuantum -summary: TorchQuantum is a quantum classical simulation framework based on PyTorch. It supports statevector, density matrix simulation and pulse simulation on different hardware platforms such as CPUs and GPUs. -link: https://hanruiwanghw.wixsite.com/torchquantum -summary-home: TorchQuantum is a quantum classical simulation framework based on PyTorch. It supports statevector, density matrix simulation and pulse simulation on different hardware platforms such as CPUs and GPUs. -featured-home: false -github-id: mit-han-lab/torchquantum -date-added: 6/1/23 ---- diff --git a/_ecosystem/trains b/_ecosystem/trains deleted file mode 100644 index a328ae5dd84f..000000000000 --- a/_ecosystem/trains +++ /dev/null @@ -1,10 +0,0 @@ ---- -layout: ecosystem_detail -title: Clear ML -summary: ClearML is a full system ML / DL experiment manager, versioning and ML-Ops solution. -link: https://github.com/allegroai/trains/ -summary-home: ClearML is a full system ML / DL experiment manager, versioning and ML-Ops solution. -featured-home: false -github-id: allegroai/clearml -date-added: 6/17/20 ---- diff --git a/_ecosystem/transformers b/_ecosystem/transformers deleted file mode 100644 index 34d3f19a3904..000000000000 --- a/_ecosystem/transformers +++ /dev/null @@ -1,10 +0,0 @@ ---- -layout: ecosystem_detail -title: Transformers -summary: State-of-the-art Natural Language Processing for PyTorch. -link: https://github.com/huggingface/transformers -summary-home: State-of-the-art Natural Language Processing for PyTorch. -featured-home: false -github-id: huggingface/transformers -date-added: 01/18/21 ---- diff --git a/_ecosystem/trtorch b/_ecosystem/trtorch deleted file mode 100644 index fcdeaf03b162..000000000000 --- a/_ecosystem/trtorch +++ /dev/null @@ -1,10 +0,0 @@ ---- -layout: ecosystem_detail -title: Torch-TensorRT -summary: PyTorch/TorchScript compiler for NVIDIA GPUs using TensorRT -link: https://pytorch.org/TensorRT/ -summary-home: PyTorch/TorchScript compiler for NVIDIA GPUs using TensorRT -featured-home: false -github-id: NVIDIA/Torch-TensorRT -date-added: 03/28/22 ---- diff --git a/_ecosystem/usb b/_ecosystem/usb deleted file mode 100644 index 73b8de9a20be..000000000000 --- a/_ecosystem/usb +++ /dev/null @@ -1,10 +0,0 @@ ---- -layout: ecosystem_detail -title: USB -summary: USB is a Pytorch-based Python package for Semi-Supervised Learning (SSL). It is easy-to-use/extend, affordable to small groups, and comprehensive for developing and evaluating SSL algorithms. -link: https://usb.readthedocs.io/ -summary-home: USB is a Pytorch-based Python package for Semi-Supervised Learning (SSL). 
It is easy-to-use/extend, affordable to small groups, and comprehensive for developing and evaluating SSL algorithms. -featured-home: false -github-id: microsoft/Semi-supervised-learning -date-added: 6/1/23 ---- diff --git a/_ecosystem/vissl b/_ecosystem/vissl deleted file mode 100644 index 12329124081f..000000000000 --- a/_ecosystem/vissl +++ /dev/null @@ -1,10 +0,0 @@ ---- -layout: ecosystem_detail -title: VISSL -summary: A library for state-of-the-art self-supervised learning -link: https://vissl.ai/ -summary-home: A library for state-of-the-art self-supervised learning -featured-home: false -github-id: facebookresearch/vissl -date-added: 2/1/21 ---- diff --git a/_ecosystem/vllm b/_ecosystem/vllm deleted file mode 100644 index 0c510878f4d7..000000000000 --- a/_ecosystem/vllm +++ /dev/null @@ -1,10 +0,0 @@ ---- -layout: ecosystem_detail -title: vllm -summary: vllm is a high-throughput and memory-efficient inference and serving engine for LLMs. -link: https://github.com/vllm-project/vllm -summary-home: vllm is a high-throughput and memory-efficient inference and serving engine for LLMs. -featured-home: false -github-id: vllm-project/vllm -date-added: 12/3/24 ---- diff --git a/_events/2d_distributed_tensor.md b/_events/2d_distributed_tensor.md deleted file mode 100644 index a362704ac012..000000000000 --- a/_events/2d_distributed_tensor.md +++ /dev/null @@ -1,11 +0,0 @@ ---- -category: event -title: "PyTorch 2.0 Ask the Engineers Live Q&A Series: 2D + Distributed Tensor" -date: March 1, 2023 - ---- - -**Date**: March 1, 2023, 11 AM PST -**Speakers**: Wanchao Liang and Junjie Wang -[Watch on YouTube](https://www.youtube.com/watch?v=dKkSYNQISeI&list=PL_lsbAsL_o2CQr8oh5sNWt96yWQphNEzM&index=13) -[Watch on LinkedIn](https://www.linkedin.com/video/event/urn:li:ugcPost:7032431642803671040/) diff --git a/_events/ai-programming.md b/_events/ai-programming.md deleted file mode 100644 index 32379ba8c65e..000000000000 --- a/_events/ai-programming.md +++ /dev/null @@ -1,19 +0,0 @@ ---- -category: event -title: "AI-Powered Competitive Programming: My HackerCup 2024 Experience" -date: January 24, 2025 -poster: assets/images/ai-programming.png ---- - -**Date**: January 24, 2025, 1PM ET - - -AI-Powered Competitive Programming - - - -In this talk, Anton will share how he built an AI agent that ranked #1 in the finals of Meta HackerCup 2024 (AI division). Anton developed a workflow that could solve the hardest competitive programming problems quickly and reliably. Anton will walk through how he used state-of-the-art reasoning LLM models, curated RAG, and leveraged cloud infrastructure to safely test and execute solutions at scale. This approach highlights the massive potential of test-time compute scaling and provides insights into AI's future role in programming. - -Anton Pidkuiko is a Software Engineer at Meta, Reality Labs in London. He is currently working on applying the power of Large Language Models to Metaverse Avatar product experiences. 
- -[More info on this event.](/ai-powered-competitive-programming) diff --git a/_events/autonomous-language-model-systems.md b/_events/autonomous-language-model-systems.md deleted file mode 100644 index 8532258afef0..000000000000 --- a/_events/autonomous-language-model-systems.md +++ /dev/null @@ -1,23 +0,0 @@ ---- -category: event -title: "Towards Autonomous Language Model Systems" -date: May 21, 2025 -poster: assets/images/pt-day-cfp.png ---- - - -Towards Autonomous Language Model Systems - - -**Date**: May 21, 2025, 11AM PT / 2PM ET -**Location**: Online - -Language models (LMs) are increasingly used to assist users in day-to-day tasks such as programming (GitHub Copilot) or search (Google's AI Overviews). But can we build language model systems that are able to autonomously complete entire tasks end-to-end? - -In this talk, Ofir Press will discuss efforts to build autonomous LM systems, focusing on the software engineering domain. Ofir will present SWE-bench, a novel method for measuring AI systems on their abilities to fix real issues in popular software libraries. Ofir will then discuss SWE-agent, a system for solving SWE-bench tasks. - -SWE-bench and SWE-agent are used by many leading AI organizations in academia and industry, including OpenAI, Anthropic, Meta, and Google, and SWE-bench has been downloaded over 2 million times. These projects show that academics on tight budgets can have a substantial impact in steering the research community toward building autonomous systems that can complete challenging tasks. - -Ofir is a postdoc at Princeton University, where they mainly work with Karthik Narasimhan's lab. Ofir previously completed their PhD at the University of Washington in Seattle, where Ofir was advised by Noah Smith. During their PhD, Ofir spent two years at Facebook AI Research Labs on Luke Zettlemoyer's team. - -[Register Now](/autonomous-language-model-systems) diff --git a/_events/ce1.md b/_events/ce1.md deleted file mode 100644 index 94c9e66165d9..000000000000 --- a/_events/ce1.md +++ /dev/null @@ -1,14 +0,0 @@ ---- -category: event -title: "COLING 2025" -date: Jan 19, 2025 ---- -Community Event - -**Date**: Jan 19 - 25, 2025 - -COLING, the International Conference on Computational Linguistics, is one of the premier conferences for natural language processing and computational linguistics. - -First established in 1965, the biennial COLING conference is held in diverse parts of the globe and attracts participants from both top-ranked research centers and emerging countries. Today, the most important developments in our field are taking place not only in universities and academic research institutes but also in industrial research departments including tech-startups. COLING provides opportunities for all these communities to showcase their exciting discoveries.
- -[Learn more about this event](https://coling2025.org/) \ No newline at end of file diff --git a/_events/ce10.md b/_events/ce10.md deleted file mode 100644 index 67d9e00f66f8..000000000000 --- a/_events/ce10.md +++ /dev/null @@ -1,13 +0,0 @@ ---- -category: event -title: "PyCon 2025" -date: May 14, 2025 ---- -Community Event - -**Date**: May 15 - 22, 2025 -**Location**: Pittsburgh, PA - -At PyCon US 2025, find a program filled with pre-conference tutorials and sponsor presentations, 90+ of our community’s best talks, which includes the Charlas track, brilliant keynote speakers, posters on display, a lively Expo Hall filled with incredible Sponsors’ booths, and famed lightning talks on each main conference day. - -[Learn more about this event](https://us.pycon.org/2025/) diff --git a/_events/ce11.md b/_events/ce11.md deleted file mode 100644 index 7cc0095a96cd..000000000000 --- a/_events/ce11.md +++ /dev/null @@ -1,15 +0,0 @@ ---- -category: event -title: "GamesBeat Summit 2025" -date: May 19, 2025 ---- -Community Event - -**Date**: May 19 - 20, 2025 -**Location**: Los Angeles, CA - -The gaming industry is on the cusp of a transformative era, driven by innovation, cultural impact, and new economic opportunities. At GamesBeat Summit 2025, explore how creative storytelling, community engagement, and effective business strategies are shaping the future of the gaming industry. - -Delve into the diverse influences—ranging from player experiences to industry collaborations—that are paving the way for the next phase of growth. - -[Learn more about this event](https://gbs.venturebeat.com/) diff --git a/_events/ce12.md b/_events/ce12.md deleted file mode 100644 index d2ea93af6df7..000000000000 --- a/_events/ce12.md +++ /dev/null @@ -1,13 +0,0 @@ ---- -category: event -title: "NYC Tech Week" -date: Jun 2, 2025 ---- -Community Event - -**Date**: Jun 2 - 8, 2025 -**Location**: New York City - -Tech Week is a decentralized tech conference presented by a16z. Every Tech Week, hundreds of events take place across the host city - from hackathons to panel events, community meetups and more. Every event is organized individually by startups, companies and VCs. - -[Learn more about this event](https://www.tech-week.com/) diff --git a/_events/ce14.md b/_events/ce14.md deleted file mode 100644 index fcfab07f890f..000000000000 --- a/_events/ce14.md +++ /dev/null @@ -1,13 +0,0 @@ ---- -category: event -title: "Data + AI Summit" -date: Jun 9, 2025 ---- -Community Event - -**Date**: Jun 9 - 12, 2025 -**Location**: San Francisco, CA - -Join 20,000 peers for 700+ sessions, keynotes and training at the world’s largest data, analytics and AI conference. - -[Learn more about this event](https://www.databricks.com/dataaisummit) diff --git a/_events/ce15.md b/_events/ce15.md deleted file mode 100644 index e85a7403d1e8..000000000000 --- a/_events/ce15.md +++ /dev/null @@ -1,13 +0,0 @@ ---- -category: event -title: "CVPR 2025" -date: Jun 10, 2025 ---- -Community Event - -**Date**: Jun 10 - 17, 2025 -**Location**: Nashville, TN - -The IEEE / CVF Computer Vision and Pattern Recognition Conference (CVPR) is the premier annual computer vision event comprising the main conference and several co-located workshops and short courses. With its high quality and low cost, it provides an exceptional value for students, academics and industry researchers.
- -[Learn more about this event](https://cvpr.thecvf.com/) diff --git a/_events/ce16.md b/_events/ce16.md deleted file mode 100644 index eda670bc7191..000000000000 --- a/_events/ce16.md +++ /dev/null @@ -1,13 +0,0 @@ ---- -category: event -title: "We are Developers Conference" -date: Jul 9, 2025 ---- -Community Event - -**Date**: Jul 9 - 11, 2025 -**Location**: Berlin, Germany - -Join the largest gathering of software innovators, tech leaders, and decision-makers shaping the future of AI-powered technology. - -[Learn more about this event](https://www.wearedevelopers.com/world-congress) diff --git a/_events/ce17.md b/_events/ce17.md deleted file mode 100644 index ded03e328983..000000000000 --- a/_events/ce17.md +++ /dev/null @@ -1,13 +0,0 @@ ---- -category: event -title: "ICML 2025" -date: Jul 13, 2025 ---- -Community Event - -**Date**: Jul 13 - 19, 2025 -**Location**: Berlin, Germany - -Forty-Second International Conference on Machine Learning. - -[Learn more about this event](https://icml.cc/) diff --git a/_events/ce18.md b/_events/ce18.md deleted file mode 100644 index dd61d8531f90..000000000000 --- a/_events/ce18.md +++ /dev/null @@ -1,13 +0,0 @@ ---- -category: event -title: "SIGGRAPH 2025" -date: Aug 10, 2025 ---- -Community Event - -**Date**: Aug 10 - 14, 2025 -**Location**: Vancouver, B.C. - -[ACM SIGGRAPH](https://www.siggraph.org/) is a special interest group (SIG) devoted to computer graphics (GRAPH) within the [Association for Computing Machinery](https://www.acm.org/) (ACM), the world’s largest educational and scientific computing society devoted to advancing computing as a science and a profession. Its annual conference, first held in 1974, is the premier conference on computer graphics and interactive techniques worldwide. At SIGGRAPH 2025, we boldly look toward the future, imagining how humanity and technology will be increasingly connected and examining how we can create a future that connects our physical and digital worlds for the better. - -[Learn more about this event](https://s2025.siggraph.org/) diff --git a/_events/ce19.md b/_events/ce19.md deleted file mode 100644 index 2e9625dd9a67..000000000000 --- a/_events/ce19.md +++ /dev/null @@ -1,13 +0,0 @@ ---- -category: event -title: "San Francisco Tech Week" -date: Oct 6, 2025 ---- -Community Event - -**Date**: Oct 6 - 12, 2025 -**Location**: San Francisco - -Tech Week is a decentralized tech conference presented by a16z. Every Tech Week, hundreds of events take place across the host city - from hackathons to panel events, community meetups and more. Every event is organized individually by startups, companies and VCs. - -[Learn more about this event](https://www.tech-week.com/) diff --git a/_events/ce2.md b/_events/ce2.md deleted file mode 100644 index f0857e44a475..000000000000 --- a/_events/ce2.md +++ /dev/null @@ -1,15 +0,0 @@ ---- -category: event -title: "Open Source AI Summit" -date: Jan 22, 2025 ---- -Community Event - -**Date**: Jan 22, 2025 -**Location**: Paris, France - -Open Source AI has become a major trend in the industry, with even many digital giants adopting an Open Source approach. While Open Source AI isn't magic, it does offer the potential to address many challenges more effectively than proprietary AI models. - -This first edition of the Paris Open Source AI Summit will bring together global leaders and industry players to address these issues. The summit will aim to establish a common set of ideas, vocabulary and definitions to create a shared understanding of the current state of Open Source AI. 
- -[Learn more about this event](https://opensourceaisummit.eu/#rec838155366) diff --git a/_events/ce20.md b/_events/ce20.md deleted file mode 100644 index de0a07092616..000000000000 --- a/_events/ce20.md +++ /dev/null @@ -1,13 +0,0 @@ ---- -category: event -title: "LA Tech Week" -date: Oct 13, 2025 ---- -Community Event - -**Date**: Oct 13 - 19, 2025 -**Location**: Los Angeles, CA - -Tech Week is a decentralized tech conference presented by a16z. Every Tech Week, hundreds of events take place across the host city - from hackathons to panel events, community meetups and more. Every event is organized individually by startups, companies and VCs. - -[Learn more about this event](https://www.tech-week.com/) diff --git a/_events/ce21.md b/_events/ce21.md deleted file mode 100644 index c7b0e5dae932..000000000000 --- a/_events/ce21.md +++ /dev/null @@ -1,13 +0,0 @@ ---- -category: event -title: "ICCV 2025" -date: Oct 20, 2025 ---- -Community Event - -**Date**: Oct 20 - 24, 2025 -**Location**: Honolulu, HI - -International Conference on Computer Vision, ICCV 2025. - -[Learn more about this event](https://iccv.thecvf.com/) diff --git a/_events/ce22.md b/_events/ce22.md deleted file mode 100644 index 07ef894b515a..000000000000 --- a/_events/ce22.md +++ /dev/null @@ -1,15 +0,0 @@ ---- -category: event -title: "Open Source AI Week" -date: Oct 18, 2025 ---- -Community Event - -**Date**: Oct 18 - 26, 2025 -**Location**: San Francisco, CA - -Open Source AI Week is the premier event that brings together the best AI and ML conferences, hackathons, startup showcases, and networking opportunities exploring the intersection of artificial intelligence, machine learning, and open source technology. Taking place October 18 – 26, 2025 in the San Francisco area, this week-long celebration is dedicated to fostering innovation, collaboration, and community-driven solutions in the rapidly evolving AI landscape, featuring the PyTorch Conference as the flagship event. - -[Submit your event](https://linuxfoundation.research.net/r/FD6JMH5) to be included in Open Source AI Week, and check back mid-May to see the Open Source AI Week event lineup! - -[Learn more about this event](https://events.linuxfoundation.org/open-source-ai-week/) diff --git a/_events/ce23.md b/_events/ce23.md deleted file mode 100644 index e06dedf1e645..000000000000 --- a/_events/ce23.md +++ /dev/null @@ -1,13 +0,0 @@ ---- -category: event -title: "NeurIPS 2025" -date: Dec 7, 2025 ---- -Community Event - -**Date**: Dec 7 - 10, 2025 -**Location**: San Diego, CA - -The Thirty-Ninth Annual Conference on Neural Information Processing Systems. - -[Learn more about this event](https://neurips.cc/) diff --git a/_events/ce24.md b/_events/ce24.md deleted file mode 100644 index d08216a6e078..000000000000 --- a/_events/ce24.md +++ /dev/null @@ -1,15 +0,0 @@ ---- -category: event -title: "ECCV 2026" -date: Sep 9, 2026 ---- -Community Event - -**Date**: Sep 9 - 13, 2026 -**Location**: Malmö, Sweden - -ECCV is the official event under the European Computer Vision Association and is held biennially in even-numbered years. Any other event trying to utilize this title is not a sanctioned event. - -The European Conference on Computer Vision (ECCV) is a biennial premier research conference in Computer Vision and Machine Learning, managed by the [European Computer Vision Association (ECVA)](https://www.ecva.net/). It is held in even years and gathers the scientific and industrial communities in these areas.
The first ECCV was held in 1990 in Antibes, France, and has subsequently been organized all over Europe. Paper proceedings are published by [Springer Science+Business Media](https://en.wikipedia.org/wiki/Springer_Science%2BBusiness_Media). - -[Learn more about this event](https://eccv.ecva.net/) diff --git a/_events/ce25.md b/_events/ce25.md deleted file mode 100644 index 2d9d6d02d568..000000000000 --- a/_events/ce25.md +++ /dev/null @@ -1,11 +0,0 @@ ---- -category: event -title: "GOSIM AI" -date: May 6, 2025 ---- -Community Event - -**Date**: May 6 - 7, 2025 -**Location**: Paris, France - -[Learn more about this event](https://paris2025.gosim.org/) diff --git a/_events/ce26.md b/_events/ce26.md deleted file mode 100644 index 328b0fd3d870..000000000000 --- a/_events/ce26.md +++ /dev/null @@ -1,13 +0,0 @@ ---- -category: event -title: "PyTorch ATX Community Meetup" -date: April 30, 2025 ---- -Community Event - -**Date**: April 30, 2025 -**Location**: Austin, TX - -The Triton framework provides a hardware-agnostic way of programming and targeting GPUs. As Triton becomes more widely adopted, it will be essential to understand how to write, optimize, and troubleshoot Triton kernels in order to maximize GPU efficiency for algorithms. Join the PyTorch community meetup to learn how Red Hat, Intel, AMD, IBM Research and the University of Texas are working on developing Triton kernels. - -[Learn more about this event](https://meetu.ps/e/NYlm0/qrnF8/i) diff --git a/_events/ce3.md b/_events/ce3.md deleted file mode 100644 index 9a4e195afee3..000000000000 --- a/_events/ce3.md +++ /dev/null @@ -1,15 +0,0 @@ ---- -category: event -title: "Open Source Forum" -date: Feb 13, 2025 ---- -Community Event - -**Date**: Feb 13, 2025 -**Location**: Los Angeles, CA - -The Academy Software Foundation’s (ASWF) annual Open Source Forum brings together Foundation members and select guests from the motion picture and media industries to collaborate and discuss the future of open source software. - -Open Source Forum 2025 features a new format to better enable open dialogue and interactive discussion. Hosted at Walt Disney Animation Studios in Burbank, CA, the half-day event will kick off with several presentations around the anatomy of a studio, emerging technologies impacting studios, and open source opportunities, followed by a moderated discussion. - -[Learn more about this event](https://events.linuxfoundation.org/aswf-open-source-forum/) diff --git a/_events/ce4.md b/_events/ce4.md deleted file mode 100644 index 1b1063abf142..000000000000 --- a/_events/ce4.md +++ /dev/null @@ -1,13 +0,0 @@ ---- -category: event -title: "AAAI Conference on AI" -date: Feb 25, 2025 ---- -Community Event - -**Date**: Feb 25 - Mar 4, 2025 -**Location**: Philadelphia, PA - -The purpose of the AAAI conference series is to promote research in Artificial Intelligence (AI) and foster scientific exchange between researchers, practitioners, scientists, students, and engineers across the entirety of AI and its affiliated disciplines. AAAI-25 will feature technical paper presentations, special tracks, invited speakers, workshops, tutorials, poster sessions, senior member presentations, competitions, exhibit programs, and a range of other activities to be announced.
- -[Learn more about this event](https://aaai.org/conference/aaai/) diff --git a/_events/ce5.md b/_events/ce5.md deleted file mode 100644 index 6be2a635a465..000000000000 --- a/_events/ce5.md +++ /dev/null @@ -1,13 +0,0 @@ ---- -category: event -title: "Nvidia GTC 2025" -date: Mar 17, 2025 ---- -Community Event - -**Date**: Mar 17 - 21, 2025 -**Location**: San Jose, CA - -Nvidia's GTC 2025, a global AI conference for developers, showcased advancements in AI, robotics, and data centers, with key announcements including the Blackwell Ultra AI chip and the Vera Rubin architecture. - -[Learn more about this event](https://www.nvidia.com/gtc/) diff --git a/_events/ce6.md b/_events/ce6.md deleted file mode 100644 index 1a45335fedf1..000000000000 --- a/_events/ce6.md +++ /dev/null @@ -1,15 +0,0 @@ ---- -category: event -title: "LF Member Summit" -date: Mar 18, 2025 ---- -Community Event - -**Date**: Mar 18 - 20, 2025 -**Location**: Napa, CA - -The Linux Foundation Member Summit is the annual gathering for Linux Foundation member organizations. - -An annual gathering for Linux Foundation members that fosters collaboration, innovation, and partnerships among the leading projects and organizations working to drive digital transformation with open source technologies. It is a must-attend for business and technical leaders looking to advance open source strategy, implementation, and investment in their organizations and learn how to collaboratively manage the largest shared technology investment of our time. - -[Learn more about this event](https://events.linuxfoundation.org/lf-member-summit/) diff --git a/_events/ce7.md b/_events/ce7.md deleted file mode 100644 index 37a87c50453f..000000000000 --- a/_events/ce7.md +++ /dev/null @@ -1,15 +0,0 @@ ---- -category: event -title: "ICLR 2025" -date: Apr 24, 2025 ---- -Community Event - -**Date**: Apr 24 - 28, 2025 -**Location**: Singapore - -The International Conference on Learning Representations (ICLR) is the premier gathering of professionals dedicated to the advancement of the branch of artificial intelligence called representation learning, but generally referred to as deep learning. - -ICLR is globally renowned for presenting and publishing cutting-edge research on all aspects of deep learning used in the fields of artificial intelligence, statistics and data science, as well as important application areas such as machine vision, computational biology, speech recognition, text understanding, gaming, and robotics. - -[Learn more about this event](https://iclr.cc/) diff --git a/_events/ce8.md b/_events/ce8.md deleted file mode 100644 index 13d99e29d4bc..000000000000 --- a/_events/ce8.md +++ /dev/null @@ -1,15 +0,0 @@ ---- -category: event -title: "Dubai AI Festival" -date: Apr 23, 2025 ---- -Community Event - -**Date**: Apr 23 - 24, 2025 -**Location**: Dubai, UAE - -At Dubai AI Festival, attendees will experience the convergence of artificial intelligence, blockchain, XR, and decentralised systems, driving the progression of digital economies and technological innovation. - -This dynamic platform is designed to foster collaboration, innovation, and knowledge-sharing among industry leaders, entrepreneurs, and tech enthusiasts from around the world. Join us to engage with the future of technology at Dubai AI Festival.
- -[Learn more about this event](https://dubaiaifestival.com/) diff --git a/_events/ce9.md b/_events/ce9.md deleted file mode 100644 index 99bfe5b69ed9..000000000000 --- a/_events/ce9.md +++ /dev/null @@ -1,13 +0,0 @@ ---- -category: event -title: "MLSys" -date: May 12, 2025 ---- -Community Event - -**Date**: May 12 - 15, 2025 -**Location**: Santa Clara, CA - -The Eighth Annual Conference on Machine Learning and Systems - -[Learn more about this event](https://mlsys.org/) diff --git a/_events/ddp_fdsp_support.md b/_events/ddp_fdsp_support.md deleted file mode 100644 index 6c1cb7a33f33..000000000000 --- a/_events/ddp_fdsp_support.md +++ /dev/null @@ -1,11 +0,0 @@ ---- -category: event -title: "PyTorch 2.0 Ask the Engineers Live Q&A Series: DDP/FDSP Support" -date: January 24, 2023 - ---- - -**Date**: January 24, 2023, 2PM PST -**Speaker**: Will Constable -[Watch on YouTube](https://www.youtube.com/watch?v=6S4tH9qEswo) -[Watch on LinkedIn](https://www.linkedin.com/video/event/urn:li:ugcPost:7023384711771160577/?utm_content=235500656&utm_medium=social&utm_source=twitter&hss_channel=tw-776585502606721024) diff --git a/_events/deep_dive_torchinductor.md b/_events/deep_dive_torchinductor.md deleted file mode 100644 index f659a159a19a..000000000000 --- a/_events/deep_dive_torchinductor.md +++ /dev/null @@ -1,11 +0,0 @@ ---- -category: event -title: "PyTorch 2.0 Ask the Engineers Live Q&A Series: A deep dive on TorchInductor and PT2 Backend Integration" -date: January 25, 2023 - ---- - -**Date**: January 25, 2023, 11AM PST -**Speakers**: Natalia Gimelshein, Bin Bao, Sherlock Huang and Eikan Wang -[Watch on YouTube](https://www.youtube.com/watch?v=AaFc3C7CZAs) -[Watch on LinkedIn](https://www.linkedin.com/video/event/urn:li:ugcPost:7023453241841500161/?utm_content=235635469&utm_medium=social&utm_source=twitter&hss_channel=tw-776585502606721024) diff --git a/_events/devcon-meetup.md b/_events/devcon-meetup.md deleted file mode 100644 index a93c10cd4c6b..000000000000 --- a/_events/devcon-meetup.md +++ /dev/null @@ -1,10 +0,0 @@ ---- -category: event -title: "PyTorch Meetup at DevConf.IN 2025" -date: Feb 28, 2025 ---- - -**Date**: Feb 28, 2025 -**Location**: Pune, India - -[Event Blog](https://pytorch.org/blog/pt-fedora-os-communities/) \ No newline at end of file diff --git a/_events/docathon-2024.md b/_events/docathon-2024.md deleted file mode 100644 index 099dc36894a5..000000000000 --- a/_events/docathon-2024.md +++ /dev/null @@ -1,32 +0,0 @@ ---- -category: event -title: "PyTorch Docathon 2024" -date: June 4, 2024 -poster: assets/images/docathon-2024.png ---- - -**Join us for our 3rd PyTorch Docathon June 4-20th** - -**Date**: June 4 - 20th, 2024 - - -PyTorch Docathon banner - - -The Docathon, similar to a hackathon, is an event focused on improving PyTorch documentation with help from our community. Quality documentation is crucial for any technology, and by enhancing it, we make it easier for new users to start with PyTorch, use its features effectively, and accelerate the shift from research to production in machine learning. See our previous events here and here. - -#### Why Participate - -The Docathon is an inclusive event designed to be accessible to newcomers, requiring only a basic understanding of Python, PyTorch, and Machine Learning, with some tasks not even requiring these skills. It offers a rewarding experience as participants can see the direct impact of their contributions on the project's usability and accessibility. 
The Docathon promotes a collaborative environment, allowing participants to work with other contributors and PyTorch maintainers, fostering the exchange of ideas and networking. It also provides a rich learning experience, offering the opportunity to explore PyTorch modules, update docstrings, and test tutorials. - - -#### Event Details - -- **June 4**: Kick-off -- **June 4 - 16**: Submissions and Feedback -- **June 17 - 18**: Final Reviews -- **June 20**: Winner Announcements - -Further details for the Docathon will be announced at the Kick-off call on June 4. - -[Register here](https://hubs.la/Q02xrG5Z0) \ No newline at end of file diff --git a/_events/docathon-2025.md b/_events/docathon-2025.md deleted file mode 100644 index 88bc55a52724..000000000000 --- a/_events/docathon-2025.md +++ /dev/null @@ -1,16 +0,0 @@ ---- -category: event -title: "Docathon 2025" -date: Jun 3, 2025 ---- - -**Date**: June 3-18, 2025 -**Location**: Online - - -PyTorch Docathon - - -The PyTorch Docathon 2025, akin to a hackathon, is an event dedicated to enhancing the quality of the PyTorch documentation with the invaluable assistance of our community. This is an inclusive event designed to be accessible to all levels of expertise, from newcomers to experienced ML/PyTorch users. It offers a rewarding experience as participants can see the direct impact of their contributions on the project's usability and accessibility. The Docathon promotes a collaborative environment, allowing participants to work with other contributors and PyTorch maintainers, fostering the exchange of ideas and networking. It also provides a rich learning experience, offering the opportunity to explore PyTorch modules, update docstrings, and test tutorials. - -[RSVP Now](https://community.linuxfoundation.org/events/details/lfhq-pytorch-foundation-presents-pytorch-docathon-june-3rd-18th-2025/) \ No newline at end of file diff --git a/_events/docathon.md b/_events/docathon.md deleted file mode 100644 index ff4529ba1833..000000000000 --- a/_events/docathon.md +++ /dev/null @@ -1,18 +0,0 @@ ---- -category: event -title: "PyTorch 2023 Docathon" -date: May 31, 2023 - ---- - -**Date**: May 31, 2023 - -We are excited to announce the first-ever PyTorch Docathon! - -The Docathon is a hackathon-style event focused on improving documentation by enlisting the community's help. Documentation is a crucial aspect of any technology. - -By improving the documentation, we can make it easier for users to get started with PyTorch, help them understand how to use its features effectively, and ultimately accelerate research to production in the field of machine learning. - -Details for the Docathon will be announced at the kick-off call on May 31. 
- -[Register now to join this year’s event](https://community.linuxfoundation.org/e/mmbqqb/) diff --git a/_events/dynamic_shapes.md b/_events/dynamic_shapes.md deleted file mode 100644 index 3940d48af81c..000000000000 --- a/_events/dynamic_shapes.md +++ /dev/null @@ -1,11 +0,0 @@ ---- -category: event -title: "PyTorch 2.0 Ask the Engineers Live Q&A Series: Dynamic Shapes and Calculating Maximum Batch Size" -date: February 7, 2023 - ---- - -**Date**: February 7, 2023, 1PM PST -**Speakers**: Edward Yang and Elias Ellison -[Watch on YouTube](https://www.youtube.com/watch?v=4dX4kuVbl9U&list=PL_lsbAsL_o2CQr8oh5sNWt96yWQphNEzM&index=10) -[Watch on LinkedIn](https://www.linkedin.com/events/7027389624620634112/?lipi=urn%3Ali%3Apage%3Ad_flagship3_company_admin%3BUIqLb0KRSZm%2FHtSu2bw%2B4g%3D%3D) diff --git a/_events/episode_1.md b/_events/episode_1.md deleted file mode 100644 index 66295ce01ebf..000000000000 --- a/_events/episode_1.md +++ /dev/null @@ -1,10 +0,0 @@ ---- -category: live-stream -title: Episode 1 -guest: Thomas Viehmann -company: TorchDrift -date: May 19, 2021 -time: 1 pM PST -poster: https://s3.amazonaws.com/assets.pytorch.org/pted2021/posters/I3.png -video: https://www.youtube.com/watch?v=rV5BhoKILoE ---- diff --git a/_events/episode_10.md b/_events/episode_10.md deleted file mode 100644 index 1052cd929053..000000000000 --- a/_events/episode_10.md +++ /dev/null @@ -1,10 +0,0 @@ ---- -category: live-stream -title: Episode 10 -guest: Robin Lobel -company: TorchStudio -date: July 21, 2021 -time: 9 AM PST -poster: https://s3.amazonaws.com/assets.pytorch.org/pted2021/posters/F4.png -video: https://www.youtube.com/watch?v=2YZRw2BF8Gw ---- diff --git a/_events/episode_11.md b/_events/episode_11.md deleted file mode 100644 index 4a13f263e51f..000000000000 --- a/_events/episode_11.md +++ /dev/null @@ -1,10 +0,0 @@ ---- -category: live-stream -title: Episode 11 -guest: Will Falcon, Thomas Chaton -company: PyTorch Lightning -date: July 28, 2021 -time: 9 AM PST -poster: https://s3.amazonaws.com/assets.pytorch.org/pted2021/posters/E4.png -video: https://www.youtube.com/watch?v=A1bkh4gNDJA ---- diff --git a/_events/episode_12.md b/_events/episode_12.md deleted file mode 100644 index f96bef9ae3e5..000000000000 --- a/_events/episode_12.md +++ /dev/null @@ -1,11 +0,0 @@ ---- -category: live-stream -title: Episode 12 -guest: Shagun Sodhani -company: Facebook AI -date: August 4, 2021 -time: 1PM PST -poster: https://s3.amazonaws.com/assets.pytorch.org/pted2021/posters/C2.png -link: https://www.youtube.com/pytorch -video: https://www.youtube.com/watch?v=QIX9b9EAZOY ---- diff --git a/_events/episode_13.md b/_events/episode_13.md deleted file mode 100644 index 069aac160a58..000000000000 --- a/_events/episode_13.md +++ /dev/null @@ -1,11 +0,0 @@ ---- -category: live-stream -title: Episode 13 -guest: Sabrina Smai + Geeta Chauhan -company: Microsoft + Facebook AI -date: August 11, 2021 -time: 1PM PST -poster: https://s3.amazonaws.com/assets.pytorch.org/pted2021/posters/H6.png -link: https://www.youtube.com/pytorch -video: https://www.youtube.com/watch?v=m6ouC0XMYnc ---- diff --git a/_events/episode_14.md b/_events/episode_14.md deleted file mode 100644 index f754b8aee68a..000000000000 --- a/_events/episode_14.md +++ /dev/null @@ -1,11 +0,0 @@ ---- -category: live-stream -title: Episode 14 -guest: Edgar Riba -company: Kornia -date: August 17, 2021 -time: 9AM PST -poster: https://s3.amazonaws.com/assets.pytorch.org/pted2021/posters/C7.png -link: https://www.youtube.com/pytorch -video: 
https://www.youtube.com/watch?v=Ge5T6eZ2WY4 ---- diff --git a/_events/episode_15.md b/_events/episode_15.md deleted file mode 100644 index 72fcbe1281f2..000000000000 --- a/_events/episode_15.md +++ /dev/null @@ -1,11 +0,0 @@ ---- -category: live-stream -title: Episode 15 -guest: Tom Bromley + Josh Izaac -company: Xanadu -date: August 25, 2021 -time: 9 AM PST -poster: https://s3.amazonaws.com/assets.pytorch.org/pted2021/posters/K1.png -link: https://s3.amazonaws.com/assets.pytorch.org/pted2021/posters/K1.png -video: https://www.youtube.com/watch?v=-4kOxux_XSQ ---- diff --git a/_events/episode_16.md b/_events/episode_16.md deleted file mode 100644 index 240d68f8b640..000000000000 --- a/_events/episode_16.md +++ /dev/null @@ -1,11 +0,0 @@ ---- -category: live-stream -title: Episode 16 -guest: Natasha Seelam + Patricio Cerda-Mardini -company: MindsDB -date: September 1, 2021 -time: 9AM PST -poster: https://s3.amazonaws.com/assets.pytorch.org/pted2021/posters/H8.png -link: https://www.youtube.com/pytorch -video: https://www.youtube.com/watch?v=tHYgd7aP8m0 ---- diff --git a/_events/episode_17.md b/_events/episode_17.md deleted file mode 100644 index ef3f31504a17..000000000000 --- a/_events/episode_17.md +++ /dev/null @@ -1,10 +0,0 @@ ---- -category: live-stream -title: Episode 17 -guest: Sergey Kolesnikov -company: Catalyst -date: September 8, 2021 -time: 9 AM PST -poster: https://s3.amazonaws.com/assets.pytorch.org/pted2021/posters/K2.png -video: https://www.youtube.com/watch?v=GLdLz27GoLs ---- diff --git a/_events/episode_18.md b/_events/episode_18.md deleted file mode 100644 index 27f92c2a0683..000000000000 --- a/_events/episode_18.md +++ /dev/null @@ -1,10 +0,0 @@ ---- -category: live-stream -title: Episode 18 -guest: Victor Fomin -company: PyTorch Ignite -date: September 15, 2021 -time: 9 AM PST -poster: https://s3.amazonaws.com/assets.pytorch.org/pted2021/posters/G4.png -video: https://www.youtube.com/watch?v=vJqdgmTzo7E ---- diff --git a/_events/episode_19.md b/_events/episode_19.md deleted file mode 100644 index 4ad7053c2777..000000000000 --- a/_events/episode_19.md +++ /dev/null @@ -1,10 +0,0 @@ ---- -category: live-stream -title: Episode 19 -guest: Abhi Khobare and Chirag Patel -company: Qualcomm -date: September 22, 2021 -time: 9 AM PST -poster: https://s3.amazonaws.com/assets.pytorch.org/pted2021/posters/D4.png -video: https://www.youtube.com/watch?v=1Q3I4OKU29I ---- diff --git a/_events/episode_2.md b/_events/episode_2.md deleted file mode 100644 index c2fdda7000a2..000000000000 --- a/_events/episode_2.md +++ /dev/null @@ -1,10 +0,0 @@ ---- -category: live-stream -title: Episode 2 -guest: Sylvain Gugger and Lysandre Debut -company: HuggingFace -date: May 26, 2021 -time: 1 PM PST -poster: https://s3.amazonaws.com/assets.pytorch.org/pted2021/posters/C3.png -video: https://www.youtube.com/watch?v=wE3bk7JaH4E ---- diff --git a/_events/episode_20.md b/_events/episode_20.md deleted file mode 100644 index a1396a361439..000000000000 --- a/_events/episode_20.md +++ /dev/null @@ -1,10 +0,0 @@ ---- -category: live-stream -title: Episode 20 -guest: E.Ulises Moya Sanchez and Abraham Sanchez -company: PyTorch Monogenic ConvNet Layer -date: September 29, 2021 -time: 9 AM PST -poster: https://s3.amazonaws.com/assets.pytorch.org/pted2021/posters/B7.png -video: https://www.youtube.com/watch?v=nIGjvxaE7jo ---- diff --git a/_events/episode_3.md b/_events/episode_3.md deleted file mode 100644 index 8d2927258f75..000000000000 --- a/_events/episode_3.md +++ /dev/null @@ -1,10 +0,0 @@ ---- -category: 
live-stream -title: Episode 3 -guest: Alex O'Connor and Bing Ouyang -company: AutoDesk -date: June 2, 2021 -time: 1 PM PST -poster: https://s3.amazonaws.com/assets.pytorch.org/pted2021/posters/A1.png -video: https://www.youtube.com/watch?v=LBOIxA5sg2A ---- diff --git a/_events/episode_4.md b/_events/episode_4.md deleted file mode 100644 index c54bf5620870..000000000000 --- a/_events/episode_4.md +++ /dev/null @@ -1,10 +0,0 @@ ---- -category: live-stream -title: Episode 4 -guest: Fernando Pérez-García -company: TorchIO -date: June 9, 2021 -time: 1 PM PST -poster: https://s3.amazonaws.com/assets.pytorch.org/pted2021/posters/B7.png -video: https://www.youtube.com/watch?v=UEUVSw5-M9M ---- diff --git a/_events/episode_5.md b/_events/episode_5.md deleted file mode 100644 index b90c2a0374f4..000000000000 --- a/_events/episode_5.md +++ /dev/null @@ -1,10 +0,0 @@ ---- -category: live-stream -title: Episode 5 -guest: Kashif Rasul -company: PyTorchTS -date: June 16, 2021 -time: 1 PM PST -poster: https://s3.amazonaws.com/assets.pytorch.org/pted2021/posters/A2.png -video: https://www.youtube.com/watch?v=-Ib0lFlbDXs ---- diff --git a/_events/episode_6.md b/_events/episode_6.md deleted file mode 100644 index acb53da82f2d..000000000000 --- a/_events/episode_6.md +++ /dev/null @@ -1,10 +0,0 @@ ---- -category: live-stream -title: Episode 6 -guest: Ludovic Denoyer -company: Rlstructures -date: June 23, 2021 -time: 1 PM PST -poster: https://s3.amazonaws.com/assets.pytorch.org/pted2021/posters/D2.png -video: https://www.youtube.com/watch?v=ZjwwMoVLkXQ ---- diff --git a/_events/episode_7.md b/_events/episode_7.md deleted file mode 100644 index 843958b9b444..000000000000 --- a/_events/episode_7.md +++ /dev/null @@ -1,10 +0,0 @@ ---- -category: live-stream -title: Episode 7 -guest: Michael Galarnyk and Richard Liaw -company: Ray -date: June 30, 2021 -time: 1 PM PST -poster: https://s3.amazonaws.com/assets.pytorch.org/pted2021/posters/H1.png -video: https://www.youtube.com/watch?v=3EOXDJPzSsY ---- diff --git a/_events/episode_8.md b/_events/episode_8.md deleted file mode 100644 index cc871c45e38d..000000000000 --- a/_events/episode_8.md +++ /dev/null @@ -1,10 +0,0 @@ ---- -category: live-stream -title: Episode 8 -guest: Phillip Meier -company: Pystiche -date: July 7, 2021 -time: 1 PM PST -poster: https://www.youtube.com/redirect?event=video_description&redir_token=QUFFLUhqbDB6Z1VhLUZqWngyT1ZHQXRuaEtnVjV6WkRQd3xBQ3Jtc0tsN0IxU1VfRW9aOExIYXBrNTAxWklxbWw5WnZiTXpyLUxPamcyT2NrUFpNYXF4Z1pNLWJndFEzR2FsMjBRY1lzQlo1Z1prYjFqZ1hsZFFZVjRnOFRma0ZMUWxsMUF6VHBtMjNHMUFsZjZpOGJPY1BoNA&q=https%3A%2F%2Fassets.pytorch.org%2Fpted2021%2Fposters%2FD7.png -video: https://www.youtube.com/watch?v=lCOjSiHnQwU ---- diff --git a/_events/episode_9.md b/_events/episode_9.md deleted file mode 100644 index cdf5be3cbcb7..000000000000 --- a/_events/episode_9.md +++ /dev/null @@ -1,9 +0,0 @@ ---- -category: live-stream -title: Episode 9 -guest: Seth Juarez and Maryanne Wachter -company: PyLadies -date: July 14, 2021 -time: 1 PM PST -video: https://www.youtube.com/watch?v=rIwxrb89MNc ---- diff --git a/_events/example_event.md b/_events/example_event.md deleted file mode 100644 index adca4bda2049..000000000000 --- a/_events/example_event.md +++ /dev/null @@ -1,10 +0,0 @@ ---- -category: test -title: Title of episode or webinar -guest: Name of the guest -company: Name of the company. 
This field only gets used for webinars -date: August 4, 2021 -time: 1PM PST* If possible try to keep the format of the date and time the same as the example -poster: Full URL to poster -link: Full URL to video* Only used for webinars ---- diff --git a/_events/kr-conf.md b/_events/kr-conf.md deleted file mode 100644 index 2acc9671a2a9..000000000000 --- a/_events/kr-conf.md +++ /dev/null @@ -1,12 +0,0 @@ ---- -category: event -title: "PyTorch KR Conference" -date: March 30, 2025 ---- - -**Date**: March 30, 2025, 13:00 ~ 18:00 -**Location**: Seoul, Republic of Korea - -Hear from speakers from the PyTorch Foundation, Meta, FuriosaAI, Lablup, Nota AI, Rebellions, etc. - -[Event Info](https://event-us.kr/pytorchkr/event/100142) \ No newline at end of file diff --git a/_events/multi-modal-dl-frame.md b/_events/multi-modal-dl-frame.md deleted file mode 100644 index ed2539f2d0d0..000000000000 --- a/_events/multi-modal-dl-frame.md +++ /dev/null @@ -1,18 +0,0 @@ ---- -category: event -title: "Multi-Modal Tabular Deep Learning with PyTorch Frame" -date: February 19 -poster: assets/images/multi-modal-dl-frame.png ---- - -**Date**: February 19, 12 pm PST - - -Multi-Modal Tabular Deep Learning with PyTorch Frame - - -In this talk, Akihiro introduced PyTorch Frame, a modular framework for multi-modal tabular deep learning. PyTorch Frame enables seamless integration with the PyTorch ecosystem, including PyTorch Geometric for graph-based message passing across relational data and Hugging Face Transformers for extracting rich text features. The talk also highlights its specialized data structures for efficiently handling sparse features, making PyTorch Frame an essential tool for modern tabular data. - -Akihiro Nitta is a software engineer on the ML team at Kumo.ai and a core contributor to PyTorch Frame and PyTorch Geometric, with prior experience as a maintainer of PyTorch Lightning. - -[Learn more about the event](/multi-modal-dl-frame) diff --git a/_events/optimizing_transformers.md b/_events/optimizing_transformers.md deleted file mode 100644 index ccd8c823658a..000000000000 --- a/_events/optimizing_transformers.md +++ /dev/null @@ -1,11 +0,0 @@ ---- -category: event -title: "PyTorch 2.0 Ask the Engineers Live Q&A Series: Optimizing Transformers for Inference" -date: February 2, 2023 - ---- - -**Date**: February 2, 2023, 1PM PST -**Speakers**: Hamid Shojanazeri and Mark Saroufim -[Watch on YouTube](https://www.youtube.com/watch?v=ZOWjOxC80qw&list=PL_lsbAsL_o2CQr8oh5sNWt96yWQphNEzM&index=9) -[Watch on LinkedIn](https://www.linkedin.com/events/7026218151285592064/?lipi=urn%3Ali%3Apage%3Ad_flagship3_company_admin%3BUIqLb0KRSZm%2FHtSu2bw%2B4g%3D%3D) diff --git a/_events/pt-26-live-q-a.md b/_events/pt-26-live-q-a.md deleted file mode 100644 index 6838babb7ebe..000000000000 --- a/_events/pt-26-live-q-a.md +++ /dev/null @@ -1,20 +0,0 @@ ---- -category: event -title: "PyTorch 2.6 Live Q&A" -date: February 7, 2025 -poster: assets/images/ai-programming.png ---- - -**Date**: February 7, 10 am PST - - -PyTorch 2.6 Live Q&A - - -Wondering what's new in the recent PyTorch 2.6 release? Do you have questions? Join us for a live Q&A on PyTorch 2.6 with PyTorch Core Maintainer, Nikita Shulga (Meta). - -Nikita is a Software Engineer at Meta where he is, among other things, responsible for PyTorch releases and continuous integration. Nikita is committed to uplifting the developer community and continuously improving PyTorch. 
He earned his Master’s degree in Applied Mathematics from the Moscow Institute of Physics and Technology (MIPT). - -Bring your PyTorch 2.6 questions for Nikita during this live Q&A session. - -[More info on this event.](/pt-26-live-q-a) diff --git a/_events/pt-27-release-qa.md b/_events/pt-27-release-qa.md deleted file mode 100644 index d1e75363137e..000000000000 --- a/_events/pt-27-release-qa.md +++ /dev/null @@ -1,25 +0,0 @@ ---- -category: event -title: "PyTorch 2.7 Release Live Q&A" -date: Apr 28, 2025 -poster: assets/images/pt27qa.png ---- - - -PyTorch 2.7 Release Q&A - - -**Date**: April 28, 12 pm PT -**Speakers**: Piotr Bialecki (NVIDIA) and Nikita Shulga (Meta) -**Location**: Online - -Have questions about PyTorch 2.7? Join PyTorch Core Maintainers Piotr Bialecki (NVIDIA) and Nikita Shulga (Meta) for a live Q&A session on Monday, April 28 at 12 PM PST. - -Piotr joined the PyTorch team at NVIDIA in 2019 and currently manages the team. He drives NVIDIA’s effort in maintaining and advancing PyTorch’s CUDA backend and received the PyTorch SUPERHERO award in 2023 for his community contributions, especially in the PyTorch discussion board. As a Core Maintainer, he is also focused on PyTorch’s long-term vision and development. - -Nikita is a Software Engineer at Meta where, among other things, he is responsible for PyTorch releases and continuous integration. Nikita is committed to uplifting the developer community and continuously improving PyTorch. He earned a Master’s degree in Applied Mathematics from the Moscow Institute of Physics and Technology (MIPT). - -Bring your PyTorch 2.7 questions for Piotr & Nikita during this live Q&A session. - -[Learn more about this event](/pt-27-release-qa) - diff --git a/_events/pt-day-china-2025.md b/_events/pt-day-china-2025.md deleted file mode 100644 index a8cb293c7fb8..000000000000 --- a/_events/pt-day-china-2025.md +++ /dev/null @@ -1,18 +0,0 @@ ---- -category: event -title: "PyTorch Day China 2025" -date: June 7, 2025 ---- - - -PyTorch Day China 2025 - - -**Date:** June 7, 2025 -**Location:** Beijing, China - -PyTorch Day China 2025, proudly hosted by the PyTorch Foundation, is the premier gathering dedicated to open-source AI and machine learning innovation. Scheduled for June 7th in Beijing, China and co-located with the BAAI Conference, this community-driven event provides an unparalleled platform for PyTorch enthusiasts, machine learning engineers, AI researchers, and industry professionals. - -Immerse yourself in a vibrant day of insightful technical talks, interactive discussions, and engaging poster sessions designed to foster knowledge exchange and collaboration. PyTorch Day China is your gateway to connecting with leading experts and peers in the open-source AI community, offering you unique opportunities to explore cutting-edge advancements and shape the future of deep learning. 
- -[Read more about the event](https://www.lfasiallc.com/pytorch-day-china/) \ No newline at end of file diff --git a/_events/pt-day-france-2025.md b/_events/pt-day-france-2025.md deleted file mode 100644 index 09b44cb627cd..000000000000 --- a/_events/pt-day-france-2025.md +++ /dev/null @@ -1,18 +0,0 @@ ---- -category: event -title: "PyTorch Day France 2025: Registration Open" -date: May 7, 2025 -poster: assets/images/pt-day-cfp.png ---- - - -PyTorch Day France 2025 - - -**Date**: May 7, 2025 -**Location**: Paris, France - -PyTorch Day France 2025, proudly hosted by the PyTorch Foundation, is the premier gathering dedicated to open-source AI and machine learning innovation. Scheduled for 7 May in Paris, France and co-located with the GOSIM AI Paris, this community-driven event provides an unparalleled platform for PyTorch enthusiasts, machine learning engineers, AI researchers, and industry professionals. -Immerse yourself in a vibrant day of insightful technical talks, interactive discussions, and engaging poster sessions designed to foster knowledge exchange and collaboration. PyTorch Day France is your gateway to connecting with leading experts and peers in the open-source AI community, offering you unique opportunities to explore cutting-edge advancements and shape the future of deep learning. - -[Register Now](https://events.linuxfoundation.org/pytorch-day-france/) diff --git a/_events/pt-dinov2-multi-label-plant-species-classification.md b/_events/pt-dinov2-multi-label-plant-species-classification.md deleted file mode 100644 index f4b7edede489..000000000000 --- a/_events/pt-dinov2-multi-label-plant-species-classification.md +++ /dev/null @@ -1,18 +0,0 @@ ---- -category: event -title: "Using PyTorch and DINOv2 for Multi-label Plant Species Classification" -date: March 27 -poster: assets/images/pt-dinov2-multi-label-plant-species-classification.png ---- - -**Date**: March 27th, 12 PM PST - - -Using PyTorch and DINOv2 for Multi-label Plant Species Classification - - -Join us for an engaging webinar on our innovative transfer learning approach using self-supervised Vision Transformers (DINOv2) for multi-label plant species classification in the PlantCLEF 2024 challenge. We’ll cover how we efficiently extract feature embeddings from a dataset of 1.4 million images and utilize PyTorch Lightning for model training and Apache Spark for data management. Learn about our image processing techniques, including transforming images into grids of tiles and aggregating predictions to overcome computational challenges. Discover the significant performance improvements achieved and get insights into multi-label image classification. Perfect for PyTorch developers, this session will include a Q&A and access to our complete codebase at [github.com/dsgt-kaggle-clef/plantclef-2024](https://github.com/dsgt-kaggle-clef/plantclef-2024). - -Murilo Gustineli is a Senior AI Software Solutions Engineer at Intel, and is currently pursuing a Master’s in Computer Science at Georgia Tech focusing on machine learning. His work involves creating synthetic datasets, fine-tuning large language models, and training multi-modal models using Intel® Gaudi® Al accelerators as part of the Development Enablement team. He is particularly interested in deep learning, information retrieval, and biodiversity research, aiming to improve species identification and support conservation efforts. 
- -[Learn more about the event](/pt-dinov2-multi-label-plant-species-classification) diff --git a/_events/pt-korea.md b/_events/pt-korea.md deleted file mode 100644 index 667b916fc319..000000000000 --- a/_events/pt-korea.md +++ /dev/null @@ -1,10 +0,0 @@ ---- -category: event -title: "PyTorch Korea User Group Meetup" -date: November 30, 2024 ---- - -**Date**: November 30, 2024 -**Location**: Seoul, South Korea - -[Event info](https://festa.io/events/6409) \ No newline at end of file diff --git a/_events/pt-shanghai.md b/_events/pt-shanghai.md deleted file mode 100644 index 136ca89600dc..000000000000 --- a/_events/pt-shanghai.md +++ /dev/null @@ -1,10 +0,0 @@ ---- -category: event -title: "PyTorch Shanghai Meetup" -date: August 15, 2024 ---- - -**Date**: August 15, 2024 -**Location**: Shanghai, China - -[Read the notes](https://pytorch.org/blog/pytorch-shanghai-notes/) \ No newline at end of file diff --git a/_events/pytorch-conference-2023.md b/_events/pytorch-conference-2023.md deleted file mode 100644 index fe201e197904..000000000000 --- a/_events/pytorch-conference-2023.md +++ /dev/null @@ -1,13 +0,0 @@ ---- -category: event -title: "PyTorch Conference 2023" -date: October 16, 2023 ---- - -**Date**: October 16 - 17, 2023 - -The conference will showcase PyTorch 2.1, the next-generation release of the popular machine learning framework. As part of the Linux Foundation, the PyTorch Foundation Conference continues the tradition of bringing together leading researchers, developers, and academic communities to advance the education and development of end-to-end machine learning. - -The conference agenda features an engaging lineup of events, including an opening reception, engaging community and partner discussions, informative panels, poster sessions, enlightening use cases and community stories, as well as discussions on the latest trends in machine learning and deep learning development and deployment. - -[Find out more and register!](https://events.linuxfoundation.org/pytorch-conference/) diff --git a/_events/pytorch-conference-2024.md b/_events/pytorch-conference-2024.md deleted file mode 100644 index 445785b40b97..000000000000 --- a/_events/pytorch-conference-2024.md +++ /dev/null @@ -1,13 +0,0 @@ ---- -category: event -title: "PyTorch Conference 2024" -date: September 18, 2024 -poster: assets/images/pytorch-conf-2024.png ---- -**Date**: September 18 - 19, 2024 - - -PyTorch Conference banner - - -Join us in San Francisco on **September 18th-19th**, and learn about PyTorch, the cutting-edge renowned open-source machine learning framework. This year is a two-day event that brings together top-tier researchers, developers, and academic communities, fostering collaboration and advancing end-to-end machine learning. diff --git a/_events/pytorch-conference-2025.md b/_events/pytorch-conference-2025.md deleted file mode 100644 index 521df4612457..000000000000 --- a/_events/pytorch-conference-2025.md +++ /dev/null @@ -1,13 +0,0 @@ ---- -category: event -title: "PyTorch Conference 2025" -date: October 23, 2025 -poster: assets/images/pytorch-conf-2025.jpg ---- -**Date**: October 22 - 23, 2025 - - -PyTorch Conference banner - - -Join us in San Francisco on **October 22-23, 2025** to learn about AI and PyTorch, the cutting-edge renowned open source machine learning framework. This two-day event that brings together top-tier researchers, developers, and academic communities, fostering collaboration and advancing end-to-end machine learning. 
diff --git a/_events/pytorch-ny-meetup.md b/_events/pytorch-ny-meetup.md deleted file mode 100644 index 12dad8c97cc3..000000000000 --- a/_events/pytorch-ny-meetup.md +++ /dev/null @@ -1,9 +0,0 @@ ---- -category: event -title: "PyTorch New York Meetup" -date: March 9, 2023 - ---- - -**Date**: March 9, 2023, 3 PM PT -[Watch on YouTube](https://youtu.be/bwjM20wR3dQ) diff --git "a/_events/pytorch_conference\342\200\223dec_2nd_2022.md" "b/_events/pytorch_conference\342\200\223dec_2nd_2022.md" deleted file mode 100644 index 5486592c4c67..000000000000 --- "a/_events/pytorch_conference\342\200\223dec_2nd_2022.md" +++ /dev/null @@ -1,13 +0,0 @@ ---- -category: event -title: PyTorch Conference – Dec 2nd 2022 -date: December 2, 2022 -header-image: assets/images/pytorch_conference–dec_2nd_2022_ty.jpg ---- - -The PyTorch Conference brings together leading academics, researchers and developers from the Machine Learning community to learn more about software releases on PyTorch, ways PyTorch is being used in academia and industry, development in trends, and more. Find the full list of talks below: -- [Keynotes and Technical Talks](https://youtube.com/playlist?list=PL_lsbAsL_o2CC-RIvjdLzSvxo92fCdMAB) -- [Breakout Sessions](https://youtube.com/playlist?list=PL_lsbAsL_o2BFRvdSIAW9HxIVDtZIo3u5) -- [Community and Partner Talks](https://youtube.com/playlist?list=PL_lsbAsL_o2C__Imwx_CxtyR5_2evuIaJ) -- [Panel Discussion, PyTorch on Mars and AI Art](https://youtube.com/playlist?list=PL_lsbAsL_o2C_hizy1bAkn8xXq5tIR6bC) -- [Posters](https://pytorch.org/ecosystem/ptc/2022) diff --git a/_events/pytorch_developer_day.md b/_events/pytorch_developer_day.md deleted file mode 100644 index 27cc3ee92ef4..000000000000 --- a/_events/pytorch_developer_day.md +++ /dev/null @@ -1,14 +0,0 @@ ---- -category: event -title: PyTorch Developer Day 2021 -date: December 1, 2021 -header-image: assets/images/pytorch_developer_day_2021.png ---- - -The PyTorch Developer Day is a virtual event that brings together leading researchers and developers from the Machine Learning (ML) community to join a multiple set of talks covering new software releases, ways PyTorch is being used in academia and industry, and current trends in ML development. Find all the talks below: - -- [Keynotes](https://www.youtube.com/c/PyTorch/playlists?view=50&sort=dd&shelf_id=4): Learn about the innovations, the new features, updates, and release of PyTorch, and how industries are using it for production and deployment. - -- [Fireside Chat](https://youtu.be/JWdDl9Tvw6g): An informal and intimate conversation with two pioneers in the field of AI (and PyTorch) sharing their thoughts and vision for the future, commentary on top-of-mind trends they are seeing. - -- [Community Talks](https://www.youtube.com/watch?v=7yQ4FgtYvj8): PyTorch has grown thanks to our community. Hear from our members on the work being done with PyTorch. 
\ No newline at end of file diff --git a/_events/pytorch_developer_day_2020.md b/_events/pytorch_developer_day_2020.md deleted file mode 100644 index 888cc423c278..000000000000 --- a/_events/pytorch_developer_day_2020.md +++ /dev/null @@ -1,8 +0,0 @@ ---- -category: event -title: PyTorch Developer Day 2020 -date: November 12, 2020 -header-image: assets/images/pytorch_developer_day_2020.png ---- - -The PyTorch Developer Day, a virtual event that brings together leading researchers and developers from the Machine Learning (ML) community to join a multiple set of talks covering new software releases, ways PyTorch is being used in academia and industry, ML development trends, a poster session, and many opportunities for networking. diff --git a/_events/pytorch_ecosystem_day_2021.md b/_events/pytorch_ecosystem_day_2021.md deleted file mode 100644 index 48c4d7136662..000000000000 --- a/_events/pytorch_ecosystem_day_2021.md +++ /dev/null @@ -1,8 +0,0 @@ ---- -category: event -title: PyTorch Ecosystem Day 2021 -date: April 21, 2021 -header-image: assets/images/pytorch_ecosystem_day_2021.jpeg ---- - -PyTorch Ecosystem Day, a virtual event designed for our ecosystem and industry communities to showcase their work and discover new opportunities to collaborate. Join us for discussions on new developments, trends, challenges, and best practices through keynotes, breakout sessions, and a unique networking opportunity hosted through Gather.Town. diff --git a/_events/pytorch_foundation.md b/_events/pytorch_foundation.md deleted file mode 100644 index 29814d1907de..000000000000 --- a/_events/pytorch_foundation.md +++ /dev/null @@ -1,17 +0,0 @@ ---- -category: event -title: 'Webinar: PyTorch, a Foundation for Open Source AI/ML' -date: September 28, 2022 -header-image: assets/images/Event-Webinar-PyTorch-a-foundation-for-open-source.png ---- -**Date:** Thursday, September 28, 2022 at 7:30am PT - -Join us for a conversation between Soumith Chintala and Ibrahim Haddad about the creation of PyTorch Foundation. Over 150,000 developers and 18,000 organizations work with PyTorch today. In this conversation you will learn about: - -- PyTorch’s evolution as an open source project - -- Where PyTorch is used in production environments and in academic settings - -- Next steps for the project under the PyTorch Foundation - -[Register here](https://www.linuxfoundation.org/webinars/pytorch-a-foundation-for-open-source-ai-ml?hsLang=en) diff --git a/_events/pytorch_summer_hackathon_2020.md b/_events/pytorch_summer_hackathon_2020.md deleted file mode 100644 index 2b3a30617c4a..000000000000 --- a/_events/pytorch_summer_hackathon_2020.md +++ /dev/null @@ -1,12 +0,0 @@ ---- -category: event -title: PyTorch Summer Hackathon 2020 -date: June 22, 2020 -header-image: assets/images/summer_hackathon_2020.png ---- -The PyTorch Summer Hackathon, a virtual event that invites developers to hack with the PyTorch community to build innovative, impactful models, applications and other projects that create positive impact for businesses or people. In it, developers are able to put their machine learning skills to the test in one of the following categories: -- **PyTorch Developer Tools** : Tools or libraries designed to improve productivity and efficiency of PyTorch for researchers and developers. - -- **Web/Mobile Applications powered by PyTorch** : Applications with web/mobile interfaces and/or embedded devices powered by PyTorch. 
- -- **PyTorch Responsible AI Development Tools** : Tools, libraries, or web/mobile apps for responsible AI development. diff --git a/_events/rethinking_data_loading.md b/_events/rethinking_data_loading.md deleted file mode 100644 index 13de08367df8..000000000000 --- a/_events/rethinking_data_loading.md +++ /dev/null @@ -1,11 +0,0 @@ ---- -category: event -title: "PyTorch 2.0 Ask the Engineers Live Q&A Series: Rethinking Data Loading with TorchData" -date: February 1, 2023 - ---- - -**Date**: February 1, 2023, 11AM PST -**Speakers**: Kevin Tse and Erjia Guan -[Watch on YouTube](https://www.youtube.com/watch?v=65DvI3YrFW8&list=PL_lsbAsL_o2CQr8oh5sNWt96yWQphNEzM&index=8) -[Watch on LinkedIn](https://www.linkedin.com/events/7024757312057708544/?lipi=urn%3Ali%3Apage%3Ad_flagship3_company_admin%3BUIqLb0KRSZm%2FHtSu2bw%2B4g%3D%3D) diff --git a/_events/torch-rl.md b/_events/torch-rl.md deleted file mode 100644 index 9fd4eaa4d869..000000000000 --- a/_events/torch-rl.md +++ /dev/null @@ -1,11 +0,0 @@ ---- -category: event -title: "PyTorch 2.0 Ask the Engineers Live Q&A Series: Torch RL" -date: February 15, 2023 - ---- - -**Date**: February 15, 2023, 10AM PST -**Speaker**: Vincent Moens -[Watch on YouTube](https://www.youtube.com/watch?v=myEfUoYrbts) -[Watch on LinkedIn](https://www.linkedin.com/video/event/urn:li:ugcPost:7029129170060283905/) diff --git a/_events/torchmultimodal.md b/_events/torchmultimodal.md deleted file mode 100644 index 38b411f4d4ec..000000000000 --- a/_events/torchmultimodal.md +++ /dev/null @@ -1,11 +0,0 @@ ---- -category: event -title: "PyTorch 2.0 Ask the Engineers Live Q&A Series: TorchMultiModal" -date: February 23, 2023 - ---- - -**Date**: February 23, 2023, 11 AM PST -**Speakers**: Kartikay Khandelwal and Ankita De -[Watch on YouTube](https://www.youtube.com/watch?v=L7W2-0pwsFI&list=PL_lsbAsL_o2CQr8oh5sNWt96yWQphNEzM&index=12) -[Watch on LinkedIn](https://www.linkedin.com/video/event/urn:li:ugcPost:7032009478761844736/) diff --git a/_events/vancouver-meetup.md b/_events/vancouver-meetup.md deleted file mode 100644 index 2d956107c2e7..000000000000 --- a/_events/vancouver-meetup.md +++ /dev/null @@ -1,18 +0,0 @@ ---- -category: event -title: "Vancouver Meetup" -date: May 10, 2023 - ---- - -**Date**: May 10, 2023 - -**Agenda** -3:45 pm - Meet in lobby and check in -4:00 - 4:30 pm - Generative AI and Stable Diffusion - Will Berman | Hugging Face -4:30 - 5:00 pm - Joe Spisak & Milad Mohammadi | Open XLA -5:00 - 5:10 pm - Break -5:10 - 5:40 pm - How and why to become a contributor to PyTorch - Dmitry Vinnik | Meta -5:40 - 6:00 pm - Social/networking - -[RSVP here](https://community.linuxfoundation.org/e/mcc7va/) \ No newline at end of file diff --git a/_features/cloud-support.md b/_features/cloud-support.md deleted file mode 100644 index 82d9d66911c3..000000000000 --- a/_features/cloud-support.md +++ /dev/null @@ -1,21 +0,0 @@ ---- -title: Cloud Support -order: 8 -snippet: > - ```sh - export IMAGE_FAMILY="pytorch-latest-cpu" - export ZONE="us-west1-b" - export INSTANCE_NAME="my-instance" - - gcloud compute instances create $INSTANCE_NAME \ - --zone=$ZONE \ - --image-family=$IMAGE_FAMILY \ - --image-project=deeplearning-platform-release - ``` - -summary-home: PyTorch is well supported on major cloud platforms, providing frictionless development and easy scaling. 
-featured-home: true - ---- - -PyTorch is well supported on major cloud platforms, providing frictionless development and easy scaling through prebuilt images, large scale training on GPUs, ability to run models in a production scale environment, and more. diff --git a/_features/cplusplus-front-end.md b/_features/cplusplus-front-end.md deleted file mode 100644 index f0a46ca2cadb..000000000000 --- a/_features/cplusplus-front-end.md +++ /dev/null @@ -1,23 +0,0 @@ ---- -title: C++ Front-End -order: 7 -snippet: > - ```cpp - #include - - torch::nn::Linear model(num_features, 1); - torch::optim::SGD optimizer(model->parameters()); - auto data_loader = torch::data::data_loader(dataset); - - for (size_t epoch = 0; epoch < 10; ++epoch) { - for (auto batch : data_loader) { - auto prediction = model->forward(batch.data); - auto loss = loss_function(prediction, batch.target); - loss.backward(); - optimizer.step(); - } - } - ``` ---- - -The C++ frontend is a pure C++ interface to PyTorch that follows the design and architecture of the established Python frontend. It is intended to enable research in high performance, low latency and bare metal C++ applications. diff --git a/_features/distributed-training.md b/_features/distributed-training.md deleted file mode 100644 index b7e5ad35ad8b..000000000000 --- a/_features/distributed-training.md +++ /dev/null @@ -1,18 +0,0 @@ ---- -title: Distributed Training -order: 3 -snippet: > - ```python - import torch.distributed as dist - from torch.nn.parallel import DistributedDataParallel - - dist.init_process_group(backend='gloo') - model = DistributedDataParallel(model) - ``` - -summary-home: Scalable distributed training and performance optimization in research and production is enabled by the torch.distributed backend. -featured-home: true - ---- - -Optimize performance in both research and production by taking advantage of native support for asynchronous execution of collective operations and peer-to-peer communication that is accessible from Python and C++. diff --git a/_features/mobile.md b/_features/mobile.md deleted file mode 100644 index 31214d52751f..000000000000 --- a/_features/mobile.md +++ /dev/null @@ -1,25 +0,0 @@ ---- -title: Mobile (Experimental) -order: 4 -snippet: > - ```python - ## Save your model - torch.jit.script(model).save("my_mobile_model.pt") - - ## iOS prebuilt binary - pod ‘LibTorch’ - ## Android prebuilt binary - implementation 'org.pytorch:pytorch_android:1.3.0' - - ## Run your model (Android example) - Tensor input = Tensor.fromBlob(data, new long[]{1, data.length}); - IValue output = module.forward(IValue.tensor(input)); - float[] scores = output.getTensor().getDataAsFloatArray(); - ``` - -summary-home: PyTorch supports an end-to-end workflow from Python to deployment on iOS and Android. It extends the PyTorch API to cover common preprocessing and integration tasks needed for incorporating ML in mobile applications. -featured-home: false - ---- - -PyTorch supports an end-to-end workflow from Python to deployment on iOS and Android. It extends the PyTorch API to cover common preprocessing and integration tasks needed for incorporating ML in mobile applications. 
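
To make the `DistributedDataParallel` snippet above a little more concrete, the following is a minimal single-node sketch: the toy linear model, random batches, and the `torchrun --nproc_per_node=2 ddp_example.py` launch command are illustrative assumptions, not part of the feature description.

```python
# Sketch: each process trains a replica; DDP all-reduces gradients during backward().
import torch
import torch.distributed as dist
from torch.nn.parallel import DistributedDataParallel as DDP

def main():
    # torchrun sets RANK/WORLD_SIZE/MASTER_ADDR/MASTER_PORT, read via the env:// method.
    dist.init_process_group(backend="gloo", init_method="env://")
    rank = dist.get_rank()

    model = torch.nn.Linear(10, 1)            # toy model; replace with your own
    model = DDP(model)                        # gradients are synchronized across processes
    optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
    loss_fn = torch.nn.MSELoss()

    for step in range(5):
        inputs = torch.randn(32, 10)          # each process would see its own data shard
        targets = torch.randn(32, 1)
        optimizer.zero_grad()
        loss = loss_fn(model(inputs), targets)
        loss.backward()                       # triggers the all-reduce of gradients
        optimizer.step()
        if rank == 0:
            print(f"step {step}: loss {loss.item():.4f}")

    dist.destroy_process_group()

if __name__ == "__main__":
    main()
```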
diff --git a/_features/native-onnx-support.md b/_features/native-onnx-support.md deleted file mode 100644 index 1c7734e9ed77..000000000000 --- a/_features/native-onnx-support.md +++ /dev/null @@ -1,15 +0,0 @@ ---- -title: Native ONNX Support -order: 6 -snippet: > - ```python - import torch.onnx - import torchvision - - dummy_input = torch.randn(1, 3, 224, 224) - model = torchvision.models.alexnet(pretrained=True) - torch.onnx.export(model, dummy_input, "alexnet.onnx") - ``` ---- - -Export models in the standard ONNX (Open Neural Network Exchange) format for direct access to ONNX-compatible platforms, runtimes, visualizers, and more. diff --git a/_features/production-ready.md b/_features/production-ready.md deleted file mode 100644 index 151de0f9b644..000000000000 --- a/_features/production-ready.md +++ /dev/null @@ -1,32 +0,0 @@ ---- -title: Production Ready -order: 1 -snippet: > - ```python - import torch - class MyModule(torch.nn.Module): - - def __init__(self, N, M): - super(MyModule, self).__init__() - self.weight = torch.nn.Parameter(torch.rand(N, M)) - - def forward(self, input): - if input.sum() > 0: - output = self.weight.mv(input) - else: - output = self.weight + input - return output - - # Compile the model code to a static representation - my_script_module = torch.jit.script(MyModule(3, 4)) - - # Save the compiled code and model data so it can be loaded elsewhere - my_script_module.save("my_script_module.pt") - ``` - -summary-home: Transition seamlessly between eager and graph modes with TorchScript, and accelerate the path to production with TorchServe. -featured-home: true - ---- - -With TorchScript, PyTorch provides ease-of-use and flexibility in eager mode, while seamlessly transitioning to graph mode for speed, optimization, and functionality in C++ runtime environments. diff --git a/_features/robust-ecosystem.md b/_features/robust-ecosystem.md deleted file mode 100644 index f44406d6e801..000000000000 --- a/_features/robust-ecosystem.md +++ /dev/null @@ -1,20 +0,0 @@ ---- -title: Robust Ecosystem -order: 5 -snippet: > - ```python - import torchvision.models as models - resnet18 = models.resnet18(pretrained=True) - alexnet = models.alexnet(pretrained=True) - squeezenet = models.squeezenet1_0(pretrained=True) - vgg16 = models.vgg16(pretrained=True) - densenet = models.densenet161(pretrained=True) - inception = models.inception_v3(pretrained=True) - ``` - -summary-home: A rich ecosystem of tools and libraries extends PyTorch and supports development in computer vision, NLP and more. -featured-home: true - ---- - -An active community of researchers and developers have built a rich ecosystem of tools and libraries for extending PyTorch and supporting development in areas from computer vision to reinforcement learning. 
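
As a follow-up to the ONNX export snippet above, here is a minimal sketch of loading the exported `alexnet.onnx` file outside of PyTorch and checking it against the original model; it assumes the separately installed `onnxruntime` package, which is not part of PyTorch itself.

```python
# Sketch: export AlexNet to ONNX, run it with ONNX Runtime, and compare the outputs.
import numpy as np
import onnxruntime as ort  # assumption: `pip install onnxruntime`
import torch
import torchvision

model = torchvision.models.alexnet(pretrained=True).eval()
dummy_input = torch.randn(1, 3, 224, 224)
torch.onnx.export(model, dummy_input, "alexnet.onnx")

session = ort.InferenceSession("alexnet.onnx", providers=["CPUExecutionProvider"])
input_name = session.get_inputs()[0].name
onnx_out = session.run(None, {input_name: dummy_input.numpy()})[0]

with torch.no_grad():
    torch_out = model(dummy_input).numpy()

# The exported graph should reproduce PyTorch's results to within floating-point tolerance.
np.testing.assert_allclose(torch_out, onnx_out, rtol=1e-3, atol=1e-5)
print("ONNX Runtime and PyTorch outputs match")
```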
diff --git a/_features/torchserve.md b/_features/torchserve.md deleted file mode 100644 index 4460014cd541..000000000000 --- a/_features/torchserve.md +++ /dev/null @@ -1,23 +0,0 @@ ---- -title: TorchServe -order: 2 -snippet: > - ```python - ## Convert the model from PyTorch to TorchServe format - torch-model-archiver --model-name densenet161 \ - --version 1.0 --model-file serve/examples/image_classifier/densenet_161/model.py \ - --serialized-file densenet161-8d451a50.pth \ - --extra-files serve/examples/image_classifier/index_to_name.json \ - --handler image_classifier - - ## Host your PyTorch model - - torchserve --start --model-store model_store --models densenet161=densenet161.mar - ``` - -summary-home: TorchServe is an easy to use tool for deploying PyTorch models at scale. It is cloud and environment agnostic and supports features such as multi-model serving, logging, metrics and the creation of RESTful endpoints for application integration. -featured-home: false - ---- - -TorchServe is an easy to use tool for deploying PyTorch models at scale. It is cloud and environment agnostic and supports features such as multi-model serving, logging, metrics and the creation of RESTful endpoints for application integration. diff --git a/_get_started/get-started-locally.md b/_get_started/get-started-locally.md deleted file mode 100644 index 6a95566a3946..000000000000 --- a/_get_started/get-started-locally.md +++ /dev/null @@ -1,47 +0,0 @@ ---- -layout: get_started -title: Start Locally -permalink: /get-started/locally/ -background-class: get-started-background -body-class: get-started -order: 0 -published: true -get-started-locally: true -redirect_from: "/get-started/" ---- - -## Start Locally - -
-
-
- {% include quick_start_local.html %} -
-
-
- ---- - -{% capture mac %} -{% include_relative installation/mac.md %} -{% endcapture %} - -{% capture linux %} -{% include_relative installation/linux.md %} -{% endcapture %} - -{% capture windows %} -{% include_relative installation/windows.md %} -{% endcapture %} - - -
-
{{ mac | markdownify }}
-
{{ linux | markdownify }}
-
{{ windows | markdownify }}
-
- - - - - diff --git a/_get_started/get-started-via-cloud-partners.md b/_get_started/get-started-via-cloud-partners.md deleted file mode 100644 index 6fba614843af..000000000000 --- a/_get_started/get-started-via-cloud-partners.md +++ /dev/null @@ -1,49 +0,0 @@ ---- -layout: get_started -title: Start via Cloud Partners -permalink: /get-started/cloud-partners/ -background-class: get-started-background -body-class: get-started -order: 3 -published: true -get-started-via-cloud: true ---- - -## Start via Cloud Partners - -
-
-

Cloud platforms provide powerful hardware and infrastructure for training and deploying deep learning models. Select a cloud platform below to get started with PyTorch.

- {% include quick_start_cloud_options.html %} -
-
- ---- - -{% capture aws %} -{% include_relative installation/aws.md %} -{% endcapture %} - -{% capture azure %} -{% include_relative installation/azure.md %} -{% endcapture %} - -{% capture google-cloud %} -{% include_relative installation/google-cloud.md %} -{% endcapture %} - -{% capture lightning-studios %} -{% include_relative installation/lightning-studios.md %} -{% endcapture %} - -
-
{{aws | markdownify }}
-
{{google-cloud | markdownify }}
-
{{azure | markdownify }}
-
{{lightning-studios | markdownify }}
-
- - - - - diff --git a/_get_started/get-started-via-colab.md b/_get_started/get-started-via-colab.md deleted file mode 100644 index 940ef0bc8a56..000000000000 --- a/_get_started/get-started-via-colab.md +++ /dev/null @@ -1,48 +0,0 @@ ---- -layout: get_started -title: Try Now via CoLab -permalink: /get-started/colab/ -background-class: get-started-background -body-class: get-started -order: 10 ---- - -## Try Now via CoLab - -Lorem ipsum dolor sit amet, ex mei graeco alienum imperdiet. Recusabo consequuntur mei ei, habeo iriure virtute eam cu, in erat placerat vis. Eu mea nostrum inimicus, cum id aeque utamur erroribus. - -Lorem ipsum dolor sit amet, ex mei graeco alienum imperdiet. Recusabo consequuntur mei ei, habeo iriure virtute eam cu, in erat placerat vis. Eu mea nostrum inimicus, cum id aeque utamur erroribus. - -{% highlight python %} -#!/usr/bin/python3 - -# Print the contents of the files listed on the command line. - -import sys - -for fn in sys.argv[1:]: - try: - fin = open(fn, 'r') - except: - (type, detail) = sys.exc_info()[:2] - print("\n*** %s: %s: %s ***" % (fn, type, detail)) - continue - print("\n*** Contents of", fn, "***") - - # Print the file, with line numbers. - lno = 1 - while 1: - line = fin.readline() - if not line: break; - print('%3d: %-s' % (lno, line[:-1])) - lno = lno + 1 - fin.close() -print() -{% endhighlight %} - -Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum. - - - - - diff --git a/_get_started/mobile.md b/_get_started/mobile.md deleted file mode 100644 index d709ee61e2f8..000000000000 --- a/_get_started/mobile.md +++ /dev/null @@ -1,39 +0,0 @@ ---- -layout: get_started -title: PyTorch for Edge -permalink: /get-started/executorch/ -background-class: get-started-background -body-class: get-started -order: 5 -published: true ---- - -## Get Started with PyTorch ExecuTorch - -PyTorch’s edge specific library is [ExecuTorch](https://github.com/pytorch/executorch/) and is designed to be lightweight, very performant even on devices with constrained hardware such as mobile phones, embedded systems and microcontrollers. - -ExecuTorch relies heavily on PyTorch core technologies such as [torch.compile](https://pytorch.org/docs/stable/torch.compiler.html) and [torch.export](https://pytorch.org/docs/stable/export.html), and should be very familiar to anyone who has used PyTorch in the past. - -### Getting Started -You can get started by following the [general getting started guide](https://pytorch.org/executorch/stable/getting-started.html#) or jump to the specific steps for your target device. - -* [Using ExecuTorch on Android](https://pytorch.org/executorch/stable/using-executorch-android.html) -* [Using ExecuTorch on iOS](https://pytorch.org/executorch/stable/using-executorch-ios.html) -* [Using ExecuTorch with C++](https://pytorch.org/executorch/stable/using-executorch-cpp.html) - -### Hardware Acceleration -ExecuTorch provides out of the box hardware acceleration for a growing number of chip manufacturers. 
See the following resources to learn more about how to leverage them: - -* [Backend Overview](https://pytorch.org/executorch/stable/backends-overview.html) -* [XNNPACK](https://pytorch.org/executorch/stable/backends-xnnpack.html) -* [Core ML](https://pytorch.org/executorch/stable/backends-coreml.html) -* [MPS](https://pytorch.org/executorch/stable/backends-mps.html) -* [Vulkan](https://pytorch.org/executorch/stable/backends-vulkan.html) -* [ARM Ethos-U](https://pytorch.org/executorch/stable/backends-arm-ethos-u.html) -* [Qualcomm AI Engine](https://pytorch.org/executorch/stable/backends-qualcomm.html) -* [MediaTek](https://pytorch.org/executorch/stable/backends-mediatek.html) -* [Cadence Xtensa](https://pytorch.org/executorch/stable/backends-cadence.html) - - - - diff --git a/_get_started/previous-versions.md b/_get_started/previous-versions.md deleted file mode 100644 index d86ae87de17e..000000000000 --- a/_get_started/previous-versions.md +++ /dev/null @@ -1,1980 +0,0 @@ ---- -layout: get_started -title: Previous PyTorch Versions -permalink: /get-started/previous-versions/ -background-class: get-started-background -body-class: get-started -order: 4 -published: true -redirect_from: /previous-versions.html ---- - -## Installing previous versions of PyTorch - -We'd prefer you install the [latest version](https://pytorch.org/get-started/locally), -but old binaries and installation instructions are provided below for -your convenience. - -## Commands for Versions >= 1.0.0 - -### v2.6.0 - -#### Wheel - -##### OSX - -``` -pip install torch==2.6.0 torchvision==0.21.0 torchaudio==2.6.0 -``` - -##### Linux and Windows - -``` -# ROCM 6.1 (Linux only) -pip install torch==2.6.0 torchvision==0.21.0 torchaudio==2.6.0 --index-url https://download.pytorch.org/whl/rocm6.1 -# ROCM 6.2.4 (Linux only) -pip install torch==2.6.0 torchvision==0.21.0 torchaudio==2.6.0 --index-url https://download.pytorch.org/whl/rocm6.2.4 -# CUDA 11.8 -pip install torch==2.6.0 torchvision==0.21.0 torchaudio==2.6.0 --index-url https://download.pytorch.org/whl/cu118 -# CUDA 12.4 -pip install torch==2.6.0 torchvision==0.21.0 torchaudio==2.6.0 --index-url https://download.pytorch.org/whl/cu124 -# CUDA 12.6 -pip install torch==2.6.0 torchvision==0.21.0 torchaudio==2.6.0 --index-url https://download.pytorch.org/whl/cu126 -# CPU only -pip install torch==2.6.0 torchvision==0.21.0 torchaudio==2.6.0 --index-url https://download.pytorch.org/whl/cpu -``` - -### v2.5.1 - -#### Conda - -##### OSX - -``` -# conda -conda install pytorch==2.5.1 torchvision==0.20.1 torchaudio==2.5.1 -c pytorch -``` - -##### Linux and Windows - -``` -# CUDA 11.8 -conda install pytorch==2.5.1 torchvision==0.20.1 torchaudio==2.5.1 pytorch-cuda=11.8 -c pytorch -c nvidia -# CUDA 12.1 -conda install pytorch==2.5.1 torchvision==0.20.1 torchaudio==2.5.1 pytorch-cuda=12.1 -c pytorch -c nvidia -# CUDA 12.4 -conda install pytorch==2.5.1 torchvision==0.20.1 torchaudio==2.5.1 pytorch-cuda=12.4 -c pytorch -c nvidia -# CPU Only -conda install pytorch==2.5.1 torchvision==0.20.1 torchaudio==2.5.1 cpuonly -c pytorch -``` - -#### Wheel - -##### OSX - -``` -pip install torch==2.5.1 torchvision==0.20.1 torchaudio==2.5.1 -``` - -##### Linux and Windows - -``` -# ROCM 6.1 (Linux only) -pip install torch==2.5.1 torchvision==0.20.1 torchaudio==2.5.1 --index-url https://download.pytorch.org/whl/rocm6.1 -# ROCM 6.2 (Linux only) -pip install torch==2.5.1 torchvision==0.20.1 torchaudio==2.5.1 --index-url https://download.pytorch.org/whl/rocm6.2 -# CUDA 11.8 -pip install torch==2.5.1 
torchvision==0.20.1 torchaudio==2.5.1 --index-url https://download.pytorch.org/whl/cu118 -# CUDA 12.1 -pip install torch==2.5.1 torchvision==0.20.1 torchaudio==2.5.1 --index-url https://download.pytorch.org/whl/cu121 -# CUDA 12.4 -pip install torch==2.5.1 torchvision==0.20.1 torchaudio==2.5.1 --index-url https://download.pytorch.org/whl/cu124 -# CPU only -pip install torch==2.5.1 torchvision==0.20.1 torchaudio==2.5.1 --index-url https://download.pytorch.org/whl/cpu -``` - -### v2.5.0 - -#### Conda - -##### OSX - -``` -# conda -conda install pytorch==2.5.0 torchvision==0.20.0 torchaudio==2.5.0 -c pytorch -``` - -##### Linux and Windows - -``` -# CUDA 11.8 -conda install pytorch==2.5.0 torchvision==0.20.0 torchaudio==2.5.0 pytorch-cuda=11.8 -c pytorch -c nvidia -# CUDA 12.1 -conda install pytorch==2.5.0 torchvision==0.20.0 torchaudio==2.5.0 pytorch-cuda=12.1 -c pytorch -c nvidia -# CUDA 12.4 -conda install pytorch==2.5.0 torchvision==0.20.0 torchaudio==2.5.0 pytorch-cuda=12.4 -c pytorch -c nvidia -# CPU Only -conda install pytorch==2.5.0 torchvision==0.20.0 torchaudio==2.5.0 cpuonly -c pytorch -``` - -#### Wheel - -##### OSX - -``` -pip install torch==2.5.0 torchvision==0.20.0 torchaudio==2.5.0 -``` - -##### Linux and Windows - -``` -# ROCM 6.1 (Linux only) -pip install torch==2.5.0 torchvision==0.20.0 torchaudio==2.5.0 --index-url https://download.pytorch.org/whl/rocm6.1 -# ROCM 6.2 (Linux only) -pip install torch==2.5.0 torchvision==0.20.0 torchaudio==2.5.0 --index-url https://download.pytorch.org/whl/rocm6.2 -# CUDA 11.8 -pip install torch==2.5.0 torchvision==0.20.0 torchaudio==2.5.0 --index-url https://download.pytorch.org/whl/cu118 -# CUDA 12.1 -pip install torch==2.5.0 torchvision==0.20.0 torchaudio==2.5.0 --index-url https://download.pytorch.org/whl/cu121 -# CUDA 12.4 -pip install torch==2.5.0 torchvision==0.20.0 torchaudio==2.5.0 --index-url https://download.pytorch.org/whl/cu124 -# CPU only -pip install torch==2.5.0 torchvision==0.20.0 torchaudio==2.5.0 --index-url https://download.pytorch.org/whl/cpu -``` - -### v2.4.1 -#### Conda -##### OSX -``` -# conda -conda install pytorch==2.4.1 torchvision==0.19.1 torchaudio==2.4.1 -c pytorch -``` -##### Linux and Windows -``` -# CUDA 11.8 -conda install pytorch==2.4.1 torchvision==0.19.1 torchaudio==2.4.1 pytorch-cuda=11.8 -c pytorch -c nvidia -# CUDA 12.1 -conda install pytorch==2.4.1 torchvision==0.19.1 torchaudio==2.4.1 pytorch-cuda=12.1 -c pytorch -c nvidia -# CUDA 12.4 -conda install pytorch==2.4.1 torchvision==0.19.1 torchaudio==2.4.1 pytorch-cuda=12.4 -c pytorch -c nvidia -# CPU Only -conda install pytorch==2.4.1 torchvision==0.19.1 torchaudio==2.4.1 cpuonly -c pytorch -``` -#### Wheel -##### OSX -``` -pip install torch==2.4.1 torchvision==0.19.1 torchaudio==2.4.1 -``` -##### Linux and Windows -``` -# ROCM 6.1 (Linux only) -pip install torch==2.4.1 torchvision==0.19.1 torchaudio==2.4.1 --index-url https://download.pytorch.org/whl/rocm6.1 -# CUDA 11.8 -pip install torch==2.4.1 torchvision==0.19.1 torchaudio==2.4.1 --index-url https://download.pytorch.org/whl/cu118 -# CUDA 12.1 -pip install torch==2.4.1 torchvision==0.19.1 torchaudio==2.4.1 --index-url https://download.pytorch.org/whl/cu121 -# CUDA 12.4 -pip install torch==2.4.1 torchvision==0.19.1 torchaudio==2.4.1 --index-url https://download.pytorch.org/whl/cu124 -# CPU only -pip install torch==2.4.1 torchvision==0.19.1 torchaudio==2.4.1 --index-url https://download.pytorch.org/whl/cpu -``` - -### v2.4.0 - -#### Conda - -##### OSX - -``` -# conda -conda install pytorch==2.4.0 
torchvision==0.19.0 torchaudio==2.4.0 -c pytorch -``` - -##### Linux and Windows - -``` -# CUDA 11.8 -conda install pytorch==2.4.0 torchvision==0.19.0 torchaudio==2.4.0 pytorch-cuda=11.8 -c pytorch -c nvidia -# CUDA 12.1 -conda install pytorch==2.4.0 torchvision==0.19.0 torchaudio==2.4.0 pytorch-cuda=12.1 -c pytorch -c nvidia -# CUDA 12.4 -conda install pytorch==2.4.0 torchvision==0.19.0 torchaudio==2.4.0 pytorch-cuda=12.4 -c pytorch -c nvidia -# CPU Only -conda install pytorch==2.4.0 torchvision==0.19.0 torchaudio==2.4.0 cpuonly -c pytorch -``` - -#### Wheel - -##### OSX - -``` -pip install torch==2.4.0 torchvision==0.19.0 torchaudio==2.4.0 -``` - -##### Linux and Windows - -``` -# ROCM 6.1 (Linux only) -pip install torch==2.4.0 torchvision==0.19.0 torchaudio==2.4.0 --index-url https://download.pytorch.org/whl/rocm6.1 -# CUDA 11.8 -pip install torch==2.4.0 torchvision==0.19.0 torchaudio==2.4.0 --index-url https://download.pytorch.org/whl/cu118 -# CUDA 12.1 -pip install torch==2.4.0 torchvision==0.19.0 torchaudio==2.4.0 --index-url https://download.pytorch.org/whl/cu121 -# CUDA 12.4 -pip install torch==2.4.0 torchvision==0.19.0 torchaudio==2.4.0 --index-url https://download.pytorch.org/whl/cu124 -# CPU only -pip install torch==2.4.0 torchvision==0.19.0 torchaudio==2.4.0 --index-url https://download.pytorch.org/whl/cpu -``` - -### v2.3.1 - -#### Conda - -##### OSX - -``` -# conda -conda install pytorch==2.3.1 torchvision==0.18.1 torchaudio==2.3.1 -c pytorch -``` - -##### Linux and Windows - -``` -# CUDA 11.8 -conda install pytorch==2.3.1 torchvision==0.18.1 torchaudio==2.3.1 pytorch-cuda=11.8 -c pytorch -c nvidia -# CUDA 12.1 -conda install pytorch==2.3.1 torchvision==0.18.1 torchaudio==2.3.1 pytorch-cuda=12.1 -c pytorch -c nvidia -# CPU Only -conda install pytorch==2.3.1 torchvision==0.18.1 torchaudio==2.3.1 cpuonly -c pytorch -``` - -#### Wheel - -##### OSX - -``` -pip install torch==2.3.1 torchvision==0.18.1 torchaudio==2.3.1 -``` - -##### Linux and Windows - -``` -# ROCM 6.0 (Linux only) -pip install torch==2.3.1 torchvision==0.18.1 torchaudio==2.3.1 --index-url https://download.pytorch.org/whl/rocm6.0 -# CUDA 11.8 -pip install torch==2.3.1 torchvision==0.18.1 torchaudio==2.3.1 --index-url https://download.pytorch.org/whl/cu118 -# CUDA 12.1 -pip install torch==2.3.1 torchvision==0.18.1 torchaudio==2.3.1 --index-url https://download.pytorch.org/whl/cu121 -# CPU only -pip install torch==2.3.1 torchvision==0.18.1 torchaudio==2.3.1 --index-url https://download.pytorch.org/whl/cpu -``` - -### v2.3.0 - -#### Conda - -##### OSX - -``` -# conda -conda install pytorch==2.3.0 torchvision==0.18.0 torchaudio==2.3.0 -c pytorch -``` - -##### Linux and Windows - -``` -# CUDA 11.8 -conda install pytorch==2.3.0 torchvision==0.18.0 torchaudio==2.3.0 pytorch-cuda=11.8 -c pytorch -c nvidia -# CUDA 12.1 -conda install pytorch==2.3.0 torchvision==0.18.0 torchaudio==2.3.0 pytorch-cuda=12.1 -c pytorch -c nvidia -# CPU Only -conda install pytorch==2.3.0 torchvision==0.18.0 torchaudio==2.3.0 cpuonly -c pytorch -``` - -#### Wheel - -##### OSX - -``` -pip install torch==2.3.0 torchvision==0.18.0 torchaudio==2.3.0 -``` - -##### Linux and Windows - -``` -# ROCM 6.0 (Linux only) -pip install torch==2.3.0 torchvision==0.18.0 torchaudio==2.3.0 --index-url https://download.pytorch.org/whl/rocm6.0 -# CUDA 11.8 -pip install torch==2.3.0 torchvision==0.18.0 torchaudio==2.3.0 --index-url https://download.pytorch.org/whl/cu118 -# CUDA 12.1 -pip install torch==2.3.0 torchvision==0.18.0 torchaudio==2.3.0 --index-url 
https://download.pytorch.org/whl/cu121 -# CPU only -pip install torch==2.3.0 torchvision==0.18.0 torchaudio==2.3.0 --index-url https://download.pytorch.org/whl/cpu -``` - -### v2.2.2 - -#### Conda - -##### OSX - -``` -# conda -conda install pytorch==2.2.2 torchvision==0.17.2 torchaudio==2.2.2 -c pytorch -``` - -##### Linux and Windows - -``` -# CUDA 11.8 -conda install pytorch==2.2.2 torchvision==0.17.2 torchaudio==2.2.2 pytorch-cuda=11.8 -c pytorch -c nvidia -# CUDA 12.1 -conda install pytorch==2.2.2 torchvision==0.17.2 torchaudio==2.2.2 pytorch-cuda=12.1 -c pytorch -c nvidia -# CPU Only -conda install pytorch==2.2.2 torchvision==0.17.2 torchaudio==2.2.2 cpuonly -c pytorch -``` - -#### Wheel - -##### OSX - -``` -pip install torch==2.2.2 torchvision==0.17.2 torchaudio==2.2.2 -``` - -##### Linux and Windows - -``` -# ROCM 5.7 (Linux only) -pip install torch==2.2.2 torchvision==0.17.2 torchaudio==2.2.2 --index-url https://download.pytorch.org/whl/rocm5.7 -# CUDA 11.8 -pip install torch==2.2.2 torchvision==0.17.2 torchaudio==2.2.2 --index-url https://download.pytorch.org/whl/cu118 -# CUDA 12.1 -pip install torch==2.2.2 torchvision==0.17.2 torchaudio==2.2.2 --index-url https://download.pytorch.org/whl/cu121 -# CPU only -pip install torch==2.2.2 torchvision==0.17.2 torchaudio==2.2.2 --index-url https://download.pytorch.org/whl/cpu -``` - -### v2.2.1 - -#### Conda - -##### OSX - -``` -# conda -conda install pytorch==2.2.1 torchvision==0.17.1 torchaudio==2.2.1 -c pytorch -``` - -##### Linux and Windows - -``` -# CUDA 11.8 -conda install pytorch==2.2.1 torchvision==0.17.1 torchaudio==2.2.1 pytorch-cuda=11.8 -c pytorch -c nvidia -# CUDA 12.1 -conda install pytorch==2.2.1 torchvision==0.17.1 torchaudio==2.2.1 pytorch-cuda=12.1 -c pytorch -c nvidia -# CPU Only -conda install pytorch==2.2.1 torchvision==0.17.1 torchaudio==2.2.1 cpuonly -c pytorch -``` - -#### Wheel - -##### OSX - -``` -pip install torch==2.2.1 torchvision==0.17.1 torchaudio==2.2.1 -``` - -##### Linux and Windows - -``` -# ROCM 5.7 (Linux only) -pip install torch==2.2.1 torchvision==0.17.1 torchaudio==2.2.1 --index-url https://download.pytorch.org/whl/rocm5.7 -# CUDA 11.8 -pip install torch==2.2.1 torchvision==0.17.1 torchaudio==2.2.1 --index-url https://download.pytorch.org/whl/cu118 -# CUDA 12.1 -pip install torch==2.2.1 torchvision==0.17.1 torchaudio==2.2.1 --index-url https://download.pytorch.org/whl/cu121 -# CPU only -pip install torch==2.2.1 torchvision==0.17.1 torchaudio==2.2.1 --index-url https://download.pytorch.org/whl/cpu -``` - -### v2.2.0 - -#### Conda - -##### OSX - -``` -# conda -conda install pytorch==2.2.0 torchvision==0.17.0 torchaudio==2.2.0 -c pytorch -``` - -##### Linux and Windows - -``` -# CUDA 11.8 -conda install pytorch==2.2.0 torchvision==0.17.0 torchaudio==2.2.0 pytorch-cuda=11.8 -c pytorch -c nvidia -# CUDA 12.1 -conda install pytorch==2.2.0 torchvision==0.17.0 torchaudio==2.2.0 pytorch-cuda=12.1 -c pytorch -c nvidia -# CPU Only -conda install pytorch==2.2.0 torchvision==0.17.0 torchaudio==2.2.0 cpuonly -c pytorch -``` - -#### Wheel - -##### OSX - -``` -pip install torch==2.2.0 torchvision==0.17.0 torchaudio==2.2.0 -``` - -##### Linux and Windows - -``` -# ROCM 5.6 (Linux only) -pip install torch==2.2.0 torchvision==0.17.0 torchaudio==2.2.0 --index-url https://download.pytorch.org/whl/rocm5.6 -# CUDA 11.8 -pip install torch==2.2.0 torchvision==0.17.0 torchaudio==2.2.0 --index-url https://download.pytorch.org/whl/cu118 -# CUDA 12.1 -pip install torch==2.2.0 torchvision==0.17.0 torchaudio==2.2.0 --index-url 
https://download.pytorch.org/whl/cu121 -# CPU only -pip install torch==2.2.0 torchvision==0.17.0 torchaudio==2.2.0 --index-url https://download.pytorch.org/whl/cpu -``` - -### v2.1.2 - -#### Conda - -##### OSX - -``` -# conda -conda install pytorch==2.1.2 torchvision==0.16.2 torchaudio==2.1.2 -c pytorch -``` - -##### Linux and Windows - -``` -# CUDA 11.8 -conda install pytorch==2.1.2 torchvision==0.16.2 torchaudio==2.1.2 pytorch-cuda=11.8 -c pytorch -c nvidia -# CUDA 12.1 -conda install pytorch==2.1.2 torchvision==0.16.2 torchaudio==2.1.2 pytorch-cuda=12.1 -c pytorch -c nvidia -# CPU Only -conda install pytorch==2.1.2 torchvision==0.16.2 torchaudio==2.1.2 cpuonly -c pytorch -``` - -#### Wheel - -##### OSX - -``` -pip install torch==2.1.2 torchvision==0.16.2 torchaudio==2.1.2 -``` - -##### Linux and Windows - -``` -# ROCM 5.6 (Linux only) -pip install torch==2.1.2 torchvision==0.16.2 torchaudio==2.1.2 --index-url https://download.pytorch.org/whl/rocm5.6 -# CUDA 11.8 -pip install torch==2.1.2 torchvision==0.16.2 torchaudio==2.1.2 --index-url https://download.pytorch.org/whl/cu118 -# CUDA 12.1 -pip install torch==2.1.2 torchvision==0.16.2 torchaudio==2.1.2 --index-url https://download.pytorch.org/whl/cu121 -# CPU only -pip install torch==2.1.2 torchvision==0.16.2 torchaudio==2.1.2 --index-url https://download.pytorch.org/whl/cpu -``` - -### v2.1.1 - -#### Conda - -##### OSX - -``` -# conda -conda install pytorch==2.1.1 torchvision==0.16.1 torchaudio==2.1.1 -c pytorch -``` - -##### Linux and Windows - -``` -# CUDA 11.8 -conda install pytorch==2.1.1 torchvision==0.16.1 torchaudio==2.1.1 pytorch-cuda=11.8 -c pytorch -c nvidia -# CUDA 12.1 -conda install pytorch==2.1.1 torchvision==0.16.1 torchaudio==2.1.1 pytorch-cuda=12.1 -c pytorch -c nvidia -# CPU Only -conda install pytorch==2.1.1 torchvision==0.16.1 torchaudio==2.1.1 cpuonly -c pytorch -``` - -#### Wheel - -##### OSX - -``` -pip install torch==2.1.1 torchvision==0.16.1 torchaudio==2.1.1 -``` - -##### Linux and Windows - -``` -# ROCM 5.6 (Linux only) -pip install torch==2.1.1 torchvision==0.16.1 torchaudio==2.1.1 --index-url https://download.pytorch.org/whl/rocm5.6 -# CUDA 11.8 -pip install torch==2.1.1 torchvision==0.16.1 torchaudio==2.1.1 --index-url https://download.pytorch.org/whl/cu118 -# CUDA 12.1 -pip install torch==2.1.1 torchvision==0.16.1 torchaudio==2.1.1 --index-url https://download.pytorch.org/whl/cu121 -# CPU only -pip install torch==2.1.1 torchvision==0.16.1 torchaudio==2.1.1 --index-url https://download.pytorch.org/whl/cpu -``` - -### v2.1.0 - -#### Conda - -##### OSX - -``` -# conda -conda install pytorch==2.1.0 torchvision==0.16.0 torchaudio==2.1.0 -c pytorch -``` - -##### Linux and Windows - -``` -# CUDA 11.8 -conda install pytorch==2.1.0 torchvision==0.16.0 torchaudio==2.1.0 pytorch-cuda=11.8 -c pytorch -c nvidia -# CUDA 12.1 -conda install pytorch==2.1.0 torchvision==0.16.0 torchaudio==2.1.0 pytorch-cuda=12.1 -c pytorch -c nvidia -# CPU Only -conda install pytorch==2.1.0 torchvision==0.16.0 torchaudio==2.1.0 cpuonly -c pytorch -``` - -#### Wheel - -##### OSX - -``` -pip install torch==2.1.0 torchvision==0.16.0 torchaudio==2.1.0 -``` - -##### Linux and Windows - -``` -# ROCM 5.6 (Linux only) -pip install torch==2.1.0 torchvision==0.16.0 torchaudio==2.1.0 --index-url https://download.pytorch.org/whl/rocm5.6 -# CUDA 11.8 -pip install torch==2.1.0 torchvision==0.16.0 torchaudio==2.1.0 --index-url https://download.pytorch.org/whl/cu118 -# CUDA 12.1 -pip install torch==2.1.0 torchvision==0.16.0 torchaudio==2.1.0 --index-url 
https://download.pytorch.org/whl/cu121 -# CPU only -pip install torch==2.1.0 torchvision==0.16.0 torchaudio==2.1.0 --index-url https://download.pytorch.org/whl/cpu -``` - -### v2.0.1 - -#### Conda - -##### OSX - -``` -# conda -conda install pytorch==2.0.1 torchvision==0.15.2 torchaudio==2.0.2 -c pytorch -``` - -##### Linux and Windows - -``` -# CUDA 11.7 -conda install pytorch==2.0.1 torchvision==0.15.2 torchaudio==2.0.2 pytorch-cuda=11.7 -c pytorch -c nvidia -# CUDA 11.8 -conda install pytorch==2.0.1 torchvision==0.15.2 torchaudio==2.0.2 pytorch-cuda=11.8 -c pytorch -c nvidia -# CPU Only -conda install pytorch==2.0.1 torchvision==0.15.2 torchaudio==2.0.2 cpuonly -c pytorch -``` - -#### Wheel - -##### OSX - -``` -pip install torch==2.0.1 torchvision==0.15.2 torchaudio==2.0.2 -``` - -##### Linux and Windows - -``` -# ROCM 5.4.2 (Linux only) -pip install torch==2.0.1 torchvision==0.15.2 torchaudio==2.0.2 --index-url https://download.pytorch.org/whl/rocm5.4.2 -# CUDA 11.7 -pip install torch==2.0.1 torchvision==0.15.2 torchaudio==2.0.2 -# CUDA 11.8 -pip install torch==2.0.1 torchvision==0.15.2 torchaudio==2.0.2 --index-url https://download.pytorch.org/whl/cu118 -# CPU only -pip install torch==2.0.1 torchvision==0.15.2 torchaudio==2.0.2 --index-url https://download.pytorch.org/whl/cpu -``` - -### v2.0.0 - -#### Conda - -##### OSX - -``` -# conda -conda install pytorch==2.0.0 torchvision==0.15.0 torchaudio==2.0.0 -c pytorch -``` - -##### Linux and Windows - -``` -# CUDA 11.7 -conda install pytorch==2.0.0 torchvision==0.15.0 torchaudio==2.0.0 pytorch-cuda=11.7 -c pytorch -c nvidia -# CUDA 11.8 -conda install pytorch==2.0.0 torchvision==0.15.0 torchaudio==2.0.0 pytorch-cuda=11.8 -c pytorch -c nvidia -# CPU Only -conda install pytorch==2.0.0 torchvision==0.15.0 torchaudio==2.0.0 cpuonly -c pytorch -``` - -#### Wheel - -##### OSX - -``` -pip install torch==2.0.0 torchvision==0.15.1 torchaudio==2.0.1 -``` - -##### Linux and Windows - -``` -# ROCM 5.4.2 (Linux only) -pip install torch==2.0.0 torchvision==0.15.1 torchaudio==2.0.1 --index-url https://download.pytorch.org/whl/rocm5.4.2 -# CUDA 11.7 -pip install torch==2.0.0 torchvision==0.15.1 torchaudio==2.0.1 -# CUDA 11.8 -pip install torch==2.0.0 torchvision==0.15.1 torchaudio==2.0.1 --index-url https://download.pytorch.org/whl/cu118 -# CPU only -pip install torch==2.0.0 torchvision==0.15.1 torchaudio==2.0.1 --index-url https://download.pytorch.org/whl/cpu -``` - -### v1.13.1 - -#### Conda - -##### OSX - -``` -# conda -conda install pytorch==1.13.1 torchvision==0.14.1 torchaudio==0.13.1 -c pytorch -``` - -##### Linux and Windows - -``` -# CUDA 11.6 -conda install pytorch==1.13.1 torchvision==0.14.1 torchaudio==0.13.1 pytorch-cuda=11.6 -c pytorch -c nvidia -# CUDA 11.7 -conda install pytorch==1.13.1 torchvision==0.14.1 torchaudio==0.13.1 pytorch-cuda=11.7 -c pytorch -c nvidia -# CPU Only -conda install pytorch==1.13.1 torchvision==0.14.1 torchaudio==0.13.1 cpuonly -c pytorch -``` - -#### Wheel - -##### OSX - -``` -pip install torch==1.13.1 torchvision==0.14.1 torchaudio==0.13.1 -``` - -##### Linux and Windows - -``` -# ROCM 5.2 (Linux only) -pip install torch==1.13.1+rocm5.2 torchvision==0.14.1+rocm5.2 torchaudio==0.13.1 --extra-index-url https://download.pytorch.org/whl/rocm5.2 -# CUDA 11.6 -pip install torch==1.13.1+cu116 torchvision==0.14.1+cu116 torchaudio==0.13.1 --extra-index-url https://download.pytorch.org/whl/cu116 -# CUDA 11.7 -pip install torch==1.13.1+cu117 torchvision==0.14.1+cu117 torchaudio==0.13.1 --extra-index-url 
https://download.pytorch.org/whl/cu117 -# CPU only -pip install torch==1.13.1+cpu torchvision==0.14.1+cpu torchaudio==0.13.1 --extra-index-url https://download.pytorch.org/whl/cpu -``` - -### v1.13.0 - -#### Conda - -##### OSX - -``` -# conda -conda install pytorch==1.13.0 torchvision==0.14.0 torchaudio==0.13.0 -c pytorch -``` - -##### Linux and Windows - -``` -# CUDA 11.6 -conda install pytorch==1.13.0 torchvision==0.14.0 torchaudio==0.13.0 pytorch-cuda=11.6 -c pytorch -c nvidia -# CUDA 11.7 -conda install pytorch==1.13.0 torchvision==0.14.0 torchaudio==0.13.0 pytorch-cuda=11.7 -c pytorch -c nvidia -# CPU Only -conda install pytorch==1.13.0 torchvision==0.14.0 torchaudio==0.13.0 cpuonly -c pytorch -``` - -#### Wheel - -##### OSX - -``` -pip install torch==1.13.0 torchvision==0.14.0 torchaudio==0.13.0 -``` - -##### Linux and Windows - -``` -# ROCM 5.2 (Linux only) -pip install torch==1.13.0+rocm5.2 torchvision==0.14.0+rocm5.2 torchaudio==0.13.0 --extra-index-url https://download.pytorch.org/whl/rocm5.2 -# CUDA 11.6 -pip install torch==1.13.0+cu116 torchvision==0.14.0+cu116 torchaudio==0.13.0 --extra-index-url https://download.pytorch.org/whl/cu116 -# CUDA 11.7 -pip install torch==1.13.0+cu117 torchvision==0.14.0+cu117 torchaudio==0.13.0 --extra-index-url https://download.pytorch.org/whl/cu117 -# CPU only -pip install torch==1.13.0+cpu torchvision==0.14.0+cpu torchaudio==0.13.0 --extra-index-url https://download.pytorch.org/whl/cpu -``` - -### v1.12.1 - -#### Conda - -##### OSX - -``` -# conda -conda install pytorch==1.12.1 torchvision==0.13.1 torchaudio==0.12.1 -c pytorch -``` - -##### Linux and Windows - -``` -# CUDA 10.2 -conda install pytorch==1.12.1 torchvision==0.13.1 torchaudio==0.12.1 cudatoolkit=10.2 -c pytorch -# CUDA 11.3 -conda install pytorch==1.12.1 torchvision==0.13.1 torchaudio==0.12.1 cudatoolkit=11.3 -c pytorch -# CUDA 11.6 -conda install pytorch==1.12.1 torchvision==0.13.1 torchaudio==0.12.1 cudatoolkit=11.6 -c pytorch -c conda-forge -# CPU Only -conda install pytorch==1.12.1 torchvision==0.13.1 torchaudio==0.12.1 cpuonly -c pytorch -``` - -#### Wheel - -##### OSX - -``` -pip install torch==1.12.1 torchvision==0.13.1 torchaudio==0.12.1 -``` - -##### Linux and Windows - -``` -# ROCM 5.1.1 (Linux only) -pip install torch==1.12.1+rocm5.1.1 torchvision==0.13.1+rocm5.1.1 torchaudio==0.12.1 --extra-index-url https://download.pytorch.org/whl/rocm5.1.1 -# CUDA 11.6 -pip install torch==1.12.1+cu116 torchvision==0.13.1+cu116 torchaudio==0.12.1 --extra-index-url https://download.pytorch.org/whl/cu116 -# CUDA 11.3 -pip install torch==1.12.1+cu113 torchvision==0.13.1+cu113 torchaudio==0.12.1 --extra-index-url https://download.pytorch.org/whl/cu113 -# CUDA 10.2 -pip install torch==1.12.1+cu102 torchvision==0.13.1+cu102 torchaudio==0.12.1 --extra-index-url https://download.pytorch.org/whl/cu102 -# CPU only -pip install torch==1.12.1+cpu torchvision==0.13.1+cpu torchaudio==0.12.1 --extra-index-url https://download.pytorch.org/whl/cpu -``` - -### v1.12.0 - -#### Conda - -##### OSX - -``` -# conda -conda install pytorch==1.12.0 torchvision==0.13.0 torchaudio==0.12.0 -c pytorch -``` - -##### Linux and Windows - -``` -# CUDA 10.2 -conda install pytorch==1.12.0 torchvision==0.13.0 torchaudio==0.12.0 cudatoolkit=10.2 -c pytorch -# CUDA 11.3 -conda install pytorch==1.12.0 torchvision==0.13.0 torchaudio==0.12.0 cudatoolkit=11.3 -c pytorch -# CUDA 11.6 -conda install pytorch==1.12.0 torchvision==0.13.0 torchaudio==0.12.0 cudatoolkit=11.6 -c pytorch -c conda-forge -# CPU Only -conda install 
pytorch==1.12.0 torchvision==0.13.0 torchaudio==0.12.0 cpuonly -c pytorch -``` - -#### Wheel - -##### OSX - -``` -pip install torch==1.12.0 torchvision==0.13.0 torchaudio==0.12.0 -``` - -##### Linux and Windows - -``` -# ROCM 5.1.1 (Linux only) -pip install torch==1.12.0+rocm5.1.1 torchvision==0.13.0+rocm5.1.1 torchaudio==0.12.0 --extra-index-url https://download.pytorch.org/whl/rocm5.1.1 -# CUDA 11.6 -pip install torch==1.12.0+cu116 torchvision==0.13.0+cu116 torchaudio==0.12.0 --extra-index-url https://download.pytorch.org/whl/cu116 -# CUDA 11.3 -pip install torch==1.12.0+cu113 torchvision==0.13.0+cu113 torchaudio==0.12.0 --extra-index-url https://download.pytorch.org/whl/cu113 -# CUDA 10.2 -pip install torch==1.12.0+cu102 torchvision==0.13.0+cu102 torchaudio==0.12.0 --extra-index-url https://download.pytorch.org/whl/cu102 -# CPU only -pip install torch==1.12.0+cpu torchvision==0.13.0+cpu torchaudio==0.12.0 --extra-index-url https://download.pytorch.org/whl/cpu -``` - -### v1.11.0 - -#### Conda - -##### OSX - -``` -# conda -conda install pytorch==1.11.0 torchvision==0.12.0 torchaudio==0.11.0 -c pytorch -``` - -##### Linux and Windows - -``` -# CUDA 10.2 -conda install pytorch==1.11.0 torchvision==0.12.0 torchaudio==0.11.0 cudatoolkit=10.2 -c pytorch - -# CUDA 11.3 -conda install pytorch==1.11.0 torchvision==0.12.0 torchaudio==0.11.0 cudatoolkit=11.3 -c pytorch - -# CPU Only -conda install pytorch==1.11.0 torchvision==0.12.0 torchaudio==0.11.0 cpuonly -c pytorch -``` - -#### Wheel - -##### OSX - -``` -pip install torch==1.11.0 torchvision==0.12.0 torchaudio==0.11.0 -``` - -##### Linux and Windows - -``` -# ROCM 4.5.2 (Linux only) -pip install torch==1.11.0+rocm4.5.2 torchvision==0.12.0+rocm4.5.2 torchaudio==0.11.0 --extra-index-url https://download.pytorch.org/whl/rocm4.5.2 - -# CUDA 11.3 -pip install torch==1.11.0+cu113 torchvision==0.12.0+cu113 torchaudio==0.11.0 --extra-index-url https://download.pytorch.org/whl/cu113 - -# CUDA 10.2 -pip install torch==1.11.0+cu102 torchvision==0.12.0+cu102 torchaudio==0.11.0 --extra-index-url https://download.pytorch.org/whl/cu102 - -# CPU only -pip install torch==1.11.0+cpu torchvision==0.12.0+cpu torchaudio==0.11.0 --extra-index-url https://download.pytorch.org/whl/cpu -``` - -### v1.10.1 - -#### Conda - -##### OSX - -``` -# conda -conda install pytorch==1.10.1 torchvision==0.11.2 torchaudio==0.10.1 -c pytorch -``` - -##### Linux and Windows - -``` -# CUDA 10.2 -conda install pytorch==1.10.1 torchvision==0.11.2 torchaudio==0.10.1 cudatoolkit=10.2 -c pytorch - -# CUDA 11.3 -conda install pytorch==1.10.1 torchvision==0.11.2 torchaudio==0.10.1 cudatoolkit=11.3 -c pytorch -c conda-forge - -# CPU Only -conda install pytorch==1.10.1 torchvision==0.11.2 torchaudio==0.10.1 cpuonly -c pytorch -``` - -#### Wheel - -##### OSX - -``` -pip install torch==1.10.1 torchvision==0.11.2 torchaudio==0.10.1 -``` - -##### Linux and Windows - -``` -# ROCM 4.2 (Linux only) -pip install torch==1.10.1+rocm4.2 torchvision==0.11.2+rocm4.2 torchaudio==0.10.1 -f https://download.pytorch.org/whl/rocm4.2/torch_stable.html - -# ROCM 4.1 (Linux only) -pip install torch==1.10.1+rocm4.1 torchvision==0.11.2+rocm4.1 torchaudio==0.10.1 -f https://download.pytorch.org/whl/torch_stable.html - -# ROCM 4.0.1 (Linux only) -pip install torch==1.10.1+rocm4.0.1 torchvision==0.10.2+rocm4.0.1 torchaudio==0.10.1 -f https://download.pytorch.org/whl/torch_stable.html - -# CUDA 11.1 -pip install torch==1.10.1+cu111 torchvision==0.11.2+cu111 torchaudio==0.10.1 -f 
https://download.pytorch.org/whl/cu111/torch_stable.html - -# CUDA 10.2 -pip install torch==1.10.1+cu102 torchvision==0.11.2+cu102 torchaudio==0.10.1 -f https://download.pytorch.org/whl/cu102/torch_stable.html - -# CPU only -pip install torch==1.10.1+cpu torchvision==0.11.2+cpu torchaudio==0.10.1 -f https://download.pytorch.org/whl/cpu/torch_stable.html -``` - - -### v1.10.0 - -#### Conda - -##### OSX - -``` -# conda -conda install pytorch==1.10.0 torchvision==0.11.0 torchaudio==0.10.0 -c pytorch -``` - -##### Linux and Windows - -``` -# CUDA 10.2 -conda install pytorch==1.10.0 torchvision==0.11.0 torchaudio==0.10.0 cudatoolkit=10.2 -c pytorch - -# CUDA 11.3 -conda install pytorch==1.10.0 torchvision==0.11.0 torchaudio==0.10.0 cudatoolkit=11.3 -c pytorch -c conda-forge - -# CPU Only -conda install pytorch==1.10.0 torchvision==0.11.0 torchaudio==0.10.0 cpuonly -c pytorch -``` - -#### Wheel - -##### OSX - -``` -pip install torch==1.10.0 torchvision==0.11.0 torchaudio==0.10.0 -``` - -##### Linux and Windows - -``` -# ROCM 4.2 (Linux only) -pip install torch==1.10.0+rocm4.2 torchvision==0.11.0+rocm4.2 torchaudio==0.10.0 -f https://download.pytorch.org/whl/torch_stable.html - -# ROCM 4.1 (Linux only) -pip install torch==1.10.0+rocm4.1 torchvision==0.11.0+rocm4.1 torchaudio==0.10.0 -f https://download.pytorch.org/whl/torch_stable.html - -# ROCM 4.0.1 (Linux only) -pip install torch==1.10.0+rocm4.0.1 torchvision==0.10.1+rocm4.0.1 torchaudio==0.10.0 -f https://download.pytorch.org/whl/torch_stable.html - -# CUDA 11.1 -pip install torch==1.10.0+cu111 torchvision==0.11.0+cu111 torchaudio==0.10.0 -f https://download.pytorch.org/whl/torch_stable.html - -# CUDA 10.2 -pip install torch==1.10.0+cu102 torchvision==0.11.0+cu102 torchaudio==0.10.0 -f https://download.pytorch.org/whl/torch_stable.html - -# CPU only -pip install torch==1.10.0+cpu torchvision==0.11.0+cpu torchaudio==0.10.0 -f https://download.pytorch.org/whl/torch_stable.html -``` - - -### v1.9.1 - -#### Conda - -##### OSX - -``` -# conda -conda install pytorch==1.9.1 torchvision==0.10.1 torchaudio==0.9.1 -c pytorch -``` - -##### Linux and Windows - -``` -# CUDA 10.2 -conda install pytorch==1.9.1 torchvision==0.10.1 torchaudio==0.9.1 cudatoolkit=10.2 -c pytorch - -# CUDA 11.3 -conda install pytorch==1.9.1 torchvision==0.10.1 torchaudio==0.9.1 cudatoolkit=11.3 -c pytorch -c conda-forge - -# CPU Only -conda install pytorch==1.9.1 torchvision==0.10.1 torchaudio==0.9.1 cpuonly -c pytorch -``` - -#### Wheel - -##### OSX - -``` -pip install torch==1.9.1 torchvision==0.10.1 torchaudio==0.9.1 -``` - -##### Linux and Windows - -``` -# ROCM 4.2 (Linux only) -pip install torch==1.9.1+rocm4.2 torchvision==0.10.1+rocm4.2 torchaudio==0.9.1 -f https://download.pytorch.org/whl/torch_stable.html - -# ROCM 4.1 (Linux only) -pip install torch==1.9.1+rocm4.1 torchvision==0.10.1+rocm4.1 torchaudio==0.9.1 -f https://download.pytorch.org/whl/torch_stable.html - -# ROCM 4.0.1 (Linux only) -pip install torch==1.9.1+rocm4.0.1 torchvision==0.10.1+rocm4.0.1 torchaudio==0.9.1 -f https://download.pytorch.org/whl/torch_stable.html - -# CUDA 11.1 -pip install torch==1.9.1+cu111 torchvision==0.10.1+cu111 torchaudio==0.9.1 -f https://download.pytorch.org/whl/torch_stable.html - -# CUDA 10.2 -pip install torch==1.9.1+cu102 torchvision==0.10.1+cu102 torchaudio==0.9.1 -f https://download.pytorch.org/whl/torch_stable.html - -# CPU only -pip install torch==1.9.1+cpu torchvision==0.10.1+cpu torchaudio==0.9.1 -f https://download.pytorch.org/whl/torch_stable.html -``` - -### v1.9.0 - 
-#### Conda - -##### OSX - -``` -# conda -conda install pytorch==1.9.0 torchvision==0.10.0 torchaudio==0.9.0 -c pytorch -``` - -##### Linux and Windows - -``` -# CUDA 10.2 -conda install pytorch==1.9.0 torchvision==0.10.0 torchaudio==0.9.0 cudatoolkit=10.2 -c pytorch - -# CUDA 11.3 -conda install pytorch==1.9.0 torchvision==0.10.0 torchaudio==0.9.0 cudatoolkit=11.3 -c pytorch -c conda-forge - -# CPU Only -conda install pytorch==1.9.0 torchvision==0.10.0 torchaudio==0.9.0 cpuonly -c pytorch -``` - -#### Wheel - -##### OSX - -``` -pip install torch==1.9.0 torchvision==0.10.0 torchaudio==0.9.0 -``` - -##### Linux and Windows - -``` -# ROCM 4.2 (Linux only) -pip install torch==1.9.0+rocm4.2 torchvision==0.10.0+rocm4.2 torchaudio==0.9.0 -f https://download.pytorch.org/whl/torch_stable.html - -# ROCM 4.1 (Linux only) -pip install torch==1.9.0+rocm4.1 torchvision==0.10.0+rocm4.1 torchaudio==0.9.0 -f https://download.pytorch.org/whl/torch_stable.html - -# ROCM 4.0.1 (Linux only) -pip install torch==1.9.0+rocm4.0.1 torchvision==0.10.0+rocm4.0.1 torchaudio==0.9.0 -f https://download.pytorch.org/whl/torch_stable.html - -# CUDA 11.1 -pip install torch==1.9.0+cu111 torchvision==0.10.0+cu111 torchaudio==0.9.0 -f https://download.pytorch.org/whl/torch_stable.html - -# CUDA 10.2 -pip install torch==1.9.0+cu102 torchvision==0.10.0+cu102 torchaudio==0.9.0 -f https://download.pytorch.org/whl/torch_stable.html - -# CPU only -pip install torch==1.9.0+cpu torchvision==0.10.0+cpu torchaudio==0.9.0 -f https://download.pytorch.org/whl/torch_stable.html -``` - -### v1.8.2 with LTS support - -#### Conda - -##### OSX - -macOS is currently not supported for LTS. - -##### Linux and Windows - -``` -# CUDA 10.2 -# NOTE: PyTorch LTS version 1.8.2 is only supported for Python <= 3.8. -conda install pytorch torchvision torchaudio cudatoolkit=10.2 -c pytorch-lts - -# CUDA 11.1 (Linux) -# NOTE: 'nvidia' channel is required for cudatoolkit 11.1
# NOTE: PyTorch LTS version 1.8.2 is only supported for Python <= 3.8. -conda install pytorch torchvision torchaudio cudatoolkit=11.1 -c pytorch-lts -c nvidia - -# CUDA 11.1 (Windows) -# 'conda-forge' channel is required for cudatoolkit 11.1
NOTE: Pytorch LTS version 1.8.2 is only supported for Python <= 3.8. -conda install pytorch torchvision torchaudio cudatoolkit=11.1 -c pytorch-lts -c conda-forge - -# CPU Only -# Pytorch LTS version 1.8.2 is only supported for Python <= 3.8. -conda install pytorch torchvision torchaudio cpuonly -c pytorch-lts - -# ROCM5.x - -Not supported in LTS. -``` - -#### Wheel - -##### OSX - -macOS is currently not supported in LTS. - -##### Linux and Windows - -``` -# CUDA 10.2 -pip3 install torch==1.8.2 torchvision==0.9.2 torchaudio==0.8.2 --extra-index-url https://download.pytorch.org/whl/lts/1.8/cu102 - -# CUDA 11.1 -pip3 install torch==1.8.2 torchvision==0.9.2 torchaudio==0.8.2 --extra-index-url https://download.pytorch.org/whl/lts/1.8/cu111 - -# CPU Only -pip3 install torch==1.8.2 torchvision==0.9.2 torchaudio==0.8.2 --extra-index-url https://download.pytorch.org/whl/lts/1.8/cpu - -# ROCM5.x - -Not supported in LTS. -``` - -### v1.8.1 - -#### Conda - -##### OSX - -``` -# conda -conda install pytorch==1.8.1 torchvision==0.9.1 torchaudio==0.8.1 -c pytorch -``` - -##### Linux and Windows - -``` -# CUDA 10.2 -conda install pytorch==1.8.1 torchvision==0.9.1 torchaudio==0.8.1 cudatoolkit=10.2 -c pytorch - -# CUDA 11.3 -conda install pytorch==1.8.1 torchvision==0.9.1 torchaudio==0.8.1 cudatoolkit=11.3 -c pytorch -c conda-forge - -# CPU Only -conda install pytorch==1.8.1 torchvision==0.9.1 torchaudio==0.8.1 cpuonly -c pytorch -``` - -#### Wheel - -##### OSX - -``` -pip install torch==1.8.1 torchvision==0.9.1 torchaudio==0.8.1 -``` - -##### Linux and Windows - -``` -# ROCM 4.0.1 (Linux only) -pip install torch==1.8.1+rocm4.0.1 torchvision==0.9.1+rocm4.0.1 torchaudio==0.8.1 -f https://download.pytorch.org/whl/torch_stable.html - -# ROCM 3.10 (Linux only) -pip install torch==1.8.1+rocm3.10 torchvision==0.9.1+rocm3.10 torchaudio==0.8.1 -f https://download.pytorch.org/whl/torch_stable.html - -# CUDA 11.1 -pip install torch==1.8.1+cu111 torchvision==0.9.1+cu111 torchaudio==0.8.1 -f https://download.pytorch.org/whl/torch_stable.html - -# CUDA 10.2 -pip install torch==1.8.1+cu102 torchvision==0.9.1+cu102 torchaudio==0.8.1 -f https://download.pytorch.org/whl/torch_stable.html - -# CUDA 10.1 -pip install torch==1.8.1+cu101 torchvision==0.9.1+cu101 torchaudio==0.8.1 -f https://download.pytorch.org/whl/torch_stable.html - -# CPU only -pip install torch==1.8.1+cpu torchvision==0.9.1+cpu torchaudio==0.8.1 -f https://download.pytorch.org/whl/torch_stable.html -``` - - -### v1.8.0 - -#### Conda - -##### OSX - -``` -# conda -conda install pytorch==1.8.0 torchvision==0.9.0 torchaudio==0.8.0 -c pytorch -``` - -##### Linux and Windows - -``` -# CUDA 10.2 -conda install pytorch==1.8.0 torchvision==0.9.0 torchaudio==0.8.0 cudatoolkit=10.2 -c pytorch - -# CUDA 11.1 -conda install pytorch==1.8.0 torchvision==0.9.0 torchaudio==0.8.0 cudatoolkit=11.1 -c pytorch -c conda-forge - -# CPU Only -conda install pytorch==1.8.0 torchvision==0.9.0 torchaudio==0.8.0 cpuonly -c pytorch -``` - -#### Wheel - -##### OSX - -``` -pip install torch==1.8.0 torchvision==0.9.0 torchaudio==0.8.0 -``` - -##### Linux and Windows - -``` -# RocM 4.0.1 (Linux only) -pip install torch -f https://download.pytorch.org/whl/rocm4.0.1/torch_stable.html -pip install ninja -pip install 'git+https://github.com/pytorch/vision.git@v0.9.0' - -# CUDA 11.1 -pip install torch==1.8.0+cu111 torchvision==0.9.0+cu111 torchaudio==0.8.0 -f https://download.pytorch.org/whl/torch_stable.html - -# CUDA 10.2 -pip install torch==1.8.0 torchvision==0.9.0 torchaudio==0.8.0 - -# CPU 
only -pip install torch==1.8.0+cpu torchvision==0.9.0+cpu torchaudio==0.8.0 -f https://download.pytorch.org/whl/torch_stable.html -``` - -### v1.7.1 - -#### Conda - -##### OSX - -``` -# conda -conda install pytorch==1.7.1 torchvision==0.8.2 torchaudio==0.7.2 -c pytorch -``` - -##### Linux and Windows - -``` -# CUDA 9.2 -conda install pytorch==1.7.1 torchvision==0.8.2 torchaudio==0.7.2 cudatoolkit=9.2 -c pytorch - -# CUDA 10.1 -conda install pytorch==1.7.1 torchvision==0.8.2 torchaudio==0.7.2 cudatoolkit=10.1 -c pytorch - -# CUDA 10.2 -conda install pytorch==1.7.1 torchvision==0.8.2 torchaudio==0.7.2 cudatoolkit=10.2 -c pytorch - -# CUDA 11.0 -conda install pytorch==1.7.1 torchvision==0.8.2 torchaudio==0.7.2 cudatoolkit=11.0 -c pytorch - -# CPU Only -conda install pytorch==1.7.1 torchvision==0.8.2 torchaudio==0.7.2 cpuonly -c pytorch -``` - -#### Wheel - -##### OSX - -``` -pip install torch==1.7.1 torchvision==0.8.2 torchaudio==0.7.2 -``` - -##### Linux and Windows - -``` -# CUDA 11.0 -pip install torch==1.7.1+cu110 torchvision==0.8.2+cu110 torchaudio==0.7.2 -f https://download.pytorch.org/whl/torch_stable.html - -# CUDA 10.2 -pip install torch==1.7.1 torchvision==0.8.2 torchaudio==0.7.2 - -# CUDA 10.1 -pip install torch==1.7.1+cu101 torchvision==0.8.2+cu101 torchaudio==0.7.2 -f https://download.pytorch.org/whl/torch_stable.html - -# CUDA 9.2 -pip install torch==1.7.1+cu92 torchvision==0.8.2+cu92 torchaudio==0.7.2 -f https://download.pytorch.org/whl/torch_stable.html - -# CPU only -pip install torch==1.7.1+cpu torchvision==0.8.2+cpu torchaudio==0.7.2 -f https://download.pytorch.org/whl/torch_stable.html -``` - -### v1.7.0 - -#### Conda - -##### OSX - -``` -# conda -conda install pytorch==1.7.0 torchvision==0.8.0 torchaudio==0.7.0 -c pytorch -``` - -##### Linux and Windows - -``` -# CUDA 9.2 -conda install pytorch==1.7.0 torchvision==0.8.0 torchaudio==0.7.0 cudatoolkit=9.2 -c pytorch - -# CUDA 10.1 -conda install pytorch==1.7.0 torchvision==0.8.0 torchaudio==0.7.0 cudatoolkit=10.1 -c pytorch - -# CUDA 10.2 -conda install pytorch==1.7.0 torchvision==0.8.0 torchaudio==0.7.0 cudatoolkit=10.2 -c pytorch - -# CUDA 11.0 -conda install pytorch==1.7.0 torchvision==0.8.0 torchaudio==0.7.0 cudatoolkit=11.0 -c pytorch - -# CPU Only -conda install pytorch==1.7.0 torchvision==0.8.0 torchaudio==0.7.0 cpuonly -c pytorch -``` - -#### Wheel - -##### OSX - -``` -pip install torch==1.7.0 torchvision==0.8.0 torchaudio==0.7.0 -``` - -##### Linux and Windows - -``` -# CUDA 11.0 -pip install torch==1.7.0+cu110 torchvision==0.8.0+cu110 torchaudio==0.7.0 -f https://download.pytorch.org/whl/torch_stable.html - -# CUDA 10.2 -pip install torch==1.7.0 torchvision==0.8.0 torchaudio==0.7.0 - -# CUDA 10.1 -pip install torch==1.7.0+cu101 torchvision==0.8.0+cu101 torchaudio==0.7.0 -f https://download.pytorch.org/whl/torch_stable.html - -# CUDA 9.2 -pip install torch==1.7.0+cu92 torchvision==0.8.0+cu92 torchaudio==0.7.0 -f https://download.pytorch.org/whl/torch_stable.html - -# CPU only -pip install torch==1.7.0+cpu torchvision==0.8.0+cpu torchaudio==0.7.0 -f https://download.pytorch.org/whl/torch_stable.html -``` - -### v1.6.0 - -#### Conda - -##### OSX - -``` -# conda -conda install pytorch==1.6.0 torchvision==0.7.0 -c pytorch -``` - -##### Linux and Windows - -``` -# CUDA 9.2 -conda install pytorch==1.6.0 torchvision==0.7.0 cudatoolkit=9.2 -c pytorch - -# CUDA 10.1 -conda install pytorch==1.6.0 torchvision==0.7.0 cudatoolkit=10.1 -c pytorch - -# CUDA 10.2 -conda install pytorch==1.6.0 torchvision==0.7.0 cudatoolkit=10.2 -c 
pytorch - -# CPU Only -conda install pytorch==1.6.0 torchvision==0.7.0 cpuonly -c pytorch -``` - -#### Wheel - -##### OSX - -``` -pip install torch==1.6.0 torchvision==0.7.0 -``` - -##### Linux and Windows - -``` -# CUDA 10.2 -pip install torch==1.6.0 torchvision==0.7.0 - -# CUDA 10.1 -pip install torch==1.6.0+cu101 torchvision==0.7.0+cu101 -f https://download.pytorch.org/whl/torch_stable.html - -# CUDA 9.2 -pip install torch==1.6.0+cu92 torchvision==0.7.0+cu92 -f https://download.pytorch.org/whl/torch_stable.html - -# CPU only -pip install torch==1.6.0+cpu torchvision==0.7.0+cpu -f https://download.pytorch.org/whl/torch_stable.html -``` - -### v1.5.1 - -#### Conda - -##### OSX - -``` -# conda -conda install pytorch==1.5.1 torchvision==0.6.1 -c pytorch -``` - -##### Linux and Windows - -``` -# CUDA 9.2 -conda install pytorch==1.5.1 torchvision==0.6.1 cudatoolkit=9.2 -c pytorch - -# CUDA 10.1 -conda install pytorch==1.5.1 torchvision==0.6.1 cudatoolkit=10.1 -c pytorch - -# CUDA 10.2 -conda install pytorch==1.5.1 torchvision==0.6.1 cudatoolkit=10.2 -c pytorch - -# CPU Only -conda install pytorch==1.5.1 torchvision==0.6.1 cpuonly -c pytorch -``` - -#### Wheel - -##### OSX - -``` -pip install torch==1.5.1 torchvision==0.6.1 -``` - -##### Linux and Windows - -``` -# CUDA 10.2 -pip install torch==1.5.1 torchvision==0.6.1 - -# CUDA 10.1 -pip install torch==1.5.1+cu101 torchvision==0.6.1+cu101 -f https://download.pytorch.org/whl/torch_stable.html - -# CUDA 9.2 -pip install torch==1.5.1+cu92 torchvision==0.6.1+cu92 -f https://download.pytorch.org/whl/torch_stable.html - -# CPU only -pip install torch==1.5.1+cpu torchvision==0.6.1+cpu -f https://download.pytorch.org/whl/torch_stable.html -``` - -### v1.5.0 - -#### Conda - -##### OSX - -``` -# conda -conda install pytorch==1.5.0 torchvision==0.6.0 -c pytorch -``` - -##### Linux and Windows - -``` -# CUDA 9.2 -conda install pytorch==1.5.0 torchvision==0.6.0 cudatoolkit=9.2 -c pytorch - -# CUDA 10.1 -conda install pytorch==1.5.0 torchvision==0.6.0 cudatoolkit=10.1 -c pytorch - -# CUDA 10.2 -conda install pytorch==1.5.0 torchvision==0.6.0 cudatoolkit=10.2 -c pytorch - -# CPU Only -conda install pytorch==1.5.0 torchvision==0.6.0 cpuonly -c pytorch -``` - -#### Wheel - -##### OSX - -``` -pip install torch==1.5.0 torchvision==0.6.0 -``` - -##### Linux and Windows - -``` -# CUDA 10.2 -pip install torch==1.5.0 torchvision==0.6.0 - -# CUDA 10.1 -pip install torch==1.5.0+cu101 torchvision==0.6.0+cu101 -f https://download.pytorch.org/whl/torch_stable.html - -# CUDA 9.2 -pip install torch==1.5.0+cu92 torchvision==0.6.0+cu92 -f https://download.pytorch.org/whl/torch_stable.html - -# CPU only -pip install torch==1.5.0+cpu torchvision==0.6.0+cpu -f https://download.pytorch.org/whl/torch_stable.html -``` - -### v1.4.0 - -#### Conda - -##### OSX - -``` -# conda -conda install pytorch==1.4.0 torchvision==0.5.0 -c pytorch -``` - -##### Linux and Windows - -``` -# CUDA 9.2 -conda install pytorch==1.4.0 torchvision==0.5.0 cudatoolkit=9.2 -c pytorch - -# CUDA 10.1 -conda install pytorch==1.4.0 torchvision==0.5.0 cudatoolkit=10.1 -c pytorch - -# CPU Only -conda install pytorch==1.4.0 torchvision==0.5.0 cpuonly -c pytorch -``` - -#### Wheel - -##### OSX - -``` -pip install torch==1.4.0 torchvision==0.5.0 -``` - -##### Linux and Windows - -``` -# CUDA 10.1 -pip install torch==1.4.0 torchvision==0.5.0 - -# CUDA 9.2 -pip install torch==1.4.0+cu92 torchvision==0.5.0+cu92 -f https://download.pytorch.org/whl/torch_stable.html - -# CPU only -pip install torch==1.4.0+cpu 
torchvision==0.5.0+cpu -f https://download.pytorch.org/whl/torch_stable.html -``` - -### v1.2.0 - -#### Conda - -##### OSX - -``` -# conda -conda install pytorch==1.2.0 torchvision==0.4.0 -c pytorch -``` - -##### Linux and Windows - -``` -# CUDA 9.2 -conda install pytorch==1.2.0 torchvision==0.4.0 cudatoolkit=9.2 -c pytorch - -# CUDA 10.0 -conda install pytorch==1.2.0 torchvision==0.4.0 cudatoolkit=10.0 -c pytorch - -# CPU Only -conda install pytorch==1.2.0 torchvision==0.4.0 cpuonly -c pytorch -``` - -#### Wheel - -##### OSX - -``` -pip install torch==1.2.0 torchvision==0.4.0 -``` - -##### Linux and Windows - -``` -# CUDA 10.0 -pip install torch==1.2.0 torchvision==0.4.0 - -# CUDA 9.2 -pip install torch==1.2.0+cu92 torchvision==0.4.0+cu92 -f https://download.pytorch.org/whl/torch_stable.html - -# CPU only -pip install torch==1.2.0+cpu torchvision==0.4.0+cpu -f https://download.pytorch.org/whl/torch_stable.html -``` - -### v1.1.0 - -#### Conda - -##### OSX - -``` -# conda -conda install pytorch==1.1.0 torchvision==0.3.0 -c pytorch -``` - -##### Linux and Windows - -``` -# CUDA 9.0 -conda install pytorch==1.1.0 torchvision==0.3.0 cudatoolkit=9.0 -c pytorch - -# CUDA 10.0 -conda install pytorch==1.1.0 torchvision==0.3.0 cudatoolkit=10.0 -c pytorch - -# CPU Only -conda install pytorch-cpu==1.1.0 torchvision-cpu==0.3.0 cpuonly -c pytorch -``` - -#### Wheel - -##### OSX - -``` -pip install torch==1.1.0 torchvision==0.3.0 -``` - -##### Linux and Windows - -``` -# CUDA 10.0 -Download and install wheel from https://download.pytorch.org/whl/cu100/torch_stable.html - -# CUDA 9.0 -Download and install wheel from https://download.pytorch.org/whl/cu90/torch_stable.html - -# CPU only -Download and install wheel from https://download.pytorch.org/whl/cpu/torch_stable.html -``` - -### v1.0.1 - -#### Conda - -##### OSX - -``` -# conda -conda install pytorch==1.0.1 torchvision==0.2.2 -c pytorch -``` - -##### Linux and Windows - -``` -# CUDA 9.0 -conda install pytorch==1.0.1 torchvision==0.2.2 cudatoolkit=9.0 -c pytorch - -# CUDA 10.0 -conda install pytorch==1.0.1 torchvision==0.2.2 cudatoolkit=10.0 -c pytorch - -# CPU Only -conda install pytorch-cpu==1.0.1 torchvision-cpu==0.2.2 cpuonly -c pytorch -``` - -#### Wheel - -##### OSX - -``` -pip install torch==1.0.1 torchvision==0.2.2 -``` - -##### Linux and Windows - -``` -# CUDA 10.0 -Download and install wheel from https://download.pytorch.org/whl/cu100/torch_stable.html - -# CUDA 9.0 -Download and install wheel from https://download.pytorch.org/whl/cu90/torch_stable.html - -# CPU only -Download and install wheel from https://download.pytorch.org/whl/cpu/torch_stable.html -``` - -### v1.0.0 - -#### Conda - -##### OSX - -``` -# conda -conda install pytorch==1.0.0 torchvision==0.2.1 -c pytorch -``` - -##### Linux and Windows - -``` -# CUDA 10.0 -conda install pytorch==1.0.0 torchvision==0.2.1 cuda100 -c pytorch - -# CUDA 9.0 -conda install pytorch==1.0.0 torchvision==0.2.1 cuda90 -c pytorch - -# CUDA 8.0 -conda install pytorch==1.0.0 torchvision==0.2.1 cuda80 -c pytorch - -# CPU Only -conda install pytorch-cpu==1.0.0 torchvision-cpu==0.2.1 cpuonly -c pytorch -``` - -#### Wheel - -##### OSX - -``` -pip install torch==1.0.0 torchvision==0.2.1 -``` - -##### Linux and Windows - -``` -# CUDA 10.0 -Download and install wheel from https://download.pytorch.org/whl/cu100/torch_stable.html - -# CUDA 9.0 -Download and install wheel from https://download.pytorch.org/whl/cu90/torch_stable.html - -# CUDA 8.0 -Download and install wheel from 
https://download.pytorch.org/whl/cu80/torch_stable.html - -# CPU only -Download and install wheel from https://download.pytorch.org/whl/cpu/torch_stable.html -``` - -## Commands for Versions < 1.0.0 - -### Via conda - -> This should be used for most previous macOS version installs. - -To install a previous version of PyTorch via Anaconda or Miniconda, -replace "0.4.1" in the following commands with the desired version -(i.e., "0.2.0"). - -Installing with CUDA 9 - -`conda install pytorch=0.4.1 cuda90 -c pytorch` - -or - -`conda install pytorch=0.4.1 cuda92 -c pytorch` - -Installing with CUDA 8 - -`conda install pytorch=0.4.1 cuda80 -c pytorch` - -Installing with CUDA 7.5 - -`conda install pytorch=0.4.1 cuda75 -c pytorch` - -Installing without CUDA - -`conda install pytorch=0.4.1 -c pytorch` - -### From source - -It is possible to checkout an older version of [PyTorch](https://github.com/pytorch/pytorch) -and build it. -You can list tags in PyTorch git repository with `git tag` and checkout a -particular one (replace '0.1.9' with the desired version) with - -`git checkout v0.1.9` - -Follow the install from source instructions in the README.md of the PyTorch -checkout. - -### Via pip - -Download the `whl` file with the desired version from the following html pages: - -- # CPU-only build -- # CUDA 8.0 build -- # CUDA 9.0 build -- # CUDA 9.2 build -- # CUDA 10.0 build - -Then, install the file with `pip install [downloaded file]` - - -Note: most pytorch versions are available only for specific CUDA versions. For example pytorch=1.0.1 is not available for CUDA 9.2 - -### (Old) PyTorch Linux binaries compiled with CUDA 7.5 - -These predate the html page above and have to be manually installed by downloading the wheel file and `pip install downloaded_file` - -- [cu75/torch-0.3.0.post4-cp36-cp36m-linux_x86_64.whl](https://download.pytorch.org/whl/cu75/torch-0.3.0.post4-cp36-cp36m-linux_x86_64.whl) -- [cu75/torch-0.3.0.post4-cp35-cp35m-linux_x86_64.whl](https://download.pytorch.org/whl/cu75/torch-0.3.0.post4-cp35-cp35m-linux_x86_64.whl) -- [cu75/torch-0.3.0.post4-cp27-cp27mu-linux_x86_64.whl](https://download.pytorch.org/whl/cu75/torch-0.3.0.post4-cp27-cp27mu-linux_x86_64.whl) -- [cu75/torch-0.3.0.post4-cp27-cp27m-linux_x86_64.whl](https://download.pytorch.org/whl/cu75/torch-0.3.0.post4-cp27-cp27m-linux_x86_64.whl) -- [cu75/torch-0.2.0.post3-cp36-cp36m-manylinux1_x86_64.whl](https://download.pytorch.org/whl/cu75/torch-0.2.0.post3-cp36-cp36m-manylinux1_x86_64.whl) -- [cu75/torch-0.2.0.post3-cp35-cp35m-manylinux1_x86_64.whl](https://download.pytorch.org/whl/cu75/torch-0.2.0.post3-cp35-cp35m-manylinux1_x86_64.whl) -- [cu75/torch-0.2.0.post3-cp27-cp27mu-manylinux1_x86_64.whl](https://download.pytorch.org/whl/cu75/torch-0.2.0.post3-cp27-cp27mu-manylinux1_x86_64.whl) -- [cu75/torch-0.2.0.post3-cp27-cp27m-manylinux1_x86_64.whl](https://download.pytorch.org/whl/cu75/torch-0.2.0.post3-cp27-cp27m-manylinux1_x86_64.whl) -- [cu75/torch-0.2.0.post2-cp36-cp36m-manylinux1_x86_64.whl](https://download.pytorch.org/whl/cu75/torch-0.2.0.post2-cp36-cp36m-manylinux1_x86_64.whl) -- [cu75/torch-0.2.0.post2-cp35-cp35m-manylinux1_x86_64.whl](https://download.pytorch.org/whl/cu75/torch-0.2.0.post2-cp35-cp35m-manylinux1_x86_64.whl) -- [cu75/torch-0.2.0.post2-cp27-cp27mu-manylinux1_x86_64.whl](https://download.pytorch.org/whl/cu75/torch-0.2.0.post2-cp27-cp27mu-manylinux1_x86_64.whl) -- 
[cu75/torch-0.2.0.post2-cp27-cp27m-manylinux1_x86_64.whl](https://download.pytorch.org/whl/cu75/torch-0.2.0.post2-cp27-cp27m-manylinux1_x86_64.whl) -- [cu75/torch-0.2.0.post1-cp36-cp36m-manylinux1_x86_64.whl](https://download.pytorch.org/whl/cu75/torch-0.2.0.post1-cp36-cp36m-manylinux1_x86_64.whl) -- [cu75/torch-0.2.0.post1-cp35-cp35m-manylinux1_x86_64.whl](https://download.pytorch.org/whl/cu75/torch-0.2.0.post1-cp35-cp35m-manylinux1_x86_64.whl) -- [cu75/torch-0.2.0.post1-cp27-cp27mu-manylinux1_x86_64.whl](https://download.pytorch.org/whl/cu75/torch-0.2.0.post1-cp27-cp27mu-manylinux1_x86_64.whl) -- [cu75/torch-0.2.0.post1-cp27-cp27m-manylinux1_x86_64.whl](https://download.pytorch.org/whl/cu75/torch-0.2.0.post1-cp27-cp27m-manylinux1_x86_64.whl) -- [cu75/torch-0.1.12.post2-cp36-cp36m-linux_x86_64.whl](https://download.pytorch.org/whl/cu75/torch-0.1.12.post2-cp36-cp36m-linux_x86_64.whl) -- [cu75/torch-0.1.12.post2-cp35-cp35m-linux_x86_64.whl](https://download.pytorch.org/whl/cu75/torch-0.1.12.post2-cp35-cp35m-linux_x86_64.whl) -- [cu75/torch-0.1.12.post2-cp27-none-linux_x86_64.whl](https://download.pytorch.org/whl/cu75/torch-0.1.12.post2-cp27-none-linux_x86_64.whl) -- [cu75/torch-0.1.12.post1-cp36-cp36m-linux_x86_64.whl](https://download.pytorch.org/whl/cu75/torch-0.1.12.post1-cp36-cp36m-linux_x86_64.whl) -- [cu75/torch-0.1.12.post1-cp35-cp35m-linux_x86_64.whl](https://download.pytorch.org/whl/cu75/torch-0.1.12.post1-cp35-cp35m-linux_x86_64.whl) -- [cu75/torch-0.1.12.post1-cp27-none-linux_x86_64.whl](https://download.pytorch.org/whl/cu75/torch-0.1.12.post1-cp27-none-linux_x86_64.whl) -- [cu75/torch-0.1.11.post5-cp36-cp36m-linux_x86_64.whl](https://download.pytorch.org/whl/cu75/torch-0.1.11.post5-cp36-cp36m-linux_x86_64.whl) -- [cu75/torch-0.1.11.post5-cp35-cp35m-linux_x86_64.whl](https://download.pytorch.org/whl/cu75/torch-0.1.11.post5-cp35-cp35m-linux_x86_64.whl) -- [cu75/torch-0.1.11.post5-cp27-none-linux_x86_64.whl](https://download.pytorch.org/whl/cu75/torch-0.1.11.post5-cp27-none-linux_x86_64.whl) -- [cu75/torch-0.1.11.post4-cp36-cp36m-linux_x86_64.whl](https://download.pytorch.org/whl/cu75/torch-0.1.11.post4-cp36-cp36m-linux_x86_64.whl) -- [cu75/torch-0.1.11.post4-cp35-cp35m-linux_x86_64.whl](https://download.pytorch.org/whl/cu75/torch-0.1.11.post4-cp35-cp35m-linux_x86_64.whl) -- [cu75/torch-0.1.11.post4-cp27-none-linux_x86_64.whl](https://download.pytorch.org/whl/cu75/torch-0.1.11.post4-cp27-none-linux_x86_64.whl) -- [cu75/torch-0.1.10.post2-cp36-cp36m-linux_x86_64.whl](https://download.pytorch.org/whl/cu75/torch-0.1.10.post2-cp36-cp36m-linux_x86_64.whl) -- [cu75/torch-0.1.10.post2-cp35-cp35m-linux_x86_64.whl](https://download.pytorch.org/whl/cu75/torch-0.1.10.post2-cp35-cp35m-linux_x86_64.whl) -- [cu75/torch-0.1.10.post2-cp27-none-linux_x86_64.whl](https://download.pytorch.org/whl/cu75/torch-0.1.10.post2-cp27-none-linux_x86_64.whl) -- [cu75/torch-0.1.10.post1-cp36-cp36m-linux_x86_64.whl](https://download.pytorch.org/whl/cu75/torch-0.1.10.post1-cp36-cp36m-linux_x86_64.whl) -- [cu75/torch-0.1.10.post1-cp35-cp35m-linux_x86_64.whl](https://download.pytorch.org/whl/cu75/torch-0.1.10.post1-cp35-cp35m-linux_x86_64.whl) -- [cu75/torch-0.1.10.post1-cp27-none-linux_x86_64.whl](https://download.pytorch.org/whl/cu75/torch-0.1.10.post1-cp27-none-linux_x86_64.whl) -- [cu75/torch-0.1.9.post2-cp36-cp36m-linux_x86_64.whl](https://download.pytorch.org/whl/cu75/torch-0.1.9.post2-cp36-cp36m-linux_x86_64.whl) -- 
[cu75/torch-0.1.9.post2-cp35-cp35m-linux_x86_64.whl](https://download.pytorch.org/whl/cu75/torch-0.1.9.post2-cp35-cp35m-linux_x86_64.whl) -- [cu75/torch-0.1.9.post2-cp27-none-linux_x86_64.whl](https://download.pytorch.org/whl/cu75/torch-0.1.9.post2-cp27-none-linux_x86_64.whl) -- [cu75/torch-0.1.9.post1-cp36-cp36m-linux_x86_64.whl](https://download.pytorch.org/whl/cu75/torch-0.1.9.post1-cp36-cp36m-linux_x86_64.whl) -- [cu75/torch-0.1.9.post1-cp35-cp35m-linux_x86_64.whl](https://download.pytorch.org/whl/cu75/torch-0.1.9.post1-cp35-cp35m-linux_x86_64.whl) -- [cu75/torch-0.1.9.post1-cp27-none-linux_x86_64.whl](https://download.pytorch.org/whl/cu75/torch-0.1.9.post1-cp27-none-linux_x86_64.whl) -- [cu75/torch-0.1.8.post1-cp36-cp36m-linux_x86_64.whl](https://download.pytorch.org/whl/cu75/torch-0.1.8.post1-cp36-cp36m-linux_x86_64.whl) -- [cu75/torch-0.1.8.post1-cp35-cp35m-linux_x86_64.whl](https://download.pytorch.org/whl/cu75/torch-0.1.8.post1-cp35-cp35m-linux_x86_64.whl) -- [cu75/torch-0.1.8.post1-cp27-none-linux_x86_64.whl](https://download.pytorch.org/whl/cu75/torch-0.1.8.post1-cp27-none-linux_x86_64.whl) -- [cu75/torch-0.1.7.post2-cp36-cp36m-linux_x86_64.whl](https://download.pytorch.org/whl/cu75/torch-0.1.7.post2-cp36-cp36m-linux_x86_64.whl) -- [cu75/torch-0.1.7.post2-cp35-cp35m-linux_x86_64.whl](https://download.pytorch.org/whl/cu75/torch-0.1.7.post2-cp35-cp35m-linux_x86_64.whl) -- [cu75/torch-0.1.7.post2-cp27-none-linux_x86_64.whl](https://download.pytorch.org/whl/cu75/torch-0.1.7.post2-cp27-none-linux_x86_64.whl) -- [cu75/torch-0.1.6.post22-cp35-cp35m-linux_x86_64.whl](https://download.pytorch.org/whl/cu75/torch-0.1.6.post22-cp35-cp35m-linux_x86_64.whl) -- [cu75/torch-0.1.6.post22-cp27-none-linux_x86_64.whl](https://download.pytorch.org/whl/cu75/torch-0.1.6.post22-cp27-none-linux_x86_64.whl) -- [cu75/torch-0.1.6.post20-cp35-cp35m-linux_x86_64.whl](https://download.pytorch.org/whl/cu75/torch-0.1.6.post20-cp35-cp35m-linux_x86_64.whl) -- [cu75/torch-0.1.6.post20-cp27-cp27mu-linux_x86_64.whl](https://download.pytorch.org/whl/cu75/torch-0.1.6.post20-cp27-cp27mu-linux_x86_64.whl) - -### Windows binaries - -- [cpu/torch-1.0.0-cp35-cp35m-win_amd64.whl](https://download.pytorch.org/whl/cpu/torch-1.0.0-cp35-cp35m-win_amd64.whl) -- [cu80/torch-1.0.0-cp35-cp35m-win_amd64.whl](https://download.pytorch.org/whl/cu80/torch-1.0.0-cp35-cp35m-win_amd64.whl) -- [cu90/torch-1.0.0-cp35-cp35m-win_amd64.whl](https://download.pytorch.org/whl/cu90/torch-1.0.0-cp35-cp35m-win_amd64.whl) -- [cu100/torch-1.0.0-cp35-cp35m-win_amd64.whl](https://download.pytorch.org/whl/cu100/torch-1.0.0-cp35-cp35m-win_amd64.whl) -- [cpu/torch-1.0.0-cp36-cp36m-win_amd64.whl](https://download.pytorch.org/whl/cpu/torch-1.0.0-cp36-cp36m-win_amd64.whl) -- [cu80/torch-1.0.0-cp36-cp36m-win_amd64.whl](https://download.pytorch.org/whl/cu80/torch-1.0.0-cp36-cp36m-win_amd64.whl) -- [cu90/torch-1.0.0-cp36-cp36m-win_amd64.whl](https://download.pytorch.org/whl/cu90/torch-1.0.0-cp36-cp36m-win_amd64.whl) -- [cu100/torch-1.0.0-cp36-cp36m-win_amd64.whl](https://download.pytorch.org/whl/cu100/torch-1.0.0-cp36-cp36m-win_amd64.whl) -- [cpu/torch-1.0.0-cp37-cp37m-win_amd64.whl](https://download.pytorch.org/whl/cpu/torch-1.0.0-cp37-cp37m-win_amd64.whl) -- [cu80/torch-1.0.0-cp37-cp37m-win_amd64.whl](https://download.pytorch.org/whl/cu80/torch-1.0.0-cp37-cp37m-win_amd64.whl) -- [cu90/torch-1.0.0-cp37-cp37m-win_amd64.whl](https://download.pytorch.org/whl/cu90/torch-1.0.0-cp37-cp37m-win_amd64.whl) -- 
[cu100/torch-1.0.0-cp37-cp37m-win_amd64.whl](https://download.pytorch.org/whl/cu100/torch-1.0.0-cp37-cp37m-win_amd64.whl) -- [cpu/torch-0.4.1-cp35-cp35m-win_amd64.whl](https://download.pytorch.org/whl/cpu/torch-0.4.1-cp35-cp35m-win_amd64.whl) -- [cu80/torch-0.4.1-cp35-cp35m-win_amd64.whl](https://download.pytorch.org/whl/cu80/torch-0.4.1-cp35-cp35m-win_amd64.whl) -- [cu90/torch-0.4.1-cp35-cp35m-win_amd64.whl](https://download.pytorch.org/whl/cu90/torch-0.4.1-cp35-cp35m-win_amd64.whl) -- [cu92/torch-0.4.1-cp35-cp35m-win_amd64.whl](https://download.pytorch.org/whl/cu92/torch-0.4.1-cp35-cp35m-win_amd64.whl) -- [cpu/torch-0.4.1-cp36-cp36m-win_amd64.whl](https://download.pytorch.org/whl/cpu/torch-0.4.1-cp36-cp36m-win_amd64.whl) -- [cu80/torch-0.4.1-cp36-cp36m-win_amd64.whl](https://download.pytorch.org/whl/cu80/torch-0.4.1-cp36-cp36m-win_amd64.whl) -- [cu90/torch-0.4.1-cp36-cp36m-win_amd64.whl](https://download.pytorch.org/whl/cu90/torch-0.4.1-cp36-cp36m-win_amd64.whl) -- [cu92/torch-0.4.1-cp36-cp36m-win_amd64.whl](https://download.pytorch.org/whl/cu92/torch-0.4.1-cp36-cp36m-win_amd64.whl) -- [cpu/torch-0.4.1-cp37-cp37m-win_amd64.whl](https://download.pytorch.org/whl/cpu/torch-0.4.1-cp37-cp37m-win_amd64.whl) -- [cu80/torch-0.4.1-cp37-cp37m-win_amd64.whl](https://download.pytorch.org/whl/cu80/torch-0.4.1-cp37-cp37m-win_amd64.whl) -- [cu90/torch-0.4.1-cp37-cp37m-win_amd64.whl](https://download.pytorch.org/whl/cu90/torch-0.4.1-cp37-cp37m-win_amd64.whl) -- [cu92/torch-0.4.1-cp37-cp37m-win_amd64.whl](https://download.pytorch.org/whl/cu92/torch-0.4.1-cp37-cp37m-win_amd64.whl) - -### Mac and misc. binaries - -For recent macOS binaries, use `conda`: - -e.g., - -`conda install pytorch=0.4.1 cuda90 -c pytorch` -`conda install pytorch=0.4.1 cuda92 -c pytorch` -`conda install pytorch=0.4.1 cuda80 -c pytorch` -`conda install pytorch=0.4.1 -c pytorch` # No CUDA - -- [torchvision-0.1.6-py3-none-any.whl](https://download.pytorch.org/whl/torchvision-0.1.6-py3-none-any.whl) -- [torchvision-0.1.6-py2-none-any.whl](https://download.pytorch.org/whl/torchvision-0.1.6-py2-none-any.whl) -- [torch-1.0.0-cp37-none-macosx_10_7_x86_64.whl](https://download.pytorch.org/whl/cpu/torch-1.0.0-cp37-none-macosx_10_7_x86_64.whl) -- [torch-1.0.0-cp36-none-macosx_10_7_x86_64.whl](https://download.pytorch.org/whl/cpu/torch-1.0.0-cp36-none-macosx_10_7_x86_64.whl) -- [torch-1.0.0-cp35-none-macosx_10_6_x86_64.whl](https://download.pytorch.org/whl/cpu/torch-1.0.0-cp35-none-macosx_10_6_x86_64.whl) -- [torch-1.0.0-cp27-none-macosx_10_6_x86_64.whl](https://download.pytorch.org/whl/cpu/torch-1.0.0-cp27-none-macosx_10_6_x86_64.whl) -- [torch-0.4.0-cp36-cp36m-macosx_10_7_x86_64.whl](https://download.pytorch.org/whl/torch-0.4.0-cp36-cp36m-macosx_10_7_x86_64.whl) -- [torch-0.4.0-cp35-cp35m-macosx_10_7_x86_64.whl](https://download.pytorch.org/whl/torch-0.4.0-cp35-cp35m-macosx_10_6_x86_64.whl) -- [torch-0.4.0-cp27-none-macosx_10_7_x86_64.whl](https://download.pytorch.org/whl/torch-0.4.0-cp27-none-macosx_10_6_x86_64.whl) -- [torch-0.3.1-cp36-cp36m-macosx_10_7_x86_64.whl](https://download.pytorch.org/whl/torch-0.3.1-cp36-cp36m-macosx_10_7_x86_64.whl) -- [torch-0.3.1-cp35-cp35m-macosx_10_7_x86_64.whl](https://download.pytorch.org/whl/torch-0.3.1-cp35-cp35m-macosx_10_6_x86_64.whl) -- [torch-0.3.1-cp27-none-macosx_10_7_x86_64.whl](https://download.pytorch.org/whl/torch-0.3.1-cp27-none-macosx_10_6_x86_64.whl) -- [torch-0.3.0.post4-cp36-cp36m-macosx_10_7_x86_64.whl](https://download.pytorch.org/whl/torch-0.3.0.post4-cp36-cp36m-macosx_10_7_x86_64.whl) -- 
[torch-0.3.0.post4-cp35-cp35m-macosx_10_7_x86_64.whl](https://download.pytorch.org/whl/torch-0.3.0.post4-cp35-cp35m-macosx_10_6_x86_64.whl) -- [torch-0.3.0.post4-cp27-none-macosx_10_7_x86_64.whl](https://download.pytorch.org/whl/torch-0.3.0.post4-cp27-none-macosx_10_6_x86_64.whl) -- [torch-0.2.0.post3-cp36-cp36m-macosx_10_7_x86_64.whl](https://download.pytorch.org/whl/torch-0.2.0.post3-cp36-cp36m-macosx_10_7_x86_64.whl) -- [torch-0.2.0.post3-cp35-cp35m-macosx_10_7_x86_64.whl](https://download.pytorch.org/whl/torch-0.2.0.post3-cp35-cp35m-macosx_10_7_x86_64.whl) -- [torch-0.2.0.post3-cp27-none-macosx_10_7_x86_64.whl](https://download.pytorch.org/whl/torch-0.2.0.post3-cp27-none-macosx_10_7_x86_64.whl) -- [torch-0.2.0.post2-cp36-cp36m-macosx_10_7_x86_64.whl](https://download.pytorch.org/whl/torch-0.2.0.post2-cp36-cp36m-macosx_10_7_x86_64.whl) -- [torch-0.2.0.post2-cp35-cp35m-macosx_10_7_x86_64.whl](https://download.pytorch.org/whl/torch-0.2.0.post2-cp35-cp35m-macosx_10_7_x86_64.whl) -- [torch-0.2.0.post2-cp27-none-macosx_10_7_x86_64.whl](https://download.pytorch.org/whl/torch-0.2.0.post2-cp27-none-macosx_10_7_x86_64.whl) -- [torch-0.2.0.post1-cp36-cp36m-macosx_10_7_x86_64.whl](https://download.pytorch.org/whl/torch-0.2.0.post1-cp36-cp36m-macosx_10_7_x86_64.whl) -- [torch-0.2.0.post1-cp35-cp35m-macosx_10_7_x86_64.whl](https://download.pytorch.org/whl/torch-0.2.0.post1-cp35-cp35m-macosx_10_7_x86_64.whl) -- [torch-0.2.0.post1-cp27-none-macosx_10_7_x86_64.whl](https://download.pytorch.org/whl/torch-0.2.0.post1-cp27-none-macosx_10_7_x86_64.whl) -- [torch-0.1.12.post2-cp36-cp36m-macosx_10_7_x86_64.whl](https://download.pytorch.org/whl/torch-0.1.12.post2-cp36-cp36m-macosx_10_7_x86_64.whl) -- [torch-0.1.12.post2-cp35-cp35m-macosx_10_7_x86_64.whl](https://download.pytorch.org/whl/torch-0.1.12.post2-cp35-cp35m-macosx_10_7_x86_64.whl) -- [torch-0.1.12.post2-cp27-none-macosx_10_7_x86_64.whl](https://download.pytorch.org/whl/torch-0.1.12.post2-cp27-none-macosx_10_7_x86_64.whl) -- [torch-0.1.12.post1-cp36-cp36m-macosx_10_7_x86_64.whl](https://download.pytorch.org/whl/torch-0.1.12.post1-cp36-cp36m-macosx_10_7_x86_64.whl) -- [torch-0.1.12.post1-cp35-cp35m-macosx_10_7_x86_64.whl](https://download.pytorch.org/whl/torch-0.1.12.post1-cp35-cp35m-macosx_10_7_x86_64.whl) -- [torch-0.1.12.post1-cp27-none-macosx_10_7_x86_64.whl](https://download.pytorch.org/whl/torch-0.1.12.post1-cp27-none-macosx_10_7_x86_64.whl) -- [torch-0.1.11.post5-cp36-cp36m-macosx_10_7_x86_64.whl](https://download.pytorch.org/whl/torch-0.1.11.post5-cp36-cp36m-macosx_10_7_x86_64.whl) -- [torch-0.1.11.post5-cp35-cp35m-macosx_10_7_x86_64.whl](https://download.pytorch.org/whl/torch-0.1.11.post5-cp35-cp35m-macosx_10_7_x86_64.whl) -- [torch-0.1.11.post5-cp27-none-macosx_10_7_x86_64.whl](https://download.pytorch.org/whl/torch-0.1.11.post5-cp27-none-macosx_10_7_x86_64.whl) -- [torch-0.1.11.post4-cp36-cp36m-macosx_10_7_x86_64.whl](https://download.pytorch.org/whl/torch-0.1.11.post4-cp36-cp36m-macosx_10_7_x86_64.whl) -- [torch-0.1.11.post4-cp35-cp35m-macosx_10_7_x86_64.whl](https://download.pytorch.org/whl/torch-0.1.11.post4-cp35-cp35m-macosx_10_7_x86_64.whl) -- [torch-0.1.11.post4-cp27-none-macosx_10_7_x86_64.whl](https://download.pytorch.org/whl/torch-0.1.11.post4-cp27-none-macosx_10_7_x86_64.whl) -- [torch-0.1.10.post1-cp36-cp36m-macosx_10_7_x86_64.whl](https://download.pytorch.org/whl/torch-0.1.10.post1-cp36-cp36m-macosx_10_7_x86_64.whl) -- 
[torch-0.1.10.post1-cp35-cp35m-macosx_10_6_x86_64.whl](https://download.pytorch.org/whl/torch-0.1.10.post1-cp35-cp35m-macosx_10_6_x86_64.whl) -- [torch-0.1.10.post1-cp27-none-macosx_10_7_x86_64.whl](https://download.pytorch.org/whl/torch-0.1.10.post1-cp27-none-macosx_10_7_x86_64.whl) -- [torch-0.1.9.post2-cp36-cp36m-macosx_10_7_x86_64.whl](https://download.pytorch.org/whl/torch-0.1.9.post2-cp36-cp36m-macosx_10_7_x86_64.whl) -- [torch-0.1.9.post2-cp35-cp35m-macosx_10_6_x86_64.whl](https://download.pytorch.org/whl/torch-0.1.9.post2-cp35-cp35m-macosx_10_6_x86_64.whl) -- [torch-0.1.9.post2-cp27-none-macosx_10_7_x86_64.whl](https://download.pytorch.org/whl/torch-0.1.9.post2-cp27-none-macosx_10_7_x86_64.whl) -- [torch-0.1.9.post1-cp36-cp36m-macosx_10_7_x86_64.whl](https://download.pytorch.org/whl/torch-0.1.9.post1-cp36-cp36m-macosx_10_7_x86_64.whl) -- [torch-0.1.9.post1-cp35-cp35m-macosx_10_6_x86_64.whl](https://download.pytorch.org/whl/torch-0.1.9.post1-cp35-cp35m-macosx_10_6_x86_64.whl) -- [torch-0.1.9.post1-cp27-none-macosx_10_7_x86_64.whl](https://download.pytorch.org/whl/torch-0.1.9.post1-cp27-none-macosx_10_7_x86_64.whl) -- [torch-0.1.8.post1-cp36-cp36m-macosx_10_7_x86_64.whl](https://download.pytorch.org/whl/torch-0.1.8.post1-cp36-cp36m-macosx_10_7_x86_64.whl) -- [torch-0.1.8.post1-cp35-cp35m-macosx_10_6_x86_64.whl](https://download.pytorch.org/whl/torch-0.1.8.post1-cp35-cp35m-macosx_10_6_x86_64.whl) -- [torch-0.1.8.post1-cp27-none-macosx_10_7_x86_64.whl](https://download.pytorch.org/whl/torch-0.1.8.post1-cp27-none-macosx_10_7_x86_64.whl) -- [torch-0.1.7.post2-cp36-cp36m-macosx_10_7_x86_64.whl](https://download.pytorch.org/whl/torch-0.1.7.post2-cp36-cp36m-macosx_10_7_x86_64.whl) -- [torch-0.1.7.post2-cp35-cp35m-macosx_10_6_x86_64.whl](https://download.pytorch.org/whl/torch-0.1.7.post2-cp35-cp35m-macosx_10_6_x86_64.whl) -- [torch-0.1.7.post2-cp27-none-macosx_10_7_x86_64.whl](https://download.pytorch.org/whl/torch-0.1.7.post2-cp27-none-macosx_10_7_x86_64.whl) -- [torch-0.1.6.post22-cp35-cp35m-macosx_10_6_x86_64.whl](https://download.pytorch.org/whl/torch-0.1.6.post22-cp35-cp35m-macosx_10_6_x86_64.whl) -- [torch-0.1.6.post22-cp27-none-macosx_10_7_x86_64.whl](https://download.pytorch.org/whl/torch-0.1.6.post22-cp27-none-macosx_10_7_x86_64.whl) -- [torch-0.1.6.post20-cp35-cp35m-linux_x86_64.whl](https://download.pytorch.org/whl/torch-0.1.6.post20-cp35-cp35m-linux_x86_64.whl) -- [torch-0.1.6.post20-cp27-cp27mu-linux_x86_64.whl](https://download.pytorch.org/whl/torch-0.1.6.post20-cp27-cp27mu-linux_x86_64.whl) -- [torch-0.1.6.post17-cp35-cp35m-linux_x86_64.whl](https://download.pytorch.org/whl/torch-0.1.6.post17-cp35-cp35m-linux_x86_64.whl) -- [torch-0.1.6.post17-cp27-cp27mu-linux_x86_64.whl](https://download.pytorch.org/whl/torch-0.1.6.post17-cp27-cp27mu-linux_x86_64.whl) -- [torch-0.1-cp35-cp35m-macosx_10_6_x86_64.whl](https://download.pytorch.org/whl/torch-0.1-cp35-cp35m-macosx_10_6_x86_64.whl) -- [torch-0.1-cp27-cp27m-macosx_10_6_x86_64.whl](https://download.pytorch.org/whl/torch-0.1-cp27-cp27m-macosx_10_6_x86_64.whl) -- [torch_cuda80-0.1.6.post20-cp35-cp35m-linux_x86_64.whl](https://download.pytorch.org/whl/torch_cuda80-0.1.6.post20-cp35-cp35m-linux_x86_64.whl) -- [torch_cuda80-0.1.6.post20-cp27-cp27mu-linux_x86_64.whl](https://download.pytorch.org/whl/torch_cuda80-0.1.6.post20-cp27-cp27mu-linux_x86_64.whl) -- [torch_cuda80-0.1.6.post17-cp35-cp35m-linux_x86_64.whl](https://download.pytorch.org/whl/torch_cuda80-0.1.6.post17-cp35-cp35m-linux_x86_64.whl) -- 
[torch_cuda80-0.1.6.post17-cp27-cp27mu-linux_x86_64.whl](https://download.pytorch.org/whl/torch_cuda80-0.1.6.post17-cp27-cp27mu-linux_x86_64.whl) - - - - - diff --git a/_get_started/pytorch.md b/_get_started/pytorch.md deleted file mode 100644 index 9ea724d6ddf1..000000000000 --- a/_get_started/pytorch.md +++ /dev/null @@ -1,668 +0,0 @@ ---- -layout: get_started -title: PyTorch 2.x -permalink: /get-started/pytorch-2.0/ -featured-img: "assets/images/featured-img-pytorch-2.png" -background-class: get-started-background -body-class: get-started -order: 2 -published: true ---- - -## Overview - -Introducing PyTorch 2.0, our first steps toward the next generation 2-series release of PyTorch. Over the last few years we have innovated and iterated from PyTorch 1.0 to the most recent 1.13 and moved to the newly formed PyTorch Foundation, part of the Linux Foundation. - -PyTorch’s biggest strength beyond our amazing community is that we continue as a first-class Python integration, imperative style, simplicity of the API and options. PyTorch 2.0 offers the same eager-mode development and user experience, while fundamentally changing and supercharging how PyTorch operates at compiler level under the hood. We are able to provide faster performance and support for Dynamic Shapes and Distributed. - -Below you will find all the information you need to better understand what PyTorch 2.0 is, where it’s going and more importantly how to get started today (e.g., tutorial, requirements, models, common FAQs). There is still a lot to learn and develop but we are looking forward to community feedback and contributions to make the 2-series better and thank you all who have made the 1-series so successful. - -## PyTorch 2.x: faster, more pythonic and as dynamic as ever - -Today, we announce `torch.compile`, a feature that pushes PyTorch performance to new heights and starts the move for parts of PyTorch from C++ back into Python. We believe that this is a substantial new direction for PyTorch -- hence we call it 2.0. `torch.compile` is a fully additive (and optional) feature and hence 2.0 is 100% backward compatible by definition. - -Underpinning `torch.compile` are new technologies -- TorchDynamo, AOTAutograd, PrimTorch and TorchInductor. - -- **TorchDynamo** captures PyTorch programs safely using Python Frame Evaluation Hooks and is a significant innovation that was a result of 5 years of our R&D into safe graph capture - -* **AOTAutograd** overloads PyTorch’s autograd engine as a tracing autodiff for generating ahead-of-time backward traces. - -- **PrimTorch** canonicalizes ~2000+ PyTorch operators down to a closed set of ~250 primitive operators that developers can target to build a complete PyTorch backend. This substantially lowers the barrier of writing a PyTorch feature or backend. -- **TorchInductor** is a deep learning compiler that generates fast code for multiple accelerators and backends. For NVIDIA and AMD GPUs, it uses OpenAI Triton as a key building block. - -TorchDynamo, AOTAutograd, PrimTorch and TorchInductor are written in Python and support dynamic shapes (i.e. the ability to send in Tensors of different sizes without inducing a recompilation), making them flexible, easily hackable and lowering the barrier of entry for developers and vendors. - -To validate these technologies, we used a diverse set of 163 open-source models across various machine learning domains. 
We built this benchmark carefully to include tasks such as Image Classification, Object Detection, Image Generation, various NLP tasks such as Language Modeling, Q&A, Sequence Classification, Recommender Systems and Reinforcement Learning. We separate the benchmarks into three categories: - -
- 46 models from HuggingFace Transformers
- 61 models from TIMM: a collection of state-of-the-art PyTorch image models by Ross Wightman
- 56 models from TorchBench: a curated set of popular code-bases from across github
- - - -We don’t modify these open-source models except to add a `torch.compile` call wrapping them. - -We then measure speedups and validate accuracy across these models. Since speedups can be dependent on data-type, we measure speedups on both float32 and Automatic Mixed Precision (AMP). We report an uneven weighted average speedup of _0.75 * AMP + 0.25 * float32_ since we find AMP is more common in practice. - -Across these 163 open-source models `torch.compile` works 93% of time, and the model runs 43% faster in training on an NVIDIA A100 GPU. At Float32 precision, it runs 21% faster on average and at AMP Precision it runs 51% faster on average. - -**Caveats:** On a desktop-class GPU such as a NVIDIA 3090, we’ve measured that speedups are lower than on server-class GPUs such as A100. As of today, our default backend TorchInductor supports CPUs and NVIDIA Volta and Ampere GPUs. It does not (yet) support other GPUs, xPUs or older NVIDIA GPUs. - -
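To make the wrapping step concrete, here is a minimal sketch of that one-line change applied to an ordinary training step (this is not the benchmark harness itself; the model, input shape, and optimizer are illustrative placeholders, and it assumes a CUDA-capable GPU with torchvision installed):

```python
import torch
import torchvision.models as models

model = models.resnet18().cuda()
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

# The only modification: wrap the model with torch.compile
compiled_model = torch.compile(model)

x = torch.randn(16, 3, 224, 224, device="cuda")
target = torch.randint(0, 1000, (16,), device="cuda")

# The first call triggers graph capture and compilation; subsequent calls reuse it
output = compiled_model(x)
loss = torch.nn.functional.cross_entropy(output, target)
loss.backward()   # the backward pass is captured and optimized as well
optimizer.step()
```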

- -

*Figure: Speedups for torch.compile against eager mode on an NVIDIA A100 GPU*

- -**Try it:** `torch.compile` is in the early stages of development. Starting today, you can try out `torch.compile` in the `nightly` binaries. We expect to ship the first stable 2.0 release in early March 2023. - -In the roadmap of PyTorch 2.x we hope to push the compiled mode further and further in terms of performance and scalability. Some of this work is in-flight, as we talked about at the Conference today. Some of this work has not started yet. Some of this work is what we hope to see, but don’t have the bandwidth to do ourselves. If you are interested in contributing, come chat with us at the **Ask the Engineers: 2.0 Live Q&A Series** starting this month (details at the end of this post) and/or via Github / Forums. - -

- -

- -### Testimonials - -Here is what some of PyTorch’s users have to say about our new direction: - -**Sylvain Gugger** the **primary maintainer of HuggingFace transformers**: - -_"With just one line of code to add, PyTorch 2.0 gives a speedup between 1.5x and 2.x in training Transformers models. This is the most exciting thing since mixed precision training was introduced!"_ - -**Ross Wightman the primary maintainer of TIMM** (one of the largest vision model hubs within the PyTorch ecosystem): - -_“It just works out of the box with majority of TIMM models for inference and train workloads with no code changes”_ - -**Luca Antiga** the **CTO of Lightning AI** and one of the **primary maintainers of PyTorch Lightning** - -_“PyTorch 2.0 embodies the future of deep learning frameworks. The possibility to capture a PyTorch program with effectively no user intervention and get massive on-device speedups and program manipulation out of the box unlocks a whole new dimension for AI developers.”_ - -## Motivation - -Our philosophy on PyTorch has always been to keep flexibility and hackability our top priority, and performance as a close second. We strived for: - -1. High-Performance eager execution -2. Pythonic internals -3. Good abstractions for Distributed, Autodiff, Data loading, Accelerators, etc. - -Since we launched PyTorch in 2017, hardware accelerators (such as GPUs) have become ~15x faster in compute and about ~2x faster in the speed of memory access. So, to keep eager execution at high-performance, we've had to move substantial parts of PyTorch internals into C++. Moving internals into C++ makes them less hackable and increases the barrier of entry for code contributions. - -From day one, we knew the performance limits of eager execution. In July 2017, we started our first research project into developing a Compiler for PyTorch. The compiler needed to make a PyTorch program fast, but not at the cost of the PyTorch experience. Our key criteria was to preserve certain kinds of flexibility -- support for dynamic shapes and dynamic programs which researchers use in various stages of exploration. - -

- -

- -## Technology Overview - -Over the years, we've built several compiler projects within PyTorch. Let us break down the compiler into three parts: - -- graph acquisition -- graph lowering -- graph compilation - -Graph acquisition was the harder challenge when building a PyTorch compiler. - -In the past 5 years, we built `torch.jit.trace`, TorchScript, FX tracing, Lazy Tensors. But none of them felt like they gave us everything we wanted. Some were flexible but not fast, some were fast but not flexible and some were neither fast nor flexible. Some had bad user-experience (like being silently wrong). While TorchScript was promising, it needed substantial changes to your code and the code that your code depended on. This need for substantial change in code made it a non-starter for a lot of PyTorch users. - -
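To make the graph-capture problem concrete, here is a toy sketch (not taken from the original post) of how an ahead-of-time tracer such as FX gives up on data-dependent control flow, while `torch.compile` simply falls back to eager for the unsupported part and keeps going:

```python
import torch
import torch.fx

class Gate(torch.nn.Module):
    def forward(self, x):
        # Data-dependent control flow: the branch depends on the values in x.
        if x.sum() > 0:
            return x * 2
        return x - 1

try:
    torch.fx.symbolic_trace(Gate())
except Exception as err:
    # FX traces with symbolic proxies, so it cannot decide which branch to take.
    print("fx tracing failed:", err)

# TorchDynamo handles the same module by inserting a graph break: the branch
# runs in eager mode while the rest of the program is still captured.
compiled = torch.compile(Gate())
print(compiled(torch.randn(4)))
```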

_The PyTorch compilation process_
- -### TorchDynamo: Acquiring Graphs reliably and fast - -Earlier this year, we started working on TorchDynamo, an approach that uses a CPython feature introduced in [PEP-0523](https://peps.python.org/pep-0523/) called the Frame Evaluation API. We took a data-driven approach to validate its effectiveness on Graph Capture. We used 7,000+ Github projects written in PyTorch as our validation set. While TorchScript and others struggled to even acquire the graph 50% of the time, often with a big overhead, TorchDynamo acquired the graph [99% of the time](https://dev-discuss.pytorch.org/t/torchdynamo-update-8-torchdynamo-passed-correctness-check-on-7k-github-models/663), correctly, safely and with negligible overhead – without needing any changes to the original code. This is when we knew that we finally broke through the barrier that we were struggling with for many years in terms of flexibility and speed. - -### TorchInductor: fast codegen using a define-by-run IR - -For a new compiler backend for PyTorch 2.0, we took inspiration from how our users were writing high performance custom kernels: increasingly using the [Triton](https://github.com/openai/triton) language. We also wanted a compiler backend that used similar abstractions to PyTorch eager, and was general purpose enough to support the wide breadth of features in PyTorch. TorchInductor uses a pythonic define-by-run loop level IR to automatically map PyTorch models into generated Triton code on GPUs and C++/OpenMP on CPUs. TorchInductor’s core loop level IR contains only ~50 operators, and it is implemented in Python, making it easily hackable and extensible. - -### AOTAutograd: reusing Autograd for ahead-of-time graphs - -For PyTorch 2.0, we knew that we wanted to accelerate training. Thus, it was critical that we not only captured user-level code, but also that we captured backpropagation. Moreover, we knew that we wanted to reuse the existing battle-tested PyTorch autograd system. AOTAutograd leverages PyTorch’s **torch_dispatch** extensibility mechanism to trace through our Autograd engine, allowing us to capture the backwards pass “ahead-of-time”. This allows us to accelerate both our forwards _and_ backwards pass using TorchInductor. - -### PrimTorch: Stable Primitive operators - -Writing a backend for PyTorch is challenging. PyTorch has 1200+ operators, and 2000+ if you consider various overloads for each operator. - -

_A breakdown of the 2000+ PyTorch operators_
- -Hence, writing a backend or a cross-cutting feature becomes a draining endeavor. Within the PrimTorch project, we are working on defining smaller and stable operator sets. PyTorch programs can consistently be lowered to these operator sets. We aim to define two operator sets: - -- Prim ops with about ~250 operators, which are fairly low-level. These are suited for compilers because they are low-level enough that you need to fuse them back together to get good performance. -- ATen ops with about ~750 canonical operators and suited for exporting as-is. These are suited for backends that already integrate at the ATen level or backends that won't have compilation to recover performance from a lower-level operator set like Prim ops. - -We discuss more about this topic below in the Developer/Vendor Experience section - -## User Experience - -We introduce a simple function `torch.compile` that wraps your model and returns a compiled model. - -```python -compiled_model = torch.compile(model) -``` - -This `compiled_model` holds a reference to your model and compiles the `forward` function to a more optimized version. When compiling the model, we give a few knobs to adjust it: - -```python -def torch.compile(model: Callable, - *, - mode: Optional[str] = "default", - dynamic: bool = False, - fullgraph:bool = False, - backend: Union[str, Callable] = "inductor", - # advanced backend options go here as kwargs - **kwargs -) -> torch._dynamo.NNOptimizedModule -``` - -- **mode** specifies what the compiler should be optimizing while compiling. - - - The default mode is a preset that tries to compile efficiently without taking too long to compile or using extra memory. - - Other modes such as `reduce-overhead` reduce the framework overhead by a lot more, but cost a small amount of extra memory. `max-autotune` compiles for a long time, trying to give you the fastest code it can generate. - -- **dynamic** specifies whether to enable the code path for Dynamic Shapes. Certain compiler optimizations cannot be applied to dynamic shaped programs. Making it explicit whether you want a compiled program with dynamic shapes or with static shapes will help the compiler give you better optimized code. -- **fullgraph** is similar to Numba’s `nopython`. It compiles the entire program into a single graph or gives an error explaining why it could not do so. Most users don’t need to use this mode. If you are very performance conscious, then you try to use it. -- **backend** specifies which compiler backend to use. By default, TorchInductor is used, but there are a few others available. - -
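As a small illustration of the `backend` knob (a do-it-yourself debugging backend written for this sketch, not an official one), a backend can be any callable that receives the captured FX `GraphModule` plus example inputs and returns a callable to run in its place:

```python
import torch

def inspect_backend(gm: torch.fx.GraphModule, example_inputs):
    # Print the captured graph, then run it unchanged (eager behaviour).
    print(gm.graph)
    return gm.forward

model = torch.nn.Sequential(
    torch.nn.Linear(8, 32),
    torch.nn.ReLU(),
    torch.nn.Linear(32, 4),
)
compiled = torch.compile(model, backend=inspect_backend)
compiled(torch.randn(2, 8))
```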

- -

- -The compile experience intends to deliver most benefits and the most flexibility in the default mode. Here is a mental model of what you get in each mode. - -Now, let us look at a full example of compiling a real model and running it (with random data) - -```python -import torch -import torchvision.models as models - -model = models.resnet18().cuda() -optimizer = torch.optim.SGD(model.parameters(), lr=0.01) -compiled_model = torch.compile(model) - -x = torch.randn(16, 3, 224, 224).cuda() -optimizer.zero_grad() -out = compiled_model(x) -out.sum().backward() -optimizer.step() -``` - -The first time you run the `compiled_model(x)`, it compiles the model. Hence, it takes longer to run. Subsequent runs are fast. - -### Modes - -The compiler has a few presets that tune the compiled model in different ways. -You might be running a small model that is slow because of framework overhead. Or, you might be running a large model that barely fits into memory. Depending on your need, you might want to use a different mode. - -```python -# API NOT FINAL -# default: optimizes for large models, low compile-time -# and no extra memory usage -torch.compile(model) - -# reduce-overhead: optimizes to reduce the framework overhead -# and uses some extra memory. Helps speed up small models -torch.compile(model, mode="reduce-overhead") - -# max-autotune: optimizes to produce the fastest model, -# but takes a very long time to compile -torch.compile(model, mode="max-autotune") - -``` - -### Reading and updating Attributes - -Accessing model attributes work as they would in eager mode. -You can access or modify attributes of your model (such as `model.conv1.weight`) as you generally would. This is completely safe and sound in terms of code correction. TorchDynamo inserts guards into the code to check if its assumptions hold true. If attributes change in certain ways, then TorchDynamo knows to recompile automatically as needed. - -```python -# optimized_model works similar to model, feel free to access its attributes and modify them -optimized_model.conv1.weight.fill_(0.01) - -# this change is reflected in model -``` - -### Hooks - -Module and Tensor [hooks](https://pytorch.org/docs/stable/notes/modules.html#module-hooks) don’t fully work at the moment, but they will eventually work as we finish development. - -### Serialization - -You can serialize the state-dict of the `optimized_model` OR the `model`. They point to the same parameters and state and hence are equivalent. - -```python -torch.save(optimized_model.state_dict(), "foo.pt") -# both these lines of code do the same thing -torch.save(model.state_dict(), "foo.pt") -``` - -You cannot serialize `optimized_model` currently. If you wish to save the object directly, save `model` instead. - -```python -torch.save(optimized_model, "foo.pt") # Error -torch.save(model, "foo.pt") # Works -``` - -### Inference and Export - -For model inference, after generating a compiled model using torch.compile, run some warm-up steps before actual model serving. This helps mitigate latency spikes during initial serving. - -In addition, we will be introducing a mode called `torch.export` that carefully exports the entire model and the guard infrastructure for environments that need guaranteed and predictable latency. `torch.export` would need changes to your program, especially if you have data dependent control-flow. 
- -```python -# API Not Final -exported_model = torch._dynamo.export(model, input) -torch.save(exported_model, "foo.pt") -``` - -This is in early stages of development. Catch the talk on Export Path at the PyTorch Conference for more details. You can also engage on this topic at our “Ask the Engineers: 2.0 Live Q&A Series” starting this month (more details at the end of this post). - -### Debugging Issues - -A compiled mode is opaque and hard to debug. You will have questions such as: - -- Why is my program crashing in compiled mode? -- Is compiled mode as accurate as eager mode? -- Why am I not seeing speedups? - -If compiled mode produces an error or a crash or diverging results from eager mode (beyond machine precision limits), it is very unlikely that it is your code’s fault. However, understanding what piece of code is the reason for the bug is useful. - -To aid in debugging and reproducibility, we have created several tools and logging capabilities out of which one stands out: **The Minifier.** - -The minifier automatically reduces the issue you are seeing to a small snippet of code. This small snippet of code reproduces the original issue and you can file a github issue with the minified code. This will help the PyTorch team fix the issue easily and quickly. - -If you are not seeing the speedups that you expect, then we have the **torch.\_dynamo.explain** tool that explains which parts of your code induced what we call “graph breaks”. Graph breaks generally hinder the compiler from speeding up the code, and reducing the number of graph breaks likely will speed up your code (up to some limit of diminishing returns). - -You can read about these and more in our [troubleshooting guide](https://pytorch.org/docs/stable/torch.compiler_troubleshooting.html). - -### Dynamic Shapes - -When looking at what was necessary to support the generality of PyTorch code, one key requirement was supporting dynamic shapes, and allowing models to take in tensors of different sizes without inducing recompilation every time the shape changes. - -As of today, support for Dynamic Shapes is limited and a rapid work in progress. It will be fully featured by stable release. It is gated behind a `dynamic=True` argument, and we have more progress on a feature branch (symbolic-shapes), on which we have successfully run BERT_pytorch in training with full symbolic shapes with TorchInductor. For inference with dynamic shapes, we have more coverage. For example, let’s look at a common setting where dynamic shapes are helpful - text generation with language models. - -We can see that even when the shape changes dynamically from 4 all the way to 256, Compiled mode is able to consistently outperform eager by up to 40%. Without support for dynamic shapes, a common workaround is to pad to the nearest power of two. However, as we can see from the charts below, it incurs a significant amount of performance overhead, and also results in significantly longer compilation time. Moreover, padding is sometimes non-trivial to do correctly. - -By supporting dynamic shapes in PyTorch 2.0’s Compiled mode, we can get the best of performance _and_ ease of use. - -
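The language-model benchmark itself is not reproduced here, but a toy sketch of opting into dynamic shapes looks roughly like the following (the model and the sequence lengths are illustrative placeholders):

```python
import torch

model = torch.nn.Sequential(
    torch.nn.Embedding(1000, 64),
    torch.nn.Linear(64, 64),
)
# dynamic=True asks the compiler to generate code that tolerates varying input
# shapes instead of recompiling for every new sequence length it encounters.
compiled = torch.compile(model, dynamic=True)

for seq_len in (4, 32, 128, 256):
    tokens = torch.randint(0, 1000, (8, seq_len))
    out = compiled(tokens)
    print(seq_len, out.shape)
```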
- - -
- -The current work is evolving very rapidly and we may temporarily let some models regress as we land fundamental improvements to infrastructure. The latest updates for our progress on dynamic shapes can be found [here](https://dev-discuss.pytorch.org/t/state-of-symbolic-shapes-branch/777/19). - -## Distributed - -In summary, torch.distributed’s two main distributed wrappers work well in compiled mode. - -Both `DistributedDataParallel` (DDP) and `FullyShardedDataParallel` (FSDP) work in compiled mode and provide improved performance and memory utilization relative to eager mode, with some caveats and limitations. - -

_Speedups in AMP Precision. Left: speedups for FSDP in Compiled mode over eager mode (AMP precision). Right: FSDP in Compiled mode uses substantially less memory than in eager mode._
- - -
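A minimal, single-process sketch of combining DDP with `torch.compile` is shown below. It assumes a gloo process group so it runs on CPU without a launcher; in real training each rank would execute this under something like torchrun, and the wrapping order shown here is just one workable arrangement:

```python
import os
import torch
import torch.distributed as dist
from torch.nn.parallel import DistributedDataParallel as DDP

# Single-process, CPU-only setup purely so the snippet is self-contained.
os.environ.setdefault("MASTER_ADDR", "127.0.0.1")
os.environ.setdefault("MASTER_PORT", "29500")
dist.init_process_group("gloo", rank=0, world_size=1)

model = DDP(torch.nn.Linear(32, 32))
compiled = torch.compile(model)  # compile the DDP-wrapped module

out = compiled(torch.randn(8, 32))
out.sum().backward()
dist.destroy_process_group()
```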
- -### DistributedDataParallel (DDP) - -DDP relies on overlapping AllReduce communications with backwards computation, and grouping smaller per-layer AllReduce operations into ‘buckets’ for greater efficiency. AOTAutograd functions compiled by TorchDynamo prevent communication overlap, when combined naively with DDP, but performance is recovered by compiling separate subgraphs for each ‘bucket’ and allowing communication ops to happen outside and in-between the subgraphs. DDP support in compiled mode also currently requires `static_graph=False`. See [this post](https://dev-discuss.pytorch.org/t/torchdynamo-update-9-making-ddp-work-with-torchdynamo/860) for more details on the approach and results for DDP + TorchDynamo. - -### FullyShardedDataParallel (FSDP) - -FSDP itself is a “beta” PyTorch feature and has a higher level of system complexity than DDP due to the ability to tune which submodules are wrapped and because there are generally more configuration options. FSDP works with TorchDynamo and TorchInductor for a variety of popular models, if configured with the `use_original_params=True` flag. Some compatibility issues with particular models or configurations are expected at this time, but will be actively improved, and particular models can be prioritized if github issues are filed. - -Users specify an `auto_wrap_policy` argument to indicate which submodules of their model to wrap together in an FSDP instance used for state sharding, or manually wrap submodules in FSDP instances. For example, many transformer models work well when each ‘transformer block’ is wrapped in a separate FSDP instance and thus only the full state of one transformer block needs to be materialized at one time. Dynamo will insert graph breaks at the boundary of each FSDP instance, to allow communication ops in forward (and backward) to happen outside the graphs and in parallel to computation. - -If FSDP is used without wrapping submodules in separate instances, it falls back to operating similarly to DDP, but without bucketing. Hence all gradients are reduced in one operation, and there can be no compute/communication overlap even in Eager. This configuration has only been tested with TorchDynamo for functionality but not for performance. - -## Developer/Vendor Experience - -With PyTorch 2.0, we want to simplify the backend (compiler) integration experience. To do this, we have focused on **reducing the number of operators** and **simplifying the semantics** of the operator set necessary to bring up a PyTorch backend. - -In graphical form, the PT2 stack looks like: - -

_(Diagram: the PT2 stack)_

- -Starting in the middle of the diagram, AOTAutograd dynamically captures autograd logic in an ahead-of-time fashion, producing a graph of forward and backwards operators in FX graph format. - -We provide a set of hardened decompositions (i.e. operator implementations written in terms of other operators) that can be leveraged to **reduce** the number of operators a backend is required to implement. We also **simplify** the semantics of PyTorch operators by selectively rewriting complicated PyTorch logic including mutations and views via a process called _functionalization_, as well as guaranteeing operator metadata information such as shape propagation formulas. This work is actively in progress; our goal is to provide a _primitive_ and _stable_ set of ~250 operators with simplified semantics, called _PrimTorch,_ that vendors can leverage (i.e. opt-in to) in order to simplify their integrations. -After reducing and simplifying the operator set, backends may choose to integrate at the Dynamo (i.e. the middle layer, immediately after AOTAutograd) or Inductor (the lower layer). We describe some considerations in making this choice below, as well as future work around mixtures of backends. - -**Dynamo Backend** - -Vendors with existing compiler stacks may find it easiest to integrate as a TorchDynamo backend, receiving an FX Graph in terms of ATen/Prims IR. Note that for both training and inference, the integration point would be immediately after AOTAutograd, since we currently apply decompositions as part of AOTAutograd, and merely skip the backward-specific steps if targeting inference. - -**Inductor backend** - -Vendors can also integrate their backend directly into Inductor. Inductor takes in a graph produced by AOTAutograd that consists of ATen/Prim operations, and further lowers them down to a loop level IR. Today, Inductor provides lowerings to its loop-level IR for pointwise, reduction, scatter/gather and window operations. In addition, Inductor creates fusion groups, does indexing simplification, dimension collapsing, and tunes loop iteration order in order to support efficient code generation. Vendors can then integrate by providing the mapping from the loop level IR to hardware-specific code. Currently, Inductor has two backends: (1) C++ that generates multithreaded CPU code, (2) Triton that generates performant GPU code. These Inductor backends can be used as an inspiration for the alternate backends. - -**Mixture of Backends Interface (coming soon)** - -We have built utilities for partitioning an FX graph into subgraphs that contain operators supported by a backend and executing the remainder eagerly. These utilities can be extended to support a “mixture of backends,” configuring which portions of the graphs to run for which backend. However, there is not yet a stable interface or contract for backends to expose their operator support, preferences for patterns of operators, etc. This remains as ongoing work, and we welcome feedback from early adopters. - -## Final Thoughts - -We are super excited about the direction that we’ve taken for PyTorch 2.0 and beyond. The road to the final 2.0 release is going to be rough, but come join us on this journey early-on. If you are interested in deep-diving further or contributing to the compiler, please continue reading below which includes more information on how to get started (e.g., tutorials, benchmarks, models, FAQs) and **Ask the Engineers: 2.0 Live Q&A Series** starting this month. 
Additional resources include: - -- [Getting Started](https://pytorch.org/docs/stable/torch.compiler_get_started.html) -- [Tutorials](https://pytorch.org/tutorials/) -- [Documentation](https://pytorch.org/docs/stable) -- [Developer Discussions](https://dev-discuss.pytorch.org) - - - - - -## Accelerating Hugging Face and TIMM models with PyTorch 2.0 - -Author: Mark Saroufim - -`torch.compile()` makes it easy to experiment with different compiler backends to make PyTorch code faster with a single line decorator `torch.compile()`. It works either directly over an nn.Module as a drop-in replacement for torch.jit.script() but without requiring you to make any source code changes. We expect this one line code change to provide you with between 30%-2x training time speedups on the vast majority of models that you’re already running. - -```python -opt_module = torch.compile(module) -``` - -torch.compile supports arbitrary PyTorch code, control flow, mutation and comes with experimental support for dynamic shapes. We’re so excited about this development that we call it PyTorch 2.0. - -What makes this announcement different for us is we’ve already benchmarked some of the most popular open source PyTorch models and gotten substantial speedups ranging from 30% to 2x [https://github.com/pytorch/torchdynamo/issues/681](https://github.com/pytorch/torchdynamo/issues/681). - -There are no tricks here, we’ve pip installed popular libraries like [https://github.com/huggingface/transformers](https://github.com/huggingface/transformers), [https://github.com/huggingface/accelerate](https://github.com/huggingface/accelerate) and [https://github.com/rwightman/pytorch-image-models](https://github.com/rwightman/pytorch-image-models) and then ran torch.compile() on them and that’s it. - -It’s rare to get both performance and convenience, but this is why the core team finds PyTorch 2.0 so exciting. - -## Requirements - -For GPU (newer generation GPUs will see drastically better performance) - -``` -pip3 install numpy --pre torch --force-reinstall --index-url https://download.pytorch.org/whl/nightly/cu117 -``` - -For CPU - -``` -pip3 install --pre torch --index-url https://download.pytorch.org/whl/nightly/cpu -``` - -Optional: Verify Installation - -``` -git clone https://github.com/pytorch/pytorch -cd tools/dynamo -python verify_dynamo.py -``` - -Optional: Docker installation - -We also provide all the required dependencies in the PyTorch nightly -binaries which you can download with - -``` -docker pull ghcr.io/pytorch/pytorch-nightly -``` - -And for ad hoc experiments just make sure that your container has access to all your GPUs - -``` -docker run --gpus all -it ghcr.io/pytorch/pytorch-nightly:latest /bin/bash -``` - -## Getting Started - -Please read Mark Saroufim’s [full blog post](/blog/Accelerating-Hugging-Face-and-TIMM-models/) where he walks you through a tutorial and real models for you to try PyTorch 2.0 today. - -Our goal with PyTorch was to build a breadth-first compiler that would speed up the vast majority of actual models people run in open source. The Hugging Face Hub ended up being an extremely valuable benchmarking tool for us, ensuring that any optimization we work on actually helps accelerate models people want to run. - -The blog tutorial will show you exactly how to replicate those speedups so you can be as excited as to PyTorch 2.0 as we are. 
So please try out PyTorch 2.0, enjoy the free perf, and if you're not seeing it then please open an issue and we will make sure your model is supported: [https://github.com/pytorch/torchdynamo/issues](https://github.com/pytorch/torchdynamo/issues)

After all, we can't claim we've created a breadth-first compiler unless **YOUR** models actually run faster.

## FAQs

1. **What is PT 2.0?**
2.0 is the latest PyTorch version. PyTorch 2.0 offers the same eager-mode development experience, while adding a compiled mode via torch.compile. This compiled mode has the potential to speed up your models during training and inference.

2. **Why 2.0 instead of 1.14?**
PyTorch 2.0 is what 1.14 would have been. We were releasing substantial new features that we believe meaningfully change how you use PyTorch, so we are calling it 2.0 instead.

3. **How do I install 2.0? Any additional requirements?**

   Install the latest nightlies:

   CUDA 11.8
- ``` - pip3 install numpy --pre torch torchvision torchaudio --force-reinstall --index-url https://download.pytorch.org/whl/nightly/cu118 - ``` - CUDA 11.7 - ``` - pip3 install numpy --pre torch torchvision torchaudio --force-reinstall --index-url https://download.pytorch.org/whl/nightly/cu117 - ``` - CPU - ``` - pip3 install numpy --pre torch torchvision torchaudio --force-reinstall --index-url https://download.pytorch.org/whl/nightly/cpu - ``` - -4. **Is 2.0 code backwards-compatible with 1.X?** -Yes, using 2.0 will not require you to modify your PyTorch workflows. A single line of code `model = torch.compile(model)` can optimize your model to use the 2.0 stack, and smoothly run with the rest of your PyTorch code. This is completely opt-in, and you are not required to use the new compiler. - -5. **Is 2.0 enabled by default?** -2.0 is the name of the release. torch.compile is the feature released in 2.0, and you need to explicitly use torch.compile. - -6. **How do I migrate my PT1.X code to PT2.0?** -Your code should be working as-is without the need for any migrations. If you want to use the new Compiled mode feature introduced in 2.0, then you can start by optimizing your model with one line: `model = torch.compile(model)`. -While the speedups are primarily observed during training, you can also use it for inference if your model runs faster than eager mode. - ```python - import torch - - def train(model, dataloader): - model = torch.compile(model) - for batch in dataloader: - run_epoch(model, batch) - - def infer(model, input): - model = torch.compile(model) - return model(\*\*input) - ``` - -7. **Why should I use PT2.0 instead of PT 1.X?** -See answer to Question (2). - -8. **What is my code doing differently when running PyTorch 2.0?** -Out of the box, PyTorch 2.0 is the same as PyTorch 1.x, your models run in eager-mode i.e. every line of Python is executed one after the other. -In 2.0, if you wrap your model in `model = torch.compile(model)`, your model goes through 3 steps before execution: - 1. Graph acquisition: first the model is rewritten as blocks of subgraphs. Subgraphs which can be compiled by TorchDynamo are “flattened” and the other subgraphs (which might contain control-flow code or other unsupported Python constructs) will fall back to Eager-Mode. - 2. Graph lowering: all the PyTorch operations are decomposed into their constituent kernels specific to the chosen backend. - 3. Graph compilation, where the kernels call their corresponding low-level device-specific operations. - -9. **What new components does PT2.0 add to PT?** - - **TorchDynamo** generates FX Graphs from Python bytecode. It maintains the eager-mode capabilities using [guards](https://pytorch.org/docs/stable/torch.compiler_guards_overview.html#caching-and-guards-overview) to ensure the generated graphs are valid ([read more](https://dev-discuss.pytorch.org/t/torchdynamo-an-experiment-in-dynamic-python-bytecode-transformation/361)) - - **AOTAutograd** to generate the backward graph corresponding to the forward graph captured by TorchDynamo ([read more](https://dev-discuss.pytorch.org/t/torchdynamo-update-6-training-support-with-aotautograd/570)). - - **PrimTorch** to decompose complicated PyTorch operations into simpler and more elementary ops ([read more](https://dev-discuss.pytorch.org/t/tracing-with-primitives-update-2/645)). - - **\[Backend]** Backends integrate with TorchDynamo to compile the graph into IR that can run on accelerators. 
For example, **TorchInductor** compiles the graph to either **Triton** for GPU execution or **OpenMP** for CPU execution ([read more](https://dev-discuss.pytorch.org/t/torchinductor-a-pytorch-native-compiler-with-define-by-run-ir-and-symbolic-shapes/747)). - -10. **What compiler backends does 2.0 currently support?** -The default and the most complete backend is [TorchInductor](https://github.com/pytorch/pytorch/tree/master/torch/_inductor), but TorchDynamo has a growing list of backends that can be found by calling `torchdynamo.list_backends()`. - -11. **How does distributed training work with 2.0?** -DDP and FSDP in Compiled mode can run up to 15% faster than Eager-Mode in FP32 and up to 80% faster in AMP precision. PT2.0 does some extra optimization to ensure DDP’s communication-computation overlap works well with Dynamo’s partial graph creation. Ensure you run DDP with static_graph=False. More details [here](https://dev-discuss.pytorch.org/t/torchdynamo-update-9-making-ddp-work-with-torchdynamo/860). - -12. **How can I learn more about PT2.0 developments?** -The [PyTorch Developers forum](http://dev-discuss.pytorch.org/) is the best place to learn about 2.0 components directly from the developers who build them. - -13. **Help my code is running slower with 2.0’s Compiled Mode!** -The most likely reason for performance hits is too many graph breaks. For instance, something innocuous as a print statement in your model’s forward triggers a graph break. We have ways to diagnose these - read more [here](https://pytorch.org/docs/stable/torch.compiler_faq.html#why-am-i-not-seeing-speedups). - -14. **My previously-running code is crashing with 2.0’s Compiled Mode! How do I debug it?** -Here are some techniques to triage where your code might be failing, and printing helpful logs: [https://pytorch.org/docs/stable/torch.compiler_faq.html#why-is-my-code-crashing](https://pytorch.org/docs/stable/torch.compiler_faq.html#why-is-my-code-crashing). - -## Ask the Engineers: 2.0 Live Q&A Series - -We will be hosting a series of live Q&A sessions for the community to have deeper questions and dialogue with the experts. Please check back to see the full calendar of topics throughout the year. If you are unable to attend: 1) They will be recorded for future viewing and 2) You can attend our Dev Infra Office Hours every Friday at 10 AM PST @ [https://github.com/pytorch/pytorch/wiki/Dev-Infra-Office-Hours](https://github.com/pytorch/pytorch/wiki/Dev-Infra-Office-Hours). - -Please click [here](https://pytorchconference22.splashthat.com/) to see dates, times, descriptions and links. - -Disclaimer: Please do not share your personal information, last name, company when joining the live sessions and submitting questions. - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
| TOPIC | HOST |
| --- | --- |
| The new developer experience of using 2.0 (install, setup, clone an example, run with 2.0) | Suraj Subramanian |
| PT2 Profiling and Debugging | Bert Maher |
| A deep dive on TorchInductor and PT 2.0 Backend Integration | Natalia Gimelshein, Bin Bao and Sherlock Huang |
| Extend PyTorch without C++ and functorch: JAX-like composable function transforms for PyTorch | Anjali Chourdia and Samantha Andow |
| A deep dive on TorchDynamo | Michael Voznesensky |
| Rethinking data loading with TorchData: Datapipes and Dataloader2 | Kevin Tse |
| Composable training (+ torcheval, torchsnapshot) | Ananth Subramaniam |
| How and why contribute code and tutorials to PyTorch | Zain Rizvi, Svetlana Karslioglu and Carl Parker |
| Dynamic Shapes and Calculating Maximum Batch Size | Edward Yang and Elias Ellison |
| PyTorch 2.0 Export: Sound Whole Graph Capture for PyTorch | Michael Suo and Yanan Cao |
| 2-D Parallelism using DistributedTensor and PyTorch DistributedTensor | Wanchao Liang and Alisson Gusatti Azzolini |
| TorchRec and FSDP in Production | Dennis van der Staay, Andrew Gu and Rohan Varma |
| The Future of PyTorch On-Device | Raziel Alvarez Guevara |
| TorchMultiModal | Kartikay Khandelwal |
| BetterTransformers (+ integration with Hugging Face), Model Serving and Optimizations | Hamid Shojanazeri and Mark Saroufim |
| PT2 and Distributed | Will Constable |
- -## Watch the Talks from PyTorch Conference - -- [TorchDynamo](https://www.youtube.com/watch?v=vbtGZL7IrAw) -- [TorchInductor](https://www.youtube.com/watch?v=vbtGZL7IrAw) -- [Dynamic Shapes](https://www.youtube.com/watch?v=vbtGZL7IrAw) -- [Export Path](https://www.youtube.com/watch?v=vbtGZL7IrAw) - - diff --git a/_hub b/_hub deleted file mode 160000 index 3fae0ff2f1d6..000000000000 --- a/_hub +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 3fae0ff2f1d6e4cfc4004a213c9499fa7d996f16 diff --git a/_includes/analytics.html b/_includes/analytics.html deleted file mode 100644 index fb09ac21df90..000000000000 --- a/_includes/analytics.html +++ /dev/null @@ -1,13 +0,0 @@ - - - diff --git a/_includes/blog_modal.html b/_includes/blog_modal.html deleted file mode 100644 index eea315a4b91e..000000000000 --- a/_includes/blog_modal.html +++ /dev/null @@ -1,18 +0,0 @@ - - -
- -
- - diff --git a/_includes/community_events.html b/_includes/community_events.html deleted file mode 100644 index c9d97e6c0d09..000000000000 --- a/_includes/community_events.html +++ /dev/null @@ -1,10 +0,0 @@ - diff --git a/_includes/compact_hub_cards.html b/_includes/compact_hub_cards.html deleted file mode 100644 index 6abcd928d420..000000000000 --- a/_includes/compact_hub_cards.html +++ /dev/null @@ -1,29 +0,0 @@ -
-
- {% assign hub = site.hub | where: "category", include.category | sort: "order" %} - - {% for item in hub %} - - {% endfor %} -
-
diff --git a/_includes/contributor_side_nav.html b/_includes/contributor_side_nav.html deleted file mode 100644 index 39a8d3b13f00..000000000000 --- a/_includes/contributor_side_nav.html +++ /dev/null @@ -1,55 +0,0 @@ -
-
-
    - {% assign past_issues = site.past_issues | sort_natural: "date" | reverse %} - - {% for item in past_issues %} - {% assign currentdate = item.date | date: "%B %Y" %} - {% if currentdate != date %} - {% assign date = currentdate %} - {% endif %} - - - {% endfor %} -
-
-
- - - diff --git a/_includes/cookie_banner.html b/_includes/cookie_banner.html deleted file mode 100644 index 27e1820f4765..000000000000 --- a/_includes/cookie_banner.html +++ /dev/null @@ -1,6 +0,0 @@ - diff --git a/_includes/deep_learning_event_tracking.html b/_includes/deep_learning_event_tracking.html deleted file mode 100644 index 65f8f1ab4a0a..000000000000 --- a/_includes/deep_learning_event_tracking.html +++ /dev/null @@ -1,4 +0,0 @@ - - - - diff --git a/_includes/deep_learning_form.html b/_includes/deep_learning_form.html deleted file mode 100644 index d5b5baa76030..000000000000 --- a/_includes/deep_learning_form.html +++ /dev/null @@ -1,22 +0,0 @@ -
- -
- - diff --git a/_includes/ecosystem_form.html b/_includes/ecosystem_form.html deleted file mode 100644 index a816be567fc5..000000000000 --- a/_includes/ecosystem_form.html +++ /dev/null @@ -1,17 +0,0 @@ -
- - - -
\ No newline at end of file diff --git a/_includes/ecosystem_sort.html b/_includes/ecosystem_sort.html deleted file mode 100644 index 3791244bae47..000000000000 --- a/_includes/ecosystem_sort.html +++ /dev/null @@ -1,41 +0,0 @@ - - Sort - - - diff --git a/_includes/educational_courses_module.html b/_includes/educational_courses_module.html deleted file mode 100644 index 23c0bfeb9493..000000000000 --- a/_includes/educational_courses_module.html +++ /dev/null @@ -1,26 +0,0 @@ -
-
-
-
-

Educational Courses

- - See all Courses - -
-
- -
- {% assign courses = site.courses | sort: 'order' %} - - {% for course in courses limit: 4 %} - - {% endfor %} -
-
-
diff --git a/_includes/events_side_nav.html b/_includes/events_side_nav.html deleted file mode 100644 index a616af89c66d..000000000000 --- a/_includes/events_side_nav.html +++ /dev/null @@ -1,48 +0,0 @@ -
-
    - {% assign events = site.events | where: "category", "event" | sort_natural: "date" | reverse %} - - {% for item in events %} - - {% endfor %} -
-
- - - diff --git a/_includes/footer.html b/_includes/footer.html deleted file mode 100644 index a74402d61751..000000000000 --- a/_includes/footer.html +++ /dev/null @@ -1,96 +0,0 @@ -
-
-
-
-

Docs

-

Access comprehensive developer documentation for PyTorch

- View Docs -
- -
-

Tutorials

-

Get in-depth tutorials for beginners and advanced developers

- View Tutorials -
- -
-

Resources

-

Find development resources and get your questions answered

- View Resources -
-
-
-
- -
- -
- -{% include mobile_menu.html %} - -{% include footer_scripts.html %} - -{% include cookie_banner.html %} diff --git a/_includes/footer_scripts.html b/_includes/footer_scripts.html deleted file mode 100644 index 97c5ed3909f0..000000000000 --- a/_includes/footer_scripts.html +++ /dev/null @@ -1,42 +0,0 @@ - - - -{% if page.layout != "deep_learning" %} - -{% endif %} - - - - -{% if jekyll.environment == 'production' %} - - -{% endif %} diff --git a/_includes/get_started_locally.html b/_includes/get_started_locally.html deleted file mode 100644 index d68edadeb9ad..000000000000 --- a/_includes/get_started_locally.html +++ /dev/null @@ -1,12 +0,0 @@ -
-
- {% include get_started_locally_side_nav.html %} -
-
-
-
- {{ content }} -
-
-
-
diff --git a/_includes/get_started_locally_side_nav.html b/_includes/get_started_locally_side_nav.html deleted file mode 100644 index c90b6a302a6f..000000000000 --- a/_includes/get_started_locally_side_nav.html +++ /dev/null @@ -1,5 +0,0 @@ -
-

Shortcuts

-
    -
    - diff --git a/_includes/get_started_via_cloud.html b/_includes/get_started_via_cloud.html deleted file mode 100644 index ba08423cafee..000000000000 --- a/_includes/get_started_via_cloud.html +++ /dev/null @@ -1,12 +0,0 @@ -
    -
    - {% include get_started_via_cloud_side_nav.html %} -
    -
    -
    -
    - {{ content }} -
    -
    -
    -
    diff --git a/_includes/get_started_via_cloud_side_nav.html b/_includes/get_started_via_cloud_side_nav.html deleted file mode 100644 index cb088dd3097c..000000000000 --- a/_includes/get_started_via_cloud_side_nav.html +++ /dev/null @@ -1,3 +0,0 @@ -
    -
      -
      diff --git a/_includes/google_pixel.html b/_includes/google_pixel.html deleted file mode 100644 index db996dd5dc69..000000000000 --- a/_includes/google_pixel.html +++ /dev/null @@ -1 +0,0 @@ - diff --git a/_includes/head.html b/_includes/head.html deleted file mode 100644 index b86b1e202467..000000000000 --- a/_includes/head.html +++ /dev/null @@ -1,42 +0,0 @@ - - - - - - - - - - {% if page.title %} - {{ page.title }} | PyTorch - {% else %} - PyTorch - {% endif %} - - {% include open_graph_and_meta.html %} - - - - - - - - - {% if jekyll.environment == 'production' %} - {% include pixel.html %} - {% include twitter_pixel.html %} - {% endif %} - - - diff --git a/_includes/header.html b/_includes/header.html deleted file mode 100644 index 45c484e1845e..000000000000 --- a/_includes/header.html +++ /dev/null @@ -1,10 +0,0 @@ -
      -
      - Join us at PyTorch Conference in San Francisco, October 22-23. CFP open now! Learn more. -
      -
      -
      -
      - {% include nav.html %} -
      -
      diff --git a/_includes/hub_cards.html b/_includes/hub_cards.html deleted file mode 100644 index 38d367aa52b9..000000000000 --- a/_includes/hub_cards.html +++ /dev/null @@ -1,34 +0,0 @@ -
      -
      - {% assign hub = site.hub | where: "category", include.category | sort: "order" %} - - {% for item in hub %} - - {% endfor %} -
      - - -
      diff --git a/_includes/hub_developer_tags_and_cards.html b/_includes/hub_developer_tags_and_cards.html deleted file mode 100644 index 695757dfcf47..000000000000 --- a/_includes/hub_developer_tags_and_cards.html +++ /dev/null @@ -1,33 +0,0 @@ - - -
      - -{% if page.compact == true %} - - {% include compact_hub_cards.html category="developers" %} - -{% else %} - - {% include hub_cards.html category="developers" %} - -{% endif %} diff --git a/_includes/hub_icons.html b/_includes/hub_icons.html deleted file mode 100644 index 46ec4cd0c78c..000000000000 --- a/_includes/hub_icons.html +++ /dev/null @@ -1,2 +0,0 @@ - - diff --git a/_includes/hub_model_tags.html b/_includes/hub_model_tags.html deleted file mode 100644 index cadf01c94687..000000000000 --- a/_includes/hub_model_tags.html +++ /dev/null @@ -1,12 +0,0 @@ -
      - -
      diff --git a/_includes/hub_researcher_tags_and_cards.html b/_includes/hub_researcher_tags_and_cards.html deleted file mode 100644 index 375e65a8bdde..000000000000 --- a/_includes/hub_researcher_tags_and_cards.html +++ /dev/null @@ -1,33 +0,0 @@ - - -
      - -{% if page.compact == true %} - - {% include compact_hub_cards.html category="researchers" %} - -{% else %} - - {% include hub_cards.html category="researchers" %} - -{% endif %} diff --git a/_includes/hub_search.html b/_includes/hub_search.html deleted file mode 100644 index 43e227f66437..000000000000 --- a/_includes/hub_search.html +++ /dev/null @@ -1,11 +0,0 @@ -
      -
      -
      - - -
      -
      diff --git a/_includes/hub_sort.html b/_includes/hub_sort.html deleted file mode 100644 index 50a455586991..000000000000 --- a/_includes/hub_sort.html +++ /dev/null @@ -1,17 +0,0 @@ - - Sort - - - diff --git a/_includes/latest_episodes.html b/_includes/latest_episodes.html deleted file mode 100644 index ae0f982721a8..000000000000 --- a/_includes/latest_episodes.html +++ /dev/null @@ -1,36 +0,0 @@ -

      Latest Episodes

      -
      - {% assign events = site.events | where: "category", "live-stream" | sort_natural: "date" | reverse %} - {% capture now %}{{'now' | date: '%s' | plus: 0 %}}{% endcapture %} - {% for item in events | where: "type" == "live-stream" | sort: "date" %} - {% capture date %}{{item.date | date: '%s' | plus: 0 %}}{% endcapture %} - {% if date <= now %} -
      -
      - {{ item.title }} -
        -
      • {{ item.guest }}
      • -
      • {{ item.company }}
      • -
      • -
          -
        • {{ item.date | date: "%m/%d/%Y" }}
        • - | -
        • {{ item.time }}
        • -
        -
      • - -
      -
        - {% if item.poster %} -
      • Poster
      • - {% endif %} - {% if item.video %} -
      • Watch
      • - {% endif %} -
      -
      -
      - {% endif %} - {% endfor %} -
      - \ No newline at end of file diff --git a/_includes/live_event_video.html b/_includes/live_event_video.html deleted file mode 100644 index 23a40b1ee006..000000000000 --- a/_includes/live_event_video.html +++ /dev/null @@ -1,12 +0,0 @@ -
      -
      -
      - {{item.video}} -
      -
      -

      {{ item.title }}

      -

      {{ item.summary }}

      - CTA to Video -
      -
      -
      diff --git a/_includes/live_events.html b/_includes/live_events.html deleted file mode 100644 index e506c78bf17c..000000000000 --- a/_includes/live_events.html +++ /dev/null @@ -1,16 +0,0 @@ -
      -
      - {% include events_side_nav.html %} -
      -
      -
      -
      - {% assign events = site.events | where: "category", "event" | sort_natural: "date" | reverse %} - {% capture now %}{{'now' | date: '%s' | plus: 0 %}}{% endcapture %} - {% capture date %}{{item.date | date: '%s' | plus: 0 %}}{% endcapture %} - {% include upcoming-live-events.html %} - {% include past-live-events.html %} -
      -
      -
      -
      diff --git a/_includes/live_stream.html b/_includes/live_stream.html deleted file mode 100644 index d372e63ec7d9..000000000000 --- a/_includes/live_stream.html +++ /dev/null @@ -1,13 +0,0 @@ - diff --git a/_includes/main_menu.html b/_includes/main_menu.html deleted file mode 100644 index 46cc727fedf5..000000000000 --- a/_includes/main_menu.html +++ /dev/null @@ -1,197 +0,0 @@ - - - diff --git a/_includes/march_2021.md b/_includes/march_2021.md deleted file mode 100644 index e33189a9f2c8..000000000000 --- a/_includes/march_2021.md +++ /dev/null @@ -1,39 +0,0 @@ - -# Issue \#1 - -Welcome to the first issue of the PyTorch Contributors newsletter! Keeping track of everything that’s happening in the PyTorch developer world is a big task; here you will find curated news including RFCs, feature roadmaps, notable PRs, editorials from developers, and more. If you have questions or suggestions for the newsletter, we'd love to [hear from you](https://forms.gle/2KApHZa3oDHuAQ288) - -## PyTorch 1.8.0 - -PyTorch 1.8 was released on March 4th with support for functional transformations using `torch.fx`, stabilized frontend APIs for scientific computing (`torch.fft`, `torch.linalg`, Autograd for complex tensors) and significant improvements to distributed training. Read the full [Release Notes](https://github.com/pytorch/pytorch/releases/tag/v1.8.0){:target="_blank"}. - -## PyTorch Ecosystem Day - -On April 21, we’re hosting a virtual event for our ecosystem and industry communities to showcase their work and discover new opportunities to collaborate. The day will be filled with discussion on new developments, trends, challenges and best practices through posters, breakout sessions and networking. - -## [The PyTorch open source process](http://blog.ezyang.com/2021/01/pytorch-open-source-process/){:target="_blank"} - -[@ezyang](https://github.com/ezyang){:target="_blank"} describes the challenges of maintaining a PyTorch-scale project, and the current open source processes (triaging and CI oncalls, RFC discussions) to help PyTorch operate effectively. - -## Developers forum - -We launched https://dev-discuss.pytorch.org/ a low-traffic high-signal forum for long-form discussions about PyTorch internals. - -## [RFC] [Dataloader v2](https://github.com/pytorch/pytorch/issues/49440) - -[@VitalyFedyunin](https://github.com/VitalyFedyunin) proposes redesigning the DataLoader to support lazy loading, sharding, pipelining data operations (including async) and shuffling & sampling in a more modular way. Join the discussion [here](https://github.com/pytorch/pytorch/issues/49440). - -## [RFC] [Improving TorchScript Usability](https://dev-discuss.pytorch.org/t/torchscript-usability/55) - -In a series of 3 blog posts ([1](https://lernapparat.de/scripttorch/), [2](https://lernapparat.de/jit-python-graphops/), [3](https://lernapparat.de/jit-fallback/)) [@t-vi](https://github.com/t-vi) explores ideas to improve the user and developer experience of TorchScript. 
- -## [RFC] [CSR and DM storage formats for sparse tensors](https://github.com/pytorch/rfcs/pull/13) - -[@pearu](https://github.com/pearu) proposes an [RFC](https://github.com/pytorch/rfcs/pull/13) to make linear algebra operations more performant by - -- implementing the CSR storage format, where a 2D array is defined by shape and 1D tensors for compressed row indices, column indices, and values (PyTorch 1D tensor) -- introducing the Dimension Mapping storage format that generalizes a 2D CSR to multidimensional arrays using a bijective mapping between the storage and wrapper elements. - -## [RFC] [Forward Mode AD](https://github.com/pytorch/rfcs/pull/11) - -[@albanD](https://github.com/albanD) proposes an [RFC](https://github.com/pytorch/rfcs/pull/11) to implement forward mode autodiff using Tensor-based [dual numbers](https://blog.demofox.org/2014/12/30/dual-numbers-automatic-differentiation/), where the real part represents the tensor and the *dual* part stores the forward gradient of the tensor. The core of the feature has landed [(PR)](https://github.com/pytorch/pytorch/pull/49734), with more formulas in WIP. Complete forward mode AD is expected to land by July 2021. diff --git a/_includes/mobile_menu.html b/_includes/mobile_menu.html deleted file mode 100644 index 70e11e57ec2a..000000000000 --- a/_includes/mobile_menu.html +++ /dev/null @@ -1,154 +0,0 @@ -
      -
      -
      -
      - - -
      -
      -
      - - -
      diff --git a/_includes/mobile_page_side_nav.html b/_includes/mobile_page_side_nav.html deleted file mode 100644 index ead73160240e..000000000000 --- a/_includes/mobile_page_side_nav.html +++ /dev/null @@ -1,5 +0,0 @@ -
      -

      Shortcuts

      -
        -
        - diff --git a/_includes/nav.html b/_includes/nav.html deleted file mode 100644 index d6291beb5b90..000000000000 --- a/_includes/nav.html +++ /dev/null @@ -1,9 +0,0 @@ -{% assign current = page.url | downcase | remove: ".html" | split: '/' %} - -
        - - - {% include main_menu.html %} - - -
        diff --git a/_includes/news_banner_info.html b/_includes/news_banner_info.html deleted file mode 100644 index 13d92d18516f..000000000000 --- a/_includes/news_banner_info.html +++ /dev/null @@ -1,33 +0,0 @@ -{% assign news_collection_size = site.news.size %} - -{% if news_collection_size == 1 %} - - - -{% elsif news_item.order == news_collection_size %} - - - - - -{% elsif news_item.order > 1 %} - - - - - -{% else %} - - - - - -{% endif %} diff --git a/_includes/newsletter_subscribe_form.html b/_includes/newsletter_subscribe_form.html deleted file mode 100644 index 2555d16239f9..000000000000 --- a/_includes/newsletter_subscribe_form.html +++ /dev/null @@ -1,28 +0,0 @@ -
        - -
        diff --git a/_includes/open_graph_and_meta.html b/_includes/open_graph_and_meta.html deleted file mode 100644 index 8feeeeac8c24..000000000000 --- a/_includes/open_graph_and_meta.html +++ /dev/null @@ -1,18 +0,0 @@ - - - -{% if page.featured-img %} - - -{% else %} - - -{% endif %} - - - - - - - - diff --git a/_includes/pagination_buttons.html b/_includes/pagination_buttons.html deleted file mode 100644 index bb32f85da452..000000000000 --- a/_includes/pagination_buttons.html +++ /dev/null @@ -1,30 +0,0 @@ -
        -
          - {% if paginator.previous_page %} -
        • - {% if page.title contains 'Blog' %} - Previous - {% else %} - Previous - {% endif %} -
        • - {% else %} -
        • - Previous -
        • - {% endif %} - - {% if paginator.next_page %} - {% if page.title contains 'Blog' %} -
        • Next - {% else %} -
        • Next - {% endif %} -
        • - {% else %} -
        • - Next -
        • - {% endif %} -
        -
        diff --git a/_includes/past-live-events.html b/_includes/past-live-events.html deleted file mode 100644 index bc3a8ab03eab..000000000000 --- a/_includes/past-live-events.html +++ /dev/null @@ -1,22 +0,0 @@ -

        Past Events

        -{% assign events = site.events | where: "category", "event" | sort_natural: "date" | reverse %} -{% capture now %}{{'now' | date: '%s' | plus: 0 %}}{% endcapture %} -{% for item in events %} - {% capture date %}{{item.date | date: '%s' | plus: 0 %}}{% endcapture %} - {% if date <= now %} -
        - {% assign events = site.events %} - {% capture date %}{{item.date | date: '%s' | plus: 0 %}}{% endcapture %} -
        -

        {{ item.title }}

        - {% if item.header-image And item.header-image != "" And item.header-image != nil %} - - {% endif %} -

        {{ item.content | markdownify }}

        - {% if item.video %} - {% include live_event_video.html %} - {% endif %} -
        -
        - {% endif %} -{% endfor %} diff --git a/_includes/past_issues.html b/_includes/past_issues.html deleted file mode 100644 index e3d20d5d7795..000000000000 --- a/_includes/past_issues.html +++ /dev/null @@ -1,17 +0,0 @@ - diff --git a/_includes/pixel.html b/_includes/pixel.html deleted file mode 100644 index 258a3a209ab7..000000000000 --- a/_includes/pixel.html +++ /dev/null @@ -1,18 +0,0 @@ - - - diff --git a/_includes/podcast.html b/_includes/podcast.html deleted file mode 100644 index 20f63324749e..000000000000 --- a/_includes/podcast.html +++ /dev/null @@ -1,26 +0,0 @@ - diff --git a/_includes/production.html b/_includes/production.html deleted file mode 100644 index f38e5cd7d63f..000000000000 --- a/_includes/production.html +++ /dev/null @@ -1,505 +0,0 @@ -
        -
        -
        - {% include production_side_nav.html %} -
        -
        -
        -
        - - - - - - - - - - - - - - - - -
        -
        -
        -
        -
        diff --git a/_includes/production_side_nav.html b/_includes/production_side_nav.html deleted file mode 100644 index dd0af6c8a834..000000000000 --- a/_includes/production_side_nav.html +++ /dev/null @@ -1,61 +0,0 @@ - - - - - diff --git a/_includes/pytorch-side-nav.html b/_includes/pytorch-side-nav.html deleted file mode 100644 index 5d4cbf1c2417..000000000000 --- a/_includes/pytorch-side-nav.html +++ /dev/null @@ -1,59 +0,0 @@ - - \ No newline at end of file diff --git a/_includes/pytorch.html b/_includes/pytorch.html deleted file mode 100644 index d2b93cfe1257..000000000000 --- a/_includes/pytorch.html +++ /dev/null @@ -1,9 +0,0 @@ -
        -
        {% include pytorch-side-nav.html %}
        -
        -
        -
        {{ content }}
        -
        -
        -
        - \ No newline at end of file diff --git a/_includes/quick-start-module.js b/_includes/quick-start-module.js deleted file mode 100644 index 345c1d0434bb..000000000000 --- a/_includes/quick-start-module.js +++ /dev/null @@ -1,284 +0,0 @@ -// Keys are Substrings as diplayed by navigator.platform -var supportedOperatingSystems = new Map([ - ['linux', 'linux'], - ['mac', 'macos'], - ['win', 'windows'], -]); - -var archInfoMap = new Map([ - ['cuda', {title: "CUDA", platforms: new Set(['linux', 'windows'])}], - ['rocm', {title: "ROCm", platforms: new Set(['linux'])}], - ['accnone', {title: "CPU", platforms: new Set(['linux', 'macos', 'windows'])}] -]); - -let version_map={{ ACC ARCH MAP }} -let stable_version={{ VERSION }}; - -var default_selected_os = getAnchorSelectedOS() || getDefaultSelectedOS(); -var opts = { - cuda: getPreferredCuda(default_selected_os), - os: default_selected_os, - pm: 'pip', - language: 'python', - ptbuild: 'stable', -}; - -var supportedCloudPlatforms = [ - 'aws', - 'google-cloud', - 'microsoft-azure', - 'lightning-studios', -]; - -var os = $(".os > .option"); -var package = $(".package > .option"); -var language = $(".language > .option"); -var cuda = $(".cuda > .option"); -var ptbuild = $(".ptbuild > .option"); - -os.on("click", function() { - selectedOption(os, this, "os"); -}); -package.on("click", function() { - selectedOption(package, this, "pm"); -}); -language.on("click", function() { - selectedOption(language, this, "language"); -}); -cuda.on("click", function() { - selectedOption(cuda, this, "cuda"); -}); -ptbuild.on("click", function() { - selectedOption(ptbuild, this, "ptbuild") -}); - -// Pre-select user's operating system -$(function() { - var userOsOption = document.getElementById(opts.os); - var userCudaOption = document.getElementById(opts.cuda); - if (userOsOption) { - $(userOsOption).trigger("click"); - } - if (userCudaOption) { - $(userCudaOption).trigger("click"); - } -}); - - -// determine os (mac, linux, windows) based on user's platform -function getDefaultSelectedOS() { - var platform = navigator.platform.toLowerCase(); - for (var [navPlatformSubstring, os] of supportedOperatingSystems.entries()) { - if (platform.indexOf(navPlatformSubstring) !== -1) { - return os; - } - } - // Just return something if user platform is not in our supported map - return supportedOperatingSystems.values().next().value; -} - -// determine os based on location hash -function getAnchorSelectedOS() { - var anchor = location.hash; - var ANCHOR_REGEX = /^#[^ ]+$/; - // Look for anchor in the href - if (!ANCHOR_REGEX.test(anchor)) { - return false; - } - // Look for anchor with OS in the first portion - var testOS = anchor.slice(1).split("-")[0]; - for (var [navPlatformSubstring, os] of supportedOperatingSystems.entries()) { - if (testOS.indexOf(navPlatformSubstring) !== -1) { - return os; - } - } - return false; -} - -// determine CUDA version based on OS -function getPreferredCuda(os) { - // Only CPU builds are currently available for MacOS - if (os == 'macos') { - return 'accnone'; - } - return 'cuda.x'; -} - -// Disable compute platform not supported on OS -function disableUnsupportedPlatforms(os) { - - if(opts.ptbuild == "preview") - archMap = version_map.nightly - else - archMap = version_map.release - - for (const [arch_key, info] of archInfoMap) { - var elems = document.querySelectorAll('[id^="'+arch_key+'"]'); - if (elems == null) { - console.log("Failed to find element for architecture " + arch_key); - return; - } - for (var i=0; i < elems.length;i++) { 
- var supported = info.platforms.has(os); - elems[i].style.textDecoration = supported ? "" : "line-through"; - - // Officially supported arch but not available - if(!archMap[elems[i].id]) { - elems[i].style.textDecoration = "line-through"; - } - } - } -} - -// Change compute versions depending on build type -function changeVersion(ptbuild) { - - if(ptbuild == "preview") - archMap = version_map.nightly - else - archMap = version_map.release - - for (const [arch_key, info] of archInfoMap) { - var elems = document.querySelectorAll('[id^="'+arch_key+'"]'); - for (var i=0; i < elems.length;i++) { - if(archMap[elems[i].id]) { - elems[i].style.textDecoration = ""; - elems[i].children[0].textContent = info.title + " " + archMap[elems[i].id][1] - } else { - elems[i].style.textDecoration = "line-through"; - } - } - } - var stable_element = document.getElementById("stable"); - stable_element.children[0].textContent = stable_version; -} - - - -// Change accnone name depending on OS type -function changeAccNoneName(osname) { - var accnone_element = document.getElementById("accnone"); - if (accnone_element == null) { - console.log("Failed to find accnone element"); - return; - } - if (osname == "macos") { - accnone_element.children[0].textContent = "Default"; - } else { - accnone_element.children[0].textContent = "CPU"; - } -} - -function selectedOption(option, selection, category) { - $(option).removeClass("selected"); - $(selection).addClass("selected"); - opts[category] = selection.id; - if (category === "pm") { - var elements = document.getElementsByClassName("language")[0].children; - if (selection.id !== "libtorch" && elements["cplusplus"].classList.contains("selected")) { - $(elements["cplusplus"]).removeClass("selected"); - $(elements["python"]).addClass("selected"); - opts["language"] = "python"; - } else if (selection.id == "libtorch") { - for (var i = 0; i < elements.length; i++) { - if (elements[i].id === "cplusplus") { - $(elements[i]).addClass("selected"); - opts["language"] = "cplusplus"; - } else { - $(elements[i]).removeClass("selected"); - } - } - } - } else if (category === "language") { - var elements = document.getElementsByClassName("package")[0].children; - if (selection.id !== "cplusplus" && elements["libtorch"].classList.contains("selected")) { - $(elements["libtorch"]).removeClass("selected"); - $(elements["pip"]).addClass("selected"); - opts["pm"] = "pip"; - } else if (selection.id == "cplusplus") { - for (var i = 0; i < elements.length; i++) { - if (elements[i].id === "libtorch") { - $(elements[i]).addClass("selected"); - opts["pm"] = "libtorch"; - } else { - $(elements[i]).removeClass("selected"); - } - } - } - } else if (category == "ptbuild") { - changeVersion(opts.ptbuild); - //make sure unsupported platforms are disabled - disableUnsupportedPlatforms(opts.os); - } - commandMessage(buildMatcher()); - if (category === "os") { - disableUnsupportedPlatforms(opts.os); - display(opts.os, 'installation', 'os'); - } - changeAccNoneName(opts.os); -} - -function display(selection, id, category) { - var container = document.getElementById(id); - // Check if there's a container to display the selection - if (container === null) { - return; - } - var elements = container.getElementsByClassName(category); - for (var i = 0; i < elements.length; i++) { - if (elements[i].classList.contains(selection)) { - $(elements[i]).addClass("selected"); - } else { - $(elements[i]).removeClass("selected"); - } - } -} - -function buildMatcher() { - return ( - opts.ptbuild.toLowerCase() + - "," + - 
opts.pm.toLowerCase() + - "," + - opts.os.toLowerCase() + - "," + - opts.cuda.toLowerCase() + - "," + - opts.language.toLowerCase() - ); -} - -// Cloud Partners sub-menu toggle listeners -$("[data-toggle='cloud-dropdown']").on("click", function(e) { - if ($(this).hasClass("open")) { - $(this).removeClass("open"); - // If you deselect a current drop-down item, don't display it's info any longer - display(null, 'cloud', 'platform'); - } else { - $("[data-toggle='cloud-dropdown'].open").removeClass("open"); - $(this).addClass("open"); - var cls = $(this).find(".cloud-option-body")[0].className; - for (var i = 0; i < supportedCloudPlatforms.length; i++) { - if (cls.includes(supportedCloudPlatforms[i])) { - display(supportedCloudPlatforms[i], 'cloud', 'platform'); - } - } - } -}); - -function commandMessage(key) { - var object = {{ installMatrix }}; - - if (!object.hasOwnProperty(key)) { - $("#command").html( - "
         # Follow instructions at this URL: https://github.com/pytorch/pytorch#from-source 
        " - ); - } else if (key.indexOf("lts") == 0 && key.indexOf('rocm') < 0) { - $("#command").html("
        " + object[key] + "
        "); - } else { - $("#command").html("
        " + object[key] + "
        "); - } -} - -// Set cuda version right away -changeVersion("stable") diff --git a/_includes/quick_start_cloud_options.html b/_includes/quick_start_cloud_options.html deleted file mode 100644 index 5951f7b71002..000000000000 --- a/_includes/quick_start_cloud_options.html +++ /dev/null @@ -1,58 +0,0 @@ -
        - - -
        -
        -
        - Google Cloud Platform -
        - - - - - -
        -
        - -
        -
        -
        -

        Microsoft Azure

        -
        - - -
        -
        - -
        -
        -
        - Lightning Studios -
        - -
        -
        -
        diff --git a/_includes/quick_start_local.html b/_includes/quick_start_local.html deleted file mode 100644 index 81bd69fbf1d4..000000000000 --- a/_includes/quick_start_local.html +++ /dev/null @@ -1,112 +0,0 @@ -

        Select your preferences and run the install command. Stable represents the most currently tested and supported version of PyTorch. This should - be suitable for many users. Preview is available if you want the latest, not fully tested and supported, builds that are generated nightly. - Please ensure that you have met the prerequisites below (e.g., numpy), depending on your package manager. You can also - install previous versions of PyTorch. Note that LibTorch is only available for C++. -

        - -

        NOTE: Latest PyTorch requires Python 3.9 or later.

        - -
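Once the generated command has finished, a quick way to confirm the installation (a minimal check, not part of the selector itself) is:

```python
# Sanity-check the install: print the version and whether a CUDA device is visible.
import torch

print(torch.__version__)          # installed PyTorch version
print(torch.cuda.is_available())  # True only for CUDA builds with a visible GPU
print(torch.rand(2, 3) @ torch.rand(3, 2))  # simple tensor op to confirm things work
```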
        -
        -
        -
        PyTorch Build
        -
        -
        -
        Your OS
        -
        -
        -
        Package
        -
        -
        -
        Language
        -
        -
        -
        Compute Platform
        -
        -
        -
        Run this Command:
        -
        -
        - -
        -
        -
        -
        PyTorch Build
        -
        -
        -
        Stable (1.13.0)
        -
        -
        -
        Preview (Nightly)
        -
        -
        -
        -
        -
        Your OS
        -
        -
        -
        Linux
        -
        -
        -
        Mac
        -
        -
        -
        Windows
        -
        -
        -
        -
        -
        Package
        -
        -
        -
        Pip
        -
        -
        -
        LibTorch
        -
        -
        -
        Source
        -
        -
        -
        -
        -
        Language
        -
        -
        -
        Python
        -
        -
        -
        C++ / Java
        -
        -
        -
        -
        -
        Compute Platform
        -
        -
        -
        CUDA 11.8
        -
        -
        -
        CUDA 12.1
        -
        -
        -
        CUDA 12.4
        -
        -
        -
        ROCm 5.2
        -
        -
        -
        CPU
        -
        -
        -
        -
        -
        Run this Command:
        -
        -
        -
        pip install torch torchvision
        -
        -
        -
        -
        -
        diff --git a/_includes/quick_start_module.html b/_includes/quick_start_module.html deleted file mode 100644 index 7fabcb5c55f3..000000000000 --- a/_includes/quick_start_module.html +++ /dev/null @@ -1,26 +0,0 @@ -
        -
        -
        -
        - -

        Install PyTorch

        - - {% include quick_start_local.html %} - - - Previous versions of PyTorch - -
        - -
        -

        Quick Start With
        Cloud Partners

        - -

        Get up and running with PyTorch quickly through popular cloud platforms and machine learning services.

        - - {% include quick_start_cloud_options.html %} -
        -
        -
        -
        - - diff --git a/_includes/research.html b/_includes/research.html deleted file mode 100644 index 220864e26d9a..000000000000 --- a/_includes/research.html +++ /dev/null @@ -1,186 +0,0 @@ - diff --git a/_includes/research_side_nav.html b/_includes/research_side_nav.html deleted file mode 100644 index a76229dc104e..000000000000 --- a/_includes/research_side_nav.html +++ /dev/null @@ -1,65 +0,0 @@ - - - - diff --git a/_includes/sample_code_block.html b/_includes/sample_code_block.html deleted file mode 100644 index ffd0b1e03039..000000000000 --- a/_includes/sample_code_block.html +++ /dev/null @@ -1,12 +0,0 @@ -{% highlight python %} -#!/usr/bin/python3 - -# Simple while loop -a = 0 -while a < 15: - print(a, end=' ') - if a == 10: - print("made it to ten!!") - a = a + 1 -print() -{% endhighlight %} diff --git a/_includes/similar_posts_module.html b/_includes/similar_posts_module.html deleted file mode 100644 index 8c227730aa3e..000000000000 --- a/_includes/similar_posts_module.html +++ /dev/null @@ -1,40 +0,0 @@ -{% assign current_category = page.category.first %} - -{% if current_category != nil %} - -
        -
        - - - -
        - {% for category in current_category %} - {% for post in site.categories[category] limit: 3 %} - {% if post.url != page.url %} - -
        -

        {{ post.date | date: '%B %d, %Y' }}

        -

        - {{ post.title }} -

        -

{{ post.excerpt | remove: '<p>' | remove: '</p>' | truncate: 150 }}

        -
        - {% endif %} - {% endfor %} - {% endfor %} -
        -
        -
        - -{% endif %} diff --git a/_includes/svgs/pytorch-language.svg b/_includes/svgs/pytorch-language.svg deleted file mode 100644 index da71062e1374..000000000000 --- a/_includes/svgs/pytorch-language.svg +++ /dev/null @@ -1,30 +0,0 @@ - - - - Group 5 - Created with Sketch. - - - - - - - - - - - - - - - - - - - - - - - - - diff --git a/_includes/tag_manager_script.html b/_includes/tag_manager_script.html deleted file mode 100644 index 1ba3f99cdea9..000000000000 --- a/_includes/tag_manager_script.html +++ /dev/null @@ -1,4 +0,0 @@ - - - diff --git a/_includes/twitter_pixel.html b/_includes/twitter_pixel.html deleted file mode 100644 index 3633c20fe253..000000000000 --- a/_includes/twitter_pixel.html +++ /dev/null @@ -1,4 +0,0 @@ - - - - diff --git a/_includes/upcoming-live-events.html b/_includes/upcoming-live-events.html deleted file mode 100644 index d0d0bd02a865..000000000000 --- a/_includes/upcoming-live-events.html +++ /dev/null @@ -1,22 +0,0 @@ -

        Upcoming Events

        -{% assign events = site.events | where: "category", "event" | sort_natural: "date" %} -{% capture now %}{{'now' | date: '%s' | plus: 0 %}}{% endcapture %} -{% for item in events %} - {% capture date %}{{item.date | date: '%s' | plus: 0 %}}{% endcapture %} - {% if date >= now %} -
        - {% assign events = site.events %} - {% capture date %}{{item.date | date: '%s' | plus: 0 %}}{% endcapture %} -
        -

        {{ item.title }}

- {% if item.header-image and item.header-image != "" and item.header-image != nil %} - - {% endif %} -

        {{ item.content | markdownify }}

        - {% if item.video %} - {% include live_event_video.html %} - {% endif %} -
        -
        - {% endif %} -{% endfor %} diff --git a/_includes/upcoming_episodes.html b/_includes/upcoming_episodes.html deleted file mode 100644 index 6a2fc529d8e2..000000000000 --- a/_includes/upcoming_episodes.html +++ /dev/null @@ -1,30 +0,0 @@ - diff --git a/_layouts/blog.html b/_layouts/blog.html deleted file mode 100644 index ca0613176c02..000000000000 --- a/_layouts/blog.html +++ /dev/null @@ -1,71 +0,0 @@ - - -{% include head.html %} - - - {% include tag_manager_script.html %} - {% include header.html %} - -
        - - {% assign posts = paginator.posts %} - {% assign display_post_categories = site.posts | map: 'categories' | join: ',' | replace: '-', ' ' | split: ',' | uniq | sort %} - {% assign current_page = page.url | downcase | remove: ".html" | split: '/' %} - {% assign post_categories = site.posts | map: 'categories' | join: ',' | split: ',' | uniq | sort %} - -
        -
        - {% for post in posts limit:1 %} -

        Featured Post

        -

        - {{ post.title }} -

        - - - - Read More - - - {% endfor %} -
        -
        - -
        -
        -
        -
        - - {% for post in posts %} - - {% if forloop.first %} - {% continue %} - {% endif %} - -
        -
        -

        {{ post.date | date: '%B %d, %Y' }}

        -

        - {{ post.title }} -

        -

        {{ post.excerpt | strip_html | truncate: 500}}

        - -
        - - Read More - -
        - {% endfor %} -
        - - {% include pagination_buttons.html %} -
        -
        -
        - - {% include quick_start_module.html %} - - {% include footer.html %} - - - - diff --git a/_layouts/blog_detail.html b/_layouts/blog_detail.html deleted file mode 100644 index eb80011a163b..000000000000 --- a/_layouts/blog_detail.html +++ /dev/null @@ -1,52 +0,0 @@ - - -{% include head.html %} - - - {% include tag_manager_script.html %} -
        -
        -
        - Join us at PyTorch Conference in San Francisco, October 22-23. CFP open now! Learn more. -
        -
        - -
        -
        - {% include nav.html %} -
        -
        - -
        -
        -

        {{ page.date | date: '%B %d, %Y' }}

        -

        - {{ page.title }} -

        -
        -
        - -
        -
        -
        - -
        -

        - by - {% if page.author %} - {{ page.author }} - {% else %} - {{ site.default_author }} - {% endif %} -

        - {{ content }} -
        -
        -
        -
        - - - {% include footer.html %} - - - diff --git a/_layouts/deep_learning.html b/_layouts/deep_learning.html deleted file mode 100644 index bb9680d24a22..000000000000 --- a/_layouts/deep_learning.html +++ /dev/null @@ -1,52 +0,0 @@ - - - {% include head.html %} -
        -
        -
        - -
        -
        -
        - - - {% include tag_manager_script.html %} -
        - -
        -
        -
        -
        -

        Deep Learning
        with PyTorch

        - {% if page.deep-learning-landing == true %} -

        Download a free copy of the full book and learn how to get started with AI / ML development using PyTorch

        - {% else %} -

        Thanks for requesting a copy of the Deep Learning with PyTorch book! - Click here to download the book. -

        - {% endif %} -
        -
        - -
        -
        -
        -
        - -
        -
        -
        - - {{ content }} - -
        -
        -
        - - {% include footer.html %} - - {% if page.deep-learning-landing == false and jekyll.environment == 'production' %} - {% include deep_learning_event_tracking.html %} - {% endif %} - - diff --git a/_layouts/default.html b/_layouts/default.html deleted file mode 100644 index a652d10f1b78..000000000000 --- a/_layouts/default.html +++ /dev/null @@ -1,14 +0,0 @@ - - - {% include head.html %} - - {% include tag_manager_script.html %} - {% include header.html %} - -
        - - {{ content }} - - {% include footer.html %} - - diff --git a/_layouts/docs_redirect.html b/_layouts/docs_redirect.html deleted file mode 100755 index bbf46b1847cc..000000000000 --- a/_layouts/docs_redirect.html +++ /dev/null @@ -1,17 +0,0 @@ - - - - - - - Page Redirection - - - {% include tag_manager_script.html %} - If you are not redirected automatically, follow this link to the latest documentation. -
        - If you want to view documentation for a particular version, follow this link. - - diff --git a/_layouts/ecosystem_detail.html b/_layouts/ecosystem_detail.html deleted file mode 100644 index 295718b58280..000000000000 --- a/_layouts/ecosystem_detail.html +++ /dev/null @@ -1,69 +0,0 @@ - - - {% include head.html %} - - {% include tag_manager_script.html %} - {% include header.html %} - -
        - -
        -
        -

        {{ page.title }}

        - - {% include svgs/pytorch-language.svg %} - -

        {{ page.summary }}

        - - - {{ page.call-to-action }} - -
        -
        - -
        -
        -
        -
        - {{ content }} -
        -
        - -
        -
        -
        -
        -

        Similar Projects

        - - - See all Projects - -
        -
        - -
        - {% for item in site.ecosystem limit:3 %} - - {% endfor %} -
        -
        -
        -
        -
        - - {% include footer.html %} - - diff --git a/_layouts/general.html b/_layouts/general.html deleted file mode 100644 index 0a873f21244d..000000000000 --- a/_layouts/general.html +++ /dev/null @@ -1,25 +0,0 @@ - - -{% include head.html %} - - - {% include tag_manager_script.html %} -
        -
        - {% include nav.html %} -
        -
        - -
        -
        -
        -
        - {{ content }} -
        -
        -
        -
        - - {% include footer.html %} - - diff --git a/_layouts/get_started.html b/_layouts/get_started.html deleted file mode 100644 index 66c887cf42e4..000000000000 --- a/_layouts/get_started.html +++ /dev/null @@ -1,61 +0,0 @@ - - - {% include head.html %} - - {% include tag_manager_script.html %} {% include header.html %} - -
        - - {% assign items = site.get_started | where: "published",true | sort: "order" - %} - -
        -
        -

        Get Started

        - -

        - Select preferences and run the command to install PyTorch locally, or - get started quickly with one of the supported cloud platforms. -

        -
        -
        - -
        -
        - - -
        - {% if page.get-started-locally == true %} {% include - get_started_locally.html %} {% elsif page.order == 2 %} {% include - pytorch.html %} {% elsif page.get-started-via-cloud == true %} {% - include get_started_via_cloud.html %} {% else %} - -
        -
        {{ content }}
        -
        - - {% endif %} -
        -
        -
        - - {% include footer.html %} - - diff --git a/_layouts/hub_detail.html b/_layouts/hub_detail.html deleted file mode 100644 index 0adaa77d299a..000000000000 --- a/_layouts/hub_detail.html +++ /dev/null @@ -1,69 +0,0 @@ - - - {% include head.html %} - - {% include tag_manager_script.html %} - {% include header.html %} - -
        - -
        -
        - - {% if page.category == 'researchers' %} - < - {% else %} - < - {% endif %} - -

        - {{ page.title }} -

        - -
        -
        -

        By {{ page.author }}

        -
        - -
        -

        {{ page.summary }}

        -
        - - - {% if page.demo-model-link %} - {% if page.demo-model-button-text == blank or page.demo-model-button-text == nil %} - - {% else %} - - {% endif %} - {% endif %} -
        -
        -
        -
        -
        - -
        -
        -
        -
        -
        - - -
        -
        - -
        -
        -
        -
        -
        - - {% include footer.html %} - - - - diff --git a/_layouts/hub_index.html b/_layouts/hub_index.html deleted file mode 100644 index 989bd0fa94ed..000000000000 --- a/_layouts/hub_index.html +++ /dev/null @@ -1,51 +0,0 @@ - - - {% include head.html %} - - {% include tag_manager_script.html %} - {% include header.html %} - -
        - -
        -
        -

        - PyTorch Hub
        - {{ page.title }} -

        - -

        {{ page.summary }}

        -
        -
        - -
        -
        -
        - -
        -
        - {{content}} -
        -
        -
        -
        -
        - - {% include footer.html %} - - - - - - - -{% if page.compact == true %} - -
        - -{% else %} -
        - -{% endif %} - - diff --git a/_layouts/marketo_email.html b/_layouts/marketo_email.html deleted file mode 100644 index 0a873f21244d..000000000000 --- a/_layouts/marketo_email.html +++ /dev/null @@ -1,25 +0,0 @@ - - -{% include head.html %} - - - {% include tag_manager_script.html %} -
        -
        - {% include nav.html %} -
        -
        - -
        -
        -
        -
        - {{ content }} -
        -
        -
        -
        - - {% include footer.html %} - - diff --git a/_layouts/mobile.html b/_layouts/mobile.html deleted file mode 100644 index 840a42b5f479..000000000000 --- a/_layouts/mobile.html +++ /dev/null @@ -1,57 +0,0 @@ - - - {% include head.html %} - - {% include tag_manager_script.html %} - {% include header.html %} - -
        - - {% assign mobile_items = site.mobile | where: "published",true | sort: "order" %} - -
        -
        -

        PyTorch Mobile

        - -

        End-to-end workflow from Training to Deployment for iOS and Android mobile devices

        -
        -
        - -
        -
        - - -
        -
        -
        - {% include mobile_page_side_nav.html %} -
        -
        -
        -
        - {{ content }} -
        -
        -
        -
        -
        -
        -
        - - {% include footer.html %} - - - - diff --git a/_mobile/android.md b/_mobile/android.md deleted file mode 100644 index 0acaeb333138..000000000000 --- a/_mobile/android.md +++ /dev/null @@ -1,431 +0,0 @@ ---- -layout: mobile -title: Android -permalink: /mobile/android/ -background-class: mobile-background -body-class: mobile -order: 3 -published: true ---- - -
        -

        Note

        -

        PyTorch Mobile is no longer actively supported. Please check out ExecuTorch, PyTorch’s all-new on-device inference library. You can also review this page to learn more about how to use ExecuTorch to build an Android app.

        -
        - -# Android - -## Quickstart with a HelloWorld Example - -[HelloWorld](https://github.com/pytorch/android-demo-app/tree/master/HelloWorldApp) is a simple image classification application that demonstrates how to use PyTorch Android API. -This application runs TorchScript serialized TorchVision pretrained resnet18 model on static image which is packaged inside the app as android asset. - -#### 1. Model Preparation - -Let’s start with model preparation. If you are familiar with PyTorch, you probably should already know how to train and save your model. In case you don’t, we are going to use a pre-trained image classification model ([MobileNetV2](https://pytorch.org/hub/pytorch_vision_mobilenet_v2/)). -To install it, run the command below: -``` -pip install torchvision -``` - -To serialize the model you can use python [script](https://github.com/pytorch/android-demo-app/blob/master/HelloWorldApp/trace_model.py) in the root folder of HelloWorld app: -``` -import torch -import torchvision -from torch.utils.mobile_optimizer import optimize_for_mobile - -model = torchvision.models.mobilenet_v2(pretrained=True) -model.eval() -example = torch.rand(1, 3, 224, 224) -traced_script_module = torch.jit.trace(model, example) -traced_script_module_optimized = optimize_for_mobile(traced_script_module) -traced_script_module_optimized._save_for_lite_interpreter("app/src/main/assets/model.ptl") - -``` -If everything works well, we should have our model - `model.ptl` generated in the assets folder of android application. -That will be packaged inside android application as `asset` and can be used on the device. - -More details about TorchScript you can find in [tutorials on pytorch.org](https://pytorch.org/docs/stable/jit.html) - -#### 2. Cloning from github -``` -git clone https://github.com/pytorch/android-demo-app.git -cd HelloWorldApp -``` -If [Android SDK](https://developer.android.com/studio/index.html#command-tools) and [Android NDK](https://developer.android.com/ndk/downloads) are already installed you can install this application to the connected android device or emulator with: -``` -./gradlew installDebug -``` - -We recommend you to open this project in [Android Studio 3.5.1+](https://developer.android.com/studio). At the moment PyTorch Android and demo applications use [android gradle plugin of version 3.5.0](https://developer.android.com/studio/releases/gradle-plugin#3-5-0), which is supported only by Android Studio version 3.5.1 and higher. -Using Android Studio you will be able to install Android NDK and Android SDK with Android Studio UI. - -#### 3. Gradle dependencies - -Pytorch android is added to the HelloWorld as [gradle dependencies](https://github.com/pytorch/android-demo-app/blob/master/HelloWorldApp/app/build.gradle#L28-L29) in build.gradle: - -``` -repositories { - jcenter() -} - -dependencies { - implementation 'org.pytorch:pytorch_android_lite:1.9.0' - implementation 'org.pytorch:pytorch_android_torchvision:1.9.0' -} -``` -Where `org.pytorch:pytorch_android` is the main dependency with PyTorch Android API, including libtorch native library for all 4 android abis (armeabi-v7a, arm64-v8a, x86, x86_64). -Further in this doc you can find how to rebuild it only for specific list of android abis. - -`org.pytorch:pytorch_android_torchvision` - additional library with utility functions for converting `android.media.Image` and `android.graphics.Bitmap` to tensors. - -#### 4. 
Reading image from Android Asset - -All the logic happens in [`org.pytorch.helloworld.MainActivity`](https://github.com/pytorch/android-demo-app/blob/master/HelloWorldApp/app/src/main/java/org/pytorch/helloworld/MainActivity.java#L31-L69). -As a first step we read `image.jpg` to `android.graphics.Bitmap` using the standard Android API. -``` -Bitmap bitmap = BitmapFactory.decodeStream(getAssets().open("image.jpg")); -``` - -#### 5. Loading Mobile Module -``` -Module module = Module.load(assetFilePath(this, "model.ptl")); -``` -`org.pytorch.Module` represents `torch::jit::mobile::Module` that can be loaded with `load` method specifying file path to the serialized to file model. - -#### 6. Preparing Input -``` -Tensor inputTensor = TensorImageUtils.bitmapToFloat32Tensor(bitmap, - TensorImageUtils.TORCHVISION_NORM_MEAN_RGB, TensorImageUtils.TORCHVISION_NORM_STD_RGB); -``` -`org.pytorch.torchvision.TensorImageUtils` is part of `org.pytorch:pytorch_android_torchvision` library. -The `TensorImageUtils#bitmapToFloat32Tensor` method creates tensors in the [torchvision format](https://pytorch.org/vision/stable/models.html) using `android.graphics.Bitmap` as a source. - -> All pre-trained models expect input images normalized in the same way, i.e. mini-batches of 3-channel RGB images of shape (3 x H x W), where H and W are expected to be at least 224. -> The images have to be loaded in to a range of `[0, 1]` and then normalized using `mean = [0.485, 0.456, 0.406]` and `std = [0.229, 0.224, 0.225]` - -`inputTensor`'s shape is `1x3xHxW`, where `H` and `W` are bitmap height and width appropriately. - -#### 7. Run Inference - -``` -Tensor outputTensor = module.forward(IValue.from(inputTensor)).toTensor(); -float[] scores = outputTensor.getDataAsFloatArray(); -``` - -`org.pytorch.Module.forward` method runs loaded module's `forward` method and gets result as `org.pytorch.Tensor` outputTensor with shape `1x1000`. - -#### 8. Processing results -Its content is retrieved using `org.pytorch.Tensor.getDataAsFloatArray()` method that returns java array of floats with scores for every image net class. - -After that we just find index with maximum score and retrieve predicted class name from `ImageNetClasses.IMAGENET_CLASSES` array that contains all ImageNet classes. - -``` -float maxScore = -Float.MAX_VALUE; -int maxScoreIdx = -1; -for (int i = 0; i < scores.length; i++) { - if (scores[i] > maxScore) { - maxScore = scores[i]; - maxScoreIdx = i; - } -} -String className = ImageNetClasses.IMAGENET_CLASSES[maxScoreIdx]; -``` - -In the following sections you can find detailed explanations of PyTorch Android API, code walk through for a bigger [demo application](https://github.com/pytorch/android-demo-app/tree/master/PyTorchDemoApp), -implementation details of the API, how to customize and build it from source. - -## PyTorch Demo Application - -We have also created another more complex PyTorch Android demo application that does image classification from camera output and text classification in the [same github repo](https://github.com/pytorch/android-demo-app/tree/master/PyTorchDemoApp). - -To get device camera output it uses [Android CameraX API](https://developer.android.com/training/camerax -). -All the logic that works with CameraX is separated to [`org.pytorch.demo.vision.AbstractCameraXActivity`](https://github.com/pytorch/android-demo-app/blob/master/PyTorchDemoApp/app/src/main/java/org/pytorch/demo/vision/AbstractCameraXActivity.java) class. 
- - -``` -void setupCameraX() { - final PreviewConfig previewConfig = new PreviewConfig.Builder().build(); - final Preview preview = new Preview(previewConfig); - preview.setOnPreviewOutputUpdateListener(output -> mTextureView.setSurfaceTexture(output.getSurfaceTexture())); - - final ImageAnalysisConfig imageAnalysisConfig = - new ImageAnalysisConfig.Builder() - .setTargetResolution(new Size(224, 224)) - .setCallbackHandler(mBackgroundHandler) - .setImageReaderMode(ImageAnalysis.ImageReaderMode.ACQUIRE_LATEST_IMAGE) - .build(); - final ImageAnalysis imageAnalysis = new ImageAnalysis(imageAnalysisConfig); - imageAnalysis.setAnalyzer( - (image, rotationDegrees) -> { - analyzeImage(image, rotationDegrees); - }); - - CameraX.bindToLifecycle(this, preview, imageAnalysis); - } - - void analyzeImage(android.media.Image, int rotationDegrees) -``` - -Where the `analyzeImage` method process the camera output, `android.media.Image`. - -It uses the aforementioned [`TensorImageUtils.imageYUV420CenterCropToFloat32Tensor`](https://github.com/pytorch/pytorch/blob/master/android/pytorch_android_torchvision/src/main/java/org/pytorch/torchvision/TensorImageUtils.java#L90) method to convert `android.media.Image` in `YUV420` format to input tensor. - -After getting predicted scores from the model it finds top K classes with the highest scores and shows on the UI. - -#### Language Processing Example - -Another example is natural language processing, based on an LSTM model, trained on a reddit comments dataset. -The logic happens in [`TextClassificattionActivity`](https://github.com/pytorch/android-demo-app/blob/master/PyTorchDemoApp/app/src/main/java/org/pytorch/demo/nlp/TextClassificationActivity.java). - -Result class names are packaged inside the TorchScript model and initialized just after initial module initialization. -The module has a `get_classes` method that returns `List[str]`, which can be called using method `Module.runMethod(methodName)`: -``` - mModule = Module.load(moduleFileAbsoluteFilePath); - IValue getClassesOutput = mModule.runMethod("get_classes"); -``` -The returned `IValue` can be converted to java array of `IValue` using `IValue.toList()` and processed to an array of strings using `IValue.toStr()`: -``` - IValue[] classesListIValue = getClassesOutput.toList(); - String[] moduleClasses = new String[classesListIValue.length]; - int i = 0; - for (IValue iv : classesListIValue) { - moduleClasses[i++] = iv.toStr(); - } -``` - -Entered text is converted to java array of bytes with `UTF-8` encoding. `Tensor.fromBlobUnsigned` creates tensor of `dtype=uint8` from that array of bytes. -``` - byte[] bytes = text.getBytes(Charset.forName("UTF-8")); - final long[] shape = new long[]{1, bytes.length}; - final Tensor inputTensor = Tensor.fromBlobUnsigned(bytes, shape); -``` - -Running inference of the model is similar to previous examples: -``` -Tensor outputTensor = mModule.forward(IValue.from(inputTensor)).toTensor() -``` - -After that, the code processes the output, finding classes with the highest scores. - -## More PyTorch Android Demo Apps - -### D2go - -[D2Go](https://github.com/pytorch/android-demo-app/tree/master/D2Go) demonstrates a Python script that creates the much lighter and much faster Facebook [D2Go](https://github.com/facebookresearch/d2go) model that is powered by PyTorch 1.8, torchvision 0.9, and Detectron2 with built-in SOTA networks for mobile, and an Android app that uses it to detect objects from pictures in your photos, taken with camera, or with live camera. 
This demo app also shows how to use the native pre-built torchvision-ops library. - -### Image Segmentation - -[Image Segmentation](https://github.com/pytorch/android-demo-app/tree/master/ImageSegmentation) demonstrates a Python script that converts the PyTorch [DeepLabV3](https://pytorch.org/hub/pytorch_vision_deeplabv3_resnet101/) model and an Android app that uses the model to segment images. - -### Object Detection - -[Object Detection](https://github.com/pytorch/android-demo-app/tree/master/ObjectDetection) demonstrates how to convert the popular [YOLOv5](https://pytorch.org/hub/ultralytics_yolov5/) model and use it in an Android app that detects objects from pictures in your photos, taken with camera, or with live camera. - -### Neural Machine Translation - -[Neural Machine Translation](https://github.com/pytorch/android-demo-app/tree/master/Seq2SeqNMT) demonstrates how to convert a sequence-to-sequence neural machine translation model trained with the code in the [PyTorch NMT tutorial](https://pytorch.org/tutorials/intermediate/seq2seq_translation_tutorial.html) and use the model in an Android app to do French-English translation. - -### Question Answering - -[Question Answering](https://github.com/pytorch/android-demo-app/tree/master/QuestionAnswering) demonstrates how to convert a powerful transformer QA model and use the model in an Android app to answer questions about PyTorch Mobile and more. - -### Vision Transformer - -[Vision Transformer](https://github.com/pytorch/android-demo-app/tree/master/ViT4MNIST) demonstrates how to use Facebook's latest Vision Transformer [DeiT](https://github.com/facebookresearch/deit) model to do image classification, and how convert another Vision Transformer model and use it in an Android app to perform handwritten digit recognition. - -### Speech recognition - -[Speech Recognition](https://github.com/pytorch/android-demo-app/tree/master/SpeechRecognition) demonstrates how to convert Facebook AI's wav2vec 2.0, one of the leading models in speech recognition, to TorchScript and how to use the scripted model in an Android app to perform speech recognition. - -### Video Classification - -[TorchVideo](https://github.com/pytorch/android-demo-app/tree/master/TorchVideo) demonstrates how to use a pre-trained video classification model, available at the newly released [PyTorchVideo](https://github.com/facebookresearch/pytorchvideo), on Android to see video classification results, updated per second while the video plays, on tested videos, videos from the Photos library, or even real-time videos. - - -## PyTorch Android Tutorial and Recipes - -### [Image Segmentation DeepLabV3 on Android](https://pytorch.org/tutorials/beginner/deeplabv3_on_android.html) - -A comprehensive step-by-step tutorial on how to prepare and run the PyTorch DeepLabV3 image segmentation model on Android. - -### [PyTorch Mobile Performance Recipes](https://pytorch.org/tutorials/recipes/mobile_perf.html) - -List of recipes for performance optimizations for using PyTorch on Mobile. - -### [Making Android Native Application That Uses PyTorch Android Prebuilt Libraries](https://pytorch.org/tutorials/recipes/android_native_app_with_custom_op.html) - -Learn how to make Android application from the scratch that uses LibTorch C++ API and uses TorchScript model with custom C++ operator. - -### [Fuse Modules recipe](https://pytorch.org/tutorials/recipes/fuse.html) - -Learn how to fuse a list of PyTorch modules into a single module to reduce the model size before quantization. 
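As a rough illustration of what that recipe covers (a minimal sketch, not taken from the recipe itself), fusing a Conv/BatchNorm/ReLU sequence in eager mode might look like this:

```python
# Hypothetical example: fuse Conv + BN + ReLU submodules before quantization.
import torch
import torch.nn as nn

class TinyNet(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv = nn.Conv2d(3, 16, kernel_size=3)
        self.bn = nn.BatchNorm2d(16)
        self.relu = nn.ReLU()

    def forward(self, x):
        return self.relu(self.bn(self.conv(x)))

model = TinyNet().eval()  # fusion for inference is done in eval mode
# Collapse the named submodules into a single fused module.
fused = torch.quantization.fuse_modules(model, [["conv", "bn", "relu"]])
```

See the recipe linked above for the full, authoritative steps.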
- -### [Quantization for Mobile Recipe](https://pytorch.org/tutorials/recipes/quantization.html) - -Learn how to reduce the model size and make it run faster without losing much on accuracy. - -### [Script and Optimize for Mobile](https://pytorch.org/tutorials/recipes/script_optimized.html) - -Learn how to convert the model to TorchScipt and (optional) optimize it for mobile apps. - -### [Model Preparation for Android Recipe](https://pytorch.org/tutorials/recipes/model_preparation_android.html) - -Learn how to add the model in an Android project and use the PyTorch library for Android. - -## Building PyTorch Android from Source - -In some cases you might want to use a local build of PyTorch android, for example you may build custom LibTorch binary with another set of operators or to make local changes, or try out the latest PyTorch code. - -For this you can use `./scripts/build_pytorch_android.sh` script. -``` -git clone https://github.com/pytorch/pytorch.git -cd pytorch -sh ./scripts/build_pytorch_android.sh -``` - -The workflow contains several steps: - -1\. Build libtorch for android for all 4 android abis (armeabi-v7a, arm64-v8a, x86, x86_64) - -2\. Create symbolic links to the results of those builds: -`android/pytorch_android/src/main/jniLibs/${abi}` to the directory with output libraries -`android/pytorch_android/src/main/cpp/libtorch_include/${abi}` to the directory with headers. These directories are used to build `libpytorch_jni.so` library, as part of the `pytorch_android-release.aar` bundle, that will be loaded on android device. - -3\. And finally run `gradle` in `android/pytorch_android` directory with task `assembleRelease` - -Script requires that Android SDK, Android NDK, Java SDK, and gradle are installed. -They are specified as environment variables: - -`ANDROID_HOME` - path to [Android SDK](https://developer.android.com/studio/command-line/sdkmanager.html) - -`ANDROID_NDK` - path to [Android NDK](https://developer.android.com/studio/projects/install-ndk). It's recommended to use NDK 21.x. - -`GRADLE_HOME` - path to [gradle](https://gradle.org/releases/) - -`JAVA_HOME` - path to [JAVA JDK](https://www.oracle.com/java/technologies/javase-downloads.html#javasejdk) - - -After successful build, you should see the result as aar file: - -``` -$ find android -type f -name *aar -android/pytorch_android/build/outputs/aar/pytorch_android-release.aar -android/pytorch_android_torchvision/build/outputs/aar/pytorch_android_torchvision-release.aar -``` - -## Using the PyTorch Android Libraries Built from Source or Nightly - -First add the two aar files built above, or downloaded from the nightly built PyTorch Android repos at [here](https://oss.sonatype.org/#nexus-search;quick~pytorch_android) and [here](https://oss.sonatype.org/#nexus-search;quick~torchvision_android), to the Android project's `lib` folder, then add in the project's app `build.gradle` file: -``` -allprojects { - repositories { - flatDir { - dirs 'libs' - } - } -} - -dependencies { - - // if using the libraries built from source - implementation(name:'pytorch_android-release', ext:'aar') - implementation(name:'pytorch_android_torchvision-release', ext:'aar') - - // if using the nightly built libraries downloaded above, for example the 1.8.0-snapshot on Jan. 21, 2021 - // implementation(name:'pytorch_android-1.8.0-20210121.092759-172', ext:'aar') - // implementation(name:'pytorch_android_torchvision-1.8.0-20210121.092817-173', ext:'aar') - - ... 
- implementation 'com.android.support:appcompat-v7:28.0.0' - implementation 'com.facebook.fbjni:fbjni-java-only:0.0.3' -} -``` - -Also we have to add all transitive dependencies of our aars. As `pytorch_android` depends on `com.android.support:appcompat-v7:28.0.0` or `androidx.appcompat:appcompat:1.2.0`, we need to one of them. (In case of using maven dependencies they are added automatically from `pom.xml`). - -## Using the Nightly PyTorch Android Libraries - -Other than using the aar files built from source or downloaded from the links in the previous section, you can also use the nightly built Android PyTorch and TorchVision libraries by adding in your app `build.gradle` file the maven url and the nightly libraries implementation as follows: - -``` -repositories { - maven { - url "https://oss.sonatype.org/content/repositories/snapshots" - } -} - -dependencies { - ... - implementation 'org.pytorch:pytorch_android:1.8.0-SNAPSHOT' - implementation 'org.pytorch:pytorch_android_torchvision:1.8.0-SNAPSHOT' -} -``` - -This is the easiest way to try out the latest PyTorch code and the Android libraries, if you do not need to make any local changes. But be aware you may need to build the model used on mobile in the latest PyTorch - using either the latest PyTorch code or a quick nightly install with commands like `pip install --pre torch torchvision -f https://download.pytorch.org/whl/nightly/cpu/torch_nightly.html` - to avoid possible model version mismatch errors when running the model on mobile. - -## Custom Build - -To reduce the size of binaries you can do custom build of PyTorch Android with only set of operators required by your model. -This includes two steps: preparing the list of operators from your model, rebuilding pytorch android with specified list. - -1\. Verify your PyTorch version is 1.4.0 or above. You can do that by checking the value of `torch.__version__`. - -2\. Preparation of the list of operators - -List of operators of your serialized torchscript model can be prepared in yaml format using python api function `torch.jit.export_opnames()`. -To dump the operators in your model, say `MobileNetV2`, run the following lines of Python code: -``` -# Dump list of operators used by MobileNetV2: -import torch, yaml -model = torch.jit.load('MobileNetV2.pt') -ops = torch.jit.export_opnames(model) -with open('MobileNetV2.yaml', 'w') as output: - yaml.dump(ops, output) -``` -3\. Building PyTorch Android with prepared operators list. - -To build PyTorch Android with the prepared yaml list of operators, specify it in the environment variable `SELECTED_OP_LIST`. Also in the arguments, specify which Android ABIs it should build; by default it builds all 4 Android ABIs. - -``` -# Build PyTorch Android library customized for MobileNetV2: -SELECTED_OP_LIST=MobileNetV2.yaml scripts/build_pytorch_android.sh arm64-v8a -``` - -After successful build you can integrate the result aar files to your android gradle project, following the steps from previous section of this tutorial (Building PyTorch Android from Source). - -## Use PyTorch JIT interpreter - -PyTorch JIT interpreter is the default interpreter before 1.9 (a version of our PyTorch interpreter that is not as size-efficient). 
It will still be supported in 1.9, and can be used via `build.gradle`: -``` -repositories { - jcenter() -} - -dependencies { - implementation 'org.pytorch:pytorch_android:1.9.0' - implementation 'org.pytorch:pytorch_android_torchvision:1.9.0' -} -``` - - -## Android Tutorials - -Watch the following [video](https://youtu.be/5Lxuu16_28o) as PyTorch Partner Engineer Brad Heintz walks through steps for setting up the PyTorch Runtime for Android projects: - -[![PyTorch Mobile Runtime for Android](https://i.ytimg.com/vi/O_2KBhkIvnc/maxresdefault.jpg){:height="75%" width="75%"}](https://youtu.be/5Lxuu16_28o "PyTorch Mobile Runtime for Android") - -The corresponding code can be found [here](https://github.com/pytorch/workshops/tree/master/PTMobileWalkthruAndroid). - -Checkout our [Mobile Performance Recipes](https://pytorch.org/tutorials/recipes/mobile_perf.html) which cover how to optimize your model and check if optimizations helped via benchmarking. - -In addition, follow this recipe to learn how to [make Native Android Application that use PyTorch prebuilt libraries](https://pytorch.org/tutorials/recipes/android_native_app_with_custom_op.html). - -## API Docs - -You can find more details about the PyTorch Android API in the [Javadoc](https://pytorch.org/javadoc/). - - - - diff --git a/_mobile/home.md b/_mobile/home.md deleted file mode 100644 index 8638e1058c9b..000000000000 --- a/_mobile/home.md +++ /dev/null @@ -1,80 +0,0 @@ ---- -layout: mobile -title: Home -permalink: /mobile/home/ -background-class: mobile-background -body-class: mobile -order: 1 -published: true -redirect_from: "/mobile/" ---- - -
        -

        Note

        -

        PyTorch Mobile is no longer actively supported. Please check out ExecuTorch, PyTorch’s all-new on-device inference library.

        -
        - -# PyTorch Mobile - -There is a growing need to execute ML models on edge devices to reduce latency, preserve privacy, and enable new interactive use cases. - -The PyTorch Mobile runtime beta release allows you to seamlessly go from training a model to deploying it, while staying entirely within the PyTorch ecosystem. It provides an end-to-end workflow that simplifies the research to production environment for mobile devices. In addition, it paves the way for privacy-preserving features via federated learning techniques. - -PyTorch Mobile is in beta stage right now, and is already in wide scale production use. It will soon be available as a stable release once the APIs are locked down. - - -## Key features -* Available for [iOS]({{site.baseurl}}/mobile/ios), [Android]({{site.baseurl}}/mobile/android) and Linux -* Provides APIs that cover common preprocessing and integration tasks needed for incorporating ML in mobile applications -* Support for tracing and scripting via TorchScript IR -* Support for XNNPACK floating point kernel libraries for Arm CPUs -* Integration of QNNPACK for 8-bit quantized kernels. Includes support for per-channel quantization, dynamic quantization and more -* Provides an [efficient mobile interpreter in Android and iOS](https://pytorch.org/tutorials/recipes/mobile_interpreter.html). Also supports build level optimization and selective compilation depending on the operators needed for user applications (i.e., the final binary size of the app is determined by the actual operators the app needs). -* Streamline model optimization via optimize_for_mobile -* Support for hardware backends like GPU, DSP, and NPU will be available soon in Beta - - -## Prototypes -We have launched the following features in prototype, available in the PyTorch nightly releases, and would love to get your feedback on the [PyTorch forums](https://discuss.pytorch.org/c/mobile/18): - -* GPU support on [iOS via Metal](https://pytorch.org/tutorials/prototype/ios_gpu_workflow.html) -* GPU support on [Android via Vulkan](https://pytorch.org/tutorials/prototype/vulkan_workflow.html) -* DSP and NPU support on Android via [Google NNAPI](https://pytorch.org/tutorials/prototype/nnapi_mobilenetv2.html) - - -## Deployment workflow - -A typical workflow from training to mobile deployment with the optional model optimization steps is outlined in the following figure. -
        - -
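In code, the same train-to-deploy path can be sketched roughly as follows (illustrative only; it mirrors the MobileNet v2 tracing example used elsewhere in these docs and assumes `torchvision` is installed):

```python
# Minimal sketch of the training-to-mobile workflow (illustrative).
import torch
import torchvision
from torch.utils.mobile_optimizer import optimize_for_mobile

# 1. Start from a trained model (a pretrained torchvision model here).
model = torchvision.models.mobilenet_v2(pretrained=True).eval()

# 2. Convert to TorchScript via tracing (scripting also works).
example = torch.rand(1, 3, 224, 224)
scripted = torch.jit.trace(model, example)

# 3. Optionally apply mobile-specific optimizations.
optimized = optimize_for_mobile(scripted)

# 4. Save in the lite-interpreter format consumed by the mobile runtimes.
optimized._save_for_lite_interpreter("model.ptl")
```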
        - -## Examples to get you started - -* [PyTorch Mobile Runtime for iOS](https://www.youtube.com/watch?v=amTepUIR93k) -* [PyTorch Mobile Runtime for Android](https://www.youtube.com/watch?v=5Lxuu16_28o) -* [PyTorch Mobile Recipes in Tutorials](https://pytorch.org/tutorials/recipes/ptmobile_recipes_summary.html) -* [Image Segmentation DeepLabV3 on iOS](https://pytorch.org/tutorials/beginner/deeplabv3_on_ios.html) -* [Image Segmentation DeepLabV3 on Android](https://pytorch.org/tutorials/beginner/deeplabv3_on_android.html) -* [D2Go Object Detection on iOS](https://github.com/pytorch/ios-demo-app/tree/master/D2Go) -* [D2Go Object Detection on Android](https://github.com/pytorch/android-demo-app/tree/master/D2Go) -* [PyTorchVideo on iOS](https://github.com/pytorch/ios-demo-app/tree/master/TorchVideo) -* [PyTorchVideo on Android](https://github.com/pytorch/android-demo-app/tree/master/TorchVideo) -* [Speech Recognition on iOS](https://github.com/pytorch/ios-demo-app/tree/master/SpeechRecognition) -* [Speech Recognition on Android](https://github.com/pytorch/android-demo-app/tree/master/SpeechRecognition) -* [Question Answering on iOS](https://github.com/pytorch/ios-demo-app/tree/master/QuestionAnswering) -* [Question Answering on Android](https://github.com/pytorch/android-demo-app/tree/master/QuestionAnswering) - -## Demo apps - -Our new demo apps also include examples of image segmentation, object detection, neural machine translation, -question answering, and vision transformers. They are available on both iOS and Android: - -* [iOS demo apps](https://github.com/pytorch/ios-demo-app) -* [Android demo apps](https://github.com/pytorch/android-demo-app) - - - - - - diff --git a/_mobile/ios.md b/_mobile/ios.md deleted file mode 100644 index 85a473df82f2..000000000000 --- a/_mobile/ios.md +++ /dev/null @@ -1,330 +0,0 @@ ---- -layout: mobile -title: iOS -permalink: /mobile/ios/ -background-class: mobile-background -body-class: mobile -order: 2 -published: true ---- - -
        -

        Note

        -

        PyTorch Mobile is no longer actively supported. Please check out ExecuTorch, PyTorch’s all-new on-device inference library. You can also review this page to learn more about how to use ExecuTorch to build an iOS app.

        -
        - -# iOS - -To get started with PyTorch on iOS, we recommend exploring the following [HelloWorld](https://github.com/pytorch/ios-demo-app/tree/master/HelloWorld). - -## Quickstart with a Hello World Example - -HelloWorld is a simple image classification application that demonstrates how to use PyTorch C++ libraries on iOS. The code is written in Swift and uses Objective-C as a bridge. - -### Requirements - -- XCode 11.0 or above -- iOS 12.0 or above - -### Model Preparation - -Let's start with model preparation. If you are familiar with PyTorch, you probably should already know how to train and save your model. In case you don't, we are going to use a pre-trained image classification model - [MobileNet v2](https://pytorch.org/hub/pytorch_vision_mobilenet_v2/), which is already packaged in [TorchVision](https://pytorch.org/vision/stable/index.html). To install it, run the command below. - -> We highly recommend following the [Pytorch Github page](https://github.com/pytorch/pytorch) to set up the Python development environment on your local machine. - -```shell -pip install torchvision -``` - -Once we have TorchVision installed successfully, let's navigate to the HelloWorld folder and run `trace_model.py`. The script contains the code of tracing and saving a [torchscript model](https://pytorch.org/tutorials/beginner/Intro_to_TorchScript_tutorial.html) that can be run on mobile devices. - -```shell -python trace_model.py -``` - -If everything works well, `model.pt` should be generated and saved in the `HelloWorld/HelloWorld/model` folder. - -> To find out more details about TorchScript, please visit [tutorials on pytorch.org](https://pytorch.org/tutorials/advanced/cpp_export.html) - -### Install LibTorch-Lite via Cocoapods - -The PyTorch C++ library is available in [Cocoapods](https://cocoapods.org/), to integrate it to our project, simply run - -```ruby -pod install -``` - -Now it's time to open the `HelloWorld.xcworkspace` in XCode, select an iOS simulator and launch it (cmd + R). If everything works well, we should see a wolf picture on the simulator screen along with the prediction result. - - - -### Code Walkthrough - -In this part, we are going to walk through the code step by step. - -#### Image Loading - -Let's begin with image loading. - -```swift -let image = UIImage(named: "image.jpg")! -imageView.image = image -let resizedImage = image.resized(to: CGSize(width: 224, height: 224)) -guard var pixelBuffer = resizedImage.normalized() else { - return -} -``` - -We first load the image from our bundle and resize it to 224x224. Then we call this `normalized()` category method to normalize the pixel buffer. Let's take a closer look at the code below. - -```swift -var normalizedBuffer: [Float32] = [Float32](repeating: 0, count: w * h * 3) -// normalize the pixel buffer -// see https://pytorch.org/hub/pytorch_vision_resnet/ for more detail -for i in 0 ..< w * h { - normalizedBuffer[i] = (Float32(rawBytes[i * 4 + 0]) / 255.0 - 0.485) / 0.229 // R - normalizedBuffer[w * h + i] = (Float32(rawBytes[i * 4 + 1]) / 255.0 - 0.456) / 0.224 // G - normalizedBuffer[w * h * 2 + i] = (Float32(rawBytes[i * 4 + 2]) / 255.0 - 0.406) / 0.225 // B -} -``` - -The code might look weird at first glance, but it’ll make sense once we understand our model. The input data is a 3-channel RGB image of shape (3 x H x W), where H and W are expected to be at least 224. The image has to be loaded in to a range of `[0, 1]` and then normalized using `mean = [0.485, 0.456, 0.406]` and `std = [0.229, 0.224, 0.225]`. 
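For comparison, the same preprocessing expressed on the Python side with torchvision transforms would be roughly (a minimal sketch, assuming `torchvision` and `Pillow` are available):

```python
# Illustrative Python equivalent of the Swift preprocessing above:
# resize to 224x224, scale to [0, 1], then normalize with the ImageNet mean/std.
from PIL import Image
from torchvision import transforms

preprocess = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),  # HWC uint8 image -> CHW float tensor in [0, 1]
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225]),
])
input_tensor = preprocess(Image.open("image.jpg")).unsqueeze(0)  # 1 x 3 x 224 x 224
```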
- -#### TorchScript Module - -Now that we have preprocessed our input data and we have a pre-trained TorchScript model, the next step is to use them to run prediction. To do that, we'll first load our model into the application. - -```swift -private lazy var module: TorchModule = { - if let filePath = Bundle.main.path(forResource: "model", ofType: "pt"), - let module = TorchModule(fileAtPath: filePath) { - return module - } else { - fatalError("Can't find the model file!") - } -}() -``` -Note that the `TorchModule` Class is an Objective-C wrapper of `torch::jit::mobile::Module`. - -```cpp -torch::jit::mobile::Module module = torch::jit::_load_for_mobile(filePath.UTF8String); -``` -Since Swift can not talk to C++ directly, we have to either use an Objective-C class as a bridge, or create a C wrapper for the C++ library. For demo purpose, we're going to wrap everything in this Objective-C class. - -#### Run Inference - -Now it's time to run inference and get the results. - -```swift -guard let outputs = module.predict(image: UnsafeMutableRawPointer(&pixelBuffer)) else { - return -} -``` -Again, the `predict` method is just an Objective-C wrapper. Under the hood, it calls the C++ `forward` function. Let's take a look at how it's implemented. - -```cpp -at::Tensor tensor = torch::from_blob(imageBuffer, {1, 3, 224, 224}, at::kFloat); -c10::InferenceMode guard; -auto outputTensor = _impl.forward({tensor}).toTensor(); -float* floatBuffer = outputTensor.data_ptr(); -``` -The C++ function `torch::from_blob` will create an input tensor from the pixel buffer. Note that the shape of the tensor is `{1,3,224,224}` which represents `{N, C, H, W}` as we discussed in the above section. - -```cpp -c10::InferenceMode guard; -``` -The above line tells PyTorch to do inference only. - -Finally, we can call this `forward` function to get the output tensor and convert it to a `float` buffer. - -```cpp -auto outputTensor = _impl.forward({tensor}).toTensor(); -float* floatBuffer = outputTensor.data_ptr(); -``` - -### Collect Results - -The output tensor is a one-dimensional float array of shape 1x1000, where each value represents the confidence that a label is predicted from the image. The code below sorts the array and retrieves the top three results. - -```swift -let zippedResults = zip(labels.indices, outputs) -let sortedResults = zippedResults.sorted { $0.1.floatValue > $1.1.floatValue }.prefix(3) -``` - -## PyTorch Demo App - -For more complex use cases, we recommend to check out the [PyTorch demo application](https://github.com/pytorch/ios-demo-app). The demo app contains two showcases. A camera app that runs a quantized model to predict the images coming from device’s rear-facing camera in real time. And a text-based app that uses a text classification model to predict the topic from the input string. - -## More PyTorch iOS Demo Apps - -### Image Segmentation - -[Image Segmentation](https://github.com/pytorch/ios-demo-app/tree/master/ImageSegmentation) demonstrates a Python script that converts the PyTorch [DeepLabV3](https://pytorch.org/hub/pytorch_vision_deeplabv3_resnet101/) model for mobile apps to use and an iOS app that uses the model to segment images. - -### Object Detection - -[Object Detection](https://github.com/pytorch/ios-demo-app/tree/master/ObjectDetection) demonstrates how to convert the popular [YOLOv5](https://pytorch.org/hub/ultralytics_yolov5/) model and use it on an iOS app that detects objects from pictures in your photos, taken with camera, or with live camera. 
- -### Neural Machine Translation - -[Neural Machine Translation](https://github.com/pytorch/ios-demo-app/tree/master/Seq2SeqNMT) demonstrates how to convert a sequence-to-sequence neural machine translation model trained with the code in the [PyTorch NMT tutorial](https://pytorch.org/tutorials/intermediate/seq2seq_translation_tutorial.html) and use the model in an iOS app to do French-English translation. - -### Question Answering - -[Question Answering](https://github.com/pytorch/ios-demo-app/tree/master/QuestionAnswering) demonstrates how to convert a powerful transformer QA model and use the model in an iOS app to answer questions about PyTorch Mobile and more. - -### Vision Transformer - -[Vision Transformer](https://github.com/pytorch/ios-demo-app/tree/master/ViT4MNIST) demonstrates how to use Facebook's latest Vision Transformer [DeiT](https://github.com/facebookresearch/deit) model to do image classification, and how convert another Vision Transformer model and use it in an iOS app to perform handwritten digit recognition. - -### Speech recognition - -[Speech Recognition](https://github.com/pytorch/ios-demo-app/tree/master/SpeechRecognition) demonstrates how to convert Facebook AI's wav2vec 2.0, one of the leading models in speech recognition, to TorchScript and how to use the scripted model in an iOS app to perform speech recognition. - -### Video Classification - -[TorchVideo](https://github.com/pytorch/ios-demo-app/tree/master/TorchVideo) demonstrates how to use a pre-trained video classification model, available at the newly released [PyTorchVideo](https://github.com/facebookresearch/pytorchvideo), on iOS to see video classification results, updated per second while the video plays, on tested videos, videos from the Photos library, or even real-time videos. - - -## PyTorch iOS Tutorial and Recipes - -### [Image Segmentation DeepLabV3 on iOS](https://pytorch.org/tutorials/beginner/deeplabv3_on_ios.html) - -A comprehensive step-by-step tutorial on how to prepare and run the PyTorch DeepLabV3 image segmentation model on iOS. - -### [PyTorch Mobile Performance Recipes](https://pytorch.org/tutorials/recipes/mobile_perf.html) - -List of recipes for performance optimizations for using PyTorch on Mobile. - -### [Fuse Modules recipe](https://pytorch.org/tutorials/recipes/fuse.html) - -Learn how to fuse a list of PyTorch modules into a single module to reduce the model size before quantization. - -### [Quantization for Mobile Recipe](https://pytorch.org/tutorials/recipes/quantization.html) - -Learn how to reduce the model size and make it run faster without losing much on accuracy. - -### [Script and Optimize for Mobile](https://pytorch.org/tutorials/recipes/script_optimized.html) - -Learn how to convert the model to TorchScipt and (optional) optimize it for mobile apps. - -### [Model Preparation for iOS Recipe](https://pytorch.org/tutorials/recipes/model_preparation_ios.html) - -Learn how to add the model in an iOS project and use PyTorch pod for iOS. - -## Build PyTorch iOS Libraries from Source - -To track the latest updates for iOS, you can build the PyTorch iOS libraries from the source code. - -``` -git clone --recursive https://github.com/pytorch/pytorch -cd pytorch -# if you are updating an existing checkout -git submodule sync -git submodule update --init --recursive -``` - -> Make sure you have `cmake` and Python installed correctly on your local machine. 
We recommend following the [PyTorch GitHub page](https://github.com/pytorch/pytorch) to set up the Python development environment. - -### Build LibTorch-Lite for iOS Simulators - -Open a terminal and navigate to the PyTorch root directory. Run the following command (if you have already built LibTorch-Lite for iOS devices (see below), run `rm -rf build_ios` first): - -``` -BUILD_PYTORCH_MOBILE=1 IOS_PLATFORM=SIMULATOR ./scripts/build_ios.sh -``` -After the build succeeds, all static libraries and header files will be generated under `build_ios/install`. - -### Build LibTorch-Lite for arm64 Devices - -Open a terminal and navigate to the PyTorch root directory. Run the following command (if you have already built LibTorch-Lite for iOS simulators, run `rm -rf build_ios` first): - -``` -BUILD_PYTORCH_MOBILE=1 IOS_ARCH=arm64 ./scripts/build_ios.sh -``` -After the build succeeds, all static libraries and header files will be generated under `build_ios/install`. - -### Xcode Setup - -Open your project in Xcode, go to your project target's `Build Phases` - `Link Binaries With Libraries`, click the + sign, and add all the library files located in `build_ios/install/lib`. Navigate to the project `Build Settings`, set **Header Search Paths** to `build_ios/install/include` and **Library Search Paths** to `build_ios/install/lib`. - -In the build settings, search for **other linker flags** and add the custom linker flag below: - -``` --all_load -``` - -To use the custom-built libraries in the project, replace `#import ` (in `TorchModule.mm`), which is needed when using LibTorch-Lite via CocoaPods, with the code below: -``` -#include -#include -#include -``` - -Finally, disable bitcode for your target by opening the Build Settings, searching for **Enable Bitcode**, and setting the value to **No**. - -## Using the Nightly PyTorch iOS Libraries in CocoaPods -If you want to try out the latest features added to PyTorch iOS, you can use the `LibTorch-Lite-Nightly` pod in your `Podfile`; it includes the nightly-built libraries: -``` -pod 'LibTorch-Lite-Nightly' -``` -Then run `pod install` to add it to your project. To update the nightly pod to a newer version, run `pod update`. Be aware that you may need to rebuild the model used on mobile with the latest PyTorch - using either the latest PyTorch source or a quick nightly install with a command like `pip install --pre torch torchvision -f https://download.pytorch.org/whl/nightly/cpu/torch_nightly.html` - to avoid possible model version mismatch errors when running the model on mobile. - -## Custom Build - -Starting with 1.4.0, PyTorch supports custom builds. You can now build a PyTorch library that contains only the operators needed by your model. To do that, follow the steps below. - -1\. Verify that your PyTorch version is 1.4.0 or above. You can do that by checking the value of `torch.__version__`. - -2\. To dump the operators in your model, say `MobileNetV2`, run the following lines of Python code: - -```python -import torch, yaml -model = torch.jit.load('MobileNetV2.pt') -ops = torch.jit.export_opnames(model) -with open('MobileNetV2.yaml', 'w') as output: - yaml.dump(ops, output) -``` -In the snippet above, you first load the ScriptModule. Then, use `export_opnames` to return a list of operator names of the ScriptModule and its submodules. Lastly, save the result to a YAML file. - -3\. 
To run the iOS build script locally with the prepared list of operators, pass the YAML file generated in the last step into the environment variable `SELECTED_OP_LIST`. Also specify `BUILD_PYTORCH_MOBILE=1` as well as the platform/architecture type. Taking the arm64 build as an example, the command should be: - -``` -SELECTED_OP_LIST=MobileNetV2.yaml BUILD_PYTORCH_MOBILE=1 IOS_ARCH=arm64 ./scripts/build_ios.sh -``` -4\. After the build succeeds, you can integrate the resulting libraries into your project by following the [Xcode Setup](#xcode-setup) section above. - -5\. The last step is to add a single line of C++ code before running `forward`. This is because, by default, the JIT performs some optimizations on operators (fusion, for example), which might break consistency with the ops we dumped from the model. - -```cpp -torch::jit::GraphOptimizerEnabledGuard guard(false); -``` - -## Use the PyTorch JIT Interpreter -The PyTorch JIT interpreter is the default interpreter before 1.9 (a version of our PyTorch interpreter that is not as size-efficient). It is still supported in 1.9 and can be used via CocoaPods: -``` -pod 'LibTorch', '~>1.9.0' -``` - -## iOS Tutorials - -Watch the following [video](https://youtu.be/amTepUIR93k) as PyTorch Partner Engineer Brad Heintz walks through the steps for setting up the PyTorch runtime for iOS projects: - -[![PyTorch Mobile Runtime for iOS](https://i.ytimg.com/vi/JFy3uHyqXn0/maxresdefault.jpg){:height="75%" width="75%"}](https://youtu.be/amTepUIR93k "PyTorch Mobile Runtime for iOS") - -The corresponding code can be found [here](https://github.com/pytorch/workshops/tree/master/PTMobileWalkthruIOS). - -Additionally, check out our [Mobile Performance Recipes](https://pytorch.org/tutorials/recipes/mobile_perf.html), which cover how to optimize your model and check, via benchmarking, whether the optimizations helped. - - -## API Docs - -Currently, the iOS framework uses the PyTorch C++ front-end APIs directly. The C++ documentation can be found [here](https://pytorch.org/cppdocs/). To learn more, we recommend exploring the [C++ front-end tutorials](https://pytorch.org/tutorials/advanced/cpp_frontend.html) on the PyTorch website. - -## Issues and Contribution - -If you have any questions or want to contribute to PyTorch, please feel free to open an issue or a pull request to get in touch.
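
To connect the app-side walkthrough above with the model-preparation side, the sketch below shows one way a `model.pt`-style file for the lite interpreter might be produced. This is a minimal, illustrative example, assuming PyTorch 1.9+ and torchvision are available; the choice of MobileNetV2 and the file name are placeholders, not part of the original tutorial.

```python
import torch
import torchvision
from torch.utils.mobile_optimizer import optimize_for_mobile

# Load a pretrained classification model and put it in eval mode.
model = torchvision.models.mobilenet_v2(pretrained=True)
model.eval()

# Trace with an example input shaped {N, C, H, W} = {1, 3, 224, 224},
# matching the tensor the app builds with torch::from_blob.
example = torch.rand(1, 3, 224, 224)
scripted = torch.jit.trace(model, example)

# Apply mobile-specific optimizations and save in the lite-interpreter
# format that torch::jit::_load_for_mobile can load.
optimized = optimize_for_mobile(scripted)
optimized._save_for_lite_interpreter("model.pt")
```

The resulting file can then be added to the Xcode project and loaded through the `TorchModule` wrapper shown earlier.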
- - - - diff --git a/_news/news-item-1.md b/_news/news-item-1.md deleted file mode 100644 index a4fffa020a22..000000000000 --- a/_news/news-item-1.md +++ /dev/null @@ -1,5 +0,0 @@ ---- -order: 2 -link: /get-started/pytorch-2.0/#ask-the-engineers-20-live-qa-series -summary: "Ask the Engineers: 2.0 Live Q&A Series" ---- \ No newline at end of file diff --git a/_news/news-item-2.md b/_news/news-item-2.md deleted file mode 100644 index f3dcc0df5a93..000000000000 --- a/_news/news-item-2.md +++ /dev/null @@ -1,5 +0,0 @@ ---- -order: 3 -link: https://fb.me/e/29RoWnqBX -summary: "Watch the PyTorch Conference online" ---- diff --git a/_news/news-item-3.md b/_news/news-item-3.md deleted file mode 100644 index 767382aa5c28..000000000000 --- a/_news/news-item-3.md +++ /dev/null @@ -1,5 +0,0 @@ ---- -order: 1 -link: /blog/pytorch-2.0-release/ -summary: "PyTorch 2.0: Our next generation release that is faster, more Pythonic and Dynamic as ever" ---- diff --git a/_past_issues/2021-03-11-issue-1.md b/_past_issues/2021-03-11-issue-1.md deleted file mode 100644 index a7561fd6ce31..000000000000 --- a/_past_issues/2021-03-11-issue-1.md +++ /dev/null @@ -1,45 +0,0 @@ ---- -title: "Issue #1" -issue: 1 -date: 2021-03-11 ---- - - -# Issue \#1 - -Welcome to the first issue of the PyTorch Contributors newsletter! Keeping track of everything that’s happening in the PyTorch developer world is a big task; here you will find curated news including RFCs, feature roadmaps, notable PRs, editorials from developers, and more. If you have questions or suggestions for the newsletter, just reply back to this email. - -## PyTorch 1.8.0 - -PyTorch 1.8 was released on March 4th with support for functional transformations using `torch.fx`, stabilized frontend APIs for scientific computing (`torch.fft`, `torch.linalg`, Autograd for complex tensors) and significant improvements to distributed training. Read the full [Release Notes](https://github.com/pytorch/pytorch/releases/tag/v1.8.0){:target="_blank"}. - -## PyTorch Ecosystem Day - -On April 21, we’re hosting a virtual event for our ecosystem and industry communities to showcase their work and discover new opportunities to collaborate. The day will be filled with discussion on new developments, trends, challenges and best practices through posters, breakout sessions and networking. - -## [The PyTorch open source process](http://blog.ezyang.com/2021/01/pytorch-open-source-process/){:target="_blank"} - -[@ezyang](https://github.com/ezyang){:target="_blank"} describes the challenges of maintaining a PyTorch-scale project, and the current open source processes (triaging and CI oncalls, RFC discussions) to help PyTorch operate effectively. - -## Developers forum - -We launched https://dev-discuss.pytorch.org/ a low-traffic high-signal forum for long-form discussions about PyTorch internals. - -## [RFC] [Dataloader v2](https://github.com/pytorch/pytorch/issues/49440) - -[@VitalyFedyunin](https://github.com/VitalyFedyunin) proposes redesigning the DataLoader to support lazy loading, sharding, pipelining data operations (including async) and shuffling & sampling in a more modular way. Join the discussion [here](https://github.com/pytorch/pytorch/issues/49440). 
- -## [RFC] [Improving TorchScript Usability](https://dev-discuss.pytorch.org/t/torchscript-usability/55) - -In a series of 3 blog posts ([1](https://lernapparat.de/scripttorch/), [2](https://lernapparat.de/jit-python-graphops/), [3](https://lernapparat.de/jit-fallback/)) [@t-vi](https://github.com/t-vi) explores ideas to improve the user and developer experience of TorchScript. - -## [RFC] [CSR and DM storage formats for sparse tensors](https://github.com/pytorch/rfcs/pull/13) - -[@pearu](https://github.com/pearu) proposes an [RFC](https://github.com/pytorch/rfcs/pull/13) to make linear algebra operations more performant by - -- implementing the CSR storage format, where a 2D array is defined by shape and 1D tensors for compressed row indices, column indices, and values (PyTorch 1D tensor) -- introducing the Dimension Mapping storage format that generalizes a 2D CSR to multidimensional arrays using a bijective mapping between the storage and wrapper elements. - -## [RFC] [Forward Mode AD](https://github.com/pytorch/rfcs/pull/11) - -[@albanD](https://github.com/albanD) proposes an [RFC](https://github.com/pytorch/rfcs/pull/11) to implement forward mode autodiff using Tensor-based [dual numbers](https://blog.demofox.org/2014/12/30/dual-numbers-automatic-differentiation/), where the real part represents the tensor and the *dual* part stores the forward gradient of the tensor. The core of the feature has landed [(PR)](https://github.com/pytorch/pytorch/pull/49734), with more formulas in WIP. Complete forward mode AD is expected to land by July 2021. diff --git a/_past_issues/2021-05-11-issue-2.md b/_past_issues/2021-05-11-issue-2.md deleted file mode 100644 index 8324c5a923d5..000000000000 --- a/_past_issues/2021-05-11-issue-2.md +++ /dev/null @@ -1,43 +0,0 @@ ---- -title: "Issue #2" -issue: 2 -date: 2021-05-20 ---- - - -# Issue \#2 - -Welcome to the second edition of the PyTorch newsletter! In this issue, read about how we celebrated the PyTorch community at the first-ever PyTorch Ecosystem Day (PTED), discover a new podcast for PyTorch developers, and learn about important updates to the PyTorch frontend. - -## PyTorch Ecosystem Day - -**Piotr Bialecki (Sr. Software Engineer, NVIDIA)** spoke about his journey of using PyTorch and what he sees in the future for PyTorch. **Miquel Farré (Sr. Technology Manager, Disney)** spoke about the Creative Genome project that uses the PyTorch ecosystem to annotate all Disney content. **Ritchie Ng (CEO, Hessian Matrix)** spoke about the growth of AI in the Asia Pacific region, and how to get started with PyTorch for production AI use cases. Members of the community showcased how they were using PyTorch via 71 posters and pop-up breakout sessions. See all of the [posters](https://pytorch.org/ecosystem/pted/2021) and listen to the opening [keynote talks](https://www.youtube.com/playlist?list=PL_lsbAsL_o2At9NcX1mR9d12KYUWqxOx9) here! - -## PyTorch Developer Podcast - -**Edward Yang (Research Engineer, Facebook AI)** talks about internal development concepts like binding C++ in Python, the dispatcher, PyTorch’s library structure and more. Check out this new series; each episode is around 15 minutes long. [Listen to it](https://pytorch-dev-podcast.simplecast.com/) wherever you get your podcasts. - -## Forward Mode AD -The core logic for Forward Mode AD (based on “dual tensors”) is now in PyTorch. All the APIs to manipulate such Tensors, codegen and view handling are in `master (1.9.0a0)` already. 
Gradcheck and a first set of formulas will be added in the following month; full support for all PyTorch functions, custom Autograd functions and higher order gradients will happen later this year. Read more about this or share your feedback with [@albanD](https://github.com/albanD) on the corresponding [RFC](https://github.com/pytorch/rfcs/pull/11). - -## Make complex conjugation lazy - -[PR #54987](https://github.com/pytorch/pytorch/pull/54987) makes the conjugate operation on complex tensors return a view that has a special `is_conj()` bit flipped. Aside from saving memory by not creating a full tensor, this grants a potential speedup if the following operation can handle conjugated inputs directly. For such operations (like `gemm`), a flag is passed to the low-level API; for others the conjugate is materialized before passing to the operation. - -## torch.use_deterministic_algorithms is stable - -`torch.use_deterministic_algorithms()` ([docs](https://pytorch.org/docs/master/generated/torch.use_deterministic_algorithms.html)) is stable in `master (1.9.0a0)`. If True, the flag switches non-deterministic operations to their deterministic implementation if available, and throws a `RuntimeError` if not. - -## torch.linalg and torch.special - -`torch.linalg` is now stable; the module maintains fidelity with NumPy’s np.linalg linear algebra functions. -`torch.special` (beta) contains functions in scipy.special. Here’s the [tracking issue](https://github.com/pytorch/pytorch/issues/50345) if you’d like to contribute functions to torch.special. If you want a function not already on the list, let us know on the tracking issue about your use case and why it should be added. - -## Generalizing AMP to work on CPU - -> [@ezyang](https://dev-discuss.pytorch.org/t/generalizing-amp-to-work-on-cpu/201): Intel is interested in bringing automatic mixed precision to CPU in [[RFC] Extend Autocast to CPU/CUDA with BF16 data type · Issue #55374 · pytorch/pytorch ·](https://github.com/pytorch/pytorch/issues/55374) One big question is what the API for autocasting should be for CPU; should we provide a single, generalized API torch.autocast (keep in mind that CPU autocasting would be through bfloat16, while the existing GPU autocasting is via float16), or provide separate APIs for CPU/CUDA? If you have any thoughts or opinions on the subject, please chime in on the issue. - -
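
For readers who want to try the frontend changes above, the short snippet below exercises them; it assumes a recent build where the lazy-conjugation work from PR #54987 has landed, so exact behavior may differ on older versions.

```python
import torch

# Opt in to deterministic implementations; ops without one raise a RuntimeError.
torch.use_deterministic_algorithms(True)

# torch.linalg mirrors np.linalg, and torch.special mirrors scipy.special.
x = torch.randn(4, 4)
print(torch.linalg.norm(x))
print(torch.special.expit(torch.tensor([-1.0, 0.0, 1.0])))

# With lazy conjugation, conj() returns a view with the conjugate bit set
# instead of materializing a new tensor.
z = torch.randn(3, dtype=torch.complex64)
print(z.conj().is_conj())
```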
        -
        - -Are you enjoying reading this newsletter? What would you like to know more about? All feedback is welcome and appreciated! To share your suggestions, use this [form](https://forms.gle/K75ELciLJxnabKKH9) or simply reply to this email. diff --git a/_posts/2017-5-11-a-tour-of-pytorch-internals-1.md b/_posts/2017-5-11-a-tour-of-pytorch-internals-1.md deleted file mode 100644 index a29ced22ed3e..000000000000 --- a/_posts/2017-5-11-a-tour-of-pytorch-internals-1.md +++ /dev/null @@ -1,323 +0,0 @@ ---- -layout: blog_detail -title: "A Tour of PyTorch Internals (Part I)" -author: "Trevor Killeen" -date: 2017-05-11 12:00:00 -0500 -redirect_from: /2017/05/11/Internals.html ---- - -The fundamental unit in PyTorch is the Tensor. This post will serve as an overview for how we implement Tensors in PyTorch, such that the user can interact with it from the Python shell. In particular, we want to answer four main questions: - -- How does PyTorch extend the Python interpreter to define a Tensor type that can be manipulated from Python code? -- How does PyTorch wrap the C libraries that actually define the Tensor's properties and methods? -- How does PyTorch cwrap work to generate code for Tensor methods? -- How does PyTorch's build system take all of these components to compile and generate a workable application? - -## Extending the Python Interpreter - -PyTorch defines a new package `torch`. In this post we will consider the `._C` module. This module is known as an "extension module" - a Python module written in C. Such modules allow us to define new built-in object types (e.g. the `Tensor`) and to call C/C++ functions. - -The `._C` module is defined in `torch/csrc/Module.cpp`. The `init_C()` / `PyInit__C()` function creates the module and adds the method definitions as appropriate. This module is passed around to a number of different `__init()` functions that add further objects to the module, register new types, etc. - -One collection of these `__init()` calls is the following: - -```cpp -ASSERT_TRUE(THPDoubleTensor_init(module)); -ASSERT_TRUE(THPFloatTensor_init(module)); -ASSERT_TRUE(THPHalfTensor_init(module)); -ASSERT_TRUE(THPLongTensor_init(module)); -ASSERT_TRUE(THPIntTensor_init(module)); -ASSERT_TRUE(THPShortTensor_init(module)); -ASSERT_TRUE(THPCharTensor_init(module)); -ASSERT_TRUE(THPByteTensor_init(module)); -``` - -These `__init()` functions add the Tensor object for each type to the `._C` module so that they can be used in the module. Let's learn how these methods work. - -## The THPTensor Type - -Much like the underlying `TH` and `THC` libraries, PyTorch defines a "generic" Tensor which is then specialized to a number of different types. Before considering how this specialization works, let's first consider how defining a new type in Python works, and how we create the generic `THPTensor` type. - -The Python runtime sees all Python objects as variables of type `PyObject *`, which serves as a "base type" for all Python objects. Every Python type contains the refcount for the object, and a pointer to the object's *type object*. The type object determines the properties of the type. For example, it might contain a list of methods associated with the type, and which C functions get called to implement those methods. The object also contains any fields necessary to represent its state. 
- -The formula for defining a new type is as follows: - -- Create a struct that defines what the new object will contain -- Define the type object for the type - -The struct itself could be very simple. In Python, all floating point types are actually objects on the heap. The Python float struct is defined as: -```cpp -typedef struct { - PyObject_HEAD - double ob_fval; -} PyFloatObject; -``` -The `PyObject_HEAD` is a macro that brings in the code that implements an object's reference counting, and a pointer to the corresponding type object. So in this case, to implement a float, the only other "state" needed is the floating point value itself. - -Now, let's see the struct for our `THPTensor` type: -```cpp -struct THPTensor { - PyObject_HEAD - THTensor *cdata; -}; -``` -Pretty simple, right? We are just wrapping the underlying `TH` tensor by storing a pointer to it. - -The key part is defining the "type object" for a new type. An example definition of a type object for our Python float takes the form: -```cpp -static PyTypeObject py_FloatType = { - PyVarObject_HEAD_INIT(NULL, 0) - "py.FloatObject", /* tp_name */ - sizeof(PyFloatObject), /* tp_basicsize */ - 0, /* tp_itemsize */ - 0, /* tp_dealloc */ - 0, /* tp_print */ - 0, /* tp_getattr */ - 0, /* tp_setattr */ - 0, /* tp_as_async */ - 0, /* tp_repr */ - 0, /* tp_as_number */ - 0, /* tp_as_sequence */ - 0, /* tp_as_mapping */ - 0, /* tp_hash */ - 0, /* tp_call */ - 0, /* tp_str */ - 0, /* tp_getattro */ - 0, /* tp_setattro */ - 0, /* tp_as_buffer */ - Py_TPFLAGS_DEFAULT, /* tp_flags */ - "A floating point number", /* tp_doc */ -}; -``` -The easiest way to think of a *type object* is as a set of fields which define the properties of the object. For example, the `tp_basicsize` field is set to `sizeof(PyFloatObject)`. This is so that Python knows how much memory to allocate when calling `PyObject_New()` for a `PyFloatObject`. The full list of fields you can set is defined in `object.h` in the CPython backend: -https://github.com/python/cpython/blob/master/Include/object.h. - -The type object for our `THPTensor` is `THPTensorType`, defined in `csrc/generic/Tensor.cpp`. This object defines the name, size, mapping methods, etc. for a `THPTensor`. - -As an example, let's take a look at the `tp_new` function we set in the `PyTypeObject`: - -```cpp -PyTypeObject THPTensorType = { - PyVarObject_HEAD_INIT(NULL, 0) - ... - THPTensor_(pynew), /* tp_new */ -}; -``` -The `tp_new` function enables object creation. It is responsible for creating (as opposed to initializing) objects of that type and is equivalent to the `__new__()` method at the Python level. The C implementation is a static method that is passed the type being instantiated and any arguments, and returns a newly created object. - -```cpp -static PyObject * THPTensor_(pynew)(PyTypeObject *type, PyObject *args, PyObject *kwargs) -{ - HANDLE_TH_ERRORS - Py_ssize_t num_args = args ? PyTuple_Size(args) : 0; - - THPTensorPtr self = (THPTensor *)type->tp_alloc(type, 0); -// more code below -``` -The first thing our new function does is allocate the `THPTensor`. It then runs through a series of initializations based on the args passed to the function. For example, when creating a `THPTensor` *x* from another `THPTensor` *y*, we set the newly created `THPTensor`'s `cdata` field to be the result of calling `THTensor_(newWithTensor)` with *y*'s underlying `TH` Tensor as an argument. Similar constructors exist for sizes, storages, NumPy arrays, and sequences.
- -** Note that we solely use `tp_new`, and not a combination of `tp_new` and `tp_init` (which corresponds to the `__init__()` function). - -The other important thing defined in Tensor.cpp is how indexing works. PyTorch Tensors support Python's **Mapping Protocol**. This allows us to do things like: -```python -x = torch.Tensor(10).fill_(1) -y = x[3] # y == 1 -x[4] = 2 -# etc. -``` -** Note that this indexing extends to Tensors with more than one dimension. - -We are able to use the `[]`-style notation by defining the three mapping methods described [here](https://docs.python.org/3.7/c-api/typeobj.html#c.PyMappingMethods). - -The most important methods are `THPTensor_(getValue)` and `THPTensor_(setValue)`, which describe how to index a Tensor, returning a new Tensor/Scalar or updating the values of an existing Tensor in place. Read through these implementations to better understand how PyTorch supports basic tensor indexing. - -### Generic Builds (Part One) - -We could spend a ton of time exploring various aspects of the `THPTensor` and how it relates to defining a new Python object. But we still need to see how the `THPTensor_(init)()` function is translated to the `THPIntTensor_init()` we used in our module initialization. How do we take our `Tensor.cpp` file that defines a "generic" Tensor and use it to generate Python objects for all the permutations of types? To put it another way, `Tensor.cpp` is littered with lines of code like: -```cpp -return THPTensor_(New)(THTensor_(new)(LIBRARY_STATE_NOARGS)); -``` -This illustrates both cases we need to make type-specific: - -* Our output code will call `THPTensor_New(...)` in place of `THPTensor_(New)` -* Our output code will call `THTensor_new(...)` in place of `THTensor_(new)` - -In other words, for all supported Tensor types, we need to "generate" source code that has done the above substitutions. This is part of the "build" process for PyTorch. PyTorch relies on Setuptools (https://setuptools.readthedocs.io/en/latest/) for building the package, and we define a `setup.py` file in the top-level directory to customize the build process. - -One component of building an Extension module using Setuptools is to list the source files involved in the compilation. However, our `csrc/generic/Tensor.cpp` file is not listed! So how does the code in this file end up being a part of the end product? - -Recall that we are calling the `THPTensor*` functions (such as `init`) from the directory above `generic`. If we take a look in this directory, there is another file `Tensor.cpp` defined. The last line of this file is important: -```cpp -//generic_include TH torch/csrc/generic/Tensor.cpp -``` -Note that this `Tensor.cpp` file is included in `setup.py`, but it is wrapped in a call to a Python helper function called `split_types`. This function takes a file as input and looks for the "//generic_include" string in the file contents. If it is found, it generates a new output file for each Tensor type, with the following changes: - -- The output file is renamed to `Tensor.cpp` -- The output file is slightly modified as follows: - -```cpp -# Before: -//generic_include TH torch/csrc/generic/Tensor.cpp - -# After: -#define TH_GENERIC_FILE "torch/src/generic/Tensor.cpp" -#include "TH/THGenerateType.h" -``` -Including the header file on the second line has the side effect of including the source code in `Tensor.cpp` with some additional context defined.
Let's take a look at one of the headers: - -```cpp -#ifndef TH_GENERIC_FILE -#error "You must define TH_GENERIC_FILE before including THGenerateFloatType.h" -#endif - -#define real float -#define accreal double -#define TH_CONVERT_REAL_TO_ACCREAL(_val) (accreal)(_val) -#define TH_CONVERT_ACCREAL_TO_REAL(_val) (real)(_val) -#define Real Float -#define THInf FLT_MAX -#define TH_REAL_IS_FLOAT -#line 1 TH_GENERIC_FILE -#include TH_GENERIC_FILE -#undef accreal -#undef real -#undef Real -#undef THInf -#undef TH_REAL_IS_FLOAT -#undef TH_CONVERT_REAL_TO_ACCREAL -#undef TH_CONVERT_ACCREAL_TO_REAL - -#ifndef THGenerateManyTypes -#undef TH_GENERIC_FILE -#endif -``` - -What this is doing is bringing in the code from the generic `Tensor.cpp` file and surrounding it with the following macro definitions. For example, we define real as a float, so any code in the generic Tensor implementation that refers to something as a real will have that real replaced with a float. In the corresponding file `THGenerateIntType.h`, the same macro would replace `real` with `int`. - -These output files are returned from `split_types` and added to the list of source files, so we can see how the `.cpp` code for different types is created. - -There are a few things to note here: First, the `split_types` function is not strictly necessary. We could wrap the code in `Tensor.cpp` in a single file, repeating it for each type. The reason we split the code into separate files is to speed up compilation. Second, what we mean when we talk about the type replacement (e.g. replace real with a float) is that the C preprocessor will perform these substitutions during compilation. Merely surrounding the source code with these macros has no side effects until preprocessing. - -### Generic Builds (Part Two) - -Now that we have source files for all the Tensor types, we need to consider how the corresponding header declarations are created, and also how the conversions from `THTensor_(method)` and `THPTensor_(method)` to `THTensor_method` and `THPTensor_method` work. For example, `csrc/generic/Tensor.h` has declarations like: -```cpp -THP_API PyObject * THPTensor_(New)(THTensor *ptr); -``` -We use the same strategy for generating code in the source files for the headers. In `csrc/Tensor.h`, we do the following: -```cpp -#include "generic/Tensor.h" -#include - -#include "generic/Tensor.h" -#include -``` -This has the same effect, where we draw in the code from the generic header, wrapped with the same macro definitions, for each type. The only difference is that the resulting code is contained all within the same header file, as opposed to being split into multiple source files. - -Lastly, we need to consider how we "convert" or "substitute" the function types. If we look in the same header file, we see a bunch of `#define` statements, including: -```cpp -#define THPTensor_(NAME) TH_CONCAT_4(THP,Real,Tensor_,NAME) -``` -This macro says that any string in the source code matching the format `THPTensor_(NAME)` should be replaced with `THPRealTensor_NAME`, where Real is derived from whatever the symbol Real is `#define`'d to be at the time. Because our header code and source code is surrounded by macro definitions for all the types as seen above, after the preprocessor has run, the resulting code is what we would expect. The code in the `TH` library defines the same macro for `THTensor_(NAME)`, supporting the translation of those functions as well. In this way, we end up with header and source files with specialized code. 
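
To make the `split_types` behavior described above more tangible, here is a simplified Python sketch of the idea: scan a source file for the `//generic_include` marker and emit one specialized copy per type. It is an illustration only; the real helper lives in PyTorch's build tooling and differs in details such as file naming.

```python
# Simplified sketch of the split_types idea, not the actual build code.
TYPES = ["Double", "Float", "Half", "Long", "Int", "Short", "Char", "Byte"]

def split_types_sketch(path):
    with open(path) as f:
        src = f.read()

    marker = "//generic_include TH torch/csrc/generic/Tensor.cpp"
    if marker not in src:
        return [path]  # nothing to specialize

    outputs = []
    for t in TYPES:
        # Swap the marker for a per-type TH_GENERIC_FILE definition plus the
        # corresponding THGenerate<Type>Type.h include, as described above.
        specialized = src.replace(
            marker,
            '#define TH_GENERIC_FILE "torch/csrc/generic/Tensor.cpp"\n'
            f'#include "TH/THGenerate{t}Type.h"',
        )
        out_path = f"Tensor{t}.cpp"  # hypothetical output name
        with open(out_path, "w") as f:
            f.write(specialized)
        outputs.append(out_path)
    return outputs
```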
- -#### Module Objects and Type Methods - -Now we have seen how we have wrapped `TH`'s Tensor definition in `THP`, and generated THP methods such as `THPFloatTensor_init(...)`. Now we can explore what the above code actually does in terms of the module we are creating. The key line in `THPTensor_(init)` is: -```cpp -# THPTensorBaseStr, THPTensorType are also macros that are specific -# to each type -PyModule_AddObject(module, THPTensorBaseStr, (PyObject *)&THPTensorType); -``` -This function registers our Tensor objects to the extension module, so we can use THPFloatTensor, THPIntTensor, etc. in our Python code. - -Just being able to create Tensors isn't very useful - we need to be able to call all the methods that `TH` defines. A simple example shows calling the in-place `zero_` method on a Tensor. -```python -x = torch.FloatTensor(10) -x.zero_() -``` -Let's start by seeing how we add methods to newly defined types. One of the fields in the "type object" is `tp_methods`. This field holds an array of method definitions (`PyMethodDef`s) and is used to associate methods (and their underlying C/C++ implementations) with a type. Suppose we wanted to define a new method on our `PyFloatObject` that replaces the value. We could implement this as follows: -```cpp -static PyObject * replace(PyFloatObject *self, PyObject *args) { - double val; - if (!PyArg_ParseTuple(args, "d", &val)) - return NULL; - self->ob_fval = val; - Py_RETURN_NONE -} -``` -This is equivalent to the Python method: -```python -def replace(self, val): - self.ob_fval = val -``` -It is instructive to read more about how defining methods works in CPython. In general, methods take as the first parameter the instance of the object, and optionally parameters for the positional arguments and keyword arguments. This static function is registered as a method on our float: -```cpp -static PyMethodDef float_methods[] = { - {"replace", (PyCFunction)replace, METH_VARARGS, - "replace the value in the float" - }, - {NULL} /* Sentinel */ -} -``` -This registers a method called replace, which is implemented by the C function of the same name. The `METH_VARARGS` flag indicates that the method takes a tuple of arguments representing all the arguments to the function. This array is set to the `tp_methods` field of the type object, and then we can use the `replace` method on objects of that type. - -We would like to be able to call all of the methods for `TH` tensors on our `THP` tensor equivalents. However, writing wrappers for all of the `TH` methods would be time-consuming and error prone. We need a better way to do this. - -### PyTorch cwrap - -PyTorch implements its own cwrap tool to wrap the `TH` Tensor methods for use in the Python backend. We define a `.cwrap` file containing a series of C method declarations in our custom [YAML format](http://yaml.org). The cwrap tool takes this file and outputs `.cpp` source files containing the wrapped methods in a format that is compatible with our `THPTensor` Python object and the Python C extension method calling format. This tool is used to generate code to wrap not only `TH`, but also `CuDNN`. It is defined to be extensible. - -An example YAML "declaration" for the in-place `addmv_` function is as follows: -``` -[[ - name: addmv_ - cname: addmv - return: self - arguments: - - THTensor* self - - arg: real beta - default: AS_REAL(1) - - THTensor* self - - arg: real alpha - default: AS_REAL(1) - - THTensor* mat - - THTensor* vec -]] -``` -The architecture of the cwrap tool is very simple. 
It reads in a file, and then processes it with a series of **plugins.** See `tools/cwrap/plugins/__init__.py` for documentation on all the ways a plugin can alter the code. - -The source code generation occurs in a series of passes. First, the YAML "declaration" is parsed and processed. Then the source code is generated piece-by-piece - adding things like argument checks and extractions, defining the method header, and the actual call to the underlying library such as `TH`. Finally, the cwrap tool allows for processing the entire file at a time. The resulting output for `addmv_` can be [explored here](https://gist.github.com/killeent/c00de46c2a896335a52552604cc4d74b). - -In order to interface with the CPython backend, the tool generates an array of `PyMethodDef`s that can be stored or appended to the `THPTensor`'s `tp_methods` field. - -In the specific case of wrapping Tensor methods, the build process first generates the output source file from `TensorMethods.cwrap`. This source file is `#include`'d in the generic Tensor source file. This all occurs before the preprocessor does its magic. As a result, all of the method wrappers that are generated undergo the same pass as the `THPTensor` code above. Thus a single generic declaration and definition is specialized for each type as well. - -### Putting It All Together - -So far, we have shown how we extend the Python interpreter to create a new extension module, how such a module defines our new `THPTensor` type, and how we can generate source code for Tensors of all types that interface with `TH`. Briefly, we will touch on compilation. - -Setuptools allows us to define an Extension for compilation. The entire `torch._C` extension is compiled by collecting all of the source files, header files, libraries, etc. and creating a setuptools `Extension`. Then setuptools handles building the extension itself. I will explore the build process more in a subsequent post. - -To summarize, let's revisit our four questions: - -- **How does PyTorch extend the Python interpreter to define a Tensor type that can be manipulated from Python code?** - -It uses CPython's framework for extending the Python interpreter and defining new types, while taking special care to generate code for all types. - -- **How does PyTorch wrap the C libraries that actually define the Tensor's properties and methods?** - -It does so by defining a new type, `THPTensor`, that is backed by a `TH` Tensor. Function calls are forwarded to this tensor via the CPython backend's conventions. - -- **How does PyTorch cwrap work to generate code for Tensor methods?** - -It takes our custom YAML-formatted code and generates source code for each method by processing it through a series of steps using a number of plugins. - -- **How does PyTorch's build system take all of these components to compile and generate a workable application?** - -It takes a bunch of source/header files, libraries, and compilation directives to build an extension using Setuptools. - -This is just a snapshot of parts of the build system for PyTorch. There is more nuance, and detail, but I hope this serves as a gentle introduction to a lot of the components of our Tensor library. 
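
Before moving on to the resources, here is a toy Python sketch intended to make the cwrap flow described earlier more concrete: it pulls the YAML declarations out of the `[[ ... ]]` blocks and emits a skeleton wrapper signature per declaration. It is purely illustrative and is not the real tool in `tools/cwrap`, which works through a plugin pipeline.

```python
import yaml

def parse_declarations(text):
    """Toy parser: extract the YAML blocks delimited by [[ and ]]."""
    return [yaml.safe_load(block.split("]]")[0]) for block in text.split("[[")[1:]]

def emit_stub(decl):
    """Emit a skeleton wrapper signature for one declaration (illustration only)."""
    return (f"PyObject * THPTensor_({decl['name']})"
            "(PyObject *self, PyObject *args, PyObject *kwargs);")

example = """
[[
  name: zero_
  cname: zero
  return: self
  arguments:
    - THTensor* self
]]
"""

for decl in parse_declarations(example):
    print(emit_stub(decl))
```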
- -### Resources: - - - is invaluable for understanding how to write C/C++ Extension to Python diff --git a/_posts/2017-6-27-a-tour-of-pytorch-internals-2.md b/_posts/2017-6-27-a-tour-of-pytorch-internals-2.md deleted file mode 100644 index a396981dc278..000000000000 --- a/_posts/2017-6-27-a-tour-of-pytorch-internals-2.md +++ /dev/null @@ -1,587 +0,0 @@ ---- -layout: blog_detail -title: "PyTorch Internals Part II - The Build System" -author: "Trevor Killeen" -date: 2017-06-27 12:00:00 -0500 -redirect_from: /2017/06/27/Internals2.html ---- - -In the first [post]({{ site.baseurl }}{% link _posts/2017-5-11-a-tour-of-pytorch-internals-1.md %}) I explained how we generate a `torch.Tensor` object that you can use in your Python interpreter. Next, I will explore the build system for PyTorch. The PyTorch codebase has a variety of components: - - - The core Torch libraries: TH, THC, THNN, THCUNN - - Vendor libraries: CuDNN, NCCL - - Python Extension libraries - - Additional third-party libraries: NumPy, MKL, LAPACK - -How does a simple invocation of `python setup.py install` do the work that allows you to call `import torch` and use the PyTorch library in your code? - -The first part of this document will explain the build process from and end-user point of view. This will explain how we take the components above to build the library. The second part of the document will be important for PyTorch developers. It will document ways to improve your iteration speed by building only a subset of the code that you are working on. - -### Setuptools and PyTorch's setup( ) function - -Python uses [Setuptools](https://setuptools.readthedocs.io/en/latest/index.html) to build the library. Setuptools is an extension to the original distutils system from the core Python library. The core component of Setuptools is the `setup.py` file which contains all the information needed to build the project. The most important function is the `setup()` function which serves as the main entry point. Let's take a look at the one in PyTorch: - -```python -setup(name="torch", version=version, - description="Tensors and Dynamic neural networks in Python with strong GPU acceleration", - ext_modules=extensions, - cmdclass={ - 'build': build, - 'build_py': build_py, - 'build_ext': build_ext, - 'build_deps': build_deps, - 'build_module': build_module, - 'develop': develop, - 'install': install, - 'clean': clean, - }, - packages=packages, - package_data={'torch': [ - 'lib/*.so*', 'lib/*.dylib*', - 'lib/torch_shm_manager', - 'lib/*.h', - 'lib/include/TH/*.h', 'lib/include/TH/generic/*.h', - 'lib/include/THC/*.h', 'lib/include/THC/generic/*.h']}, - install_requires=['pyyaml'], - ) -``` - -The function is composed entirely of keyword arguments, which serve two purposes: - -- Metadata (e.g. name, description, version) -- The contents of the package - -We are concerned with #2. Let's break down the individual components: - - - **ext_modules**: Python modules are either "pure" modules, containing only Python code, or "extension" modules written in the low-level language of the Python implementation. Here we are listing the extension modules in the build, including the main `torch._C` library that contains our Python Tensor - - **cmdclass**: When using the `setup.py` script from the command line, the user must specify one or more "commands", code snippets that perform a specific action. For example, the "install" command builds and installs the package. 
This mapping routes specific commands to functions in `setup.py` that implement them - - **packages**: The list of packages in the project. These are "pure" - i.e. they only contain Python code. These are defined elsewhere in `setup.py` - - **package_data**: Additional files that need to be installed into a package: in this case the header files and shared libraries that the build will generate must be included in our installation - - **install_requires**: In order to build PyTorch, we need pyyaml. Setuptools will handle making sure that pyyaml will be available, downloading and installing it if necessary - -We will consider these components in more detail, but for now it is instructive to look at the end product of an installation -- i.e. what Setuptools does after building the code. - -### site_packages - -Third party packages are by default installed into the `lib//site_packages` directory associated with your Python binary. For example, because I am using an [Miniconda](https://conda.io/miniconda.html) environment, my Python binary is found at: - -```bash -(p3) killeent@devgpu047:pytorch (master)$ which python -~/local/miniconda2/envs/p3/bin/python -``` -And thus packages are installed into: - -```bash -/home/killeent/local/miniconda2/envs/p3/lib/python3.6/site-packages -``` -I installed PyTorch, and let's take a look into torch folder in site-packages: - -```bash -(p3) killeent@devgpu047:site-packages$ cd torch -(p3) killeent@devgpu047:torch$ ls -autograd backends _C.cpython-36m-x86_64-linux-gnu.so cuda distributed _dl.cpython-36m-x86_64-linux-gnu.so functional.py __init__.py legacy lib multiprocessing nn optim __pycache__ serialization.py _six.py sparse storage.py _tensor_docs.py tensor.py _tensor_str.py _thnn _torch_docs.py utils _utils.py version.py -``` - -Note that everything we would expect to be here is here: - - - All the "pure" packages are here [todo print packages from setup.py to explain] - - The extension libraries are here - the ._C* and ._dl* shared libraries - - The package_data is here: the contents of lib/ match exactly what we described in the setup function: - -```bash -(p3) killeent@devgpu047:torch$ ls lib/ -include libnccl.so.1 libTHC.so.1 libTHCUNN.so.1 libTHNN.so.1 libTH.so.1 THCUNN.h torch_shm_manager libnccl.so libshm.so libTHCS.so.1 libTHD.so.1 libTHPP.so.1 libTHS.so.1 THNN.h -``` - -The Python interpreter looks into `site_packages` during an import. If we call `import torch` in our Python code it will find the module here and initialize and import it. You can read more about the import system [here](https://docs.python.org/3/tutorial/modules.html). - -### Building Individual Parts - -Next, we will look at the various individual components of the build from start to finish. This will illustrate how we combine all the code we mentioned in the introduction. 
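
Before walking through PyTorch's build piece by piece, it may help to see the same setuptools machinery in miniature. The sketch below is a generic, made-up example (the package and file names are not from PyTorch) showing how `ext_modules`, `package_data`, and a `cmdclass` override fit together.

```python
# A toy setup.py, not PyTorch's: an Extension entry for a C extension module
# and a cmdclass override that hooks extra work into the standard 'install'
# command. All names here (toypkg, _C.c) are made up for illustration.
import setuptools
import setuptools.command.install
from setuptools import Extension, setup

class install(setuptools.command.install.install):
    def run(self):
        # A project like PyTorch uses a hook like this to build its native
        # dependencies before the normal install proceeds.
        print("building dependencies first...")
        setuptools.command.install.install.run(self)

setup(
    name="toypkg",
    version="0.1",
    packages=setuptools.find_packages(),
    ext_modules=[Extension("toypkg._C", sources=["toypkg/_C.c"])],
    cmdclass={"install": install},
    package_data={"toypkg": ["lib/*.so*"]},
    install_requires=["pyyaml"],
)
```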
- -### Backend Torch and Vendor Libraries - -Let's take a look at the `install` cmd override in PyTorch's `setup.py`: - -```python -class install(setuptools.command.install.install): - - def run(self): - if not self.skip_build: - self.run_command('build_deps') - setuptools.command.install.install.run(self) -``` - -We note the first thing it does is run a command called "build_deps" - let's take a look at it's `run()` method: - -```python -def run(self): - from tools.nnwrap import generate_wrappers as generate_nn_wrappers - build_all_cmd = ['bash', 'torch/lib/build_all.sh'] - if WITH_CUDA: - build_all_cmd += ['--with-cuda'] - if WITH_NCCL and not SYSTEM_NCCL: - build_all_cmd += ['--with-nccl'] - if WITH_DISTRIBUTED: - build_all_cmd += ['--with-distributed'] - if subprocess.call(build_all_cmd) != 0: - sys.exit(1) - generate_nn_wrappers() -``` - -Here we note that that we have a shell script `build_all.sh` in the `torch/lib/` directory. This script is configurable by whether we are on a system with CUDA enabled, the NCCL library enabled, and PyTorch's distributed library enabled. - -Let's take a look in `torch/lib`: - -```bash -(p3) killeent@devgpu047:lib (master)$ ls -build_all.sh libshm nccl README.md TH THC THCS THCUNN THD THNN THPP THS -``` - -Here we see the directories for all the backend libraries. `TH`, `THC`, `THNN`, `THCUNN`, and `nccl` are [git subtrees](https://developer.atlassian.com/blog/2015/05/the-power-of-git-subtree/) that are in sync with the libraries in e.g. [github.com/torch](https://github.com/torch/torch7/tree/master/lib/TH). `THS`, `THCS`, `THD`, `THPP` and `libshm` are libraries specific to PyTorch. All of the libraries contain `CMakeLists.txt` - indicating they are built with CMake. - -The `build_all.sh` is essentially a script that runs the CMake configure step on all of these libraries, and then `make install`. Let's run `./build_all.sh` and see what we are left with: - -```bash -(p3) killeent@devgpu047:lib (master)$ ./build_all.sh --with-cuda --with-nccl --with-distributed -[various CMake output logs] -(p3) killeent@devgpu047:lib (master)$ ls -build build_all.sh include libnccl.so libnccl.so.1 libshm libshm.so libTHC.so.1 libTHCS.so.1 libTHCUNN.so.1 libTHD.so.1 libTHNN.so.1 libTHPP.so.1 libTH.so.1 libTHS.so.1 nccl README.md TH THC THCS THCUNN THCUNN.h THD THNN THNN.h THPP THS tmp_install torch_shm_manager -``` - -Now there are a number of extra things in the directory: - - - Shared library files for each library - - Headers for `THNN` and `THCUNN` - - `build` and `tmp_install` directories - - The `torch_shm_manager` executable - -Let's explore further. In the shell script, we create the `build` directory and a subdir for each library to build: - -```bash -# We create a build directory for the library, which will -# contain the cmake output. $1 is the library to be built - mkdir -p build/$1 - cd build/$1 -``` - -Thus e.g. `build/TH` contains the CMake configuration output including the `Makefile` for building TH, and also the result of running make install in this directory. - -Let's also look at `tmp_install`: - -```bash -(p3) killeent@devgpu047:lib (master)$ ls tmp_install/ -bin include lib share -``` - -`tmp_install` looks like a standard install directory containing binaries, header files and library files. For example, `tmp_install/include/TH` contains all the `TH` headers, and `tmp_install/lib/` contains the `libTH.so.1` file. - -So why have this directory? It is used to compile the libraries that depend on each other. 
For example, the `THC` library depends on the `TH` library and its headers. This is referenced in the build shell script as arguments to the `cmake` command: - -```bash -# install_dir is tmp_install -cmake ... - -DTH_INCLUDE_PATH="$INSTALL_DIR/include" \ - -DTH_LIB_PATH="$INSTALL_DIR/lib" \ -``` - -And indeed if we look at the `THC` library we built: - -```bash -(p3) killeent@devgpu047:lib (master)$ ldd libTHC.so.1 - ... - libTH.so.1 => /home/killeent/github/pytorch/torch/lib/tmp_install/lib/./libTH.so.1 (0x00007f84478b7000) -``` - -The way the `build_all.sh` specifies the include and library paths is a little messy but this is representative of the overall idea. Finally, at the end of the script: - -```bash -# If all the builds succeed we copy the libraries, headers, -# binaries to torch/lib -cp $INSTALL_DIR/lib/* . -cp THNN/generic/THNN.h . -cp THCUNN/generic/THCUNN.h . -cp -r $INSTALL_DIR/include . -cp $INSTALL_DIR/bin/* . -``` - -As we can see, at the end, we copy everything to the top-level `torch/lib` directory - explaining the contents we saw above. We'll see why we do this next: - -### NN Wrappers - -Briefly, let's touch on the last part of the `build_deps` command: `generate_nn_wrappers()`. We bind into the backend libraries using PyTorch's custom `cwrap` tooling, which we touched upon in a previous post. For binding `TH` and `THC` we manually write the YAML declarations for each function. However, due to the relative simplicity of the `THNN` and `THCUNN` libraries, we auto-generate both the cwrap declarations and the resulting C++ code. - -The reason we copy the `THNN.h` and `THCUNN.h` header files into `torch/lib` is that this is where the `generate_nn_wrappers()` code expects these files to be located. `generate_nn_wrappers()` does a few things: - -1. Parses the header files, generating cwrap YAML declarations and writing them to output `.cwrap` files -2. Calls `cwrap` with the appropriate plugins on these `.cwrap` files to generate source code for each -3. 
Parses the headers *a second time* to generate `THNN_generic.h` - a library that takes `THPP` Tensors, PyTorch's "generic" C++ Tensor Library, and calls into the appropriate `THNN`/`THCUNN` library function based on the dynamic type of the Tensor - -If we take a look into `torch/csrc/nn` after running `generate_nn_wrappers()` we can see the output: - -```bash -(p3) killeent@devgpu047:nn (master)$ ls -THCUNN.cpp THCUNN.cwrap THNN.cpp THNN.cwrap THNN_generic.cpp THNN_generic.cwrap THNN_generic.h THNN_generic.inc.h -``` - -For example, the code generates cwrap like: - -``` -[[ - name: FloatBatchNormalization_updateOutput - return: void - cname: THNN_FloatBatchNormalization_updateOutput - arguments: - - void* state - - THFloatTensor* input - - THFloatTensor* output - - type: THFloatTensor* - name: weight - nullable: True - - type: THFloatTensor* - name: bias - nullable: True - - THFloatTensor* running_mean - - THFloatTensor* running_var - - THFloatTensor* save_mean - - THFloatTensor* save_std - - bool train - - double momentum - - double eps -]] -``` - -with corresponding `.cpp`: - -```cpp -extern "C" void THNN_FloatBatchNormalization_updateOutput(void*, THFloatTensor*, THFloatTensor*, THFloatTensor*, THFloatTensor*, THFloatTensor*, THFloatTensor*, THFloatTensor*, THFloatTensor*, bool, double, double); - -PyObject * FloatBatchNormalization_updateOutput(PyObject *_unused, PyObject *args) { - // argument checking, unpacking - PyThreadState *_save = NULL; - try { - Py_UNBLOCK_THREADS; - THNN_FloatBatchNormalization_updateOutput(arg_state, arg_input, arg_output, arg_weight, arg_bias, arg_running_mean, arg_running_var, arg_save_mean, arg_save_std, arg_train, arg_momentum, arg_eps); - Py_BLOCK_THREADS; - Py_RETURN_NONE; - } catch (...) { - if (_save) { - Py_BLOCK_THREADS; - } - throw; - } - - ... -} -``` - -In the `THPP` generated code, the function looks like this: - -```cpp -void BatchNormalization_updateOutput(thpp::Tensor* input, thpp::Tensor* output, thpp::Tensor* weight, thpp::Tensor* bias, thpp::Tensor* running_mean, thpp::Tensor* running_var, thpp::Tensor* save_mean, thpp::Tensor* save_std, bool train, double momentum, double eps) { - // Call appropriate THNN function based on tensor type, whether its on CUDA, etc. -} -``` - -We will look a little more at how these source files are used later. - -### "Building" the Pure Python Modules - -Now that we have built the backend libraries (the "dependencies") we can move forward with building the actual PyTorch code. The next Setuptools command that runs is `build_py`, which is used to build all the "Pure" python modules in our library. These are the "packages" passed to `setup.py`. - -The packages are found using the Setuptools' utility function `find_packages()`: - -```python -packages = find_packages(exclude=('tools.*',)) -['torch', 'torch._thnn', 'torch.autograd', 'torch.backends', 'torch.cuda', 'torch.distributed', 'torch.legacy', 'torch.multiprocessing', 'torch.nn', 'torch.optim', 'torch.sparse', 'torch.utils', 'torch.autograd._functions', 'torch.backends.cudnn', 'torch.legacy.nn', 'torch.legacy.optim', 'torch.nn._functions', 'torch.nn.backends', 'torch.nn.modules', 'torch.nn.parallel', 'torch.nn.utils', 'torch.nn._functions.thnn', 'torch.utils.data', 'torch.utils.ffi', 'torch.utils.serialization', 'torch.utils.trainer', 'torch.utils.backcompat', 'torch.utils.trainer.plugins'] -``` - -As we can see, `find_package` has recursively traversed the `torch` directory, finding all the directory paths that have an `__init__.py` file. 
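
As a rough mental model of what `find_packages()` is doing here, the sketch below walks a tree and reports every directory containing an `__init__.py` as a dotted package name; it is a simplification, not setuptools' actual implementation.

```python
import os

def find_packages_sketch(root="torch", exclude_prefixes=("tools.",)):
    """Simplified stand-in for setuptools.find_packages(), for illustration."""
    packages = []
    for dirpath, _dirnames, filenames in os.walk(root):
        if "__init__.py" in filenames:
            pkg = dirpath.replace(os.sep, ".")
            if not pkg.startswith(exclude_prefixes):
                packages.append(pkg)
    return packages

# Running this from the repository root would yield entries such as
# "torch", "torch.nn", "torch.autograd", and so on.
```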
- -When building with Setuptools, the tool creates a `build` directory in the distribution root, i.e. the same location as the `setup.py` file. Because PyTorch is composed of both "Pure" python modules and Extension Modules, we need to preserve information about the Operating System and Python version used when performing the build. So if we look in my `build` directory, we see: - -```bash -(p3) killeent@devgpu047:pytorch (master)$ ls build -lib.linux-x86_64-3.6 temp.linux-x86_64-3.6 -``` - -This indicates that I've built the project on `linux-x86-64` using Python 3.6. The lib directory contains the library files, while the temp directory contains files generated during the build that aren't needed in the final installation. - -Because "Pure" python modules are just Python code, and don't need to be "compiled", the `build_py` process simply copies files from their locations as found by `find_packages` to the equivalent location in `build/`. So our build output is littered with lines like: - -```bash -copying torch/autograd/_functions/blas.py -> build/lib.linux-x86_64-3.6/torch/autograd/_functions -``` - -We also noted earlier that we could pass files and directories to the `package_data` keyword argument to the main `setup()` function, and that Setuptools would handle copying those files to the installation location. During `build_py`, these files are copied to the `build/` directory, so we also see lines like: - -```bash -copying torch/lib/libTH.so.1 -> build/lib.linux-x86_64-3.6/torch/lib -... -copying torch/lib/include/THC/generic/THCTensor.h -> build/lib.linux-x86_64-3.6/torch/lib/include/THC/generic -``` - -### Building the Extension Modules - -Finally, we need to build the Extension Modules, i.e. the PyTorch modules written in C++ using the CPython backend. This also constitutes the majority of the code logic in `setup.py`. Our overridden `build_ext` Command has some special logic before the extensions themselves are actually built: - -```python -from tools.cwrap import cwrap -from tools.cwrap.plugins.THPPlugin import THPPlugin -from tools.cwrap.plugins.ArgcountSortPlugin import ArgcountSortPlugin -from tools.cwrap.plugins.AutoGPU import AutoGPU -from tools.cwrap.plugins.BoolOption import BoolOption -from tools.cwrap.plugins.KwargsPlugin import KwargsPlugin -from tools.cwrap.plugins.NullableArguments import NullableArguments -from tools.cwrap.plugins.CuDNNPlugin import CuDNNPlugin -from tools.cwrap.plugins.WrapDim import WrapDim -from tools.cwrap.plugins.AssertNDim import AssertNDim -from tools.cwrap.plugins.Broadcast import Broadcast -from tools.cwrap.plugins.ProcessorSpecificPlugin import ProcessorSpecificPlugin - thp_plugin = THPPlugin() - cwrap('torch/csrc/generic/TensorMethods.cwrap', plugins=[ - ProcessorSpecificPlugin(), BoolOption(), thp_plugin, - AutoGPU(condition='IS_CUDA'), ArgcountSortPlugin(), KwargsPlugin(), - AssertNDim(), WrapDim(), Broadcast() - ]) - cwrap('torch/csrc/cudnn/cuDNN.cwrap', plugins=[ - CuDNNPlugin(), NullableArguments() - ]) -``` - -Recall above that I documented that we auto-generated C++ code for calling into the `THNN` etc. libraries. Here is where we bind `TH`, `THC` and `CuDNN`. We take the YAML declarations in `TensorMethods.cwrap`, and use them to generate output C++ source files that contain implementations that work within PyTorch's C++ Ecosystem. 
For example, a simple declaration like zero_: - -``` -[[ - name: zero_ - cname: zero - return: self - arguments: - - THTensor* self -]] -``` - -Generates code like: - -```cpp - PyObject * THPTensor_(zero_)(PyObject *self, PyObject *args, PyObject *kwargs) { - ... - THTensor_(zero)(LIBRARY_STATE arg_self); - ... -} -``` - -In the previous post we documented how these functions are tied to specific Tensor types, so I won't expand on that there. For the build process its enough to know that these C++ files are generated prior to the extension being built, because these source files are used during Extension compilation. - -### Specifying the Extensions - -Unlike pure modules, it’s not enough just to list modules or packages and expect the Setuptools to go out and find the right files; you have to specify the extension name, source file(s), and any compile/link requirements (include directories, libraries to link with, etc.). - -The bulk (200~ LOC at the time of this writing) of the `setup.py` goes into specifying how to build these Extensions. Here, some of the choices we make in `build_all.sh` begin to make sense. For example, we saw that our build script specified a `tmp_install` directory where we installed our backend libraries. In our `setup.py` code, we reference this directory when adding to the list of directories containing header files to include: - -```python -# tmp_install_path is torch/lib/tmp_install -include_dirs += [ - cwd, - os.path.join(cwd, "torch", "csrc"), - tmp_install_path + "/include", - tmp_install_path + "/include/TH", - tmp_install_path + "/include/THPP", - tmp_install_path + "/include/THNN", -``` - -Similarly, we copied the shared object libraries to `torch/csrc` at the end of the `build_all.sh` script. We reference these locations directly in our `setup.py` code when identifying libraries that we may link against: - -```python -# lib_path is torch/lib -TH_LIB = os.path.join(lib_path, 'libTH.so.1') -THS_LIB = os.path.join(lib_path, 'libTHS.so.1') -THC_LIB = os.path.join(lib_path, 'libTHC.so.1') -THCS_LIB = os.path.join(lib_path, 'libTHCS.so.1') -THNN_LIB = os.path.join(lib_path, 'libTHNN.so.1') -# ... -``` - -Let's consider how we build the main `torch._C` Extension Module: - -```python -C = Extension("torch._C", - libraries=main_libraries, - sources=main_sources, - language='c++', - extra_compile_args=main_compile_args + extra_compile_args, - include_dirs=include_dirs, - library_dirs=library_dirs, - extra_link_args=extra_link_args + main_link_args + [make_relative_rpath('lib')], - ) -``` - - - The main libraries are all the libraries we link against. This includes things like `shm`, PyTorch's shared memory management library, and also system libraries like `cudart` and `cudnn`. Note that the `TH` libraries *are not* listed here - - The main sources are the C++ files that make up the C++ backend for PyTorch - - The compile args are various flags that configure compilation. For example, we might want to add debug flags when compiling in debug mode - - The include dirs are the paths to all the directories containing header files. This is also another example where the `build_all.sh` script is important - for example, we look for the `TH` header files in `torch/lib/tmp_install/include/TH` - which is the install location we specified with our CMake configuration - - The library dirs are directories to search for shared libraries at link time. 
For example, we include `torch/lib` - the location we copied our `.so` files to at the end of `build_all.sh`, but also the paths to the CUDA and CuDNN directories - - The link arguments are used when linking object files together to create the extension. In PyTorch, this includes more *normal* options like decided to link `libstdc++` statically. However, there is one key component: **this is where we link the backend TH libraries**. Note that we have lines like: - -```python -# The explicit paths to .so files we described above -main_link_args = [TH_LIB, THS_LIB, THPP_LIB, THNN_LIB] -``` - -You might be wondering why we do this as opposed to adding these libraries to the list we pass to the `libraries` keyword argument. After all, that is a list of libraries to link against. The issue is that Lua Torch installs often set the `LD_LIBRARY_PATH` variable, and thus we could mistakenly link against a `TH` library built for Lua Torch, instead of the library we have built locally. This would be problematic because the code could be out of date, and also there are various configuration options for Lua Torch's `TH` that would not play nicely with PyTorch. - -As such, we manually specify the paths to the shared libraries we generated directly to the linker. - -There are other extensions needed to power PyTorch and they are built in a similar way. The Setuptools library invokes the C++ compiler and linker to build all of these extensions. If the builds succeed, we have successfully *built* the PyTorch library and we can move on to installation. - -### Installation - -After building has finished, installation is quite simple. We simply have to copy everything from our `build/lib.linux-x86_64-3.6` directory to the appropriate installation directory. Recall that we noted above that this directory is the `site_packages` directory associated with our Python binary. As a result, we see lines like: - -```bash -running install_lib -creating /home/killeent/local/miniconda2/envs/p3/lib/python3.6/site-packages/torch -copying build/lib.linux-x86_64-3.6/torch/_C.cpython-36m-x86_64-linux-gnu.so -> /home/killeent/local/miniconda2/envs/p3/lib/python3.6/site-packages/torch -copying build/lib.linux-x86_64-3.6/torch/_dl.cpython-36m-x86_64-linux-gnu.so -> /home/killeent/local/miniconda2/envs/p3/lib/python3.6/site-packages/torch -creating /home/killeent/local/miniconda2/envs/p3/lib/python3.6/site-packages/torch/_thnn -copying build/lib.linux-x86_64-3.6/torch/_thnn/_THNN.cpython-36m-x86_64-linux-gnu.so -> /home/killeent/local/miniconda2/envs/p3/lib/python3.6/site-packages/torch/_thnn -copying build/lib.linux-x86_64-3.6/torch/_thnn/_THCUNN.cpython-36m-x86_64-linux-gnu.so -> /home/killeent/local/miniconda2/envs/p3/lib/python3.6/site-packages/torch/_thnn -``` - -Finally lets power up the Python interpreter. When the Python interpreter executes an import statement, it searches for Python code and extension modules along a search path. A default value for the path is configured into the Python binary when the interpreter is built. - -```bash -# note we are now in my home directory -(p3) killeent@devgpu047:~$ python -Python 3.6.1 |Continuum Analytics, Inc.| (default, Mar 22 2017, 19:54:23) -[GCC 4.4.7 20120313 (Red Hat 4.4.7-1)] on linux -Type "help", "copyright", "credits" or "license" for more information. 
->>> import sys ->>> sys.path -['', '/home/killeent/local/miniconda2/envs/p3/lib/python36.zip', '/home/killeent/local/miniconda2/envs/p3/lib/python3.6', '/home/killeent/local/miniconda2/envs/p3/lib/python3.6/lib-dynload', '/home/killeent/.local/lib/python3.6/site-packages', '/home/killeent/local/miniconda2/envs/p3/lib/python3.6/site-packages', '/home/killeent/github/pytorch', '/home/killeent/local/miniconda2/envs/p3/lib/python3.6/site-packages/setuptools-27.2.0-py3.6.egg'] -``` - -As we can see, the `site-packages` directory we copied our PyTorch installation to is part of search path. Now let's load the `torch` module and see its location: - -```python ->>> import torch ->>> import inspect ->>> inspect.getfile(torch) -'/home/killeent/local/miniconda2/envs/p3/lib/python3.6/site-packages/torch/__init__.py' -``` - -As we can see, we have loaded the module from `site_packages` as expected - and our build and installation is successful! - -**Note:** Python prepends the empty string to `sys.path` to represent the current working directory - making it the first place we search for a module. So if we run Python from the pytorch directory, we would accidentally load the local version of PyTorch rather than our installed version. This is something to watch out for. - -### Addendum - Developer Efficiency, 3rd Party Libraries, Things I Didn't Cover - -The entire installation loop for PyTorch can be quite time-consuming. On my devserver, it takes around 5 minutes for an installation from source. Often times, when developing PyTorch, we only want to work on a subset of the entire project, and re-build only that subset in order to test changes. Fortunately, our build system enables this. - -### Setuptools Develop Mode - -The main tool that supports this is Setuptools `develop` command. The documentation states that: - ->This command allows you to deploy your project’s source for use in one or more “staging areas” where it will be available for importing. This deployment is done in such a way that changes to the project source are immediately available in the staging area(s), without needing to run a build or install step after each change. - -But how does it work? Suppose we run `python setup.py build develop` in the PyTorch directory. The `build` command is run, building our dependencies (`TH`, `THPP`, etc.) and the extension libraries. However, if we look inside `site-packages`: - -```bash -(p3) killeent@devgpu047:site-packages$ ls -la torch* --rw-r--r--. 1 killeent users 31 Jun 27 08:02 torch.egg-link -``` - -Looking at the contents of the `torch.egg-link` file, it simply references the PyTorch directory: - -```bash -(p3) killeent@devgpu047:site-packages$ cat torch.egg-link -/home/killeent/github/pytorch -``` - -If we navigate back to the PyTorch directory, we see there is a new directory `torch.egg-info`: - -```bash -(p3) killeent@devgpu047:pytorch (master)$ ls -la torch.egg-info/ -total 28 -drwxr-xr-x. 2 killeent users 4096 Jun 27 08:09 . -drwxr-xr-x. 10 killeent users 4096 Jun 27 08:01 .. --rw-r--r--. 1 killeent users 1 Jun 27 08:01 dependency_links.txt --rw-r--r--. 1 killeent users 255 Jun 27 08:01 PKG-INFO --rw-r--r--. 1 killeent users 7 Jun 27 08:01 requires.txt --rw-r--r--. 1 killeent users 16080 Jun 27 08:01 SOURCES.txt --rw-r--r--. 1 killeent users 12 Jun 27 08:01 top_level.txt -``` - -This file contains metadata about the PyTorch project. 
For example, `requirements.txt` lists all of the dependencies for setting up PyTorch: - -```bash -(p3) killeent@devgpu047:pytorch (master)$ cat torch.egg-info/requires.txt -pyyaml -``` - -Without going into too much detail, `develop` allows us to essentially treat the PyTorch repo itself as if it were in `site-packages`, so we can import the module and it just works: - -```bash -(p3) killeent@devgpu047:~$ python -Python 3.6.1 |Continuum Analytics, Inc.| (default, Mar 22 2017, 19:54:23) -[GCC 4.4.7 20120313 (Red Hat 4.4.7-1)] on linux -Type "help", "copyright", "credits" or "license" for more information. ->>> import torch ->>> torch.__file__ -'/home/killeent/github/pytorch/torch/__init__.py' -``` - -As a result, the following consequences hold: - -- If we change a Python source file, the changes are automatically picked up, and we don't have to run any commands to let the Python interpreter *see* this change -- If we change a C++ Source File in one of the extension libraries, we can re-run the `develop` command, it will re-build the extension - -Thus we can develop the PyTorch codebases seamlessly, and test our changes in an easy way. - -#### Working on the Dependency Libraries - -If we are working on the dependencies (e.g. `TH`, `THPP`, etc.) we can re-build our changes more quickly by simply running the `build_deps` command directly. This will automatically call into `build_all.sh` to re-build our libraries, and copy the generated libraries appropriately. If we are using Setuptools `develop` mode, we will be using the local extension library built in the PyTorch directory. Because we have specified the paths to the shared libraries when compiling our Extension Libraries, the changes will be picked up: - -```bash -# we are using the local extension -(p3) killeent@devgpu047:~$ python -Python 3.6.1 |Continuum Analytics, Inc.| (default, Mar 22 2017, 19:54:23) -[GCC 4.4.7 20120313 (Red Hat 4.4.7-1)] on linux -Type "help", "copyright", "credits" or "license" for more information. ->>> import torch ->>> torch._C.__file__ -'/home/killeent/github/pytorch/torch/_C.cpython-36m-x86_64-linux-gnu.so' - -# it references the local shared object library we just re-built -(p3) killeent@devgpu047:~$ ldd /home/killeent/github/pytorch/torch/_C.cpython-36m-x86_64-linux-gnu.so -# ... -libTH.so.1 => /home/killeent/github/pytorch/torch/lib/libTH.so.1 (0x00007f543d0e2000) -# ... -``` - -As such, we can test any changes here without having to do a full rebuild. - -#### 3rd Party Libraries - -PyTorch has dependencies on some 3rd party libraries. The usual mechanism for using these libraries is to install them via Anaconda, and then link against them. For example, we can use the `mkl` library with PyTorch by doing: - -```bash -# installed to miniconda2/envs/p3/lib/libmkl_intel_lp64.so -conda install mkl -``` - -And then as long as we have the path to this `lib` directory on our `$CMAKE_PREFIX_PATH`, it will successfully find this library when compiling: - -```bash -# in the site-packages dir -(p3) killeent@devgpu047:torch$ ldd _C.cpython-36m-x86_64-linux-gnu.so -# ... -libmkl_intel_lp64.so => /home/killeent/local/miniconda2/envs/p3/lib/libmkl_intel_lp64.so (0x00007f3450bba000) -# ... 
-``` - -### Not Covered, But Also Relevant - -- How `ccache` is used to speed up build times -- How PyTorch's top-level `__init__.py` file handles the initial module import and pulling together all the various modules and extension libraries -- The CMake build system, how the backend libraries are configured and built with CMake diff --git a/_posts/2018-01-19-a-year-in.md b/_posts/2018-01-19-a-year-in.md deleted file mode 100644 index 86647d110bce..000000000000 --- a/_posts/2018-01-19-a-year-in.md +++ /dev/null @@ -1,177 +0,0 @@ ---- -layout: blog_detail -title: "PyTorch, a year in...." -author: "The PyTorch Team" -date: 2018-01-19 12:00:00 -0500 -redirect_from: /2018/01/19/a-year-in.html ---- - -Today marks 1 year since PyTorch was released publicly. It's been a wild ride — our quest to build a flexible deep learning research platform. Over the last year, we've seen an amazing community of people using, contributing to and evangelizing PyTorch — thank you for the love. - -Looking back, we wanted to summarize PyTorch over the past year: the progress, the news and highlights from the community. - -## Community - -We've been blessed with a strong organic community of researchers and engineers who fell in love with PyTorch. The core team has engineers and researchers from multiple countries, companies and universities, and we couldn't have made PyTorch what it is without each contribution. - - -### Research papers, packages and Github - -Within days of release, users from the community started to implement their favorite research papers in PyTorch and release the code on Github. Open-source code is a primary and essential tool for researchers today. - -Folks came together to create [torchtext](https://github.com/pytorch/text), [torchvision](https://github.com/pytorch/vision) and [torchaudio](https://github.com/pytorch/audio) packages to help facilitate and democratize research in different domains. - -The first community package based on PyTorch came from Brandon Amos, [titled Block](https://twitter.com/brandondamos/status/828652480573607937), and helped with easier manipulation of block matrices. The Locus Lab at **CMU** subsequently went on to [publish PyTorch packages](https://github.com/locuslab) and implementations for most of their research. The first research paper code came from Sergey Zagoruyko titled [Paying more attention to attention](https://twitter.com/PyTorch/status/822561885744726016). - -Jun-Yan Zhu, Taesung Park, Phillip Isola, Alyosha Efros and team from **U.C.Berkeley** released the hugely popular [Cycle-GAN and pix2pix](https://github.com/junyanz/pytorch-CycleGAN-and-pix2pix) which does image to image transforms. - -
        - -The researchers at **HarvardNLP** and **Systran** started developing and improving [OpenNMT in PyTorch](https://github.com/OpenNMT/OpenNMT-py), seeded by initial reimplementation of the [Lua]Torch code from Adam Lerer at Facebook. - -The MagicPony team at **Twitter** contributed implementations of their [Super-resolution work early on into PyTorch's examples](https://twitter.com/Rob_Bishop/status/821793080877588480). - -**Salesforce Research** released several packages, including their highlight release of [PyTorch-QRNN](https://twitter.com/Smerity/status/917472260851560448), a type of RNN that is 2x to 17x faster than standard LSTMs optimized by CuDNN. James Bradbury and team form one of the most active and engaging forces in the PyTorch community. - - - - -Researchers from **Uber**, **Northeastern** and **Stanford** came together to form an active probabilistic programming community around their packages [Pyro](http://pyro.ai/) and [ProbTorch](https://github.com/probtorch/probtorch). They are actively developing the torch.distributions core package. This community is so active and fast-moving, we had our first pytorch-probabilistic-programming meetup at NIPS 2017 with Fritz Obermeyer, Noah Goodman, Jan-Willem van de Meent, Brooks Paige, Dustin Tran and 22 additional attendees discussing how to make the world bayesian. - -
        - -**NVIDIA** Researchers released three high-quality repositories that implemented [pix2pix-HD](https://github.com/NVIDIA/pix2pixHD), [Sentiment Neuron](https://github.com/NVIDIA/sentiment-discovery) and [FlowNet2](https://github.com/NVIDIA/flownet2-pytorch) papers. Their analysis of scalability of different [Data Parallel models in PyTorch](https://github.com/NVIDIA/sentiment-discovery/blob/master/analysis/scale.md) was helpful to the community. - -
        - -The Allen Institute for AI released [AllenNLP](http://allennlp.org/) which includes several state-of-the-art models in NLP — reference implementations and easy to use [web demos](http://demo.allennlp.org/machine-comprehension) for standard NLP tasks. - -
        - -We also had our first Kaggle winning team grt123 in July. They won the DataScience Bowl 2017 on Lung Cancer detection and [subsequently released their PyTorch implementations](https://twitter.com/PyTorch/status/881573658166267904). - -On the visualization front, Tzu-Wei Huang implemented a [TensorBoard-PyTorch plugin](https://github.com/lanpa/tensorboard-pytorch) and Facebook AI Research released PyTorch compatibility for their [visdom](https://github.com/facebookresearch/visdom) visualization package. - -
        - -Lastly, **Facebook AI Research** released several projects such as [ParlAI, fairseq-py, VoiceLoop and FaderNetworks](https://github.com/facebookresearch/) that implemented cutting-edge models and interfaced datasets in multiple domains. - -There are countless good projects that we haven't highlighted for the lack of space, you can find a curated list [here](https://github.com/soumith?tab=stars). - -We would also like to give a huge shout-out to folks who actively help others out on the Forums, especially [ptrblck](https://discuss.pytorch.org/u/ptrblck/summary), [jpeg729](https://discuss.pytorch.org/u/jpeg729/summary), [QuantScientist](https://discuss.pytorch.org/u/quantscientist/summary), [albanD](https://discuss.pytorch.org/u/alband/summary), [Thomas Viehmann](https://discuss.pytorch.org/u/tom/summary) and [chenyuntc](https://discuss.pytorch.org/u/chenyuntc/summary). You are providing an invaluable service, thank you so much! - -## Metrics - -In terms of sheer numbers, - -* 87,769 lines of Python code on github that [import torch](https://github.com/search?l=Python&q=import+torch&type=Code) -* [3,983 repositories on Github that mention PyTorch in their name or description](https://github.com/search?q=pytorch&type=Repositories) -* More than half a million downloads of PyTorch binaries. 651,916 to be precise. -* **5,400 users** wrote **21,500 posts** discussing 5,200 topics on our forums discuss.pytorch.org (http://discuss.pytorch.org/) -* 131 mentions of PyTorch on Reddit's /r/machinelearning since the day of release. In the same period, TensorFlow was mentioned 255 times. - - -### Research Metrics - -PyTorch is a research-focused framework. So one of the metrics of interest is to see the usage of PyTorch in machine learning research papers. - - -* In the recent ICLR2018 conference submissions, PyTorch was mentioned in **87 papers**, compared to TensorFlow at 228 papers, Keras at 42 papers, Theano and Matlab at 32 papers. - -* [Monthly arxiv.org mentions for frameworks](https://twitter.com/fchollet/status/951828914103402497) had PyTorch at 72 mentions, with TensorFlow at 273 mentions, Keras at 100 mentions, Caffe at 94 mentions and Theano at 53 mentions. - -## Courses, Tutorials and Books - -When we released PyTorch, we had good API documentation, but our tutorials were limited to a few ipython notebooks — helpful, but not good enough. - -[Sasank Chilamkurthy](https://github.com/chsasank) took it upon himself to revamp the tutorials into the [beautiful website](https://pytorch.org/tutorials/) that it is today. - -
        - -[Sean Robertson](https://github.com/spro/practical-pytorch) and [Justin Johnson](https://github.com/jcjohnson/pytorch-examples) wrote great new tutorials — in NLP, and to learn by example. [Yunjey Choi](https://github.com/yunjey/pytorch-tutorial) wrote a beautiful tutorial where most models were implemented in 30 lines or less. -Each new tutorial helped users find their way faster, with different approaches to learning. - -[Goku Mohandas and Delip Rao](https://twitter.com/PyTorch/status/888500355943641088) switched the code content of their book-in-progress to use PyTorch. - -We've seen quite a few university machine learning courses being taught with PyTorch as the primary tool, such as Harvard's [CS287](https://harvard-ml-courses.github.io/cs287-web/). Taking it one step further and democratizing learning, we had three online courses pop up that teach using PyTorch. - -- **Fast.ai's** “Deep Learning for Coders” is a popular online course. In September, Jeremy and Rachel [announced that the next fast.ai courses will be nearly entirely based on PyTorch](http://www.fast.ai/2017/09/08/introducing-pytorch-for-fastai/). -- Ritchie Ng, a researcher with ties to NUS Singapore and Tsinghua released [a Udemy course](https://www.udemy.com/practical-deep-learning-with-pytorch/) titled Practical Deep Learning with PyTorch. -- Sung Kim from HKUST released an [online course on Youtube](https://www.youtube.com/playlist?list=PLlMkM4tgfjnJ3I-dbhO9JTw7gNty6o_2m) that was aimed towards a general audience, titled: “PyTorch Zero to All”. - - -## Engineering - -Over the last year we implemented multiple features, improved performance across the board and fixed lots of bugs. A full list of the work we've done is found in our [release notes](https://github.com/pytorch/pytorch/releases). -Here are highlights from our work over the last year: - -## Higher-order gradients - - With the release of several papers that implement penalties of gradients and with ongoing research in 2nd order gradient methods, this was an essential and sought-after feature. In August, we implemented a generalized interface that can take n-th order derivatives and increased the coverage of functions that support higher-order gradients over time, such that at the moment of writing almost all ops support this. - - -## Distributed PyTorch - -In August, we released a small distributed package that followed the highly popular MPI-collective approach. The package has multiple backends such as TCP, MPI, Gloo and NCCL2 to support various types of CPU/GPU collective operations and use-cases, and integrates distributed technologies such as Infiniband and RoCE. Distributed is hard, and we had bugs in the initial iteration. Over subsequent releases, we made the package more stable and improved performance. - -## Closer to NumPy - -One of the biggest demands from users were NumPy features that they were familiar with. Features such as Broadcasting and Advanced Indexing are convenient and save users a lot of verbosity. We implemented these features and started to align our API to be closer to NumPy. Over time, we expect to get closer and closer to NumPy's API where appropriate. - -## Sparse Tensors - -In March, we released a small package supporting sparse Tensors and in May we released CUDA support for the sparse package. The package is small and limited in functionality, and is used for implementing Sparse Embeddings and commonly used sparse paradigms in deep learning. 
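
For readers who have not tried the package, the sketch below illustrates the COO convention it is built around: a 2 x nnz matrix of indices plus the matching values. The shapes and numbers are made up purely for illustration, and the constructor shown is the sparse API available at the time of writing.

```python
import torch

# COO layout: a 2 x nnz LongTensor of (row, col) indices plus the matching values.
i = torch.LongTensor([[0, 1, 1],
                      [2, 0, 2]])
v = torch.FloatTensor([3, 4, 5])

# A 2 x 3 sparse tensor with three non-zero entries.
x = torch.sparse.FloatTensor(i, v, torch.Size([2, 3]))

# Dense view for inspection:
# [[0., 0., 3.],
#  [4., 0., 5.]]
print(x.to_dense())
```
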
This package is still small in scope and there's demand to expand it — if you are interested in working on expanding the sparse package, reach out to us on our [Discussion Boards](https://discuss.pytorch.org/) - - -## Performance - -Performance is always an ongoing battle, especially for PyTorch which is a dynamic framework that wants to maximize flexibility. Over the last year, we've improved performance across board, from our core Tensor library to the neural network operators, writing faster micro-optimized across board. - -* We've added specialized AVX and AVX2 intrinsics for Tensor operations -* Wrote faster GPU kernels for frequent workloads like concatenation and Softmax (among many other things) -* Rewrote the code for several neural network operators (too many to list), but notably nn.Embedding and group convolutions. - -**Reducing framework overhead by 10x across board** - -Since PyTorch is a dynamic graph framework, we create a new graph on the fly at every iteration of a training loop. Hence, the framework overhead has to be low, or the workload has to be large enough that the framework overhead is hidden. In August, the authors of DyNet (Graham Neubig and team) showcased that it's much faster than PyTorch on small NLP models. This was an interesting challenge, we didn't realize that models of those sizes were being trained. In a multi-month (and ongoing) effort, we embarked upon a significant rewrite of PyTorch internals that reduced the framework overhead from more than 10 microseconds per operator execution to as little as 1 microsecond. - -**ATen** - -As we embarked upon a redesign of the PyTorch internals, we built the [ATen C++11](https://github.com/pytorch/pytorch/tree/master/aten) library that now powers all of the PyTorch backend. ATen has an API that mirrors PyTorch's Python API, which makes it a convenient C++ library for Tensor computation. ATen can be built and used independently of PyTorch. - -## Exporting models to production — ONNX Support and the JIT compiler - -One of the common requests we've received was to export PyTorch models to another framework. Users engaged in a rapid research cycle in PyTorch and when they were done, they wanted to ship it to larger projects with C++ only requirements. - -With this in mind, we built a tracer for PyTorch — which can export PyTorch models into an intermediate representation. -The subsequent trace can be either used to run the current PyTorch model more efficiently (by running optimization passes on it), or be converted to the [ONNX](http://onnx.ai/) format to be shipped to other frameworks such as Caffe2, MXNet, TensorFlow and others or directly to the hardware accelerated libraries like CoreML or TensorRT. Over the next year, you will hear more about the JIT compiler for performance improvements. - - -## Users being funny :) - -Our users express their support in funny ways, made us laugh, thanks for this :) - - - - - - - - - - - - diff --git a/_posts/2018-03-5-tensor-comprehensions.md b/_posts/2018-03-5-tensor-comprehensions.md deleted file mode 100644 index df83ea75dccd..000000000000 --- a/_posts/2018-03-5-tensor-comprehensions.md +++ /dev/null @@ -1,195 +0,0 @@ ---- -layout: blog_detail -title: 'Tensor Comprehensions in PyTorch' -author: Priya Goyal (FAIR), Nicolas Vasilache (FAIR), Oleksandr Zinenko (Inria & DI ENS), Theodoros Theodoridis (ETH Zürich), Zachary DeVito (FAIR), William S. 
Moses (MIT CSAIL), Sven Verdoolaege (FAIR), Andrew Adams (FAIR), Albert Cohen (Inria & DI ENS & FAIR) -redirect_from: /2018/03/05/tensor-comprehensions.html ---- - -Tensor Comprehensions (TC) is a tool that lowers the barrier for writing high-performance code. It generates GPU code from a simple high-level language and autotunes the code for specific input sizes. - -**We highly recommend reading the [Tensor Comprehensions blogpost](https://research.fb.com/announcing-tensor-comprehensions/) first.** - -If you ran into any of the following scenarios, TC is a useful tool for you. - -- Your PyTorch layer is large and slow, and you contemplated writing a dedicated C++ or CUDA code for it. But you don't know how to program in CUDA or write low-level code. - -- You wrote a CUDA layer, but it took a week to write, debug, optimize for speed. You wished you could do this in an hour. - -- You want to fuse multiple layers like Conv-ReLU-BatchNorm or Linear-ReLU-Linear-ReLU in your network for speed, but it was quite difficult to comprehend - -- Your research involves weird Tensor shapes that CuDNN and MKL are not optimized for. For example, you do convolutions of 13 x 24 with an input image of 143 x 55. You tried running it with CuDNN and it was slower than you wished. - -- Your code is slowed-down by transposing Tensors constantly to fit a particular memory layout. You wish it was easy to write custom code that operates efficiently on your input layout. - - -Tensor Comprehensions are seamless to use in PyTorch, interoperating with PyTorch Tensors and `nn` Variables. - -Let us run through using TC with PyTorch. - -#### 1. Install the package - -```bash -conda install -c pytorch -c tensorcomp tensor_comprehensions -``` - -At this time we only provide Linux-64 binaries which have been tested on Ubuntu 16.04 and CentOS7. - -TC depends on heavyweight C++ projects such as [Halide](http://halide-lang.org/), [Tapir-LLVM](https://github.com/wsmoses/Tapir-LLVM) and ISL. Hence, we rely on Anaconda to distribute these dependencies reliably. For the same reason, TC is not available via PyPI. - -#### 2. Import the python package - -```python -import tensor_comprehensions as tc -``` - -#### 3. Define the TC expression and create a python function - -```python -lang = """ -def fcrelu(float(B,M) I, float(N,M) W1, float(N) B1) -> (O1) { - O1(b, n) +=! I(b, m) * W1(n, m) - O1(b, n) = O1(b, n) + B1(n) - O1(b, n) = fmax(O1(b, n), 0) -} -""" -fcrelu = tc.define(lang, name="fcrelu") -``` - -This `fcrelu` function takes PyTorch Tensors as input and returns a PyTorch Tensor. It takes input `I`, weight `W1`, bias `B1` and returns output `O1`. - -#### 4. Let's create some dummy input tensors - -```python -B, M, N = 100, 128, 100 -I, W1, B1 = torch.randn(B, M).cuda(), torch.randn(N, M).cuda(), torch.randn(N).cuda() -``` - -#### 5. Now autotune the function for your input sizes - -```python -fcrelu.autotune(I, W1, B1, cache="fcrelu_100_128_100.tc") -``` - -The autotuner is your biggest friend. You generally do not want to use a `tc` function without autotuning it first. - -When the autotuning is running, the current best performance is displayed. If you are satisfied with the current result or you are out of time, stop the tuning procedure by pressing `Ctrl+C`. - -`cache` saves the results of the autotuned kernel search and saves it to the file `fcrelu_100_128_100.tc`. The next time you call the same line of code, it loads the results of the autotuning without recomputing it. 
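
To make the caching behaviour concrete, the sketch below simply repeats the calls from steps 3-5 so the two autotune runs can be seen side by side. No new API is used here; the second call is the one that loads the cached result instead of re-running the search.

```python
import torch
import tensor_comprehensions as tc

# Same TC definition as in step 3.
lang = """
def fcrelu(float(B,M) I, float(N,M) W1, float(N) B1) -> (O1) {
  O1(b, n) +=! I(b, m) * W1(n, m)
  O1(b, n) = O1(b, n) + B1(n)
  O1(b, n) = fmax(O1(b, n), 0)
}
"""
fcrelu = tc.define(lang, name="fcrelu")

# Same dummy inputs as in step 4.
B, M, N = 100, 128, 100
I, W1, B1 = torch.randn(B, M).cuda(), torch.randn(N, M).cuda(), torch.randn(N).cuda()

# First call: runs the kernel search and writes the best options to the cache file.
fcrelu.autotune(I, W1, B1, cache="fcrelu_100_128_100.tc")

# Running the same line again later loads the cached options without recomputing them.
fcrelu.autotune(I, W1, B1, cache="fcrelu_100_128_100.tc")
```
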
- -The autotuner has a few hyperparameters (just like your ConvNet has learning rate, number of layers, etc.). We pick reasonable defaults, but you can read about using advanced options [here](https://facebookresearch.github.io/TensorComprehensions/framework/pytorch_integration/writing_layers.html#specifying-mapping-options). - -#### 6. Call the function with the inputs, to get your result - -```python -out = fcrelu(I, W1, B1) -``` - -Now, let's look at how to write TC expressions. - -## A quick primer on the TC language - -The TC notation focuses on the mathematical nature of the layer, leaving performance considerations to it's backend code that uses Halide and polyhedral compilation techniques which accumulate decades of cutting edge Loop Nest Optimization (LNO) research. - -TC is close to [np.einsum](https://docs.scipy.org/doc/numpy/reference/generated/numpy.einsum.html). We shall quickly learn TC by example - -```python -lang = """ -def matmul(float(M,N) A, float(N,K) B) -> (output) { - output(i, j) +=! A(i, kk) * B(kk, j) -} -""" -``` - -In this example, we define a function `matmul` which takes two input `A` and `B` of shapes `M x N` and `N x K` and returns a single `output`. The shape of `output` is automatically inferred by the TC language (discussed below). - -Let's look at this line: - -```python -output(i, j) +=! A(i, kk) * B(kk, j) -``` - -It says: - -- `output(i, j)` means output is 2D. -- for each location `output(i, j)`, we add (`+=`) `A(i, kk) * B(kk, j)`. -- `i` is well-defined as all locations in `A` dim=0, i.e. `i in range(0, M)` -- `j` is well-defined as all locations in `B` dim=1, i.e. `j in range(0, K)` -- `kk` is inferred as all locations from `0` to `N` - -The shape of output is inferred from the maximum values `i` and `j` can take, which is `M` and `K`, so output is of size `M x K`. - -The `!` symbol initializes output with `0.0`. It is equivalent to: - -```python -output(i, j) = 0 -output(i, j) += A(i, kk) * B(kk, j) -``` - -**Scalar inputs and range constraints: implement AvgPool2d** - -```python -""" - -{% raw %}def avgpool(float(B, C, H, W) input) -> (output) {{{% endraw %} - output(b, c, h, w) += input(b, c, h * {sH} + kh, w * {sW} + kw) where kh in 0:{kH}, kw in 0:{kW} -{% raw %}}}{% endraw %} - -""" -avgpool = tc.define(LANG, name="avgpool", constants={"sH":1, "sW":1, "kH":2, "kW":2}) -``` - -here the `where` keyword can take ranges of values to operate on. `0:{kH}` is equivalent `range(kH)` in Python. - -Note: the syntax for passing in scalars is subject to change in the next release. - -## torch.nn layers - -We added some sugar-coating around the basic PyTorch integration of TC to make it easy to integrate TC into larger `torch.nn` models by defining the forward and backward TC expressions and taking `Variable` inputs / outputs. - -## Some essentials that you will miss (we're working on them) - -### Autotuning for variable-length sequences - -The TC auto-tuner requires all input sizes to be specified before-hand. For example, if you have input `I1` which is an image batch, the autotuner wants to know the exact shape of `I1` to generate an optimized kernel. You cannot specify: `image with height between 200 and 300`. This is more essential in sequence data such as NLP, where each sentence can have a different length. - -The reason why the autotuner is non-parametric is because it's harder and harder to auto-tune parametric constraints, this is active research. 
Hence, for the first release, we made a conscious decision to give you the tool in a form where we know it works well. - -As a work-around, if you know that you have a few specific shapes of interest, you can run the autotuner with these multiple shapes. - -```python -relu = tc.define(LANG, name="relu") -batch, channels = 16, 3 -tc.autotune((batch, channels, 32, 32)) # image of size 32 x 32 -tc.autotune((batch, channels, 48, 48)) # image of size 48 x 48 -tc.autotune((batch, channels, 64, 64)) # image of size 64 x 64 -``` - -Now the autotuner is tuned for these three specific image sizes `32x32`, `48x48` and `64x64`. - -### Lack of loops - -If you want to write an RNN, it's easy to see it as a `for` loop over time. However, the TC language does not have loops yet. If you really want to write RNNs, you can write unrolled loops. - -### Strided-Tensors - -The TC backend does not support non-contiguous Tensors yet. If the inputs you give are not contiguous, they are made contiguous before passing to the TC backend. - -### Reshaping Tensors within a TC expression - -You cannot write this operation in TC: `torch.matmul(...).view(...).mean(...)`. Whenever there is need for a `view` to change the shape of an input, you have to get the output, `view` it at the PyTorch level. - -## Getting Started - -- [Walk through Tutorial](https://facebookresearch.github.io/TensorComprehensions/framework/pytorch_integration/writing_layers.html) to quickly get started with understanding and using Tensor Comprehensions PyTorch package. -- Over 20 examples of various ML layers with TC, including `avgpool`, `maxpool`, `matmul`, matmul - give output buffers and `batch-matmul`, `convolution`, `strided-convolution`, `batchnorm`, `copy`, `cosine similarity`, `Linear`, `Linear + ReLU`, `group-convolutions`, strided `group-convolutions`, `indexing`, `Embedding` (lookup table), small-mobilenet, `softmax`, `tensordot`, `transpose` -- [Detailed docs](https://facebookresearch.github.io/TensorComprehensions/framework/pytorch_integration/getting_started.html) on Tensor Comprehensions and integration with PyTorch. - -## Communication - -- Slack: For discussion around framework integration, build support, collaboration, etc. join our slack channel. -- Email: tensorcomp@fb.com -- [GitHub](https://github.com/facebookresearch/TensorComprehensions): bug reports, feature requests, install issues, RFCs, thoughts, etc. - -## Acknowledgements - -We would like to thank Soumith Chintala, [Edward Yang](https://github.com/ezyang) and [Sam Gross](https://github.com/colesbury) for their immense guidance and help in making the integration API nice and smooth. We would also like to thank rest of the PyTorch team and our pre-release users for their helpful feedback that guided us in making the integration better. diff --git a/_posts/2018-04-22-pytorch-0_4_0-migration-guide.md b/_posts/2018-04-22-pytorch-0_4_0-migration-guide.md deleted file mode 100644 index 39aced0791ed..000000000000 --- a/_posts/2018-04-22-pytorch-0_4_0-migration-guide.md +++ /dev/null @@ -1,379 +0,0 @@ ---- -layout: blog_detail -title: 'PyTorch 0.4.0 Migration Guide' -redirect_from: /2018/04/22/0_4_0-migration-guide.html ---- - -Welcome to the migration guide for PyTorch 0.4.0. In this release we introduced [many exciting new features and critical bug fixes](https://github.com/pytorch/pytorch/releases/tag/v0.4.0), with the goal of providing users a better and cleaner interface. 
In this guide, we will cover the most important changes in migrating existing code from previous versions: - -- `Tensors` and `Variables` have merged -- Support for 0-dimensional (scalar) `Tensors` -- Deprecation of the `volatile` flag -- `dtypes`, `devices`, and Numpy-style `Tensor` creation functions -- Writing device-agnostic code -- New edge-case constraints on names of submodules, parameters, and buffers in `nn.Module` - -## Merging [`Tensor`](https://pytorch.org/docs/0.4.0/tensors.html) and `Variable` and classes - -[`torch.Tensor`](https://pytorch.org/docs/0.4.0/tensors.html) and `torch.autograd.Variable` are now the same class. More precisely, [`torch.Tensor`](https://pytorch.org/docs/0.4.0/tensors.html) is capable of tracking history and behaves like the old `Variable`; `Variable` wrapping continues to work as before but returns an object of type [`torch.Tensor`](https://pytorch.org/docs/0.4.0/tensors.html). This means that you don't need the `Variable` wrapper everywhere in your code anymore. - -### The `type()` of a [`Tensor`](https://pytorch.org/docs/0.4.0/tensors.html) has changed - -Note also that the `type()` of a Tensor no longer reflects the data type. Use `isinstance()` or `x.type()`instead: - -```python ->>> x = torch.DoubleTensor([1, 1, 1]) ->>> print(type(x)) # was torch.DoubleTensor -"" ->>> print(x.type()) # OK: 'torch.DoubleTensor' -'torch.DoubleTensor' ->>> print(isinstance(x, torch.DoubleTensor)) # OK: True -True -``` - -### When does [`autograd`](https://pytorch.org/docs/0.4.0/autograd.html) start tracking history now? - -`requires_grad`, the central flag for [`autograd`](https://pytorch.org/docs/0.4.0/autograd.html), is now an attribute on `Tensors`. The same rules previously used for `Variables` applies to `Tensors`; [`autograd`](https://pytorch.org/docs/0.4.0/autograd.html) starts tracking history when any input `Tensor` of an operation has `requires_grad=True`. For example, - -```python ->>> x = torch.ones(1) # create a tensor with requires_grad=False (default) ->>> x.requires_grad -False ->>> y = torch.ones(1) # another tensor with requires_grad=False ->>> z = x + y ->>> # both inputs have requires_grad=False. so does the output ->>> z.requires_grad -False ->>> # then autograd won't track this computation. let's verify! ->>> z.backward() -RuntimeError: element 0 of tensors does not require grad and does not have a grad_fn ->>> ->>> # now create a tensor with requires_grad=True ->>> w = torch.ones(1, requires_grad=True) ->>> w.requires_grad -True ->>> # add to the previous result that has require_grad=False ->>> total = w + z ->>> # the total sum now requires grad! 
->>> total.requires_grad -True ->>> # autograd can compute the gradients as well ->>> total.backward() ->>> w.grad -tensor([ 1.]) ->>> # and no computation is wasted to compute gradients for x, y and z, which don't require grad ->>> z.grad == x.grad == y.grad == None -True -``` - -#### Manipulating `requires_grad` flag - -Other than directly setting the attribute, you can change this flag `in-place` using [`my_tensor.requires_grad_()`](https://pytorch.org/docs/0.4.0/tensors.html#torch.Tensor.requires_grad_), or, as in the above example, at creation time by passing it in as an argument (default is `False`), e.g., - -```python ->>> existing_tensor.requires_grad_() ->>> existing_tensor.requires_grad -True ->>> my_tensor = torch.zeros(3, 4, requires_grad=True) ->>> my_tensor.requires_grad -True -``` - -### What about `.data?` - -`.data` was the primary way to get the underlying `Tensor` from a `Variable`. After this merge, calling `y = x.data` still has similar semantics. So `y` will be a `Tensor` that shares the same data with `x`, is unrelated with the computation history of `x`, and has `requires_grad=False`. - -However, `.data` can be unsafe in some cases. Any changes on `x.data` wouldn't be tracked by `autograd`, and the computed gradients would be incorrect if `x` is needed in a backward pass. A safer alternative is to use [`x.detach()`](https://pytorch.org/docs/master/autograd.html#torch.Tensor.detach), which also returns a `Tensor` that shares data with `requires_grad=False`, but will have its in-place changes reported by `autograd` if `x` is needed in backward. - -Here is an example of the difference between `.data` and `x.detach()` (and why we recommend using `detach` in general). - -If you use `Tensor.detach()`, the gradient computation is guaranteed to be correct. - -```python ->>> a = torch.tensor([1,2,3.], requires_grad = True) ->>> out = a.sigmoid() ->>> c = out.detach() ->>> c.zero_() -tensor([ 0., 0., 0.]) - ->>> out # modified by c.zero_() !! -tensor([ 0., 0., 0.]) - ->>> out.sum().backward() # Requires the original value of out, but that was overwritten by c.zero_() -RuntimeError: one of the variables needed for gradient computation has been modified by an -``` - -However, using `Tensor.data` can be unsafe and can easily result in incorrect gradients when a tensor is required for gradient computation but modified in-place. - -```python ->>> a = torch.tensor([1,2,3.], requires_grad = True) ->>> out = a.sigmoid() ->>> c = out.data ->>> c.zero_() -tensor([ 0., 0., 0.]) - ->>> out # out was modified by c.zero_() -tensor([ 0., 0., 0.]) - ->>> out.sum().backward() ->>> a.grad # The result is very, very wrong because `out` changed! -tensor([ 0., 0., 0.]) -``` - -## Support for 0-dimensional (scalar) Tensors - -Previously, indexing into a `Tensor` vector (1-dimensional tensor) gave a Python number but indexing into a `Variable` vector gave (inconsistently!) a vector of size `(1,)`! Similar behavior existed with reduction functions, e.g. `tensor.sum()` would return a Python number, but `variable.sum()` would return a vector of size `(1,)`. - -Fortunately, this release introduces proper scalar (0-dimensional tensor) support in PyTorch! Scalars can be created using the new `torch.tensor` function (which will be explained in more detail later; for now just think of it as the PyTorch equivalent of `numpy.array`). 
Now you can do things like: - -```python ->>> torch.tensor(3.1416) # create a scalar directly -tensor(3.1416) ->>> torch.tensor(3.1416).size() # scalar is 0-dimensional -torch.Size([]) ->>> torch.tensor([3]).size() # compare to a vector of size 1 -torch.Size([1]) ->>> ->>> vector = torch.arange(2, 6) # this is a vector ->>> vector -tensor([ 2., 3., 4., 5.]) ->>> vector.size() -torch.Size([4]) ->>> vector[3] # indexing into a vector gives a scalar -tensor(5.) ->>> vector[3].item() # .item() gives the value as a Python number -5.0 ->>> mysum = torch.tensor([2, 3]).sum() ->>> mysum -tensor(5) ->>> mysum.size() -torch.Size([]) -``` - -### Accumulating losses - -Consider the widely used pattern `total_loss += loss.data[0]`. Before 0.4.0. `loss` was a `Variable` wrapping a tensor of size `(1,)`, but in 0.4.0 `loss` is now a scalar and has `0` dimensions. Indexing into a scalar doesn't make sense (it gives a warning now, but will be a hard error in 0.5.0). Use `loss.item()` to get the Python number from a scalar. - -Note that if you don't convert to a Python number when accumulating losses, you may find increased memory usage in your program. This is because the right-hand-side of the above expression used to be a Python float, while it is now a zero-dim Tensor. The total loss is thus accumulating Tensors and their gradient history, which may keep around large autograd graphs for much longer than necessary. - -## Deprecation of volatile flag - -The `volatile` flag is now deprecated and has no effect. Previously, any computation that involves a `Variable` with `volatile=True` wouldn't be tracked by `autograd`. This has now been replaced by a [set of more flexible context managers](https://pytorch.org/docs/0.4.0/torch.html#locally-disabling-gradient-computation) including `torch.no_grad()`, `torch.set_grad_enabled(grad_mode)`, and others. - -```python ->>> x = torch.zeros(1, requires_grad=True) ->>> with torch.no_grad(): -... y = x * 2 ->>> y.requires_grad -False ->>> ->>> is_train = False ->>> with torch.set_grad_enabled(is_train): -... y = x * 2 ->>> y.requires_grad -False ->>> torch.set_grad_enabled(True) # this can also be used as a function ->>> y = x * 2 ->>> y.requires_grad -True ->>> torch.set_grad_enabled(False) ->>> y = x * 2 ->>> y.requires_grad -False -``` - -## [`dtypes`](https://pytorch.org/docs/0.4.0/tensor_attributes.html#torch.torch.dtype), [`devices`](https://pytorch.org/docs/0.4.0/tensor_attributes.html#torch.torch.device) and NumPy-style creation functions - -In previous versions of PyTorch, we used to specify data type (e.g. float vs double), device type (cpu vs cuda) and layout (dense vs sparse) together as a "tensor type". For example, `torch.cuda.sparse.DoubleTensor` was the `Tensor` type representing the `double` data type, living on CUDA devices, and with [COO sparse tensor](https://en.wikipedia.org/wiki/Sparse_matrix#Coordinate_list_(COO)) layout. - -In this release, we introduce [`torch.dtype`](https://pytorch.org/docs/0.4.0/tensor_attributes.html#torch.torch.dtype), [`torch.device`](https://pytorch.org/docs/0.4.0/tensor_attributes.html#torch.torch.device) and [`torch.layout`](https://pytorch.org/docs/0.4.0/tensor_attributes.html#torch.torch.layout) classes to allow better management of these properties via NumPy-style creation functions. 
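
Before the detailed breakdown of each class below, here is a small illustration of how the three properties fit together: they can be passed to the NumPy-style creation functions and read back as attributes on the resulting tensor (standard 0.4.0 calls, nothing beyond what the following sections describe).

```python
import torch

# dtype, device and layout can all be specified at creation time ...
x = torch.zeros(2, 3, dtype=torch.float64, device=torch.device("cpu"))

# ... and read back as attributes on the resulting tensor.
print(x.dtype)   # torch.float64
print(x.device)  # cpu
print(x.layout)  # torch.strided
```
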
- -### [`torch.dtype`](https://pytorch.org/docs/0.4.0/tensor_attributes.html#torch.torch.dtype) - -Below is a complete list of available [`torch.dtype`](https://pytorch.org/docs/0.4.0/tensor_attributes.html#torch.torch.dtype)s (data types) and their corresponding tensor types. - -| Data | `type torch.dtype` | Tensor types | -|------|------------------|--------------| -| 32-bit floating point | `torch.float32` or `torch.float` | `torch.*.FloatTensor` -| 64-bit floating point | `torch.float64` or `torch.double` | `torch.*.DoubleTensor` -| 16-bit floating point | `torch.float16` or `torch.half` | `torch.*.HalfTensor` -| 8-bit integer (unsigned) | `torch.uint8` | `torch.*.ByteTensor` -| 8-bit integer (signed) | `torch.int8` | `torch.*.CharTensor` -| 16-bit integer (signed) | `torch.int16` or `torch.short` | `torch.*.ShortTensor` -| 32-bit integer (signed) | `torch.int32` or `torch.int` | `torch.*.IntTensor` -| 64-bit integer (signed) | `torch.int64` or `torch.long` | `torch.*.LongTensor` - -The dtype of a tensor can be access via its `dtype` attribute. - -### [`torch.device`](https://pytorch.org/docs/0.4.0/tensor_attributes.html#torch.torch.device) - -A [`torch.device`](https://pytorch.org/docs/0.4.0/tensor_attributes.html#torch.torch.device) contains a device type (`'cpu'` or `'cuda'`) and optional device ordinal (id) for the device type. It can be initialized with `torch.device('{device_type}')` or `torch.device('{device_type}:{device_ordinal}')`. - -If the device ordinal is not present, this represents the current device for the device type; e.g., `torch.device('cuda')` is equivalent to `torch.device('cuda:X')` where `X` is the result of `torch.cuda.current_device()`. - -The device of a tensor can be accessed via its `device` attribute. - -### [`torch.layout`](https://pytorch.org/docs/0.4.0/tensor_attributes.html#torch.torch.layout) - -[`torch.layout`](https://pytorch.org/docs/0.4.0/tensor_attributes.html#torch.torch.layout) represents the data layout of a [`Tensor`](https://pytorch.org/docs/0.4.0/tensors.html). Currently `torch.strided` (dense tensors, the default) and `torch.sparse_coo` (sparse tensors with COO format) are supported. - -The layout of a tensor can be access via its `layout` attribute. - -### Creating Tensors - -[Methods that create a](https://pytorch.org/docs/0.4.0/torch.html#creation-ops) [`Tensor`](https://pytorch.org/docs/0.4.0/tensors.html) now also take in `dtype`, `device`, `layout`, and `requires_grad` options to specify the desired attributes on the returned `Tensor`. For example, - -```python ->>> device = torch.device("cuda:1") ->>> x = torch.randn(3, 3, dtype=torch.float64, device=device) -tensor([[-0.6344, 0.8562, -1.2758], - [ 0.8414, 1.7962, 1.0589], - [-0.1369, -1.0462, -0.4373]], dtype=torch.float64, device='cuda:1') ->>> x.requires_grad # default is False -False ->>> x = torch.zeros(3, requires_grad=True) ->>> x.requires_grad -True -``` - -##### [`torch.tensor(data, ...)`](https://pytorch.org/docs/0.4.0/torch.html#torch.tensor) - -[`torch.tensor`](https://pytorch.org/docs/0.4.0/torch.html#torch.tensor) is one of the newly added [tensor creation methods](https://pytorch.org/docs/0.4.0/torch.html#creation-ops). It takes in array-like data of all kinds and copies the contained values into a new `Tensor`. As mentioned earlier, [`torch.tensor`](https://pytorch.org/docs/0.4.0/torch.html#torch.tensor) is the PyTorch equivalent of NumPy's `numpy.array`constructor. 
Unlike the `torch.*Tensor` methods, you can also create zero-dimensional `Tensor`s (aka scalars) this way (a single python number is treated as a Size in the `torch.*Tensor methods`). Moreover, if a `dtype` argument isn't given, it will infer the suitable `dtype` given the data. It is the recommended way to create a tensor from existing data like a Python list. For example, - -```python ->>> cuda = torch.device("cuda") ->>> torch.tensor([[1], [2], [3]], dtype=torch.half, device=cuda) -tensor([[ 1], - [ 2], - [ 3]], device='cuda:0') ->>> torch.tensor(1) # scalar -tensor(1) ->>> torch.tensor([1, 2.3]).dtype # type inferece -torch.float32 ->>> torch.tensor([1, 2]).dtype # type inferece -torch.int64 -``` - -We've also added more tensor creation methods. Some of them have `torch.*_like` and/or `tensor.new_*` variants. - -- `torch.*_like` takes in an input `Tensor` instead of a shape. It returns a `Tensor` with same attributes as the input `Tensor` by default unless otherwise specified: - - ```python - >>> x = torch.randn(3, dtype=torch.float64) - >>> torch.zeros_like(x) - tensor([ 0., 0., 0.], dtype=torch.float64) - >>> torch.zeros_like(x, dtype=torch.int) - tensor([ 0, 0, 0], dtype=torch.int32) - ``` - -- `tensor.new_*` can also create `Tensors` with same attributes as `tensor`, but it always takes in a shape argument: - - ```python - >>> x = torch.randn(3, dtype=torch.float64) - >>> x.new_ones(2) - tensor([ 1., 1.], dtype=torch.float64) - >>> x.new_ones(4, dtype=torch.int) - tensor([ 1, 1, 1, 1], dtype=torch.int32) - ``` - -To specify the desired shape, you can either use a tuple (e.g., `torch.zeros((2, 3))`) or variable arguments (e.g., `torch.zeros(2, 3)`) in most cases. - -| Name | Returned `Tensor` | `torch.*_like` variant | `tensor.new_*` variant | -|------|-----------------|----------------------|----------------------| -| [`torch.empty`](https://pytorch.org/docs/0.4.0/torch.html#torch.empty) | uninitialized memory | ✔ | ✔ | -| [`torch.zeros`](https://pytorch.org/docs/0.4.0/torch.html#torch.zeros) | all zeros | ✔ | ✔ | -| [`torch.ones`](https://pytorch.org/docs/0.4.0/torch.html#torch.ones) | all ones | ✔ | ✔ | -| [`torch.full`](https://pytorch.org/docs/0.4.0/torch.html#torch.full) | filled with a given value | ✔ | ✔ | -| [`torch.rand`](https://pytorch.org/docs/0.4.0/torch.html#torch.rand) | i.i.d. continuous Uniform[0, 1) | ✔ | -| [`torch.randn`](https://pytorch.org/docs/0.4.0/torch.html#torch.randn) | i.i.d. `Normal(0, 1)` | ✔ | -| [`torch.randint`](https://pytorch.org/docs/0.4.0/torch.html#torch.randint) | i.i.d. discrete Uniform in given range | ✔ | -| [`torch.randperm`](https://pytorch.org/docs/0.4.0/torch.html#torch.randperm) | random permutation of `{0, 1, ..., n - 1}` | -| [`torch.tensor`](https://pytorch.org/docs/0.4.0/torch.html#torch.tensor) | copied from existing data (list, NumPy ndarray, etc.) 
| | ✔ | -| [`torch.from_numpy`*](https://pytorch.org/docs/0.4.0/torch.html#torch.from_numpy) | from NumPy `ndarray` (sharing storage without copying) | -| [`torch.arange`](https://pytorch.org/docs/0.4.0/torch.html#torch.arange), [`torch.range`](https://pytorch.org/docs/0.4.0/torch.html#torch.range), and [`torch.linspace`](https://pytorch.org/docs/0.4.0/torch.html#torch.linspace) | uniformly spaced values in a given range | -| [`torch.logspace`](https://pytorch.org/docs/0.4.0/torch.html#torch.logspace) | logarithmically spaced values in a given range | -| [`torch.eye`](https://pytorch.org/docs/0.4.0/torch.html#torch.eye) | identity matrix | - - -*: [`torch.from_numpy`](https://pytorch.org/docs/0.4.0/torch.html#torch.from_numpy) only takes in a NumPy `ndarray` as its input argument. - - -## Writing device-agnostic code - -Previous versions of PyTorch made it difficult to write code that was device agnostic (i.e. that could run on both CUDA-enabled and CPU-only machines without modification). - -PyTorch 0.4.0 makes this easier in two ways: - -- The `device` attribute of a Tensor gives the [torch.device](https://pytorch.org/docs/0.4.0/tensor_attributes.html#torch.torch.device) for all Tensors (`get_device` only works for CUDA tensors) -- The `to` method of `Tensors` and `Modules` can be used to easily move objects to different devices (instead of having to call `cpu()` or `cuda()` based on the context) - -We recommend the following pattern: - -```python -# at beginning of the script -device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") - -... - -# then whenever you get a new Tensor or Module -# this won't copy if they are already on the desired device -input = data.to(device) -model = MyModule(...).to(device) -``` - -## New edge-case constraints on names of submodules, parameters, and buffers in `nn.Module` - -`name` that is an empty string or contains `"."` is no longer permitted in `module.add_module(name, value)`, `module.add_parameter(name, value)` or `module.add_buffer(name, value)` because such names may cause lost data in the `state_dict`. If you are loading a checkpoint for modules containing such names, please update the module definition and patch the `state_dict` before loading it. - -## Code Samples (Putting it all together) - -To get a flavor of the overall recommended changes in 0.4.0, let's look at a quick example for a common code pattern in both 0.3.1 and 0.4.0: - -- 0.3.1 (old): - ```python - model = MyRNN() - if use_cuda: - model = model.cuda() - - # train - total_loss = 0 - for input, target in train_loader: - input, target = Variable(input), Variable(target) - hidden = Variable(torch.zeros(*h_shape)) # init hidden - if use_cuda: - input, target, hidden = input.cuda(), target.cuda(), hidden.cuda() - ... # get loss and optimize - total_loss += loss.data[0] - - # evaluate - for input, target in test_loader: - input = Variable(input, volatile=True) - if use_cuda: - ... - ... - ``` - -- 0.4.0 (new): - ```python - # torch.device object used throughout this script - device = torch.device("cuda" if use_cuda else "cpu") - - model = MyRNN().to(device) - - # train - total_loss = 0 - for input, target in train_loader: - input, target = input.to(device), target.to(device) - hidden = input.new_zeros(*h_shape) # has the same device & dtype as `input` - ... # get loss and optimize - total_loss += loss.item() # get Python number from 1-element Tensor - - # evaluate - with torch.no_grad(): # operations inside don't track history - for input, target in test_loader: - ... 
- ``` - -Thank you for reading! Please refer to our [documentation](https://pytorch.org/docs/0.4.0/index.html) and [release notes](https://github.com/pytorch/pytorch/releases/tag/v0.4.0) for more details. - -Happy PyTorch-ing! diff --git a/_posts/2018-05-2-the-road-to-1_0.md b/_posts/2018-05-2-the-road-to-1_0.md deleted file mode 100644 index df93b5731974..000000000000 --- a/_posts/2018-05-2-the-road-to-1_0.md +++ /dev/null @@ -1,112 +0,0 @@ ---- -layout: blog_detail -title: 'The road to 1.0: production ready PyTorch' -author: The PyTorch Team -redirect_from: /2018/05/02/road-to-1.0.html ---- - -We would like to give you a preview of the roadmap for PyTorch 1.0 , the next release of PyTorch. Over the last year, we've had 0.2, 0.3 and 0.4 transform PyTorch from a [Torch+Chainer]-like interface into something cleaner, adding double-backwards, numpy-like functions, advanced indexing and removing Variable boilerplate. At this time, we're confident that the API is in a reasonable and stable state to confidently release a 1.0. - -However, 1.0 isn't just about stability of the interface. - -One of PyTorch's biggest strengths is its first-class Python integration, imperative style, simplicity of the API and options. These are aspects that make PyTorch good for research and hackability. - -One of its biggest downsides has been production-support. What we mean by production-support is the countless things one has to do to models to run them efficiently at massive scale: - -- exporting to C++-only runtimes for use in larger projects -- optimizing mobile systems on iPhone, Android, Qualcomm and other systems -- using more efficient data layouts and performing kernel fusion to do faster inference (saving 10% of speed or memory at scale is a big win) -- quantized inference (such as 8-bit inference) - -Startups, large companies and anyone who wants to build a product around PyTorch have asked for production support. At Facebook (the largest stakeholder for PyTorch) we have Caffe2, which has been the production-ready platform, running in our datacenters and shipping to more than 1 billion phones spanning eight generations of iPhones and six generations of Android CPU architectures. It has server-optimized inference on Intel / ARM, TensorRT support, and all the necessary bits for production. Considering all this value locked-in to a platform that the PyTorch team works quite closely with, **we decided to marry PyTorch and Caffe2 which gives the production-level readiness for PyTorch**. - -Supporting production features without adding usability issues for our researchers and end-users needs creative solutions. - -## Production != Pain for researchers - -Adding production capabilities involves increasing the API complexity and number of configurable options for models. One configures memory-layouts (NCHW vs NHWC vs N,C/32,H,W,32, each providing different performance characteristics), quantization (8-bit? 3-bit?), fusion of low-level kernels (you used a Conv + BatchNorm + ReLU, let's fuse them into a single kernel), separate backend options (MKLDNN backend for a few layers and NNPACK backend for other layers), etc. - -PyTorch's central goal is to provide a great platform for research and hackability. So, while we add all these optimizations, we've been working with a hard design constraint to never trade these off against usability. - -To pull this off, we are introducing `torch.jit`, a just-in-time (JIT) compiler that at runtime takes your PyTorch models and rewrites them to run at production-efficiency. 
The JIT compiler can also export your model to run in a C++-only runtime based on Caffe2 bits. - -> In 1.0, your code continues to work as-is, we're not making any big changes to the existing API. - -Making your model production-ready is an opt-in annotation, which uses the `torch.jit` compiler to export your model to a Python-less environment, and improving its performance. Let's walk through the JIT compiler in detail. - -## `torch.jit`: A JIT-compiler for your models - -We strongly believe that it's hard to match the productivity you get from specifying your models directly as idiomatic Python code. This is what makes PyTorch so flexible, but it also means that PyTorch pretty much never knows the operation you'll run next. This however is a big blocker for export/productionization and heavyweight automatic performance optimizations because they need full upfront knowledge of how the computation will look before it even gets executed. - -We provide two opt-in ways of recovering this information from your code, one based on tracing native python code and one based on compiling a subset of the python language annotated into a python-free intermediate representation. After thorough discussions we concluded that they're both going to be useful in different contexts, and as such you will be able to mix and match them freely. - -## Tracing Mode - -The PyTorch tracer, `torch.jit.trace`, is a function that records all the native PyTorch operations performed in a code region, along with the data dependencies between them. In fact, PyTorch has had a tracer since 0.3, which has been used for exporting models through ONNX. What changes now, is that you no longer necessarily need to take the trace and run it elsewhere - PyTorch can re-execute it for you, using a carefully designed high-performance C++ runtime. As we develop PyTorch 1.0 this runtime will integrate all the optimizations and hardware integrations that Caffe2 provides. - -The biggest benefit of this approach is that it doesn't really care how your Python code is structured — you can trace through generators or coroutines, modules or pure functions. Since we only record native PyTorch operators, these details have no effect on the trace recorded. This behavior, however, is a double-edged sword. For example, if you have a loop in your model, it will get unrolled in the trace, inserting a copy of the loop body for as many times as the loop ran. This opens up opportunities for zero-cost abstraction (e.g. you can loop over modules, and the actual trace will be loop-overhead free!), but on the other hand this will also affect data dependent loops (think of e.g. processing sequences of varying lengths), effectively hard-coding a single length into the trace. - -For networks that do not contain loops and if statements, tracing is non-invasive and is robust enough to handle a wide variety of coding styles. This code example illustrates what tracing looks like: - -```python -# This will run your nn.Module or regular Python function with the example -# input that you provided. The returned callable can be used to re-execute -# all operations that happened during the example run, but it will no longer -# use the Python interpreter. -from torch.jit import trace -traced_model = trace(model, example_input=input) -traced_fn = trace(fn, example_input=input) - -# The training loop doesn't change. Traced model behaves exactly like an -# nn.Module, except that you can't edit what it does or change its attributes. -# Think of it as a "frozen module". 
-for input, target in data_loader: - loss = loss_fn(traced_model(input), target) -``` - -## Script Mode - -Tracing mode is a great way to minimize the impact on your code, but we're also very excited about the models that fundamentally make use of control flow such as RNNs. Our solution to this is a scripting mode. - -In this case you write out a regular Python function, except that you can no longer use certain more complicated language features. Once you isolated the desired functionality, you let us know that you'd like the function to get compiled by decorating it with an `@script` decorator. This annotation will transform your python function directly into our high-performance C++ runtime. This lets us recover all the PyTorch operations along with loops and conditionals. They will be embedded into our internal representation of this function, and will be accounted for every time this function is run. - -```python -from torch.jit import script - -@script -def rnn_loop(x): - hidden = None - for x_t in x.split(1): - x, hidden = model(x, hidden) - return x -``` - -## Optimization and Export - -Regardless of whether you use tracing or `@script`, the result is a python-free representation of your model, which can be used to optimize the model or to export the model from python for use in production environments. - -Extracting bigger segments of the model into an intermediate representation makes it possible to do sophisticated whole-program optimizations and to offload computation to specialized AI accelerators which operate on graphs of computation. We have already been developing the beginnings of these optimizations, including passes that fuse GPU operations together to improve the performance of smaller RNN models. - -It also allows us to use existing high-performance backends available in Caffe2 today to run the model efficiently. Additionally, @script functions (and modules!) can be fully exported to ONNX in a way that retains their dynamic nature, such that you can easily run them in a Python-free environment using the model executors from Caffe2 or by transferring the model to any other framework supporting ONNX. - -## Usability - -**We care deeply about maintaining our current level of usability and we know that execution of the code not directly in Python leads to harder debugging, but this is something that we think about a lot, and we're making sure that you're not getting locked in to a completely different programming language.** - -First, we follow the principle of pay for what you use — if you don't need to optimize or export your model, you do not have to use these new features and won't see any downsides. Furthermore, use of traced or @script modules/functions can be done incrementally. For instance, all of these behaviors are allowed: You can trace part of your model and use the trace in a larger non-traced model. You can use tracing for 90% of your model, and use @script for the one sub-module that actually has some control flow in it. You can write a function using @script and have it call a native python function. If something appears incorrect in an @script function, you can remove the annotation and the code will execute in native python where it is easy to debug using your favorite tools and methods. Think of tracing and @script like type annotations using MyPy or TypeScript — each additional annotation can be tested incrementally, and none are required until you want to optimize or productionize. 
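To make the mix-and-match idea above concrete, here is a minimal sketch that combines the two modes: a plain feed-forward block is traced, while a small function with data-dependent control flow is compiled with the script decorator, and ordinary Python glue code ties them together. The names are illustrative, and the snippet is written against the `torch.jit` API as it appears in released versions of PyTorch, whose spelling differs slightly from the preview examples above.

```python
import torch

# A small function with data-dependent control flow: compiled with the script
# decorator so the conditional is preserved instead of being baked in by a trace.
@torch.jit.script
def clamp_if_negative(x):
    if bool(x.sum() < 0):
        return torch.zeros_like(x)
    return x

# A plain feed-forward block with no control flow: a good candidate for tracing.
class Block(torch.nn.Module):
    def __init__(self):
        super(Block, self).__init__()
        self.linear = torch.nn.Linear(4, 4)

    def forward(self, x):
        return torch.relu(self.linear(x))

example = torch.randn(2, 4)
traced_block = torch.jit.trace(Block(), example)

# Ordinary Python code can freely combine traced and scripted pieces.
def model(x):
    return clamp_if_negative(traced_block(x))

print(model(example))
```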
- -Most importantly, these modes will be built into the core of PyTorch so that mixing and matching them with your existing code can happen seamlessly. - -_Note: The name JIT for these components is a bit of a misnomer and comes from historical reasons. The tracing/function execution in PyTorch started out as an optimizing JIT compiler that generated fused CUDA kernels but then grew to encompass optimization, @script, and export. When it is ready for release we will likely rename this functionality to the hybrid frontend, but we wanted to present it here as it is named in the code so that you can follow along as we develop it._ - -## Other changes and improvements - -Production support is the big feature for 1.0, but we will continue optimizing and fixing other parts of PyTorch as course of the standard release process. - -On the backend side of things, PyTorch will see some changes, which might affect user-written C and C++ extensions. We are replacing (or refactoring) the backend ATen library to incorporate features and optimizations from Caffe2. - -## Last Words - -We aim to release 1.0 some time during the summer. You can follow-along our progress on the [Pull Requests](https://github.com/pytorch/pytorch/pulls) page. - -You can read this from the perspective of the Caffe2 project at: [https://caffe2.ai/blog/2018/05/02/Caffe2_PyTorch_1_0.html](https://caffe2.ai/blog/2018/05/02/Caffe2_PyTorch_1_0.html) diff --git a/_posts/2019-05-08-model-serving-in-pyorch.md b/_posts/2019-05-08-model-serving-in-pyorch.md deleted file mode 100644 index 512268e5f198..000000000000 --- a/_posts/2019-05-08-model-serving-in-pyorch.md +++ /dev/null @@ -1,61 +0,0 @@ ---- -layout: blog_detail -title: 'Model Serving in PyTorch' -author: Jeff Smith -redirect_from: /2019/05/08/model-serving-in-pyorch.html ---- - -PyTorch has seen a lot of adoption in research, but people can get confused about how well PyTorch models can be taken into production. This blog post is meant to clear up any confusion people might have about the road to production in PyTorch. -Usually when people talk about taking a model “to production,” they usually mean performing **inference**, sometimes called model evaluation or prediction or serving. At the level of a function call, in PyTorch, inference looks something like this: - -* In Python - * `module(input)` -* In traced modules - * `module(input)` -* In C++ - * `at::Tensor output = module->forward(inputs).toTensor();` - -Since we at Facebook perform inference operations using PyTorch hundreds of trillions of times per day, we've done a lot to make sure that inference runs as efficiently as possible. - -## Serving Strategies - -That zoomed-in view of how you use models in inference isn't usually the whole story, though. In a real world machine learning system, you often need to do more than just run a single inference operation in the REPL or Jupyter notebook. Instead, you usually need to integrate your model into a larger application in some way. Depending on what you need to do, you can usually take one of the following approaches. - -### Direct embedding - -In application settings like mobile, we often just directly call the model as part of a larger program. This isn't just for apps; usually this is how robotics and dedicated devices work as well. At a code-level, the call to the model is exactly the same as what is shown above in the section about inference shown above. 
A key concern is often that a Python interpreter is not present in such environments, which is why PyTorch allows you to call your models from C++ and ship a model without the need for a Python runtime. - -### Model microservices - -If you're using your model in a server side context and you're managing multiple models, you might choose to treat each individual model (or each individual model version) as a separate service, usually using some sort of packaging mechanism like a Docker container. Then that service is often made network accessible via some sort of service, either using JSON over HTTP or an RPC technology like gRPC. The key characteristic of this approach is that you're defining a service with a single endpoint that just calls your model. Then you do do all of your model management (promotion, rollback, etc.) via whatever system you already use to manage your services (e.g. kubernetes, ECS). - -### Model servers - -An additional possible solution is to use a model server. This is an application built to manage and serve models. It allows you to upload multiple models and get distinct prediction endpoints for each of them. Typically such systems include a number of other features to help solve more of the whole problem of managing and serving models. This can include things like metrics, visualization, data pre-processing, and more. Even something as simple as having a system for automatically versioning models can make building important features like model rollbacks much easier. - -### Evolving Patterns - -The above is a somewhat arbitrary breakdown of different approaches based on a snapshot in time. Design patterns are still evolving. Recently, model server designs have started to adopt more of the technologies of general service infrastructure such as Docker containers and kubernetes, so many model servers have started to share properties of the model microservice design discussed above. For a deeper dive into the general concepts of model server designs, you can check out my [book on machine learning systems](https://www.manning.com/books/machine-learning-systems). - -## Serving PyTorch Models - -So, if you're a PyTorch user, what should you use if you want to take your models to production? - -If you're on mobile or working on an embedded system like a robot, direct embedding in your application is often the right choice. -For mobile specifically, your use case might be served by the ONNX export functionality. -Note that ONNX, by its very nature, has limitations and doesn't support all of the functionality provided by the larger PyTorch project. -You can check out [this tutorial](https://pytorch.org/tutorials/advanced/super_resolution_with_caffe2.html) on deploying PyTorch models to mobile using ONNX to see if this path might suit your use case. -That said, we've heard that there's a lot more that PyTorch users want to do on mobile, so look for more mobile-specific functionality in PyTorch in the future. -For other embedded systems, like robots, running [inference on a PyTorch model from the C++ API](https://pytorch.org/tutorials/advanced/cpp_export.html) could be the right solution. - -If you can't use the cloud or prefer to manage all services using the same technology, you can follow [this example](https://medium.com/datadriveninvestor/deploy-your-pytorch-model-to-production-f69460192217) to build a simple model microservice using the Flask web framework. 
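To give a feel for this approach, here is a minimal sketch of a single-endpoint Flask service wrapping a model. The model file name (`model.pt`) and the JSON payload shape are placeholders for illustration; a real service would add input validation, batching, logging, and error handling.

```python
# Minimal single-endpoint model microservice (illustrative sketch).
import torch
from flask import Flask, jsonify, request

app = Flask(__name__)

# Load a TorchScript model exported earlier; "model.pt" is a placeholder name.
model = torch.jit.load("model.pt")
model.eval()

@app.route("/predict", methods=["POST"])
def predict():
    # Expects a JSON body like {"data": [[...], ...]} (placeholder schema).
    payload = request.get_json()
    inputs = torch.tensor(payload["data"], dtype=torch.float32)
    with torch.no_grad():
        outputs = model(inputs)
    return jsonify({"prediction": outputs.tolist()})

if __name__ == "__main__":
    app.run(host="0.0.0.0", port=5000)
```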
- -If you want to manage multiple models within a non-cloud service solution, there are teams developing PyTorch support in model servers like [MLFlow](https://mlflow.org/), [Kubeflow](https://www.kubeflow.org/), and [RedisAI.](https://oss.redislabs.com/redisai/) We're excited to see innovation from multiple teams building OSS model servers, and we'll continue to highlight innovation in the PyTorch ecosystem in the future. - -If you can use the cloud for your application, there are several great choices for working with models in the cloud. For AWS Sagemaker, you can start find a guide to [all of the resources from AWS for working with PyTorch](https://docs.aws.amazon.com/sagemaker/latest/dg/pytorch.html), including docs on how to use the [Sagemaker Python SDK](https://sagemaker.readthedocs.io/en/stable/using_pytorch.html). You can also see [some](https://youtu.be/5h1Ot2dPi2E) [talks](https://youtu.be/qc5ZikKw9_w) we've given on using PyTorch on Sagemaker. Finally, if you happen to be using PyTorch via FastAI, then they've written a really simple guide to getting up and running on Sagemaker. - -The story is similar across other major clouds. On Google Cloud, you can follow [these instructions](https://cloud.google.com/deep-learning-vm/docs/pytorch_start_instance) to get access to a Deep Learning VM with PyTorch pre-installed. On Microsoft Azure, you have a number of ways to get started from [Azure Machine Learning Service](https://azure.microsoft.com/en-us/services/machine-learning-service/) to [Azure Notebooks](https://notebooks.azure.com/pytorch/projects/tutorials) showing how to use PyTorch. - -## Your Models - -Whichever approach you take to bringing your PyTorch models to production, we want to support you and enable your success. Do you love one of the options above? Are you having difficulty with that one crucial feature you can't find support for? We'd love to discuss more on the [deployment category](https://discuss.pytorch.org/c/deployment) on the PyTorch Discuss forums. We'd love to help, and where you're seeing success, amplify your story. diff --git a/_posts/2019-05-1-pytorch-adds-new-dev-tools.md b/_posts/2019-05-1-pytorch-adds-new-dev-tools.md deleted file mode 100644 index 855ef785ad0e..000000000000 --- a/_posts/2019-05-1-pytorch-adds-new-dev-tools.md +++ /dev/null @@ -1,69 +0,0 @@ ---- -layout: blog_detail -title: 'PyTorch adds new dev tools as it hits production scale' -author: The PyTorch Team ---- - -_This is a partial re-post of the original blog post on the Facebook AI Blog. The full post can be [viewed here](https://ai.facebook.com/blog/pytorch-adds-new-dev-tools-as-it-hits-production-scale/)_ - -Since its release just a few months ago, [PyTorch 1.0](http://pytorch.org/) has been rapidly adopted as a powerful, flexible deep learning platform that enables engineers and researchers to move quickly from research to production. We are highlighting some of the ways the AI engineering and research community is using PyTorch 1.0. We’re also sharing new details about the latest release, PyTorch 1.1, and showcasing some of the new development tools created by the community. - -Building on the initial launch of PyTorch in 2017, we partnered with the AI community to ship the stable release of PyTorch 1.0 [last December](https://code.fb.com/ai-research/pytorch-developer-ecosystem-expands-1-0-stable-release/). 
Along with enhanced production-oriented capabilities and deep integration with leading cloud platforms, PyTorch 1.0 expands on the open source library’s core features, with the addition of PyTorch JIT (Just in time compilation) that seamlessly transitions between eager mode and graph mode to provide both flexibility and speed. - -Leading businesses across industries are beginning to use PyTorch to both facilitate their research and then also deploy at large scale for applications such as translation, computer vision, conversational interfaces, pharmaceutical research, factory optimization, and automated driving research. Community adoption of PyTorch has also continued to expand. Stanford, UC Berkeley, Caltech, and other universities are using PyTorch as a fundamental tool for their machine learning (ML) courses; new ecosystem projects have launched to support development on PyTorch; and major cloud platforms have expanded their integration with PyTorch. - -## Using PyTorch across industries - -Many leading businesses are moving to PyTorch 1.0 to accelerate development and deployment of new AI systems. Here are some examples: - -- Airbnb leveraged PyTorch's rich libraries and APIs for conversational AI and deployed a Smart Reply to help the company’s service agents respond more effectively to customers. -- [ATOM](https://atomscience.org/) is building a platform to generate and optimize new drug candidates significantly faster and with greater success than conventional processes. Using machine learning frameworks such as PyTorch, ATOM was able to design a variational autoencoder for representing diverse chemical structures and designing new drug candidates. -- Genentech is utilizing PyTorch’s flexible control structures and dynamic graphs to train deep learning models that will aid in the development of individualized cancer therapy. -- Microsoft is using PyTorch across its organization to develop ML models at scale and deploy them via the ONNX Runtime. Using PyTorch, Microsoft Cognition has built distributed language models that scale to billions of words and are now in production in offerings such as Cognitive Services. -- Toyota Research Institute (TRI) is developing a two-pronged approach toward automated driving with Toyota Guardian and Toyota Chauffeur technologies. The Machine Learning Team at TRI is creating new deep learning algorithms to leverage Toyota's 10 million sales per year data advantage. The flexibility of PyTorch has vastly accelerated their pace of exploration and its new production features will enable faster deployment towards their safety critical applications. - -Following the release of PyTorch 1.0 in December 2018, we’re now announcing the availability of v1.1, which improves performance, adds new model understanding and visualization tools to improve usability, and provides new APIs. - -Key features of PyTorch v1.1 include: - -- [TensorBoard](https://www.tensorflow.org/tensorboard): First-class and native support for visualization and model debugging with TensorBoard, a web application suite for inspecting and understanding training runs and graphs. PyTorch now natively supports TensorBoard with a simple “from torch.utils.tensorboard import SummaryWriter” command. -- JIT compiler: Improvements to just-in-time (JIT) compilation. These include various bug fixes as well as expanded capabilities in TorchScript, such as support for dictionaries, user classes, and attributes. 
-- New APIs: Support for Boolean tensors and better support for custom recurrent neural networks. -- Distributed Training: Improved performance for common models such as CNNs, added support for multi device modules including the ability to split models across GPUs while still using Distributed Data Parallel (DDP) and support for modules where not all parameters are used in every iteration (e.g. control flow, like adaptive softmax, etc). See the latest tutorials [here](https://pytorch.org/tutorials/intermediate/model_parallel_tutorial.html). - -We’ve also continued to partner with the community to foster projects and tools aimed at supporting ML engineers for needs ranging from improved model understanding to auto-tuning using AutoML methods. With the release of Ax and BoTorch (below), we will be sharing some of our core algorithms, including meta-learning for efficiently optimizing hyperparameters from based on historical tasks. We are excited to see this work open-sourced for the community to build on. - -This ecosystem includes open source projects and tools that have been deployed at production scale, as well as products and services from our partnership with industry leaders who share our vision of an open and collaborative AI community. Here are a few of the latest tools: - -- [BoTorch](https://ai.facebook.com/blog/open-sourcing-ax-and-botorch-new-ai-tools-for-adaptive-experimentation/): BoTorch is a research framework built on top of PyTorch to provide Bayesian optimization, a sample-efficient technique for sequential optimization of costly-to-evaluate black-box functions. -- [Ax](https://ai.facebook.com/blog/open-sourcing-ax-and-botorch-new-ai-tools-for-adaptive-experimentation/): Ax is an ML platform for managing adaptive experiments. It enables researchers and engineers to systematically explore large configuration spaces in order to optimize machine learning models, infrastructure, and products. -- [PyTorch-BigGraph](https://ai.facebook.com/blog/open-sourcing-pytorch-biggraph-for-faster-embeddings-of-extremely-large-graphs/): PBG is a distributed system for creating embeddings of very large graphs with billions of entities and trillions of edges. It includes support for sharding and negative sampling and it offers sample use cases based on Wikidata embeddings. -- [Google AI Platform Notebooks](https://cloud.google.com/ai-platform-notebooks/): AI Platform Notebooks is a new, hosted JupyterLab service from Google Cloud Platform. Data scientists can quickly create virtual machines running JupyterLab with the latest version of PyTorch preinstalled. It is also tightly integrated with GCP services such as BigQuery, Cloud Dataproc, Cloud Dataflow, and AI Factory, making it easy to execute the full ML cycle without ever leaving JupyterLab. - -We’re also excited to see many interesting new projects from the broader PyTorch community. Highlights include: - -- [BigGAN-PyTorch](https://github.com/ajbrock/BigGAN-PyTorch):This is a full PyTorch reimplementation that uses gradient accumulation to provide the benefits of big batches on as few as four GPUs. -- [GeomLoss](http://www.kernel-operations.io/geomloss/index.html): A Python API that defines PyTorch layers for geometric loss functions between sampled measures, images, and volumes. It includes MMD, Wasserstein, Sinkhorn, and more. - -
        - -- [PyTorch Geometric](https://github.com/rusty1s/pytorch_geometric): A deep learning extension library for PyTorch that offers several methods for deep learning on graphs and other irregular structures (also known as [geometric deep learning](http://geometricdeeplearning.com)) from a variety of published papers. -- [Curve-GCN](https://github.com/fidler-lab/curve-gcn): A real-time, interactive image annotation approach that uses an end-to-end-trained graph convolutional network (GCN). It supports object annotation by either polygons or splines, facilitating labeling efficiency for both line-based and curved objects. Curve-GCN runs 10x faster than traditional methods, such as Polygon-RNN++. - -## Udacity, fast.ai, and others develop new PyTorch resources - -PyTorch is ideal for teaching ML development because it enables rapid experimentation through its flexible, dynamic programming environment and user-friendly Pythonic interface. In addition, Google Colab now offers an interactive Jupyter Notebook environment that natively supports PyTorch, allowing developers to run any PyTorch tutorial immediately with free CPU and GPU resources. - -University-level classes — including [Stanford NLP](http://web.stanford.edu/class/cs224n), [UC Berkeley](https://inst.eecs.berkeley.edu/~cs280/sp18/) Computer Vision, and [Caltech](http://cast.caltech.edu) Robotics courses — are now being taught on PyTorch. In addition, massive open online courses (MOOCs) are training thousands of new PyTorch developers. - -Today, we’re announcing a [new Udacity course](https://blog.udacity.com/2019/05/announcing-the-secure-and-private-ai-scholarship-challenge-with-facebook.html), building upon the Intro to Deep Learning course launched last year. This new course, led by Andrew Trask of Oxford University and OpenMined, covers important concepts around privacy in AI, including methods such as differential privacy and federated learning. Facebook will also be providing scholarships to support students as they continue their ML education in Udacity’s full Nanodegree programs. - -The [fast.ai](https://www.fast.ai) community is also continuing to invest energy and resources in PyTorch. In June, fast.ai will launch a new course called Deep Learning from the Foundations, which will show developers how to go all the way from writing matrix multiplication from scratch to how to train and implement a state-of-the-art ImageNet model. The course will include deep dives into the underlying implementation of methods in the PyTorch and fast.ai libraries, and will use the code to explain and illustrate the academic papers that underlie these methods. - -As part of the course, fast.ai will also release new software modules, including fastai.audio, which brings the power of fast.ai’s deep abstractions and curated algorithms to the new PyTorch.audio module, and show how fastai.vision can be used to [create stunning high-resolution videos](https://www.fast.ai/2019/05/03/decrappify) from material such as old classic movies, and from cutting-edge microscopy sequences through a collaboration with the [Salk Institute](https://www.salk.edu). In addition, fast.ai is contributing its new X-ResNet module, including a suite of models pretrained on ImageNet. 
- -## Getting started with PyTorch - -Everyone in the AI community — including those new to ML development as well as researchers and engineers looking for ways to accelerate their end-to-end workflows — can experiment with PyTorch instantly by visiting [pytorch.org](https://pytorch.org) and launching a [tutorial](https://pytorch.org/tutorials) in Colab. There are also many easy ways to [get started](https://pytorch.org/get-started/locally) both locally and on popular cloud platforms. diff --git a/_posts/2019-06-10-towards-reproducible-research-with-pytorch-hub.md b/_posts/2019-06-10-towards-reproducible-research-with-pytorch-hub.md deleted file mode 100644 index 35a4306d7557..000000000000 --- a/_posts/2019-06-10-towards-reproducible-research-with-pytorch-hub.md +++ /dev/null @@ -1,221 +0,0 @@ ---- -layout: blog_detail -title: 'Towards Reproducible Research with PyTorch Hub' -author: Team PyTorch -redirect_from: /2019/06/10/pytorch_hub.html ---- - -Reproducibility is an essential requirement for many fields of research including those based on machine learning techniques. However, many machine learning publications are either not reproducible or are difficult to reproduce. With the continued growth in the number of research publications, including tens of thousands of papers now hosted on arXiv and submissions to conferences at an all time high, research reproducibility is more important than ever. While many of these publications are accompanied by code as well as trained models which is helpful but still leaves a number of steps for users to figure out for themselves. - -We are excited to announce the availability of PyTorch Hub, a simple API and workflow that provides the basic building blocks for improving machine learning research reproducibility. PyTorch Hub consists of a pre-trained model repository designed specifically to facilitate research reproducibility and enable new research. It also has built-in support for [Colab](https://colab.research.google.com/), integration with [*Papers With Code*](https://paperswithcode.com/) and currently contains a broad set of models that include Classification and Segmentation, Generative, Transformers, etc. - -
        - -## [Owner] Publishing models - -PyTorch Hub supports the publication of pre-trained models (model definitions and pre-trained weights) to a GitHub repository by adding a simple ```hubconf.py``` file. -This provides an enumeration of which models are to be supported and a list of dependencies needed to run the models. -Examples can be found in the [torchvision](https://github.com/pytorch/vision/blob/master/hubconf.py), [huggingface-bert](https://github.com/huggingface/pytorch-pretrained-BERT/blob/master/hubconf.py) and [gan-model-zoo](https://github.com/facebookresearch/pytorch_GAN_zoo) repositories. - -Let us look at the simplest case: `torchvision`'s `hubconf.py`: - -```python -# Optional list of dependencies required by the package -dependencies = ['torch'] - -from torchvision.models.alexnet import alexnet -from torchvision.models.densenet import densenet121, densenet169, densenet201, densenet161 -from torchvision.models.inception import inception_v3 -from torchvision.models.resnet import resnet18, resnet34, resnet50, resnet101, resnet152,\ -resnext50_32x4d, resnext101_32x8d -from torchvision.models.squeezenet import squeezenet1_0, squeezenet1_1 -from torchvision.models.vgg import vgg11, vgg13, vgg16, vgg19, vgg11_bn, vgg13_bn, vgg16_bn, vgg19_bn -from torchvision.models.segmentation import fcn_resnet101, deeplabv3_resnet101 -from torchvision.models.googlenet import googlenet -from torchvision.models.shufflenetv2 import shufflenet_v2_x0_5, shufflenet_v2_x1_0 -from torchvision.models.mobilenet import mobilenet_v2 -``` - -In `torchvision`, the models have the following properties: -- Each model file can function and be executed independently -- They dont require any package other than PyTorch (encoded in `hubconf.py` as `dependencies['torch']`) -- They dont need separate entry-points, because the models when created, work seamlessly out of the box - -Minimizing package dependencies reduces the friction for users to load your model for immediate experimentation. - -A more involved example is HuggingFace's BERT models. Here is their `hubconf.py` - -```python -dependencies = ['torch', 'tqdm', 'boto3', 'requests', 'regex'] - -from hubconfs.bert_hubconf import ( - bertTokenizer, - bertModel, - bertForNextSentencePrediction, - bertForPreTraining, - bertForMaskedLM, - bertForSequenceClassification, - bertForMultipleChoice, - bertForQuestionAnswering, - bertForTokenClassification -) -``` - -Each model then requires an entrypoint to be created. Here is a code snippet to specify an entrypoint of the ```bertForMaskedLM``` model, which returns the pre-trained model weights. - -```python -def bertForMaskedLM(*args, **kwargs): - """ - BertForMaskedLM includes the BertModel Transformer followed by the - pre-trained masked language modeling head. - Example: - ... - """ - model = BertForMaskedLM.from_pretrained(*args, **kwargs) - return model -``` - -These entry-points can serve as wrappers around complex model factories. They can give a clean and consistent help docstring, have logic to support downloading of pretrained weights (for example via `pretrained=True`) or have additional hub-specific functionality such as visualization. - -With a `hubconf.py` in place, you can send a pull request based on the template [here](https://github.com/pytorch/hub/blob/master/docs/template.md). -Our goal is to curate high-quality, easily-reproducible, maximally-beneficial models for research reproducibility. 
-Hence, we may work with you to refine your pull request and in some cases reject some low-quality models to be published. -Once we accept your pull request, your model will soon appear on [Pytorch hub webpage](https://pytorch.org/hub) for all users to explore. - - -## [User] Workflow - -As a user, PyTorch Hub allows you to follow a few simple steps and do things like: 1) explore available models; 2) load a model; and 3) understand what methods are available for any given model. Let's walk through some examples of each. - -### Explore available entrypoints. - -Users can list all available entrypoints in a repo using the ```torch.hub.list()``` API. - -```python ->>> torch.hub.list('pytorch/vision') ->>> -['alexnet', -'deeplabv3_resnet101', -'densenet121', -... -'vgg16', -'vgg16_bn', -'vgg19', - 'vgg19_bn'] - ``` - -Note that PyTorch Hub also allows auxillary entrypoints (other than pretrained models), e.g. ```bertTokenizer``` for preprocessing in the BERT models, to make the user workflow smoother. - - -### Load a model - -Now that we know which models are available in the Hub, users can load a model entrypoint using the ```torch.hub.load()``` API. This only requires a single command without the need to install a wheel. In addition the ```torch.hub.help()``` API can provide useful information about how to instantiate the model. - -```python -print(torch.hub.help('pytorch/vision', 'deeplabv3_resnet101')) -model = torch.hub.load('pytorch/vision', 'deeplabv3_resnet101', pretrained=True) -``` - -It is also common that repo owners will want to continually add bug fixes or performance improvements. PyTorch Hub makes it super simple for users to get the latest update by calling: - -```python -model = torch.hub.load(..., force_reload=True) -``` - -We believe this will help to alleviate the burden of repetitive package releases by repo owners and instead allow them to focus more on their research. -It also ensures that, as a user, you are getting the freshest available models. - -On the contrary, stability is important for users. Hence, some model owners serve them from a specificed branch or tag, rather than the `master` branch, to ensure stability of the code. -For example, `pytorch_GAN_zoo` serves them from the `hub` branch: - -```python -model = torch.hub.load('facebookresearch/pytorch_GAN_zoo:hub', 'DCGAN', pretrained=True, useGPU=False) -``` - -Note that the ```*args```, ```**kwargs``` passed to `hub.load()` are used to *instantiate* a model. In the above example, `pretrained=True` and `useGPU=False` are given to the model's entrypoint. - - -### Explore a loaded model - -Once you have a model from PyTorch Hub loaded, you can use the following workflow to find out the available methods that are supported as well as understand better what arguments are requires to run it. - - -```dir(model)``` to see all available methods of the model. Let's take a look at `bertForMaskedLM`'s available methods. - -```python ->>> dir(model) ->>> -['forward' -... -'to' -'state_dict', -] -``` - -```help(model.forward)``` provides a view into what arguments are required to make your loaded model run - -```python ->>> help(model.forward) ->>> -Help on method forward in module pytorch_pretrained_bert.modeling: -forward(input_ids, token_type_ids=None, attention_mask=None, masked_lm_labels=None) -... -``` - -Have a closer look at the BERT and [DeepLabV3](https://pytorch.org/hub/pytorch_vision_deeplabv3_resnet101/) pages, where you can see how these models can be used once loaded. 
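Putting these pieces together, a minimal end-to-end sketch of loading a Hub model and running a forward pass might look like the following. The random tensor stands in for a properly preprocessed image; the model's Hub page documents the exact preprocessing it expects.

```python
import torch

# Load the segmentation model used in the examples above (downloads weights).
model = torch.hub.load('pytorch/vision', 'deeplabv3_resnet101', pretrained=True)
model.eval()

# A random 3-channel "image" batch stands in for a real, normalized input.
dummy_input = torch.rand(1, 3, 224, 224)

with torch.no_grad():
    output = model(dummy_input)['out']

# One output channel per class, at the input resolution.
print(output.shape)
```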
- -### Other ways to explore - -Models available in PyTorch Hub also support both [Colab](https://colab.research.google.com/github/pytorch/pytorch.github.io/blob/master/assets/hub/facebookresearch_pytorch-gan-zoo_pgan.ipynb) and are directly linked on [Papers With Code](https://paperswithcode.com/) and you can get started with a single click. [Here](https://paperswithcode.com/paper/densely-connected-convolutional-networks) is a good example to get started with (shown below). - -
        - -## Additional resources: - -* PyTorch Hub API documentation can be found [here](https://pytorch.org/docs/stable/hub.html). -* Submit a model [here](https://github.com/pytorch/hub) for publication in PyTorch Hub. -* Go to [https://pytorch.org/hub](https://pytorch.org/hub) to learn more about the available models. -* Look for more models to come on [paperswithcode.com](https://paperswithcode.com/). - - -A BIG thanks to the folks at HuggingFace, the PapersWithCode team, fast.ai and Nvidia as well as Morgane Riviere (FAIR Paris) and lots of others for helping bootstrap this effort!! - -Cheers! - -Team PyTorch - - - - -## FAQ: - -**Q: If we would like to contribute a model that is already in the Hub but perhaps mine has better accuracy, should I still contribute?** - - -A: Yes!! A next step for Hub is to implement an upvote/downvote system to surface the best models. - -**Q: Who hosts the model weights for PyTorch Hub?** - - -A: You, as the contributor, are responsible to host the model weights. You can host your model in your favorite cloud storage or, if it fits within the limits, on GitHub. If it is not within your means to host the weights, check with us via opening an issue on the hub repository. - -**Q: What if my model is trained on private data? Should I still contribute this model?** - - -A: No! PyTorch Hub is centered around open research and that extends to the usage of open datasets to train these models on. If a pull request for a proprietary model is submitted, we will kindly ask that you resubmit a model trained on something open and available. - -**Q: Where are my downloaded models saved?** - - -A: We follow the [XDG Base Directory Specification](https://specifications.freedesktop.org/basedir-spec/basedir-spec-latest.html) and adhere to common standards around cached files and directories. - -The locations are used in the order of: - -* Calling ```hub.set_dir()``` -* ```$TORCH_HOME/hub```, if environment variable ```TORCH_HOME``` is set. -* ```$XDG_CACHE_HOME/torch/hub```, if environment variable ```XDG_CACHE_HOME``` is set. -* ```~/.cache/torch/hub``` diff --git a/_posts/2019-07-18-pytorch-ecosystem.md b/_posts/2019-07-18-pytorch-ecosystem.md deleted file mode 100644 index 1be05469bb83..000000000000 --- a/_posts/2019-07-18-pytorch-ecosystem.md +++ /dev/null @@ -1,84 +0,0 @@ ---- -layout: blog_detail -title: 'PyTorch Adds New Ecosystem Projects for Encrypted AI and Quantum Computing, Expands PyTorch Hub' -author: Team PyTorch ---- - -The PyTorch ecosystem includes projects, tools, models and libraries from a broad community of researchers in academia and industry, application developers, and ML engineers. The goal of this ecosystem is to support, accelerate, and aid in your exploration with PyTorch and help you push the state of the art, no matter what field you are exploring. Similarly, we are expanding the recently launched PyTorch Hub to further help you discover and reproduce the latest research. - -In this post, we’ll highlight some of the projects that have been added to the PyTorch ecosystem this year and provide some context on the criteria we use to evaluate community projects. We’ll also provide an update on the fast-growing PyTorch Hub and share details on our upcoming PyTorch Summer Hackathon. - -
        - -## Recently added ecosystem projects - -From private AI to quantum computing, we’ve seen the community continue to expand into new and interesting areas. The latest projects include: - -- [Advertorch](https://github.com/BorealisAI/advertorch): A Python toolbox for adversarial robustness research. The primary functionalities are implemented in PyTorch. Specifically, AdverTorch contains modules for generating adversarial perturbations and defending against adversarial examples, as well as scripts for adversarial training. - -- [botorch](https://botorch.org/): A modular and easily extensible interface for composing Bayesian optimization primitives, including probabilistic models, acquisition functions, and optimizers. - -- [Skorch](https://github.com/skorch-dev/skorch): A high-level library for PyTorch that provides full scikit-learn compatibility. - -- [PyTorch Geometric](https://github.com/rusty1s/pytorch_geometric): A library for deep learning on irregular input data such as graphs, point clouds, and manifolds. - -- [PySyft](https://github.com/OpenMined/PySyft): A Python library for encrypted, privacy preserving deep learning. - -- [PennyLane](https://pennylane.ai/): A library for quantum ML, automatic differentiation, and optimization of hybrid quantum-classical computations. - -- [Flair](https://github.com/zalandoresearch/flair): A very simple framework for state-of-the-art natural language processing (NLP). - -### What makes a great project? - -When we review project submissions for the PyTorch ecosystem, we take into account a number of factors that we feel are important and that we would want in the projects we use ourselves. Some of these criteria include: - -1. *Well-tested:* Users should be confident that ecosystem projects will work well with PyTorch, and include support for CI to ensure that testing is occurring on a continuous basis and the project can run on the latest version of PyTorch. -2. *Clear utility:* Users should understand where each project fits within the PyTorch ecosystem and the value it brings. -3. *Permissive licensing:* Users must be able to utilize ecosystem projects without licensing concerns. e.g. BSD-3, Apache-2 and MIT licenses -4. *Easy onboarding:* Projects need to have support for binary installation options (pip/Conda), clear documentation and a rich set of tutorials (ideally built into Jupyter notebooks). -5. *Ongoing maintenance:* Project authors need to be committed to supporting and maintaining their projects. -6. *Community:* Projects should have (or be on track to building) an active, broad-based community. - -If you would like to have your project included in the PyTorch ecosystem and featured on [pytorch.org/ecosystem](http://pytorch.org/ecosystem), please complete the form [here](https://github.com/pytorch-fdn/ecosystem). If you've previously submitted a project for consideration and haven't heard back, we promise to get back to you as soon as we can - we've received a lot of submissions! - -## PyTorch Hub for reproducible research | New models - -Since [launching](https://pytorch.org/blog/towards-reproducible-research-with-pytorch-hub/) the PyTorch Hub in beta, we’ve received a lot of interest from the community including the contribution of many new models. 
Some of the latest include [U-Net for Brain MRI](https://pytorch.org/hub/mateuszbuda_brain-segmentation-pytorch_unet/) contributed by researchers at Duke University, [Single Shot Detection](https://pytorch.org/hub/nvidia_deeplearningexamples_ssd/) from NVIDIA and Transformer-XL from HuggingFace. - -We’ve seen organic integration of the PyTorch Hub by folks like [paperswithcode](https://paperswithcode.com/), making it even easier for you to try out the state of the art in AI research. In addition, companies like [Seldon](https://github.com/axsaucedo/seldon-core/tree/pytorch_hub/examples/models/pytorchhub) provide production-level support for PyTorch Hub models on top of Kubernetes. - -### What are the benefits of contributing a model in the PyTorch Hub? - -- *Compatibility:* PyTorch Hub models are prioritized first for testing by the TorchScript and Cloud TPU teams, and used as baselines for researchers across a number of fields. - -- *Visibility:* Models in the Hub will be promoted on [pytorch.org](http://pytorch.org/) as well as on [paperswithcode](https://paperswithcode.com/). - -- *Ease of testing and reproducibility:* Each model comes with code, clear preprocessing requirements, and methods/dependencies to run. There is also tight integration with [Google Colab](https://colab.research.google.com/github/pytorch/pytorch.github.io/blob/master/assets/hub/facebookresearch_WSL-Images_resnext.ipynb#scrollTo=LM_l7vXJvnDM), making it a true single click to get started. - -### PyTorch Hub contributions welcome! - -We are actively looking to grow the PyTorch Hub and welcome contributions. You don’t need to be an original paper author to contribute, and we’d love to see the number of domains and fields broaden. So what types of contributions are we looking for? - -- Artifacts of a published or an arXiv paper (or something of a similar nature that serves a different audience — such as ULMFit) that a large audience would need. - - AND - -- Reproduces the published results (or better) - -Overall these models are aimed at researchers either trying to reproduce a baseline, or trying to build downstream research on top of the model (such as feature-extraction or fine-tuning) as well as researchers looking for a demo of the paper for subjective evaluation. Please keep this audience in mind when contributing. - -If you are short on inspiration or would just like to find out what the SOTA is an any given field or domain, checkout the Paperswithcode [state-of-the-art gallery](https://paperswithcode.com/sota). - -## PyTorch Summer Hackathon - -We’ll be hosting the first PyTorch Summer Hackathon next month. We invite you to apply to participate in the in-person hackathon on August 8th to 9th at Facebook's Menlo Park campus. We'll be bringing the community together to work on innovative ML projects that can solve a broad range of complex challenges. - -Applications will be reviewed and accepted on a rolling basis until spaces are filled. For those who cannot join this Hackathon in person, we’ll be following up soon with other ways to participate. - -Please visit [this link to apply](https://www.eventbrite.com/e/pytorch-summer-hackathon-in-menlo-park-registration-63756668913). - -Thank you for being part of the PyTorch community! 
- --Team PyTorch diff --git a/_posts/2019-07-23-mapillary-research.md b/_posts/2019-07-23-mapillary-research.md deleted file mode 100644 index ee01ffcbd9cc..000000000000 --- a/_posts/2019-07-23-mapillary-research.md +++ /dev/null @@ -1,114 +0,0 @@ ---- -layout: blog_detail -title: 'Mapillary Research: Seamless Scene Segmentation and In-Place Activated BatchNorm' -author: Lorenzo Porzi, Mapillary -redirect_from: /2019/07/23/mapillary-research.html ---- - -With roads in developed countries like the US changing up to 15% annually, Mapillary addresses a growing demand for keeping maps updated by combining images from any camera into a 3D visualization of the world. Mapillary's independent and collaborative approach enables anyone to collect, share, and use street-level images for improving maps, developing cities, and advancing the automotive industry. - -Today, people and organizations all over the world have contributed more than 600 million images toward Mapillary's mission of helping people understand the world's places through images and making this data available, with clients and partners including the World Bank, HERE, and Toyota Research Institute. - -Mapillary’s computer vision technology brings intelligence to maps in an unprecedented way, increasing our overall understanding of the world. [Mapillary](https://www.mapillary.com/) runs state-of-the-art semantic image analysis and image-based 3d modeling at scale and on all its images. In this post we discuss two recent works from Mapillary Research and their implementations in PyTorch - Seamless Scene Segmentation [1] and In-Place Activated BatchNorm [2] - generating Panoptic segmentation results and saving up to 50% of GPU memory during training, respectively. - -## Seamless Scene Segmentation - -_Github project page: [https://github.com/mapillary/seamseg/](https://github.com/mapillary/seamseg/)_ - -
        - -The objective of Seamless Scene Segmentation is to predict a “panoptic” segmentation [3] from an image, that is a complete labeling where each pixel is assigned with a class id and, where possible, an instance id. Like many modern CNNs dealing with instance detection and segmentation, we adopt the Mask R-CNN framework [4], using ResNet50 + FPN [5] as a backbone. This architecture works in two stages: first, the “Proposal Head” selects a set of candidate bounding boxes on the image (i.e. the proposals) that could contain an object; then, the “Mask Head” focuses on each proposal, predicting its class and segmentation mask. The output of this process is a “sparse” instance segmentation, covering only the parts of the image that contain countable objects (e.g. cars and pedestrians). - -To complete our panoptic approach coined Seamless Scene Segmentation, we add a third stage to Mask R-CNN. Stemming from the same backbone, the “Semantic Head” predicts a dense semantic segmentation over the whole image, also accounting for the uncountable or amorphous classes (e.g. road and sky). The outputs of the Mask and Semantic heads are finally fused using a simple non-maximum suppression algorithm to generate the final panoptic prediction. All details about the actual network architecture, used losses and underlying math can be found at the [project website](https://research.mapillary.com/publication/cvpr19a) for our CVPR 2019 paper [1]. - -While several versions of Mask R-CNN are publicly available, including an [official implementation](https://github.com/facebookresearch/Detectron) written in Caffe2, at Mapillary we decided to build Seamless Scene Segmentation from scratch using PyTorch, in order to have full control and understanding of the whole pipeline. While doing so we encountered a couple of main stumbling blocks, and had to come up with some creative workarounds we are going to describe next. - -## Dealing with variable-sized tensors - -Something that sets aside panoptic segmentation networks from traditional CNNs is the prevalence of variable-sized data. In fact, many of the quantities we are dealing with cannot be easily represented with fixed sized tensors: each image contains a different number of objects, the Proposal head can produce a different number of proposals for each image, and the images themselves can have different sizes. While this is not a problem per-se -- one could just process images one at a time -- we would still like to exploit batch-level parallelism as much as possible. Furthermore, when performing distributed training with multiple GPUs, `DistributedDataParallel` expects its inputs to be batched, uniformly-sized tensors. - -
        - -Our solution to these issues is to wrap each batch of variable-sized tensors in a `PackedSequence`. `PackedSequence` is little more than a glorified list class for tensors, tagging its contents as “related”, ensuring that they all share the same type, and providing useful methods like moving all the tensors to a particular device, etc. When performing light-weight operations that wouldn’t be much faster with batch-level parallelism, we simply iterate over the contents of the `PackedSequence` in a for loop. When performance is crucial, e.g. in the body of the network, we simply concatenate the contents of the PackedSequence, adding zero padding as required (like in RNNs with variable-length inputs), and keeping track of the original dimensions of each tensor. - -`PackedSequence`s also help us deal with the second problem highlighted above. We slightly modify `DistributedDataParallel` to recognize `PackedSequence` inputs, splitting them in equally sized chunks and distributing their contents across the GPUs. - -## Asymmetric computational graphs with Distributed Data Parallel - -Another, perhaps more subtle, peculiarity of our network is that it can generate asymmetric computational graphs across GPUs. In fact, some of the modules that compose the network are “optional”, in the sense that they are not always computed for all images. As an example, when the Proposal head doesn’t output any proposal, the Mask head is not traversed at all. If we are training on multiple GPUs with `DistributedDataParallel`, this results in one of the replicas not computing gradients for the Mask head parameters. - -Prior to PyTorch 1.1, this resulted in a crash, so we had to develop a workaround. Our simple but effective solution was to compute a “fake forward pass” when no actual forward is required, i.e. something like this: - -```python -def fake_forward(): - fake_input = get_correctly_shaped_fake_input() - fake_output = mask_head(fake_input) - fake_loss = fake_output.sum() * 0 - return fake_loss -``` - -Here, we generate a batch of bogus data, pass it through the Mask head, and return a loss that always back-progates zeros to all parameters. - -Starting from PyTorch 1.1 this workaround is no longer required: by setting `find_unused_parameters=True` in the constructor, `DistributedDataParallel` is told to identify parameters whose gradients have not been computed by all replicas and correctly handle them. This leads to some substantial simplifications in our code base! - -## In-place Activated BatchNorm - -_Github project page: [https://github.com/mapillary/inplace_abn/](https://github.com/mapillary/inplace_abn/)_ - -Most researchers would probably agree that there are always constraints in terms of available GPU resources, regardless if their research lab has access to only a few or multiple thousands of GPUs. In a time where at Mapillary we still worked at rather few and mostly 12GB Titan X - style prosumer GPUs, we were searching for a solution that virtually enhances the usable memory during training, so we would be able to obtain and push state-of-the-art results on dense labeling tasks like semantic segmentation. In-place activated BatchNorm is enabling us to use up to 50% more memory (at little computational overhead) and is therefore deeply integrated in all our current projects (including Seamless Scene Segmentation described above). - -
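In practice, adopting it amounts to swapping `BatchNorm2d` + activation pairs for the fused module. The sketch below assumes the `inplace_abn` package from the repository linked above is installed and exposes an `InPlaceABN` layer, as its README describes; the channel count is arbitrary.

```python
import torch.nn as nn

# Assumption: the package from https://github.com/mapillary/inplace_abn is
# installed and provides a fused BatchNorm + activation layer named InPlaceABN.
from inplace_abn import InPlaceABN

# Standard pattern: Conv -> BatchNorm -> activation, which keeps both buffers.
standard_block = nn.Sequential(
    nn.Conv2d(64, 64, kernel_size=3, padding=1, bias=False),
    nn.BatchNorm2d(64),
    nn.LeakyReLU(0.01, inplace=True),
)

# Drop-in replacement: the fused module can discard the BatchNorm input buffer.
inplace_block = nn.Sequential(
    nn.Conv2d(64, 64, kernel_size=3, padding=1, bias=False),
    InPlaceABN(64),  # defaults to a leaky ReLU activation
)
```

How this memory saving is achieved is described next.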
        - -When processing a BN-Activation-Convolution sequence in the forward pass, most deep learning frameworks (including PyTorch) need to store two big buffers, i.e. the input x of BN and the input z of Conv. This is necessary because the standard implementations of the backward passes of BN and Conv depend on their inputs to calculate the gradients. Using InPlace-ABN to replace the BN-Activation sequence, we can safely discard x, thus saving up to 50% GPU memory at training time. To achieve this, we rewrite the backward pass of BN in terms of its output y, which is in turn reconstructed from z by inverting the activation function. - -The only limitation of InPlace-ABN is that it requires using an invertible activation function, such as leaky relu or elu. Except for this, it can be used as a direct, drop-in replacement for BN+activation modules in any network. Our native CUDA implementation offers minimal computational overhead compared to PyTorch’s standard BN, and is available for anyone to use from here: [https://github.com/mapillary/inplace_abn/](https://github.com/mapillary/inplace_abn/). - -## Synchronized BN with asymmetric graphs and unbalanced batches - -When training networks with synchronized SGD over multiple GPUs and/or multiple nodes, it’s common practice to compute BatchNorm statistics separately on each device. However, in our experience working with semantic and panoptic segmentation networks, we found that accumulating mean and variance across all workers can bring a substantial boost in accuracy. This is particularly true when dealing with small batches, like in Seamless Scene Segmentation where we train with a single, super-high resolution image per GPU. - -InPlace-ABN supports synchronized operation over multiple GPUs and multiple nodes, and, since version 1.1, this can also be achieved in the standard PyTorch library using [SyncBatchNorm](https://pytorch.org/docs/stable/nn.html#syncbatchnorm). Compared to SyncBatchNorm, however, we support some additional functionality which is particularly important for Seamless Scene Segmentation: unbalanced batches and asymmetric graphs. - -As mentioned before, Mask R-CNN-like networks naturally give rise to variable-sized tensors. Thus, in InPlace-ABN we calculate synchronized statistics using a variant of the parallel algorithm described [here](https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Parallel_algorithm), which properly takes into account the fact that each GPU can hold a different number of samples. PyTorch’s SyncBatchNorm is currently being revised to support this, and the improved functionality will be available in a future release. - -Asymmetric graphs (in the sense mentioned above) are another complicating factor one has to deal with when creating a synchronized BatchNorm implementation. Luckily, PyTorch’s distributed group functionality allows us to restrict distributed communication to a subset of workers, easily excluding those that are currently inactive. The only missing piece is that, in order to create a distributed group, each process needs to know the ids of all processes that will participate in the group, and even processes that are not part of the group need to call the `new_group()` function. 
In InPlace-ABN we handle it with a function like this: - -```python -import torch -import torch.distributed as distributed - -def active_group(active): - """Initialize a distributed group where each process can independently decide whether to participate or not""" - world_size = distributed.get_world_size() - rank = distributed.get_rank() - - # Gather active status from all workers - active = torch.tensor(rank if active else -1, dtype=torch.long, device=torch.cuda.current_device()) - active_workers = torch.empty(world_size, dtype=torch.long, device=torch.cuda.current_device()) - distributed.all_gather(list(active_workers.unbind(0)), active) - - # Create group - active_workers = [int(i) for i in active_workers.tolist() if i != -1] - group = distributed.new_group(active_workers) - return group -``` - -First each process, including inactive ones, communicates its status to all others through an `all_gather` call, then it creates the distributed group with the shared information. In the actual implementation we also include a caching mechanism for groups, since `new_group()` is usually too expensive to call at each batch. - -## References - -[1] Seamless Scene Segmentation; Lorenzo Porzi, Samuel Rota Bulò, Aleksander Colovic, Peter Kontschieder; Computer Vision and Pattern Recognition (CVPR), 2019 - -[2] In-place Activated BatchNorm for Memory-Optimized Training of DNNs; Samuel Rota Bulò, Lorenzo Porzi, Peter Kontschieder; Computer Vision and Pattern Recognition (CVPR), 2018 - -[3] Panoptic Segmentation; Alexander Kirillov, Kaiming He, Ross Girshick, Carsten Rother, Piotr Dollar; Computer Vision and Pattern Recognition (CVPR), 2019 - -[4] Mask R-CNN; Kaiming He, Georgia Gkioxari, Piotr Dollar, Ross Girshick; International Conference on Computer Vision (ICCV), 2017 - -[5] Feature Pyramid Networks for Object Detection; Tsung-Yi Lin, Piotr Dollar, Ross Girshick, Kaiming He, Bharath Hariharan, Serge Belongie; Computer Vision and Pattern Recognition (CVPR), 2017 diff --git a/_posts/2019-08-08-pytorch-1.2-and-domain-api-release.md b/_posts/2019-08-08-pytorch-1.2-and-domain-api-release.md deleted file mode 100644 index bcc30d86963a..000000000000 --- a/_posts/2019-08-08-pytorch-1.2-and-domain-api-release.md +++ /dev/null @@ -1,189 +0,0 @@ ---- -layout: blog_detail -title: 'New Releases: PyTorch 1.2, torchtext 0.4, torchaudio 0.3, and torchvision 0.4' -author: Team PyTorch -redirect_from: /2019/08/06/pytorch_aug2019_releases.html ---- - -Since the release of PyTorch 1.0, we’ve seen the community expand to add new tools, contribute to a growing set of models available in the PyTorch Hub, and continually increase usage in both research and production. - -From a core perspective, PyTorch has continued to add features to support both research and production usage, including the ability to bridge these two worlds via [TorchScript](https://pytorch.org/docs/stable/jit.html). Today, we are excited to announce that we have four new releases including PyTorch 1.2, torchvision 0.4, torchaudio 0.3, and torchtext 0.4. You can get started now with any of these releases at [pytorch.org](https://pytorch.org/get-started/locally/). - -# PyTorch 1.2 - -With PyTorch 1.2, the open source ML framework takes a major step forward for production usage with the addition of an improved and more polished TorchScript environment. These improvements make it even easier to ship production models, expand support for exporting ONNX formatted models, and enhance module level support for Transformers. 
In addition to these new features, [TensorBoard](https://pytorch.org/docs/stable/tensorboard.html) is no longer experimental - you can simply type `from torch.utils.tensorboard import SummaryWriter` to get started.

## TorchScript Improvements

Since its release in PyTorch 1.0, TorchScript has provided a path to production for eager PyTorch models. The TorchScript compiler converts PyTorch models to a statically typed graph representation, opening up opportunities for optimization and execution in constrained environments where Python is not available. You can incrementally convert your model to TorchScript, mixing compiled code seamlessly with Python.

PyTorch 1.2 significantly expands TorchScript’s support for the subset of Python used in PyTorch models and delivers a new, easier-to-use API for compiling your models to TorchScript. See the [migration guide](https://pytorch.org/docs/master/jit.html#migrating-to-pytorch-1-2-recursive-scripting-api) for details. Below is an example usage of the new API:

```python
import torch

class MyModule(torch.nn.Module):
    def __init__(self, N, M):
        super(MyModule, self).__init__()
        self.weight = torch.nn.Parameter(torch.rand(N, M))

    def forward(self, input):
        if input.sum() > 0:
            output = self.weight.mv(input)
        else:
            output = self.weight + input
        return output

# Compile the model code to a static representation
my_script_module = torch.jit.script(MyModule(3, 4))

# Save the compiled code and model data so it can be loaded elsewhere
my_script_module.save("my_script_module.pt")
```

To learn more, see our [Introduction to TorchScript](https://pytorch.org/tutorials/beginner/Intro_to_TorchScript.html) and [Loading a PyTorch Model in C++](https://pytorch.org/tutorials/advanced/cpp_export.html) tutorials.

## Expanded ONNX Export

The [ONNX](http://onnx.ai/) community continues to grow with an open [governance structure](https://github.com/onnx/onnx/wiki/Expanded-ONNX-Steering-Committee-Announced!) and additional steering committee members, special interest groups (SIGs), and working groups (WGs). In collaboration with Microsoft, we’ve added full support for exporting ONNX Opset versions 7 (v1.2), 8 (v1.3), 9 (v1.4), and 10 (v1.5). We have also enhanced the constant folding pass to support Opset 10, the latest available version of ONNX. ScriptModule has also been improved, including support for multiple outputs, tensor factories, and tuples as inputs and outputs. Additionally, users are now able to register their own symbolic functions to export custom ops, and to specify the dynamic dimensions of inputs during export. Here is a summary of all of the major improvements:

* Support for multiple Opsets, including the ability to export dropout, slice, flip, and interpolate in Opset 10.
* Improvements to ScriptModule, including support for multiple outputs, tensor factories, and tuples as inputs and outputs.
* More than a dozen additional PyTorch operators supported, including the ability to export a custom operator.
* Many bug fixes and test infra improvements.

You can try out the latest tutorial [here](https://pytorch.org/tutorials/advanced/super_resolution_with_onnxruntime.html), contributed by @lara-hdr at Microsoft. A big thank you to the entire Microsoft team for all of their hard work to make this release happen!
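As a hedged illustration of the expanded export options described above (the model choice, file name, and tensor names are illustrative, not taken from the release notes), exporting with one of the newer opsets and dynamic input dimensions looks roughly like this:

```python
import torch
import torchvision

# Any traceable model works; resnet18 is used here only as an example
model = torchvision.models.resnet18(pretrained=True).eval()
dummy_input = torch.randn(1, 3, 224, 224)

torch.onnx.export(
    model,
    dummy_input,
    "resnet18.onnx",
    opset_version=10,                     # one of the newly supported opsets
    input_names=["input"],
    output_names=["output"],
    dynamic_axes={"input": {0: "batch"},  # mark the batch dimension as dynamic
                  "output": {0: "batch"}},
)
```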
- -## nn.Transformer - -In PyTorch 1.2, we now include a standard [nn.Transformer](https://pytorch.org/docs/stable/nn.html?highlight=transformer#torch.nn.Transformer) module, based on the paper “[Attention is All You Need](https://arxiv.org/abs/1706.03762)”. The `nn.Transformer` module relies entirely on an [attention mechanism](https://pytorch.org/docs/stable/nn.html?highlight=nn%20multiheadattention#torch.nn.MultiheadAttention) to draw global dependencies between input and output. The individual components of the `nn.Transformer` module are designed so they can be adopted independently. For example, the [nn.TransformerEncoder](https://pytorch.org/docs/stable/nn.html?highlight=nn%20transformerencoder#torch.nn.TransformerEncoder) can be used by itself, without the larger `nn.Transformer`. The new APIs include: - -* `nn.Transformer` -* `nn.TransformerEncoder` and `nn.TransformerEncoderLayer` -* `nn.TransformerDecoder` and `nn.TransformerDecoderLayer` - -
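To illustrate the APIs listed above, here is a minimal sketch (with illustrative dimensions) of running the full `nn.Transformer` and of using the encoder stack on its own:

```python
import torch
import torch.nn as nn

# Full encoder-decoder transformer; inputs are (sequence, batch, d_model)
transformer = nn.Transformer(d_model=512, nhead=8,
                             num_encoder_layers=6, num_decoder_layers=6)
src = torch.rand(10, 32, 512)
tgt = torch.rand(20, 32, 512)
out = transformer(src, tgt)  # -> (20, 32, 512)

# The encoder can also be used independently of the full nn.Transformer
encoder_layer = nn.TransformerEncoderLayer(d_model=512, nhead=8)
encoder = nn.TransformerEncoder(encoder_layer, num_layers=6)
memory = encoder(src)        # -> (10, 32, 512)
```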
See the [Transformer Layers](https://pytorch.org/docs/stable/nn.html#transformer-layers) documentation for more information. See [here](https://github.com/pytorch/pytorch/releases) for the full PyTorch 1.2 release notes.

# Domain API Library Updates

PyTorch domain libraries like torchvision, torchtext, and torchaudio provide convenient access to common datasets, models, and transforms that can be used to quickly create a state-of-the-art baseline. Moreover, they also provide common abstractions to reduce boilerplate code that users might otherwise have to repeatedly write. Since research domains have distinct requirements, an ecosystem of specialized libraries called domain APIs (DAPI) has emerged around PyTorch to simplify the development of new and existing algorithms in a number of fields. We’re excited to release three updated DAPI libraries for text, audio, and vision that complement the PyTorch 1.2 core release.

## Torchaudio 0.3 with Kaldi Compatibility, New Transforms
        - -Torchaudio specializes in machine understanding of audio waveforms. It is an ML library that provides relevant signal processing functionality (but is not a general signal processing library). It leverages PyTorch’s GPU support to provide many tools and transformations for waveforms to make data loading and standardization easier and more readable. For example, it offers data loaders for waveforms using sox, and transformations such as spectrograms, resampling, and mu-law encoding and decoding. - -We are happy to announce the availability of torchaudio 0.3.0, with a focus on standardization and complex numbers, a transformation (resample) and two new functionals (phase_vocoder, ISTFT), Kaldi compatibility, and a new tutorial. Torchaudio was redesigned to be an extension of PyTorch and a part of the domain APIs (DAPI) ecosystem. - -### Standardization - -Significant effort in solving machine learning problems goes into data preparation. In this new release, we've updated torchaudio's interfaces for its transformations to standardize around the following vocabulary and conventions. - -Tensors are assumed to have channel as the first dimension and time as the last dimension (when applicable). This makes it consistent with PyTorch's dimensions. For size names, the prefix `n_` is used (e.g. "a tensor of size (`n_freq`, `n_mel`)") whereas dimension names do not have this prefix (e.g. "a tensor of dimension (channel, time)"). The input of all transforms and functions now assumes channel first. This is done to be consistent with PyTorch, which has channel followed by the number of samples. The channel parameter of all transforms and functions is now deprecated. - -The output of `STFT` is (channel, frequency, time, 2), meaning for each channel, the columns are the Fourier transform of a certain window, so as we travel horizontally we can see each column (the Fourier transformed waveform) change over time. This matches the output of librosa so we no longer need to transpose in our test comparisons with `Spectrogram`, `MelScale`, `MelSpectrogram`, and `MFCC`. Moreover, because of these new conventions, we deprecated `LC2CL` and `BLC2CBL` which were used to transfer from one shape of signal to another. - -As part of this release, we're also introducing support for complex numbers via tensors of dimension (..., 2), and providing `magphase` to convert such a tensor into its magnitude and phase, and similarly `complex_norm` and `angle`. - -The details of the standardization are provided in the [README](https://github.com/pytorch/audio/blob/v0.3.0/README.md#Conventions). - -### Functionals, Transformations, and Kaldi Compatibility - -Prior to the standardization, we separated state and computation into `torchaudio.transforms` and `torchaudio.functional`. - -As part of the transforms, we're adding a new transformation in 0.3.0: `Resample`. `Resample` can upsample or downsample a waveform to a different frequency. - -As part of the functionals, we're introducing: `phase_vocoder`, a phase vocoder to change the speed of a waveform without changing its pitch, and `ISTFT`, the inverse `STFT` implemented to be compatible with STFT provided by PyTorch. This separation allows us to make functionals weak scriptable and to utilize JIT in 0.3.0. We thus have JIT and CUDA support for the following transformations: `Spectrogram`, `AmplitudeToDB` (previously named `SpectrogramToDB`), `MelScale`, -`MelSpectrogram`, `MFCC`, `MuLawEncoding`, `MuLawDecoding` (previously named `MuLawExpanding`). 
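As a quick, hedged sketch of the new conventions and the `Resample` transform (the file path is hypothetical, and shapes follow the channel-first convention described above):

```python
import torchaudio
import torchaudio.transforms as T

# torchaudio.load returns (waveform, sample_rate); the waveform is (channel, time)
waveform, sample_rate = torchaudio.load("example.wav")

# Resample, new in 0.3.0, changes the sampling frequency of a waveform
resampled = T.Resample(orig_freq=sample_rate, new_freq=8000)(waveform)

# Transforms such as MelSpectrogram also expect channel-first input
mel_spec = T.MelSpectrogram()(waveform)  # (channel, n_mels, time)
```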
- -We now also provide a compatibility interface with Kaldi to ease onboarding and reduce a user's code dependency on Kaldi. We now have an interface for `spectrogram`, `fbank`, and `resample_waveform`. - -### New Tutorial - -To showcase the new conventions and transformations, we have a [new tutorial](https://pytorch.org/tutorials/beginner/audio_preprocessing_tutorial.html) demonstrating how to preprocess waveforms using torchaudio. This tutorial walks through an example of loading a waveform and applying some of the available transformations to it. - -We are excited to see an active community around torchaudio and eager to further grow and support it. We encourage you to go ahead and experiment for yourself with this tutorial and the two datasets that are available: VCTK and YESNO! They have an interface to download the datasets and preprocess them in a convenient format. You can find the details in the release notes [here](https://github.com/pytorch/audio/releases). - -## Torchtext 0.4 with supervised learning datasets - -A key focus area of torchtext is to provide the fundamental elements to help accelerate NLP research. This includes easy access to commonly used datasets and basic preprocessing pipelines for working on raw text based data. The torchtext 0.4.0 release includes several popular supervised learning baselines with "one-command" data loading. A [tutorial](https://pytorch.org/tutorials/beginner/text_sentiment_ngrams_tutorial.html) is included to show how to use the new datasets for text classification analysis. We also added and improved on a few functions such as get_tokenizer and build_vocab_from_iterator to make it easier to implement future datasets. Additional examples can be found [here](https://github.com/pytorch/text/tree/master/examples/text_classification). - -Text classification is an important task in Natural Language Processing with many applications, such as sentiment analysis. The new release includes several popular text classification datasets for supervised learning including: - -* AG_NEWS -* SogouNews -* DBpedia -* YelpReviewPolarity -* YelpReviewFull -* YahooAnswers -* AmazonReviewPolarity -* AmazonReviewFull - -Each dataset comes with two parts (train vs. test), and can be easily loaded with a single command. The datasets also support an ngrams feature to capture the partial information about the local word order. Take a look at the tutorial [here](https://pytorch.org/tutorials/beginner/text_sentiment_ngrams_tutorial.html) to learn more about how to use the new datasets for supervised problems such as text classification analysis. - -```python -from torchtext.datasets.text_classification import DATASETS -train_dataset, test_dataset = DATASETS['AG_NEWS'](ngrams=2) -``` - -In addition to the domain library, PyTorch provides many tools to make data loading easy. Users now can load and preprocess the text classification datasets with some well supported tools, like [torch.utils.data.DataLoader](https://pytorch.org/docs/stable/_modules/torch/utils/data/dataloader.html) and [torch.utils.data.IterableDataset](https://pytorch.org/docs/master/data.html#torch.utils.data.IterableDataset). Here are a few lines to wrap the data with DataLoader. More examples can be found [here](https://github.com/pytorch/text/tree/master/examples/text_classification). 
- -```python -from torch.utils.data import DataLoader -data = DataLoader(train_dataset, collate_fn=generate_batch) -``` - -Check out the release notes [here](https://github.com/pytorch/text/releases) to learn more and try out the [tutorial here](http://pytorch.org/tutorials/beginner/text_sentiment_ngrams_tutorial.html). - -## Torchvision 0.4 with Support for Video - -Video is now a first-class citizen in torchvision, with support for data loading, datasets, pre-trained models, and transforms. The 0.4 release of torchvision includes: - -* Efficient IO primitives for reading/writing video files (including audio), with support for arbitrary encodings and formats. -* Standard video datasets, compatible with `torch.utils.data.Dataset` and `torch.utils.data.DataLoader`. -* Pre-trained models built on the Kinetics-400 dataset for action classification on videos (including the training scripts). -* Reference training scripts for training your own video models. - -We wanted working with video data in PyTorch to be as straightforward as possible, without compromising too much on performance. -As such, we avoid the steps that would require re-encoding the videos beforehand, as it would involve: - -* A preprocessing step which duplicates the dataset in order to re-encode it. -* An overhead in time and space because this re-encoding is time-consuming. -* Generally, an external script should be used to perform the re-encoding. - -Additionally, we provide APIs such as the utility class, `VideoClips`, that simplifies the task of enumerating all possible clips of fixed size in a list of video files by creating an index of all clips in a set of videos. It also allows you to specify a fixed frame-rate for the videos. An example of the API is provided below: - -```python -from torchvision.datasets.video_utils import VideoClips - -class MyVideoDataset(object): - def __init__(self, video_paths): - self.video_clips = VideoClips(video_paths, - clip_length_in_frames=16, - frames_between_clips=1, - frame_rate=15) - - def __getitem__(self, idx): - video, audio, info, video_idx = self.video_clips.get_clip(idx) - return video, audio - - def __len__(self): - return self.video_clips.num_clips() -``` - -Most of the user-facing API is in Python, similar to PyTorch, which makes it easily extensible. Plus, the underlying implementation is fast — torchvision decodes as little as possible from the video on-the-fly in order to return a clip from the video. - -Check out the torchvision 0.4 [release notes here](https://github.com/pytorch/vision/releases) for more details. - -We look forward to continuing our collaboration with the community and hearing your feedback as we further improve and expand the PyTorch deep learning platform. 
- -*We’d like to thank the entire PyTorch team and the community for all of the contributions to this work!* diff --git a/_posts/2019-10-10-pytorch-1-dot-3-adds-mobile-privacy-quantization-and-named-tensors.md b/_posts/2019-10-10-pytorch-1-dot-3-adds-mobile-privacy-quantization-and-named-tensors.md deleted file mode 100644 index 3fa1d06bd88a..000000000000 --- a/_posts/2019-10-10-pytorch-1-dot-3-adds-mobile-privacy-quantization-and-named-tensors.md +++ /dev/null @@ -1,135 +0,0 @@ ---- -layout: blog_detail -title: 'PyTorch 1.3 adds mobile, privacy, quantization, and named tensors' -author: Team PyTorch ---- - -PyTorch continues to gain momentum because of its focus on meeting the needs of researchers, its streamlined workflow for production use, and most of all because of the enthusiastic support it has received from the AI community. PyTorch citations in papers on ArXiv [grew 194 percent in the first half of 2019 alone, as noted by O’Reilly](https://www.oreilly.com/ideas/one-simple-graphic-researchers-love-pytorch-and-tensorflow?fbclid=IwAR3kYmlyD7zky37IYFu0cafQn7yemhl8P-7MNyB30z0q5RDzxcTOrP8kxDk), and the number of contributors to the platform has grown more than 50 percent over the last year, to nearly 1,200. Facebook, Microsoft, Uber, and other organizations across industries are increasingly using it as the foundation for their most important machine learning (ML) research and production workloads. - -We are now advancing the platform further with the release of PyTorch 1.3, which includes experimental support for features such as seamless model deployment to mobile devices, model quantization for better performance at inference time, and front-end improvements, like the ability to name tensors and create clearer code with less need for inline comments. We’re also launching a number of additional tools and libraries to support model interpretability and bringing multimodal research to production. - -Additionally, we’ve collaborated with Google and Salesforce to add broad support for Cloud Tensor Processing Units, providing a significantly accelerated option for training large-scale deep neural networks. [Alibaba Cloud](https://data.aliyun.com/bigdata/pai-pytorch?spm=5176.12825654.a9ylfrljh.d112.7b652c4ayuOO4M&scm=20140722.1068.1.1098&aly_as=-PvJ5e4c) also joins Amazon Web Services, Microsoft Azure, and Google Cloud as supported cloud platforms for PyTorch users. You can get started now at [pytorch.org](https://pytorch.org/get-started/locally/). - -# PyTorch 1.3 - -The 1.3 release of PyTorch brings significant new features, including experimental support for mobile device deployment, eager mode quantization at 8-bit integer, and the ability to name tensors. With each of these enhancements, we look forward to additional contributions and improvements from the PyTorch community. - -## Named tensors (experimental) - -Cornell University’s [Sasha Rush has argued](http://nlp.seas.harvard.edu/NamedTensor) that, despite its ubiquity in deep learning, the traditional implementation of tensors has significant shortcomings, such as exposing private dimensions, broadcasting based on absolute position, and keeping type information in documentation. He proposed named tensors as an alternative approach. 
Today, we name and access dimensions by comment:

```python
# Tensor[N, C, H, W]
images = torch.randn(32, 3, 56, 56)
images.sum(dim=1)
images.select(dim=1, index=0)
```

But naming explicitly leads to more readable and maintainable code:

```python
NCHW = ['N', 'C', 'H', 'W']
images = torch.randn(32, 3, 56, 56, names=NCHW)
images.sum('C')
images.select('C', index=0)
```

## Quantization (experimental)

It’s important to make efficient use of both server-side and on-device compute resources when developing ML applications. To support more efficient deployment on servers and edge devices, PyTorch 1.3 now supports 8-bit model quantization using the familiar eager mode Python API. Quantization refers to techniques used to perform computation and storage at reduced precision, such as 8-bit integer. This currently experimental feature includes support for post-training quantization, dynamic quantization, and quantization-aware training. It leverages the [FBGEMM](https://github.com/pytorch/FBGEMM) and [QNNPACK](https://github.com/pytorch/QNNPACK) state-of-the-art quantized kernel backends, for x86 and ARM CPUs, respectively, which are integrated with PyTorch and now share a common API.

To learn more about the design and architecture, check out the API docs [here](https://pytorch.org/docs/master/quantization.html), and get started with any of the supported techniques using the tutorials available [here](https://pytorch.org/tutorials/).

## PyTorch mobile (experimental)

Running ML on edge devices is growing in importance as applications continue to demand lower latency. It is also a foundational element for privacy-preserving techniques such as federated learning. To enable more efficient on-device ML, PyTorch 1.3 now supports an end-to-end workflow from Python to deployment on iOS and Android.

This is an early, experimental release, optimized for end-to-end development. Coming releases will focus on:

* Optimization for size: Build-level optimization and selective compilation depending on the operators needed for user applications (i.e., you pay binary size for only the operators you need)
* Performance: Further improvements to performance and coverage on mobile CPUs and GPUs
* High level API: Extend mobile native APIs to cover common preprocessing and integration tasks needed for incorporating ML in mobile applications, e.g. computer vision and NLP

Learn more or get started on Android or iOS [here](http://pytorch.org/mobile).

# New tools for model interpretability and privacy

## Captum

As models become ever more complex, it is increasingly important to develop new methods for model interpretability. To help address this need, we’re launching Captum, a tool to help developers working in PyTorch understand why their model generates a specific output. Captum provides state-of-the-art tools to understand how the importance of specific neurons and layers affects the predictions made by the models. Captum’s algorithms include integrated gradients, conductance, SmoothGrad and VarGrad, and DeepLift.

The example below shows how to apply model interpretability algorithms on a pretrained ResNet model and then visualize the attributions for each pixel by overlaying them on the image.
```python
import numpy as np
from captum.attr import IntegratedGradients, NoiseTunnel
from captum.attr import visualization as viz

# `model`, `input`, `transformed_img`, `pred_label_idx` and `default_cmap` are
# assumed to be defined as in the Captum tutorial (a pretrained ResNet, a
# preprocessed input image, its predicted class, and a matplotlib colormap).
integrated_gradients = IntegratedGradients(model)
noise_tunnel = NoiseTunnel(integrated_gradients)

attributions_ig_nt, delta = noise_tunnel.attribute(input, n_samples=10, nt_type='smoothgrad_sq', target=pred_label_idx)
_ = viz.visualize_image_attr_multiple(["original_image", "heat_map"],
                                      ["all", "positive"],
                                      np.transpose(attributions_ig_nt.squeeze().cpu().detach().numpy(), (1,2,0)),
                                      np.transpose(transformed_img.squeeze().cpu().detach().numpy(), (1,2,0)),
                                      cmap=default_cmap,
                                      show_colorbar=True)
```
        -
Learn more about Captum at [captum.ai](https://www.captum.ai/).

## CrypTen

Practical applications of ML via cloud-based or machine-learning-as-a-service (MLaaS) platforms pose a range of security and privacy challenges. In particular, users of these platforms may not want or be able to share unencrypted data, which prevents them from taking full advantage of ML tools. To address these challenges, the ML community is exploring a number of technical approaches, at various levels of maturity. These include homomorphic encryption, secure multiparty computation, trusted execution environments, on-device computation, and differential privacy.

To provide a better understanding of how some of these technologies can be applied, we are releasing CrypTen, a new community-based research platform for taking the field of privacy-preserving ML forward. Learn more about CrypTen [here](https://ai.facebook.com/blog/crypten-a-new-research-tool-for-secure-machine-learning-with-pytorch). It is available on GitHub [here](https://github.com/facebookresearch/CrypTen).

# Tools for multimodal AI systems

Digital content is often made up of several modalities, such as text, images, audio, and video. For example, a single public post might contain an image, body text, a title, a video, and a landing page. Even one particular component may have more than one modality, such as a video that contains both visual and audio signals, or a landing page that is composed of images, text, and HTML sources.

The ecosystem of tools and libraries that work with PyTorch offers enhanced ways to address the challenges of building multimodal ML systems. Here are some of the latest libraries launching today:

## Detectron2

Object detection and segmentation are used for tasks ranging from autonomous vehicles to content understanding for platform integrity. To advance this work, Facebook AI Research (FAIR) is releasing Detectron2, an object detection library now implemented in PyTorch. Detectron2 provides support for the latest models and tasks, increased flexibility to aid computer vision research, and improvements in maintainability and scalability to support production use cases.

Detectron2 is available [here](https://github.com/facebookresearch/detectron2) and you can learn more [here](https://ai.facebook.com/blog/-detectron2-a-pytorch-based-modular-object-detection-library-).

## Speech extensions to fairseq

Language translation and audio processing are critical components in systems and applications such as search, translation, speech, and assistants. There has been tremendous progress in these fields recently thanks to the development of new architectures like transformers, as well as large-scale pretraining methods. We’ve extended fairseq, a framework for sequence-to-sequence applications such as language translation, to include support for end-to-end learning for speech and audio recognition tasks. These extensions to fairseq enable faster exploration and prototyping of new speech research ideas while offering a clear path to production.

Get started with fairseq [here](https://github.com/pytorch/fairseq/tree/master/examples/speech_recognition).

# Cloud provider and hardware ecosystem support

Cloud providers such as Amazon Web Services, Microsoft Azure, and Google Cloud provide extensive support for anyone looking to develop ML on PyTorch and deploy in production.
We’re excited to share the general availability of Google Cloud TPU support and a newly launched integration with Alibaba Cloud. We’re also expanding hardware ecosystem support.

* Google Cloud TPU support now broadly available. To accelerate the largest-scale machine learning (ML) applications deployed today and enable rapid development of the ML applications of tomorrow, Google created custom silicon chips called Tensor Processing Units ([TPUs](https://cloud.google.com/tpu/)). When assembled into multi-rack ML supercomputers called [Cloud TPU Pods](https://cloud.google.com/blog/products/ai-machine-learning/cloud-tpu-pods-break-ai-training-records), these TPUs can complete ML workloads in minutes or hours that previously took days or weeks on other systems. Engineers from Facebook, Google, and Salesforce worked together to enable and pilot Cloud TPU support in PyTorch, including experimental support for Cloud TPU Pods. PyTorch support for Cloud TPUs is also available in Colab. Learn more about how to get started with PyTorch on Cloud TPUs [here](https://github.com/pytorch/xla).
* Alibaba adds support for PyTorch in Alibaba Cloud. The initial integration involves a one-click solution for PyTorch 1.x, Data Science Workshop notebook service, distributed training with Gloo/NCCL, as well as seamless integration with Alibaba IaaS such as OSS, ODPS, and NAS. Together with the toolchain provided by Alibaba, we look forward to significantly reducing the overhead necessary for adoption, as well as helping Alibaba Cloud’s global customer base leverage PyTorch to develop new AI applications.
* ML hardware ecosystem expands. In addition to key GPU and CPU partners, the PyTorch ecosystem has also enabled support for dedicated ML accelerators. Updates from [Intel](https://www.intel.ai/nnpi-glow-pytorch/) and [Habana](https://medium.com/@HabanaLabs/unlocking-ai-scaling-through-software-and-hardware-interface-standardization-77561cb7598b) showcase how PyTorch, connected to the Glow optimizing compiler, enables developers to utilize these market-specific solutions.

# Growth in the PyTorch community

As an open source, community-driven project, PyTorch benefits from a wide range of contributors bringing new capabilities to the ecosystem. Here are some recent examples:

* Mila SpeechBrain aims to provide an open source, all-in-one speech toolkit based on PyTorch. The goal is to develop a single, flexible, user-friendly toolkit that can be used to easily develop state-of-the-art systems for speech recognition (both end to end and HMM-DNN), speaker recognition, speech separation, multi-microphone signal processing (e.g., beamforming), self-supervised learning, and many others. [Learn more](https://speechbrain.github.io/)
* SpaCy is a new wrapping library with consistent and easy-to-use interfaces to several models, in order to extract features to power NLP pipelines. Support is provided via spaCy’s standard training API. The library also calculates an alignment so the transformer features can be related back to actual words instead of just wordpieces. [Learn more](https://explosion.ai/blog/spacy-pytorch-transformers)
* HuggingFace PyTorch-Transformers (formerly known as pytorch-pretrained-bert) is a library of state-of-the-art pretrained models for Natural Language Processing (NLP). The library currently contains PyTorch implementations, pretrained model weights, usage scripts, and conversion utilities for models such as BERT, GPT-2, RoBERTa, and DistilBERT.
It has also grown quickly, with more than 13,000 GitHub stars and a broad set of users. [Learn more](https://github.com/huggingface/transformers) -* PyTorch Lightning is a Keras-like ML library for PyTorch. It leaves core training and validation logic to you and automates the rest. Reproducibility is a crucial requirement for many fields of research, including those based on ML techniques. As the number of research papers submitted to arXiv and conferences skyrockets into the tens of thousands, scaling reproducibility becomes difficult. [Learn more](https://github.com/williamFalcon/pytorch-lightning). - -We recently held the first online Global PyTorch Summer Hackathon, where researchers and developers around the world were invited to build innovative new projects with PyTorch. Nearly 1,500 developers participated, submitting projects ranging from livestock disease detection to AI-powered financial assistants. The winning projects were: - -* Torchmeta, which provides extensions for PyTorch to simplify the development of meta-learning algorithms in PyTorch. It features a unified interface inspired by TorchVision for both few-shot classification and regression problems, to allow easy benchmarking on multiple data sets to aid with reproducibility. -* Open-Unmix, a system for end-to-end music demixing with PyTorch. Demixing separates the individual instruments or vocal track from any stereo recording. -* Endless AI-Generated Tees, a store featuring AI-generated T-shirt designs that can be purchased and delivered worldwide. The system uses a state-of-the-art generative model (StyleGAN) that was built with PyTorch and then trained on modern art. - -Visit [pytorch.org](https://pytorch.org/) to learn more and get started with PyTorch 1.3 and the latest libraries and ecosystem projects. We look forward to the contributions, exciting research advancements, and real-world applications that the community builds with PyTorch. - -*We’d like to thank the entire PyTorch team and the community for all their contributions to this work.* diff --git a/_posts/2019-12-06-openmined-and-pytorch-launch-fellowship-funding-for-privacy-preserving-ml.md b/_posts/2019-12-06-openmined-and-pytorch-launch-fellowship-funding-for-privacy-preserving-ml.md deleted file mode 100644 index 2dd9682c9f43..000000000000 --- a/_posts/2019-12-06-openmined-and-pytorch-launch-fellowship-funding-for-privacy-preserving-ml.md +++ /dev/null @@ -1,57 +0,0 @@ ---- -layout: blog_detail -title: 'OpenMined and PyTorch partner to launch fellowship funding for privacy-preserving ML community' -author: Andrew Trask (OpenMined/U.Oxford), Shubho Sengupta, Laurens van der Maaten, Joe Spisak -excerpt: Many applications of machine learning (ML) pose a range of security and privacy challenges. ---- - -
        - -Many applications of machine learning (ML) pose a range of security and privacy challenges. In particular, users may not be willing or allowed to share their data, which prevents them from taking full advantage of ML platforms like PyTorch. To take the field of privacy-preserving ML (PPML) forward, OpenMined and PyTorch are announcing plans to jointly develop a combined platform to accelerate PPML research as well as new funding for fellowships. - -There are many techniques attempting to solve the problem of privacy in ML, each at various levels of maturity. These include (1) homomorphic encryption, (2) secure multi-party computation, (3) trusted execution environments, (4) on-device computation, (5) federated learning with secure aggregation, and (6) differential privacy. Additionally, a number of open source projects implementing these techniques were created with the goal of enabling research at the intersection of privacy, security, and ML. Among them, PySyft and CrypTen have taken an “ML-first” approach by presenting an API that is familiar to the ML community, while masking the complexities of privacy and security protocols. We are excited to announce that these two projects are now collaborating closely to build a mature PPML ecosystem around PyTorch. - -Additionally, to bolster this ecosystem and take the field of privacy preserving ML forward, we are also calling for contributions and supporting research efforts on this combined platform by providing funding to support the OpenMined community and the researchers that contribute, build proofs of concepts and desire to be on the cutting edge of how privacy-preserving technology is applied. We will provide funding through the [RAAIS Foundation](https://www.raais.org/), a non-profit organization with a mission to advance education and research in artificial intelligence for the common good. We encourage interested parties to apply to one or more of the fellowships listed below. - -## Tools Powering the Future of Privacy-Preserving ML - -The next generation of privacy-preserving open source tools enable ML researchers to easily experiment with ML models using secure computing techniques without needing to be cryptography experts. By integrating with PyTorch, PySyft and CrypTen offer familiar environments for ML developers to research and apply these techniques as part of their work. - -**PySyft** is a Python library for secure and private ML developed by the OpenMined community. It is a flexible, easy-to-use library that makes secure computation techniques like [multi-party computation (MPC)](https://en.wikipedia.org/wiki/Secure_multi-party_computation) and privacy-preserving techniques like [differential privacy](https://en.wikipedia.org/wiki/Differential_privacy) accessible to the ML community. It prioritizes ease of use and focuses on integrating these techniques into end-user use cases like federated learning with mobile phones and other edge devices, encrypted ML as a service, and privacy-preserving data science. - -**CrypTen** is a framework built on PyTorch that enables private and secure ML for the PyTorch community. It is the first step along the journey towards a privacy-preserving mode in PyTorch that will make secure computing techniques accessible beyond cryptography researchers. It currently implements [secure multiparty computation](https://en.wikipedia.org/wiki/Secure_multi-party_computation) with the goal of offering other secure computing backends in the near future. 
Other benefits to ML researchers include: - -* It is **ML first** and presents secure computing techniques via a CrypTensor object that looks and feels exactly like a PyTorch Tensor. This allows the user to use automatic differentiation and neural network modules akin to those in PyTorch. -* The framework focuses on **scalability and performance** and is built with real-world challenges in mind. - -The focus areas for CrypTen and PySyft are naturally aligned and complement each other. The former focuses on building support for various secure and privacy preserving techniques on PyTorch through an encrypted tensor abstraction, while the latter focuses on end user use cases like deployment on edge devices and a user friendly data science platform. - -Working together will enable PySyft to use CrypTen as a backend for encrypted tensors. This can lead to an increase in performance for PySyft and the adoption of CrypTen as a runtime by PySyft’s userbase. In addition to this, PyTorch is also adding cryptography friendly features such as support for cryptographically secure random number generation. Over the long run, this allows each library to focus exclusively on its core competencies while enjoying the benefits of the synergistic relationship. - -## New Funding for OpenMined Contributors - -We are especially excited to announce that the PyTorch team has invested $250,000 to support OpenMined in furthering the development and proliferation of privacy-preserving ML. This gift will be facilitated via the [RAAIS Foundation](https://www.raais.org/) and will be available immediately to support paid fellowship grants for the OpenMined community. - -## How to get involved - -Thanks to the support from the PyTorch team, OpenMined is able to offer three different opportunities for you to participate in the project’s development. Each of these fellowships furthers our shared mission to lower the barrier-to-entry for privacy-preserving ML and to create a more privacy-preserving world. - -### Core PySyft CrypTen Integration Fellowships - -During these fellowships, we will integrate CrypTen as a supported backend for encrypted computation in PySyft. This will allow for the high-performance, secure multi-party computation capabilities of CrypTen to be used alongside other important tools in PySyft such as differential privacy and federated learning. For more information on the roadmap and how to apply for a paid fellowship, check out the project’s [call for contributors](https://blog.openmined.org/openmined-pytorch-fellowship-crypten-project). - -### Federated Learning on Mobile, Web, and IoT Devices - -During these fellowships, we will be extending PyTorch with the ability to perform federated learning across mobile, web, and IoT devices. To this end, a PyTorch front-end will be able to coordinate across federated learning backends that run in Javascript, Kotlin, Swift, and Python. Furthermore, we will also extend PySyft with the ability to coordinate these backends using peer-to-peer connections, providing low latency and the ability to run secure aggregation as a part of the protocol. For more information on the roadmap and how to apply for a paid fellowship, check out the project’s [call for contributors](https://blog.openmined.org/announcing-the-pytorch-openmined-federated-learning-fellowships). - -### Development Challenges - -Over the coming months, we will issue regular open competitions for increasing the performance and security of the PySyft and PyGrid codebases. 
For performance-related challenges, contestants will compete (for a cash prize) to make a specific PySyft demo (such as federated learning) as fast as possible. For security-related challenges, contestants will compete to hack into a PyGrid server. The first to demonstrate their ability will win the cash bounty! For more information on the challenges and to sign up to receive emails when each challenge is opened, [sign up here](http://blog.openmined.org/announcing-the-openmined-pytorch-development-challenges). - -To apply, select one of the above projects and identify a role that matches your strengths! - -Cheers, - -Andrew, Laurens, Joe, and Shubho diff --git a/_posts/2019-12-06-pytorch-adds-new-tools-and-libraries-welcomes-preferred-networks-to-its-community.md b/_posts/2019-12-06-pytorch-adds-new-tools-and-libraries-welcomes-preferred-networks-to-its-community.md deleted file mode 100644 index d39d84959ce5..000000000000 --- a/_posts/2019-12-06-pytorch-adds-new-tools-and-libraries-welcomes-preferred-networks-to-its-community.md +++ /dev/null @@ -1,66 +0,0 @@ ---- -layout: blog_detail -title: 'PyTorch adds new tools and libraries, welcomes Preferred Networks to its community' -author: Team PyTorch ---- - -PyTorch continues to be used for the latest state-of-the-art research on display at the NeurIPS conference next week, making up nearly [70% of papers](https://chillee.github.io/pytorch-vs-tensorflow/) that cite a framework. In addition, we’re excited to welcome Preferred Networks, the maintainers of the Chainer framework, to the PyTorch community. Their teams are moving fully over to PyTorch for developing their ML capabilities and services. - -This growth underpins PyTorch’s focus on building for the needs of the research community, and increasingly, supporting the full workflow from research to production deployment. To further support researchers and developers, we’re launching a number of new tools and libraries for large scale computer vision and elastic fault tolerant training. Learn more on GitHub and at our NeurIPS booth. - -## Preferred Networks joins the PyTorch community - -Preferred Networks, Inc. (PFN) announced plans to move its deep learning framework from Chainer to PyTorch. As part of this change, PFN will collaborate with the PyTorch community and contributors, including people from Facebook, Microsoft, CMU, and NYU, to participate in the development of PyTorch. - -PFN developed Chainer, a deep learning framework that introduced the concept of define-by-run (also referred to as eager execution), to support and speed up its deep learning development. Chainer has been used at PFN since 2015 to rapidly solve real-world problems with the latest, cutting-edge technology. Chainer was also one of the inspirations for PyTorch’s initial design, as outlined in the [PyTorch NeurIPS](https://papers.nips.cc/paper/9015-pytorch-an-imperative-style-high-performance-deep-learning-library) paper. - -PFN has driven innovative work with [CuPy](https://cupy.chainer.org/), ImageNet in 15 minutes, [Optuna](https://optuna.org/), and other projects that have pushed the boundaries of design and engineering. As part of the PyTorch community, PFN brings with them creative engineering capabilities and experience to help take the framework forward. 
In addition, PFN’s migration to PyTorch will allow it to efficiently incorporate the latest research results to accelerate its R&D activities, [given PyTorch’s broad adoption with researchers](https://thegradient.pub/state-of-ml-frameworks-2019-pytorch-dominates-research-tensorflow-dominates-industry/), and to collaborate with the community to add support for PyTorch on MN-Core, a deep learning processor currently in development. - -We are excited to welcome PFN to the PyTorch community, and to jointly work towards the common goal of furthering advances in deep learning technology. Learn more about the PFN’s migration to PyTorch [here](https://preferred.jp/en/news/pr20191205/). - -## Tools for elastic training and large scale computer vision - -### PyTorch Elastic (Experimental) - -Large scale model training is becoming commonplace with architectures like BERT and the growth of model parameter counts into the billions or even tens of billions. To achieve convergence at this scale in a reasonable amount of time, the use of distributed training is needed. - -The current PyTorch Distributed Data Parallel (DDP) module enables data parallel training where each process trains the same model but on different shards of data. It enables bulk synchronous, multi-host, multi-GPU/CPU execution of ML training. However, DDP has several shortcomings; e.g. jobs cannot start without acquiring all the requested nodes; jobs cannot continue after a node fails due to error or transient issue; jobs cannot incorporate a node that joined later; and lastly; progress cannot be made with the presence of a slow/stuck node. - -The focus of [PyTorch Elastic](https://github.com/pytorch/elastic), which uses Elastic Distributed Data Parallelism, is to address these issues and build a generic framework/APIs for PyTorch to enable reliable and elastic execution of these data parallel training workloads. It will provide better programmability, higher resilience to failures of all kinds, higher-efficiency and larger-scale training compared with pure DDP. - -Elasticity, in this case, means both: 1) the ability for a job to continue after node failure (by running with fewer nodes and/or by incorporating a new host and transferring state to it); and 2) the ability to add/remove nodes dynamically due to resource availability changes or bottlenecks. - -While this feature is still experimental, you can try it out on AWS EC2, with the instructions [here](https://github.com/pytorch/elastic/tree/master/aws). Additionally, the PyTorch distributed team is working closely with teams across AWS to support PyTorch Elastic training within services such as Amazon Sagemaker and Elastic Kubernetes Service (EKS). Look for additional updates in the near future. - -### New Classification Framework - -Image and video classification are at the core of content understanding. To that end, you can now leverage a new end-to-end framework for large-scale training of state-of-the-art image and video classification models. It allows researchers to quickly prototype and iterate on large distributed training jobs at the scale of billions of images. Advantages include: - -* Ease of use - This framework features a modular, flexible design that allows anyone to train machine learning models on top of PyTorch using very simple abstractions. The system also has out-of-the-box integration with AWS on PyTorch Elastic, facilitating research at scale and making it simple to move between research and production. 
-* High performance - Researchers can use the framework to train models such as Resnet50 on ImageNet in as little as 15 minutes. - -You can learn more at the NeurIPS Expo workshop on Multi-Modal research to production or get started with the PyTorch Elastic Imagenet example [here](https://github.com/pytorch/elastic/blob/master/examples/imagenet/main.py). - -## Come see us at NeurIPS - -The PyTorch team will be hosting workshops at NeurIPS during the industry expo on 12/8. Join the sessions below to learn more, and visit the team at the PyTorch booth on the show floor and during the Poster Session. At the booth, we’ll be walking through an interactive demo of PyTorch running fast neural style transfer on a Cloud TPU - here’s a [sneak peek](https://colab.research.google.com/github/pytorch/xla/blob/master/contrib/colab/style_transfer_inference-xrt-1-15.ipynb). - -We’re also publishing a [paper that details the principles that drove the implementation of PyTorch](https://papers.nips.cc/paper/9015-pytorch-an-imperative-style-high-performance-deep-learning-library) and how they’re reflected in its architecture. - -*Multi-modal Research to Production* - This workshop will dive into a number of modalities such as computer vision (large scale image classification and instance segmentation) and Translation and Speech (seq-to-seq Transformers) from the lens of taking cutting edge research to production. Lastly, we will also walk through how to use the latest APIs in PyTorch to take eager mode developed models into graph mode via Torchscript and quantize them for scale production deployment on servers or mobile devices. Libraries used include: - -* Classification Framework - a newly open sourced PyTorch framework developed by Facebook AI for research on large-scale image and video classification. It allows researchers to quickly prototype and iterate on large distributed training jobs. Models built on the framework can be seamlessly deployed to production. -* Detectron2 - the recently released object detection library built by the Facebook AI Research computer vision team. We will articulate the improvements over the previous version including: 1) Support for latest models and new tasks; 2) Increased flexibility, to enable new computer vision research; 3) Maintainable and scalable, to support production use cases. -* Fairseq - general purpose sequence-to-sequence library, can be used in many applications, including (unsupervised) translation, summarization, dialog and speech recognition. - -*Responsible and Reproducible AI* - This workshop on Responsible and Reproducible AI will dive into important areas that are shaping the future of how we interpret, reproduce research, and build AI with privacy in mind. We will cover major challenges, walk through solutions, and finish each talk with a hands-on tutorial. - -* Reproducibility: As the number of research papers submitted to arXiv and conferences skyrockets, scaling reproducibility becomes difficult. We must address the following challenges: aid extensibility by standardizing code bases, democratize paper implementation by writing hardware agnostic code, facilitate results validation by documenting “tricks” authors use to make their complex systems function. To offer solutions, we will dive into tool like PyTorch Hub and PyTorch Lightning which are used by some of the top researchers in the world to reproduce the state of the art. 
-* Interpretability: With the increase in model complexity and the resulting lack of transparency, model interpretability methods have become increasingly important. Model understanding is both an active area of research as well as an area of focus for practical applications across industries using machine learning. To get hands on, we will use the recently released Captum library that provides state-of-the-art algorithms to provide researchers and developers with an easy way to understand the importance of neurons/layers and the predictions made by our models.` -* Private AI: Practical applications of ML via cloud-based or machine-learning-as-a-service platforms pose a range of security and privacy challenges. There are a number of technical approaches being studied including: homomorphic encryption, secure multi-party computation, trusted execution environments, on-device computation, and differential privacy. To provide an immersive understanding of how some of these technologies are applied, we will use the CrypTen project which provides a community based research platform to take the field of Private AI forward. - -*We’d like to thank the entire PyTorch team and the community for all their contributions to this work.* - -Cheers! - -Team PyTorch diff --git a/_posts/2019-4-29-stochastic-weight-averaging-in-pytorch.md b/_posts/2019-4-29-stochastic-weight-averaging-in-pytorch.md deleted file mode 100644 index a610776b0c2d..000000000000 --- a/_posts/2019-4-29-stochastic-weight-averaging-in-pytorch.md +++ /dev/null @@ -1,171 +0,0 @@ ---- -layout: blog_detail -title: 'Stochastic Weight Averaging in PyTorch' -author: Pavel Izmailov and Andrew Gordon Wilson -redirect_from: /2019/04/29/road-to-1.0.html ---- - -In this blogpost we describe the recently proposed Stochastic Weight Averaging (SWA) technique [1, 2], and its new implementation in [`torchcontrib`](https://github.com/pytorch/contrib). SWA is a simple procedure that improves generalization in deep learning over Stochastic Gradient Descent (SGD) at no additional cost, and can be used as a drop-in replacement for any other optimizer in PyTorch. SWA has a wide range of applications and features: - -1. SWA has been shown to significantly improve generalization in computer vision tasks, including VGG, ResNets, Wide ResNets and DenseNets on ImageNet and CIFAR benchmarks [1, 2]. -2. SWA provides state-of-the-art performance on key benchmarks in semi-supervised learning and domain adaptation [2]. -3. SWA is shown to improve the stability of training as well as the final average rewards of policy-gradient methods in deep reinforcement learning [3]. -4. An extension of SWA can obtain efficient Bayesian model averaging, as well as high quality uncertainty estimates and calibration in deep learning [4]. -5. SWA for low precision training, SWALP, can match the performance of full-precision SGD even with all numbers quantized down to 8 bits, including gradient accumulators [5]. - -In short, SWA performs an equal average of the weights traversed by SGD with a modified learning rate schedule (see the left panel of Figure 1.). SWA solutions end up in the center of a wide flat region of loss, while SGD tends to converge to the boundary of the low-loss region, making it susceptible to the shift between train and test error surfaces (see the middle and right panels of Figure 1). - -
        - -**Figure 1.** Illustrations of SWA and SGD with a Preactivation ResNet-164 on CIFAR-100 [1]. **Left:** test error surface for three FGE samples and the corresponding SWA solution (averaging in weight space). **Middle** and **Right:** test error and train loss surfaces showing the weights proposed by SGD (at convergence) and SWA, starting from the same initialization of SGD after 125 training epochs. Please see [1] for details on how these figures were constructed. - -**With our new implementation in [torchcontrib](https://github.com/pytorch/contrib) using SWA is as easy as using any other optimizer in PyTorch:** - -```python -from torchcontrib.optim import SWA - -... -... - -# training loop -base_opt = torch.optim.SGD(model.parameters(), lr=0.1) -opt = torchcontrib.optim.SWA(base_opt, swa_start=10, swa_freq=5, swa_lr=0.05) -for _ in range(100): - opt.zero_grad() - loss_fn(model(input), target).backward() - opt.step() -opt.swap_swa_sgd() -``` - -You can wrap any optimizer from `torch.optim` using the `SWA` class, and then train your model as usual. When training is complete you simply call `swap_swa_sgd()` to set the weights of your model to their SWA averages. Below we explain the SWA procedure and the parameters of the `SWA` class in detail. We emphasize that SWA can be combined with *any* optimization procedure, such as Adam, in the same way that it can be combined with SGD. - -## Is this just Averaged SGD? - -At a high level, averaging SGD iterates dates back several decades in convex optimization [6, 7], where it is sometimes referred to as Polyak-Ruppert averaging, or *averaged* SGD. **But the details matter**. *Averaged SGD* is often employed in conjunction with a decaying learning rate, and an exponentially moving average, typically for convex optimization. In convex optimization, the focus has been on improved rates of convergence. In deep learning, this form of averaged SGD smooths the trajectory of SGD iterates, but does not perform very differently. - -By contrast, SWA is focused on an **equal average** of SGD iterates with a modified **cyclical or high constant learning rate**, and exploits the flatness of training objectives [8] specific to **deep learning** for **improved generalization**. - -## Stochastic Weight Averaging - -There are two important ingredients that make SWA work. First, SWA uses a modified learning rate schedule so that SGD continues to explore the set of high-performing networks instead of simply converging to a single solution. For example, we can use the standard decaying learning rate strategy for the first 75% of training time, and then set the learning rate to a reasonably high constant value for the remaining 25% of the time (see the Figure 2 below). The second ingredient is to average the weights of the networks traversed by SGD. For example, we can maintain a running average of the weights obtained in the end of every epoch within the last 25% of training time (see Figure 2). -
        - -**Figure 2.** Illustration of the learning rate schedule adopted by SWA. Standard decaying schedule is used for the first 75% of the training and then a high constant value is used for the remaining 25%. The SWA averages are formed during the last 25% of training. - -In our implementation the auto mode of the `SWA` optimizer allows us to run the procedure described above. To run SWA in auto mode you just need to wrap your optimizer `base_opt` of choice (can be SGD, Adam, or any other `torch.optim.Optimizer`) with `SWA(base_opt, swa_start, swa_freq, swa_lr)`. After `swa_start` optimization steps the learning rate will be switched to a constant value `swa_lr`, and in the end of every `swa_freq` optimization steps a snapshot of the weights will be added to the SWA running average. Once you run `opt.swap_swa_sgd()`, the weights of your model are replaced with their SWA running averages. - -## Batch Normalization - -One important detail to keep in mind is batch normalization. Batch normalization layers compute running statistics of activations during training. Note that the SWA averages of the weights are never used to make predictions during training, and so the batch normalization layers do not have the activation statistics computed after you reset the weights of your model with `opt.swap_swa_sgd()`. To compute the activation statistics you can just make a forward pass on your training data using the SWA model once the training is finished. In the `SWA` class we provide a helper function `opt.bn_update(train_loader, model)`. It updates the activation statistics for every batch normalization layer in the model by making a forward pass on the `train_loader` data loader. You only need to call this function once in the end of training. - -## Advanced Learning-Rate Schedules - -SWA can be used with any learning rate schedule that encourages exploration of the flat region of solutions. For example, you can use cyclical learning rates in the last 25% of the training time instead of a constant value, and average the weights of the networks corresponding to the lowest values of the learning rate within each cycle (see Figure 3). - -
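Whichever schedule you choose, the batch normalization step described above still applies once training ends. A rough sketch of the finishing sequence, assuming `model`, `train_loader`, and the wrapped optimizer `opt` from the earlier example already exist:

```python
# Adopt the SWA running average, then refresh BatchNorm activation statistics
# with a single forward pass over the training data (helpers described above).
opt.swap_swa_sgd()
opt.bn_update(train_loader, model)
```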
        - -**Figure 3.** Illustration of SWA with an alternative learning rate schedule. Cyclical learning rates are adopted in the last 25% of training, and models for averaging are collected in the end of each cycle. - -In our implementation you can implement custom learning rate and weight averaging strategies by using `SWA` in the manual mode. The following code is equivalent to the auto mode code presented in the beginning of this blogpost. - -```python -opt = torchcontrib.optim.SWA(base_opt) -for i in range(100): - opt.zero_grad() - loss_fn(model(input), target).backward() - opt.step() - if i > 10 and i % 5 == 0: - opt.update_swa() -opt.swap_swa_sgd() -``` - -In manual mode you don’t specify `swa_start`, `swa_lr` and `swa_freq`, and just call `opt.update_swa()` whenever you want to update the SWA running averages (for example in the end of each learning rate cycle). In manual mode `SWA` doesn’t change the learning rate, so you can use any schedule you want as you would normally do with any other `torch.optim.Optimizer`. - -## Why does it work? - -SGD converges to a solution within a wide flat region of loss. The weight space is extremely high-dimensional, and most of the volume of the flat region is concentrated near the boundary, so SGD solutions will always be found near the boundary of the flat region of the loss. SWA on the other hand averages multiple SGD solutions, which allows it to move towards the center of the flat region. - -We expect solutions that are centered in the flat region of the loss to generalize better than those near the boundary. Indeed, train and test error surfaces are not perfectly aligned in the weight space. Solutions that are centered in the flat region are not as susceptible to the shifts between train and test error surfaces as those near the boundary. In Figure 4 below we show the train loss and test error surfaces along the direction connecting the SWA and SGD solutions. As you can see, while SWA solution has a higher train loss compared to the SGD solution, it is centered in the region of low loss, and has a substantially better test error. - -
        - -**Figure 4.** Train loss and test error along the line connecting the SWA solution (circle) and SGD solution (square). SWA solution is centered in a wide region of low train loss while the SGD solution lies near the boundary. Because of the shift between train loss and test error surfaces, SWA solution leads to much better generalization. - -## Examples and Results - -We released a GitHub repo [here](https://github.com/izmailovpavel/contrib_swa_examples) with examples of using the `torchcontrib` implementation of SWA for training DNNs. For example, these examples can be used to achieve the following results on CIFAR-100: - -| DNN (Budget) | SGD | SWA 1 Budget | SWA 1.25 Budgets | SWA 1.5 Budgets | -| ------------------------- |:------------:|:------------:|:----------------:|:---------------:| -| VGG16 (200) | 72.55 ± 0.10 | 73.91 ± 0.12 | 74.17 ± 0.15 | 74.27 ± 0.25 | -| PreResNet110 (150) | 76.77 ± 0.38 | 78.75 ± 0.16 | 78.91 ± 0.29 | 79.10 ± 0.21 | -| PreResNet164 (150) | 78.49 ± 0.36 | 79.77 ± 0.17 | 80.18 ± 0.23 | 80.35 ± 0.16 | -| WideResNet28x10 (200) | 80.82 ± 0.23 | 81.46 ± 0.23 | 81.91 ± 0.27 | 82.15 ± 0.27 | - -## Semi-Supervised Learning - -In a follow-up [paper](https://arxiv.org/abs/1806.05594) SWA was applied to semi-supervised learning, where it illustrated improvements beyond the best reported results in multiple settings. For example, with SWA you can get 95% accuracy on CIFAR-10 if you only have the training labels for 4k training data points (the previous best reported result on this problem was 93.7%). This paper also explores averaging multiple times within epochs, which can accelerate convergence and find still flatter solutions in a given time. -
        - -**Figure 5.** Performance of fast-SWA on semi-supervised learning with CIFAR-10. fast-SWA achieves record results in every setting considered. - -## Calibration and Uncertainty Estimates -[SWA-Gaussian](https://arxiv.org/abs/1902.02476) (SWAG) is a simple, scalable and convenient approach to uncertainty estimation and calibration in Bayesian deep learning. Similarly to SWA, which maintains a running average of SGD iterates, SWAG estimates the first and second moments of the iterates to construct a Gaussian distribution over weights. SWAG distribution approximates the shape of the true posterior: Figure 6 below shows the SWAG distribution on top of the posterior log-density for PreResNet-164 on CIFAR-100. -
        -**Figure 6.** SWAG distribution on top of posterior log-density for PreResNet-164 on CIFAR-100. The shape of SWAG distribution is aligned with the posterior. - -Empirically, SWAG performs on par or better than popular alternatives including MC dropout, KFAC Laplace, and temperature scaling on uncertainty quantification, out-of-distribution detection, calibration and transfer learning in computer vision tasks. Code for SWAG is available [here](https://github.com/wjmaddox/swa_gaussian). - -## Reinforcement Learning - -In another follow-up [paper](http://www.gatsby.ucl.ac.uk/~balaji/udl-camera-ready/UDL-24.pdf) SWA was shown to improve the performance of policy gradient methods A2C and DDPG on several Atari games and MuJoCo environments. - -| Environment | A2C | A2C + SWA | -|---------------|:----------------:|:----------------:| -| Breakout | 522 ± 34 | 703 ± 60 | -| Qbert | 18777 ± 778 | 21272 ± 655 | -| SpaceInvaders | 7727 ± 1121 | 21676 ± 8897 | -| Seaquest | 1779 ± 4 | 1795 ± 4 | -| CrazyClimber | 147030 ± 10239 | 139752 ± 11618 | -| BeamRider | 9999 ± 402 | 11321 ± 1065 | - -## Low Precision Training -We can filter through quantization noise by combining weights that have been rounded down with weights that have been rounded up. Moreover, by averaging weights to find a flat region of the loss surface, large perturbations of the weights will not affect the quality of the solution (Figures 7 and 8). Recent work shows that by adapting SWA to the low precision setting, in a method called SWALP, one can *match the performance of full-precision SGD even with all training in 8 bits* [5]. This is quite a practically important result, given that (1) SGD training in 8 bits performs notably worse than full precision SGD, and (2) low precision training is significantly harder than predictions in low precision after training (the usual setting). For example, a ResNet-164 trained on CIFAR-100 with float (16-bit) SGD achieves 22.2% error, while 8-bit SGD achieves 24.0% error. By contrast, SWALP with 8 bit training achieves 21.8% error. -
        - -**Figure 7.** Quantizing in a flat region can still provide solutions with low loss. - -
        - -**Figure 8.** Low precision SGD training (with a modified learning rate schedule) and SWALP. - -## Conclusion - -One of the greatest open questions in deep learning is why SGD manages to find good solutions, given that the training objectives are highly multimodal, and there are in principle many settings of parameters that achieve no training loss but poor generalization. By understanding geometric features such as flatness, which relate to generalization, we can begin to resolve these questions and build optimizers that provide even better generalization, and many other useful features, such as uncertainty representation. We have presented SWA, a simple drop-in replacement for standard SGD, which can in principle benefit anyone training a deep neural network. SWA has been demonstrated to have strong performance in a number of areas, including computer vision, semi-supervised learning, reinforcement learning, uncertainty representation, calibration, Bayesian model averaging, and low precision training. - -We encourage you try out SWA! Using SWA is now as easy as using any other optimizer in PyTorch. And even if you have already trained your model with SGD (or any other optimizer), it’s very easy to realize the benefits of SWA by running SWA for a small number of epochs starting with a pre-trained model. - -- [1] Averaging Weights Leads to Wider Optima and Better Generalization; Pavel Izmailov, Dmitry Podoprikhin, Timur Garipov, Dmitry Vetrov, Andrew Gordon Wilson; Uncertainty in Artificial Intelligence (UAI), 2018 -- [2] There Are Many Consistent Explanations of Unlabeled Data: Why You Should Average; Ben Athiwaratkun, Marc Finzi, Pavel Izmailov, Andrew Gordon Wilson; International Conference on Learning Representations (ICLR), 2019 -- [3] Improving Stability in Deep Reinforcement Learning with Weight Averaging; Evgenii Nikishin, Pavel Izmailov, Ben Athiwaratkun, Dmitrii Podoprikhin, Timur Garipov, Pavel Shvechikov, Dmitry Vetrov, Andrew Gordon Wilson, UAI 2018 Workshop: Uncertainty in Deep Learning, 2018 -- [4] A Simple Baseline for Bayesian Uncertainty in Deep Learning, Wesley Maddox, Timur Garipov, Pavel Izmailov, Andrew Gordon Wilson, arXiv pre-print, 2019: [https://arxiv.org/abs/1902.02476](https://arxiv.org/abs/1902.02476) -- [5] SWALP : Stochastic Weight Averaging in Low Precision Training, Guandao Yang, Tianyi Zhang, Polina Kirichenko, Junwen Bai, Andrew Gordon Wilson, Christopher De Sa, To appear at the International Conference on Machine Learning (ICML), 2019. -- [6] David Ruppert. Efficient estimations from a slowly convergent Robbins-Monro process. Technical report, Cornell University Operations Research and Industrial Engineering, 1988. -- [7] Acceleration of stochastic approximation by averaging. Boris T Polyak and Anatoli B Juditsky. SIAM Journal on Control and Optimization, 30(4):838–855, 1992. -- [8] Loss Surfaces, Mode Connectivity, and Fast Ensembling of DNNs, Timur Garipov, Pavel Izmailov, Dmitrii Podoprikhin, Dmitry Vetrov, Andrew Gordon Wilson. 
Neural Information Processing Systems (NeurIPS), 2018 diff --git a/_posts/2019-5-1-optimizing-cuda-rnn-with-torchscript.md b/_posts/2019-5-1-optimizing-cuda-rnn-with-torchscript.md deleted file mode 100644 index ce4d7e255a42..000000000000 --- a/_posts/2019-5-1-optimizing-cuda-rnn-with-torchscript.md +++ /dev/null @@ -1,225 +0,0 @@ ---- -layout: blog_detail -title: "Optimizing CUDA Recurrent Neural Networks with TorchScript" -author: "The PyTorch Team" -date: 2019-05-01 8:00:00 -0500 ---- - -This week, we officially released PyTorch 1.1, a large feature update to PyTorch 1.0. One of the new features we've added is better support for fast, custom Recurrent Neural Networks (fastrnns) with TorchScript (the PyTorch JIT) (https://pytorch.org/docs/stable/jit.html). - -RNNs are popular models that have shown good performance on a variety of NLP tasks that come in different shapes and sizes. PyTorch implements a number of the most popular ones, the [Elman RNN](https://pytorch.org/docs/master/nn.html?highlight=rnn#torch.nn.RNN), [GRU](https://pytorch.org/docs/master/nn.html?highlight=gru#torch.nn.GRU), and [LSTM](https://pytorch.org/docs/master/nn.html?highlight=lstm#torch.nn.LSTM) as well as multi-layered and bidirectional variants. - -However, many users want to implement their own custom RNNs, taking ideas from recent literature. Applying [Layer Normalization](https://arxiv.org/abs/1607.06450) to LSTMs is one such use case. Because the PyTorch CUDA LSTM implementation uses a fused kernel, it is difficult to insert normalizations or even modify the base LSTM implementation. Many users have turned to writing custom implementations using standard PyTorch operators, but such code suffers from high overhead: most PyTorch operations launch at least one kernel on the GPU and RNNs generally run many operations due to their recurrent nature. However, we can apply TorchScript to fuse operations and optimize our code automatically, launching fewer, more optimized kernels on the GPU. - -Our goal is for users to be able to write fast, custom RNNs in TorchScript without writing specialized CUDA kernels to achieve similar performance. In this post, we'll provide a tutorial for how to write your own fast RNNs with TorchScript. To better understand the optimizations TorchScript applies, we'll examine how those work on a standard LSTM implementation but most of the optimizations can be applied to general RNNs. - -## Writing custom RNNs - -To get started, you can use [this file](https://github.com/pytorch/pytorch/blob/master/benchmarks/fastrnns/custom_lstms.py) as a template to write your own custom RNNs. - -We are constantly improving our infrastructure on trying to make the performance better. If you want to gain the speed/optimizations that TorchScript currently provides (like operator fusion, batch matrix multiplications, etc.), here are some guidelines to follow. The next section explains the optimizations in depth. - -1. If the customized operations are all element-wise, that's great because you can get the benefits of the PyTorch JIT's operator fusion automatically! - -2. If you have more complex operations (e.g. reduce ops mixed with element-wise ops), consider grouping the reduce operations and element-wise ops separately in order to fuse the element-wise operations into a single fusion group. - -3. If you want to know about what has been fused in your custom RNN, you can inspect the operation's optimized graph by using `graph_for` . 
Using `LSTMCell` as an example: - - ```python - # get inputs and states for LSTMCell - - inputs = get_lstm_inputs() - - # instantiate a ScriptModule - - cell = LSTMCell(input_size, hidden_size) - - # print the optimized graph using graph_for - - out = cell(inputs) - print(cell.graph_for(inputs)) - - ``` - - This will generate the optimized TorchScript graph (a.k.a PyTorch JIT IR) for the specialized inputs that you provides: - - ``` - graph(%x : Float(*, *), - %hx : Float(*, *), - %cx : Float(*, *), - %w_ih : Float(*, *), - %w_hh : Float(*, *), - %b_ih : Float(*), - %b_hh : Float(*)): - %hy : Float(*, *), %cy : Float(*, *) = prim::DifferentiableGraph_0(%cx, %b_hh, %b_ih, %hx, %w_hh, %x, %w_ih) - %30 : (Float(*, *), Float(*, *)) = prim::TupleConstruct(%hy, %cy) - return (%30) - with prim::DifferentiableGraph_0 = graph(%13 : Float(*, *), - %29 : Float(*), - %33 : Float(*), - %40 : Float(*, *), - %43 : Float(*, *), - %45 : Float(*, *), - %48 : Float(*, *)): - %49 : Float(*, *) = aten::t(%48) - %47 : Float(*, *) = aten::mm(%45, %49) - %44 : Float(*, *) = aten::t(%43) - %42 : Float(*, *) = aten::mm(%40, %44) - ...some broadcast sizes operations... - %hy : Float(*, *), %287 : Float(*, *), %cy : Float(*, *), %outgate.1 : Float(*, *), %cellgate.1 : Float(*, *), %forgetgate.1 : Float(*, *), %ingate.1 : Float(*, *) = prim::FusionGroup_0(%13, %346, %345, %344, %343) - ...some broadcast sizes operations... - return (%hy, %cy, %49, %44, %196, %199, %340, %192, %325, %185, %ingate.1, %forgetgate.1, %cellgate.1, %outgate.1, %395, %396, %287) - with prim::FusionGroup_0 = graph(%13 : Float(*, *), - %71 : Tensor, - %76 : Tensor, - %81 : Tensor, - %86 : Tensor): - ...some chunks, constants, and add operations... - %ingate.1 : Float(*, *) = aten::sigmoid(%38) - %forgetgate.1 : Float(*, *) = aten::sigmoid(%34) - %cellgate.1 : Float(*, *) = aten::tanh(%30) - %outgate.1 : Float(*, *) = aten::sigmoid(%26) - %14 : Float(*, *) = aten::mul(%forgetgate.1, %13) - %11 : Float(*, *) = aten::mul(%ingate.1, %cellgate.1) - %cy : Float(*, *) = aten::add(%14, %11, %69) - %4 : Float(*, *) = aten::tanh(%cy) - %hy : Float(*, *) = aten::mul(%outgate.1, %4) - return (%hy, %4, %cy, %outgate.1, %cellgate.1, %forgetgate.1, %ingate.1) - ``` - -From the above graph we can see that it has a `prim::FusionGroup_0` subgraph that is fusing all element-wise operations in LSTMCell (transpose and matrix multiplication are not element-wise ops). Some graph nodes might be hard to understand in the first place but we will explain some of them in the optimization section, we also omitted some long verbose operators in this post that is there just for correctness. - -## Variable-length sequences best practices - -TorchScript does not support PackedSequence. Generally, when one is handling variable-length sequences, it is best to pad them into a single tensor and send that tensor through a TorchScript LSTM. Here's an example: - -```python -sequences = [...] # List[Tensor], each Tensor is T' x C -padded = torch.utils.rnn.pad_sequence(sequences) -lengths = [seq.size(0) for seq in sequences] -padded # T x N x C, where N is batch size and T is the max of all T' - -model = LSTM(...) -output, hiddens = model(padded) -output # T x N x C -``` - -Of course, `output` may have some garbage data in the padded regions; use `lengths` to keep track of which part you don't need. - -## Optimizations - -We will now explain the optimizations performed by the PyTorch JIT to speed up custom RNNs. 
We will use a simple custom LSTM model in TorchScript to illustrate the optimizations, but many of these are general and apply to other RNNs. - -To illustrate the optimizations we did and how we get benefits from those optimizations, we will run a simple custom LSTM model written in TorchScript (you can refer the code in the custom_lstm.py or the below code snippets) and time our changes. - -We set up the environment in a machine equipped with 2 Intel Xeon chip and one Nvidia P100, with cuDNN v7.3, CUDA 9.2 installed. The basic set up for the LSTM model is as follows: - -``` -input_size = 512 -hidden_size = 512 -mini_batch = 64 -numLayers = 1 -seq_length = 100 -``` - -The most important thing PyTorch JIT did is to compile the python program to a PyTorch JIT IR, which is an intermediate representation used to model the program's graph structure. This IR can then benefit from whole program optimization, hardware acceleration and overall has the potential to provide large computation gains. In this example, we run the initial TorchScript model with only compiler optimization passes that are provided by the JIT, including common subexpression elimination, constant pooling, constant propagation, dead code elimination and some peephole optimizations. We run the model training for 100 times after warm up and average the training time. The initial results for model forward time is around 27ms and backward time is around 64ms, which is a bit far away from what PyTorch cuDNN LSTM provided. Next we will explain the major optimizations we did on how we improve the performance on training or inferencing, starting with LSTMCell and LSTMLayer, and some misc optimizations. - -### LSTM Cell (forward) - -Almost all the computations in an LSTM happen in the LSTMCell, so it's important for us to take a look at the computations it contains and how can we improve their speed. Below is a sample LSTMCell implementation in TorchScript: - -```python -class LSTMCell(jit.ScriptModule): - def __init__(self, input_size, hidden_size): - super(LSTMCell, self).__init__() - self.input_size = input_size - self.hidden_size = hidden_size - self.weight_ih = Parameter(torch.randn(4 * hidden_size, input_size)) - self.weight_hh = Parameter(torch.randn(4 * hidden_size, hidden_size)) - self.bias_ih = Parameter(torch.randn(4 * hidden_size)) - self.bias_hh = Parameter(torch.randn(4 * hidden_size)) - - @jit.script_method - def forward(self, input, state): - # type: (Tensor, Tuple[Tensor, Tensor]) -> Tuple[Tensor, Tuple[Tensor, Tensor]] - hx, cx = state - gates = (torch.mm(input, self.weight_ih.t()) + self.bias_ih + - torch.mm(hx, self.weight_hh.t()) + self.bias_hh) - ingate, forgetgate, cellgate, outgate = gates.chunk(4, 1) - - ingate = torch.sigmoid(ingate) - forgetgate = torch.sigmoid(forgetgate) - cellgate = torch.tanh(cellgate) - outgate = torch.sigmoid(outgate) - - cy = (forgetgate * cx) + (ingate * cellgate) - hy = outgate * torch.tanh(cy) - - return hy, (hy, cy) -``` - - -This graph representation (IR) that TorchScript generated enables several optimizations and scalable computations. In addition to the typical compiler optimizations that we could do (CSE, constant propagation, etc. ) we can also run other IR transformations to make our code run faster. - -* Element-wise operator fusion. 
PyTorch JIT will automatically fuse element-wise ops, so when you have adjacent operators that are all element-wise, JIT will automatically group all those operations together into a single FusionGroup, this FusionGroup can then be launched with a single GPU/CPU kernel and performed in one pass. This avoids expensive memory reads and writes for each operation. -* Reordering chunks and pointwise ops to enable more fusion. An LSTM cell adds gates together (a pointwise operation), and then chunks the gates into four pieces: the ifco gates. Then, it performs pointwise operations on the ifco gates like above. This leads to two fusion groups in practice: one fusion group for the element-wise ops pre-chunk, and one group for the element-wise ops post-chunk. - The interesting thing to note here is that pointwise operations commute with `torch.chunk`: Instead of performing pointwise ops on some input tensors and chunking the output, we can chunk the input tensors and then perform the same pointwise ops on the output tensors. By moving the chunk to before the first fusion group, we can merge the first and second fusion groups into one big group. - -
        - -* Tensor creation on the CPU is expensive, but there is ongoing work to make it faster. At this point, a LSTMCell runs three CUDA kernels: two `gemm` kernels and one for the single pointwise group. One of the things we noticed was that there was a large gap between the finish of the second `gemm` and the start of the single pointwise group. This gap was a period of time when the GPU was idling around and not doing anything. Looking into it more, we discovered that the problem was that `torch.chunk` constructs new tensors and that tensor construction was not as fast as it could be. Instead of constructing new Tensor objects, we taught the fusion compiler how to manipulate a data pointer and strides to do the `torch.chunk` before sending it into the fused kernel, shrinking the amount of idle time between the second gemm and the launch of the element-wise fusion group. This give us around 1.2x increase speed up on the LSTM forward pass. - -By doing the above tricks, we are able to fuse the almost all `LSTMCell` forward graph (except the two gemm kernels) into a single fusion group, which corresponds to the `prim::FusionGroup_0` in the above IR graph. It will then be launched into a single fused kernel for execution. With these optimizations the model performance improves significantly with average forward time reduced by around 17ms (1.7x speedup) to 10ms, and average backward time reduce by 37ms to 27ms (1.37x speed up). - -### LSTM Layer (forward) - -```python -class LSTMLayer(jit.ScriptModule): - def __init__(self, cell, *cell_args): - super(LSTMLayer, self).__init__() - self.cell = cell(*cell_args) - - @jit.script_method - def forward(self, input, state): - # type: (Tensor, Tuple[Tensor, Tensor]) -> Tuple[Tensor, Tuple[Tensor, Tensor]] - inputs = input.unbind(0) - outputs = torch.jit.annotate(List[Tensor], []) - for i in range(len(inputs)): - out, state = self.cell(inputs[i], state) - outputs += [out] - return torch.stack(outputs), state -``` - -We did several tricks on the IR we generated for TorchScript LSTM to boost the performance, some example optimizations we did: - -* Loop Unrolling: We automatically unroll loops in the code (for big loops, we unroll a small subset of it), which then empowers us to do further optimizations on the for loops control flow. For example, the fuser can fuse together operations across iterations of the loop body, which results in a good performance improvement for control flow intensive models like LSTMs. -* Batch Matrix Multiplication: For RNNs where the input is pre-multiplied (i.e. the model has a lot of matrix multiplies with the same LHS or RHS), we can efficiently batch those operations together into a single matrix multiply while chunking the outputs to achieve equivalent semantics. - -By applying these techniques, we reduced our time in the forward pass by an additional 1.6ms to 8.4ms (1.2x speed up) and timing in backward by 7ms to around 20ms (1.35x speed up). - -### LSTM Layer (backward) - -* “Tree” Batch Matrix Muplication: It is often the case that a single weight is reused multiple times in the LSTM backward graph, forming a tree where the leaves are matrix multiplies and nodes are adds. These nodes can be combined together by concatenating the LHSs and RHSs in different dimensions, then computed as a single matrix multiplication. 
The formula of equivalence can be denoted as follows: - - $L1 * R1 + L2 * R2 = torch.cat((L1, L2), dim=1) * torch.cat((R1, R2), dim=0)$ - -* Autograd is a critical component of what makes PyTorch such an elegant ML framework. As such, we carried this through to PyTorch JIT, but using a new **Automatic Differentiation** (AD) mechanism that works on the IR level. JIT automatic differentiation will slice the forward graph into symbolically differentiable subgraphs, and generate backwards nodes for those subgraphs. Taking the above IR as an example, we group the graph nodes into a single `prim::DifferentiableGraph_0` for the operations that has AD formulas. For operations that have not been added to AD formulas, we will fall back to Autograd during execution. - -* Optimizing the backwards path is hard, and the implicit broadcasting semantics make the optimization of automatic differentiation harder. PyTorch makes it convenient to write tensor operations without worrying about the shapes by broadcasting the tensors for you. For performance, the painful point in backward is that we need to have a summation for such kind of broadcastable operations. This results in the derivative of every broadcastable op being followed by a summation. Since we cannot currently fuse reduce operations, this causes FusionGroups to break into multiple small groups leading to bad performance. To deal with this, refer to this great [post](http://lernapparat.de/fast-lstm-pytorch/) written by Thomas Viehmann. - -### Misc Optimizations - -* In addition to the steps laid about above, we also eliminated overhead between CUDA kernel launches and unnecessary tensor allocations. One example is when you do a tensor device look up. This can provide some poor performance initially with a lot of unnecessary allocations. When we remove these this results in a reduction from milliseconds to nanoseconds between kernel launches. -* Lastly, there might be normalization applied in the custom LSTMCell like LayerNorm. Since LayerNorm and other normalization ops contains reduce operations, it is hard to fuse it in its entirety. Instead, we automatically decompose Layernorm to a statistics computation (reduce operations) + element-wise transformations, and then fuse those element-wise parts together. As of this post, there are some limitations on our auto differentiation and graph fuser infrastructure which limits the current support to inference mode only. We plan to add backward support in a future release. - -With the above optimizations on operation fusion, loop unrolling, batch matrix multiplication and some misc optimizations, we can see a clear performance increase on our custom TorchScript LSTM forward and backward from the following figure: - -
        - - -There are a number of additional optimizations that we did not cover in this post. In addition to the ones laid out in this post, we now see that our custom LSTM forward pass is on par with cuDNN. We are also working on optimizing backward more and expect to see improvements in future releases. Besides the speed that TorchScript provides, we introduced a much more flexible API that enable you to hand draft a lot more custom RNNs, which cuDNN could not provide. - diff --git a/_posts/2019-5-22-torchvision03.md b/_posts/2019-5-22-torchvision03.md deleted file mode 100644 index eb807b4394b3..000000000000 --- a/_posts/2019-5-22-torchvision03.md +++ /dev/null @@ -1,134 +0,0 @@ ---- -layout: blog_detail -title: 'torchvision 0.3: segmentation, detection models, new datasets and more..' -author: Francisco Massa -redirect_from: /2019/05/23/torchvision03.html ---- - -PyTorch domain libraries like torchvision provide convenient access to common datasets and models that can be used to quickly create a state-of-the-art baseline. Moreover, they also provide common abstractions to reduce boilerplate code that users might have to otherwise repeatedly write. The torchvision 0.3 release brings several new features including models for semantic segmentation, object detection, instance segmentation, and person keypoint detection, as well as custom C++ / CUDA ops specific to computer vision. - -
        - - -### New features include: - -**Reference training / evaluation scripts:** torchvision now provides, under the references/ folder, scripts for training and evaluation of the following tasks: classification, semantic segmentation, object detection, instance segmentation and person keypoint detection. These serve as a log of how to train a specific model and provide baseline training and evaluation scripts to quickly bootstrap research. - -**torchvision ops:** torchvision now contains custom C++ / CUDA operators. Those operators are specific to computer vision, and make it easier to build object detection models. These operators currently do not support PyTorch script mode, but support for it is planned for in the next release. Some of the ops supported include: - -* roi_pool (and the module version RoIPool) -* roi_align (and the module version RoIAlign) -* nms, for non-maximum suppression of bounding boxes -* box_iou, for computing the intersection over union metric between two sets of bounding boxes -* box_area, for computing the area of a set of bounding boxes - -Here are a few examples on using torchvision ops: - -```python -import torch -import torchvision - -# create 10 random boxes -boxes = torch.rand(10, 4) * 100 -# they need to be in [x0, y0, x1, y1] format -boxes[:, 2:] += boxes[:, :2] -# create a random image -image = torch.rand(1, 3, 200, 200) -# extract regions in `image` defined in `boxes`, rescaling -# them to have a size of 3x3 -pooled_regions = torchvision.ops.roi_align(image, [boxes], output_size=(3, 3)) -# check the size -print(pooled_regions.shape) -# torch.Size([10, 3, 3, 3]) - -# or compute the intersection over union between -# all pairs of boxes -print(torchvision.ops.box_iou(boxes, boxes).shape) -# torch.Size([10, 10]) -``` - - -**New models and datasets:** torchvision now adds support for object detection, instance segmentation and person keypoint detection models. In addition, several popular datasets have been added. Note: The API is currently experimental and might change in future versions of torchvision. New models include: - -### Segmentation Models - -The 0.3 release also contains models for dense pixelwise prediction on images. -It adds FCN and DeepLabV3 segmentation models, using a ResNet50 and ResNet101 backbones. -Pre-trained weights for ResNet101 backbone are available, and have been trained on a subset of COCO train2017, which contains the same 20 categories as those from Pascal VOC. - -The pre-trained models give the following results on the subset of COCO val2017 which contain the same 20 categories as those present in Pascal VOC: - -Network | mean IoU | global pixelwise acc --- | -- | -- -FCN ResNet101 | 63.7 | 91.9 -DeepLabV3 ResNet101 | 67.4 | 92.4 - -### Detection Models - -Network | box AP | mask AP | keypoint AP --- | -- | -- | -- -Faster R-CNN ResNet-50 FPN trained on COCO | 37.0 |   |   -Mask R-CNN ResNet-50 FPN trained on COCO | 37.9 | 34.6 |   -Keypoint R-CNN ResNet-50 FPN trained on COCO | 54.6 |   | 65.0 - -The implementations of the models for object detection, instance segmentation and keypoint detection are fast, specially during training. - -In the following table, we use 8 V100 GPUs, with CUDA 10.0 and CUDNN 7.4 to report the results. During training, we use a batch size of 2 per GPU, and during testing a batch size of 1 is used. - -For test time, we report the time for the model evaluation and post-processing (including mask pasting in image), but not the time for computing the precision-recall. 
- -Network | train time (s / it) | test time (s / it) | memory (GB) --- | -- | -- | -- -Faster R-CNN ResNet-50 FPN | 0.2288 | 0.0590 | 5.2 -Mask R-CNN ResNet-50 FPN | 0.2728 | 0.0903 | 5.4 -Keypoint R-CNN ResNet-50 FPN | 0.3789 | 0.1242 | 6.8 - - -You can load and use pre-trained detection and segmentation models with a few lines of code - -```python -import torchvision - -model = torchvision.models.detection.maskrcnn_resnet50_fpn(pretrained=True) -# set it to evaluation mode, as the model behaves differently -# during training and during evaluation -model.eval() - -image = PIL.Image.open('/path/to/an/image.jpg') -image_tensor = torchvision.transforms.functional.to_tensor(image) - -# pass a list of (potentially different sized) tensors -# to the model, in 0-1 range. The model will take care of -# batching them together and normalizing -output = model([image_tensor]) -# output is a list of dict, containing the postprocessed predictions -``` - -### Classification Models - -The following classification models were added: - -* GoogLeNet (Inception v1) -* MobileNet V2 -* ShuffleNet v2 -* ResNeXt-50 32x4d and ResNeXt-101 32x8d - -### Datasets - -The following datasets were added: - -* Caltech101, Caltech256, and CelebA -* ImageNet dataset (improving on ImageFolder, provides class-strings) -* Semantic Boundaries Dataset -* VisionDataset as a base class for all datasets - - -In addition, we've added more image transforms, general improvements and bug fixes, as well as improved documentation. - -**See the full release notes [here](https://github.com/pytorch/vision/releases) as well as this getting started tutorial [on Google Colab here](https://colab.research.google.com/github/pytorch/vision/blob/temp-tutorial/tutorials/torchvision_finetuning_instance_segmentation.ipynb), which describes how to fine tune your own instance segmentation model on a custom dataset.** - -Cheers! - -Team PyTorch diff --git a/_posts/2020-07-28-accelerating-training-on-nvidia-gpus-with-pytorch-automatic-mixed-precision.md b/_posts/2020-07-28-accelerating-training-on-nvidia-gpus-with-pytorch-automatic-mixed-precision.md deleted file mode 100644 index 60524a62887f..000000000000 --- a/_posts/2020-07-28-accelerating-training-on-nvidia-gpus-with-pytorch-automatic-mixed-precision.md +++ /dev/null @@ -1,127 +0,0 @@ ---- -layout: blog_detail -title: 'Introducing native PyTorch automatic mixed precision for faster training on NVIDIA GPUs' -author: Mengdi Huang, Chetan Tekur, Michael Carilli ---- - -Most deep learning frameworks, including PyTorch, train with 32-bit floating point (FP32) arithmetic by default. However this is not essential to achieve full accuracy for many deep learning models. In 2017, NVIDIA researchers developed a methodology for [mixed-precision training](https://developer.nvidia.com/blog/mixed-precision-training-deep-neural-networks/), which combined [single-precision](https://blogs.nvidia.com/blog/2019/11/15/whats-the-difference-between-single-double-multi-and-mixed-precision-computing/) (FP32) with half-precision (e.g. FP16) format when training a network, and achieved the same accuracy as FP32 training using the same hyperparameters, with additional performance benefits on NVIDIA GPUs: - -* Shorter training time; -* Lower memory requirements, enabling larger batch sizes, larger models, or larger inputs. 
- -In order to streamline the user experience of training in mixed precision for researchers and practitioners, NVIDIA developed [Apex](https://developer.nvidia.com/blog/apex-pytorch-easy-mixed-precision-training/) in 2018, which is a lightweight PyTorch extension with [Automatic Mixed Precision](https://developer.nvidia.com/automatic-mixed-precision) (AMP) feature. This feature enables automatic conversion of certain GPU operations from FP32 precision to mixed precision, thus improving performance while maintaining accuracy. - -For the PyTorch 1.6 release, developers at NVIDIA and Facebook moved mixed precision functionality into PyTorch core as the AMP package, [torch.cuda.amp](https://pytorch.org/docs/stable/amp.html). `torch.cuda.amp` is more flexible and intuitive compared to `apex.amp`. Some of `apex.amp`'s known pain points that `torch.cuda.amp` has been able to fix: - -* Guaranteed PyTorch version compatibility, because it's part of PyTorch -* No need to build extensions -* Windows support -* Bitwise accurate [saving/restoring](https://pytorch.org/docs/master/amp.html#torch.cuda.amp.GradScaler.load_state_dict) of checkpoints -* [DataParallel](https://pytorch.org/docs/master/notes/amp_examples.html#dataparallel-in-a-single-process) and intra-process model parallelism (although we still recommend [torch.nn.DistributedDataParallel](https://pytorch.org/docs/master/notes/amp_examples.html#distributeddataparallel-one-gpu-per-process) with one GPU per process as the most performant approach) -* [Gradient penalty](https://pytorch.org/docs/master/notes/amp_examples.html#gradient-penalty) (double backward) -* torch.cuda.amp.autocast() has no effect outside regions where it's enabled, so it should serve cases that formerly struggled with multiple calls to [apex.amp.initialize()](https://github.com/NVIDIA/apex/issues/439) (including [cross-validation)](https://github.com/NVIDIA/apex/issues/392#issuecomment-610038073) without difficulty. Multiple convergence runs in the same script should each use a fresh [GradScaler instance](https://github.com/NVIDIA/apex/issues/439#issuecomment-610028282), but GradScalers are lightweight and self-contained so that's not a problem. -* Sparse gradient support - -With AMP being added to PyTorch core, we have started the process of deprecating `apex.amp.` We have moved `apex.amp` to maintenance mode and will support customers using `apex.amp.` However, we highly encourage `apex.amp` customers to transition to using `torch.cuda.amp` from PyTorch Core. - -# Example Walkthrough -Please see official docs for usage: -* [https://pytorch.org/docs/stable/amp.html](https://pytorch.org/docs/stable/amp.html ) -* [https://pytorch.org/docs/stable/notes/amp_examples.html](https://pytorch.org/docs/stable/notes/amp_examples.html) - -Example: - -```python -import torch -# Creates once at the beginning of training -scaler = torch.cuda.amp.GradScaler() - -for data, label in data_iter: - optimizer.zero_grad() - # Casts operations to mixed precision - with torch.cuda.amp.autocast(): - loss = model(data) - - # Scales the loss, and calls backward() - # to create scaled gradients - scaler.scale(loss).backward() - - # Unscales gradients and calls - # or skips optimizer.step() - scaler.step(optimizer) - - # Updates the scale for next iteration - scaler.update() -``` - -# Performance Benchmarks -In this section, we discuss the accuracy and performance of mixed precision training with AMP on the latest NVIDIA GPU A100 and also previous generation V100 GPU. 
The mixed precision performance is compared to FP32 performance, when running Deep Learning workloads in the [NVIDIA pytorch:20.06-py3 container](https://ngc.nvidia.com/catalog/containers/nvidia:pytorch?ncid=partn-52193#cid=ngc01_partn_en-us) from NGC. - -## Accuracy: AMP (FP16), FP32 -The advantage of using AMP for Deep Learning training is that the models converge to the similar final accuracy while providing improved training performance. To illustrate this point, for [Resnet 50 v1.5 training](https://github.com/NVIDIA/DeepLearningExamples/tree/master/PyTorch/Classification/ConvNets/resnet50v1.5#training-accuracy-nvidia-dgx-a100-8x-a100-40gb), we see the following accuracy results where higher is better. Please note that the below accuracy numbers are sample numbers that are subject to run to run variance of up to 0.4%. Accuracy numbers for other models including BERT, Transformer, ResNeXt-101, Mask-RCNN, DLRM can be found at [NVIDIA Deep Learning Examples Github](https://github.com/NVIDIA/DeepLearningExamples). - -Training accuracy: NVIDIA DGX A100 (8x A100 40GB) - - - - - - - - - - - - - - -
| epochs | Mixed Precision Top 1 (%) | TF32 Top 1 (%) |
| ------ | ------------------------- | -------------- |
| 90     | 76.93                     | 76.85          |

Training accuracy: NVIDIA DGX-1 (8x V100 16GB)

| epochs | Mixed Precision Top 1 (%) | FP32 Top 1 (%) |
| ------ | ------------------------- | -------------- |
| 50     | 76.25                     | 76.26          |
| 90     | 77.09                     | 77.01          |
| 250    | 78.42                     | 78.30          |
        - -## Speedup Performance: - -### FP16 on NVIDIA V100 vs. FP32 on V100 -AMP with FP16 is the most performant option for DL training on the V100. In Table 1, we can observe that for various models, AMP on V100 provides a speedup of 1.5x to 5.5x over FP32 on V100 while converging to the same final accuracy. - -
        -*Figure 2. Performance of mixed precision training on NVIDIA 8xV100 vs. FP32 training on 8xV100 GPU. Bars represent the speedup factor of V100 AMP over V100 FP32. The higher the better.* - -## FP16 on NVIDIA A100 vs. FP16 on V100 - -AMP with FP16 remains the most performant option for DL training on the A100. In Figure 3, we can observe that for various models, AMP on A100 provides a speedup of 1.3x to 2.5x over AMP on V100 while converging to the same final accuracy. - -
        -*Figure 3. Performance of mixed precision training on NVIDIA 8xA100 vs. 8xV100 GPU. Bars represent the speedup factor of A100 over V100. The higher the better.* - -# Call to action -AMP provides a healthy speedup for Deep Learning training workloads on Nvidia Tensor Core GPUs, especially on the latest Ampere generation A100 GPUs. You can start experimenting with AMP enabled models and model scripts for A100, V100, T4 and other GPUs available at NVIDIA deep learning [examples](https://github.com/NVIDIA/DeepLearningExamples). NVIDIA PyTorch with native AMP support is available from the [PyTorch NGC container](https://ngc.nvidia.com/catalog/containers/nvidia:pytorch?ncid=partn-52193#cid=ngc01_partn_en-us) version 20.06. We highly encourage existing `apex.amp` customers to transition to using `torch.cuda.amp` from PyTorch Core available in the latest [PyTorch 1.6 release](https://pytorch.org/blog/pytorch-1.6-released/). diff --git a/_posts/2020-07-28-microsoft-becomes-maintainer-of-the-windows-version-of-pytorch.md b/_posts/2020-07-28-microsoft-becomes-maintainer-of-the-windows-version-of-pytorch.md deleted file mode 100644 index aa420c33a431..000000000000 --- a/_posts/2020-07-28-microsoft-becomes-maintainer-of-the-windows-version-of-pytorch.md +++ /dev/null @@ -1,35 +0,0 @@ ---- -layout: blog_detail -title: 'Microsoft becomes maintainer of the Windows version of PyTorch' -author: Maxim Lukiyanov - Principal PM at Microsoft, Emad Barsoum - Group EM at Microsoft, Guoliang Hua - Principal EM at Microsoft, Nikita Shulga - Tech Lead at Facebook, Geeta Chauhan - PE Lead at Facebook, Chris Gottbrath - Technical PM at Facebook, Jiachen Pu - Engineer at Facebook - ---- - -Along with the PyTorch 1.6 release, we are excited to announce that Microsoft has expanded its participation in the PyTorch community and will be responsible for the development and maintenance of the PyTorch build for Windows. - -According to the latest [Stack Overflow developer survey](https://insights.stackoverflow.com/survey/2020#technology-developers-primary-operating-systems), Windows remains the primary operating system for the developer community (46% Windows vs 28% MacOS). [Jiachen Pu](https://github.com/peterjc123) initially made a heroic effort to add support for PyTorch on Windows, but due to limited resources, Windows support for PyTorch has lagged behind other platforms. Lack of test coverage resulted in unexpected issues popping up every now and then. Some of the core tutorials, meant for new users to learn and adopt PyTorch, would fail to run. The installation experience was also not as smooth, with the lack of official PyPI support for PyTorch on Windows. Lastly, some of the PyTorch functionality was simply not available on the Windows platform, such as the TorchAudio domain library and distributed training support. To help alleviate this pain, Microsoft is happy to bring its Windows expertise to the table and bring PyTorch on Windows to its best possible self. - -In the PyTorch 1.6 release, we have improved the core quality of the Windows build by bringing test coverage up to par with Linux for core PyTorch and its domain libraries and by automating tutorial testing. Thanks to the broader PyTorch community, which contributed TorchAudio support to Windows, we were able to add test coverage to all three domain libraries: TorchVision, TorchText and TorchAudio. In subsequent releases of PyTorch, we will continue improving the Windows experience based on community feedback and requests. 
So far, the feedback we received from the community points to distributed training support and a better installation experience using pip as the next areas of improvement. - -In addition to the native Windows experience, Microsoft released a preview adding [GPU compute support to Windows Subsystem for Linux (WSL) 2](https://blogs.windows.com/windowsdeveloper/2020/06/17/gpu-accelerated-ml-training-inside-the-windows-subsystem-for-linux/) distros, with a focus on enabling AI and ML developer workflows. WSL is designed for developers that want to run any Linux based tools directly on Windows. This preview enables valuable scenarios for a variety of frameworks and Python packages that utilize [NVIDIA CUDA](https://developer.nvidia.com/cuda/wsl) for acceleration and only support Linux. This means WSL customers using the preview can run native Linux based PyTorch applications on Windows unmodified without the need for a traditional virtual machine or a dual boot setup. - -## Getting started with PyTorch on Windows -It's easy to get started with PyTorch on Windows. To install PyTorch using Anaconda with the latest GPU support, run the command below. To install different supported configurations of PyTorch, refer to the installation instructions on [pytorch.org](https://pytorch.org). - -`conda install pytorch torchvision cudatoolkit=10.2 -c pytorch` - -Once you install PyTorch, learn more by visiting the [PyTorch Tutorials](https://pytorch.org/tutorials/beginner/deep_learning_60min_blitz.html) and [documentation](https://pytorch.org/docs/stable/index.html). - -
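A quick way to confirm that the installation picked up GPU support is to run a short check from Python. This is only a sanity check and assumes a CUDA-capable GPU with an up-to-date driver; on a CPU-only machine `torch.cuda.is_available()` simply returns `False`:

```python
import torch

print(torch.__version__)          # installed PyTorch version
print(torch.cuda.is_available())  # True if the CUDA build and driver are working
if torch.cuda.is_available():
    x = torch.rand(3, 3, device="cuda")
    print(x.sum().item())         # runs a tiny op on the GPU
```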
        - -## Getting started with PyTorch on Windows Subsystem for Linux -The [preview of NVIDIA CUDA support in WSL](https://docs.microsoft.com/en-us/windows/win32/direct3d12/gpu-cuda-in-wsl) is now available to Windows Insiders running Build 20150 or higher. In WSL, the command to install PyTorch using Anaconda is the same as the above command for native Windows. If you prefer pip, use the command below. - -`pip install torch torchvision` - -You can use the same tutorials and documentation inside your WSL environment as on native Windows. This functionality is still in preview so if you run into issues with WSL please share feedback via the [WSL GitHub repo](https://github.com/microsoft/WSL) or with NVIDIA CUDA support share via NVIDIA’s [Community Forum for CUDA on WSL](https://forums.developer.nvidia.com/c/accelerated-computing/cuda/cuda-on-windows-subsystem-for-linux/303). - -## Feedback -If you find gaps in the PyTorch experience on Windows, please let us know on the [PyTorch discussion forum](https://discuss.pytorch.org/c/windows/26) or file an issue on [GitHub](https://github.com/pytorch/pytorch) using the #module: windows label. diff --git a/_posts/2020-07-28-pytorch-feature-classification-changes.md b/_posts/2020-07-28-pytorch-feature-classification-changes.md deleted file mode 100644 index 057867a68158..000000000000 --- a/_posts/2020-07-28-pytorch-feature-classification-changes.md +++ /dev/null @@ -1,51 +0,0 @@ ---- -layout: blog_detail -title: 'PyTorch feature classification changes' -author: Team PyTorch ---- - -Traditionally features in PyTorch were classified as either stable or experimental with an implicit third option of testing bleeding edge features by building master or through installing nightly builds (available via prebuilt whls). This has, in a few cases, caused some confusion around the level of readiness, commitment to the feature and backward compatibility that can be expected from a user perspective. Moving forward, we’d like to better classify the 3 types of features as well as define explicitly here what each mean from a user perspective. - -# New Feature Designations - -We will continue to have three designations for features but, as mentioned, with a few changes: Stable, Beta (previously Experimental) and Prototype (previously Nightlies). Below is a brief description of each and a comment on the backward compatibility expected: - -## Stable -Nothing changes here. A stable feature means that the user value-add is or has been proven, the API isn’t expected to change, the feature is performant and all documentation exists to support end user adoption. - -*Level of commitment*: We expect to maintain these features long term and generally there should be no major performance limitations, gaps in documentation and we also expect to maintain backwards compatibility (although breaking changes can happen and notice will be given one release ahead of time). - -## Beta -We previously called these features ‘Experimental’ and we found that this created confusion amongst some of the users. In the case of a Beta level features, the value add, similar to a Stable feature, has been proven (e.g. pruning is a commonly used technique for reducing the number of parameters in NN models, independent of the implementation details of our particular choices) and the feature generally works and is documented. 
This feature is tagged as Beta because the API may change based on user feedback, because the performance needs to improve or because coverage across operators is not yet complete. - -*Level of commitment*: We are committing to seeing the feature through to the Stable classification. We are however not committing to Backwards Compatibility. Users can depend on us providing a solution for problems in this area going forward, but the APIs and performance characteristics of this feature may change. - -
        - -## Prototype -Previously these were features that were known about by developers who paid close attention to RFCs and to features that land in master. These features are part of the release and are available as part of binary distributions like PyPI or Conda. We would like to get high bandwidth partner feedback ahead of a real release in order to gauge utility and any changes we need to make to the UX. For each prototype feature, a pointer to draft docs or other instructions will be provided. - -*Level of commitment*: We are committing to gathering high bandwidth feedback only. Based on this feedback and potential further engagement between community members, we as a community will decide if we want to upgrade the level of commitment or to fail fast. Additionally, while some of these features might be more speculative (e.g. new Frontend APIs), others have obvious utility (e.g. model optimization) but may be in a state where gathering feedback outside of high bandwidth channels is not practical, e.g. the feature may be in an earlier state, may be moving fast (PRs are landing too quickly to catch a major release) and/or generally active development is underway. - -# What changes for current features? - -First and foremost, you can find these designations on [pytorch.org/docs](http://pytorch.org/docs). We will also be linking any early stage features here for clarity. - -Additionally, the following features will be reclassified under this new rubric: - -1. [High Level Autograd APIs](https://pytorch.org/docs/stable/autograd.html#functional-higher-level-api): Beta (was Experimental) -2. [Eager Mode Quantization](https://pytorch.org/docs/stable/quantization.html): Beta (was Experimental) -3. [Named Tensors](https://pytorch.org/docs/stable/named_tensor.html): Prototype (was Experimental) -4. [TorchScript/RPC](https://pytorch.org/docs/stable/rpc.html#rpc): Prototype (was Experimental) -5. [Channels Last Memory Layout](https://pytorch.org/docs/stable/tensor_attributes.html#torch-memory-format): Beta (was Experimental) -6. [Custom C++ Classes](https://pytorch.org/docs/stable/jit.html?highlight=experimental): Beta (was Experimental) -7. [PyTorch Mobile](https://pytorch.org/mobile/home/): Beta (was Experimental) -8. [Java Bindings](https://pytorch.org/docs/stable/index.html): Beta (was Experimental) -9. [Torch.Sparse](https://pytorch.org/docs/stable/sparse.html?highlight=experimental#): Beta (was Experimental) - - -Cheers, - -Joe, Greg, Woo & Jessica diff --git a/_posts/2020-08-11-efficient-pytorch-io-library-for-large-datasets-many-files-many-gpus.md b/_posts/2020-08-11-efficient-pytorch-io-library-for-large-datasets-many-files-many-gpus.md deleted file mode 100644 index 29077e8008b8..000000000000 --- a/_posts/2020-08-11-efficient-pytorch-io-library-for-large-datasets-many-files-many-gpus.md +++ /dev/null @@ -1,156 +0,0 @@ ---- -layout: blog_detail -title: 'Efficient PyTorch I/O library for Large Datasets, Many Files, Many GPUs' -author: Alex Aizman, Gavin Maltby, Thomas Breuel ---- - -Data sets are growing bigger every day and GPUs are getting faster. This means there are more data sets for deep learning researchers and engineers to train and validate their models. - -* Many datasets for research in still image recognition are becoming available with 10 million or more images, including OpenImages and Places. 
-* million YouTube videos [(YouTube 8M)](https://research.google.com/youtube8m/) consume about 300 TB in 720p, used for research in object recognition, video analytics, and action recognition. -* The Tobacco Corpus consists of about 20 million scanned HD pages, useful for OCR and text analytics research. - -Although the most commonly encountered big data sets right now involve images and videos, big datasets occur in many other domains and involve many other kinds of data types: web pages, financial transactions, network traces, brain scans, etc. - -However, working with the large amount of data sets presents a number of challenges: - -* **Dataset Size:** datasets often exceed the capacity of node-local disk storage, requiring distributed storage systems and efficient network access. -* **Number of Files:** datasets often consist of billions of files with uniformly random access patterns, something that often overwhelms both local and network file systems. -* **Data Rates:** training jobs on large datasets often use many GPUs, requiring aggregate I/O bandwidths to the dataset of many GBytes/s; these can only be satisfied by massively parallel I/O systems. -* **Shuffling and Augmentation:** training data needs to be shuffled and augmented prior to training. -* **Scalability:** users often want to develop and test on small datasets and then rapidly scale up to large datasets. - -Traditional local and network file systems, and even object storage servers, are not designed for these kinds of applications. [The WebDataset I/O library](https://github.com/tmbdev/webdataset) for PyTorch, together with the optional [AIStore server](https://github.com/NVIDIA/aistore) and [Tensorcom](https://github.com/NVlabs/tensorcom) RDMA libraries, provide an efficient, simple, and standards-based solution to all these problems. The library is simple enough for day-to-day use, is based on mature open source standards, and is easy to migrate to from existing file-based datasets. - -Using WebDataset is simple and requires little effort, and it will let you scale up the same code from running local experiments to using hundreds of GPUs on clusters or in the cloud with linearly scalable performance. Even on small problems and on your desktop, it can speed up I/O tenfold and simplifies data management and processing of large datasets. The rest of this blog post tells you how to get started with WebDataset and how it works. - -## The WebDataset Library - -The WebDataset library provides a simple solution to the challenges listed above. Currently, it is available as a separate library [(github.com/tmbdev/webdataset)](https://github.com/tmbdev/webdataset), but it is on track for being incorporated into PyTorch (see [RFC 38419](https://github.com/pytorch/pytorch/issues/38419)). The WebDataset implementation is small (about 1500 LOC) and has no external dependencies. - -Instead of inventing a new format, WebDataset represents large datasets as collections of POSIX tar archive files consisting of the original data files. The WebDataset library can use such tar archives directly for training, without the need for unpacking or local storage. - -WebDataset scales perfectly from small, local datasets to petascale datasets and training on hundreds of GPUs and allows data to be stored on local disk, on web servers, or dedicated file servers. For container-based training, WebDataset eliminates the need for volume plugins or node-local storage. 
As an additional benefit, datasets need not be unpacked prior to training, simplifying the distribution and use of research data. - -WebDataset implements PyTorch’s [IterableDataset](https://pytorch.org/docs/stable/data.html#torch.utils.data.IterableDataset) interface and can be used like existing DataLoader-based code. Since data is stored as files inside an archive, existing loading and data augmentation code usually requires minimal modification. - -The WebDataset library is a complete solution for working with large datasets and distributed training in PyTorch (and also works with TensorFlow, Keras, and DALI via their Python APIs). Since POSIX tar archives are a standard, widely supported format, it is easy to write other tools for manipulating datasets in this format. E.g., the [tarp](https://github.com/tmbdev/tarp) command is written in Go and can shuffle and process training datasets. - -## Benefits - -The use of sharded, sequentially readable formats is essential for very large datasets. In addition, it has benefits in many other environments. WebDataset provides a solution that scales well from small problems on a desktop machine to very large deep learning problems in clusters or in the cloud. The following table summarizes some of the benefits in different environments. - - {:.table.table-striped.table-bordered} - | Environment | Benefits of WebDataset | -| ------------- | ------------- | -| Local Cluster with AIStore | AIStore can be deployed easily as K8s containers and offers linear scalability and near 100% utilization of network and I/O bandwidth. Suitable for petascale deep learning. | -| Cloud Computing | WebDataset deep learning jobs can be trained directly against datasets stored in cloud buckets; no volume plugins required. Local and cloud jobs work identically. Suitable for petascale learning. | -| Local Cluster with existing distributed FS or object store | WebDataset’s large sequential reads improve performance with existing distributed stores and eliminate the need for dedicated volume plugins. | -| Educational Environments | WebDatasets can be stored on existing web servers and web caches, and can be accessed directly by students by URL | -| Training on Workstations from Local Drives | Jobs can start training as the data still downloads. Data doesn’t need to be unpacked for training. Ten-fold improvements in I/O performance on hard drives over random access file-based datasets. | -| All Environments | Datasets are represented in an archival format and contain metadata such as file types. Data is compressed in native formats (JPEG, MP4, etc.). Data management, ETL-style jobs, and data transformations and I/O are simplified and easily parallelized. | - -We will be adding more examples giving benchmarks and showing how to use WebDataset in these environments over the coming months. - -## High-Performance -For high-performance computation on local clusters, the companion open-source [AIStore](https://github.com/NVIDIA/AIStore) server provides full disk to GPU I/O bandwidth, subject only to hardware constraints. [This Bigdata 2019 Paper](https://arxiv.org/abs/2001.01858) contains detailed benchmarks and performance measurements. In addition to benchmarks, research projects at NVIDIA and Microsoft have used WebDataset for petascale datasets and billions of training samples. - -Below is a benchmark of AIStore with WebDataset clients using 12 server nodes with 10 rotational drives each. - -
        - -The left axis shows the aggregate bandwidth from the cluster, while the right scale shows the measured per drive I/O bandwidth. WebDataset and AIStore scale linearly to about 300 clients, at which point they are increasingly limited by the maximum I/O bandwidth available from the rotational drives (about 150 MBytes/s per drive). For comparison, HDFS is shown. HDFS uses a similar approach to AIStore/WebDataset and also exhibits linear scaling up to about 192 clients; at that point, it hits a performance limit of about 120 MBytes/s per drive, and it failed when using more than 1024 clients. Unlike HDFS, the WebDataset-based code just uses standard URLs and HTTP to access data and works identically with local files, with files stored on web servers, and with AIStore. For comparison, NFS in similar experiments delivers about 10-20 MBytes/s per drive. - -## Storing Datasets in Tar Archives - -The format used for WebDataset is standard POSIX tar archives, the same archives used for backup and data distribution. In order to use the format to store training samples for deep learning, we adopt some simple naming conventions: -* datasets are POSIX tar archives -* each training sample consists of adjacent files with the same basename -* shards are numbered consecutively - -For example, ImageNet is stored in 1282 separate 100 Mbyte shards with names ```pythonimagenet-train-000000.tar to imagenet-train-001281.tar,``` the contents of the first shard are: - -```python --r--r--r-- bigdata/bigdata 3 2020-05-08 21:23 n03991062_24866.cls --r--r--r-- bigdata/bigdata 108611 2020-05-08 21:23 n03991062_24866.jpg --r--r--r-- bigdata/bigdata 3 2020-05-08 21:23 n07749582_9506.cls --r--r--r-- bigdata/bigdata 129044 2020-05-08 21:23 n07749582_9506.jpg --r--r--r-- bigdata/bigdata 3 2020-05-08 21:23 n03425413_23604.cls --r--r--r-- bigdata/bigdata 106255 2020-05-08 21:23 n03425413_23604.jpg --r--r--r-- bigdata/bigdata 3 2020-05-08 21:23 n02795169_27274.cls -``` - -WebDataset datasets can be used directly from local disk, from web servers (hence the name), from cloud storage and object stores, just by changing a URL. WebDataset datasets can be used for training without unpacking, and training can even be carried out on streaming data, with no local storage. - -Shuffling during training is important for many deep learning applications, and WebDataset performs shuffling both at the shard level and at the sample level. Splitting of data across multiple workers is performed at the shard level using a user-provided ```shard_selection``` function that defaults to a function that splits based on ```get_worker_info.``` (WebDataset can be combined with the [tensorcom](https://github.com/NVLabs/tensorcom) library to offload decompression/data augmentation and provide RDMA and direct-to-GPU loading; see below.) - -## Code Sample -Here are some code snippets illustrating the use of WebDataset in a typical PyTorch deep learning application (you can find a full example at [http://github.com/tmbdev/pytorch-imagenet-wds](http://github.com/tmbdev/pytorch-imagenet-wds). - -```python -import webdataset as wds -import ... 
- -sharedurl = "/imagenet/imagenet-train-{000000..001281}.tar" - -normalize = transforms.Normalize( - mean=[0.485, 0.456, 0.406], - std=[0.229, 0.224, 0.225]) - -preproc = transforms.Compose([ - transforms.RandomResizedCrop(224), - transforms.RandomHorizontalFlip(), - transforms.ToTensor(), - normalize, -]) - -dataset = ( - wds.Dataset(sharedurl) - .shuffle(1000) - .decode("pil") - .rename(image="jpg;png", data="json") - .map_dict(image=preproc) - .to_tuple("image", "data") -) - -loader = torch.utils.data.DataLoader(dataset, batch_size=64, num_workers=8) - -for inputs, targets in loader: - ... - ``` - -This code is nearly identical to the file-based I/O pipeline found in the PyTorch Imagenet example: it creates a preprocessing/augmentation pipeline, instantiates a dataset using that pipeline and a data source location, and then constructs a DataLoader instance from the dataset. - - WebDataset uses a fluent API for a configuration that internally builds up a processing pipeline. Without any added processing stages, In this example, WebDataset is used with the PyTorch DataLoader class, which replicates DataSet instances across multiple threads and performs both parallel I/O and parallel data augmentation. - -WebDataset instances themselves just iterate through each training sample as a dictionary: - -```python -# load from a web server using a separate client process -sharedurl = "pipe:curl -s http://server/imagenet/imagenet-train-{000000..001281}.tar" - -dataset = wds.Dataset(sharedurl) - -for sample in dataset: - # sample["jpg"] contains the raw image data - # sample["cls"] contains the class - ... - ``` - -For a general introduction to how we handle large scale training with WebDataset, see these [YouTube videos](https://www.youtube.com/playlist?list=PL0dsKxFNMcX4XcB0w1Wm-pvSfQu-eWM26). - -## Related Software - -* [AIStore](https://github.com/NVIDIA/AIStore) is an open-source object store capable of full-bandwidth disk-to-GPU data delivery (meaning that if you have 1000 rotational drives with 200 MB/s read speed, AIStore actually delivers an aggregate bandwidth of 200 GB/s to the GPUs). AIStore is fully compatible with WebDataset as a client, and in addition understands the WebDataset format, permitting it to perform shuffling, sorting, ETL, and some map-reduce operations directly in the storage system. AIStore can be thought of as a remix of a distributed object store, a network file system, a distributed database, and a GPU-accelerated map-reduce implementation. - -* [tarp](https://github.com/tmbdev/tarp) is a small command-line program for splitting, merging, shuffling, and processing tar archives and WebDataset datasets. - -* [tensorcom](https://github.com/NVLabs/tensorcom) is a library supporting distributed data augmentation and RDMA to GPU. - -* [pytorch-imagenet-wds](https://github.com/tmbdev/pytorch-imagenet-wds) contains an example of how to use WebDataset with ImageNet, based on the PyTorch ImageNet example. - -* [Bigdata 2019 Paper with Benchmarks](https://arxiv.org/abs/2001.01858) - -Check out [the library](https://github.com/tmbdev/webdataset) and provide your feedback for [RFC 38419](https://github.com/pytorch/pytorch/issues/38419). 
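If you would like to build shards of your own, the naming conventions described above (files belonging to the same sample share a basename, shards numbered consecutively) are easy to reproduce with nothing but the Python standard library. The sketch below is illustrative only; the file names and label bytes are made up, and this is not part of the WebDataset API:

```python
import io
import tarfile

# Hypothetical samples: (basename, image bytes, class label bytes).
samples = [
    ("n03991062_24866", b"<jpeg bytes>", b"5"),
    ("n07749582_9506", b"<jpeg bytes>", b"12"),
]

# Shards are numbered consecutively; files that belong to the same sample
# share a basename and differ only in their extension.
with tarfile.open("imagenet-train-000000.tar", "w") as tar:
    for basename, jpg_bytes, cls_bytes in samples:
        for suffix, payload in ((".jpg", jpg_bytes), (".cls", cls_bytes)):
            info = tarfile.TarInfo(name=basename + suffix)
            info.size = len(payload)
            tar.addfile(info, io.BytesIO(payload))
```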
diff --git a/_posts/2020-08-18-pytorch-1.6-now-includes-stochastic-weight-averaging.md b/_posts/2020-08-18-pytorch-1.6-now-includes-stochastic-weight-averaging.md deleted file mode 100644 index 6719fa1bffc3..000000000000 --- a/_posts/2020-08-18-pytorch-1.6-now-includes-stochastic-weight-averaging.md +++ /dev/null @@ -1,271 +0,0 @@ ---- -layout: blog_detail -title: 'PyTorch 1.6 now includes Stochastic Weight Averaging' -author: Pavel Izmailov, Andrew Gordon Wilson and Vincent Quenneville-Belair ---- - -Do you use stochastic gradient descent (SGD) or Adam? Regardless of the procedure you use to train your neural network, you can likely achieve significantly better generalization at virtually no additional cost with a simple new technique now natively supported in PyTorch 1.6, Stochastic Weight Averaging (SWA) [1]. Even if you have already trained your model, it’s easy to realize the benefits of SWA by running SWA for a small number of epochs starting with a pre-trained model. [Again](https://twitter.com/MilesCranmer/status/1282140440892932096) and [again](https://twitter.com/leopd/status/1285969855062192129), researchers are discovering that SWA improves the performance of well-tuned models in a wide array of practical applications with little cost or effort! - - -SWA has a wide range of applications and features: -* SWA significantly improves performance compared to standard training techniques in computer vision (e.g., VGG, ResNets, Wide ResNets and DenseNets on ImageNet and CIFAR benchmarks [1, 2]). -* SWA provides state-of-the-art performance on key benchmarks in semi-supervised learning and domain adaptation [2]. -* SWA was shown to improve performance in language modeling (e.g., AWD-LSTM on WikiText-2 [4]) and policy-gradient methods in deep reinforcement learning [3]. -* SWAG, an extension of SWA, can approximate Bayesian model averaging in Bayesian deep learning and achieves state-of-the-art uncertainty calibration results in various settings. Moreover, its recent generalization MultiSWAG provides significant additional performance gains and mitigates double-descent [4, 10]. Another approach, Subspace Inference, approximates the Bayesian posterior in a small subspace of the parameter space around the SWA solution [5]. -* SWA for low precision training, SWALP, can match the performance of full-precision SGD training, even with all numbers quantized down to 8 bits, including gradient accumulators [6]. -* SWA in parallel, SWAP, was shown to greatly speed up the training of neural networks by using large batch sizes and, in particular, set a record by training a neural network to 94% accuracy on CIFAR-10 in 27 seconds [11]. - - -
        - -**Figure 1**. *Illustrations of SWA and SGD with a Preactivation ResNet-164 on CIFAR-100 [1]. **Left**: test error surface for three FGE samples and the corresponding SWA solution (averaging in weight space). **Middle** and **Right**: test error and train loss surfaces showing the weights proposed by SGD (at convergence) and SWA, starting from the same initialization of SGD after 125 training epochs. Please see [1] for details on how these figures were constructed*. - -In short, SWA performs an equal average of the weights traversed by SGD (or any stochastic optimizer) with a modified learning rate schedule (see the left panel of Figure 1.). SWA solutions end up in the center of a wide flat region of loss, while SGD tends to converge to the boundary of the low-loss region, making it susceptible to the shift between train and test error surfaces (see the middle and right panels of Figure 1). We emphasize that SWA **can be used with any optimizer, such as Adam, and is not specific to SGD**. - -Previously, SWA was in PyTorch contrib. In PyTorch 1.6, we provide a new convenient implementation of SWA in [torch.optim.swa_utils](https://pytorch.org/docs/stable/optim.html#stochastic-weight-averaging). - -## Is this just Averaged SGD? - -At a high level, averaging SGD iterates dates back several decades in convex optimization [7, 8], where it is sometimes referred to as Polyak-Ruppert averaging, or averaged SGD. **But the details matter**. Averaged SGD is often used in conjunction with a decaying learning rate, and an exponential moving average (EMA), typically for convex optimization. In convex optimization, the focus has been on improved rates of convergence. In deep learning, this form of averaged SGD smooths the trajectory of SGD iterates but does not perform very differently. - -By contrast, SWA uses an **equal average** of SGD iterates with a modified **cyclical or high constant learning rate** and exploits the flatness of training objectives [8] specific to **deep learning** for **improved generalization**. - -## How does Stochastic Weight Averaging Work? - -There are two important ingredients that make SWA work. First, SWA uses a **modified learning rate** schedule so that SGD (or other optimizers such as Adam) continues to bounce around the optimum and explore diverse models instead of simply converging to a single solution. For example, we can use the standard decaying learning rate strategy for the first 75% of training time and then set the learning rate to a reasonably high constant value for the remaining 25% of the time (see Figure 2 below). The second ingredient is to take an average of the weights **(typically an equal average)** of the networks traversed by SGD. For example, we can maintain a running average of the weights obtained at the end of every epoch within the last 25% of training time (see Figure 2). After training is complete, we then set the weights of the network to the computed SWA averages. - -
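To make the averaging step concrete, here is a minimal sketch of the equal running average described above, written against plain `state_dict` tensors. It is for illustration only and is not the `torch.optim.swa_utils` implementation shown later in this post:

```python
import torch
import torch.nn as nn

model = nn.Linear(10, 2)  # stand-in for whatever network is being trained

# Running equal average of the weights collected so far.
swa_state = {k: v.detach().clone() for k, v in model.state_dict().items()}
n_collected = 1

def collect(model, swa_state, n_collected):
    # w_swa <- (n * w_swa + w) / (n + 1): every collected checkpoint gets equal weight.
    with torch.no_grad():
        for name, w in model.state_dict().items():
            swa_state[name].mul_(n_collected).add_(w).div_(n_collected + 1)
    return n_collected + 1

# Called, e.g., once per epoch during the last 25% of training:
#     n_collected = collect(model, swa_state, n_collected)
# After training, the averaged weights replace the model weights:
#     model.load_state_dict(swa_state)
```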
        - -**Figure 2**. *Illustration of the learning rate schedule adopted by SWA. Standard decaying schedule is used for the first 75% of the training and then a high constant value is used for the remaining 25%. The SWA averages are formed during the last 25% of training*. - -One important detail is the batch normalization. Batch normalization layers compute running statistics of activations during training. Note that the SWA averages of the weights are never used to make predictions during training. So the batch normalization layers do not have the activation statistics computed at the end of training. We can compute these statistics by doing a single forward pass on the train data with the SWA model. - -While we focus on SGD for simplicity in the description above, SWA can be combined with any optimizer. You can also use cyclical learning rates instead of a high constant value (see e.g., [2]). - -## How to use SWA in PyTorch? - -In `torch.optim.swa_utils` we implement all the SWA ingredients to make it convenient to use SWA with any model. In particular, we implement `AveragedModel` class for SWA models, `SWALR` learning rate scheduler, and `update_bn` utility function to update SWA batch normalization statistics at the end of training. - -In the example below, `swa_model` is the SWA model that accumulates the averages of the weights. We train the model for a total of 300 epochs, and we switch to the SWA learning rate schedule and start to collect SWA averages of the parameters at epoch 160. - -```python -from torch.optim.swa_utils import AveragedModel, SWALR -from torch.optim.lr_scheduler import CosineAnnealingLR - -loader, optimizer, model, loss_fn = ... -swa_model = AveragedModel(model) -scheduler = CosineAnnealingLR(optimizer, T_max=100) -swa_start = 5 -swa_scheduler = SWALR(optimizer, swa_lr=0.05) - -for epoch in range(100): - for input, target in loader: - optimizer.zero_grad() - loss_fn(model(input), target).backward() - optimizer.step() - if epoch > swa_start: - swa_model.update_parameters(model) - swa_scheduler.step() - else: - scheduler.step() - -# Update bn statistics for the swa_model at the end -torch.optim.swa_utils.update_bn(loader, swa_model) -# Use swa_model to make predictions on test data -preds = swa_model(test_input) -``` - -Next, we explain each component of `torch.optim.swa_utils` in detail. - -`AveragedModel` class serves to compute the weights of the SWA model. You can create an averaged model by running `swa_model = AveragedModel(model)`. You can then update the parameters of the averaged model by `swa_model.update_parameters(model)`. By default, `AveragedModel` computes a running equal average of the parameters that you provide, but you can also use custom averaging functions with the `avg_fn` parameter. In the following example, `ema_model` computes an exponential moving average. - -```python -ema_avg = lambda averaged_model_parameter, model_parameter, num_averaged:\ -0.1 * averaged_model_parameter + 0.9 * model_parameter -ema_model = torch.optim.swa_utils.AveragedModel(model, avg_fn=ema_avg) -``` - -In practice, we find an equal average with the modified learning rate schedule in Figure 2 provides the best performance. - -`SWALR` is a learning rate scheduler that anneals the learning rate to a fixed value, and then keeps it constant. For example, the following code creates a scheduler that linearly anneals the learning rate from its initial value to `0.05` in `5` epochs within each parameter group. 
- -```python -swa_scheduler = torch.optim.swa_utils.SWALR(optimizer, -anneal_strategy="linear", anneal_epochs=5, swa_lr=0.05) - -``` -We also implement cosine annealing to a fixed value (`anneal_strategy="cos"`). In practice, we typically switch to `SWALR` at epoch `swa_start` (e.g. after 75% of the training epochs), and simultaneously start to compute the running averages of the weights: - -```python -scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=100) -swa_start = 75 -for epoch in range(100): - # - if i > swa_start: - swa_model.update_parameters(model) - swa_scheduler.step() - else: - scheduler.step() -``` - -Finally, `update_bn` is a utility function that computes the batchnorm statistics for the SWA model on a given dataloader `loader`: -``` -torch.optim.swa_utils.update_bn(loader, swa_model) -``` -`update_bn` applies the `swa_model` to every element in the dataloader and computes the activation statistics for each batch normalization layer in the model. - -Once you computed the SWA averages and updated the batch normalization layers, you can apply `swa_model` to make predictions on test data. - -## Why does it work? - -There are large flat regions of the loss surface [9]. In Figure 3 below, we show a visualization of the loss surface in a subspace of the parameter space containing a path connecting two independently trained SGD solutions, such that the loss is similarly low at every point along the path. SGD converges near the boundary of these regions because there isn’t much gradient signal to move inside, as the points in the region all have similarly low values of loss. By increasing the learning rate, SWA spins around this flat region, and then by averaging the iterates, moves towards the center of the flat region. - -
        - -**Figure 3**: *visualization of mode connectivity for ResNet-20 with no skip connections on CIFAR-10 dataset. The visualization is created in collaboration with Javier Ideami [(https://losslandscape.com/)](https://losslandscape.com/). For more details, see this [blogpost](https://izmailovpavel.github.io/curves_blogpost/)*. - -We expect solutions that are centered in the flat region of the loss to generalize better than those near the boundary. Indeed, train and test error surfaces are not perfectly aligned in the weight space. Solutions that are centered in the flat region are not as susceptible to the shifts between train and test error surfaces as those near the boundary. In Figure 4 below, we show the train loss and test error surfaces along the direction connecting the SWA and SGD solutions. As you can see, while the SWA solution has a higher train loss compared to the SGD solution, it is centered in a region of low loss and has a substantially better test error. - - -
        - -**Figure 4**. *Train loss and test error along the line connecting the SWA solution (circle) and SGD solution (square). The SWA solution is centered in a wide region of low train loss, while the SGD solution lies near the boundary. Because of the shift between train loss and test error surfaces, the SWA solution leads to much better generalization*. - -## What are the results achieved with SWA? - -We release a GitHub [repo](https://github.com/izmailovpavel/torch_swa_examples) with examples using the PyTorch implementation of SWA for training DNNs. For example, these examples can be used to achieve the following results on CIFAR-100: - - - {:.table.table-striped.table-bordered} - | | VGG-16 | ResNet-164 | WideResNet-28x10 | -| ------------- | ------------- | ------------- | ------------- | -| SGD | 72.8 ± 0.3 | 78.4 ± 0.3 | 81.0 ± 0.3 | -| SWA | 74.4 ± 0.3 | 79.8 ± 0.4 | 82.5 ± 0.2 | - - -## Semi-Supervised Learning - -In a follow-up [paper](https://arxiv.org/abs/1806.05594) SWA was applied to semi-supervised learning, where it improved the best reported results in multiple settings [2]. For example, with SWA you can get 95% accuracy on CIFAR-10 if you only have the training labels for 4k training data points (the previous best reported result on this problem was 93.7%). This paper also explores averaging multiple times within epochs, which can accelerate convergence and find still flatter solutions in a given time. - -
        -**Figure 5**. Performance of fast-SWA on semi-supervised learning with CIFAR-10. fast-SWA achieves record results in every setting considered. - -## Reinforcement Learning - -In another follow-up [paper](http://www.gatsby.ucl.ac.uk/~balaji/udl-camera-ready/UDL-24.pdf) SWA was shown to improve the performance of policy gradient methods A2C and DDPG on several Atari games and MuJoCo environments [3]. This application is also an instance of where SWA is used with Adam. Recall that SWA is not specific to SGD and can benefit essentially any optimizer. - - -{:.table.table-striped.table-bordered} - | Environment Name | A2C | A2C + SWA | -| ------------- | ------------- | ------------- | -| Breakout | 522 ± 34 | 703 ± 60 | -| Qbert | 18777 ± 778 | 21272 ± 655 | -| SpaceInvaders | 7727 ± 1121 | 21676 ± 8897 | -| Seaquest | 1779 ± 4 | 1795 ± 4 | -| BeamRider | 9999 ± 402 | 11321 ± 1065 | -| CrazyClimber | 147030 ± 10239 | 139752 ± 11618 | - - -## Low Precision Training - -We can filter through quantization noise by combining weights that have been rounded down with weights that have been rounded up. Moreover, by averaging weights to find a flat region of the loss surface, large perturbations of the weights will not affect the quality of the solution (Figures 9 and 10). Recent [work](https://arxiv.org/abs/1904.11943) shows that by adapting SWA to the low precision setting, in a method called SWALP, one can match the performance of full-precision SGD even with all training in 8 bits [5]. This is quite a practically important result, given that (1) SGD training in 8 bits performs notably worse than full precision SGD, and (2) low precision training is significantly harder than predictions in low precision after training (the usual setting). For example, a ResNet-164 trained on CIFAR-100 with float (16-bit) SGD achieves 22.2% error, while 8-bit SGD achieves 24.0% error. By contrast, SWALP with 8 bit training achieves 21.8% error. - -
- -**Figure 9**. *Quantizing a solution leads to a perturbation of the weights, which has a greater effect on the quality of the sharp solution (left) than on the wide solution (right)*.
        -**Figure 10**. *The difference between standard low precision training and SWALP*. - -Another [work](https://arxiv.org/abs/2002.00343), SQWA, presents an approach for quantization and fine-tuning of neural networks in low precision [12]. In particular, SQWA achieved state-of-the-art results for DNNs quantized to 2 bits on CIFAR-100 and ImageNet. - -## Calibration and Uncertainty Estimates - -By finding a centred solution in the loss, SWA can also improve calibration and uncertainty representation. Indeed, SWA can be viewed as an approximation to an ensemble, resembling a Bayesian model average, but with a single model [1]. - -SWA can be viewed as taking the first moment of SGD iterates with a modified learning rate schedule. We can directly generalize SWA by also taking the second moment of iterates to form a Gaussian approximate posterior over the weights, further characterizing the loss geometry with SGD iterates. This approach,[SWA-Gaussian (SWAG)](https://arxiv.org/abs/1902.02476) is a simple, scalable and convenient approach to uncertainty estimation and calibration in Bayesian deep learning [4]. The SWAG distribution approximates the shape of the true posterior: Figure 6 below shows the SWAG distribution and the posterior log-density for ResNet-20 on CIFAR-10. - -
        -**Figure 6**. *SWAG posterior approximation and the loss surface for a ResNet-20 without skip-connections trained on CIFAR-10 in the subspace formed by the two largest eigenvalues of the SWAG covariance matrix. The shape of SWAG distribution is aligned with the posterior: the peaks of the two distributions coincide, and both distributions are wider in one direction than in the orthogonal direction. Visualization created in collaboration with* [Javier Ideami](https://losslandscape.com/). - -Empirically, SWAG performs on par or better than popular alternatives including MC dropout, KFAC Laplace, and temperature scaling on uncertainty quantification, out-of-distribution detection, calibration and transfer learning in computer vision tasks. Code for SWAG is available [here](https://github.com/wjmaddox/swa_gaussian). - -
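As a rough illustration of what "taking the second moment of iterates" means, the sketch below implements only the diagonal part of the idea; the full method described in the paper and repository also maintains a low-rank covariance term, and this is not the SWAG library code:

```python
import torch

def update_moments(mean, sq_mean, w, n):
    # Running first and second moments of the flattened SGD iterates.
    mean = (n * mean + w) / (n + 1)
    sq_mean = (n * sq_mean + w * w) / (n + 1)
    return mean, sq_mean

def sample_diagonal(mean, sq_mean):
    # Diagonal Gaussian approximate posterior: N(mean, diag(E[w^2] - E[w]^2)).
    var = torch.clamp(sq_mean - mean ** 2, min=1e-30)
    return mean + var.sqrt() * torch.randn_like(mean)

# Usage sketch: w would be a flattened copy of the weights at the end of an epoch,
# e.g. torch.nn.utils.parameters_to_vector(model.parameters()).detach().
w = torch.randn(1000)
mean, sq_mean, n = w.clone(), w * w, 1
# After each collected iterate: mean, sq_mean = update_moments(mean, sq_mean, new_w, n); n += 1
```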
        -**Figure 7**. *MultiSWAG generalizes SWAG and deep ensembles, to perform Bayesian model averaging over multiple basins of attraction, leading to significantly improved performance. By contrast, as shown here, deep ensembles select different modes, while standard variational inference (VI) marginalizes (model averages) within a single basin*. - -MultiSWAG [9] uses multiple independent SWAG models to form a mixture of Gaussians as an approximate posterior distribution. Different basins of attraction contain highly complementary explanations of the data. Accordingly, marginalizing over these multiple basins provides a significant boost in accuracy and uncertainty representation. MultiSWAG can be viewed as a generalization of deep ensembles, but with performance improvements. - - -Indeed, we see in Figure 8 that MultiSWAG entirely mitigates double descent -- more flexible models have monotonically improving performance -- and provides significantly improved generalization over SGD. For example, when the ResNet-18 has layers of width 20, Multi-SWAG achieves under 30% error whereas SGD achieves over 45%, more than a 15% gap! - -
        -**Figure 8**. *SGD, SWAG, and Multi-SWAG on CIFAR-100 for a ResNet-18 with varying widths. We see Multi-SWAG in particular mitigates double descent and provides significant accuracy improvements over SGD*. - -Reference [10] also considers Multi-SWA, which uses multiple independently trained SWA solutions in an ensemble, providing performance improvements over deep ensembles without any additional computational cost. Code for MultiSWA and MultiSWAG is available [here](https://github.com/izmailovpavel/understandingbdl). - -Another [method](https://arxiv.org/abs/1907.07504), Subspace Inference, constructs a low-dimensional subspace around the SWA solution and marginalizes the weights in this subspace to approximate the Bayesian model average [5]. Subspace Inference uses the statistics from the SGD iterates to construct both the SWA solution and the subspace. The method achieves strong performance in terms of prediction accuracy and uncertainty calibration both in classification and regression problems. Code is available [here](https://github.com/wjmaddox/drbayes). - -## Try it Out! - -One of the greatest open questions in deep learning is why SGD manages to find good solutions, given that the training objectives are highly multimodal, and there are many settings of parameters that achieve no training loss but poor generalization. By understanding geometric features such as flatness, which relate to generalization, we can begin to resolve these questions and build optimizers that provide even better generalization, and many other useful features, such as uncertainty representation. We have presented SWA, a simple drop-in replacement for standard optimizers such as SGD and Adam, which can in principle, benefit anyone training a deep neural network. SWA has been demonstrated to have a strong performance in several areas, including computer vision, semi-supervised learning, reinforcement learning, uncertainty representation, calibration, Bayesian model averaging, and low precision training. - - -We encourage you to try out SWA! SWA is now as easy as any standard training in PyTorch. And even if you have already trained your model, you can use SWA to significantly improve performance by running it for a small number of epochs from a pre-trained model. - - -[1] Averaging Weights Leads to Wider Optima and Better Generalization; Pavel Izmailov, Dmitry Podoprikhin, Timur Garipov, Dmitry Vetrov, Andrew Gordon Wilson; Uncertainty in Artificial Intelligence (UAI), 2018. - -[2] There Are Many Consistent Explanations of Unlabeled Data: Why You Should Average; Ben Athiwaratkun, Marc Finzi, Pavel Izmailov, Andrew Gordon Wilson; -International Conference on Learning Representations (ICLR), 2019. - -[3] Improving Stability in Deep Reinforcement Learning with Weight Averaging; Evgenii Nikishin, Pavel Izmailov, Ben Athiwaratkun, Dmitrii Podoprikhin, -Timur Garipov, Pavel Shvechikov, Dmitry Vetrov, Andrew Gordon Wilson; UAI 2018 Workshop: Uncertainty in Deep Learning, 2018. - -[4] A Simple Baseline for Bayesian Uncertainty in Deep Learning -Wesley Maddox, Timur Garipov, Pavel Izmailov, Andrew Gordon Wilson; Neural Information Processing Systems (NeurIPS), 2019. - -[5] Subspace Inference for Bayesian Deep Learning -Pavel Izmailov, Wesley Maddox, Polina Kirichenko, Timur Garipov, Dmitry Vetrov, Andrew Gordon Wilson -Uncertainty in Artificial Intelligence (UAI), 2019. 
- -[6] SWALP : Stochastic Weight Averaging in Low Precision Training -Guandao Yang, Tianyi Zhang, Polina Kirichenko, Junwen Bai, -Andrew Gordon Wilson, Christopher De Sa; International Conference on Machine Learning (ICML), 2019. - -[7] David Ruppert. Efficient estimations from a slowly convergent Robbins-Monro process; Technical report, Cornell University Operations Research and Industrial Engineering, 1988. - -[8] Acceleration of stochastic approximation by averaging. Boris T Polyak and Anatoli B Juditsky; SIAM Journal on Control and Optimization, 30(4):838–855, 1992. - -[9] Loss Surfaces, Mode Connectivity, and Fast Ensembling of DNNs -Timur Garipov, Pavel Izmailov, Dmitrii Podoprikhin, Dmitry Vetrov, -Andrew Gordon Wilson. Neural Information Processing Systems (NeurIPS), 2018. - -[10] Bayesian Deep Learning and a Probabilistic Perspective of Generalization -Andrew Gordon Wilson, Pavel Izmailov. ArXiv preprint, 2020. - -[11] Stochastic Weight Averaging in Parallel: Large-Batch Training That Generalizes Well -Gupta, Vipul, Santiago Akle Serrano, and Dennis DeCoste; International Conference on Learning Representations (ICLR). 2019. - -[12] SQWA: Stochastic Quantized Weight Averaging for Improving the Generalization Capability of Low-Precision Deep Neural Networks -Shin, Sungho, Yoonho Boo, and Wonyong Sung; arXiv preprint 2020. - diff --git a/_posts/2020-08-24-torchcsprng-release-blog.md b/_posts/2020-08-24-torchcsprng-release-blog.md deleted file mode 100644 index 868415269ed4..000000000000 --- a/_posts/2020-08-24-torchcsprng-release-blog.md +++ /dev/null @@ -1,82 +0,0 @@ ---- -layout: blog_detail -title: 'PyTorch framework for cryptographically secure random number generation, torchcsprng, now available' -author: Team PyTorch ---- - -One of the key components of modern cryptography is the pseudorandom number generator. Katz and Lindell stated, "The use of badly designed or inappropriate random number generators can often leave a good cryptosystem vulnerable to attack. Particular care must be taken to use a random number generator that is designed for cryptographic use, rather than a 'general-purpose' random number generator which may be fine for some applications but not ones that are required to be cryptographically secure."[1] Additionally, most pseudorandom number generators scale poorly to massively parallel high-performance computation because of their sequential nature. Others don’t satisfy cryptographically secure properties. - -[torchcsprng](https://github.com/pytorch/csprng) is a PyTorch [C++/CUDA extension](https://pytorch.org/tutorials/advanced/cpp_extension.html) that provides [cryptographically secure pseudorandom number generators](https://en.wikipedia.org/wiki/Cryptographically_secure_pseudorandom_number_generator) for PyTorch. - -## torchcsprng overview - -Historically, PyTorch had only two pseudorandom number generator implementations: Mersenne Twister for CPU and Nvidia’s cuRAND Philox for CUDA. Despite good performance properties, neither of them are suitable for cryptographic applications. Over the course of the past several months, the PyTorch team developed the torchcsprng extension API. Based on PyTorch dispatch mechanism and operator registration, it allows the users to extend c10::GeneratorImpl and implement their own custom pseudorandom number generator. 
- -torchcsprng generates a random 128-bit key on the CPU using one of its generators and then runs AES128 in [CTR mode](https://en.wikipedia.org/wiki/Block_cipher_mode_of_operation#Counter_(CTR)), on either the CPU or the GPU via CUDA, to produce a random 128-bit state; a transformation function then maps that state to the target tensor values. This approach is based on [Parallel Random Numbers: As Easy as 1, 2, 3 (John K. Salmon, Mark A. Moraes, Ron O. Dror, and David E. Shaw, D. E. Shaw Research)](http://www.thesalmons.org/john/random123/papers/random123sc11.pdf) and makes torchcsprng both crypto-secure and parallel on both CPU and CUDA.
        - -Since torchcsprng is a PyTorch extension, it is available on the platforms where PyTorch is available (support for Windows-CUDA will be available in the coming months). - -## Using torchcsprng - -The torchcsprng API is very simple to use and is fully compatible with the PyTorch random infrastructure: - -**Step 1: Install via binary distribution** - -Anaconda: - -```python -conda install torchcsprng -c pytorch - ``` - -pip: - -```python -pip install torchcsprng - ``` - -**Step 2: import packages as usual but add csprng** - -```python -import torch -import torchcsprng as csprng - ``` - -**Step 3: Create a cryptographically secure pseudorandom number generator from /dev/urandom:** - -```python -urandom_gen = csprng.create_random_device_generator('/dev/urandom') - ``` - -and simply use it with the existing PyTorch methods: - -```python -torch.randn(10, device='cpu', generator=urandom_gen) - ``` - -**Step 4: Test with Cuda** - -One of the advantages of torchcsprng generators is that they can be used with both CPU and CUDA tensors: - -```python -torch.randn(10, device='cuda', generator=urandom_gen) - ``` - -Another advantage of torchcsprng generators is that they are parallel on CPU unlike the default PyTorch CPU generator. - -## Getting Started - -The easiest way to get started with torchcsprng is by visiting the [GitHub page](https://github.com/pytorch/csprng) where you can find installation and build instructions, and more how-to examples. - -Cheers, - -The PyTorch Team - -[1] [Introduction to Modern Cryptography: Principles and Protocols (Chapman & Hall/CRC Cryptography and Network Security Series)](https://www.amazon.com/Introduction-Modern-Cryptography-Principles-Protocols/dp/1584885513) by Jonathan Katz and Yehuda Lindell - - - - diff --git a/_posts/2020-1-15-pytorch-1-dot-4-released-and-domain-libraries-updated.md b/_posts/2020-1-15-pytorch-1-dot-4-released-and-domain-libraries-updated.md deleted file mode 100644 index e55070202d16..000000000000 --- a/_posts/2020-1-15-pytorch-1-dot-4-released-and-domain-libraries-updated.md +++ /dev/null @@ -1,112 +0,0 @@ ---- -layout: blog_detail -title: 'PyTorch 1.4 released, domain libraries updated' -author: Team PyTorch ---- - -Today, we’re announcing the availability of PyTorch 1.4, along with updates to the PyTorch domain libraries. These releases build on top of the announcements from [NeurIPS 2019](https://pytorch.org/blog/pytorch-adds-new-tools-and-libraries-welcomes-preferred-networks-to-its-community/), where we shared the availability of PyTorch Elastic, a new classification framework for image and video, and the addition of Preferred Networks to the PyTorch community. For those that attended the workshops at NeurIPS, the content can be found [here](https://research.fb.com/neurips-2019-expo-workshops/). - -## PyTorch 1.4 - -The 1.4 release of PyTorch adds new capabilities, including the ability to do fine grain build level customization for PyTorch Mobile, and new experimental features including support for model parallel training and Java language bindings. - -### PyTorch Mobile - Build level customization - -Following the open sourcing of [PyTorch Mobile in the 1.3 release](https://pytorch.org/blog/pytorch-1-dot-3-adds-mobile-privacy-quantization-and-named-tensors/), PyTorch 1.4 adds additional mobile support including the ability to customize build scripts at a fine-grain level. 
This allows mobile developers to optimize library size by only including the operators used by their models and, in the process, reduce their on device footprint significantly. Initial results show that, for example, a customized MobileNetV2 is 40% to 50% smaller than the prebuilt PyTorch mobile library. You can learn more [here](https://pytorch.org/mobile/home/) about how to create your own custom builds and, as always, please engage with the community on the [PyTorch forums](https://discuss.pytorch.org/c/mobile) to provide any feedback you have. - -Example code snippet for selectively compiling only the operators needed for MobileNetV2: - -```python -# Dump list of operators used by MobileNetV2: -import torch, yaml -model = torch.jit.load('MobileNetV2.pt') -ops = torch.jit.export_opnames(model) -with open('MobileNetV2.yaml', 'w') as output: - yaml.dump(ops, output) -``` - -```console -# Build PyTorch Android library customized for MobileNetV2: -SELECTED_OP_LIST=MobileNetV2.yaml scripts/build_pytorch_android.sh arm64-v8a - -# Build PyTorch iOS library customized for MobileNetV2: -SELECTED_OP_LIST=MobileNetV2.yaml BUILD_PYTORCH_MOBILE=1 IOS_ARCH=arm64 scripts/build_ios.sh -``` - -### Distributed model parallel training (Experimental) - -With the scale of models, such as RoBERTa, continuing to increase into the billions of parameters, model parallel training has become ever more important to help researchers push the limits. This release provides a distributed RPC framework to support distributed model parallel training. It allows for running functions remotely and referencing remote objects without copying the real data around, and provides autograd and optimizer APIs to transparently run backwards and update parameters across RPC boundaries. - -To learn more about the APIs and the design of this feature, see the links below: - -* [API documentation](https://pytorch.org/docs/stable/rpc.html) - -For the full tutorials, see the links below: - -* [A full RPC tutorial](https://pytorch.org/tutorials/intermediate/rpc_tutorial.html) -* [Examples using model parallel training for reinforcement learning and with an LSTM](https://github.com/pytorch/examples/tree/master/distributed/rpc) - -As always, you can connect with community members and discuss more on the [forums](https://discuss.pytorch.org/c/distributed/distributed-rpc). - -### Java bindings (Experimental) - -In addition to supporting Python and C++, this release adds experimental support for Java bindings. Based on the interface developed for Android in PyTorch Mobile, the new bindings allow you to invoke TorchScript models from any Java program. Note that the Java bindings are only available for Linux for this release, and for inference only. We expect support to expand in subsequent releases. See the code snippet below for how to use PyTorch within Java: - -```java -Module mod = Module.load("demo-model.pt1"); -Tensor data = - Tensor.fromBlob( - new int[] {1, 2, 3, 4, 5, 6}, // data - new long[] {2, 3} // shape - ); -IValue result = mod.forward(IValue.from(data), IValue.from(3.0)); -Tensor output = result.toTensor(); -System.out.println("shape: " + Arrays.toString(output.shape())); -System.out.println("data: " + Arrays.toString(output.getDataAsFloatArray())); -``` - -Learn more about how to use PyTorch from Java [here](https://github.com/pytorch/java-demo), and see the full Javadocs API documentation [here](https://pytorch.org/javadoc/1.4.0/). - -For the full 1.4 release notes, see [here](https://github.com/pytorch/pytorch/releases). 
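Before moving on to the domain libraries, here is a minimal sketch of the core primitives of the experimental RPC framework described in the distributed model parallel training section above. The worker names and tensors are placeholders, and a second process that calls `init_rpc` as `"worker1"` is assumed:

```python
import torch
import torch.distributed.rpc as rpc

# On this process; the peer would call rpc.init_rpc("worker1", rank=1, world_size=2).
rpc.init_rpc("worker0", rank=0, world_size=2)

# Run a function on the remote worker and wait for the result.
result = rpc.rpc_sync("worker1", torch.add, args=(torch.ones(2), torch.ones(2)))

# Or keep the result on the remote worker and hold a remote reference (RRef) to it,
# fetching the value only when it is actually needed.
rref = rpc.remote("worker1", torch.add, args=(torch.ones(2), 3))
value = rref.to_here()

rpc.shutdown()
```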
- -## Domain Libraries - -PyTorch domain libraries like torchvision, torchtext, and torchaudio complement PyTorch with common datasets, models, and transforms. We’re excited to share new releases for all three domain libraries alongside the PyTorch 1.4 core release. - -### torchvision 0.5 - -The improvements to torchvision 0.5 mainly focus on adding support for production deployment including quantization, TorchScript, and ONNX. Some of the highlights include: - -* All models in torchvision are now torchscriptable making them easier to ship into non-Python production environments -* ResNets, MobileNet, ShuffleNet, GoogleNet and InceptionV3 now have quantized counterparts with pre-trained models, and also include scripts for quantization-aware training. -* In partnership with the Microsoft team, we’ve added ONNX support for all models including Mask R-CNN. - -Learn more about torchvision 0.5 [here](https://github.com/pytorch/vision/releases). - -### torchaudio 0.4 - -Improvements in torchaudio 0.4 focus on enhancing the currently available transformations, datasets, and backend support. Highlights include: - -* SoX is now optional, and a new extensible backend dispatch mechanism exposes SoundFile as an alternative to SoX. -* The interface for datasets has been unified. This enables the addition of two large datasets: LibriSpeech and Common Voice. -* New filters such as biquad, data augmentation such as time and frequency masking, transforms such as MFCC, gain and dither, and new feature computation such as deltas, are now available. -* Transformations now support batches and are jitable. -* An interactive speech recognition demo with voice activity detection is available for experimentation. - -Learn more about torchaudio 0.4 [here](https://github.com/pytorch/audio/releases). - -### torchtext 0.5 - -torchtext 0.5 focuses mainly on improvements to the dataset loader APIs, including compatibility with core PyTorch APIs, but also adds support for unsupervised text tokenization. Highlights include: - -* Added bindings for SentencePiece for unsupervised text tokenization . -* Added a new unsupervised learning dataset - enwik9. -* Made revisions to PennTreebank, WikiText103, WikiText2, IMDb to make them compatible with torch.utils.data. Those datasets are in an experimental folder and we welcome your feedback. - -Learn more about torchtext 0.5 [here](https://github.com/pytorch/text/releases). - -*We’d like to thank the entire PyTorch team and the community for all their contributions to this work.* - -Cheers! - -Team PyTorch diff --git a/_posts/2020-10-1-announcing-the-winners-of-the-2020-global-pytorch-summer-hackathon.md b/_posts/2020-10-1-announcing-the-winners-of-the-2020-global-pytorch-summer-hackathon.md deleted file mode 100644 index 76c7e05ef8f4..000000000000 --- a/_posts/2020-10-1-announcing-the-winners-of-the-2020-global-pytorch-summer-hackathon.md +++ /dev/null @@ -1,122 +0,0 @@ ---- -layout: blog_detail -title: 'Announcing the Winners of the 2020 Global PyTorch Summer Hackathon' -author: Team PyTorch ---- - -More than 2,500 participants in this year’s Global PyTorch Summer Hackathon pushed the envelope to create unique new tools and applications for PyTorch developers and researchers. - -
        - -***Notice**: None of the projects submitted to the hackathon are associated with or offered by Facebook, Inc.* - -This year’s projects fell into three categories: - -* **PyTorch Developer Tools:** a tool or library for improving productivity and efficiency for PyTorch researchers and developers. - -* **Web/Mobile Applications Powered by PyTorch:** a web or mobile interface and/or an embedded device built using PyTorch. - -* **PyTorch Responsible AI Development Tools:** a tool, library, or web/mobile app to support researchers and developers in creating responsible AI that factors in fairness, security, privacy, and more throughout its entire development process. - -The virtual hackathon ran from June 22 to August 25, with more than 2,500 registered participants, representing 114 countries from Republic of Azerbaijan, to Zimbabwe, to Japan, submitting a total of 106 projects. Entrants were judged on their idea’s quality, originality, potential impact, and how well they implemented it. - -Meet the winners of each category below. - -## PyTorch Developer Tools - -**1st place** - [DeMask](https://devpost.com/software/asteroid-the-pytorch-based-source-separation-toolkit) - -DeMask is an end-to-end model for enhancing speech while wearing face masks — offering a clear benefit during times when face masks are mandatory in many spaces and for workers who wear face masks on the job. Built with [Asteroid](https://github.com/mpariente/asteroid), a PyTorch-based audio source separation toolkit, DeMask is trained to recognize distortions in speech created by the muffling from face masks and to adjust the speech to make it sound clearer. - -This submission stood out in particular because it represents both a high-quality idea and an implementation that can be reproduced by other researchers. - -Here is an example on how to train a speech separation model in less than 20 lines: - -```python -from torch import optim -from pytorch_lightning import Trainer - -from asteroid import ConvTasNet -from asteroid.losses import PITLossWrapper -from asteroid.data import LibriMix -from asteroid.engine import System - -train_loader, val_loader = LibriMix.loaders_from_mini(task='sep_clean', batch_size=4) -model = ConvTasNet(n_src=2) -optimizer = optim.Adam(model.parameters(), lr=1e-3) -loss = PITLossWrapper( - lambda x, y: (x - y).pow(2).mean(-1), # MSE - pit_from="pw_pt", # Point in the pairwise matrix. -) - -system = System(model, optimizer, loss, train_loader, val_loader) - -trainer = Trainer(fast_dev_run=True) -trainer.fit(system) -``` - -**2nd place** - [carefree-learn](https://devpost.com/software/carefree-learn) - -A PyTorch-based automated machine learning (AutoML) solution, carefree-learn provides high-level APIs to make training models using tabular data sets simpler. It features an interface similar to [scikit-learn](https://scikit-learn.org/stable/) and functions as an end-to-end end pipeline for tabular data sets. It automatically detects feature column types and redundant feature columns, imputes missing values, encodes string columns and categorical columns, and preprocesses numerical columns, among other features. - -**3rd Place** - [TorchExpo](https://devpost.com/software/torchexpo) - -TorchExpo is a collection of models and extensions that simplifies taking PyTorch from research to production in mobile devices. This library is more than a web and mobile application, and also comes with a Python library. 
The Python library is available via pip install and it helps researchers convert a state-of-the-art model in TorchScript and ONNX format in just one line. - -## Web/Mobile Applications Powered by PyTorch - -**1st place** - [Q&Aid](https://devpost.com/software/pytorchxai) - -Q&Aid is a conceptual health-care chatbot aimed at making health-care diagnoses and facilitating communication between patients and doctors. It relies on a series of machine learning models to filter, label, and answer medical questions, based on a medical image and/or questions in text provided by a patient. The transcripts from the chat app then can be forwarded to the local hospitals and the patient will be contacted by one of them to make an appointment to determine proper diagnosis and care. The team hopes that this concept application helps hospitals to work with patients more efficiently and provide proper care. - -
        - -**2nd place** - [Rasoee](https://devpost.com/software/groundwav) - -Rasoee is an application that can take images as input and output the name of the dish. It also lists the ingredients and recipe, along with the link to the original recipe online. Additionally, users can choose a cuisine from the list of cuisines in the drop menu, and describe the taste and/or method of preparation in text. Then the application will return matching dishes from the [list of 308 identifiable dishes](https://github.com/arijitgupta42/Rasoee/blob/master/Dishes.txt). The team has put a significant amount of effort gathering and cleaning various datasets to build more accurate and comprehensive models. You can check out the application [here](https://rasoee.herokuapp.com). - -**3rd place** - [Rexana the Robot — PyTorch](https://devpost.com/software/rexana-the-robot) - -Rexana is an AI voice assistant meant to lay the foundation for a physical robot that can complete basic tasks around the house. The system is capable of autonomous navigation (knowing its position around the house relative to landmarks), recognizing voice commands, and object detection and recognition — meaning it can be commanded to perform various household tasks (e.g., "Rexana, water the potted plant in the lounge room.”). Rexana can be controlled remotely via a mobile device, and the robot itself features customizable hands (magnets, grippers, etc.) for taking on different jobs. - -## PyTorch Responsible AI Development Tools - -**1st place**: [FairTorch](https://devpost.com/software/a-qeysp1) - -FairTorch is a fairness library for PyTorch. It lets developers add constraints to their models to equalize metrics across subgroups by simply adding a few lines of code. Model builders can choose a metric definition of fairness for their context, and enforce it at time of training. The library offers a suite of metrics that measure an AI system’s performance among subgroups, and can apply to high-stakes examples where decision-making algorithms are deployed, such as hiring, school admissions, and banking. - - - -**2nd place**: [Fluence](https://devpost.com/software/fluence-5g2s9m) - -Fluence is a PyTorch-based deep learning library for language research. It specifically addresses the large compute demands of natural language processing (NLP) research. Fluence aims to provide low-resource and computationally efficient algorithms for NLP, giving researchers algorithms that can enhance current NLP methods or help discover where current methods fall short. - -**3rd place**: [Causing: CAUSal INterpretation using Graphs](https://devpost.com/software/realrate-explainable-ai-for-company-ratings) - -Causing (CAUSal INterpretation using Graphs) is a multivariate graphic analysis tool for bringing transparency to neural networks. It explains causality and helps researchers and developers interpret the causal effects of a given equation system to ensure fairness. Developers can input data and a model describing the dependencies between the variables within the data set into Causing, and Causing will output a colored graph of quantified effects acting between the model’s variables. In addition, it also allows developers to estimate these effects to validate whether data fits a model. 
- -Thank you, - -**The PyTorch team** - - - - - - - - - - - - - - diff --git a/_posts/2020-10-27-pytorch-1.7-released.md b/_posts/2020-10-27-pytorch-1.7-released.md deleted file mode 100644 index 766ed9889ef6..000000000000 --- a/_posts/2020-10-27-pytorch-1.7-released.md +++ /dev/null @@ -1,293 +0,0 @@ ---- -layout: blog_detail -title: 'PyTorch 1.7 released w/ CUDA 11, New APIs for FFTs, Windows support for Distributed training and more' -author: Team PyTorch ---- - -Today, we’re announcing the availability of PyTorch 1.7, along with updated domain libraries. The PyTorch 1.7 release includes a number of new APIs including support for NumPy-Compatible FFT operations, profiling tools and major updates to both distributed data parallel (DDP) and remote procedure call (RPC) based distributed training. In addition, several features moved to [stable](https://pytorch.org/docs/stable/index.html#pytorch-documentation) including custom C++ Classes, the memory profiler, extensions via custom tensor-like objects, user async functions in RPC and a number of other features in torch.distributed such as Per-RPC timeout, DDP dynamic bucketing and RRef helper. - -A few of the highlights include: -* CUDA 11 is now officially supported with binaries available at [PyTorch.org](http://pytorch.org/) -* Updates and additions to profiling and performance for RPC, TorchScript and Stack traces in the autograd profiler -* (Beta) Support for NumPy compatible Fast Fourier transforms (FFT) via torch.fft -* (Prototype) Support for Nvidia A100 generation GPUs and native TF32 format -* (Prototype) Distributed training on Windows now supported -* torchvision - * (Stable) Transforms now support Tensor inputs, batch computation, GPU, and TorchScript - * (Stable) Native image I/O for JPEG and PNG formats - * (Beta) New Video Reader API -* torchaudio - * (Stable) Added support for speech rec (wav2letter), text to speech (WaveRNN) and source separation (ConvTasNet) - -To reiterate, starting PyTorch 1.6, features are now classified as stable, beta and prototype. You can see the detailed announcement [here](https://pytorch.org/blog/pytorch-feature-classification-changes/). Note that the prototype features listed in this blog are available as part of this release. - -Find the full release notes [here](https://github.com/pytorch/pytorch/releases). - -# Front End APIs -## [Beta] NumPy Compatible torch.fft module -FFT-related functionality is commonly used in a variety of scientific fields like signal processing. While PyTorch has historically supported a few FFT-related functions, the 1.7 release adds a new torch.fft module that implements FFT-related functions with the same API as NumPy. - -This new module must be imported to be used in the 1.7 release, since its name conflicts with the historic (and now deprecated) torch.fft function. - -**Example usage:** -```python ->>> import torch.fft ->>> t = torch.arange(4) ->>> t -tensor([0, 1, 2, 3]) - ->>> torch.fft.fft(t) -tensor([ 6.+0.j, -2.+2.j, -2.+0.j, -2.-2.j]) - ->>> t = tensor([0.+1.j, 2.+3.j, 4.+5.j, 6.+7.j]) ->>> torch.fft.fft(t) -tensor([12.+16.j, -8.+0.j, -4.-4.j, 0.-8.j]) - ``` - -* [Documentation](https://pytorch.org/docs/stable/fft.html#torch-fft) - -## [Beta] C++ Support for Transformer NN Modules -Since [PyTorch 1.5](https://pytorch.org/blog/pytorch-1-dot-5-released-with-new-and-updated-apis/), we’ve continued to maintain parity between the python and C++ frontend APIs. This update allows developers to use the nn.transformer module abstraction from the C++ Frontend. 
Moreover, developers no longer need to save a module from Python/JIT and load it into C++, as it can now be used in C++ directly. -* [Documentation](https://pytorch.org/cppdocs/api/classtorch_1_1nn_1_1_transformer_impl.html#_CPPv4N5torch2nn15TransformerImplE) - -## [Beta] torch.set_deterministic -Reproducibility (bit-for-bit determinism) may help identify errors when debugging or testing a program. To facilitate reproducibility, PyTorch 1.7 adds the ```torch.set_deterministic(bool)``` function that can direct PyTorch operators to select deterministic algorithms when available, and to throw a runtime error if an operation may result in nondeterministic behavior. By default, the flag this function controls is false and there is no change in behavior, meaning PyTorch may implement its operations nondeterministically by default. - -More precisely, when this flag is true: -* Operations known to not have a deterministic implementation throw a runtime error; -* Operations with deterministic variants use those variants (usually with a performance penalty versus the non-deterministic version); and -* ```torch.backends.cudnn.deterministic = True``` is set. - -Note that this is necessary, **but not sufficient**, for determinism **within a single run of a PyTorch program**. Other sources of randomness like random number generators, unknown operations, or asynchronous or distributed computation may still cause nondeterministic behavior. - -See the documentation for ```torch.set_deterministic(bool)``` for the list of affected operations. -* [RFC](https://github.com/pytorch/pytorch/issues/15359) - -# Performance & Profiling -## [Beta] Stack traces added to profiler -Users can now see not only the operator name/inputs in the profiler output table but also where the operator is in the code. The workflow requires very little change to take advantage of this capability. The user uses the [autograd profiler](https://pytorch.org/docs/stable/autograd.html#profiler) as before but with optional new parameters: ```with_stack``` and ```group_by_stack_n```. Caution: regular profiling runs should not use this feature as it adds significant overhead. -* [Detail](https://github.com/pytorch/pytorch/pull/43898/) -* [Documentation](https://pytorch.org/docs/stable/autograd.html) - -# Distributed Training & RPC -## [Stable] TorchElastic now bundled into PyTorch docker image -TorchElastic offers a strict superset of the current ```torch.distributed.launch``` CLI, with added features for fault tolerance and elasticity. If the user is not interested in fault tolerance, they can get exact functionality/behavior parity by setting ```max_restarts=0```, with the added convenience of auto-assigned ```RANK``` and ```MASTER_ADDR|PORT``` (versus specifying them manually in ```torch.distributed.launch```). - -By bundling ```torchelastic``` in the same docker image as PyTorch, users can start experimenting with TorchElastic right away without having to separately install ```torchelastic```. In addition to convenience, this work also helps when adding support for elastic parameters in Kubeflow's existing distributed PyTorch operators. -* [Usage examples and how to get started](https://pytorch.org/elastic/0.2.0/examples.html) - -## [Beta] Support for uneven dataset inputs in DDP -PyTorch 1.7 introduces a new context manager to be used in conjunction with models trained using ```torch.nn.parallel.DistributedDataParallel``` to enable training with uneven dataset sizes across different processes.
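For illustration, here is a minimal sketch of how the new ```join()``` context manager might be used; the process-group setup, model, and per-rank batch counts below are illustrative assumptions, not taken from the release notes:

```python
import torch
import torch.distributed as dist
from torch.nn.parallel import DistributedDataParallel as DDP

def train(rank, world_size):
    # Assumed setup: one process per GPU, launched with the usual env vars.
    dist.init_process_group("nccl", rank=rank, world_size=world_size)
    model = DDP(torch.nn.Linear(8, 1).to(rank), device_ids=[rank])
    opt = torch.optim.SGD(model.parameters(), lr=0.01)

    # Deliberately uneven: each rank gets a different number of batches.
    batches = [torch.randn(4, 8).to(rank) for _ in range(10 + rank)]

    # join() shadows collective communication for ranks that finish early,
    # so no process hangs at the end of the loop.
    with model.join():
        for batch in batches:
            opt.zero_grad()
            model(batch).sum().backward()
            opt.step()
```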
This feature enables greater flexibility when using DDP and prevents the user from having to manually ensure dataset sizes are the same across different processes. With this context manager, DDP will handle uneven dataset sizes automatically, which can prevent errors or hangs at the end of training. -* [RFC](https://github.com/pytorch/pytorch/issues/38174) -* [Documentation](https://pytorch.org/docs/stable/generated/torch.nn.parallel.DistributedDataParallel.html#torch.nn.parallel.DistributedDataParallel.join) - -## [Beta] NCCL Reliability - Async Error/Timeout Handling -In the past, NCCL training runs would hang indefinitely due to stuck collectives, leading to a very unpleasant experience for users. This feature will abort stuck collectives and throw an exception/crash the process if a potential hang is detected. When used with something like torchelastic (which can recover the training process from the last checkpoint), users can have much greater reliability for distributed training. This feature is completely opt-in and sits behind an environment variable that needs to be explicitly set in order to enable this functionality (otherwise users will see the same behavior as before). -* [RFC](https://github.com/pytorch/pytorch/issues/46874) -* [Documentation](https://pytorch.org/docs/stable/distributed.html?highlight=init_process_group#torch.distributed.init_process_group) - -## [Beta] TorchScript ```rpc_remote``` and ```rpc_sync``` -```torch.distributed.rpc.rpc_async``` has been available in TorchScript in prior releases. For PyTorch 1.7, this functionality is extended to the remaining two core RPC APIs, ```torch.distributed.rpc.rpc_sync``` and ```torch.distributed.rpc.remote```. This completes the major RPC APIs targeted for support in TorchScript; it allows users to use the existing Python RPC APIs within TorchScript (in a script function or script method, which releases the Python Global Interpreter Lock) and could possibly improve application performance in multithreaded environments. -* [Documentation](https://pytorch.org/docs/stable/rpc.html#rpc) -* [Usage examples](https://github.com/pytorch/pytorch/blob/58ed60c259834e324e86f3e3118e4fcbbfea8dd1/torch/testing/_internal/distributed/rpc/jit/rpc_test.py#L505-L525) - -## [Beta] Distributed optimizer with TorchScript support -PyTorch provides a broad set of optimizers for training algorithms, and these have been used repeatedly as part of the Python API. However, users often want to use multithreaded training instead of multiprocess training, as it provides better resource utilization and efficiency in the context of large scale distributed training (e.g., distributed model parallel) or any RPC-based training application. Users couldn't do this with the distributed optimizer before because the Python Global Interpreter Lock (GIL) limitation had to be removed to achieve it. - -In PyTorch 1.7, we are enabling TorchScript support in the distributed optimizer to remove the GIL and make it possible to run the optimizer in multithreaded applications. The new distributed optimizer has the exact same interface as before, but it automatically converts optimizers within each worker into TorchScript to make each of them GIL-free. This is done by leveraging a functional optimizer concept and allowing the distributed optimizer to convert the computational portion of the optimizer into TorchScript. This will help use cases like distributed model parallel training and improve performance using multithreading.
- -Currently, the only optimizer that supports automatic conversion with TorchScript is ```Adagrad``` and all other optimizers will still work as before without TorchScript support. We are working on expanding the coverage to all PyTorch optimizers and expect more to come in future releases. The usage to enable TorchScript support is automatic and exactly the same with existing python APIs, here is an example of how to use this: - -```python -import torch.distributed.autograd as dist_autograd -import torch.distributed.rpc as rpc -from torch import optim -from torch.distributed.optim import DistributedOptimizer - -with dist_autograd.context() as context_id: - # Forward pass. - rref1 = rpc.remote("worker1", torch.add, args=(torch.ones(2), 3)) - rref2 = rpc.remote("worker1", torch.add, args=(torch.ones(2), 1)) - loss = rref1.to_here() + rref2.to_here() - - # Backward pass. - dist_autograd.backward(context_id, [loss.sum()]) - - # Optimizer, pass in optim.Adagrad, DistributedOptimizer will - # automatically convert/compile it to TorchScript (GIL-free) - dist_optim = DistributedOptimizer( - optim.Adagrad, - [rref1, rref2], - lr=0.05, - ) - dist_optim.step(context_id) - ``` -* [RFC](https://github.com/pytorch/pytorch/issues/46883) -* [Documentation](https://pytorch.org/docs/stable/rpc.html#module-torch.distributed.optim) - -## [Beta] Enhancements to RPC-based Profiling -Support for using the PyTorch profiler in conjunction with the RPC framework was first introduced in PyTorch 1.6. In PyTorch 1.7, the following enhancements have been made: -* Implemented better support for profiling TorchScript functions over RPC -* Achieved parity in terms of profiler features that work with RPC -* Added support for asynchronous RPC functions on the server-side (functions decorated with ```rpc.functions.async_execution)```. - -Users are now able to use familiar profiling tools such as with ```torch.autograd.profiler.profile()``` and ```with torch.autograd.profiler.record_function```, and this works transparently with the RPC framework with full feature support, profiles asynchronous functions, and TorchScript functions. -* [Design doc](https://github.com/pytorch/pytorch/issues/39675) -* [Usage examples](https://pytorch.org/tutorials/recipes/distributed_rpc_profiling.html) - -## [Prototype] Windows support for Distributed Training -PyTorch 1.7 brings prototype support for ```DistributedDataParallel``` and collective communications on the Windows platform. In this release, the support only covers Gloo-based ```ProcessGroup``` and ```FileStore```. - -To use this feature across multiple machines, please provide a file from a shared file system in ```init_process_group```. - -```python -# initialize the process group -dist.init_process_group( - "gloo", - # multi-machine example: - # init_method = "file://////{machine}/{share_folder}/file" - init_method="file:///{your local file path}", - rank=rank, - world_size=world_size -) - -model = DistributedDataParallel(local_model, device_ids=[rank]) -``` -* [Design doc](https://github.com/pytorch/pytorch/issues/42095) -* [Documentation](https://pytorch.org/docs/master/distributed.html#backends-that-come-with-pytorch) -* Acknowledgement ([gunandrose4u](https://github.com/gunandrose4u)) - -# Mobile -PyTorch Mobile supports both [iOS](https://pytorch.org/mobile/ios) and [Android](https://pytorch.org/mobile/android/) with binary packages available in [Cocoapods](https://cocoapods.org/) and [JCenter](https://mvnrepository.com/repos/jcenter) respectively. 
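On the Python side, models are typically exported to TorchScript before the iOS/Android runtimes load them. A minimal sketch of that export step follows; the model choice, the optional mobile optimization pass, and the file name are illustrative assumptions, not prescribed by this release:

```python
import torch
import torchvision
from torch.utils.mobile_optimizer import optimize_for_mobile

# Trace a model and apply mobile-oriented optimization passes before saving
# the TorchScript artifact that an iOS/Android app would bundle and load.
model = torchvision.models.mobilenet_v2(pretrained=True).eval()
example = torch.rand(1, 3, 224, 224)
traced = torch.jit.trace(model, example)
optimized = optimize_for_mobile(traced)
optimized.save("mobilenet_v2_mobile.pt")
```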
You can learn more about PyTorch Mobile [here](https://pytorch.org/mobile/home/). - -## [Beta] PyTorch Mobile Caching allocator for performance improvements -On some mobile platforms, such as Pixel, we observed that memory is returned to the system more aggressively. This results in frequent page faults, as PyTorch, being a functional framework, does not maintain state for the operators; thus, for most ops, outputs are allocated dynamically on each execution of the op. To ameliorate the performance penalties due to this, PyTorch 1.7 provides a simple caching allocator for CPU. The allocator caches allocations by tensor size and is currently available only via the PyTorch C++ API. The caching allocator itself is owned by the client, and thus its lifetime is also maintained by client code. Such a client-owned caching allocator can then be used with the scoped guard ```c10::WithCPUCachingAllocatorGuard``` to enable the use of cached allocations within that scope. -**Example usage:** - -```cpp -#include <c10/mobile/CPUCachingAllocator.h> -..... -c10::CPUCachingAllocator caching_allocator; - // Owned by client code. Can be a member of some client class so as to tie - // the lifetime of the caching allocator to that of the class. -..... -{ - c10::optional<c10::WithCPUCachingAllocatorGuard> caching_allocator_guard; - if (FLAGS_use_caching_allocator) { - caching_allocator_guard.emplace(&caching_allocator); - } - .... - model.forward(..); -} -... -``` -**NOTE**: The caching allocator is only available in mobile builds; using it outside of mobile builds won't be effective. -* [Documentation](https://github.com/pytorch/pytorch/blob/master/c10/mobile/CPUCachingAllocator.h#L13-L43) -* [Usage examples](https://github.com/pytorch/pytorch/blob/master/binaries/speed_benchmark_torch.cc#L207) - -# torchvision -## [Stable] Transforms now support Tensor inputs, batch computation, GPU, and TorchScript -torchvision transforms now inherit from ```nn.Module``` and can be torchscripted and applied on torch Tensor inputs as well as on PIL images. They also support Tensors with batch dimensions and work seamlessly on CPU/GPU devices: -```python -import torch -import torchvision.transforms as T - -# to fix random seed, use torch.manual_seed -# instead of random.seed -torch.manual_seed(12) - -transforms = torch.nn.Sequential( - T.RandomCrop(224), - T.RandomHorizontalFlip(p=0.3), - T.ConvertImageDtype(torch.float), - T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]) -) -scripted_transforms = torch.jit.script(transforms) -# Note: we can similarly use T.Compose to define transforms -# transforms = T.Compose([...]) and -# scripted_transforms = torch.jit.script(torch.nn.Sequential(*transforms.transforms)) - -tensor_image = torch.randint(0, 256, size=(3, 256, 256), dtype=torch.uint8) -# works directly on Tensors -out_image1 = transforms(tensor_image) -# on the GPU -out_image1_cuda = transforms(tensor_image.cuda()) -# with batches -batched_image = torch.randint(0, 256, size=(4, 3, 256, 256), dtype=torch.uint8) -out_image_batched = transforms(batched_image) -# and has torchscript support -out_image2 = scripted_transforms(tensor_image) -``` -These improvements enable the following new features: -* support for GPU acceleration -* batched transformations e.g.
as needed for videos -* transform multi-band torch tensor images (with more than 3-4 channels) -* torchscript transforms together with your model for deployment -**Note:** Exceptions for TorchScript support includes ```Compose```, ```RandomChoice```, ```RandomOrder```, ```Lambda``` and those applied on PIL images, such as ```ToPILImage```. - -## [Stable] Native image IO for JPEG and PNG formats -torchvision 0.8.0 introduces native image reading and writing operations for JPEG and PNG formats. Those operators support TorchScript and return ```CxHxW``` tensors in ```uint8``` format, and can thus be now part of your model for deployment in C++ environments. -```python -from torchvision.io import read_image - -# tensor_image is a CxHxW uint8 Tensor -tensor_image = read_image('path_to_image.jpeg') - -# or equivalently -from torchvision.io import read_file, decode_image -# raw_data is a 1d uint8 Tensor with the raw bytes -raw_data = read_file('path_to_image.jpeg') -tensor_image = decode_image(raw_data) - -# all operators are torchscriptable and can be -# serialized together with your model torchscript code -scripted_read_image = torch.jit.script(read_image) -``` -## [Stable] RetinaNet detection model -This release adds pretrained models for RetinaNet with a ResNet50 backbone from [Focal Loss for Dense Object Detection](https://arxiv.org/abs/1708.02002). - -## [Beta] New Video Reader API -This release introduces a new video reading abstraction, which gives more fine-grained control of iteration over videos. It supports image and audio, and implements an iterator interface so that it is interoperable with other the python libraries such as itertools. -```python -from torchvision.io import VideoReader - -# stream indicates if reading from audio or video -reader = VideoReader('path_to_video.mp4', stream='video') -# can change the stream after construction -# via reader.set_current_stream - -# to read all frames in a video starting at 2 seconds -for frame in reader.seek(2): - # frame is a dict with "data" and "pts" metadata - print(frame["data"], frame["pts"]) - -# because reader is an iterator you can combine it with -# itertools -from itertools import takewhile, islice -# read 10 frames starting from 2 seconds -for frame in islice(reader.seek(2), 10): - pass - -# or to return all frames between 2 and 5 seconds -for frame in takewhile(lambda x: x["pts"] < 5, reader): - pass -``` -**Notes:** -* In order to use the Video Reader API beta, you must compile torchvision from source and have ffmpeg installed in your system. -* The VideoReader API is currently released as beta and its API may change following user feedback. - -# torchaudio -With this release, torchaudio is expanding its support for models and [end-to-end applications](https://github.com/pytorch/audio/tree/master/examples), adding a wav2letter training pipeline and end-to-end text-to-speech and source separation pipelines. Please file an issue on [github](https://github.com/pytorch/audio/issues/new?template=questions-help-support.md) to provide feedback on them. - -## [Stable] Speech Recognition -Building on the addition of the wav2letter model for speech recognition in the last release, we’ve now added an [example wav2letter training pipeline](https://github.com/pytorch/audio/tree/master/examples/pipeline_wav2letter) with the LibriSpeech dataset. 
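As a rough sketch of the pieces involved, the bundled model and dataset can be exercised as below; the dataset location, label-set size, and input handling are illustrative assumptions, not the actual training pipeline:

```python
import torch
import torchaudio

# Grab one utterance from LibriSpeech and push it through the Wav2Letter model.
dataset = torchaudio.datasets.LIBRISPEECH("./data", url="test-clean", download=True)
waveform, sample_rate, transcript, *_ = dataset[0]

# num_classes is the size of the output label set (e.g., characters + blank).
model = torchaudio.models.Wav2Letter(num_classes=40, input_type="waveform")
model.eval()

with torch.no_grad():
    emissions = model(waveform.unsqueeze(0))  # shape: (batch, num_classes, time)
print(emissions.shape)
```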
- -## [Stable] Text-to-speech -With the goal of supporting text-to-speech applications, we added a vocoder based on the WaveRNN model, based on the implementation from [this repository](https://github.com/fatchord/WaveRNN). The original implementation was introduced in "Efficient Neural Audio Synthesis". We also provide an [example WaveRNN training pipeline](https://github.com/pytorch/audio/tree/master/examples/pipeline_wavernn) that uses the LibriTTS dataset added to torchaudio in this release. - -## [Stable] Source Separation -With the addition of the ConvTasNet model, based on the paper "Conv-TasNet: Surpassing Ideal Time-Frequency Magnitude Masking for Speech Separation," torchaudio now also supports source separation. An [example ConvTasNet training pipeline](https://github.com/pytorch/audio/tree/master/examples/source_separation) is provided with the wsj-mix dataset. - -Cheers! - -Team PyTorch diff --git a/_posts/2020-11-1-pytorch-developer-day-2020.md b/_posts/2020-11-1-pytorch-developer-day-2020.md deleted file mode 100644 index c68bafaec0d1..000000000000 --- a/_posts/2020-11-1-pytorch-developer-day-2020.md +++ /dev/null @@ -1,24 +0,0 @@ ---- -layout: blog_detail -title: 'Announcing PyTorch Developer Day 2020' -author: Team PyTorch ---- - -Starting this year, we plan to host two separate events for PyTorch: one for developers and users to discuss core technical development, ideas and roadmaps called **“Developer Day”**, and another for the PyTorch ecosystem and industry communities to showcase their work and discover opportunities to collaborate called **“Ecosystem Day”** (scheduled for early 2021). - -
        - -
        - -The **PyTorch Developer Day** (#PTD2) is kicking off on November 12, 2020, 8AM PST with a full day of technical talks on a variety of topics, including updates to the core framework, new tools and libraries to support development across a variety of domains. You'll also see talks covering the latest research around systems and tooling in ML. - -For Developer Day, we have an online networking event limited to people composed of PyTorch maintainers and contributors, long-time stakeholders and experts in areas relevant to PyTorch’s future. Conversations from the networking event will strongly shape the future of PyTorch. Hence, invitations are required to attend the networking event. - -All talks will be livestreamed and available to the public. -* [Livestream event page](https://www.facebook.com/events/802177440559164/) - -Visit the event website to learn more. We look forward to welcoming you to PyTorch Developer Day on November 12th! - -Thank you, - -The PyTorch team diff --git a/_posts/2020-11-12-prototype-features-now-available-apis-for-hardware-accelerated-mobile-and-arm64-builds.md b/_posts/2020-11-12-prototype-features-now-available-apis-for-hardware-accelerated-mobile-and-arm64-builds.md deleted file mode 100644 index c5c16358c0b2..000000000000 --- a/_posts/2020-11-12-prototype-features-now-available-apis-for-hardware-accelerated-mobile-and-arm64-builds.md +++ /dev/null @@ -1,41 +0,0 @@ ---- -layout: blog_detail -title: 'Prototype Features Now Available - APIs for Hardware Accelerated Mobile and ARM64 Builds' -author: Team PyTorch ---- - -Today, we are announcing four PyTorch prototype features. The first three of these will enable Mobile machine-learning developers to execute models on the full set of hardware (HW) engines making up a system-on-chip (SOC). This gives developers options to optimize their model execution for unique performance, power, and system-level concurrency. - -These features include enabling execution on the following on-device HW engines: -* DSP and NPUs using the Android Neural Networks API (NNAPI), developed in collaboration with Google -* GPU execution on Android via Vulkan -* GPU execution on iOS via Metal - -This release also includes developer efficiency benefits with newly introduced support for ARM64 builds for Linux. - -Below, you’ll find brief descriptions of each feature with the links to get you started. These features are available through our [nightly builds](https://pytorch.org/). Reach out to us on the [PyTorch Forums](https://discuss.pytorch.org/) for any comment or feedback. We would love to get your feedback on those and hear how you are using them! - -## NNAPI Support with Google Android - -The Google Android and PyTorch teams collaborated to enable support for Android’s Neural Networks API (NNAPI) via PyTorch Mobile. Developers can now unlock high-performance execution on Android phones as their machine-learning models will be able to access additional hardware blocks on the phone’s system-on-chip. NNAPI allows Android apps to run computationally intensive neural networks on the most powerful and efficient parts of the chips that power mobile phones, including DSPs (Digital Signal Processors) and NPUs (specialized Neural Processing Units). The API was introduced in Android 8 (Oreo) and significantly expanded in Android 10 and 11 to support a richer set of AI models. With this integration, developers can now seamlessly access NNAPI directly from PyTorch Mobile. 
This initial release includes fully-functional support for a core set of features and operators, and Google and Facebook will be working to expand capabilities in the coming months. - -**Links** -* [Android Blog: Android Neural Networks API 1.3 and PyTorch Mobile support](https://android-developers.googleblog.com/2020/11/android-neural-networks-api-13.html) -* [PyTorch Medium Blog: Support for Android NNAPI with PyTorch Mobile](http://bit.ly/android-nnapi-pytorch-mobile-announcement) - -## PyTorch Mobile GPU support - -Inferencing on GPU can provide great performance on many models types, especially those utilizing high-precision floating-point math. Leveraging the GPU for ML model execution as those found in SOCs from Qualcomm, Mediatek, and Apple allows for CPU-offload, freeing up the Mobile CPU for non-ML use cases. This initial prototype level support provided for on device GPUs is via the Metal API specification for iOS, and the Vulkan API specification for Android. As this feature is in an early stage: performance is not optimized and model coverage is limited. We expect this to improve significantly over the course of 2021 and would like to hear from you which models and devices you would like to see performance improvements on. - -**Links** -* [Prototype source workflows](https://github.com/pytorch/tutorials/tree/master/prototype_source) - -## ARM64 Builds for Linux - -We will now provide prototype level PyTorch builds for ARM64 devices on Linux. As we see more ARM usage in our community with platforms such as Raspberry Pis and Graviton(2) instances spanning both at the edge and on servers respectively. This feature is available through our [nightly builds](https://pytorch.org/). - -We value your feedback on these features and look forward to collaborating with you to continuously improve them further! - -Thank you, - -Team PyTorch diff --git a/_posts/2020-3-26-introduction-to-quantization-on-pytorch.md b/_posts/2020-3-26-introduction-to-quantization-on-pytorch.md deleted file mode 100644 index a23bdc353b4b..000000000000 --- a/_posts/2020-3-26-introduction-to-quantization-on-pytorch.md +++ /dev/null @@ -1,286 +0,0 @@ ---- -layout: blog_detail -title: 'Introduction to Quantization on PyTorch' -author: Raghuraman Krishnamoorthi, James Reed, Min Ni, Chris Gottbrath, and Seth Weidman ---- - -It’s important to make efficient use of both server-side and on-device compute resources when developing machine learning applications. To support more efficient deployment on servers and edge devices, PyTorch added a support for model quantization using the familiar eager mode Python API. - -Quantization leverages 8bit integer (int8) instructions to reduce the model size and run the inference faster (reduced latency) and can be the difference between a model achieving quality of service goals or even fitting into the resources available on a mobile device. Even when resources aren’t quite so constrained it may enable you to deploy a larger and more accurate model. Quantization is available in PyTorch starting in version 1.3 and with the release of PyTorch 1.4 we published quantized models for ResNet, ResNext, MobileNetV2, GoogleNet, InceptionV3 and ShuffleNetV2 in the PyTorch torchvision 0.5 library. - -This blog post provides an overview of the quantization support on PyTorch and its incorporation with the TorchVision domain library. 
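For instance, the pre-trained quantized torchvision models mentioned above can be loaded and used for int8 inference directly. A minimal sketch, where the model choice and random input are purely illustrative:

```python
import torch
import torchvision

# Load one of the published quantized models (int8 weights) and run inference.
model = torchvision.models.quantization.resnet18(pretrained=True, quantize=True)
model.eval()

x = torch.rand(1, 3, 224, 224)
with torch.no_grad():
    logits = model(x)
print(logits.argmax(dim=1))
```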
- -## **What is Quantization?** - -Quantization refers to techniques for doing both computations and memory accesses with lower precision data, usually int8 compared to floating point implementations. This enables performance gains in several important areas: -* 4x reduction in model size; -* 2-4x reduction in memory bandwidth; -* 2-4x faster inference due to savings in memory bandwidth and faster compute with int8 arithmetic (the exact speed-up varies depending on the hardware, the runtime, and the model). - -Quantization does not, however, come without additional cost. Fundamentally, quantization means introducing approximations, and the resulting networks have slightly less accuracy. These techniques attempt to minimize the gap between the full floating point accuracy and the quantized accuracy. - -We designed quantization to fit into the PyTorch framework. This means that: -1. PyTorch has data types corresponding to [quantized tensors](https://github.com/pytorch/pytorch/wiki/Introducing-Quantized-Tensor), which share many of the features of tensors. -2. One can write kernels with quantized tensors, much like kernels for floating point tensors to customize their implementation. PyTorch supports quantized modules for common operations as part of the `torch.nn.quantized` and `torch.nn.quantized.dynamic` name-spaces. -3. Quantization is compatible with the rest of PyTorch: quantized models are traceable and scriptable. The quantization method is virtually identical for both server and mobile backends. One can easily mix quantized and floating point operations in a model. -4. Mapping of floating point tensors to quantized tensors is customizable with user defined observer/fake-quantization blocks. PyTorch provides default implementations that should work for most use cases.
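As a small illustration of the quantized tensor type mentioned in point 1, a float tensor can be quantized explicitly; the scale and zero point below are arbitrary values chosen for the example:

```python
import torch

# Quantize a float tensor to 8-bit with an explicit scale and zero point,
# inspect the underlying integer representation, and dequantize it back.
x = torch.tensor([-1.0, 0.0, 1.0, 2.0])
qx = torch.quantize_per_tensor(x, scale=0.1, zero_point=10, dtype=torch.quint8)

print(qx)               # quantized tensor
print(qx.int_repr())    # raw uint8 values
print(qx.dequantize())  # approximately the original float values
```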
        - -
        - -We developed three techniques for quantizing neural networks in PyTorch as part of quantization tooling in the `torch.quantization` name-space. - -## **The Three Modes of Quantization Supported in PyTorch starting version 1.3** - -1. ### **Dynamic Quantization** - The easiest method of quantization PyTorch supports is called **dynamic quantization**. This involves not just converting the weights to int8 - as happens in all quantization variants - but also converting the activations to int8 on the fly, just before doing the computation (hence “dynamic”). The computations will thus be performed using efficient int8 matrix multiplication and convolution implementations, resulting in faster compute. However, the activations are read and written to memory in floating point format. - * **PyTorch API**: we have a simple API for dynamic quantization in PyTorch. `torch.quantization.quantize_dynamic` takes in a model, as well as a couple other arguments, and produces a quantized model! Our [end-to-end tutorial](https://pytorch.org/tutorials/intermediate/dynamic_quantization_bert_tutorial.html) illustrates this for a BERT model; while the tutorial is long and contains sections on loading pre-trained models and other concepts unrelated to quantization, the part the quantizes the BERT model is simply: - - ```python - import torch.quantization - quantized_model = torch.quantization.quantize_dynamic(model, {torch.nn.Linear}, dtype=torch.qint8) - ``` - * See the documentation for the function [here](https://pytorch.org/docs/stable/quantization.html#torch.quantization.quantize_dynamic) an end-to-end example in our tutorials [here](https://pytorch.org/tutorials/advanced/dynamic_quantization_tutorial.html) and [here](https://pytorch.org/tutorials/intermediate/dynamic_quantization_bert_tutorial.html). - -2. ### **Post-Training Static Quantization** - - One can further improve the performance (latency) by converting networks to use both integer arithmetic and int8 memory accesses. Static quantization performs the additional step of first feeding batches of data through the network and computing the resulting distributions of the different activations (specifically, this is done by inserting “observer” modules at different points that record these distributions). This information is used to determine how specifically the different activations should be quantized at inference time (a simple technique would be to simply divide the entire range of activations into 256 levels, but we support more sophisticated methods as well). Importantly, this additional step allows us to pass quantized values between operations instead of converting these values to floats - and then back to ints - between every operation, resulting in a significant speed-up. - - With this release, we’re supporting several features that allow users to optimize their static quantization: - 1. Observers: you can customize observer modules which specify how statistics are collected prior to quantization to try out more advanced methods to quantize your data. - 2. Operator fusion: you can fuse multiple operations into a single operation, saving on memory access while also improving the operation’s numerical accuracy. - 3. Per-channel quantization: we can independently quantize weights for each output channel in a convolution/linear layer, which can lead to higher accuracy with almost the same speed. 
- - * ### **PyTorch API**: - * To fuse modules, we have `torch.quantization.fuse_modules` - * Observers are inserted using `torch.quantization.prepare` - * Finally, quantization itself is done using `torch.quantization.convert` - - We have a tutorial with an end-to-end example of quantization (this same tutorial also covers our third quantization method, quantization-aware training), but because of our simple API, the three lines that perform post-training static quantization on the pre-trained model `myModel` are: - ```python - # set quantization config for server (x86) deployment - myModel.qconfig = torch.quantization.get_default_qconfig('fbgemm') - - # insert observers - torch.quantization.prepare(myModel, inplace=True) - # Calibrate the model and collect statistics - - # convert to quantized version - torch.quantization.convert(myModel, inplace=True) - ``` - -3. ### **Quantization Aware Training** - **Quantization-aware training (QAT)** is the third method, and the one that typically results in the highest accuracy of the three. With QAT, all weights and activations are “fake quantized” during both the forward and backward passes of training: that is, float values are rounded to mimic int8 values, but all computations are still done with floating point numbers. Thus, all the weight adjustments during training are made while “aware” of the fact that the model will ultimately be quantized; after quantizing, therefore, this method usually yields higher accuracy than the other two methods. -* ### **PyTorch API**: - * `torch.quantization.prepare_qat` inserts fake quantization modules into the model. - * Mimicking the static quantization API, `torch.quantization.convert` actually quantizes the model once training is complete. - - For example, in [the end-to-end example](https://pytorch.org/tutorials/advanced/static_quantization_tutorial.html), we load in a pre-trained model as `qat_model`, then we simply perform quantization-aware training using: - - ```python - # specify quantization config for QAT - qat_model.qconfig = torch.quantization.get_default_qat_qconfig('fbgemm') - - # prepare QAT - torch.quantization.prepare_qat(qat_model, inplace=True) - - # convert to quantized version, removing dropout, to check for accuracy on each epoch - quantized_model = torch.quantization.convert(qat_model.eval(), inplace=False) - ``` - -### **Device and Operator Support** -Quantization support is restricted to a subset of available operators, depending on the method being used. For a list of supported operators, please see the documentation at [https://pytorch.org/docs/stable/quantization.html](https://pytorch.org/docs/stable/quantization.html). - -The set of available operators and the quantization numerics also depend on the backend being used to run quantized models. Currently, quantized operators are supported only for CPU inference in the following backends: x86 and ARM. Both the quantization configuration (how tensors should be quantized) and the quantized kernels (arithmetic with quantized tensors) are backend dependent. One can specify the backend by doing: - -```python -import torch -backend = 'fbgemm' -# 'fbgemm' for server, 'qnnpack' for mobile -my_model.qconfig = torch.quantization.get_default_qconfig(backend) -# prepare and convert model -# Set the backend on which the quantized kernels need to be run -torch.backends.quantized.engine = backend -``` - -However, quantization aware training occurs in full floating point and can run on either GPU or CPU.
Quantization aware training is typically only used in CNN models when post training static or dynamic quantization doesn't yield sufficient accuracy. This can occur with models that are highly optimized to achieve small size (such as Mobilenet). - -#### **Integration in torchvision** -We've also enabled quantization for some of the most popular models in [torchvision](https://github.com/pytorch/vision/tree/master/torchvision/models/quantization): Googlenet, Inception, Resnet, ResNeXt, Mobilenet and Shufflenet. We have upstreamed these changes to torchvision in three forms: -1. Pre-trained quantized weights so that you can use them right away. -2. Quantization ready model definitions so that you can do post-training quantization or quantization aware training. -3. A script for doing quantization aware training — which is available for any of these models, though, as you will learn below, we only found it necessary for achieving accuracy with Mobilenet. -4. We also have a [tutorial](https://pytorch.org/tutorials/intermediate/quantized_transfer_learning_tutorial.html) showing how you can do transfer learning with quantization using one of the torchvision models. - -### **Choosing an approach** -The choice of which scheme to use depends on multiple factors: -1. Model/Target requirements: Some models might be sensitive to quantization, requiring quantization aware training. -2. Operator/Backend support: Some backends require fully quantized operators. - -Currently, operator coverage is limited and may restrict the choices; the table below provides a guideline.
| Model Type | Preferred scheme | Why |
| --- | --- | --- |
| LSTM/RNN | Dynamic Quantization | Throughput dominated by compute/memory bandwidth for weights |
| BERT/Transformer | Dynamic Quantization | Throughput dominated by compute/memory bandwidth for weights |
| CNN | Static Quantization | Throughput limited by memory bandwidth for activations |
| CNN | Quantization Aware Training | In the case where accuracy can't be achieved with static quantization |
        - -### **Performance Results** -Quantization provides a 4x reduction in the model size and a speedup of 2x to 3x compared to floating point implementations depending on the hardware platform and the model being benchmarked. Some sample results are: - -
| Model | Float Latency (ms) | Quantized Latency (ms) | Inference Performance Gain | Device | Notes |
| --- | --- | --- | --- | --- | --- |
| BERT | 581 | 313 | 1.8x | Xeon-D2191 (1.6GHz) | Batch size = 1, maximum sequence length = 128, single thread, x86-64, dynamic quantization |
| Resnet-50 | 214 | 103 | 2x | Xeon-D2191 (1.6GHz) | Single thread, x86-64, static quantization |
| Mobilenet-v2 | 97 | 17 | 5.7x | Samsung S9 | Static quantization; floating point numbers are based on Caffe2 run-time and are not optimized |
- -### **Accuracy results** -We also compared the accuracy of static quantized models with the floating point models on Imagenet. For dynamic quantization, we [compared](https://github.com/huggingface/transformers/blob/master/examples/run_glue.py) the F1 score of BERT on the GLUE benchmark for MRPC. - -#### **Computer Vision Model accuracy** -
| Model | Top-1 Accuracy (Float) | Top-1 Accuracy (Quantized) | Quantization scheme |
| --- | --- | --- | --- |
| Googlenet | 69.8 | 69.7 | Static post training quantization |
| Inception-v3 | 77.5 | 77.1 | Static post training quantization |
| ResNet-18 | 69.8 | 69.4 | Static post training quantization |
| Resnet-50 | 76.1 | 75.9 | Static post training quantization |
| ResNext-101 32x8d | 79.3 | 79 | Static post training quantization |
| Mobilenet-v2 | 71.9 | 71.6 | Quantization Aware Training |
| Shufflenet-v2 | 69.4 | 68.4 | Static post training quantization |
        - -#### **Speech and NLP Model accuracy** - -
| Model | F1 (GLUE MRPC) Float | F1 (GLUE MRPC) Quantized | Quantization scheme |
| --- | --- | --- | --- |
| BERT | 0.902 | 0.895 | Dynamic quantization |
        - -### **Conclusion** -To get started on quantizing your models in PyTorch, start with [the tutorials on the PyTorch website](https://pytorch.org/tutorials/#model-optimization). If you are working with sequence data start with [dynamic quantization for LSTM](https://pytorch.org/tutorials/advanced/dynamic_quantization_tutorial.html), or [BERT](https://pytorch.org/tutorials/intermediate/dynamic_quantization_bert_tutorial.html). If you are working with image data then we recommend starting with the [transfer learning with quantization](https://pytorch.org/tutorials/intermediate/quantized_transfer_learning_tutorial.html) tutorial. Then you can explore [static post training quantization](https://pytorch.org/tutorials/advanced/static_quantization_tutorial.html). If you find that the accuracy drop with post training quantization is too high, then try [quantization aware training](https://pytorch.org/tutorials/advanced/static_quantization_tutorial.html). - -If you run into issues you can get community help by posting in at [discuss.pytorch.org](https://discuss.pytorch.org/), use the quantization category for quantization related issues. - -_This post is authored by Raghuraman Krishnamoorthi, James Reed, Min Ni, Chris Gottbrath and Seth Weidman. Special thanks to Jianyu Huang, Lingyi Liu and Haixin Liu for producing quantization metrics included in this post._ - -### **Further reading**: -1. PyTorch quantization presentation at Neurips: [(https://research.fb.com/wp-content/uploads/2019/12/2.-Quantization.pptx)](https://research.fb.com/wp-content/uploads/2019/12/2.-Quantization.pptx) -2. Quantized Tensors [(https://github.com/pytorch/pytorch/wiki/ -Introducing-Quantized-Tensor)](https://github.com/pytorch/pytorch/wiki/Introducing-Quantized-Tensor) -3. Quantization RFC on Github [(https://github.com/pytorch/pytorch/ -issues/18318)](https://github.com/pytorch/pytorch/issues/18318) diff --git a/_posts/2020-4-21-pytorch-1-dot-5-released-with-new-and-updated-apis.md b/_posts/2020-4-21-pytorch-1-dot-5-released-with-new-and-updated-apis.md deleted file mode 100644 index e81d2f7da780..000000000000 --- a/_posts/2020-4-21-pytorch-1-dot-5-released-with-new-and-updated-apis.md +++ /dev/null @@ -1,100 +0,0 @@ ---- -layout: blog_detail -title: 'PyTorch 1.5 released, new and updated APIs including C++ frontend API parity with Python' -author: Team PyTorch ---- - - -Today, we’re announcing the availability of PyTorch 1.5, along with new and updated libraries. This release includes several major new API additions and improvements. PyTorch now includes a significant update to the C++ frontend, ‘channels last’ memory format for computer vision models, and a stable release of the distributed RPC framework used for model-parallel training. The release also has new APIs for autograd for hessians and jacobians, and an API that allows the creation of Custom C++ Classes that was inspired by pybind. - -You can find the detailed release notes [here](https://github.com/pytorch/pytorch/releases). - -## C++ Frontend API (Stable) - -The C++ frontend API is now at parity with Python, and the features overall have been moved to ‘stable’ (previously tagged as experimental). Some of the major highlights include: - -* Now with ~100% coverage and docs for C++ torch::nn module/functional, users can easily translate their model from Python API to C++ API, making the model authoring experience much smoother. 
-* Optimizers in C++ had deviated from the Python equivalent: C++ optimizers can't take parameter groups as input while the Python ones can. Additionally, step function implementations were not exactly the same. With the 1.5 release, C++ optimizers will always behave the same as the Python equivalent. -* The lack of a tensor multi-dim indexing API in C++ was a well-known issue and had resulted in many posts in the PyTorch GitHub issue tracker and forum. The previous workaround was to use a combination of `narrow` / `select` / `index_select` / `masked_select`, which was clunky and error-prone compared to the Python API's elegant `tensor[:, 0, ..., mask]` syntax. With the 1.5 release, users can use `tensor.index({Slice(), 0, "...", mask})` to achieve the same purpose. - -## ‘Channels last’ memory format for Computer Vision models (Experimental) - -‘Channels last’ memory layout unlocks the ability to use performance-efficient convolution algorithms and hardware (NVIDIA’s Tensor Cores, FBGEMM, QNNPACK). Additionally, it is designed to automatically propagate through the operators, which allows easy switching between memory layouts. - -Learn more [here](https://github.com/pytorch/pytorch/wiki/Writing-memory-format-aware-operators) on how to write memory format aware operators. - -## Custom C++ Classes (Experimental) - -This release adds a new API, `torch::class_`, for binding custom C++ classes into TorchScript and Python simultaneously. This API is almost identical in syntax to [pybind11](https://pybind11.readthedocs.io/en/stable/). It allows users to expose their C++ class and its methods to the TorchScript type system and runtime system such that they can instantiate and manipulate arbitrary C++ objects from TorchScript and Python. An example C++ binding: - -```cpp -template <class T> -struct MyStackClass : torch::CustomClassHolder { - std::vector<T> stack_; - MyStackClass(std::vector<T> init) : stack_(std::move(init)) {} - - void push(T x) { - stack_.push_back(x); - } - T pop() { - auto val = stack_.back(); - stack_.pop_back(); - return val; - } -}; - -static auto testStack = - torch::class_<MyStackClass<std::string>>("myclasses", "MyStackClass") - .def(torch::init<std::vector<std::string>>()) - .def("push", &MyStackClass<std::string>::push) - .def("pop", &MyStackClass<std::string>::pop) - .def("size", [](const c10::intrusive_ptr<MyStackClass<std::string>>& self) { - return self->stack_.size(); - }); -``` - -This exposes a class that you can use in Python and TorchScript like so: - -```python -@torch.jit.script -def do_stacks(s : torch.classes.myclasses.MyStackClass): - s2 = torch.classes.myclasses.MyStackClass(["hi", "mom"]) - print(s2.pop()) # "mom" - s2.push("foobar") - return s2 # ["hi", "foobar"] -``` - -You can try it out in the tutorial [here](https://pytorch.org/tutorials/advanced/torch_script_custom_classes.html). - - -## Distributed RPC framework APIs (Now Stable) - -The Distributed [RPC framework](https://pytorch.org/docs/stable/rpc.html) was launched as experimental in the 1.4 release, and it is now marked as stable and no longer experimental. This work involves a lot of enhancements and bug fixes to make the distributed RPC framework more reliable and robust overall, as well as adding a couple of new features, including profiling support, using TorchScript functions in RPC, and several enhancements for ease of use. Below is an overview of the various APIs within the framework: - -### RPC API -The RPC API allows users to specify functions to run and objects to be instantiated on remote nodes.
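A minimal sketch of invoking the RPC API; the worker names and world size below are illustrative assumptions, not part of the release notes:

```python
import torch
import torch.distributed.rpc as rpc

# Assumed: this is rank 0 of a two-process job; "worker1" runs a matching init_rpc.
rpc.init_rpc("worker0", rank=0, world_size=2)

# Run a function on the remote worker and block until the result comes back.
result = rpc.rpc_sync("worker1", torch.add, args=(torch.ones(2), torch.ones(2)))

# Or obtain a remote reference (RRef) to a value that stays on the remote worker.
rref = rpc.remote("worker1", torch.add, args=(torch.ones(2), 1))
print(result, rref.to_here())

rpc.shutdown()
```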
These functions are transparently recorded so that gradients can backpropagate through remote nodes using Distributed Autograd. - -### Distributed Autograd -Distributed Autograd connects the autograd graph across several nodes and allows gradients to flow through during the backwards pass. Gradients are accumulated into a context (as opposed to the .grad field as with Autograd) and users must specify their model’s forward pass under a with `dist_autograd.context()` manager in order to ensure that all RPC communication is recorded properly. Currently, only FAST mode is implemented (see [here](https://pytorch.org/docs/stable/rpc/distributed_autograd.html#distributed-autograd-design) for the difference between FAST and SMART modes). - -### Distributed Optimizer -The distributed optimizer creates RRefs to optimizers on each worker with parameters that require gradients, and then uses the RPC API to run the optimizer remotely. The user must collect all remote parameters and wrap them in an `RRef`, as this is required input to the distributed optimizer. The user must also specify the distributed autograd `context_id` so that the optimizer knows in which context to look for gradients. - -Learn more about distributed RPC framework APIs [here](https://pytorch.org/docs/stable/rpc.html). - -## New High level autograd API (Experimental) - -PyTorch 1.5 brings new functions including jacobian, hessian, jvp, vjp, hvp and vhp to the `torch.autograd.functional` submodule. This feature builds on the current API and allows the user to easily perform these functions. - -Detailed design discussion on GitHub can be found [here](https://github.com/pytorch/pytorch/issues/30632). - -## Python 2 no longer supported - -Starting PyTorch 1.5.0, we will no longer support Python 2, specifically version 2.7. Going forward support for Python will be limited to Python 3, specifically Python 3.5, 3.6, 3.7 and 3.8 (first enabled in PyTorch 1.4.0). - - -*We’d like to thank the entire PyTorch team and the community for all their contributions to this work.* - -Cheers! - -Team PyTorch diff --git a/_posts/2020-4-21-pytorch-library-updates-new-model-serving-library.md b/_posts/2020-4-21-pytorch-library-updates-new-model-serving-library.md deleted file mode 100644 index af49e31a38f7..000000000000 --- a/_posts/2020-4-21-pytorch-library-updates-new-model-serving-library.md +++ /dev/null @@ -1,82 +0,0 @@ ---- -layout: blog_detail -title: 'PyTorch library updates including new model serving library ' -author: Team PyTorch ---- - - -Along with the PyTorch 1.5 release, we are announcing new libraries for high-performance PyTorch model serving and tight integration with TorchElastic and Kubernetes. Additionally, we are releasing updated packages for torch_xla (Google Cloud TPUs), torchaudio, torchvision, and torchtext. All of these new libraries and enhanced capabilities are available today and accompany all of the core features [released in PyTorch 1.5](https://pytorch.org/blog/pytorch-1-dot-5-released-with-new-and-updated-apis). - -## TorchServe (Experimental) - -TorchServe is a flexible and easy to use library for serving PyTorch models in production performantly at scale. It is cloud and environment agnostic and supports features such as multi-model serving, logging, metrics, and the creation of RESTful endpoints for application integration. TorchServe was jointly developed by engineers from Facebook and AWS with feedback and engagement from the broader PyTorch community. The experimental release of TorchServe is available today. 
Some of the highlights include: - -* Support for both Python-based and TorchScript-based models -* Default handlers for common use cases (e.g., image segmentation, text classification) as well as the ability to write custom handlers for other use cases -* Model versioning, the ability to run multiple versions of a model at the same time, and the ability to roll back to an earlier version -* The ability to package a model, learning weights, and supporting files (e.g., class mappings, vocabularies) into a single, persistent artifact (a.k.a. the “model archive”) -* Robust management capability, allowing full configuration of models, versions, and individual worker threads via command line, config file, or run-time API -* Automatic batching of individual inferences across HTTP requests -* Logging including common metrics, and the ability to incorporate custom metrics -* Ready-made Dockerfile for easy deployment -* HTTPS support for secure deployment - -To learn more about the APIs and the design of this feature, see the links below: -* See for a full multi-node deployment reference architecture. -* The full documentation can be found [here](https://pytorch.org/serve). - -## TorchElastic integration with Kubernetes (Experimental) - -[TorchElastic](https://github.com/pytorch/elastic) is a proven library for training large scale deep neural networks at scale within companies like Facebook, where having the ability to dynamically adapt to server availability and scale as new compute resources come online is critical. Kubernetes enables customers using machine learning frameworks like PyTorch to run training jobs distributed across fleets of powerful GPU instances like the Amazon EC2 P3. Distributed training jobs, however, are not fault-tolerant, and a job cannot continue if a node failure or reclamation interrupts training. Further, jobs cannot start without acquiring all required resources, or scale up and down without being restarted. This lack of resiliency and flexibility results in increased training time and costs from idle resources. TorchElastic addresses these limitations by enabling distributed training jobs to be executed in a fault-tolerant and elastic manner. Until today, Kubernetes users needed to manage Pods and Services required for TorchElastic training jobs manually. - -Through the joint collaboration of engineers at Facebook and AWS, TorchElastic, adding elasticity and fault tolerance, is now supported using vanilla Kubernetes and through the managed EKS service from AWS. - -To learn more see the [TorchElastic repo](http://pytorch.org/elastic/0.2.0rc0/kubernetes.html) for the controller implementation and docs on how to use it. - -## torch_xla 1.5 now available - -[torch_xla](http://pytorch.org/xla/) is a Python package that uses the [XLA linear algebra compiler](https://www.tensorflow.org/xla) to accelerate the [PyTorch deep learning framework](https://pytorch.org/) on [Cloud TPUs](https://cloud.google.com/tpu/) and [Cloud TPU Pods](https://cloud.google.com/tpu/docs/tutorials/pytorch-pod). torch_xla aims to give PyTorch users the ability to do everything they can do on GPUs on Cloud TPUs as well while minimizing changes to the user experience. The project began with a conversation at NeurIPS 2017 and gathered momentum in 2018 when teams from Facebook and Google came together to create a proof of concept. We announced this collaboration at PTDC 2018 and made the PyTorch/XLA integration broadly available at PTDC 2019. 
The project already has 28 contributors, nearly 2k commits, and a repo that has been forked more than 100 times. - -This release of [torch_xla](http://pytorch.org/xla/) is aligned and tested with PyTorch 1.5 to reduce friction for developers and to provide a stable and mature PyTorch/XLA stack for training models using Cloud TPU hardware. You can [try it for free](https://medium.com/pytorch/get-started-with-pytorch-cloud-tpus-and-colab-a24757b8f7fc) in your browser on an 8-core Cloud TPU device with [Google Colab](https://colab.research.google.com/), and you can use it at a much larger scaleon [Google Cloud](https://cloud.google.com/gcp). - -See the full torch_xla release notes [here](https://github.com/pytorch/xla/releases). Full docs and tutorials can be found [here](https://pytorch.org/xla/) and [here](https://cloud.google.com/tpu/docs/tutorials). - -## PyTorch Domain Libraries - -torchaudio, torchvision, and torchtext complement PyTorch with common datasets, models, and transforms in each domain area. We’re excited to share new releases for all three domain libraries alongside PyTorch 1.5 and the rest of the library updates. For this release, all three domain libraries are removing support for Python2 and will support Python3 only. - -### torchaudio 0.5 -The torchaudio 0.5 release includes new transforms, functionals, and datasets. Highlights for the release include: - -* Added the Griffin-Lim functional and transform, `InverseMelScale` and `Vol` transforms, and `DB_to_amplitude`. -* Added support for `allpass`, `fade`, `bandpass`, `bandreject`, `band`, `treble`, `deemph`, and `riaa` filters and transformations. -* New datasets added including `LJSpeech` and `SpeechCommands` datasets. - -See the release full notes [here](https://github.com/pytorch/audio/releases) and full docs can be found [here](https://pytorch.org/audio/). - -### torchvision 0.6 -The torchvision 0.6 release includes updates to datasets, models and a significant number of bug fixes. Highlights include: - -* Faster R-CNN now supports negative samples which allows the feeding of images without annotations at training time. -* Added `aligned` flag to `RoIAlign` to match Detectron2. -* Refactored abstractions for C++ video decoder - -See the release full notes [here](https://github.com/pytorch/vision/releases) and full docs can be found [here](https://pytorch.org/vision/stable/index.html). - -### torchtext 0.6 -The torchtext 0.6 release includes a number of bug fixes and improvements to documentation. Based on user's feedback, dataset abstractions are currently being redesigned also. Highlights for the release include: - -* Fixed an issue related to the SentencePiece dependency in conda package. -* Added support for the experimental IMDB dataset to allow a custom vocab. -* A number of documentation updates including adding a code of conduct and a deduplication of the docs on the torchtext site. - -Your feedback and discussions on the experimental datasets API are welcomed. You can send them to [issue #664](https://github.com/pytorch/text/issues/664). We would also like to highlight the pull request [here](https://github.com/pytorch/text/pull/701) where the latest dataset abstraction is applied to the text classification datasets. The feedback can be beneficial to finalizing this abstraction. - -See the release full notes [here](https://github.com/pytorch/text/releases) and full docs can be found [here](https://pytorch.org/text/). 
- - -*We’d like to thank the entire PyTorch team, the Amazon team and the community for all their contributions to this work.* - -Cheers! - -Team PyTorch diff --git a/_posts/2020-5-5-updates-improvements-to-pytorch-tutorials.md b/_posts/2020-5-5-updates-improvements-to-pytorch-tutorials.md deleted file mode 100644 index 15bb56bab071..000000000000 --- a/_posts/2020-5-5-updates-improvements-to-pytorch-tutorials.md +++ /dev/null @@ -1,72 +0,0 @@ ---- -layout: blog_detail -title: 'Updates & Improvements to PyTorch Tutorials' -author: Team PyTorch ---- - -PyTorch.org provides researchers and developers with documentation, installation instructions, latest news, community projects, tutorials, and more. Today, we are introducing usability and content improvements including tutorials in additional categories, a new recipe format for quickly referencing common topics, sorting using tags, and an updated homepage. - -Let’s take a look at them in detail. - -## TUTORIALS HOME PAGE UPDATE -The tutorials home page now provides clear actions that developers can take. For new PyTorch users, there is an easy-to-discover button to take them directly to “A 60 Minute Blitz”. Right next to it, there is a button to view all recipes which are designed to teach specific features quickly with examples. - -
- -In addition to the existing left navigation bar, tutorials can now be quickly filtered by multi-select tags. Let’s say you want to view all tutorials related to “Production” and “Quantization”: you can select the “Production” and “Quantization” filters as shown in the image below: -
        - -The following additional resources can also be found at the bottom of the Tutorials homepage: -* [PyTorch Cheat Sheet](https://pytorch.org/tutorials/beginner/ptcheat.html) -* [PyTorch Examples](https://github.com/pytorch/examples) -* [Tutorial on GitHub](https://github.com/pytorch/tutorials) - -## PYTORCH RECIPES -Recipes are new bite-sized, actionable examples designed to teach researchers and developers how to use specific PyTorch features. Some notable new recipes include: -* [Loading Data in PyTorch](https://pytorch.org/tutorials/recipes/recipes/loading_data_recipe.html) -* [Model Interpretability Using Captum](https://pytorch.org/tutorials/recipes/recipes/Captum_Recipe.html) -* [How to Use TensorBoard](https://pytorch.org/tutorials/recipes/recipes/tensorboard_with_pytorch.html) - -View the full recipes [here](http://pytorch.org/tutorials/recipes/recipes_index.html). - -## LEARNING PYTORCH -This section includes tutorials designed for users new to PyTorch. Based on community feedback, we have made updates to the current [Deep Learning with PyTorch: A 60 Minute Blitz](https://pytorch.org/tutorials/beginner/deep_learning_60min_blitz.html) tutorial, one of our most popular tutorials for beginners. Upon completion, one can understand what PyTorch and neural networks are, and be able to build and train a simple image classification network. Updates include adding explanations to clarify output meanings and linking back to where users can read more in the docs, cleaning up confusing syntax errors, and reconstructing and explaining new concepts for easier readability. - -## DEPLOYING MODELS IN PRODUCTION -This section includes tutorials for developers looking to take their PyTorch models to production. The tutorials include: -* [Deploying PyTorch in Python via a REST API with Flask](https://pytorch.org/tutorials/intermediate/flask_rest_api_tutorial.html) -* [Introduction to TorchScript](https://pytorch.org/tutorials/beginner/Intro_to_TorchScript_tutorial.html) -* [Loading a TorchScript Model in C++](https://pytorch.org/tutorials/advanced/cpp_export.html) -* [Exporting a Model from PyTorch to ONNX and Running it using ONNX Runtime](https://pytorch.org/tutorials/advanced/super_resolution_with_onnxruntime.html) - -## FRONTEND APIS -PyTorch provides a number of frontend API features that can help developers to code, debug, and validate their models more efficiently. This section includes tutorials that teach what these features are and how to use them. Some tutorials to highlight: -* [Introduction to Named Tensors in PyTorch](https://pytorch.org/tutorials/intermediate/named_tensor_tutorial.html) -* [Using the PyTorch C++ Frontend](https://pytorch.org/tutorials/advanced/cpp_frontend.html) -* [Extending TorchScript with Custom C++ Operators](https://pytorch.org/tutorials/advanced/torch_script_custom_ops.html) -* [Extending TorchScript with Custom C++ Classes](https://pytorch.org/tutorials/advanced/torch_script_custom_classes.html) -* [Autograd in C++ Frontend](https://pytorch.org/tutorials/advanced/cpp_autograd.html) - -## MODEL OPTIMIZATION -Deep learning models often consume large amounts of memory, power, and compute due to their complexity. 
This section provides tutorials for model optimization: -* [Pruning](https://pytorch.org/tutorials/intermediate/pruning_tutorial.html) -* [Dynamic Quantization on BERT](https://pytorch.org/tutorials/intermediate/dynamic_quantization_bert_tutorial.html) -* [Static Quantization with Eager Mode in PyTorch](https://pytorch.org/tutorials/advanced/static_quantization_tutorial.html) - -## PARALLEL AND DISTRIBUTED TRAINING -PyTorch provides features that can accelerate performance in research and production, such as native support for asynchronous execution of collective operations and peer-to-peer communication that is accessible from Python and C++. This section includes tutorials on parallel and distributed training: -* [Single-Machine Model Parallel Best Practices](https://pytorch.org/tutorials/intermediate/model_parallel_tutorial.html) -* [Getting started with Distributed Data Parallel](https://pytorch.org/tutorials/intermediate/ddp_tutorial.html) -* [Getting started with Distributed RPC Framework](https://pytorch.org/tutorials/intermediate/rpc_tutorial.html) -* [Implementing a Parameter Server Using Distributed RPC Framework](https://pytorch.org/tutorials/intermediate/rpc_param_server_tutorial.html) - -Making these improvements is just the first step of improving PyTorch.org for the community. Please submit your suggestions [here](https://github.com/pytorch/tutorials/pulls). - -Cheers, - -Team PyTorch diff --git a/_posts/2020-7-28-pytorch-1.6-released.md deleted file mode 100644 index 9d1f6442249a..000000000000 --- a/_posts/2020-7-28-pytorch-1.6-released.md +++ /dev/null @@ -1,226 +0,0 @@ ---- -layout: blog_detail -title: 'PyTorch 1.6 released w/ Native AMP Support, Microsoft joins as maintainers for Windows' -author: Team PyTorch ---- - -Today, we’re announcing the availability of PyTorch 1.6, along with updated domain libraries. We are also excited to announce that the team at [Microsoft is now maintaining Windows builds and binaries](https://pytorch.org/blog/microsoft-becomes-maintainer-of-the-windows-version-of-pytorch) and will also be supporting the community on GitHub as well as the PyTorch Windows discussion forums. - -The PyTorch 1.6 release includes a number of new APIs, tools for performance improvement and profiling, as well as major updates to both distributed data parallel (DDP) and remote procedure call (RPC) based distributed training. -A few of the highlights include: - -1. Automatic mixed precision (AMP) training is now natively supported and a stable feature (see [here](https://pytorch.org/blog/accelerating-training-on-nvidia-gpus-with-pytorch-automatic-mixed-precision/) for more details) - thanks to NVIDIA’s contributions; -2. Native TensorPipe support has been added for tensor-aware, point-to-point communication primitives built specifically for machine learning; -3. Added support for complex tensors to the frontend API surface; -4. New profiling tools providing tensor-level memory consumption information; -5. Numerous improvements and new features for both distributed data parallel (DDP) training and the remote procedure call (RPC) packages. - -Additionally, from this release onward, features will be classified as Stable, Beta and Prototype. Prototype features are not included as part of the binary distribution and are instead available through either building from source, using nightlies or via compiler flag.
You can learn more about what this change means in the post [here](https://pytorch.org/blog/pytorch-feature-classification-changes/). You can also find the full release notes [here](https://github.com/pytorch/pytorch/releases). - -# Performance & Profiling - -## [Stable] Automatic Mixed Precision (AMP) Training - -AMP lets users easily enable automatic mixed precision training, which delivers higher performance and memory savings of up to 50% on Tensor Core GPUs. Using the natively supported `torch.cuda.amp` API, AMP provides convenience methods for mixed precision, where some operations use the `torch.float32 (float)` datatype and other operations use `torch.float16 (half)`. Some ops, like linear layers and convolutions, are much faster in `float16`. Other ops, like reductions, often require the dynamic range of `float32`. Mixed precision tries to match each op to its appropriate datatype. - -* Design doc ([Link](https://github.com/pytorch/pytorch/issues/25081)) -* Documentation ([Link](https://pytorch.org/docs/stable/amp.html)) -* Usage examples ([Link](https://pytorch.org/docs/stable/notes/amp_examples.html)) - -## [Beta] Fork/Join Parallelism - -This release adds support for a language-level construct as well as runtime support for coarse-grained parallelism in TorchScript code. This support is useful for situations such as running models in an ensemble in parallel, or running bidirectional components of recurrent nets in parallel, and makes it possible to unlock the computational power of parallel architectures (e.g., many-core CPUs) for task-level parallelism. - -Parallel execution of TorchScript programs is enabled through two primitives: `torch.jit.fork` and `torch.jit.wait`. In the example below, we parallelize execution of `foo`: - -```python -import torch -from typing import List - -def foo(x): - return torch.neg(x) - -@torch.jit.script -def example(x): - futures = [torch.jit.fork(foo, x) for _ in range(100)] - results = [torch.jit.wait(future) for future in futures] - return torch.sum(torch.stack(results)) - -print(example(torch.ones([]))) - ``` - -* Documentation ([Link](https://pytorch.org/docs/stable/jit.html)) - -## [Beta] Memory Profiler - -The `torch.autograd.profiler` API now includes a memory profiler that lets you inspect the tensor memory cost of different operators inside your CPU and GPU models.
- -Here is an example usage of the API: - -```python -import torch -import torchvision.models as models -import torch.autograd.profiler as profiler - -model = models.resnet18() -inputs = torch.randn(5, 3, 224, 224) -with profiler.profile(profile_memory=True, record_shapes=True) as prof: - model(inputs) - -# NOTE: some columns were removed for brevity -print(prof.key_averages().table(sort_by="self_cpu_memory_usage", row_limit=10)) -# --------------------------- --------------- --------------- --------------- -# Name CPU Mem Self CPU Mem Number of Calls -# --------------------------- --------------- --------------- --------------- -# empty 94.79 Mb 94.79 Mb 123 -# resize_ 11.48 Mb 11.48 Mb 2 -# addmm 19.53 Kb 19.53 Kb 1 -# empty_strided 4 b 4 b 1 -# conv2d 47.37 Mb 0 b 20 -# --------------------------- --------------- --------------- --------------- - ``` - -* PR ([Link](https://github.com/pytorch/pytorch/pull/37775)) -* Documentation ([Link](https://pytorch.org/docs/stable/autograd.html#profiler)) - -# Distributed Training & RPC - -## [Beta] TensorPipe backend for RPC - -PyTorch 1.6 introduces a new backend for the RPC module which leverages the TensorPipe library, a tensor-aware point-to-point communication primitive targeted at machine learning, intended to complement the current primitives for distributed training in PyTorch (Gloo, MPI, ...) which are collective and blocking. The pairwise and asynchronous nature of TensorPipe lends itself to new networking paradigms that go beyond data parallel: client-server approaches (e.g., parameter server for embeddings, actor-learner separation in Impala-style RL, ...) and model and pipeline parallel training (think GPipe), gossip SGD, etc. - -```python -# One-line change needed to opt in -torch.distributed.rpc.init_rpc( - ... - backend=torch.distributed.rpc.BackendType.TENSORPIPE, -) - -# No changes to the rest of the RPC API -torch.distributed.rpc.rpc_sync(...) -``` - -* Design doc ([Link](https://github.com/pytorch/pytorch/issues/35251)) -* Documentation ([Link](https://pytorch.org/docs/stable/)) - -## [Beta] DDP+RPC - -PyTorch Distributed supports two powerful paradigms: DDP for full sync data parallel training of models and the RPC framework which allows for distributed model parallelism. Previously, these two features worked independently and users couldn’t mix and match them to try out hybrid parallelism paradigms. - -Starting in PyTorch 1.6, we’ve enabled DDP and RPC to work together seamlessly so that users can combine these two techniques to achieve both data parallelism and model parallelism. An example is where users would like to place large embedding tables on parameter servers and use the RPC framework for embedding lookups, but store smaller dense parameters on trainers and use DDP to synchronize the dense parameters. Below is a simple code snippet. - -```python -# On each trainer - -remote_emb = create_emb(on="ps", ...) -ddp_model = DDP(dense_model) - -for data in batch: - with torch.distributed.autograd.context(): - res = remote_emb(data) - loss = ddp_model(res) - torch.distributed.autograd.backward([loss]) -``` - -* DDP+RPC Tutorial ([Link](https://pytorch.org/tutorials/advanced/rpc_ddp_tutorial.html)) -* Documentation ([Link](https://pytorch.org/docs/stable/)) -* Usage Examples ([Link](https://github.com/pytorch/examples/pull/800)) - -## [Beta] RPC - Asynchronous User Functions - -RPC Asynchronous User Functions supports the ability to yield and resume on the server side when executing a user-defined function.
Prior to this feature, when a callee processes a request, one RPC thread waits until the user function returns. If the user function contains IO (e.g., nested RPC) or signaling (e.g., waiting for another request to unblock), the corresponding RPC thread would sit idle waiting for these events. As a result, some applications have to use a very large number of threads and send additional RPC requests, which can potentially lead to performance degradation. To make a user function yield on such events, applications need to: 1) Decorate the function with the `@rpc.functions.async_execution` decorator; and 2) Let the function return a `torch.futures.Future` and install the resume logic as callbacks on the `Future` object. See below for an example: - - -```python -@rpc.functions.async_execution -def async_add_chained(to, x, y, z): - return rpc.rpc_async(to, torch.add, args=(x, y)).then( - lambda fut: fut.wait() + z - ) - -ret = rpc.rpc_sync( - "worker1", - async_add_chained, - args=("worker2", torch.ones(2), 1, 1) -) - -print(ret) # prints tensor([3., 3.]) -``` - -* Tutorial for performant batch RPC using Asynchronous User Functions -* Documentation ([Link](https://pytorch.org/docs/stable/rpc.html#torch.distributed.rpc.functions.async_execution)) -* Usage examples ([Link](https://github.com/pytorch/examples/tree/master/distributed/rpc/batch)) - -# Frontend API Updates - -## [Beta] Complex Numbers - -The PyTorch 1.6 release brings beta-level support for complex tensors, including the torch.complex64 and torch.complex128 dtypes. A complex number is a number that can be expressed in the form a + bj, where a and b are real numbers, and j is a solution of the equation x^2 = −1. Complex numbers frequently occur in mathematics and engineering, especially in signal processing, and complex neural networks are an active area of research. The beta release of complex tensors will support common PyTorch and complex tensor functionality, plus functions needed by Torchaudio, ESPnet and others. While this is an early version of this feature, and we expect it to improve over time, the overall goal is to provide a NumPy-compatible user experience that leverages PyTorch’s ability to run on accelerators and work with autograd to better support the scientific community. - -# Mobile Updates - -PyTorch 1.6 brings increased performance and general stability for mobile on-device inference. We squashed a few bugs, continued maintenance, and added a few new features while improving fp32 and int8 inference performance for a large variety of ML models on the CPU backend. - -## [Beta] Mobile Features and Performance - -* Stateless and stateful XNNPACK Conv and Linear operators -* Stateless MaxPool2d + JIT optimization passes -* JIT pass optimizations: Conv + BatchNorm fusion, graph rewrite to replace conv2d/linear with xnnpack ops, relu/hardtanh fusion, dropout removal -* QNNPACK integration removes requantization scale constraint -* Per-channel quantization for conv, linear and dynamic linear -* Disable tracing for mobile client to save ~600 KB on full-jit builds - -# Updated Domain Libraries - -## torchvision 0.7 - -torchvision 0.7 introduces two new pretrained semantic segmentation models, [FCN ResNet50](https://arxiv.org/abs/1411.4038) and [DeepLabV3 ResNet50](https://arxiv.org/abs/1706.05587), both trained on COCO and using smaller memory footprints than the ResNet101 backbone.
We also introduced support for AMP (Automatic Mixed Precision) autocasting for torchvision models and operators, which automatically selects the floating point precision for different GPU operations to improve performance while maintaining accuracy. - -* Release notes ([Link](https://github.com/pytorch/vision/releases)) - -## torchaudio 0.6 - -torchaudio now officially supports Windows. This release also introduces a new model module (with wav2letter included), new functionals (contrast, cvm, dcshift, overdrive, vad, phaser, flanger, biquad), datasets (GTZAN, CMU), and a new optional sox backend with support for TorchScript. - -* Release notes ([Link](https://github.com/pytorch/audio/releases)) - -# Additional updates - -## HACKATHON - -The Global PyTorch Summer Hackathon is back! This year, teams can compete in three categories virtually: - - 1. **PyTorch Developer Tools:** Tools or libraries designed to improve the productivity and efficiency of PyTorch for researchers and developers - 2. **Web/Mobile Applications powered by PyTorch:** Applications with web/mobile interfaces and/or embedded devices powered by PyTorch - 3. **PyTorch Responsible AI Development Tools:** Tools, libraries, or web/mobile apps for responsible AI development - -This is a great opportunity to connect with the community and practice your machine learning skills. - -* [Join the hackathon](http://pytorch2020.devpost.com/) -* [Watch educational videos](https://www.youtube.com/pytorch) - - -## LPCV Challenge - -The [2020 CVPR Low-Power Vision Challenge (LPCV) - Online Track for UAV video](https://lpcv.ai/2020CVPR/video-track) submission deadline is coming up shortly. You have until July 31, 2020 to build a system that can accurately discover and recognize characters in video captured by an unmanned aerial vehicle (UAV) using PyTorch and a Raspberry Pi 3B+. - -## Prototype Features - -To reiterate, Prototype features in PyTorch are early features that we are looking to gather feedback on, gauge the usefulness of, and improve ahead of graduating them to Beta or Stable. The following features are not part of the PyTorch 1.6 release and instead are available in nightlies with separate docs/tutorials to help facilitate early usage and feedback. - -#### Distributed RPC/Profiler -Allow users to profile training jobs that use `torch.distributed.rpc` using the autograd profiler, and remotely invoke the profiler in order to collect profiling information across different nodes. The RFC can be found [here](https://github.com/pytorch/pytorch/issues/39675) and a short recipe on how to use this feature can be found [here](https://github.com/pytorch/tutorials/tree/master/prototype_source). - -#### TorchScript Module Freezing -Module Freezing is the process of inlining module parameter and attribute values into the TorchScript internal representation. Parameter and attribute values are treated as final and cannot be modified in the frozen module. The PR for this feature can be found [here](https://github.com/pytorch/pytorch/pull/32178) and a short tutorial on how to use this feature can be found [here](https://github.com/pytorch/tutorials/tree/master/prototype_source). - -#### Graph Mode Quantization -Eager mode quantization requires users to make changes to their model, including explicitly quantizing activations and fusing modules, rewriting uses of torch ops with functional modules, and it does not support quantization of functionals.
If we can trace or script the model, then the quantization can be done automatically with graph mode quantization without any of the complexities in eager mode, and it is configurable through a `qconfig_dict`. A tutorial on how to use this feature can be found [here](https://github.com/pytorch/tutorials/tree/master/prototype_source). - -#### Quantization Numerical Suite -Quantization is good when it works, but it’s difficult to know what's wrong when it doesn't satisfy the expected accuracy. A prototype is now available for a Numerical Suite that measures comparison statistics between quantized modules and float modules. This is available to test using eager mode and on CPU only with more support coming. A tutorial on how to use this feature can be found [here](https://github.com/pytorch/tutorials/tree/master/prototype_source). - - -Cheers! - -Team PyTorch diff --git a/_posts/2021-08-23-pytorch-developer-day-2021.md b/_posts/2021-08-23-pytorch-developer-day-2021.md deleted file mode 100644 index 6d121fb00cd0..000000000000 --- a/_posts/2021-08-23-pytorch-developer-day-2021.md +++ /dev/null @@ -1,30 +0,0 @@ ---- -layout: blog_detail -title: 'Announcing PyTorch Developer Day 2021' -author: Team PyTorch -featured-img: 'assets/images/ptdevday21.gif' ---- - -We are excited to announce PyTorch Developer Day (#PTD2), taking place virtually from December 1 & 2, 2021. Developer Day is designed for developers and users to discuss core technical developments, ideas, and roadmaps. - -
- -## Event Details -**Technical Talks Live Stream - December 1, 2021** - -Join us for technical talks on a variety of topics, including updates to the core framework, new tools and libraries to support development across a variety of domains, responsible AI and industry use cases. All talks will take place on December 1 and will be live streamed on PyTorch channels. - -Stay up to date by following us on our social channels: [Twitter](https://twitter.com/PyTorch), [Facebook](https://facebook.com/PyTorch), or [LinkedIn](https://www.linkedin.com/company/pytorch). - -**Poster Exhibition & Networking - December 2, 2021** - -On the second day, we’ll be hosting an online poster exhibition on Gather.Town. There will be opportunities to meet the authors, learn more about their PyTorch projects, and network with the community. This poster and networking event is limited to PyTorch maintainers and contributors, long-time stakeholders, and experts in areas relevant to PyTorch’s future. Conversations from the networking event will strongly shape the future of PyTorch. As such, invitations are required to attend the networking event. - - -## Call for Content Now Open - -Submit your poster abstracts today! Please send us the title and a brief summary of your project, tools, and libraries that could benefit PyTorch researchers in academia and industry, application developers, and ML engineers for consideration. The focus must be on academic papers, machine learning research, or open-source projects related to PyTorch development, Responsible AI or Mobile. Please, no sales pitches. **Deadline for submission is September 24, 2021**. - -Visit the event website for more information, and we look forward to having you at PyTorch Developer Day. \ No newline at end of file diff --git a/_posts/2021-09-08-pytorch-hackathon-2021.md deleted file mode 100644 index 0964e0b38a6f..000000000000 --- a/_posts/2021-09-08-pytorch-hackathon-2021.md +++ /dev/null @@ -1,47 +0,0 @@ ---- -layout: blog_detail -title: 'Announcing PyTorch Annual Hackathon 2021' -author: Team PyTorch -featured-img: 'assets/images/social_hackathon21.png' ---- - -We’re excited to announce the PyTorch Annual Hackathon 2021! This year, we’re looking to support the community in creating innovative PyTorch tools, libraries, and applications. 2021 is the third year we’re hosting this Hackathon, and we welcome you to join the PyTorch community and put your machine learning skills into action. Submissions start on September 8 and end on November 3. Good luck to everyone! -
- -## Submission Categories -You can enter your PyTorch projects into three categories: - -* **PyTorch Responsible AI Development Tools & Libraries** - Build an AI development tool or library that helps develop AI models and applications responsibly. These tools, libraries, and apps need to help researchers and developers factor in fairness, security, and privacy throughout the entire machine learning development process of data gathering, model training, model validation, inference, monitoring, and more. - -* **Web and Mobile Applications Powered by PyTorch** - Build an application with a web or mobile interface and/or an embedded device, powered by PyTorch, that end users can interact with. The submission must be built on PyTorch or use PyTorch-based libraries such as torchvision, torchtext, and fast.ai. - -* **PyTorch Developer Tools & Libraries** - Build a creative, useful, and well-implemented tool or library for improving the productivity and efficiency of PyTorch researchers and developers. The submission must be a machine learning algorithm, model, or application built using PyTorch or PyTorch-based libraries. - -## Prizes -Submissions will be judged on the idea’s quality, originality, implementation, and potential impact. - -* **First-Place Winners** in each category of the Hackathon will receive $5,000 in cash, along with a 30-minute call with the PyTorch development team. - -* **Second-Place Winners** will receive $3,000. - -* **Third-Place Winners** will receive $2,000. - -All winners will also receive the opportunity to create blog posts that will be featured throughout PyTorch channels, as well as an exclusive GitHub badge. Honorable Mentions will also be awarded to the three next-highest-scoring entries in each category and will receive $1,000 each. - -## Cloud Computing Credits -Request $100 in credits from Amazon Web Services or Google Cloud for your computing costs. Please allow 3 business days for your request to be reviewed. Credits will be provided to verified registrants until supplies run out. For more information, see https://pytorch2021.devpost.com/details/sponsors. - -## 2020 Winning Projects - -[DeMask](https://devpost.com/software/asteroid-the-pytorch-based-source-separation-toolkit) won first place in the PyTorch Developer Tools category. Built using Asteroid, a PyTorch-based audio source separation toolkit, DeMask is an end-to-end model for enhancing speech while wearing face masks. - -[Q&Aid](https://devpost.com/software/pytorchxai) won first place in the Web/Mobile Applications Powered by PyTorch category. Backed by PyTorch core algorithms and models, Q&Aid is a conceptual health care chatbot aimed at making health care diagnoses and facilitating communication between patients and doctors. - -[FairTorch](https://devpost.com/software/a-qeysp1) won first place in the PyTorch Responsible AI Development Tools category. FairTorch is a PyTorch fairness library that lets developers add constraints to their models to equalize metrics across subgroups by simply adding a few lines of code. - -## How to Join -If you’re interested in joining this year’s PyTorch Hackathon, register at [http://pytorch2021.devpost.com](http://pytorch2021.devpost.com).
diff --git a/_posts/2021-10-21-pytorch-1.10-new-library-releases.md deleted file mode 100644 index 8356cb1bc9cf..000000000000 --- a/_posts/2021-10-21-pytorch-1.10-new-library-releases.md +++ /dev/null @@ -1,213 +0,0 @@ ---- -layout: blog_detail -title: 'New Library Releases in PyTorch 1.10, including TorchX, TorchAudio, TorchVision' -author: Team PyTorch ---- - -Today, we are announcing a number of new features and improvements to PyTorch libraries, alongside the [PyTorch 1.10 release](https://pytorch.org/blog/pytorch-1.10-released/). Some highlights include: - -* **TorchX** - a new SDK for quickly building and deploying ML applications from research & development to production. -* **TorchAudio** - Added text-to-speech pipeline, self-supervised model support, multi-channel support and MVDR beamforming module, RNN transducer (RNNT) loss function, and batch and filterbank support to the `lfilter` function. See the TorchAudio release notes [here](https://github.com/pytorch/audio/releases). -* **TorchVision** - Added new RegNet and EfficientNet models, FX-based feature extraction added to utilities, two new Automatic Augmentation techniques: Rand Augment and Trivial Augment, and updated training recipes. See the TorchVision release notes [here](https://github.com/pytorch/vision/releases). - - -# Introducing TorchX -TorchX is a new SDK for quickly building and deploying ML applications from research & development to production. It offers various built-in components that encode MLOps best practices and make advanced features like distributed training and hyperparameter optimization accessible to all. - -Users can get started with TorchX 0.1 with no added setup cost since it supports popular ML schedulers and pipeline orchestrators that are already widely adopted and deployed in production. No two production environments are the same. To comply with various use cases, TorchX’s core APIs allow a great deal of customization at well-defined extension points so that even the most unique applications can be serviced without customizing the whole vertical stack. - -Read the [documentation](https://pytorch.org/torchx) for more details and try out this feature using this quickstart [tutorial](https://pytorch.org/torchx/latest/quickstart.html). - - -# TorchAudio 0.10 - -### (Beta) Text-to-speech pipeline -TorchAudio now adds the Tacotron2 model and pretrained weights. It is now possible to build a text-to-speech pipeline with existing vocoder implementations like WaveRNN and Griffin-Lim. Building a TTS pipeline requires matching data processing and pretrained weights, which are often non-trivial for users, so TorchAudio introduces a bundle API that makes constructing pipelines for specific pretrained weights easy. The following example illustrates this. - -```python ->>> import torchaudio ->>> ->>> bundle = torchaudio.pipelines.TACOTRON2_WAVERNN_CHAR_LJSPEECH ->>> ->>> # Build text processor, Tacotron2 and vocoder (WaveRNN) model ->>> processor = bundle.get_text_processor() ->>> tacotron2 = bundle.get_tacotron2() -Downloading: -100%|███████████████████████████████| 107M/107M [00:01<00:00, 87.9MB/s] ->>> vocoder = bundle.get_vocoder() -Downloading: -100%|███████████████████████████████| 16.7M/16.7M [00:00<00:00, 78.1MB/s] ->>> ->>> text = "Hello World!"
->>> ->>> # Encode text ->>> input, lengths = processor(text) ->>> ->>> # Generate (mel-scale) spectrogram ->>> specgram, lengths, _ = tacotron2.infer(input, lengths) ->>> ->>> # Convert spectrogram to waveform ->>> waveforms, lengths = vocoder(specgram, lengths) ->>> ->>> # Save audio ->>> torchaudio.save('hello-world.wav', waveforms, vocoder.sample_rate) - -``` - -For the details of this API, please refer to [the documentation](https://pytorch.org/audio/0.10.0/pipelines#tacotron2-text-to-speech). You can also try this from [the tutorial](https://pytorch.org/tutorials/intermediate/text_to_speech_with_torchaudio.html). - -### (Beta) Self-Supervised Model Support -TorchAudio added the HuBERT model architecture and pre-trained weight support for wav2vec 2.0 and HuBERT. HuBERT and wav2vec 2.0 are novel approaches to audio representation learning, and they yield high accuracy when fine-tuned on downstream tasks. These models can serve as baselines in future research; therefore, TorchAudio provides a simple way to run them. Similar to the TTS pipeline, the pretrained weights and associated information, such as expected sample rates and output class labels (for fine-tuned weights), are put together as a bundle, so that they can be used to build pipelines. The following example illustrates this. - -```python ->>> import torchaudio ->>> ->>> bundle = torchaudio.pipelines.HUBERT_ASR_LARGE ->>> ->>> # Build the model and load pretrained weight. ->>> model = bundle.get_model() -Downloading: -100%|███████████████████████████████| 1.18G/1.18G [00:17<00:00, 73.8MB/s] ->>> # Check the corresponding labels of the output. ->>> labels = bundle.get_labels() ->>> print(labels) -('', '', '', '', '|', 'E', 'T', 'A', 'O', 'N', 'I', 'H', 'S', 'R', 'D', 'L', 'U', 'M', 'W', 'C', 'F', 'G', 'Y', 'P', 'B', 'V', 'K', "'", 'X', 'J', 'Q', 'Z') ->>> ->>> # Infer the label probability distribution ->>> waveform, sample_rate = torchaudio.load('hello-world.wav') ->>> ->>> emissions, _ = model(waveform) ->>> ->>> # Pass emission to (hypothetical) decoder ->>> transcripts = ctc_decode(emissions, labels) ->>> print(transcripts[0]) -HELLO WORLD - -``` - -Please refer to the [documentation](https://pytorch.org/audio/0.10.0/pipelines#wav2vec-2-0-hubert-representation-learning) for more details and try out this feature using this [tutorial](https://pytorch.org/tutorials/intermediate/speech_command_recognition_with_torchaudio_tutorial.html). - -### (Beta) Multi-channel support and MVDR beamforming -Far-field speech recognition is a more challenging task compared to near-field recognition. Multi-channel methods such as beamforming help reduce noise and enhance the target speech. - -TorchAudio now adds support for differentiable Minimum Variance Distortionless Response (MVDR) beamforming on multi-channel audio using Time-Frequency masks. Researchers can easily assemble it with any multi-channel ASR pipeline. There are three solutions (ref_channel, stv_evd, stv_power), and it supports both single-channel and multi-channel masks (the latter are averaged inside the method). It provides an online option that recursively updates the parameters for streaming audio. We also provide a tutorial on how to apply MVDR beamforming to multi-channel audio in the example directory.
- -```python ->>> import torchaudio ->>> from torchaudio.transforms import MVDR, Spectrogram, InverseSpectrogram ->>> ->>> # Load the multi-channel noisy audio ->>> waveform_mix, sr = torchaudio.load('mix.wav') ->>> # Initialize the stft and istft modules ->>> stft = Spectrogram(n_fft=1024, hop_length=256, return_complex=True, power=None) ->>> istft = InverseSpectrogram(n_fft=1024, hop_length=256) ->>> # Get the noisy spectrogram ->>> specgram_mix = stft(waveform_mix) ->>> # Get the Time-Frequency mask via a (hypothetical) machine learning model ->>> mask = model(waveform_mix) ->>> # Initialize the MVDR module ->>> mvdr = MVDR(ref_channel=0, solution="ref_channel", multi_mask=False) ->>> # Apply MVDR beamforming ->>> specgram_enhanced = mvdr(specgram_mix, mask) ->>> # Get the enhanced waveform via iSTFT ->>> waveform_enhanced = istft(specgram_enhanced, length=waveform_mix.shape[-1]) -``` -Please refer to the [documentation](https://pytorch.org/audio/0.10.0/transforms.html#mvdr) for more details and try out this feature using the MVDR tutorial. - -### (Beta) RNN Transducer Loss -The RNN transducer (RNNT) loss is part of the RNN transducer pipeline, which is a popular architecture for speech recognition tasks. Recently it has gained attention for use in streaming settings, and it has also achieved state-of-the-art WER for the LibriSpeech benchmark. - -TorchAudio’s loss function supports float16 and float32 logits, has autograd and torchscript support, and can be run on both CPU and GPU; the GPU path has a custom CUDA kernel implementation for improved performance. The implementation is consistent with the original loss function in [Sequence Transduction with Recurrent Neural Networks](https://arxiv.org/pdf/1211.3711.pdf), but relies on code from [Alignment Restricted Streaming Recurrent Neural Network Transducer](https://arxiv.org/pdf/2011.03072.pdf). Special thanks to Jay Mahadeokar and Ching-Feng Yeh for their code contributions and guidance. - -Please refer to the [documentation](https://pytorch.org/audio/0.10.0/transforms.html#rnntloss) for more details. - -### (Beta) Batch support and filter bank support -`torchaudio.functional.lfilter` now supports batch processing and multiple filters. - -### (Prototype) Emformer Module -Automatic speech recognition (ASR) research and productization have increasingly focused on on-device applications. Towards supporting such efforts, TorchAudio now includes [Emformer](https://arxiv.org/abs/2010.10759), a memory-efficient transformer architecture that has achieved state-of-the-art results on LibriSpeech in low-latency streaming scenarios, as a prototype feature. - -Please refer to the [documentation](https://pytorch.org/audio/main/prototype.html#emformer) for more details. - -### GPU Build -GPU builds that support custom CUDA kernels in TorchAudio, like the one being used for the RNN transducer loss, have been added. Following this change, TorchAudio’s binary distribution now includes CPU-only versions and CUDA-enabled versions. To use CUDA-enabled binaries, PyTorch also needs to be compatible with CUDA. - -# TorchVision 0.11 - -### (Stable) New Models -[RegNet](https://arxiv.org/abs/2003.13678) and [EfficientNet](https://arxiv.org/abs/1905.11946) are two popular architectures that can be scaled to different computational budgets. In this release we include 22 pre-trained weights for their classification variants.
The models were trained on ImageNet and the accuracies of the pre-trained models obtained on ImageNet val can be found below (see [#4403](https://github.com/pytorch/vision/pull/4403#issuecomment-930381524), [#4530](https://github.com/pytorch/vision/pull/4530#issuecomment-933213238) and [#4293](https://github.com/pytorch/vision/pull/4293) for more details). - -The models can be used as follows: - -```python -import torch -from torchvision import models - -x = torch.rand(1, 3, 224, 224) - -regnet = models.regnet_y_400mf(pretrained=True) -regnet.eval() -predictions = regnet(x) - -efficientnet = models.efficientnet_b0(pretrained=True) -efficientnet.eval() -predictions = efficientnet(x) -``` -See the full list of new models on the [torchvision.models](https://pytorch.org/vision/master/models.html) documentation page. - -We would like to thank Ross Wightman and Luke Melas-Kyriazi for contributing the weights of the EfficientNet variants. - -### (Beta) FX-based Feature Extraction -A new Feature Extraction method has been added to our utilities. It uses [torch.fx](https://pytorch.org/docs/stable/fx.html) and enables us to retrieve the outputs of intermediate layers of a network which is useful for feature extraction and visualization. - -Here is an example of how to use the new utility: - -```python -import torch -from torchvision.models import resnet50 -from torchvision.models.feature_extraction import create_feature_extractor - - -x = torch.rand(1, 3, 224, 224) - -model = resnet50() - -return_nodes = { -"layer4.2.relu_2": "layer4" -} -model2 = create_feature_extractor(model, return_nodes=return_nodes) -intermediate_outputs = model2(x) - -print(intermediate_outputs['layer4'].shape) -``` -We would like to thank Alexander Soare for developing this utility. - -### (Stable) New Data Augmentations -Two new Automatic Augmentation techniques were added: [RandAugment](https://arxiv.org/abs/1909.13719) and [Trivial Augment](https://arxiv.org/abs/2103.10158). They apply a series of transformations on the original data to enhance them and to boost the performance of the models. The new techniques build on top of the previously added [AutoAugment](https://github.com/pytorch/vision/pull/3123) and focus on simplifying the approach, reducing the search space for the optimal policy and improving the performance gain in terms of accuracy. These techniques enable users to reproduce recipes to achieve state-of-the-art performance on the offered models. Additionally, it enables users to apply these techniques in order to do transfer learning and achieve optimal accuracy on new datasets. - -Both methods can be used as drop-in replacement of the AutoAugment technique as seen below: - -```python -from torchvision import transforms - -t = transforms.RandAugment() -# t = transforms.TrivialAugmentWide() -transformed = t(image) - -transform = transforms.Compose([ -transforms.Resize(256), -transforms.RandAugment(), # transforms.TrivialAugmentWide() -transforms.ToTensor()]) -``` -Read the [automatic augmentation transforms](https://pytorch.org/vision/master/transforms.html#automatic-augmentation-transforms) for more details. - -We would like to thank Samuel G. Müller for contributing to Trivial Augment and for his help on refactoring the AA package. 
- -### Updated Training Recipes -We have updated our training reference scripts to add support for Exponential Moving Average, Label Smoothing, Learning-Rate Warmup, [Mixup](https://arxiv.org/abs/1710.09412), [Cutmix](https://arxiv.org/abs/1905.04899) and other [SOTA primitives](https://github.com/pytorch/vision/issues/3911). The above enabled us to improve the classification Acc@1 of some pre-trained models by over 4 points. A major update of the existing pre-trained weights is expected in the next release. - -Thanks for reading. If you’re interested in these updates and want to join the PyTorch community, we encourage you to join [the discussion](https://discuss.pytorch.org/) forums and [open GitHub issues](https://github.com/pytorch/pytorch/issues). To get the latest news from PyTorch, follow us on [Twitter](https://twitter.com/PyTorch), [Medium](https://medium.com/pytorch), [YouTube](https://www.youtube.com/pytorch) and [LinkedIn](https://www.linkedin.com/company/pytorch). - -Cheers! -Team PyTorch diff --git a/_posts/2021-10-21-pytorch-1.10-released.md b/_posts/2021-10-21-pytorch-1.10-released.md deleted file mode 100644 index df1666304a77..000000000000 --- a/_posts/2021-10-21-pytorch-1.10-released.md +++ /dev/null @@ -1,94 +0,0 @@ ---- -layout: blog_detail -title: 'PyTorch 1.10 Release, including CUDA Graphs APIs, Frontend and Compiler Improvements' -author: Team PyTorch ---- - -We are excited to announce the release of PyTorch 1.10. This release is composed of over 3,400 commits since 1.9, made by 426 contributors. We want to sincerely thank our community for continuously improving PyTorch. - -PyTorch 1.10 updates are focused on improving training and performance of PyTorch, and developer usability. The full release notes are available [here](https://github.com/pytorch/pytorch/releases/tag/v1.10.0). Highlights include: -1. CUDA Graphs APIs are integrated to reduce CPU overheads for CUDA workloads. -2. Several frontend APIs such as FX, torch.special, and nn.Module Parametrization, have moved from beta to stable. -3. Support for automatic fusion in JIT Compiler expands to CPUs in addition to GPUs. -4. Android NNAPI support is now available in beta. - -Along with 1.10, we are also releasing major updates to the PyTorch libraries, which you can read about in [this blog post](https://pytorch.org/blog/pytorch-1.10-new-library-releases/). - - - -# Frontend APIs - -### (Stable) Python code transformations with FX - -FX provides a Pythonic platform for transforming and lowering PyTorch programs. It is a toolkit for pass writers to facilitate Python-to-Python transformation of functions and nn.Module instances. This toolkit aims to support a subset of Python language semantics—rather than the whole Python language—to facilitate ease of implementation of transforms. With 1.10, FX is moving to stable. - -You can learn more about FX in the [official documentation](https://pytorch.org/docs/master/fx.html) and [GitHub examples](https://github.com/pytorch/examples/tree/master/fx) of program transformations implemented using ```torch.fx```. - -### (Stable) *torch.special* -A ```torch.special module```, analogous to [SciPy’s special module](https://docs.scipy.org/doc/scipy/reference/special.html), is now available in stable. The module has 30 operations, including gamma, Bessel, and (Gauss) error functions. - -Refer to this [documentation](https://pytorch.org/docs/master/special.html) for more details. 
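To make the comparison with SciPy's special module concrete, here is a minimal sketch (the inputs are arbitrary and not taken from the release notes) using a few of the gamma, Bessel, and error functions mentioned in the `torch.special` section above:

```python
import torch

x = torch.linspace(0.1, 2.0, steps=5)

print(torch.special.erf(x))      # Gauss error function, mirroring scipy.special.erf
print(torch.special.gammaln(x))  # log of the absolute value of the gamma function
print(torch.special.i0(x))       # modified Bessel function of the first kind, order 0
```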
- -### (Stable) nn.Module Parametrization -`nn.Module` parametrization, a feature that allows users to parametrize any parameter or buffer of an `nn.Module` without modifying the `nn.Module` itself, is available in stable. This release adds weight normalization (`weight_norm`), orthogonal parametrization (matrix constraints and part of pruning) and more flexibility when creating your own parametrizations. - -Refer to this [tutorial](https://pytorch.org/tutorials/intermediate/parametrizations.html) and the general [documentation](https://pytorch.org/docs/master/generated/torch.nn.utils.parametrizations.spectral_norm.html?highlight=parametrize) for more details. - -### (Beta) CUDA Graphs APIs Integration -PyTorch now integrates CUDA Graphs APIs to reduce CPU overheads for CUDA workloads. - -CUDA Graphs greatly reduce the CPU overhead for CPU-bound CUDA workloads and thus improve performance by increasing GPU utilization. For distributed workloads, CUDA Graphs also reduce jitter, and since parallel workloads have to wait for the slowest worker, reducing jitter improves overall parallel efficiency. - -Integration allows seamless interop between the parts of the network captured by CUDA graphs, and parts of the network that cannot be captured due to graph limitations. - -Read the [note](https://pytorch.org/docs/master/notes/cuda.html#cuda-graphs) for more details and examples, and refer to the general [documentation](https://pytorch.org/docs/master/generated/torch.cuda.CUDAGraph.html#torch.cuda.CUDAGraph) for additional information. - -### [Beta] Conjugate View -PyTorch’s conjugation for complex tensors ([torch.conj()](https://pytorch.org/docs/1.10.0/generated/torch.conj.html?highlight=conj#torch.conj)) is now a constant-time operation, and returns a view of the input tensor with a conjugate bit set, as can be seen by calling [torch.is_conj()](https://pytorch.org/docs/1.10.0/generated/torch.is_conj.html?highlight=is_conj#torch.is_conj). This has already been leveraged in various other PyTorch operations, like matrix multiplication and dot product, to fuse conjugation with the operation, leading to significant performance gains and memory savings on both CPU and CUDA. - -# Distributed Training - -### Distributed Training Releases Now in Stable -In 1.10, there are a number of features that are moving from beta to stable in the distributed package: -* **(Stable) Remote Module**: This feature allows users to operate a module on a remote worker as if using a local module, where the RPCs are transparent to the user. Refer to this [documentation](https://pytorch.org/docs/master/rpc.html#remotemodule) for more details. -* **(Stable) DDP Communication Hook**: This feature allows users to override how DDP synchronizes gradients across processes. Refer to this [documentation](https://pytorch.org/docs/master/rpc.html#remotemodule) for more details. -* **(Stable) ZeroRedundancyOptimizer**: This feature can be used in conjunction with DistributedDataParallel to reduce the size of per-process optimizer states. With this stable release, it can now handle uneven inputs to different data-parallel workers. Check out this [tutorial](https://pytorch.org/tutorials/advanced/generic_join.html). We also improved the parameter partition algorithm to better balance memory and computation overhead across processes. Refer to this [documentation](https://pytorch.org/docs/master/distributed.optim.html) and this [tutorial](https://pytorch.org/tutorials/recipes/zero_redundancy_optimizer.html) to learn more; a minimal usage sketch follows below.
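The following is a minimal, hedged sketch of the ZeroRedundancyOptimizer pattern referenced above, not taken from the post itself. It assumes a standard one-process-per-GPU data-parallel launch where `MASTER_ADDR`, `MASTER_PORT`, `RANK`, and `WORLD_SIZE` are provided by the launcher; the model and shapes are arbitrary.

```python
import torch
import torch.distributed as dist
from torch.distributed.optim import ZeroRedundancyOptimizer
from torch.nn.parallel import DistributedDataParallel as DDP

# Assumes the environment variables above are set by the launcher (one process per GPU).
dist.init_process_group(backend="nccl")
rank = dist.get_rank()
torch.cuda.set_device(rank)

model = DDP(torch.nn.Linear(2000, 2000).to(rank), device_ids=[rank])

# Optimizer states are sharded across workers instead of being replicated,
# which is where the per-process memory savings come from.
optimizer = ZeroRedundancyOptimizer(
    model.parameters(),
    optimizer_class=torch.optim.Adam,
    lr=1e-3,
)

outputs = model(torch.randn(20, 2000, device=rank))
outputs.sum().backward()
optimizer.step()
optimizer.zero_grad()
```

Compared to a plain `torch.optim.Adam`, each worker only materializes its own shard of the Adam state, while gradients are still synchronized by DDP as usual.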
- -# Performance Optimization and Tooling - -### [Beta] Profile-directed typing in TorchScript -TorchScript has a hard requirement for source code to have type annotations in order for compilation to be successful. For a long time, it was only possible to add missing or incorrect type annotations through trial and error (i.e., by fixing the type-checking errors generated by torch.jit.script one by one), which was inefficient and time consuming. - -Now, we have enabled profile directed typing for torch.jit.script by leveraging existing tools like MonkeyType, which makes the process much easier, faster, and more efficient. For more details, refer to the [documentation](https://pytorch.org/docs/1.9.0/jit.html). - -### (Beta) CPU Fusion -In PyTorch 1.10, we've added an LLVM-based JIT compiler for CPUs that can fuse together sequences of `torch` library calls to improve performance. While we've had this capability for some time on GPUs, this release is the first time we've brought compilation to the CPU. -You can check out a few performance results for yourself in this [Colab notebook](https://colab.research.google.com/drive/1xaH-L0XjsxUcS15GG220mtyrvIgDoZl6?usp=sharing). - -### (Beta) PyTorch Profiler -The objective of PyTorch Profiler is to target the execution steps that are the most costly in time and/or memory, and visualize the workload distribution between GPUs and CPUs. PyTorch 1.10 includes the following key features: - -* **Enhanced Memory View**: This helps you understand your memory usage better. This tool will help you avoid Out of Memory errors by showing active memory allocations at various points of your program run. -* **Enhanced Automated Recommendations**: This helps provide automated performance recommendations to help optimize your model. The tools recommend changes to batch size, TensorCore, memory reduction technologies, etc. -* **Enhanced Kernel View**: Additional columns show grid and block sizes as well as shared memory usage and registers per thread. -* **Distributed Training**: Gloo is now supported for distributed training jobs. -* **Correlate Operators in the Forward & Backward Pass**: This helps map the operators found in the forward pass to the backward pass, and vice versa, in a trace view. -* **TensorCore**: This tool shows the Tensor Core (TC) usage and provides recommendations for data scientists and framework developers. -* **NVTX**: Support for NVTX markers was ported from the legacy autograd profiler. -* **Support for profiling on mobile devices**: The PyTorch profiler now has better integration with TorchScript and mobile backends, enabling trace collection for mobile workloads. - -Refer to this [documentation](https://pytorch.org/docs/stable/profiler.html) for details. Check out this [tutorial](https://pytorch.org/tutorials/recipes/recipes/profiler_recipe.html) to learn how to get started with this feature. - -# PyTorch Mobile - -### (Beta) Android NNAPI Support in Beta -Last year we [released prototype support](https://medium.com/pytorch/pytorch-mobile-now-supports-android-nnapi-e2a2aeb74534) for Android’s Neural Networks API (NNAPI). NNAPI allows Android apps to run computationally intensive neural networks on the most powerful and efficient parts of the chips that power mobile phones, including GPUs (Graphics Processing Units) and NPUs (specialized Neural Processing Units). - -Since the prototype we’ve added more op coverage, added support for load-time flexible shapes and ability to run the model on the host for testing. 
Try out this feature using the [tutorial](https://pytorch.org/tutorials/prototype/nnapi_mobilenetv2.html). - -Additionally, Transfer Learning steps have been added to Object Detection examples. Check out this [GitHub page](https://github.com/pytorch/android-demo-app/tree/master/ObjectDetection#transfer-learning) to learn more. Please provide your feedback or ask questions on the [forum](https://discuss.pytorch.org/c/mobile/18). You can also check out [this presentation](https://www.youtube.com/watch?v=B-2spa3UCTU) to get an overview. - -Thanks for reading. If you’re interested in these updates and want to join the PyTorch community, we encourage you to join the [discussion forums](https://discuss.pytorch.org/) and [open GitHub issues](https://github.com/pytorch/pytorch/issues). To get the latest news from PyTorch, follow us on [Twitter](https://twitter.com/PyTorch), [Medium](https://medium.com/pytorch), [YouTube](https://www.youtube.com/pytorch), and [LinkedIn](https://www.linkedin.com/company/pytorch). - -Cheers! -Team PyTorch diff --git a/_posts/2021-10-26-accelerating-pytorch-with-cuda-graphs.md b/_posts/2021-10-26-accelerating-pytorch-with-cuda-graphs.md deleted file mode 100644 index a5b38f0b8030..000000000000 --- a/_posts/2021-10-26-accelerating-pytorch-with-cuda-graphs.md +++ /dev/null @@ -1,288 +0,0 @@ ---- -layout: blog_detail -title: 'Accelerating PyTorch with CUDA Graphs' -author: Vinh Nguyen, Michael Carilli, Sukru Burc Eryilmaz, Vartika Singh, Michelle Lin, Natalia Gimelshein, Alban Desmaison, Edward Yang -featured-img: 'assets/images/cudagraphs-pytorch.png' ---- - -Today, we are pleased to announce a new advanced CUDA feature, CUDA Graphs, has been brought to PyTorch. Modern DL frameworks have complicated software stacks that incur significant overheads associated with the submission of each operation to the GPU. When DL workloads are strong-scaled to many GPUs for performance, the time taken by each GPU operation diminishes to just a few microseconds and, in these cases, the high work submission latencies of frameworks often lead to low utilization of the GPU. As GPUs get faster and workloads are scaled to more devices, the likelihood of workloads suffering from these launch-induced stalls increases. To overcome these performance overheads, NVIDIA engineers worked with PyTorch developers to enable CUDA graph execution natively in PyTorch. This design was instrumental in scaling NVIDIA’s MLPerf workloads (implemented in PyTorch) to over 4000 GPUs in order to achieve [record-breaking performance](https://blogs.nvidia.com/blog/2021/06/30/mlperf-ai-training-partners/). - -
- -CUDA graphs support in PyTorch is just one more example of a long collaboration between NVIDIA and Facebook engineers. [torch.cuda.amp](https://pytorch.org/docs/stable/amp.html), for example, trains with half precision while maintaining the network accuracy achieved with single precision and automatically utilizing tensor cores wherever possible. AMP delivers up to 3X higher performance than FP32 with just a few lines of code change. Similarly, NVIDIA’s [Megatron-LM](https://github.com/NVIDIA/Megatron-LM) was trained using PyTorch on up to 3072 GPUs. In PyTorch, one of the most performant methods to scale out GPU training is with [torch.nn.parallel.DistributedDataParallel](https://pytorch.org/docs/stable/generated/torch.nn.parallel.DistributedDataParallel.html#torch.nn.parallel.DistributedDataParallel) coupled with the NVIDIA Collective Communications Library ([NCCL](https://developer.nvidia.com/nccl)) backend. - - -# CUDA Graphs - - -[CUDA Graphs](https://developer.nvidia.com/blog/cuda-10-features-revealed/), which made their debut in CUDA 10, let a series of CUDA kernels be defined and encapsulated as a single unit, i.e., a graph of operations, rather than a sequence of individually-launched operations. They provide a mechanism to launch multiple GPU operations through a single CPU operation, and hence reduce the launching overheads. - -The benefits of CUDA graphs can be demonstrated with the simple example in Figure 1. On the top, a sequence of short kernels is launched one-by-one by the CPU. The CPU launching overhead creates a significant gap in between the kernels. If we replace this sequence of kernels with a CUDA graph, initially we will need to spend a little extra time on building the graph and launching the whole graph in one go on the first occasion, but subsequent executions will be very fast, as there will be very little gap between the kernels. The difference is more pronounced when the same sequence of operations is repeated many times, for example, over many training steps. In that case, the initial costs of building and launching the graph will be amortized over the entire number of training iterations. For a more comprehensive introduction on the topic, see our blog - [Getting Started with CUDA Graphs](https://developer.nvidia.com/blog/cuda-graphs) and GTC talk [Effortless CUDA Graphs](https://www.nvidia.com/en-us/on-demand/session/gtcspring21-s32082/).

-*Figure 1. Benefits of using CUDA graphs.* CUDA graphs reduce launching overhead by bundling multiple GPU operations into a single launchable unit, i.e., a graph. On the top, you can see five individual launches; whereas on the bottom, with CUDA graphs, they are all bundled into a single launch, reducing overhead. -
        - - -## NCCL support for CUDA graphs - - -The previously mentioned benefits of reducing launch overheads also extend to NCCL kernel launches. NCCL enables GPU-based collective and P2P communications. With [NCCL support for CUDA graphs](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/usage/cudagraph.html), we can eliminate the NCCL kernel launch overhead. - -Additionally, kernel launch timing can be unpredictable due to various CPU load and operating system factors. Such time skews can be harmful to the performance of NCCL collective operations. With CUDA graphs, kernels are clustered together so that performance is consistent across ranks in a distributed workload. This is especially useful in large clusters where even a single slow node can bring down overall cluster level performance. - -For distributed multi-GPU workloads, NCCL is used for collective communications. If we look at training a neural network that leverages data parallelism, without NCCL support for CUDA graphs, we’ll need a separate launch for each of forward/back propagation and NCCL AllReduce. By contrast, with NCCL support for CUDA graphs, we can reduce launch overhead by lumping together the forward/backward propagation and NCCL AllReduce all in a single graph launch. - - -

        -With NCCL CUDA graph support, all the kernel launches for NCCL AllReduce for the forward/backward propagation can be bundled into a graph to reduce launch overhead. -
        - Figure 2. Looking at a typical neural network, all the kernel launches for NCCL AllReduce can be bundled into a graph to reduce launch overhead. -

        - - -# PyTorch CUDA Graphs - - -Starting with PyTorch v1.10, CUDA graphs functionality is made available as a set of beta APIs. - -### API overview - -PyTorch supports the construction of CUDA graphs using [stream capture](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#creating-a-graph-using-stream-capture), which puts a CUDA stream in capture mode. CUDA work issued to a capturing stream doesn’t actually run on the GPU. Instead, the work is recorded in a graph. After capture, the graph can be launched to run the GPU work as many times as needed. Each replay runs the same kernels with the same arguments. For pointer arguments this means the same memory addresses are used. By filling input memory with new data (e.g., from a new batch) before each replay, you can rerun the same work on new data. - -Replaying a graph sacrifices the dynamic flexibility of typical eager execution in exchange for greatly reduced CPU overhead. A graph’s arguments and kernels are fixed, so a graph replay skips all layers of argument setup and kernel dispatch, including Python, C++, and CUDA driver overheads. Under the hood, a replay submits the entire graph’s work to the GPU with a single call to [cudaGraphLaunch](https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__GRAPH.html#group__CUDART__GRAPH_1g1accfe1da0c605a577c22d9751a09597). Kernels in a replay also execute slightly faster on the GPU, but eliding CPU overhead is the main benefit. - -You should try CUDA graphs if all or part of your network is graph-safe (usually this means static shapes and static control flow, but see the other [constraints](https://pytorch.org/docs/master/notes/cuda.html#constraints)) and you suspect its runtime is at least somewhat CPU-limited. - -### API example - -PyTorch exposes graphs via a raw [`torch.cuda.CUDAGraph`](https://pytorch.org/docs/master/generated/torch.cuda.CUDAGraph.html#torch.cuda.CUDAGraph) class and two convenience wrappers, [`torch.cuda.graph`](https://pytorch.org/docs/master/generated/torch.cuda.graph.html#torch.cuda.graph) and [`torch.cuda.make_graphed_callables`](https://pytorch.org/docs/master/generated/torch.cuda.make_graphed_callables.html#torch.cuda.make_graphed_callables). - -[`torch.cuda.graph`](https://pytorch.org/docs/master/generated/torch.cuda.graph.html#torch.cuda.graph) is a simple, versatile context manager that captures CUDA work in its context. Before capture, warm up the workload to be captured by running a few eager iterations. Warmup must occur on a side stream. Because the graph reads from and writes to the same memory addresses in every replay, you must maintain long-lived references to tensors that hold input and output data during capture. To run the graph on new input data, copy new data to the capture’s input tensor(s), replay the graph, then read the new output from the capture’s output tensor(s). - -If the entire network is capture-safe, one can capture and replay the whole network as in the following example. 
- -```python -import torch - -N, D_in, H, D_out = 640, 4096, 2048, 1024 -model = torch.nn.Sequential(torch.nn.Linear(D_in, H), - torch.nn.Dropout(p=0.2), - torch.nn.Linear(H, D_out), - torch.nn.Dropout(p=0.1)).cuda() -loss_fn = torch.nn.MSELoss() -optimizer = torch.optim.SGD(model.parameters(), lr=0.1) - -# Placeholders used for capture -static_input = torch.randn(N, D_in, device='cuda') -static_target = torch.randn(N, D_out, device='cuda') - -# warmup -# Uses static_input and static_target here for convenience, -# but in a real setting, because the warmup includes optimizer.step(), -# you must use a few batches of real data. -s = torch.cuda.Stream() -s.wait_stream(torch.cuda.current_stream()) -with torch.cuda.stream(s): - for i in range(3): - optimizer.zero_grad(set_to_none=True) - y_pred = model(static_input) - loss = loss_fn(y_pred, static_target) - loss.backward() - optimizer.step() -torch.cuda.current_stream().wait_stream(s) - -# capture -g = torch.cuda.CUDAGraph() -# Sets grads to None before capture, so backward() will create -# .grad attributes with allocations from the graph's private pool -optimizer.zero_grad(set_to_none=True) -with torch.cuda.graph(g): - static_y_pred = model(static_input) - static_loss = loss_fn(static_y_pred, static_target) - static_loss.backward() - optimizer.step() - -real_inputs = [torch.rand_like(static_input) for _ in range(10)] -real_targets = [torch.rand_like(static_target) for _ in range(10)] - -for data, target in zip(real_inputs, real_targets): - # Fills the graph's input memory with new data to compute on - static_input.copy_(data) - static_target.copy_(target) - # replay() includes forward, backward, and step. - # You don't even need to call optimizer.zero_grad() between iterations - # because the captured backward refills static .grad tensors in place. - g.replay() - # Params have been updated. static_y_pred, static_loss, and .grad - # attributes hold values from computing on this iteration's data. -``` - -If some of your network is unsafe to capture (e.g., due to dynamic control flow, dynamic shapes, CPU syncs, or essential CPU-side logic), you can run the unsafe part(s) eagerly and use [`torch.cuda.make_graphed_callables`](https://pytorch.org/docs/master/generated/torch.cuda.make_graphed_callables.html#torch.cuda.make_graphed_callables) to graph only the capture-safe part(s). This is demonstrated next. - -[`make_graphed_callables`](https://pytorch.org/docs/master/generated/torch.cuda.make_graphed_callables.html#torch.cuda.make_graphed_callables) accepts callables (functions or [`nn.Module`](https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module)s) and returns graphed versions. By default, callables returned by [`make_graphed_callables`](https://pytorch.org/docs/master/generated/torch.cuda.make_graphed_callables.html#torch.cuda.make_graphed_callables) are autograd-aware, and can be used in the training loop as direct replacements for the functions or [`nn.Module`](https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module)s you passed. [`make_graphed_callables`](https://pytorch.org/docs/master/generated/torch.cuda.make_graphed_callables.html#torch.cuda.make_graphed_callables) internally creates [`CUDAGraph`](https://pytorch.org/docs/master/generated/torch.cuda.CUDAGraph.html#torch.cuda.CUDAGraph) objects, runs warmup iterations, and maintains static inputs and outputs as needed. 
Therefore, (unlike with [`torch.cuda.graph`](https://pytorch.org/docs/master/generated/torch.cuda.graph.html#torch.cuda.graph)) you don’t need to handle those manually. - -In the following example, data-dependent dynamic control flow means the network isn’t capturable end-to-end, but [`make_graphed_callables`](https://pytorch.org/docs/master/generated/torch.cuda.make_graphed_callables.html#torch.cuda.make_graphed_callables)() lets us capture and run graph-safe sections as graphs regardless: - - -```python -import torch -from itertools import chain - -N, D_in, H, D_out = 640, 4096, 2048, 1024 - -module1 = torch.nn.Linear(D_in, H).cuda() -module2 = torch.nn.Linear(H, D_out).cuda() -module3 = torch.nn.Linear(H, D_out).cuda() - -loss_fn = torch.nn.MSELoss() -optimizer = torch.optim.SGD(chain(module1.parameters(), - module2.parameters(), - module3.parameters()), - lr=0.1) - -# Sample inputs used for capture -# requires_grad state of sample inputs must match -# requires_grad state of real inputs each callable will see. -x = torch.randn(N, D_in, device='cuda') -h = torch.randn(N, H, device='cuda', requires_grad=True) - -module1 = torch.cuda.make_graphed_callables(module1, (x,)) -module2 = torch.cuda.make_graphed_callables(module2, (h,)) -module3 = torch.cuda.make_graphed_callables(module3, (h,)) - -real_inputs = [torch.rand_like(x) for _ in range(10)] -real_targets = [torch.randn(N, D_out, device="cuda") for _ in range(10)] - -for data, target in zip(real_inputs, real_targets): - optimizer.zero_grad(set_to_none=True) - - tmp = module1(data) # forward ops run as a graph - - if tmp.sum().item() > 0: - tmp = module2(tmp) # forward ops run as a graph - else: - tmp = module3(tmp) # forward ops run as a graph - - loss = loss_fn(tmp, target) - # module2's or module3's (whichever was chosen) backward ops, - # as well as module1's backward ops, run as graphs - loss.backward() - optimizer.step() -``` - -# Example use cases -## MLPerf v1.0 training workloads - -The PyTorch CUDA graphs functionality was instrumental in scaling NVIDIA’s MLPerf training v1.0 workloads (implemented in PyTorch) to over 4000 GPUs, setting new [records across the board](https://blogs.nvidia.com/blog/2021/06/30/mlperf-ai-training-partners/). We illustrate below two MLPerf workloads where the most significant gains were observed with the use of CUDA graphs, yielding up to ~1.7x speedup. - -| | Number of GPUs | Speedup from CUDA-graphs | -|-----------------------------|----------------:|-------------------------:| -| Mask R-CNN | 272 | 1.70× | -| BERT | 4096 | 1.12× | - -Table 1. MLPerf training v1.0 performance improvement with PyTorch CUDA graph. - -### Mask R-CNN - -Deep learning frameworks use GPUs to accelerate computations, but a significant amount of code still runs on CPU cores. CPU cores process meta-data like tensor shapes in order to prepare arguments needed to launch GPU kernels. Processing meta-data is a fixed cost while the cost of the computational work done by the GPUs is positively correlated with batch size. For large batch sizes, CPU overhead is a negligible percentage of total run time cost, but at small batch sizes CPU overhead can become larger than GPU run time. When that happens, GPUs go idle between kernel calls. This issue can be identified on an NSight timeline plot in Figure 3. The plot below shows the “backbone” portion of Mask R-CNN with a per-GPU batch size of 1 before graphing. The green portion shows CPU load while the blue portion shows GPU load. 
In this profile we see that the CPU is maxed out at 100% load while the GPU is idle most of the time; there is a lot of empty space between GPU kernels. - 

        -NSight timeline plot of Mask R-CNN shows that the CPU is maxed out at 100% load while the GPU is idle most of the time, with a lot of empty space between GPU kernels -
        - Figure 3: NSight timeline plot of Mask R-CNN -

        - -CUDA graphs can automatically eliminate CPU overhead when tensor shapes are static. A complete graph of all the kernel calls is captured during the first step; in subsequent steps, the entire graph is launched with a single op, eliminating all the CPU overhead, as observed in Figure 4. - -

        -With CUDA graph, the entire graph is launched with a single op, eliminating all the CPU overhead -
        - Figure 4: CUDA graphs optimization -

        - -With graphing, we see that the GPU kernels are tightly packed and GPU utilization remains high. The graphed portion now runs in 6 ms instead of 31 ms, a speedup of 5x. We did not graph the entire model, mostly just the ResNet backbone, which resulted in an overall speedup of ~1.7x. -In order to increase the scope of the graph, we made some changes in the software stack to eliminate some of the CPU-GPU synchronization points. In MLPerf v1.0, this work included changing the implementation of the torch.randperm function to use CUB instead of Thrust, because the latter is a synchronous C++ template library. These improvements are available in the latest NGC container. - - -### BERT - -Similarly, by graph capturing the model, we eliminate CPU overhead and the accompanying synchronization overhead. The CUDA graphs implementation results in a 1.12x performance boost for our max-scale BERT configuration. To maximize the benefits from CUDA graphs, it is important to keep the scope of the graph as large as possible. To achieve this, we modified the model script to remove CPU-GPU synchronizations during execution so that the full model can be graph captured. Furthermore, we also made sure that the tensor sizes during execution are static within the scope of the graph. For instance, in BERT, only a specific subset of the total tokens contributes to the loss function, determined by a pre-generated mask tensor. Extracting the indices of valid tokens from this mask, and using these indices to gather the tokens that contribute to the loss, results in a tensor with a dynamic shape, i.e., a shape that is not constant across iterations. To make sure tensor sizes are static, instead of using the dynamic-shape tensors in the loss computation, we used static-shape tensors where a mask indicates which elements are valid. As a result, all tensor shapes are static. Dynamic shapes also require CPU-GPU synchronization, since they involve the framework’s memory management on the CPU side. With static-only shapes, no CPU-GPU synchronizations are necessary. This is shown in Figure 5. - - -

        - Synchronization free training eliminates CPU synchronization -
        - Figure 5. By using a fixed-size tensor and a boolean mask as described in the text, we are able to eliminate the CPU synchronizations needed for dynamically sized tensors -
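-
-To make the static-shape masking idea concrete, here is a minimal, hedged sketch; the shapes and the masking ratio below are illustrative assumptions, not the actual MLPerf BERT code:
-
-```python
-import torch
-
-batch, seq_len, vocab = 8, 512, 30522          # illustrative sizes
-logits = torch.randn(batch * seq_len, vocab, device='cuda')
-labels = torch.randint(vocab, (batch * seq_len,), device='cuda')
-valid = torch.rand(batch * seq_len, device='cuda') < 0.15  # pre-generated mask
-
-# Dynamic-shape version (not capture-safe): the gathered tensor's size
-# changes every iteration and pulls memory management back onto the CPU.
-#   idx = valid.nonzero(as_tuple=True)[0]
-#   loss = torch.nn.functional.cross_entropy(logits[idx], labels[idx])
-
-# Static-shape version: compute a per-token loss for every position,
-# zero out the invalid ones, and normalize by the number of valid tokens.
-per_token = torch.nn.functional.cross_entropy(logits, labels, reduction='none')
-loss = (per_token * valid).sum() / valid.sum().clamp(min=1)
-```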

        - - -## CUDA graphs in NVIDIA DL examples collection - -Single-GPU use cases can also benefit from using CUDA graphs. This is particularly true for workloads launching many short kernels with small batches. A good example is training and inference for recommender systems. Below we present preliminary benchmark results for NVIDIA's implementation of the Deep Learning Recommendation Model (DLRM) from our Deep Learning Examples collection. Using CUDA graphs for this workload provides significant speedups for both training and inference. The effect is particularly visible when using very small batch sizes, where CPU overheads are more pronounced. - -CUDA graphs are being actively integrated into other PyTorch NGC model scripts and the NVIDIA GitHub deep learning examples. Stay tuned for more examples of how to use them. - - -

        - CUDA graphs optimization for the DLRM model. The impact is larger for smaller batch sizes where CPU overheads are more pronounced. -

        -

        - CUDA graphs optimization for the DLRM model. The impact is larger for smaller batch sizes where CPU overheads are more pronounced. -
        - Figure 6: CUDA graphs optimization for the DLRM model. -

        - - -# Call to action: CUDA Graphs in PyTorch v1.10 - -CUDA graphs can provide substantial benefits for workloads that comprise many small GPU kernels and hence are bogged down by CPU launch overheads. This has been demonstrated in our MLPerf efforts optimizing PyTorch models. Many of these optimizations, including CUDA graphs, have been or will eventually be integrated into our PyTorch NGC model scripts [collection](https://ngc.nvidia.com/catalog/collections?orderBy=scoreDESC&pageNumber=0&query=pytorch&quickFilter=&filters=) and the NVIDIA [GitHub deep learning examples](https://github.com/NVIDIA/DeepLearningExamples/tree/master/PyTorch/). For now, check out our open-source MLPerf training v1.0 [implementation](https://github.com/mlcommons/training_results_v1.0/tree/master/NVIDIA), which could serve as a good starting point to see CUDA graphs in action. Alternatively, try the PyTorch CUDA graphs API on your own workloads. - -We thank many NVIDIANs and Facebook engineers for their discussions and suggestions: -[Karthik Mandakolathur US](mailto:karthik@nvidia.com), -[Tomasz Grel](mailto:tgrel@nvidia.com), -[Joey Conway](mailto:jconway@nvidia.com), -[Arslan Zulfiqar US](mailto:azulfiqar@nvidia.com) - -## Author bios - -[**Vinh Nguyen**](mailto:vinhn@nvidia.com) -*DL Engineer, NVIDIA* - -Vinh is a deep learning engineer and data scientist who has published more than 50 scientific articles attracting more than 2500 citations. At NVIDIA, his work spans a wide range of deep learning and AI applications, including speech, language and vision processing, and recommender systems. - -[**Michael Carilli**](mailto:mcarilli@nvidia.com) -*Senior Developer Technology Engineer, NVIDIA* - -Michael worked at the Air Force Research Laboratory optimizing CFD code for modern parallel architectures. He holds a PhD in computational physics from the University of California, Santa Barbara. A member of the PyTorch team, he focuses on making GPU training fast, numerically stable, and easy(er) for internal teams, external customers, and PyTorch community users. - -[**Sukru Burc Eryilmaz**](mailto:seryilmaz@nvidia.com) -*Senior Architect in Dev Arch, NVIDIA* - -Sukru received his PhD from Stanford University and his B.S. from Bilkent University. He currently works on improving the end-to-end performance of neural network training both at single-node scale and supercomputer scale. - -[**Vartika Singh**](mailto:vartikas@nvidia.com) -*Tech Partner Lead for DL Frameworks and Libraries, NVIDIA* - -Vartika has led teams working at the confluence of cloud and distributed computing, scaling, and AI, influencing the design and strategy of major corporations. She currently works with major framework and compiler organizations and developers within and outside NVIDIA to help their designs work efficiently and optimally on NVIDIA hardware. - -[**Michelle Lin**](mailto:miclin@nvidia.com) -*Product Intern, NVIDIA* - -Michelle is currently pursuing an undergraduate degree in Computer Science and Business Administration at UC Berkeley. She is currently managing the execution of projects such as conducting market research and creating marketing assets for Magnum IO. - -[**Natalia Gimelshein**](mailto:ngimel@fb.com) -*Applied Research Scientist, Facebook* - -Natalia Gimelshein worked on GPU performance optimization for deep learning workloads at NVIDIA and Facebook. She is currently a member of the PyTorch core team, working with partners to seamlessly support new software and hardware features. 
- -[**Alban Desmaison**](mailto:albandes@fb.com) -*Research Engineer, Facebook* - -Alban studied engineering and did a PhD in Machine Learning and Optimization, during which he was an OSS contributor to PyTorch prior to joining Facebook. His main responsibilities are maintaining some core library and features (autograd, optim, nn) and working on making PyTorch better in general. - -[**Edward Yang**](mailto:ezyang@fb.com) -*Research Engineer, Facebook* - -Edward studied CS at MIT and then Stanford before starting at Facebook. He is a part of the PyTorch core team and is one of the leading contributors to PyTorch. diff --git a/_posts/2021-10-29-FX-feature-extraction-torchvision.md b/_posts/2021-10-29-FX-feature-extraction-torchvision.md deleted file mode 100644 index 0413c262b4f5..000000000000 --- a/_posts/2021-10-29-FX-feature-extraction-torchvision.md +++ /dev/null @@ -1,397 +0,0 @@ ---- -layout: blog_detail -title: 'Feature Extraction in TorchVision using Torch FX' -author: Alexander Soare and Francisco Massa -featured-img: 'assets/images/fx-image2.png' ---- - - - -# Introduction - -[FX](https://pytorch.org/docs/stable/fx.html) based feature extraction is a new [TorchVision utility](https://pytorch.org/vision/stable/feature_extraction.html) that lets us access intermediate transformations of an input during the forward pass of a PyTorch Module. It does so by symbolically tracing the forward method to produce a graph where each node represents a single operation. Nodes are named in a human-readable manner such that one may easily specify which nodes they want to access. - -Did that all sound a little complicated? Not to worry as there’s a little in this article for everyone. Whether you’re a beginner or an advanced deep-vision practitioner, chances are you will want to know about FX feature extraction. If you still want more background on feature extraction in general, read on. If you’re already comfortable with that and want to know how to do it in PyTorch, skim ahead to Existing Methods in PyTorch: Pros and Cons. And if you already know about the challenges of doing feature extraction in PyTorch, feel free to skim forward to FX to The Rescue. - - -## A Recap On Feature Extraction - -We’re all used to the idea of having a deep neural network (DNN) that takes inputs and produces outputs, and we don’t necessarily think of what happens in between. Let’s just consider a ResNet-50 classification model as an example: - -

        - ResNet-50 takes an image of a bird and transforms that into the abstract concept 'bird' -
        - Figure 1: ResNet-50 takes an image of a bird and transforms that into the abstract concept "bird". Source: Bird image from ImageNet. -

        - -We know, though, that there are many sequential “layers” within the ResNet-50 architecture that transform the input step-by-step. In Figure 2 below, we peek under the hood to show the layers within ResNet-50, and we also show the intermediate transformations of the input as it passes through those layers. - -

        - ResNet-50 transforms the input image in multiple steps. Conceptually, we may access the intermediate transformation of the image after each one of these steps. -
        - Figure 2: ResNet-50 transforms the input image in multiple steps. Conceptually, we may access the intermediate transformation of the image after each one of these steps. Source: Bird image from ImageNet. -

        - - -## Existing Methods In PyTorch: Pros and Cons - -There were already a few ways of doing feature extraction in PyTorch prior to FX based feature extraction being introduced. - -To illustrate these, let’s consider a simple convolutional neural network that does the following - -* Applies several “blocks” each with several convolution layers within. -* After several blocks, it uses a global average pool and flatten operation. -* Finally it uses a single output classification layer. - -```python -import torch -from torch import nn - - -class ConvBlock(nn.Module): - """ - Applies `num_layers` 3x3 convolutions each followed by ReLU then downsamples - via 2x2 max pool. - """ - - def __init__(self, num_layers, in_channels, out_channels): - super().__init__() - self.convs = nn.ModuleList( - [nn.Sequential( - nn.Conv2d(in_channels if i==0 else out_channels, out_channels, 3, padding=1), - nn.ReLU() - ) - for i in range(num_layers)] - ) - self.downsample = nn.MaxPool2d(kernel_size=2, stride=2) - - def forward(self, x): - for conv in self.convs: - x = conv(x) - x = self.downsample(x) - return x - - -class CNN(nn.Module): - """ - Applies several ConvBlocks each doubling the number of channels, and - halving the feature map size, before taking a global average and classifying. - """ - - def __init__(self, in_channels, num_blocks, num_classes): - super().__init__() - first_channels = 64 - self.blocks = nn.ModuleList( - [ConvBlock( - 2 if i==0 else 3, - in_channels=(in_channels if i == 0 else first_channels*(2**(i-1))), - out_channels=first_channels*(2**i)) - for i in range(num_blocks)] - ) - self.global_pool = nn.AdaptiveAvgPool2d((1, 1)) - self.cls = nn.Linear(first_channels*(2**(num_blocks-1)), num_classes) - - def forward(self, x): - for block in self.blocks: - x = block(x) - x = self.global_pool(x) - x = x.flatten(1) - x = self.cls(x) - return x - - -model = CNN(3, 4, 10) -out = model(torch.zeros(1, 3, 32, 32)) # This will be the final logits over classes - -``` - -Let’s say we want to get the final feature map before global average pooling. We could do the following: - -### Modify the forward method - -```python -def forward(self, x): - for block in self.blocks: - x = block(x) - self.final_feature_map = x - x = self.global_pool(x) - x = x.flatten(1) - x = self.cls(x) - return x -``` - -Or return it directly: - -```python -def forward(self, x): - for block in self.blocks: - x = block(x) - final_feature_map = x - x = self.global_pool(x) - x = x.flatten(1) - x = self.cls(x) - return x, final_feature_map -``` -That looks pretty easy. But there are some downsides here which all stem from the same underlying issue: that is, modifying the source code is not ideal: - -* It’s not always easy to access and change given the practical considerations of a project. -* If we want flexibility (switching feature extraction on or off, or having variations on it), we need to further adapt the source code to support that. -* It’s not always just a question of inserting a single line of code. Think about how you would go about getting the feature map from one of the intermediate blocks with the way I’ve written this module. -* Overall, we’d rather avoid the overhead of maintaining source code for a model, when we actually don’t need to change anything about how it works. - -One can see how this downside can start to get a lot more thorny when dealing with larger, more complicated models, and trying to get at features from within nested submodules. 
- -### Write a new module using the parameters from the original one - -Following on the example from above, say we want to get a feature map from each block. We could write a new module like so: - -```python -class CNNFeatures(nn.Module): - def __init__(self, backbone): - super().__init__() - self.blocks = backbone.blocks - - def forward(self, x): - feature_maps = [] - for block in self.blocks: - x = block(x) - feature_maps.append(x) - return feature_maps - - -backbone = CNN(3, 4, 10) -model = CNNFeatures(backbone) -out = model(torch.zeros(1, 3, 32, 32)) # This is now a list of Tensors, each representing a feature map -``` - -In fact, this is much like the method that TorchVision used internally to make many of its detection models. - -Although this approach solves some of the issues with modifying the source code directly, there are still some major downsides: - -* It’s only really straight-forward to access the outputs of top-level submodules. Dealing with nested submodules rapidly becomes complicated. -* We have to be careful not to miss any important operations in between the input and the output. We introduce potential for errors in transcribing the exact functionality of the original module to the new module. - -Overall, this method and the last both have the complication of tying in feature extraction with the model’s source code itself. Indeed, if we examine the source code for TorchVision models we might suspect that some of the design choices were influenced by the desire to use them in this way for downstream tasks. - -### Use hooks - -Hooks move us away from the paradigm of writing source code, towards one of specifying outputs. Considering our toy CNN example above, and the goal of getting feature maps for each layer, we could use hooks like this: - - -```python -model = CNN(3, 4, 10) -feature_maps = [] # This will be a list of Tensors, each representing a feature map - -def hook_feat_map(mod, inp, out): - feature_maps.append(out) - -for block in model.blocks: - block.register_forward_hook(hook_feat_map) - -out = model(torch.zeros(1, 3, 32, 32)) # This will be the final logits over classes -``` - -Now we have full flexibility in terms of accessing nested submodules, and we free ourselves of the responsibilities of fiddling with the source code. But this approach comes with its own downsides: - -* We can only apply hooks to modules. If we have functional operations (reshape, view, functional non-linearities, etc) for which we want the outputs, hooks won’t work directly on them. -* We have not modified anything about the source code, so the whole forward pass is executed, regardless of the hooks. If we only need to access early features without any need for the final output, this could result in a lot of useless computation. -* Hooks are not TorchScript friendly. - -Here’s a summary of the different methods and their pros/cons: - - -| | Can use source code as is without any modifications or rewriting | Full flexibility in accessing features | Drops unnecessary computational steps | TorchScript friendly | -|-------------------------------------------------------------------|:-----------------------------------------------------------------:|:--------------------------------------------------------------------------------------:|:--------------------------------------:|:--------------------:| -| Modify forward method | NO | Technically yes. Depends on how much code you’re willing to write. So in practice, NO. 
| YES | YES | -|-------------------------------------------------------------------|:-----------------------------------------------------------------:|:--------------------------------------------------------------------------------------:|:--------------------------------------:|:--------------------:| -| New module that reuses submodules / parameters of original module | NO | Technically yes. Depends on how much code you’re willing to write. So in practice, NO. | YES | YES | -|-------------------------------------------------------------------|:-----------------------------------------------------------------:|:--------------------------------------------------------------------------------------:|:--------------------------------------:|:--------------------:| -| Hooks | YES | Mostly YES. Only outputs of submodules | NO | NO | -|-------------------------------------------------------------------|:-----------------------------------------------------------------:|:--------------------------------------------------------------------------------------:|:--------------------------------------:|:--------------------:| - -Table 1: The pros (or cons) of some of the existing methods for feature extraction with PyTorch - -In the next section of this article, let’s see how we can get YES across the board. - - -## FX to The Rescue - -The natural question for some new-starters in Python and coding at this point might be: *“Can’t we just point to a line of code and tell Python or PyTorch that we want the result of that line?”* For those who have spent more time coding, the reason this can’t be done is clear: multiple operations can happen in one line of code, whether they are explicitly written there, or they are implicit as sub-operations. Just take this simple module as an example: - -```python -class MyModule(torch.nn.Module): - def __init__(self): - super().__init__() - self.param = torch.nn.Parameter(torch.rand(3, 4)) - self.submodule = MySubModule() - - def forward(self, x): - return self.submodule(x + self.param).clamp(min=0.0, max=1.0) -``` - -The forward method has a single line of code which we can unravel as: - -1. Add `self.param` to `x` -2. Pass x through self.submodule. Here we would need to consider the steps happening in that submodule. I’m just going to use dummy operation names for illustration: - I. submodule.op_1 - II. submodule.op_2 -3. Apply the clamp operation - -So even if we point at this one line, the question then is: “For which step do we want to extract the output?”. - -[FX](https://pytorch.org/docs/stable/fx.html) is a core PyTorch toolkit that (oversimplifying) does the unravelling I just mentioned. It does something called “symbolic tracing”, which means the Python code is interpreted and stepped through, operation-by-operation, using some dummy proxy for a real input. Introducing some nomenclature, each step as described above is considered a **“node”**, and consecutive nodes are connected to one another to form a **“graph”** (not unlike the common mathematical notion of a graph). Here are the “steps” above translated to this concept of a graph. - -

        - Graphical representation of the result of symbolically tracing our example of a simple forward method. -
        - Figure 3: Graphical representation of the result of symbolically tracing our example of a simple forward method. -
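-
-For readers who want to inspect this graph programmatically, here is a small, hedged sketch using `torch.fx.symbolic_trace` on a self-contained variant of the module above (the internals of `MySubModule` are stand-ins for the dummy op_1 and op_2 steps):
-
-```python
-import torch.fx
-from torch import nn
-
-class MySubModule(nn.Module):
-    def forward(self, x):
-        x = torch.relu(x)  # stand-in for submodule.op_1
-        return x * 2       # stand-in for submodule.op_2
-
-class MyModule(nn.Module):
-    def __init__(self):
-        super().__init__()
-        self.param = nn.Parameter(torch.rand(3, 4))
-        self.submodule = MySubModule()
-
-    def forward(self, x):
-        return self.submodule(x + self.param).clamp(min=0.0, max=1.0)
-
-# Symbolic tracing interprets the forward pass with a proxy input and
-# records one node per operation.
-traced = torch.fx.symbolic_trace(MyModule())
-print(traced.graph)  # placeholder -> get_attr (param) -> add -> relu -> mul -> clamp -> output
-```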

        - -Note that we call this a graph, and not just a set of steps, because it’s possible for the graph to branch off and recombine. Think of the skip connection in a residual block. This would look something like: - -

        - Graphical representation of a residual skip connection. The middle node is like the main branch of a residual block, and the final node represents the sum of the input and output of the main branch. -
        - Figure 4: Graphical representation of a residual skip connection. The middle node is like the main branch of a residual block, and the final node represents the sum of the input and output of the main branch. -

        - -Now, TorchVision’s **[get_graph_node_names](https://pytorch.org/vision/stable/feature_extraction.html#torchvision.models.feature_extraction.get_graph_node_names)** function applies FX as described above, and in the process of doing so, tags each node with a human readable name. Let’s try this with our toy CNN model from the previous section: - -```python -model = CNN(3, 4, 10) -from torchvision.models.feature_extraction import get_graph_node_names -nodes, _ = get_graph_node_names(model) -print(nodes) -``` -which will result in: -```python -['x', 'blocks.0.convs.0.0', 'blocks.0.convs.0.1', 'blocks.0.convs.1.0', 'blocks.0.convs.1.1', 'blocks.0.downsample', 'blocks.1.convs.0.0', 'blocks.1.convs.0.1', 'blocks.1.convs.1.0', 'blocks.1.convs.1.1', 'blocks.1.convs.2.0', 'blocks.1.convs.2.1', 'blocks.1.downsample', 'blocks.2.convs.0.0', 'blocks.2.convs.0.1', 'blocks.2.convs.1.0', 'blocks.2.convs.1.1', 'blocks.2.convs.2.0', 'blocks.2.convs.2.1', 'blocks.2.downsample', 'blocks.3.convs.0.0', 'blocks.3.convs.0.1', 'blocks.3.convs.1.0', 'blocks.3.convs.1.1', 'blocks.3.convs.2.0', 'blocks.3.convs.2.1', 'blocks.3.downsample', 'global_pool', 'flatten', 'cls'] -``` - -We can read these node names as hierarchically organised “addresses” for the operations of interest. For example 'blocks.1.downsample' refers to the MaxPool2d layer in the second `ConvBlock`. - -[`create_feature_extractor`](https://pytorch.org/vision/stable/feature_extraction.html#torchvision.models.feature_extraction.create_feature_extractor), which is where all the magic happens, goes a few steps further than **`get_graph_node_names`**. It takes desired node names as one of the input arguments, and then uses more FX core functionality to: - -1. Assign the desired nodes as outputs. -2. Prune unnecessary downstream nodes and their associated parameters. -3. Translate the resulting graph back into Python code. -4. Return another PyTorch Module to the user. This has the python code from step 3 as the forward method. - -As a demonstration, here’s how we would apply `create_feature_extractor` to get the 4 feature maps from our toy CNN model - -```python -from torchvision.models.feature_extraction import create_feature_extractor -# Confused about the node specification here? -# We are allowed to provide truncated node names, and `create_feature_extractor` -# will choose the last node with that prefix. -feature_extractor = create_feature_extractor( - model, return_nodes=['blocks.0', 'blocks.1', 'blocks.2', 'blocks.3']) -# `out` will be a dict of Tensors, each representing a feature map -out = feature_extractor(torch.zeros(1, 3, 32, 32)) -``` - -It’s as simple as that. When it comes down to it, FX feature extraction is just a way of making it possible to do what some of us would have naively hoped for when we first started programming: *“just give me the output of this code (*points finger at screen)”*. - -- [ ] … does not require us to fiddle with source code. -- [ ] … provides full flexibility in terms of accessing any intermediate transformation of our inputs, whether they are the results of a module or a functional operation -- [ ] … does drop unnecessary computations steps once features have been extracted -- [ ] … and I didn’t mention this before, but it’s also TorchScript friendly! 
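-
-As a quick illustration of that last point (TorchScript friendliness), here is a hedged sketch that reuses the toy `CNN` defined earlier in the post; `blocks.3` is just one example node:
-
-```python
-import torch
-from torchvision.models.feature_extraction import create_feature_extractor
-
-model = CNN(3, 4, 10)
-feature_extractor = create_feature_extractor(model, return_nodes=['blocks.3'])
-
-# The returned module is a regular GraphModule, so it can typically be
-# scripted like any other module and used from TorchScript.
-scripted = torch.jit.script(feature_extractor)
-out = scripted(torch.zeros(1, 3, 32, 32))  # dict containing the 'blocks.3' feature map
-```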
- -Here’s that table again with another row added for FX feature extraction - - -| | Can use source code as is without any modifications or rewriting | Full flexibility in accessing features | Drops unnecessary computational steps | TorchScript friendly | -|-------------------------------------------------------------------|:-----------------------------------------------------------------:|:--------------------------------------------------------------------------------------:|:--------------------------------------:|:--------------------:| -| Modify forward method | NO | Technically yes. Depends on how much code you’re willing to write. So in practice, NO. | YES | YES | -|-------------------------------------------------------------------|:-----------------------------------------------------------------:|:--------------------------------------------------------------------------------------:|:--------------------------------------:|:--------------------:| -| New module that reuses submodules / parameters of original module | NO | Technically yes. Depends on how much code you’re willing to write. So in practice, NO. | YES | YES | -|-------------------------------------------------------------------|:-----------------------------------------------------------------:|:--------------------------------------------------------------------------------------:|:--------------------------------------:|:--------------------:| -| Hooks | YES | Mostly YES. Only outputs of submodules | NO | NO | -|-------------------------------------------------------------------|:-----------------------------------------------------------------:|:--------------------------------------------------------------------------------------:|:--------------------------------------:|:--------------------:| -| FX | YES | YES | YES | YES | -|-------------------------------------------------------------------|:-----------------------------------------------------------------:|:--------------------------------------------------------------------------------------:|:--------------------------------------:|:--------------------:| - -Table 2: A copy of Table 1 with an added row for FX feature extraction. FX feature extraction gets YES across the board! - - -## Current FX Limitations - -Although I would have loved to end the post there, FX does have some of its own limitations which boil down to: - -1. There may be some Python code that isn’t yet handled by FX when it comes to the step of interpretation and translation into a graph. -2. Dynamic control flow can’t be represented in terms of a static graph. - -The easiest thing to do when these problems crop up is to bundle the underlying code into a “leaf node”. Recall the example graph from Figure 3? Conceptually, we may agree that the `submodule` should be treated as a node in itself rather than a set of nodes representing the underlying operations. If we do so, we can redraw the graph as: - -

        - The individual operations within `submodule` (left - within red box) may be consolidated into one node (right - node #2) if we consider the `submodule` as a 'leaf' node. -
        - Figure 5: The individual operations within `submodule` (left - within red box) may be consolidated into one node (right - node #2) if we consider the `submodule` as a "leaf" node. -

        - - -We would want to do so if there is some problematic code within the submodule, but we don’t have any need for extracting any intermediate transformations from within it. In practice, this is easily achievable by providing a keyword argument to create_feature_extractor or get_graph_node_names. - - -```python -model = CNN(3, 4, 10) -nodes, _ = get_graph_node_names(model, tracer_kwargs={'leaf_modules': [ConvBlock]}) -print(nodes) -``` - -for which the output will be: - -```python -['x', 'blocks.0', 'blocks.1', 'blocks.2', 'blocks.3', 'global_pool', 'flatten', 'cls'] -``` - -Notice how, as compared to previously, all the nodes for any given `ConvBlock` are consolidated into a single node. - -We could do something similar with functions. For example, Python’s inbuilt `len` needs to be wrapped and the result should be treated as a leaf node. Here’s how you can do that with core FX functionality: - -```python -torch.fx.wrap('len') - -class MyModule(nn.Module): - def forward(self, x): - x += 1 - len(x) - -model = MyModule() -feature_extractor = create_feature_extractor(model, return_nodes=['add']) -``` - -For functions you define, you may instead use another keyword argument to `create_feature_extractor` (minor detail: here’s[ why you might want to do it this way instead](https://github.com/pytorch/pytorch/issues/62021#issue-950458396)): - - -```python -def myfunc(x): - return len(x) - -class MyModule(nn.Module): - def forward(self, x): - x += 1 - myfunc(x) - -model = MyModule() -feature_extractor = create_feature_extractor( - model, return_nodes=['add'], tracer_kwargs={'autowrap_functions': [myfunc]}) -``` - -Notice that none of the fixes above involved modifying source code. - -Of course, there may be times when the very intermediate transformation one is trying to get access to is within the same forward method or function that is causing problems. Here, we can’t just treat that module or function as a leaf node, because then we can’t access the intermediate transformations within. In these cases, some rewriting of the source code will be needed. Here are some examples (not exhaustive) - -- FX will raise an error when trying to trace through code with an `assert` statement. In this case you may need to remove that assertion or switch it with [`torch._assert`](https://pytorch.org/docs/stable/generated/torch._assert.html) (this is not a public function - so consider it a bandaid and use with caution). -- Symbolically tracing in-place changes to slices of tensors is not supported. You will need to make a new variable for the slice, apply the operation, then reconstruct the original tensor using concatenation or stacking. -- Representing dynamic control flow in a static graph is just not logically possible. See if you can distill the coded logic down to something that is not dynamic - see FX documentation for tips. - -In general, you may consult the FX documentation for more detail on the [limitations of symbolic tracing](https://pytorch.org/docs/stable/fx.html#limitations-of-symbolic-tracing) and the possible workarounds. - -## Conclusion - -We did a quick recap on feature extraction and why one might want to do it. Although there are existing methods for doing feature extraction in PyTorch they all have rather significant shortcomings. We learned how TorchVision’s FX feature extraction utility works and what makes it so versatile compared to the existing methods. 
While there are still some minor kinks to iron out for the new utility, we understand the limitations, and can trade them off against the limitations of other methods depending on our use case. Hopefully by adding this new utility to your PyTorch toolkit, you’re now equipped to handle the vast majority of feature extraction requirements you may come across. - -Happy coding! diff --git a/_posts/2021-11-18-how-to-train-state-of-the-art-models-using-torchvision-latest-primitives.md b/_posts/2021-11-18-how-to-train-state-of-the-art-models-using-torchvision-latest-primitives.md deleted file mode 100644 index b2aeca44ab58..000000000000 --- a/_posts/2021-11-18-how-to-train-state-of-the-art-models-using-torchvision-latest-primitives.md +++ /dev/null @@ -1,416 +0,0 @@ ---- -layout: blog_detail -title: 'How to Train State-Of-The-Art Models Using TorchVision’s Latest Primitives' -author: Vasilis Vryniotis -featured-img: 'assets/images/fx-image2.png' ---- - - - -A few weeks ago, TorchVision v0.11 was released, packed with numerous new primitives, models and training recipe improvements which allowed achieving state-of-the-art (SOTA) results. The project was dubbed “[TorchVision with Batteries Included](https://github.com/pytorch/vision/issues/3911)” and aimed to modernize our library. We wanted to enable researchers to reproduce papers and conduct research more easily by using common building blocks. Moreover, we aspired to provide the necessary tools to Applied ML practitioners to train their models on their own data using the same SOTA techniques as in research. Finally, we wanted to refresh our pre-trained weights and offer better off-the-shelf models to our users, hoping that they would build better applications. - -Though there is still much work to be done, we wanted to share with you some exciting results from the above work. We will showcase how one can use the new tools included in TorchVision to achieve state-of-the-art results on a highly competitive and well-studied architecture such as ResNet50 [[1]](https://arxiv.org/abs/1512.03385). We will share the exact recipe used to improve our baseline by over 4.7 accuracy points to reach a final top-1 accuracy of 80.9% and share the journey for deriving the new training process. Moreover, we will show that this recipe generalizes well to other model variants and families. We hope that the above will influence future research for developing stronger generalizable training methodologies and will inspire the community to adopt and contribute to our efforts. - -## The Results - -Using our new training recipe found for ResNet50, we’ve refreshed the pre-trained weights of the following models: - - -| Model | Accuracy@1 | Accuracy@5| -|----------|:--------:|:----------:| -| ResNet50 | 80.858 | 95.434| -|----------|:--------:|:----------:| -| ResNet101 | 81.886 | 95.780| -|----------|:--------:|:----------:| -| ResNet152 | 82.284 | 96.002| -|----------|:--------:|:----------:| -| ResNeXt50-32x4d | 81.198 | 95.340| - -Note that the accuracy of all models except ResNet50 can be further improved by adjusting their training parameters slightly, but our focus was to have a single robust recipe which performs well for all. - -**UPDATE:** We have refreshed the majority of popular classification models of TorchVision; you can find the details in this [blog post](https://pytorch.org/blog/introducing-torchvision-new-multi-weight-support-api/). - -There are currently two ways to use the latest weights of the model. 
- -## Using the Multi-pretrained weight API - -We are currently working on a new prototype mechanism which will extend the model builder methods of TorchVision to [support multiple weights](https://github.com/pytorch/vision/issues/4611). Along with the weights, we store useful [meta-data](https://github.com/pytorch/vision/blob/c5fb79f8fad60511c89957c4970cc2a5cfc8432e/torchvision/prototype/models/resnet.py#L94-L103) (such as the labels, the accuracy, links to recipe etc) and the preprocessing transforms necessary for using the models. Example: - -```python - from PIL import Image - from torchvision import prototype as P - img = Image.open("test/assets/encode_jpeg/grace_hopper_517x606.jpg") -   - # Initialize model - weights = P.models.ResNet50_Weights.IMAGENET1K_V2 - model = P.models.resnet50(weights=weights) - model.eval() - - # Initialize inference transforms - preprocess = weights.transforms() -   - # Apply inference preprocessing transforms - batch = preprocess(img).unsqueeze(0) - prediction = model(batch).squeeze(0).softmax(0) -   - # Make predictions - label = prediction.argmax().item() - score = prediction[label].item() -   - # Use meta to get the labels - category_name = weights.meta['categories'][label] - print(f"{category_name}: {100 * score}%") -``` - -## Using the legacy API - -Those who don’t want to use a prototype API have the option of accessing the new weights via the legacy API using the following approach: - -```python - from torchvision.models import resnet -   - # Overwrite the URL of the previous weights - resnet.model_urls["resnet50"] = "https://download.pytorch.org/models/resnet50-11ad3fa6.pth" -   - # Initialize the model using the legacy API - model = resnet.resnet50(pretrained=True) -   - # TODO: Apply preprocessing + call the model - # ... -``` - -## The Training Recipe - -Our goal was to use the newly introduced primitives of TorchVision to derive a new strong training recipe which achieves state-of-the-art results for the vanilla ResNet50 architecture when trained from scratch on ImageNet with no additional external data. Though by using architecture specific tricks [[2]](https://arxiv.org/abs/1812.01187) one could further improve the accuracy, we’ve decided not to include them so that the recipe can be used in other architectures. Our recipe heavily focuses on simplicity and builds upon work by FAIR [[3]](https://arxiv.org/abs/2103.06877), [[4]](https://arxiv.org/abs/2106.14881), [[5]](https://arxiv.org/abs/1906.06423), [[6]](https://arxiv.org/abs/2012.12877), [[7]](https://arxiv.org/abs/2110.00476). Our findings align with the parallel study of Wightman et al. [[7]](https://arxiv.org/abs/2110.00476), who also report major accuracy improvements by focusing on the training recipes. 
- -Without further ado, here are the main parameters of our recipe: - -```python - # Optimizer & LR scheme - ngpus=8, - batch_size=128,  # per GPU - - epochs=600, - opt='sgd',   - momentum=0.9, - - lr=0.5, - lr_scheduler='cosineannealinglr', - lr_warmup_epochs=5, - lr_warmup_method='linear', - lr_warmup_decay=0.01, - - - # Regularization and Augmentation - weight_decay=2e-05, - norm_weight_decay=0.0, - - label_smoothing=0.1, - mixup_alpha=0.2, - cutmix_alpha=1.0, - auto_augment='ta_wide', - random_erase=0.1, - - ra_sampler=True, - ra_reps=4, - - - # EMA configuration - model_ema=True, - model_ema_steps=32, - model_ema_decay=0.99998, - - - # Resizing - interpolation='bilinear', - val_resize_size=232, - val_crop_size=224, - train_crop_size=176, -``` - -Using our standard [training reference script](https://github.com/pytorch/vision/tree/main/references/classification), we can train a ResNet50 using the following command: - -``` -torchrun --nproc_per_node=8 train.py --model resnet50 --batch-size 128 --lr 0.5 \ ---lr-scheduler cosineannealinglr --lr-warmup-epochs 5 --lr-warmup-method linear \ ---auto-augment ta_wide --epochs 600 --random-erase 0.1 --weight-decay 0.00002 \ ---norm-weight-decay 0.0 --label-smoothing 0.1 --mixup-alpha 0.2 --cutmix-alpha 1.0 \ ---train-crop-size 176 --model-ema --val-resize-size 232 --ra-sampler --ra-reps 4 -``` - -## Methodology - -There are a few principles we kept in mind during our explorations: - -1. Training is a stochastic process and the validation metric we try to optimize is a random variable. This is due to the random weight initialization scheme employed and the existence of random effects during the training process. This means that we can’t do a single run to assess the effect of a recipe change. The standard practice is doing multiple runs (usually 3 to 5) and studying the summarization stats (such as mean, std, median, max, etc). -2. There is usually a significant interaction between different parameters, especially for techniques that focus on Regularization and reducing overfitting. Thus changing the value of one can have effects on the optimal configurations of others. To account for that one can either adopt a greedy search approach (which often leads to suboptimal results but tractable experiments) or apply grid search (which leads to better results but is computationally expensive). In this work, we used a mixture of both. -3. Techniques that are non-deterministic or introduce noise usually require longer training cycles to improve model performance. To keep things tractable, we initially used short training cycles (small number of epochs) to decide which paths can be eliminated early and which should be explored using longer training. -4. There is a risk of overfitting the validation dataset [[8]](https://arxiv.org/abs/1902.10811) because of the repeated experiments. To mitigate some of the risk, we apply only training optimizations that provide a significant accuracy improvements and use K-fold cross validation to verify optimizations done on the validation set. Moreover we confirm that our recipe ingredients generalize well on other models for which we didn’t optimize the hyper-parameters. - -## Break down of key accuracy improvements - -As discussed in [earlier blogposts](https://pytorch.org/blog/torchvision-ssdlite-implementation/#break-down-of-key-accuracy-improvements), training models is not a journey of monotonically increasing accuracies and the process involves a lot of backtracking. 
To quantify the effect of each optimization, below we attempt to showcase an idealized linear journey of deriving the final recipe, starting from the original recipe of TorchVision. We would like to clarify that this is an oversimplification of the actual path we followed and thus it should be taken with a grain of salt. - -

        -Cumulative Accuracy Improvements for ResNet50 -

        - -In the table below, we provide a summary of the performance of stacked incremental improvements on top of Baseline. Unless denoted otherwise, we report the model with best Acc@1 out of 3 runs: - - -| | Accuracy@1 | Accuracy@5| Incremental Diff|Absolute Diff| -|----------|:--------:|:----------:|:---------|:--------:| -| ResNet50 Baseline |76.130 | 92.862| 0.000|0.000| -|----------|:--------:|:----------:|:---------|:--------:| -| + LR optimizations | 76.494 |93.198| 0.364|0.364 -|----------|:--------:|:----------:|:---------|:--------:| -| + TrivialAugment | 76.806| 93.272|0.312| 0.676| -|----------|:--------:|:----------:|:---------|:--------:| -| + Long Training | 78.606| 94.052| 1.800|2.476| -|----------|:--------:|:----------:|:---------|:--------:| -| + Random Erasing | 78.796 | 94.094|0.190|2.666 -|----------|:--------:|:----------:|:---------|:--------:| -| + Label Smoothing |79.114| 94.374| 0.318|2.984| -|----------|:--------:|:----------:|:---------|:--------:| -| + Mixup | 79.232| 94.536| 0.118|3.102| -|----------|:--------:|:----------:|:---------|:--------:| -| + Cutmix |79.510| 94.642| 0.278|3.380| -|----------|:--------:|:----------:|:---------|:--------:| -| + Weight Decay tuning |80.036|94.746| 0.526|3.906| -|----------|:--------:|:----------:|:---------|:--------:| -| + FixRes mitigations |80.196|94.672| 0.160|4.066| -|----------|:--------:|:----------:|:---------|:--------:| -|+ EMA |80.450|94.908| 0.254|4.320| -|----------|:--------:|:----------:|:---------|:--------:| -| + Inference Resize tuning * |80.674|95.166| 0.224|4.544| -|----------|:--------:|:----------:|:---------|:--------:| -| + Repeated Augmentation ** |80.858|95.434| 0.184|4.728| - -*The tuning of the inference size was done on top of the last model. See below for details. - -** Community contribution done after the release of the article. See below for details. - -## Baseline - -Our baseline is the previously released ResNet50 model of TorchVision. It was trained with the following recipe: - -```python - # Optimizer & LR scheme - ngpus=8, - batch_size=32,  # per GPU - - epochs=90, - opt='sgd',   - momentum=0.9, - - lr=0.1, - lr_scheduler='steplr', - lr_step_size=30, - lr_gamma=0.1, - - - # Regularization - weight_decay=1e-4, - - - # Resizing - interpolation='bilinear', - val_resize_size=256, - val_crop_size=224, - train_crop_size=224, -``` - -Most of the above parameters are the defaults on our [training scripts](https://github.com/pytorch/vision/tree/main/references/classification). We will start building on top of this baseline by introducing optimizations until we gradually arrive at the final recipe. - -## LR optimizations - -There are a few parameter updates we can apply to improve both the accuracy and the speed of our training. This can be achieved by increasing the batch size and tuning the LR. Another common method is to apply warmup and gradually increase our learning rate. This is beneficial especially when we use very high learning rates and helps with the stability of the training in the early epochs. Finally, another optimization is to apply Cosine Schedule to adjust our LR during the epochs. A big advantage of cosine is that there are no hyper-parameters to optimize, which cuts down our search space. - -Here are the additional optimizations applied on top of the baseline recipe. 
Note that we’ve run multiple experiments to determine the optimal configuration of the parameters: - -```python - batch_size=128,  # per GPU - - lr=0.5, - lr_scheduler='cosineannealinglr', - lr_warmup_epochs=5, - lr_warmup_method='linear', - lr_warmup_decay=0.01, -``` - -The above optimizations increase our top-1 Accuracy by 0.364 points comparing to the baseline. Note that in order to combine the different LR strategies we use the newly introduced [SequentialLR](https://pytorch.org/docs/stable/generated/torch.optim.lr_scheduler.SequentialLR.html#torch.optim.lr_scheduler.SequentialLR) scheduler. - -## TrivialAugment - -The original model was trained using basic augmentation transforms such as Random resized crops and horizontal flips. An easy way to improve our accuracy is to apply more complex “Automatic-Augmentation” techniques. The one that performed best for us is TrivialAugment [[9]](https://arxiv.org/abs/2103.10158), which is extremely simple and can be considered “parameter free”, which means it can help us cut down our search space further. - -Here is the update applied on top of the previous step: - -``` -auto_augment='ta_wide', -``` - -The use of TrivialAugment increased our top-1 Accuracy by 0.312 points compared to the previous step. - -## Long Training - -Longer training cycles are beneficial when our recipe contains ingredients that behave randomly. More specifically as we start adding more and more techniques that introduce noise, increasing the number of epochs becomes crucial. Note that at early stages of our exploration, we used relatively short cycles of roughly 200 epochs which was later increased to 400 as we started narrowing down most of the parameters and finally increased to 600 epochs at the final versions of the recipe. - -Below we see the update applied on top of the earlier steps: - -``` -epochs=600, -``` - -This further increases our top-1 Accuracy by 1.8 points on top of the previous step. This is the biggest increase we will observe in this iterative process. It’s worth noting that the effect of this single optimization is overstated and somehow misleading. Just increasing the number of epochs on top of the old baseline won’t yield such significant improvements. Nevertheless the combination of the LR optimizations with strong Augmentation strategies helps the model benefit from longer cycles. It’s also worth mentioning that the reason we introduce the lengthy training cycles so early in the process is because in the next steps we will introduce techniques that require significantly more epochs to provide good results. - -## Random Erasing - -Another data augmentation technique known to help the classification accuracy is Random Erasing [[10]](https://arxiv.org/abs/1708.04896), [[11]](https://arxiv.org/abs/1708.04552). Often paired with Automatic Augmentation methods, it usually yields additional improvements in accuracy due to its regularization effect. In our experiments we tuned only the probability of applying the method via a grid search and found that it’s beneficial to keep its probability at low levels, typically around 10%.  - -Here is the extra parameter introduced on top of the previous: - -``` -random_erase=0.1, -``` - -Applying Random Erasing increases our Acc@1 by further 0.190 points. - -## Label Smoothing - -A good technique to reduce overfitting is to stop the model from becoming overconfident. 
This can be achieved by softening the ground truth using Label Smoothing [[12]](https://www.cv-foundation.org/openaccess/content_cvpr_2016/papers/Szegedy_Rethinking_the_Inception_CVPR_2016_paper.pdf). There is a single parameter which controls the degree of smoothing (the higher the stronger) that we need to specify. Though optimizing it via grid search is possible, we found that values around 0.05-0.15 yield similar results, so to avoid overfitting it we used the same value as on the paper that introduced it. - -Below we can find the extra config added on this step: - -``` -label_smoothing=0.1, -``` - -We use PyTorch’s newly introduced [CrossEntropyLoss](https://pytorch.org/docs/stable/generated/torch.nn.CrossEntropyLoss.html?highlight=label_smoothing) label_smoothing parameter and that increases our accuracy by an additional 0.318 points. - -## Mixup and Cutmix - -Two data augmentation techniques often used to produce SOTA results are Mixup and Cutmix [[13]](https://arxiv.org/abs/1710.09412), [[14]](https://arxiv.org/abs/1905.04899). They both provide strong regularization effects by softening not only the labels but also the images. In our setup we found it beneficial to apply one of them randomly with equal probability. Each is parameterized with a hyperparameter alpha, which controls the shape of the Beta distribution from which the smoothing probability is sampled. We did a very limited grid search, focusing primarily on common values proposed on the papers.  - -Below you will find the optimal values for the alpha parameters of the two techniques: - -``` -mixup_alpha=0.2, -cutmix_alpha=1.0, -``` - -Applying mixup increases our accuracy by 0.118 points and combining it with cutmix improves it by additional 0.278 points. - -## Weight Decay tuning - -Our standard recipe uses L2 regularization to reduce overfitting. The Weight Decay parameter controls the degree of the regularization (the larger the stronger) and is applied universally to all learned parameters of the model by default. In this recipe, we apply two optimizations to the standard approach. First we perform grid search to tune the parameter of weight decay and second we disable weight decay for the parameters of the normalization layers.  - -Below you can find the optimal configuration of weight decay for our recipe: - -``` -weight_decay=2e-05, -norm_weight_decay=0.0, -``` - -The above update improves our accuracy by a further 0.526 points, providing additional experimental evidence for a known fact that tuning weight decay has significant effects on the performance of the model. Our approach for separating the Normalization parameters from the rest was inspired by [ClassyVision’s](https://github.com/facebookresearch/ClassyVision) approach. - -## FixRes mitigations - -An important property identified early in our experiments is the fact that the models performed significantly better if the resolution used during validation was increased from the 224x224 of training. This effect is studied in detail on the FixRes paper [[5]](https://arxiv.org/abs/1906.06423) and two mitigations are proposed: a) one could try to reduce the training resolution so that the accuracy on the validation resolution is maximized or b) one could fine-tune the model on a two-phase training so that it adjusts on the target resolution. Since we didn’t want to introduce a 2-phase training, we went for option a). 
This means that we reduced the train crop size from 224 and used grid search to find the one that maximizes the validation on resolution of 224x224. - -Below you can see the optimal value used on our recipe: - -``` -val_crop_size=224, -train_crop_size=176, -``` - -The above optimization improved our accuracy by an additional 0.160 points and sped up our training by 10%.  - -It’s worth noting that the FixRes effect still persists, meaning that the model continues to perform better on validation when we increase the resolution. Moreover, further reducing the training crop-size actually hurts the accuracy. This intuitively makes sense because one can only reduce the resolution so much before critical details start disappearing from the picture. Finally, we should note that the above FixRes mitigation seems to benefit models with similar depth to ResNet50. Deeper variants with larger receptive fields seem to be slightly negatively affected (typically by 0.1-0.2 points). Hence we consider this part of the recipe optional. Below we visualize the performance of the best available checkpoints (with the full recipe) for models trained with 176 and 224 resolution: - -
Figure: Best ResNet50 trained with 176 Resolution vs. Best ResNet50 trained with 224 Resolution.
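To make the effect of this mitigation concrete, the train/validation preprocessing at this step boils down to cropping at 176 during training while still evaluating with a 256 resize and a 224 center crop. The snippet below is a simplified sketch for illustration only (the actual reference scripts also apply TrivialAugment, Random Erasing and the other ingredients above; the horizontal flip and normalization constants shown here are assumed defaults):

```python
from torchvision import transforms

# Training pipeline: random 176x176 crops (train_crop_size=176)
train_preprocess = transforms.Compose([
    transforms.RandomResizedCrop(176),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
])

# Validation pipeline: resize to 256, then a central 224x224 crop
# (val_resize_size=256, val_crop_size=224 at this stage of the recipe)
val_preprocess = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
])
```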
        - -## Exponential Moving Average (EMA) - -EMA is a technique that allows one to push the accuracy of a model without increasing its complexity or inference time. It performs an exponential moving average of the model weights, which leads to increased accuracy and more stable models. The averaging happens every few iterations and its decay parameter was tuned via grid search.  - -Below you can see the optimal values for our recipe: - -``` -model_ema=True, -model_ema_steps=32, -model_ema_decay=0.99998, -``` - -The use of EMA increases our accuracy by 0.254 points compared to the previous step. Note that TorchVision’s [EMA implementation](https://github.com/pytorch/vision/pull/4406) is built on top of PyTorch’s [AveragedModel](https://pytorch.org/docs/stable/optim.html#stochastic-weight-averaging) class, with the key difference being that it averages not only the model parameters but also their buffers. Moreover, we have adopted tricks from [Pycls](https://github.com/facebookresearch/pycls/tree/main/pycls) which allow us to parameterize the decay in a way that doesn’t depend on the number of epochs. - -## Inference Resize tuning - -Unlike all other steps of the process, which involved training models with different parameters, this optimization was done on top of the final model. During inference, the image is resized to a specific resolution and then a central 224x224 crop is taken from it. The original recipe used a resize size of 256, which caused a similar discrepancy to the one described in the FixRes paper [[5]](https://arxiv.org/abs/1906.06423). By bringing this resize value closer to the target inference resolution, one can improve the accuracy. To select the value we ran a short grid search over the interval [224, 256] with a step of 8. To avoid overfitting, the value was selected using half of the validation set and confirmed using the other half. - -Below you can see the optimal value used in our recipe: - -``` -val_resize_size=232, -``` - -The above optimization improved our accuracy by 0.224 points. It’s worth noting that the optimal value for ResNet50 also works best for ResNet101, ResNet152 and ResNeXt50, which hints that it generalizes across models:
Figure: ResNet50 Inference Resize; ResNet101 Inference Resize; Best ResNet50 trained with 224 Resolution.
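Circling back to the EMA step for a moment: since TorchVision’s implementation is built on top of PyTorch’s `AveragedModel`, the core idea can be sketched as below. This is a simplified illustration only, not TorchVision’s actual code (which, as noted above, also averages the buffers and parameterizes the decay so it doesn’t depend on the number of epochs):

```python
import torch
from torch.optim.swa_utils import AveragedModel

decay = 0.99998

# new_average = decay * old_average + (1 - decay) * current_parameter
def ema_avg(avg_param, param, num_averaged):
    return decay * avg_param + (1.0 - decay) * param

model = torch.nn.Linear(16, 16)
ema_model = AveragedModel(model, avg_fn=ema_avg)

# Inside the training loop, update the averaged weights periodically,
# e.g. every model_ema_steps=32 iterations:
# if step % 32 == 0:
#     ema_model.update_parameters(model)
```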
        - -## [UPDATE] Repeated Augmentation - -Repeated Augmentation [[15]](https://arxiv.org/abs/1901.09335), [[16]](https://arxiv.org/abs/1902.05509) is another technique which can improve the overall accuracy and has been used by other strong recipes such as those at [[6]](https://arxiv.org/abs/2012.12877), [[7]](https://arxiv.org/abs/2110.00476). Tal Ben-Nun, a community contributor, has [further improved](https://github.com/pytorch/vision/pull/5201) upon our original recipe by proposing training the model with 4 repetitions. His contribution came after the release of this article. - -Below you can see the optimal value used on our recipe: - -``` -ra_sampler=True, -ra_reps=4, -``` - -The above is the final optimization which improved our accuracy by 0.184 points.  - -## Optimizations that were tested but not adopted - -During the early stages of our research, we experimented with additional techniques, configurations and optimizations. Since our target was to keep our recipe as simple as possible, we decided not to include anything that didn’t provide a significant improvement. Here are a few approaches that we took but didn’t make it to our final recipe: - -- **Optimizers:** Using more complex optimizers such as Adam, RMSProp or SGD with Nesterov momentum didn’t provide significantly better results than vanilla SGD with momentum. -- **LR Schedulers:** We tried different LR Scheduler schemes such as StepLR and Exponential. Though the latter tends to work better with EMA, it often requires additional hyper-parameters such as defining the minimum LR to work well. Instead, we just use cosine annealing decaying the LR up to zero and choose the checkpoint with the highest accuracy. -- **Automatic Augmentations:** We’ve tried different augmentation strategies such as AutoAugment and RandAugment. None of these outperformed the simpler parameter-free TrivialAugment. -- **Interpolation:** Using bicubic or nearest interpolation didn’t provide significantly better results than bilinear. -- **Normalization layers:** Using Sync Batch Norm didn’t yield significantly better results than using the regular Batch Norm. - -## Acknowledgements - -We would like to thank Piotr Dollar, Mannat Singh and Hugo Touvron for providing their insights and feedback during the development of the recipe and for their previous research work on which our recipe is based on. Their support was invaluable for achieving the above result. Moreover, we would like to thank Prabhat Roy, Kai Zhang, Yiwen Song, Joel Schlosser, Ilqar Ramazanli, Francisco Massa, Mannat Singh, Xiaoliang Dai, Samuel Gabriel, Allen Goodman and Tal Ben-Nun for their contributions to the Batteries Included project. - -## References - -1. Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun. “Deep Residual Learning for Image Recognition”. -2. Tong He, Zhi Zhang, Hang Zhang, Zhongyue Zhang, Junyuan Xie, Mu Li. “Bag of Tricks for Image Classification with Convolutional Neural Networks” -3. Piotr Dollár, Mannat Singh, Ross Girshick. “Fast and Accurate Model Scaling” -4. Tete Xiao, Mannat Singh, Eric Mintun, Trevor Darrell, Piotr Dollár, Ross Girshick. “Early Convolutions Help Transformers See Better” -5. Hugo Touvron, Andrea Vedaldi, Matthijs Douze, Hervé Jégou. “Fixing the train-test resolution discrepancy -6. Hugo Touvron, Matthieu Cord, Matthijs Douze, Francisco Massa, Alexandre Sablayrolles, Hervé Jégou. “Training data-efficient image transformers & distillation through attention” -7. Ross Wightman, Hugo Touvron, Hervé Jégou. 
“ResNet strikes back: An improved training procedure in timm” -8. Benjamin Recht, Rebecca Roelofs, Ludwig Schmidt, Vaishaal Shankar. “Do ImageNet Classifiers Generalize to ImageNet?” -9. Samuel G. Müller, Frank Hutter. “TrivialAugment: Tuning-free Yet State-of-the-Art Data Augmentation” -10. Zhun Zhong, Liang Zheng, Guoliang Kang, Shaozi Li, Yi Yang. “Random Erasing Data Augmentation” -11. Terrance DeVries, Graham W. Taylor. “Improved Regularization of Convolutional Neural Networks with Cutout” -12. Christian Szegedy, Vincent Vanhoucke, Sergey Ioffe, Jon Shlens, Zbigniew Wojna. “Rethinking the Inception Architecture for Computer Vision” -13. Hongyi Zhang, Moustapha Cisse, Yann N. Dauphin, David Lopez-Paz. “mixup: Beyond Empirical Risk Minimization” -14. Sangdoo Yun, Dongyoon Han, Seong Joon Oh, Sanghyuk Chun, Junsuk Choe, Youngjoon Yoo. “CutMix: Regularization Strategy to Train Strong Classifiers with Localizable Features” -15. Elad Hoffer, Tal Ben-Nun, Itay Hubara, Niv Giladi, Torsten Hoefler, Daniel Soudry. “Augment your batch: better training with larger batches” -16. Maxim Berman, Hervé Jégou, Andrea Vedaldi, Iasonas Kokkinos, Matthijs Douze. “Multigrain: a unified image embedding for classes and instances” diff --git a/_posts/2021-12-15-tensor-memory-format-matters.md b/_posts/2021-12-15-tensor-memory-format-matters.md deleted file mode 100644 index 047a008081fc..000000000000 --- a/_posts/2021-12-15-tensor-memory-format-matters.md +++ /dev/null @@ -1,335 +0,0 @@ ---- -layout: blog_detail -title: 'Efficient PyTorch: Tensor Memory Format Matters' -author: 'Dhruv Matani, Suraj Subramanian' -featured-img: '' ---- - -Ensuring the right memory format for your inputs can significantly impact the running time of your PyTorch vision models. When in doubt, choose a Channels Last memory format. - -When dealing with vision models in PyTorch that accept multimedia (for example image Tensorts) as input, the Tensor’s memory format can significantly impact **the inference execution speed of your model on mobile platforms when using the CPU backend along with XNNPACK**. This holds true for training and inference on server platforms as well, but latency is particularly critical for mobile devices and users. - - - -## Outline of this article -1. Deep Dive into matrix storage/memory representation in C++. Introduction to [Row and Column major order](https://en.wikipedia.org/wiki/Row-_and_column-major_order). -2. Impact of looping over a matrix in the same or different order as the storage representation, along with an example. -3. Introduction to Cachegrind; a tool to inspect the cache friendliness of your code. -4. Memory formats supported by PyTorch Operators. -5. Best practices example to ensure efficient model execution with XNNPACK optimizations - -## Matrix Storage Representation in C++ - -Images are fed into PyTorch ML models as multi-dimensional Tensors. These Tensors have specific memory formats. To understand this concept better, let’s take a look at how a 2-d matrix may be stored in memory. - -Broadly speaking, there are 2 main ways of efficiently storing multi-dimensional data in memory. -1. **Row Major Order:** In this format, the matrix is stored in row order, with each row stored before the next row in memory. I.e. row N comes before row N+1. -2. **Column Major Order:** In this format, the matrix is stored in column-order, with each column stored before the next column in memory. I.e. column N comes before column N+1. - -You can see the differences graphically below. - -

Figure: C++ stores multi-dimensional data in row-major format.
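To make the two layouts concrete, here is a small Python sketch (added purely for illustration; the post’s own example below uses C++) showing how the same matrix flattens under each order and the corresponding index formulas:

```python
# Row-major order: element (i, j) of an R x C matrix lives at flat index i*C + j.
# Column-major order: the same element lives at flat index j*R + i.
R, C = 3, 4
matrix = [[10 * i + j for j in range(C)] for i in range(R)]

# Flatten in row-major order (row N comes before row N+1):
flat_row_major = [matrix[i][j] for i in range(R) for j in range(C)]
assert flat_row_major[1 * C + 2] == matrix[1][2]

# Flatten in column-major order (column N comes before column N+1):
flat_col_major = [matrix[i][j] for j in range(C) for i in range(R)]
assert flat_col_major[2 * R + 1] == matrix[1][2]
```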

        - -## Efficiently accessing elements of a 2d matrix - -Similar to the storage format, there are 2 ways to access data in a 2d matrix. - -1. **Loop Over Rows first:** All elements of a row are processed before any element of the next row. -2. **Loop Over Columns first:** All elements of a column are processed before any element of the next column. - -For maximum efficiency, one should always access data in the same format in which it is stored. I.e. if the data is stored in row-major order, then one should try to access it in that order. - -The code below (main.cpp) shows [2 ways](https://stackoverflow.com/questions/9936132/why-does-the-order-of-the-loops-affect-performance-when-iterating-over-a-2d-arra) of accessing all the elements of a 2d 4000x4000 matrix. - -```python -#include -#include - -// loop1 accesses data in matrix 'a' in row major order, -// since i is the outer loop variable, and j is the -// inner loop variable. -int loop1(int a[4000][4000]) { - int s = 0; - for (int i = 0; i < 4000; ++i) { - for (int j = 0; j < 4000; ++j) { - s += a[i][j]; - } - } - return s; -} - -// loop2 accesses data in matrix 'a' in column major order -// since j is the outer loop variable, and i is the -// inner loop variable. -int loop2(int a[4000][4000]) { - int s = 0; - for (int j = 0; j < 4000; ++j) { - for (int i = 0; i < 4000; ++i) { - s += a[i][j]; - } - } - return s; -} - -int main() { - static int a[4000][4000] = {0}; - for (int i = 0; i < 100; ++i) { - int x = rand() % 4000; - int y = rand() % 4000; - a[x][y] = rand() % 1000; - } - - auto start = std::chrono::high_resolution_clock::now(); - auto end = start; - int s = 0; - -#if defined RUN_LOOP1 - start = std::chrono::high_resolution_clock::now(); - - s = 0; - for (int i = 0; i < 10; ++i) { - s += loop1(a); - s = s % 100; - } - end = std::chrono::high_resolution_clock::now(); - - std::cout << "s = " << s << std::endl; - std::cout << "Time for loop1: " - << std::chrono::duration(end - start).count() - << "ms" << std::endl; -#endif - -#if defined RUN_LOOP2 - start = std::chrono::high_resolution_clock::now(); - s = 0; - for (int i = 0; i < 10; ++i) { - s += loop2(a); - s = s % 100; - } - end = std::chrono::high_resolution_clock::now(); - - std::cout << "s = " << s << std::endl; - std::cout << "Time for loop2: " - << std::chrono::duration(end - start).count() - << "ms" << std::endl; -#endif -} - - -Let’s build and run this program and see what it prints. - -g++ -O2 main.cpp -DRUN_LOOP1 -DRUN_LOOP2 -./a.out - - -Prints the following: - -s = 70 -Time for loop1: 77.0687ms -s = 70 -Time for loop2: 1219.49ms -``` - -loop1() is **15x faster** than loop2(). Why is that? Let’s find out below! - -## Measure cache misses using Cachegrind - -[Cachegrind](https://courses.cs.washington.edu/courses/cse326/05wi/valgrind-doc/cg_main.html) is a cache profiling tool used to see how many I1 (first level instruction), D1 (first level data), and LL (last level) cache misses your program caused. - -Let’s build our program with just loop1() and just loop2() to see how cache friendly each of these functions is. 
- -### Build and run/profile just loop1() - -```python -g++ -O2 main.cpp -DRUN_LOOP1 -valgrind --tool=cachegrind ./a.out -``` - -#### Prints: - -```python -==3299700== -==3299700== I refs: 643,156,721 -==3299700== I1 misses: 2,077 -==3299700== LLi misses: 2,021 -==3299700== I1 miss rate: 0.00% -==3299700== LLi miss rate: 0.00% -==3299700== -==3299700== D refs: 160,952,192 (160,695,444 rd + 256,748 wr) -==3299700== D1 misses: 10,021,300 ( 10,018,723 rd + 2,577 wr) -==3299700== LLd misses: 10,010,916 ( 10,009,147 rd + 1,769 wr) -==3299700== D1 miss rate: 6.2% ( 6.2% + 1.0% ) -==3299700== LLd miss rate: 6.2% ( 6.2% + 0.7% ) -==3299700== -==3299700== LL refs: 10,023,377 ( 10,020,800 rd + 2,577 wr) -==3299700== LL misses: 10,012,937 ( 10,011,168 rd + 1,769 wr) -==3299700== LL miss rate: 1.2% ( 1.2% + 0.7% ) -``` - -### Build and run/profile just loop2() - - -```python -g++ -O2 main.cpp -DRUN_LOOP2 -valgrind --tool=cachegrind ./a.out -``` - -#### Prints: - -```python -==3300389== -==3300389== I refs: 643,156,726 -==3300389== I1 misses: 2,075 -==3300389== LLi misses: 2,018 -==3300389== I1 miss rate: 0.00% -==3300389== LLi miss rate: 0.00% -==3300389== -==3300389== D refs: 160,952,196 (160,695,447 rd + 256,749 wr) -==3300389== D1 misses: 160,021,290 (160,018,713 rd + 2,577 wr) -==3300389== LLd misses: 10,014,907 ( 10,013,138 rd + 1,769 wr) -==3300389== D1 miss rate: 99.4% ( 99.6% + 1.0% ) -==3300389== LLd miss rate: 6.2% ( 6.2% + 0.7% ) -==3300389== -==3300389== LL refs: 160,023,365 (160,020,788 rd + 2,577 wr) -==3300389== LL misses: 10,016,925 ( 10,015,156 rd + 1,769 wr) -==3300389== LL miss rate: 1.2% ( 1.2% + 0.7% ) -``` - -The main differences between the 2 runs are: -1. **D1 misses:** 10M v/s 160M -2. **D1 miss rate:** 6.2% v/s 99.4% - -As you can see, `loop2()` causes many many more (**~16x more**) L1 data cache misses than loop1(). This is why `loop1()` is ~15x faster than loop2(). - -## Memory Formats supported by PyTorch Operators - -While PyTorch operators expect all tensors to be in [Channels First (NCHW) dimension format](https://discuss.pytorch.org/t/why-does-pytorch-prefer-using-nchw/83637/4), PyTorch operators support 3 output [memory formats](https://github.com/pytorch/pytorch/blob/master/c10/core/MemoryFormat.h). - -1. **Contiguous:** Tensor memory is in the same order as the tensor’s dimensions. -2. **ChannelsLast:** Irrespective of the dimension order, the 2d (image) tensor is laid out as an HWC or [NHWC](https://oneapi-src.github.io/oneDNN/dev_guide_understanding_memory_formats.html) (N: batch, H: height, W: width, C: channels) tensor in memory. The dimensions could be permuted in any order. -3. **ChannelsLast3d:** For 3d tensors (video tensors), the memory is laid out in THWC (Time, Height, Width, Channels) or NTHWC (N: batch, T: time, H: height, W: width, C: channels) format. The dimensions could be permuted in any order. - -The reason that ChannelsLast is preferred for vision models is because [XNNPACK](https://github.com/google/XNNPACK) (kernel acceleration library) used by PyTorch expects all inputs to be in **Channels Last** format, so if the input to the model isn’t channels last, then it must first be converted to channels last, which is an additional operation. - -Additionally, most PyTorch operators preserve the input tensor’s memory format, so if the input is Channels First, then the operator needs to first convert to Channels Last, then perform the operation, and then convert back to Channels First. 
- -When you combine it with the fact that accelerated operators work better with a channels last memory format, you’ll notice that having the operator return back a channels-last memory format is better for subsequent operator calls or you’ll end up having every operator convert to channels-last (should it be more efficient for that specific operator). - -From the XNNPACK home page: - -> “All operators in XNNPACK support NHWC layout, but additionally allow custom stride along the Channel dimension". - -## PyTorch Best Practice - -The best way to get the most performance from your PyTorch vision models is to ensure that your input tensor is in a **Channels Last** [memory format](https://pytorch.org/tutorials/intermediate/memory_format_tutorial.html) before it is fed into the model. - -You can get even more speedups by optimizing your model to use the XNNPACK backend (by simply calling `optimize_for_mobile()` on your torchscripted model). Note that XNNPACK models will run slower if the inputs are contiguous, so definitely make sure it is in Channels-Last format. - -## Working example showing speedup - -Run this example on [Google Colab](https://colab.research.google.com/gist/suraj813/ad9aebcbffbdd6d02b23ca7231130a30/channels-last-with-xnnpack.ipynb#scrollTo=xvJN73YWXgDF) - note that runtimes on colab CPUs might not reflect accurate performance; it is recommended to run this code on your local machine. - -```python -import torch -from torch.utils.mobile_optimizer import optimize_for_mobile -import torch.backends.xnnpack -import time - -print("XNNPACK is enabled: ", torch.backends.xnnpack.enabled, "\n") - -N, C, H, W = 1, 3, 200, 200 -x = torch.rand(N, C, H, W) -print("Contiguous shape: ", x.shape) -print("Contiguous stride: ", x.stride()) -print() - -xcl = x.to(memory_format=torch.channels_last) -print("Channels-Last shape: ", xcl.shape) -print("Channels-Last stride: ", xcl.stride()) - -## Outputs: - -# XNNPACK is enabled: True - -# Contiguous shape: torch.Size([1, 3, 200, 200]) -# Contiguous stride: (120000, 40000, 200, 1) - -# Channels-Last shape: torch.Size([1, 3, 200, 200]) -# Channels-Last stride: (120000, 1, 600, 3) - -``` - -The input shape stays the same for contiguous and channels-last formats. Internally however, the tensor's layout has changed as you can see in the strides. Now, the number of jumps required to go across channels is only 1 (instead of 40000 in the contiguous tensor). -This better data locality means convolution layers can access all the channels for a given pixel much faster. 
Let's see now how the memory format affects runtime: - -```python -from torchvision.models import resnet34, resnet50, resnet101 - -m = resnet34(pretrained=False) -# m = resnet50(pretrained=False) -# m = resnet101(pretrained=False) - -def get_optimized_model(mm): - mm = mm.eval() - scripted = torch.jit.script(mm) - optimized = optimize_for_mobile(scripted) # explicitly call the xnnpack rewrite - return scripted, optimized - - -def compare_contiguous_CL(mm): - # inference on contiguous - start = time.perf_counter() - for i in range(20): - mm(x) - end = time.perf_counter() - print("Contiguous: ", end-start) - - # inference on channels-last - start = time.perf_counter() - for i in range(20): - mm(xcl) - end = time.perf_counter() - print("Channels-Last: ", end-start) - -with torch.inference_mode(): - scripted, optimized = get_optimized_model(m) - - print("Runtimes for torchscripted model: ") - compare_contiguous_CL(scripted.eval()) - print() - print("Runtimes for mobile-optimized model: ") - compare_contiguous_CL(optimized.eval()) - - -## Outputs (on an Intel Core i9 CPU): - -# Runtimes for torchscripted model: -# Contiguous: 1.6711160129999598 -# Channels-Last: 1.6678222839999535 - -# Runtimes for mobile-optimized model: -# Contiguous: 0.5712863490000473 -# Channels-Last: 0.46113000699995155 - -``` - -## Conclusion - -The Memory Layout of an input tensor can significantly impact a model’s running time. For Vision Models, prefer a **Channels Last** memory format to get the most out of your PyTorch models. - -## References - -- [Row/Column Major matrix storage order](https://en.wikipedia.org/wiki/Row-_and_column-major_order) -- [Loop order impact on performance](https://stackoverflow.com/questions/9936132/why-does-the-order-of-the-loops-affect-performance-when-iterating-over-a-2d-arra) -- [Cachegrind: a cache-miss profiler](https://courses.cs.washington.edu/courses/cse326/05wi/valgrind-doc/cg_main.html) -- [NHWC format explained](https://oneapi-src.github.io/oneDNN/dev_guide_understanding_memory_formats.html) -- [Why does PyTorch prefer NCHW?](https://discuss.pytorch.org/t/why-does-pytorch-prefer-using-nchw/83637/4) -- [XNNPACK](https://github.com/google/XNNPACK) -- [PyTorch memory format tutorial](https://pytorch.org/tutorials/intermediate/memory_format_tutorial.html) -- [Supported operators](https://github.com/pytorch/pytorch/wiki/Operators-with-Channels-Last-support) diff --git a/_posts/2021-12-22-introducing-torchvision-new-multi-weight-support-api.md b/_posts/2021-12-22-introducing-torchvision-new-multi-weight-support-api.md deleted file mode 100644 index 6086188e92e0..000000000000 --- a/_posts/2021-12-22-introducing-torchvision-new-multi-weight-support-api.md +++ /dev/null @@ -1,248 +0,0 @@ ---- -layout: blog_detail -title: "Introducing TorchVision’s New Multi-Weight Support API" -author: Vasilis Vryniotis -featured-img: "assets/images/torchvision_featured.jpg" ---- - -TorchVision has a new backwards compatible API for building models with multi-weight support. The new API allows loading different pre-trained weights on the same model variant, keeps track of vital meta-data such as the classification labels and includes the preprocessing transforms necessary for using the models. In this blog post, we plan to review the prototype API, show-case its features and highlight key differences with the existing one. - -
        - -We are hoping to get your thoughts about the API prior finalizing it. To collect your feedback, we have created a [Github issue](https://github.com/pytorch/vision/issues/5088) where you can post your thoughts, questions and comments. - -## Limitations of the current API - -TorchVision currently provides pre-trained models which could be a starting point for transfer learning or used as-is in Computer Vision applications. The typical way to instantiate a pre-trained model and make a prediction is: - -```Python -import torch - -from PIL import Image -from torchvision import models as M -from torchvision.transforms import transforms as T - - -img = Image.open("test/assets/encode_jpeg/grace_hopper_517x606.jpg") - -# Step 1: Initialize model -model = M.resnet50(pretrained=True) -model.eval() - -# Step 2: Define and initialize the inference transforms -preprocess = T.Compose([ - T.Resize([256, ]), - T.CenterCrop(224), - T.PILToTensor(), - T.ConvertImageDtype(torch.float), - T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]) -]) - -# Step 3: Apply inference preprocessing transforms -batch = preprocess(img).unsqueeze(0) -prediction = model(batch).squeeze(0).softmax(0) - -# Step 4: Use the model and print the predicted category -class_id = prediction.argmax().item() -score = prediction[class_id].item() -with open("imagenet_classes.txt", "r") as f: - categories = [s.strip() for s in f.readlines()] - category_name = categories[class_id] -print(f"{category_name}: {100 * score}%") - -``` - -There are a few limitations with the above approach: - -1. **Inability to support multiple pre-trained weights:** Since the `pretrained` variable is boolean, we can only offer one set of weights. This poses a severe limitation when we significantly [improve the accuracy of existing models](https://pytorch.org/blog/how-to-train-state-of-the-art-models-using-torchvision-latest-primitives/) and we want to make those improvements available to the community. It also stops us from offering pre-trained weights of the same model variant on different datasets. -2. **Missing inference/preprocessing transforms:** The user is forced to define the necessary transforms prior using the model. The inference transforms are usually linked to the training process and dataset used to estimate the weights. Any minor discrepancies in these transforms (such as interpolation value, resize/crop sizes etc) can lead to major reductions in accuracy or unusable models. -3. **Lack of meta-data:** Critical pieces of information in relation to the weights are unavailable to the users. For example, one needs to look into external sources and the documentation to find things like the [category labels](https://github.com/pytorch/vision/issues/1946), the training recipe, the accuracy metrics etc. - -The new API addresses the above limitations and reduces the amount of boilerplate code needed for standard tasks. 
- -## Overview of the prototype API - -Let’s see how we can achieve exactly the same results as above using the new API: - -```Python -from PIL import Image -from torchvision.prototype import models as PM - - -img = Image.open("test/assets/encode_jpeg/grace_hopper_517x606.jpg") - -# Step 1: Initialize model -weights = PM.ResNet50_Weights.IMAGENET1K_V1 -model = PM.resnet50(weights=weights) -model.eval() - -# Step 2: Initialize the inference transforms -preprocess = weights.transforms() - -# Step 3: Apply inference preprocessing transforms -batch = preprocess(img).unsqueeze(0) -prediction = model(batch).squeeze(0).softmax(0) - -# Step 4: Use the model and print the predicted category -class_id = prediction.argmax().item() -score = prediction[class_id].item() -category_name = weights.meta["categories"][class_id] -print(f"{category_name}: {100 * score}*%*") -``` - -As we can see the new API eliminates the aforementioned limitations. Let’s explore the new features in detail. - -### Multi-weight support - -At the heart of the new API, we have the ability to define multiple different weights for the same model variant. Each model building method (eg `resnet50`) has an associated Enum class (eg `ResNet50_Weights`) which has as many entries as the number of pre-trained weights available. Additionally, each Enum class has a `DEFAULT` alias which points to the best available weights for the specific model. This allows the users who want to always use the best available weights to do so without modifying their code. - -Here is an example of initializing models with different weights: - -```python -from torchvision.prototype.models import resnet50, ResNet50_Weights - -# Legacy weights with accuracy 76.130% -model = resnet50(weights=ResNet50_Weights.IMAGENET1K_V1) - -# New weights with accuracy 80.858% -model = resnet50(weights=ResNet50_Weights.IMAGENET1K_V2) - -# Best available weights (currently alias for IMAGENET1K_V2) -model = resnet50(weights=ResNet50_Weights.DEFAULT) - -# No weights - random initialization -model = resnet50(weights=None) -``` - -### Associated meta-data & preprocessing transforms - -The weights of each model are associated with meta-data. The type of information we store depends on the task of the model (Classification, Detection, Segmentation etc). Typical information includes a link to the training recipe, the interpolation mode, information such as the categories and validation metrics. These values are programmatically accessible via the `meta` attribute: - -```Python -from torchvision.prototype.models import ResNet50_Weights - -# Accessing a single record -size = ResNet50_Weights.IMAGENET1K_V2.meta["size"] - -# Iterating the items of the meta-data dictionary -for k, v in ResNet50_Weights.IMAGENET1K_V2.meta.items(): - print(k, v) -``` - -Additionally, each weights entry is associated with the necessary preprocessing transforms. All current preprocessing transforms are JIT-scriptable and can be accessed via the `transforms` attribute. Prior using them with the data, the transforms need to be initialized/constructed. This lazy initialization scheme is done to ensure the solution is memory efficient. The input of the transforms can be either a `PIL.Image` or a `Tensor` read using `torchvision.io`. 
- -```Python -from torchvision.prototype.models import ResNet50_Weights - -# Initializing preprocessing at standard 224x224 resolution -preprocess = ResNet50_Weights.IMAGENET1K_V2.transforms() - -# Initializing preprocessing at 400x400 resolution -preprocess = ResNet50_Weights.IMAGENET1K_V2.transforms(crop_size=400, resize_size=400) - -# Once initialized the callable can accept the image data: -# img_preprocessed = preprocess(img) -``` - -Associating the weights with their meta-data and preprocessing will boost transparency, improve reproducibility and make it easier to document how a set of weights was produced. - -### Get weights by name - -The ability to link directly the weights with their properties (meta data, preprocessing callables etc) is the reason why our implementation uses Enums instead of Strings. Nevertheless for cases when only the name of the weights is available, we offer a method capable of linking Weight names to their Enums: - -```Python -from torchvision.prototype.models import get_weight - -# Weights can be retrieved by name: -assert get_weight("ResNet50_Weights.IMAGENET1K_V1") == ResNet50_Weights.IMAGENET1K_V1 -assert get_weight("ResNet50_Weights.IMAGENET1K_V2") == ResNet50_Weights.IMAGENET1K_V2 - -# Including using the DEFAULT alias: -assert get_weight("ResNet50_Weights.DEFAULT") == ResNet50_Weights.IMAGENET1K_V2 -``` - -## Deprecations - -In the new API the boolean `pretrained` and `pretrained_backbone` parameters, which were previously used to load weights to the full model or to its backbone, are deprecated. The current implementation is fully backwards compatible as it seamlessly maps the old parameters to the new ones. Using the old parameters to the new builders emits the following deprecation warnings: - -```Python ->>> model = torchvision.prototype.models.resnet50(pretrained=True) - UserWarning: The parameter 'pretrained' is deprecated, please use 'weights' instead. -UserWarning: -Arguments other than a weight enum or `None` for 'weights' are deprecated. -The current behavior is equivalent to passing `weights=ResNet50_Weights.IMAGENET1K_V1`. -You can also use `weights=ResNet50_Weights.DEFAULT` to get the most up-to-date weights. -``` - -Additionally the builder methods require using keyword parameters. The use of positional parameter is deprecated and using them emits the following warning: - -```Python ->>> model = torchvision.prototype.models.resnet50(None) -UserWarning: -Using 'weights' as positional parameter(s) is deprecated. -Please use keyword parameter(s) instead. -``` - -## Testing the new API - -Migrating to the new API is very straightforward. The following method calls between the 2 APIs are all equivalent: - -``` -# Using pretrained weights: -torchvision.prototype.models.resnet50(weights=ResNet50_Weights.IMAGENET1K_V1) -torchvision.models.resnet50(pretrained=True) -torchvision.models.resnet50(True) - -# Using no weights: -torchvision.prototype.models.resnet50(weights=None) -torchvision.models.resnet50(pretrained=False) -torchvision.models.resnet50(False) -``` - -Note that the prototype features are available only on the nightly versions of TorchVision, so to use it you need to install it as follows: - -``` -conda install torchvision -c pytorch-nightly -``` - -For alternative ways to install the nightly have a look on the PyTorch [download page](https://pytorch.org/get-started/locally/). 
You can also install TorchVision from source from the latest main; for more information have a look at our [repo](https://github.com/pytorch/vision/blob/main/CONTRIBUTING.md). - -## Accessing state-of-the-art model weights with the new API - -If you are still unconvinced about giving the new API a try, here is one more reason to do so. We’ve recently refreshed our [training recipe](https://pytorch.org/blog/how-to-train-state-of-the-art-models-using-torchvision-latest-primitives/) and achieved SOTA accuracy for many of our models. The improved weights can easily be accessed via the new API. Here is a quick overview of the model improvements:
        - -| Model | Old Acc@1 | New Acc@1 | -| -------------------------- | --------- | --------- | -| EfficientNet B1 | 78.642 | 79.838 | -| MobileNetV3 Large | 74.042 | 75.274 | -| Quantized ResNet50 | 75.92 | 80.282 | -| Quantized ResNeXt101 32x8d | 78.986 | 82.574 | -| RegNet X 400mf | 72.834 | 74.864 | -| RegNet X 800mf | 75.212 | 77.522 | -| RegNet X 1 6gf | 77.04 | 79.668 | -| RegNet X 3 2gf | 78.364 | 81.198 | -| RegNet X 8gf | 79.344 | 81.682 | -| RegNet X 16gf | 80.058 | 82.72 | -| RegNet X 32gf | 80.622 | 83.018 | -| RegNet Y 400mf | 74.046 | 75.806 | -| RegNet Y 800mf | 76.42 | 78.838 | -| RegNet Y 1 6gf | 77.95 | 80.882 | -| RegNet Y 3 2gf | 78.948 | 81.984 | -| RegNet Y 8gf | 80.032 | 82.828 | -| RegNet Y 16gf | 80.424 | 82.89 | -| RegNet Y 32gf | 80.878 | 83.366 | -| ResNet50 | 76.13 | 80.858 | -| ResNet101 | 77.374 | 81.886 | -| ResNet152 | 78.312 | 82.284 | -| ResNeXt50 32x4d | 77.618 | 81.198 | -| ResNeXt101 32x8d | 79.312 | 82.834 | -| Wide ResNet50 2 | 78.468 | 81.602 | -| Wide ResNet101 2 | 78.848 | 82.51 | - -Please spare a few minutes to provide your feedback on the new API, as this is crucial for graduating it from prototype and including it in the next release. You can do this on the dedicated [Github Issue](https://github.com/pytorch/vision/issues/5088). We are looking forward to reading your comments! diff --git a/_posts/2021-12-8-announcing-the-winners-of-the-2021-pytorch-annual-hackathon.md b/_posts/2021-12-8-announcing-the-winners-of-the-2021-pytorch-annual-hackathon.md deleted file mode 100644 index c420a8e9e443..000000000000 --- a/_posts/2021-12-8-announcing-the-winners-of-the-2021-pytorch-annual-hackathon.md +++ /dev/null @@ -1,74 +0,0 @@ ---- -layout: blog_detail -title: 'Announcing the Winners of the 2021 PyTorch Annual Hackathon' -author: Team PyTorch -featured-img: 'assets/images/social_hackathon21.png' ---- - -More than 1,900 people worked hard in this year’s PyTorch Annual Hackathon to create unique tools and applications for PyTorch developers and researchers. - -*Notice: None of the projects submitted to the hackathon are associated with or offered by Meta Platforms, Inc.* - -
        - -This year, participants could enter their projects into following three categories: -* **PyTorch Developer Tools**: a tool or library for improving productivity and efficiency for PyTorch researchers and developers. -* **Web and Mobile Applications Powered by PyTorch**: a web or mobile interface and/or an embedded device built using PyTorch. -* **PyTorch Responsible AI Development Tools**: a tool, library, or web/mobile app to support researchers and developers in creating responsible AI that factors in fairness, security, privacy, and more throughout its entire development process. - -The virtual hackathon ran from September 8 through November 2, 2021, with more than 1,900 registered participants from 110 countries, submitting a total of 65 projects. Entrants were judged on their idea’s quality, originality, potential impact, and how well they implemented it. All projects can be viewed [here](https://pytorch2021.devpost.com/project-gallery). - -Meet the winners of each category below! - -## PYTORCH DEVELOPER TOOLS - -#### **First Place: [RaNNC](https://devpost.com/software/rannc-rapid-neural-network-connector)** -RaNNC is a middleware to automate hybrid model/data parallelism for training very large-scale neural networks capable of training 100 billion parameter models without any manual tuning. - -#### **Second Place: [XiTorch](https://devpost.com/software/xitorch-differentiable-scientific-computing-library)** -XiTorch provides first and higher order gradients of functional routines, such as optimization, rootfinder, and ODE solver. It also contains operations for implicit linear operators (e.g. large matrix that is expressed only by its matrix-vector multiplication) such as symmetric eigen-decomposition, linear solve, and singular value decomposition. - -#### **Third Place: [TorchLiberator](https://devpost.com/software/torchliberator-partial-weight-loading)** -TorchLiberator automates model surgery, finding the maximum correspondence between weights in two networks. - -#### **Honorable Mentions** -* [PADL](https://devpost.com/software/doch) manages your entire PyTorch work flow with a single python abstraction and a beautiful functional API, so there’s no more complex configuration or juggling preprocessing, postprocessing and forward passes. -* [PyTree](https://devpost.com/software/pytree) is a PyTorch package for recursive neural networks that provides highly generic recursive neural network implementations as well as efficient batching methods. -* [IndicLP](https://devpost.com/software/indiclp) makes it easier for developers and researchers to build applications and models in Indian Languages, thus making NLP a more diverse field. - -## WEB/MOBILE APPLICATIONS POWERED BY PYTORCH - -#### **First Place: [PyTorch Driving Guardian](https://devpost.com/software/pytorch-driving-guardian)** -PyTorch Driving Guardian is a tool that monitors driver alertness, emotional state, and potential blind spots on the road. - -#### **Second Place: [Kronia](https://devpost.com/software/kronia)** -Kronia is an Android mobile app built to maximize the harvest outputs for farmers. - -#### **Third Place: [Heyoh camera for Mac](https://devpost.com/software/heyoh-camera)** -Heyoh is a Mac virtual camera for Zoom and Meets that augments live video by recognizing hand gestures and smiles and shows animated effects to other video participants. 
- -#### **Honorable Mentions** -* [Mamma AI](https://devpost.com/software/mamma-ai) is a tool that helps doctors with the breast cancer identification process by identifying areas likely to have cancer using ultrasonic and x-ray images. -* [AgingClock](https://devpost.com/software/agingclock) is a tool that predicts biological age first with methylation genome data, then blood test data and eventually with multimodal omics and lifestyle data. -* [Iris](https://devpost.com/software/iris-7s3yna) is an open source photos platform which is more of an alternative of Google Photos that includes features such as Listing photos, Detecting Categories, Detecting and Classifying Faces from Photos, Detecting and Clustering by Location and Things in Photos. - -## PYTORCH RESPONSIBLE AI DEVELOPMENT TOOLS - -#### **First Place: [FairWell](https://devpost.com/software/fairwell-a-tool-to-bid-goodbye-to-unknown-ai-biasness)** -FairWell aims to address model bias on specific groups of people by allowing data scientists to evaluate their dataset and model predictions and take steps to make their datasets more inclusive and their models less biased. - -#### **Second Place: [promp2slip](https://devpost.com/software/promp2slip)** -Promp2slip is a library that tests the ethics of language models by using natural adversarial texts. - -#### **Third Place: [Phorch](https://devpost.com/software/phorch)** -Phorch adversarially attacks the data using FIGA (Feature Importance Guided Attack) and creates 3 different attack sets of data based on certain parameters. These features are utilized to implement adversarial training as a defense against FIGA using neural net architecture in PyTorch. - -#### **Honorable Mentions** -* [Greenops](https://devpost.com/software/greenops) helps to measure the footprints of deep learning models at training, testing and evaluating to reduce energy consumption and carbon footprints. -* [Xaitk-saliency](https://devpost.com/software/xaitk-saliency) is an open-source, explainable AI toolkit for visual saliency algorithm interfaces and implementations, built for analytic and autonomy applications. - -Thank you, - -Team PyTorch diff --git a/_posts/2021-3-24-pytorch-for-amd-rocm-platform-now-available-as-python-package.md b/_posts/2021-3-24-pytorch-for-amd-rocm-platform-now-available-as-python-package.md deleted file mode 100644 index d13784307e1c..000000000000 --- a/_posts/2021-3-24-pytorch-for-amd-rocm-platform-now-available-as-python-package.md +++ /dev/null @@ -1,59 +0,0 @@ ---- -layout: blog_detail -title: 'PyTorch for AMD ROCm™ Platform now available as Python package' -author: Niles Burbank – Director PM at AMD, Mayank Daga – Director, Deep Learning Software at AMD ---- - -With the PyTorch 1.8 release, we are delighted to announce a new installation option for users of -PyTorch on the ROCm™ open software platform. An installable Python package is now hosted on -pytorch.org, along with instructions for local installation in the same simple, selectable format as -PyTorch packages for CPU-only configurations and other GPU platforms. PyTorch on ROCm includes full -capability for mixed-precision and large-scale training using AMD’s MIOpen & RCCL libraries. This -provides a new option for data scientists, researchers, students, and others in the community to get -started with accelerated PyTorch using AMD GPUs. - -
        - -## The ROCm Ecosystem - -ROCm is AMD’s open source software platform for GPU-accelerated high performance computing and -machine learning. Since the original ROCm release in 2016, the ROCm platform has evolved to support -additional libraries and tools, a wider set of Linux® distributions, and a range of new GPUs. This includes -the AMD Instinct™ MI100, the first GPU based on AMD CDNA™ architecture. - -The ROCm ecosystem has an established history of support for PyTorch, which was initially implemented -as a fork of the PyTorch project, and more recently through ROCm support in the upstream PyTorch -code. PyTorch users can install PyTorch for ROCm using AMD’s public PyTorch docker image, and can of -course build PyTorch for ROCm from source. With PyTorch 1.8, these existing installation options are -now complemented by the availability of an installable Python package. - -The primary focus of ROCm has always been high performance computing at scale. The combined -capabilities of ROCm and AMD’s Instinct family of data center GPUs are particularly suited to the -challenges of HPC at data center scale. PyTorch is a natural fit for this environment, as HPC and ML -workflows become more intertwined. - -### Getting started with PyTorch for ROCm - -The scope for this build of PyTorch is AMD GPUs with ROCm support, running on Linux. The GPUs -supported by ROCm include all of AMD’s Instinct family of compute-focused data center GPUs, along -with some other select GPUs. A current list of supported GPUs can be found in the [ROCm Github -repository](https://github.com/RadeonOpenCompute/ROCm#supported-gpus). After confirming that the target system includes supported GPUs and the current 4.0.1 -release of ROCm, installation of PyTorch follows the same simple Pip-based installation as any other -Python package. As with PyTorch builds for other platforms, the configurator at [https://pytorch.org/get-started/locally/](https://pytorch.org/get-started/locally/) provides the specific command line to be run. - -PyTorch for ROCm is built from the upstream PyTorch repository, and is a full featured implementation. -Notably, it includes support for distributed training across multiple GPUs and supports accelerated -mixed precision training. - -### More information - -A list of ROCm supported GPUs and operating systems can be found at -[https://github.com/RadeonOpenCompute/ROCm](https://github.com/RadeonOpenCompute/ROCm) -General documentation on the ROCm platform is available at [https://rocmdocs.amd.com/en/latest/](https://rocmdocs.amd.com/en/latest/) -ROCm Learning Center at [https://developer.amd.com/resources/rocm-resources/rocm-learning-center/](https://developer.amd.com/resources/rocm-resources/rocm-learning-center/) General information on AMD’s offerings for HPC and ML can be found at [https://amd.com/hpc](https://amd.com/hpc) - -### Feedback -An engaged user base is a tremendously important part of the PyTorch ecosystem. We would be deeply -appreciative of feedback on the PyTorch for ROCm experience in the [PyTorch discussion forum](https://discuss.pytorch.org/) and, where appropriate, reporting any issues via [Github](https://github.com/pytorch/pytorch). 
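As a final sanity check after installation, note that the ROCm build exposes AMD GPUs through the familiar `torch.cuda` interface, so existing CUDA-style PyTorch code runs unchanged. The snippet below is a minimal sketch and assumes a supported AMD GPU with a ROCm-enabled PyTorch build:

```python
import torch

# On a ROCm build, AMD GPUs are reported through the torch.cuda API.
print(torch.cuda.is_available())       # expected: True on a supported AMD GPU
print(torch.cuda.get_device_name(0))   # e.g. the name of an AMD Instinct GPU

x = torch.randn(1024, 1024, device="cuda")
y = x @ x.t()
print(y.device, y.shape)
```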
diff --git a/_posts/2021-3-25-introducing-pytorch-profiler-the-new-and-improved-performance-tool.md b/_posts/2021-3-25-introducing-pytorch-profiler-the-new-and-improved-performance-tool.md deleted file mode 100644 index 19a1c3c009ee..000000000000 --- a/_posts/2021-3-25-introducing-pytorch-profiler-the-new-and-improved-performance-tool.md +++ /dev/null @@ -1,68 +0,0 @@ ---- -layout: blog_detail -title: 'Introducing PyTorch Profiler - the new and improved performance tool' -author: Maxim Lukiyanov - Principal PM at Microsoft, Guoliang Hua - Principal Engineering Manager at Microsoft, Geeta Chauhan - Partner Engineering Lead at Facebook, Gisle Dankel - Tech Lead at Facebook ---- - -Along with [PyTorch 1.8.1 release](https://github.com/pytorch/pytorch/releases/tag/v1.8.1), we are excited to announce PyTorch Profiler – the new and improved performance debugging profiler for PyTorch. Developed as part of a collaboration between Microsoft and Facebook, the PyTorch Profiler is an open-source tool that enables accurate and efficient performance analysis and troubleshooting for large-scale deep learning models. - -Analyzing and improving large-scale deep learning model performance is an ongoing challenge that grows in importance as the model sizes increase. For a long time, PyTorch users had a hard time solving this challenge due to the lack of available tools. There were standard performance debugging tools that provide GPU hardware level information but missed PyTorch-specific context of operations. In order to recover missed information, users needed to combine multiple tools together or manually add minimum correlation information to make sense of the data. There was also the autograd profiler (```torch.autograd.profiler```) which can capture information about PyTorch operations but does not capture detailed GPU hardware-level information and cannot provide support for visualization. - -The new PyTorch Profiler (```torch.profiler```) is a tool that brings both types of information together and then builds experience that realizes the full potential of that information. This new profiler collects both GPU hardware and PyTorch related information, correlates them, performs automatic detection of bottlenecks in the model, and generates recommendations on how to resolve these bottlenecks. All of this information from the profiler is visualized for the user in TensorBoard. The new Profiler API is natively supported in PyTorch and delivers the simplest experience available to date where users can profile their models without installing any additional packages and see results immediately in TensorBoard with the new PyTorch Profiler plugin. Below is the screenshot of PyTorch Profiler - automatic bottleneck detection. - -
        - -## Getting started - -PyTorch Profiler is the next version of the PyTorch autograd profiler. It has a new module namespace ```torch.profiler``` but maintains compatibility with autograd profiler APIs. The Profiler uses a new GPU profiling engine, built using Nvidia CUPTI APIs, and is able to capture GPU kernel events with high fidelity. To profile your model training loop, wrap the code in the profiler context manager as shown below. - -```python - with torch.profiler.profile( - schedule=torch.profiler.schedule( - wait=2, - warmup=2, - active=6, - repeat=1), - on_trace_ready=tensorboard_trace_handler, - with_stack=True -) as profiler: - for step, data in enumerate(trainloader, 0): - print("step:{}".format(step)) - inputs, labels = data[0].to(device=device), data[1].to(device=device) - - outputs = model(inputs) - loss = criterion(outputs, labels) - - optimizer.zero_grad() - loss.backward() - optimizer.step() - profiler.step() -``` -The ```schedule``` parameter allows you to limit the number of training steps included in the profile to reduce the amount of data collected and simplify visual analysis by focusing on what’s important. The ```tensorboard_trace_handler``` automatically saves profiling results to disk for analysis in TensorBoard. - -To view results of the profiling session in TensorBoard, install PyTorch Profiler TensorBoard Plugin package. - -```python -pip install torch_tb_profiler -``` -## Visual Studio Code Integration -[Microsoft Visual Studio Code](https://code.visualstudio.com/) is one of the most popular code editors for Python developers and data scientists. The [Python extension](https://marketplace.visualstudio.com/items?itemName=ms-python.python) for VS Code recently added the integration of TensorBoard into the code editor, including support for the PyTorch Profiler. Once you have VS Code and the Python extension installed, you can quickly open the TensorBoard Profiler plugin by launching the Command Palette using the keyboard shortcut CTRL + SHIFT + P (CMD + SHIFT + P on a Mac) and typing the “Launch TensorBoard” command. - -
        - -This integration comes with a built-in lifecycle management feature. VS Code will install the TensorBoard package and the PyTorch Profiler plugin package (coming in mid-April) automatically if you don’t have them on your system. VS Code will also launch TensorBoard process for you and automatically look for any TensorBoard log files within your current directory. When you’re done, just close the tab and VS Code will automatically close the process. No more Terminal windows running on your system to provide a backend for the TensorBoard UI! Below is PyTorch Profiler Trace View running in TensorBoard. - -
        - -Learn more about TensorBoard support in VS Code in [this blog](https://devblogs.microsoft.com/python/python-in-visual-studio-code-february-2021-release/). - -## Feedback - -Review [PyTorch Profiler documentation](https://pytorch.org/docs/stable/profiler.html), give Profiler a try and let us know about your experience. Provide your feedback on [PyTorch Discussion Forum](https://discuss.pytorch.org/) or file issues on [PyTorch GitHub](https://github.com/pytorch/pytorch). - diff --git a/_posts/2021-3-3-the-torch.fft-module-accelerated-fast-fourier-transforms-with-autograd-in-pyTorch.md b/_posts/2021-3-3-the-torch.fft-module-accelerated-fast-fourier-transforms-with-autograd-in-pyTorch.md deleted file mode 100644 index 5bf2fb31dbbf..000000000000 --- a/_posts/2021-3-3-the-torch.fft-module-accelerated-fast-fourier-transforms-with-autograd-in-pyTorch.md +++ /dev/null @@ -1,74 +0,0 @@ ---- -layout: blog_detail -title: 'The torch.fft module: Accelerated Fast Fourier Transforms with Autograd in PyTorch' -author: Mike Ruberry, Peter Bell, and Joe Spisak ---- - -The Fast Fourier Transform (FFT) calculates the Discrete Fourier Transform in O(n log n) time. It is foundational to a wide variety of numerical algorithms and signal processing techniques since it makes working in signals’ “frequency domains” as tractable as working in their spatial or temporal domains. - -As part of PyTorch’s goal to support hardware-accelerated deep learning and scientific computing, we have invested in improving our FFT support, and with PyTorch 1.8, we are releasing the ``torch.fft`` module. This module implements the same functions as NumPy’s ``np.fft`` module, but with support for accelerators, like GPUs, and autograd. - -## Getting started - -Getting started with the new ``torch.fft`` module is easy whether you are familiar with NumPy’s ``np.fft`` module or not. While complete documentation for each function in the module can be found [here](https://pytorch.org/docs/1.8.0/fft.html), a breakdown of what it offers is: - -* ``fft``, which computes a complex FFT over a single dimension, and ``ifft``, its inverse -* the more general ``fftn`` and ``ifftn``, which support multiple dimensions -* The “real” FFT functions, ``rfft``, ``irfft``, ``rfftn``, ``irfftn``, designed to work with signals that are real-valued in their time domains -* The "Hermitian" FFT functions, ``hfft`` and ``ihfft``, designed to work with signals that are real-valued in their frequency domains -* Helper functions, like ``fftfreq``, ``rfftfreq``, ``fftshift``, ``ifftshift``, that make it easier to manipulate signals - -We think these functions provide a straightforward interface for FFT functionality, as vetted by the NumPy community, although we are always interested in feedback and suggestions! 
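As a small added illustration of the autograd support mentioned above (a sketch, not part of the original post), gradients flow through ``torch.fft`` operations like any other differentiable op:

```python
import torch

x = torch.randn(64, requires_grad=True)

# FFT -> inverse FFT round trip; gradients propagate back to x.
X = torch.fft.fft(x)
y = torch.fft.ifft(X).real.sum()
y.backward()

print(x.grad.shape)   # torch.Size([64])
```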
- -To better illustrate how easy it is to move from NumPy’s ``np.fft`` module to PyTorch’s ``torch.fft`` module, let’s look at a NumPy implementation of a simple low-pass filter that removes high-frequency variance from a 2-dimensional image, a form of noise reduction or blurring: - -```python -import numpy as np -import numpy.fft as fft - -def lowpass_np(input, limit): - pass1 = np.abs(fft.rfftfreq(input.shape[-1])) < limit - pass2 = np.abs(fft.fftfreq(input.shape[-2])) < limit - kernel = np.outer(pass2, pass1) - - fft_input = fft.rfft2(input) - return fft.irfft2(fft_input * kernel, s=input.shape[-2:]) -``` - -Now let’s see the same filter implemented in PyTorch: - -```python -import torch -import torch.fft as fft - -def lowpass_torch(input, limit): - pass1 = torch.abs(fft.rfftfreq(input.shape[-1])) < limit - pass2 = torch.abs(fft.fftfreq(input.shape[-2])) < limit - kernel = torch.outer(pass2, pass1) - - fft_input = fft.rfft2(input) - return fft.irfft2(fft_input * kernel, s=input.shape[-2:]) -``` - -Not only do current uses of NumPy’s ``np.fft`` module translate directly to ``torch.fft``, the ``torch.fft`` operations also support tensors on accelerators, like GPUs and autograd. This makes it possible to (among other things) develop new neural network modules using the FFT. - - -## Performance - -The ``torch.fft`` module is not only easy to use — it is also fast! PyTorch natively supports Intel’s MKL-FFT library on Intel CPUs, and NVIDIA’s cuFFT library on CUDA devices, and we have carefully optimized how we use those libraries to maximize performance. While your own results will depend on your CPU and CUDA hardware, computing Fast Fourier Transforms on CUDA devices can be many times faster than computing it on the CPU, especially for larger signals. - -In the future, we may add support for additional math libraries to support more hardware. See below for where you can request additional hardware support. - -## Updating from older PyTorch versions - -Some PyTorch users might know that older versions of PyTorch also offered FFT functionality with the ``torch.fft()`` function. Unfortunately, this function had to be removed because its name conflicted with the new module’s name, and we think the new functionality is the best way to use the Fast Fourier Transform in PyTorch. In particular, ``torch.fft()`` was developed before PyTorch supported complex tensors, while the ``torch.fft`` module was designed to work with them. - -PyTorch also has a “Short Time Fourier Transform”, ``torch.stft``, and its inverse ``torch.istft``. These functions are being kept but updated to support complex tensors. - -## Future - -As mentioned, PyTorch 1.8 offers the torch.fft module, which makes it easy to use the Fast Fourier Transform (FFT) on accelerators and with support for autograd. We encourage you to try it out! - -While this module has been modeled after NumPy’s ``np.fft`` module so far, we are not stopping there. We are eager to hear from you, our community, on what FFT-related functionality you need, and we encourage you to create posts on our forums at [https://discuss.pytorch.org/](https://discuss.pytorch.org/), or [file issues on our Github](https://github.com/pytorch/pytorch/issues/new?assignees=&labels=&template=feature-request.md) with your feedback and requests. Early adopters have already started asking about Discrete Cosine Transforms and support for more hardware platforms, for example, and we are investigating those features now. 
- -We look forward to hearing from you and seeing what the community does with PyTorch’s new FFT functionality! diff --git a/_posts/2021-3-4-pytorch-1.8-new-library-releases.md b/_posts/2021-3-4-pytorch-1.8-new-library-releases.md deleted file mode 100644 index c0f18f2ac863..000000000000 --- a/_posts/2021-3-4-pytorch-1.8-new-library-releases.md +++ /dev/null @@ -1,158 +0,0 @@ ---- -layout: blog_detail -title: 'New PyTorch library releases including TorchVision Mobile, TorchAudio I/O, and more' -author: Team PyTorch ---- - -Today, we are announcing updates to a number of PyTorch libraries, alongside the [PyTorch 1.8 release](https://pytorch.org/blog/pytorch-1.8-released). The updates include new releases for the domain libraries including TorchVision, TorchText and TorchAudio as well as new version of TorchCSPRNG. These releases include a number of new features and improvements and, along with the PyTorch 1.8 release, provide a broad set of updates for the PyTorch community to build on and leverage. - -Some highlights include: -* **TorchVision** - Added support for PyTorch Mobile including [Detectron2Go](https://ai.facebook.com/blog/d2go-brings-detectron2-to-mobile) (D2Go), auto-augmentation of data during training, on the fly type conversion, and [AMP autocasting](https://pytorch.org/docs/stable/amp.html). -* **TorchAudio** - Major improvements to I/O, including defaulting to sox_io backend and file-like object support. Added Kaldi Pitch feature and support for CMake based build allowing TorchAudio to better support no-Python environments. -* **TorchText** - Updated the dataset loading API to be compatible with standard PyTorch data loading utilities. -* **TorchCSPRNG** - Support for cryptographically secure pseudorandom number generators for PyTorch is now stable with new APIs for AES128 ECB/CTR and CUDA support on Windows. - -Please note that, starting in PyTorch 1.6, features are classified as Stable, Beta, and Prototype. Prototype features are not included as part of the binary distribution and are instead available through either building from source, using nightlies or via compiler flag. You can see the detailed announcement [here](https://pytorch.org/blog/pytorch-feature-classification-changes/). - - -# TorchVision 0.9.0 -### [Stable] TorchVision Mobile: Operators, Android Binaries, and Tutorial -We are excited to announce the first on-device support and binaries for a PyTorch domain library. We have seen significant appetite in both research and industry for on-device vision support to allow low latency, privacy friendly, and resource efficient mobile vision experiences. You can follow this [new tutorial](https://github.com/pytorch/android-demo-app/tree/master/D2Go) to build your own Android object detection app using TorchVision operators, D2Go, or your own custom operators and model. - -
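The D2Go tutorial covers the detection workflow end to end; for a plain TorchVision model, a common way to prepare it for the Android or iOS demo apps is to trace it and run the mobile optimizer. A hedged sketch (model choice and file name are arbitrary):

```python
import torch
import torchvision
from torch.utils.mobile_optimizer import optimize_for_mobile

model = torchvision.models.mobilenet_v3_small(pretrained=True)
model.eval()

# Trace with an example input, then apply the standard mobile optimization passes.
traced = torch.jit.trace(model, torch.rand(1, 3, 224, 224))
optimized = optimize_for_mobile(traced)
torch.jit.save(optimized, "mobilenet_v3_small.pt")  # bundle this file with the mobile app
```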
        - -### [Stable] New Mobile models for Classification, Object Detection and Semantic Segmentation -We have added support for the MobileNetV3 architecture and provided pre-trained weights for Classification, Object Detection and Segmentation. It is easy to get up and running with these models, just import and load them as you would any ```torchvision``` model: -```python -import torch -import torchvision - -# Classification -x = torch.rand(1, 3, 224, 224) -m_classifier = torchvision.models.mobilenet_v3_large(pretrained=True) -m_classifier.eval() -predictions = m_classifier(x) - -# Quantized Classification -x = torch.rand(1, 3, 224, 224) -m_classifier = torchvision.models.quantization.mobilenet_v3_large(pretrained=True) -m_classifier.eval() -predictions = m_classifier(x) - -# Object Detection: Highly Accurate High Resolution Mobile Model -x = [torch.rand(3, 300, 400), torch.rand(3, 500, 400)] -m_detector = torchvision.models.detection.fasterrcnn_mobilenet_v3_large_fpn(pretrained=True) -m_detector.eval() -predictions = m_detector(x) - -# Semantic Segmentation: Highly Accurate Mobile Model -x = torch.rand(1, 3, 520, 520) -m_segmenter = torchvision.models.segmentation.deeplabv3_mobilenet_v3_large(pretrained=True) -m_segmenter.eval() -predictions = m_segmenter(x) -``` -These models are highly competitive with TorchVision’s existing models on resource efficiency, speed, and accuracy. See our [release notes](https://github.com/pytorch/vision/releases) for detailed performance metrics. - -### [Stable] AutoAugment -[AutoAugment](https://arxiv.org/pdf/1805.09501.pdf) is a common Data Augmentation technique that can increase the accuracy of Scene Classification models. Though the data augmentation policies are directly linked to their trained dataset, empirical studies show that ImageNet policies provide significant improvements when applied to other datasets. We’ve implemented 3 policies learned on the following datasets: ImageNet, CIFA10 and SVHN. These can be used standalone or mixed-and-matched with existing transforms: -```python -from torchvision import transforms - -t = transforms.AutoAugment() -transformed = t(image) - - -transform=transforms.Compose([ - transforms.Resize(256), - transforms.AutoAugment(), - transforms.ToTensor()]) -``` -### Other New Features for TorchVision -* [Stable] All read and decode methods in the io.image package now support: - * Palette, Grayscale Alpha and RBG Alpha image types during PNG decoding - * On-the-fly conversion of image from one type to the other during read -* [Stable] WiderFace dataset -* [Stable] Improved FasterRCNN speed and accuracy by introducing a score threshold on RPN -* [Stable] Modulation input for DeformConv2D -* [Stable] Option to write audio to a video file -* [Stable] Utility to draw bounding boxes -* [Beta] Autocast support in all Operators -Find the full TorchVision release notes [here](https://github.com/pytorch/vision/releases). - -# TorchAudio 0.8.0 -### I/O Improvements -We have continued our work from the [previous release](https://github.com/pytorch/audio/releases/tag/v0.7.0) to improve TorchAudio’s I/O support, including: -* [Stable] Changing the default backend to “sox_io” (for Linux/macOS), and updating the “soundfile” backend’s interface to align with that of “sox_io”. The legacy backend and interface are still accessible, though it is strongly discouraged to use them. -* [Stable] File-like object support in both "sox_io" backend, “soundfile” backend and sox_effects. 
-* [Stable] New options to change the format, encoding, and bits_per_sample when saving. -* [Stable] Added GSM, HTK, AMB, AMR-NB and AMR-WB format support to the “sox_io” backend. -* [Beta] A new ```functional.apply_codec``` function which can degrade audio data by applying audio codecs supported by “sox_io” backend in an in-memory fashion. -Here are some examples of features landed in this release: - -```python -# Load audio over HTTP -with requests.get(URL, stream=True) as response: - waveform, sample_rate = torchaudio.load(response.raw) - -# Saving to Bytes buffer as 32-bit floating-point PCM -buffer_ = io.BytesIO() -torchaudio.save( - buffer_, waveform, sample_rate, - format="wav", encoding="PCM_S", bits_per_sample=16) - -# Apply effects while loading audio from S3 -client = boto3.client('s3') -response = client.get_object(Bucket=S3_BUCKET, Key=S3_KEY) -waveform, sample_rate = torchaudio.sox_effects.apply_effect_file( - response['Body'], - [["lowpass", "-1", "300"], ["rate", "8000"]]) - -# Apply GSM codec to Tensor -encoded = torchaudio.functional.apply_codec( - waveform, sample_rate, format="gsm") -``` - -Check out the revamped audio preprocessing tutorial, [Audio Manipulation with TorchAudio](https://pytorch.org/tutorials/beginner/audio_preprocessing_tutorial.html). - -### [Stable] Switch to CMake-based build -In the previous version of TorchAudio, it was utilizing CMake to build third party dependencies. Starting in 0.8.0, TorchaAudio uses CMake to build its C++ extension. This will open the door to integrate TorchAudio in non-Python environments (such as C++ applications and mobile). We will continue working on adding example applications and mobile integrations. - -### [Beta] Improved and New Audio Transforms -We have added two widely requested operators in this release: the SpectralCentroid transform and the Kaldi Pitch feature extraction (detailed in ["A pitch extraction algorithm tuned for automatic speech recognition"](https://ieeexplore.ieee.org/document/6854049)). We’ve also exposed a normalization method to Mel transforms, and additional STFT arguments to Spectrogram. We would like to ask our community to continue to [raise feature requests](https://github.com/pytorch/audio/issues/new?assignees=&labels=&template=feature-request.md) for core audio processing features like these! - -### Community Contributions -We had more contributions from the open source community in this release than ever before, including several completely new features. We would like to extend our sincere thanks to the community. Please check out the newly added [CONTRIBUTING.md](https://github.com/pytorch/audio/blob/master/CONTRIBUTING.md) for ways to contribute code, and remember that reporting bugs and requesting features are just as valuable. We will continue posting well-scoped work items as issues labeled “help-wanted” and “contributions-welcome” for anyone who would like to contribute code, and are happy to coach new contributors through the contribution process. - -Find the full TorchAudio release notes [here](https://github.com/pytorch/audio/releases). - -# TorchText 0.9.0 -### [Beta] Dataset API Updates -In this release, we are updating TorchText’s dataset API to be compatible with PyTorch data utilities, such as DataLoader, and are deprecating TorchText’s custom data abstractions such as ```Field```. The updated datasets are simple string-by-string iterators over the data. 
For guidance about migrating from the legacy abstractions to use modern PyTorch data utilities, please refer to our [migration guide](https://github.com/pytorch/text/blob/master/examples/legacy_tutorial/migration_tutorial.ipynb). - -The text datasets listed below have been updated as part of this work. For examples of how to use these datasets, please refer to our [end-to-end text classification tutorial](https://pytorch.org/tutorials/beginner/text_sentiment_ngrams_tutorial.html). -* **Language modeling:** WikiText2, WikiText103, PennTreebank, EnWik9 -* **Text classification:** AG_NEWS, SogouNews, DBpedia, YelpReviewPolarity, YelpReviewFull, YahooAnswers, AmazonReviewPolarity, AmazonReviewFull, IMDB -* **Sequence tagging:** UDPOS, CoNLL2000Chunking -* **Translation:** IWSLT2016, IWSLT2017 -* **Question answer:** SQuAD1, SQuAD2 - -Find the full TorchText release notes [here](https://github.com/pytorch/text/releases). - -# [Stable] TorchCSPRNG 0.2.0 -We [released TorchCSPRNG in August 2020](https://pytorch.org/blog/torchcsprng-release-blog/), a PyTorch C++/CUDA extension that provides cryptographically secure pseudorandom number generators for PyTorch. Today, we are releasing the 0.2.0 version and designating the library as stable. This release includes a new API for encrypt/decrypt with AES128 ECB/CTR as well as CUDA 11 and Windows CUDA support. - -Find the full TorchCSPRNG release notes [here](https://github.com/pytorch/csprng/releases/). - - - - - - - -Thanks for reading, and if you are excited about these updates and want to participate in the future of PyTorch, we encourage you to join the [discussion forums](https://discuss.pytorch.org/) and [open GitHub issues](https://github.com/pytorch). - -Cheers! - -***Team PyTorch*** diff --git a/_posts/2021-3-4-pytorch-1.8-released.md b/_posts/2021-3-4-pytorch-1.8-released.md deleted file mode 100644 index 1f0ac2d66df7..000000000000 --- a/_posts/2021-3-4-pytorch-1.8-released.md +++ /dev/null @@ -1,133 +0,0 @@ ---- -layout: blog_detail -title: 'PyTorch 1.8 Release, including Compiler and Distributed Training updates, and New Mobile Tutorials' -author: Team PyTorch ---- - -We are excited to announce the availability of PyTorch 1.8. This release is composed of more than 3,000 commits since 1.7. It includes major updates and new features for compilation, code optimization, frontend APIs for scientific computing, and AMD ROCm support through binaries that are available via pytorch.org. It also provides improved features for large-scale training for pipeline and model parallelism, and gradient compression. A few of the highlights include: -1. Support for doing python to python functional transformations via ```torch.fx```; -2. Added or stabilized APIs to support FFTs (```torch.fft```), Linear Algebra functions (```torch.linalg```), added support for autograd for complex tensors and updates to improve performance for calculating hessians and jacobians; and -3. Significant updates and improvements to distributed training including: Improved NCCL reliability; Pipeline parallelism support; RPC profiling; and support for communication hooks adding gradient compression. -See the full release notes [here](https://github.com/pytorch/pytorch/releases). - -Along with 1.8, we are also releasing major updates to PyTorch libraries including [TorchCSPRNG](https://github.com/pytorch/csprng), [TorchVision](https://github.com/pytorch/vision), [TorchText](https://github.com/pytorch/text) and [TorchAudio](https://github.com/pytorch/audio). 
For more on the library releases, see the post [here](http://pytorch.org/blog/pytorch-1.8-new-library-releases). As previously noted, features in PyTorch releases are classified as Stable, Beta and Prototype. You can learn more about the definitions in the post [here](https://pytorch.org/blog/pytorch-feature-classification-changes/). - -# New and Updated APIs -The PyTorch 1.8 release brings a host of new and updated API surfaces ranging from additional APIs for NumPy compatibility, also support for ways to improve and scale your code for performance at both inference and training time. Here is a brief summary of the major features coming in this release: - -### [Stable] ```Torch.fft``` support for high performance NumPy style FFTs -As part of PyTorch’s goal to support scientific computing, we have invested in improving our FFT support and with PyTorch 1.8, we are releasing the ```torch.fft``` module. This module implements the same functions as NumPy’s ```np.fft``` module, but with support for hardware acceleration and autograd. -* See this [blog post](https://pytorch.org/blog/the-torch.fft-module-accelerated-fast-fourier-transforms-with-autograd-in-pyTorch/) for more details -* [Documentation](https://pytorch.org/docs/1.8.0/fft.html) - -### [Beta] Support for NumPy style linear algebra functions via ```torch.linalg``` -The ```torch.linalg``` module, modeled after NumPy’s [np.linalg](https://numpy.org/doc/stable/reference/routines.linalg.html?highlight=linalg#module-numpy.linalg) module, brings NumPy-style support for common linear algebra operations including Cholesky decompositions, determinants, eigenvalues and many others. -* [Documentation](https://pytorch.org/docs/1.8.0/linalg.html) - -## [Beta] Python code Transformations with FX -FX allows you to write transformations of the form ```transform(input_module : nn.Module)``` -> ```nn.Module```, where you can feed in a ```Module``` instance and get a transformed ```Module``` instance out of it. - -This kind of functionality is applicable in many scenarios. For example, the FX-based Graph Mode Quantization product is releasing as a prototype contemporaneously with FX. Graph Mode Quantization automates the process of quantizing a neural net and does so by leveraging FX’s program capture, analysis and transformation facilities. We are also developing many other transformation products with FX and we are excited to share this powerful toolkit with the community. - -Because FX transforms consume and produce nn.Module instances, they can be used within many existing PyTorch workflows. This includes workflows that, for example, train in Python then deploy via TorchScript. - -You can read more about FX in the official [documentation](https://pytorch.org/docs/master/fx.html). You can also find several examples of program transformations implemented using ```torch.fx``` [here](https://github.com/pytorch/examples/tree/master/fx). We are constantly improving FX and invite you to share any feedback you have about the toolkit on the [forums](https://discuss.pytorch.org/) or [issue tracker](https://github.com/pytorch/pytorch/issues). - -We’d like to acknowledge [TorchScript](https://pytorch.org/docs/stable/jit.html) tracing, [Apache MXNet](https://mxnet.apache.org/versions/1.7.0/) hybridize, and more recently [JAX](https://github.com/google/jax) as influences for program acquisition via tracing. 
We’d also like to acknowledge [Caffe2](https://caffe2.ai/), [JAX](https://github.com/google/jax), and [TensorFlow](https://www.tensorflow.org/) as inspiration for the value of simple, directed dataflow graph program representations and transformations over those representations. - -# Distributed Training -The PyTorch 1.8 release added a number of new features as well as improvements to reliability and usability. Concretely, support for: [Stable level async error/timeout handling](https://pytorch.org/docs/stable/distributed.html?highlight=init_process_group#torch.distributed.init_process_group) was added to improve NCCL reliability; and stable support for [RPC based profiling](https://pytorch.org/docs/stable/rpc.html). Additionally, we have added support for pipeline parallelism as well as gradient compression through the use of communication hooks in DDP. Details are below: - -### [Beta] Pipeline Parallelism -As machine learning models continue to grow in size, traditional Distributed DataParallel (DDP) training no longer scales as these models don’t fit on a single GPU device. The new pipeline parallelism feature provides an easy to use PyTorch API to leverage pipeline parallelism as part of your training loop. -* [RFC](https://github.com/pytorch/pytorch/issues/44827) -* [Documentation](https://pytorch.org/docs/1.8.0/pipeline.html?highlight=pipeline#) - -### [Beta] DDP Communication Hook -The DDP communication hook is a generic interface to control how to communicate gradients across workers by overriding the vanilla allreduce in DistributedDataParallel. A few built-in communication hooks are provided including PowerSGD, and users can easily apply any of these hooks to optimize communication. Additionally, the communication hook interface can also support user-defined communication strategies for more advanced use cases. -* [RFC](https://github.com/pytorch/pytorch/issues/39272) -* [Documentation](https://pytorch.org/docs/1.8.0/ddp_comm_hooks.html?highlight=powersgd) - -### Additional Prototype Features for Distributed Training -In addition to the major stable and beta distributed training features in this release, we also have a number of prototype features available in our nightlies to try out and provide feedback. We have linked in the draft docs below for reference: -* **(Prototype) ZeroRedundancyOptimizer** - Based on and in partnership with the Microsoft DeepSpeed team, this feature helps reduce per-process memory footprint by sharding optimizer states across all participating processes in the ```ProcessGroup``` gang. Refer to this [documentation](https://pytorch.org/docs/master/distributed.optim.html#torch.distributed.optim.ZeroRedundancyOptimizer) for more details. -* **(Prototype) Process Group NCCL Send/Recv** - The NCCL send/recv API was introduced in v2.7 and this feature adds support for it in NCCL process groups. This feature will provide an option for users to implement collective operations at Python layer instead of C++ layer. Refer to this [documentation](https://pytorch.org/docs/master/distributed.html#distributed-communication-package-torch-distributed) and [code examples](https://github.com/pytorch/pytorch/blob/master/torch/distributed/distributed_c10d.py#L899) to learn more. 
-* **(Prototype) CUDA-support in RPC using TensorPipe** - This feature should bring consequent speed improvements for users of PyTorch RPC with multiple-GPU machines, as TensorPipe will automatically leverage NVLink when available, and avoid costly copies to and from host memory when exchanging GPU tensors between processes. When not on the same machine, TensorPipe will fall back to copying the tensor to host memory and sending it as a regular CPU tensor. This will also improve the user experience as users will be able to treat GPU tensors like regular CPU tensors in their code. Refer to this [documentation](https://pytorch.org/docs/1.8.0/rpc.html) for more details. -* **(Prototype) Remote Module** - This feature allows users to operate a module on a remote worker like using a local module, where the RPCs are transparent to the user. In the past, this functionality was implemented in an ad-hoc way and overall this feature will improve the usability of model parallelism on PyTorch. Refer to this [documentation](https://pytorch.org/docs/master/rpc.html#remotemodule) for more details. - -# PyTorch Mobile -Support for PyTorch Mobile is expanding with a new set of tutorials to help new users launch models on-device quicker and give existing users a tool to get more out of our framework. These include: -* [Image segmentation DeepLabV3 on iOS](https://pytorch.org/tutorials/beginner/deeplabv3_on_ios.html) -* [Image segmentation DeepLabV3 on Android](https://pytorch.org/tutorials/beginner/deeplabv3_on_android.html) - -Our new demo apps also include examples of image segmentation, object detection, neural machine translation, question answering, and vision transformers. They are available on both iOS and Android: -* [iOS demo app](https://github.com/pytorch/ios-demo-app) -* [Android demo app](https://github.com/pytorch/android-demo-app) - -In addition to performance improvements on CPU for MobileNetV3 and other models, we also revamped our Android GPU backend prototype for broader models coverage and faster inferencing: -* [Android tutorial](https://pytorch.org/tutorials/prototype/vulkan_workflow.html) - -Lastly, we are launching the PyTorch Mobile Lite Interpreter as a prototype feature in this release. The Lite Interpreter allows users to reduce the runtime binary size. Please try these out and send us your feedback on the [PyTorch Forums](https://discuss.pytorch.org/c/mobile/). All our latest updates can be found on the [PyTorch Mobile page](https://pytorch.org/mobile/home/) - -### [Prototype] PyTorch Mobile Lite Interpreter -PyTorch Lite Interpreter is a streamlined version of the PyTorch runtime that can execute PyTorch programs in resource constrained devices, with reduced binary size footprint. This prototype feature reduces binary sizes by up to 70% compared to the current on-device runtime in the current release. -* [iOS/Android Tutorial](https://pytorch.org/tutorials/prototype/lite_interpreter.html) - -# Performance Optimization -In 1.8, we are releasing the support for benchmark utils to enable users to better monitor performance. We are also opening up a new automated quantization API. See the details below: - -### (Beta) Benchmark utils -Benchmark utils allows users to take accurate performance measurements, and provides composable tools to help with both benchmark formulation and post processing. This expected to be helpful for contributors to PyTorch to quickly understand how their contributions are impacting PyTorch performance. 
- -Example: -```python -from torch.utils.benchmark import Timer - -results = [] -for num_threads in [1, 2, 4]: - timer = Timer( - stmt="torch.add(x, y, out=out)", - setup=""" - n = 1024 - x = torch.ones((n, n)) - y = torch.ones((n, 1)) - out = torch.empty((n, n)) - """, - num_threads=num_threads, - ) - results.append(timer.blocked_autorange(min_run_time=5)) - print( - f"{num_threads} thread{'s' if num_threads > 1 else ' ':<4}" - f"{results[-1].median * 1e6:>4.0f} us " + - (f"({results[0].median / results[-1].median:.1f}x)" if num_threads > 1 else '') - ) - -1 thread 376 us -2 threads 189 us (2.0x) -4 threads 99 us (3.8x) -``` -* [Documentation](https://pytorch.org/docs/1.8.0/benchmark_utils.html?highlight=benchmark#) -* [Tutorial](https://pytorch.org/tutorials/recipes/recipes/benchmark.html) - -### (Prototype) FX Graph Mode Quantization - FX Graph Mode Quantization is the new automated quantization API in PyTorch. It improves upon Eager Mode Quantization by adding support for functionals and automating the quantization process, although people might need to refactor the model to make the model compatible with FX Graph Mode Quantization (symbolically traceable with ```torch.fx```). -* [Documentation](https://pytorch.org/docs/master/quantization.html#prototype-fx-graph-mode-quantization) -* Tutorials: - * [(Prototype) FX Graph Mode Post Training Dynamic Quantization](https://pytorch.org/tutorials/prototype/fx_graph_mode_ptq_dynamic.html) - * [(Prototype) FX Graph Mode Post Training Static Qunatization](https://pytorch.org/tutorials/prototype/fx_graph_mode_ptq_static.html) - * [(Prototype) FX Graph Mode Quantization User Guide](https://pytorch.org/tutorials/prototype/fx_graph_mode_quant_guide.html) - -# Hardware Support - -### [Beta] Ability to Extend the PyTorch Dispatcher for a new backend in C++ -In PyTorch 1.8, you can now create new out-of-tree devices that live outside the ```pytorch/pytorch``` repo. The tutorial linked below shows how to register your device and keep it in sync with native PyTorch devices. -* [Tutorial](https://pytorch.org/tutorials/advanced/extend_dispatcher.html) - -### [Beta] AMD GPU Binaries Now Available -Starting in PyTorch 1.8, we have added support for ROCm wheels providing an easy onboarding to using AMD GPUs. You can simply go to the standard [PyTorch installation selector](https://pytorch.org/get-started/locally/) and choose ROCm as an installation option and execute the provided command. - -Thanks for reading, and if you are excited about these updates and want to participate in the future of PyTorch, we encourage you to join the [discussion forums](https://discuss.pytorch.org/) and [open GitHub issues](https://github.com/pytorch/pytorch/issues). - -Cheers! - -***Team PyTorch*** diff --git a/_posts/2021-3-9-ecosystem_day_2021.md b/_posts/2021-3-9-ecosystem_day_2021.md deleted file mode 100644 index e9095f83a8ab..000000000000 --- a/_posts/2021-3-9-ecosystem_day_2021.md +++ /dev/null @@ -1,44 +0,0 @@ ---- -layout: blog_detail -title: 'Announcing PyTorch Ecosystem Day' -author: Team PyTorch ---- - -We’re proud to announce our first PyTorch Ecosystem Day. The virtual, one-day event will focus completely on our Ecosystem and Industry PyTorch communities! - - -PyTorch is a deep learning framework of choice for academics and companies, all thanks to its rich ecosystem of tools and strong community. As with our developers, our ecosystem partners play a pivotal role in the development and growth of the community. - -
        - -We will be hosting our first PyTorch Ecosystem Day, a virtual event designed for our ecosystem and industry communities to showcase their work and discover new opportunities to collaborate. - -PyTorch Ecosystem Day will be held on April 21, with both a morning and evening session, to ensure we reach our global community. Join us virtually for a day filled with discussions on new developments, trends, challenges, and best practices through keynotes, breakout sessions, and a unique networking opportunity hosted through Gather.Town . - -## Event Details -April 21, 2021 (Pacific Time) -Fully digital experience - -* Morning Session: (EMEA) -Opening Talks - 8:00 am-9:00 am PT -Poster Exhibition & Breakout Sessions - 9:00 am-12:00 pm PT - -* Evening Session (APAC/US) -Opening Talks - 3:00 pm-4:00 pm PT -Poster Exhibition & Breakout Sessions - 3:00 pm-6:00 pm PT - -* Networking - 9:00 am-7:00 pm PT - -### There are two ways to participate in PyTorch Ecosystem Day: - -1. **Poster Exhibition** from the PyTorch ecosystem and industry communities covering a variety of topics. Posters are available for viewing throughout the duration of the event. To be part of the poster exhibition, please see below for submission details. If your poster is accepted, we highly recommend tending your poster during one of the morning or evening sessions or both! - -2. **Breakout Sessions** are 40-min sessions freely designed by the community. The breakouts can be talks, demos, tutorials or discussions. Note: you must have an accepted poster to apply for the breakout sessions. - -Call for posters now open! Submit your proposal today! Please send us the **title** and **summary** of your projects, tools, and libraries that could benefit PyTorch researchers in academia and industry, application developers, and ML engineers for consideration. The focus must be on academic papers, machine learning research, or open-source projects. Please no sales pitches. **Deadline for submission is March 18, 2021.** - -Visit pytorchecosystemday.fbreg.com for more information and we look forward to welcoming you to PyTorch Ecosystem Day on April 21st! - - diff --git a/_posts/2021-4-16-ml-models-torchvision-v0.9.md b/_posts/2021-4-16-ml-models-torchvision-v0.9.md deleted file mode 100644 index ff4e22f2c7c6..000000000000 --- a/_posts/2021-4-16-ml-models-torchvision-v0.9.md +++ /dev/null @@ -1,44 +0,0 @@ ---- -layout: blog_detail -title: 'An overview of the ML models introduced in TorchVision v0.9' -author: Team PyTorch ---- - -TorchVision v0.9 has been [released](https://github.com/pytorch/vision/releases) and it is packed with numerous new Machine Learning models and features, speed improvements and bug fixes. In this blog post, we provide a quick overview of the newly introduced ML models and discuss their key features and characteristics. - -### Classification -* **MobileNetV3 Large & Small:** These two classification models are optimized for Mobile use-cases and are used as backbones on other Computer Vision tasks. The implementation of the new [MobileNetV3 architecture](https://github.com/pytorch/vision/blob/master/torchvision/models/mobilenetv3.py) supports the Large & Small variants and the depth multiplier parameter as described in the [original paper](https://arxiv.org/pdf/1905.02244.pdf). We offer pre-trained weights on ImageNet for both Large and Small networks with depth multiplier 1.0 and resolution 224x224. 
Our previous [training recipes](https://github.com/pytorch/vision/tree/master/references/classification#mobilenetv3-large--small) have been updated and can be used to easily train the models from scratch (shoutout to Ross Wightman for inspiring some of our training configuration). The Large variant offers a [competitive accuracy](https://github.com/pytorch/vision/blob/master/docs/source/models.rst#classification) comparing to ResNet50 while being over 6x faster on CPU, meaning that it is a good candidate for applications where speed is important. For applications where speed is critical, one can sacrifice further accuracy for speed and use the Small variant which is 15x faster than ResNet50. - -* **Quantized MobileNetV3 Large:** The quantized version of MobilNetV3 Large reduces the number of parameters by 45% and it is roughly 2.5x faster than the non-quantized version while remaining competitive in [terms of accuracy](https://github.com/pytorch/vision/blob/master/docs/source/models.rst#quantized-models). It was fitted on ImageNet using Quantization Aware Training by iterating on the non-quantized version and it can be trained from scratch using the existing [reference scripts](https://github.com/pytorch/vision/tree/master/references/classification#quantized). - -**Usage:** -``` -model = torchvision.models.mobilenet_v3_large(pretrained=True) -# model = torchvision.models.mobilenet_v3_small(pretrained=True) -# model = torchvision.models.quantization.mobilenet_v3_large(pretrained=True) -model.eval() -predictions = model(img) -``` -### Object Detection -* **Faster R-CNN MobileNetV3-Large FPN:** Combining the MobileNetV3 Large backbone with a Faster R-CNN detector and a Feature Pyramid Network leads to a highly accurate and fast object detector. The pre-trained weights are fitted on COCO 2017 using the provided reference [scripts](https://github.com/pytorch/vision/tree/master/references/detection#faster-r-cnn-mobilenetv3-large-fpn) and the model is 5x faster on CPU than the equivalent ResNet50 detector while remaining competitive in [terms of accuracy](https://github.com/pytorch/vision/blob/master/docs/source/models.rst#object-detection-instance-segmentation-and-person-keypoint-detection). -* **Faster R-CNN MobileNetV3-Large 320 FPN:** This is an iteration of the previous model that uses reduced resolution (min_size=320 pixel) and sacrifices accuracy for speed. It is 25x faster on CPU than the equivalent ResNet50 detector and thus it is good for real mobile use-cases. - -**Usage:** -``` -model = torchvision.models.detection.fasterrcnn_mobilenet_v3_large_fpn(pretrained=True) -# model = torchvision.models.detection.fasterrcnn_mobilenet_v3_large_320_fpn(pretrained=True) -model.eval() -predictions = model(img) -``` -### Semantic Segmentation -* **DeepLabV3 with Dilated MobileNetV3 Large Backbone:** A dilated version of the MobileNetV3 Large backbone combined with DeepLabV3 helps us build a highly accurate and fast semantic segmentation model. The pre-trained weights are fitted on COCO 2017 using our [standard training recipes](https://github.com/pytorch/vision/tree/master/references/segmentation#deeplabv3_mobilenet_v3_large). The final model has the [same accuracy](https://github.com/pytorch/vision/blob/master/docs/source/models.rst#semantic-segmentation) as the FCN ResNet50 but it is 8.5x faster on CPU and thus making it an excellent replacement for the majority of applications. 
-* **Lite R-ASPP with Dilated MobileNetV3 Large Backbone:** We introduce the implementation of a new segmentation head called Lite R-ASPP and combine it with the dilated MobileNetV3 Large backbone to build a very fast segmentation model. The new model sacrifices some accuracy to achieve a 15x speed improvement comparing to the previously most lightweight segmentation model which was the FCN ResNet50. - -**Usage:** -``` -model = torchvision.models.segmentation.deeplabv3_mobilenet_v3_large(pretrained=True) -# model = torchvision.models.segmentation.lraspp_mobilenet_v3_large(pretrained=True) -model.eval() -predictions = model(img) -``` -In the near future we plan to publish an article that covers the details of how the above models were trained and discuss their tradeoffs and design choices. Until then we encourage you to try out the new models and provide your feedback. diff --git a/_posts/2021-5-10-ecosystem-day-2021-recap.md b/_posts/2021-5-10-ecosystem-day-2021-recap.md deleted file mode 100644 index d6cf63f899c8..000000000000 --- a/_posts/2021-5-10-ecosystem-day-2021-recap.md +++ /dev/null @@ -1,31 +0,0 @@ ---- -layout: blog_detail -title: 'PyTorch Ecosystem Day 2021 Recap and New Contributor Resources' -author: Team PyTorch ---- - -Thank you to our incredible community for making the first ever PyTorch Ecosystem Day a success! The day was filled with discussions on new developments, trends and challenges showcased through 71 posters, 32 breakout sessions and 6 keynote speakers. - -
        - -Special thanks to our keynote speakers: Piotr Bialecki, Ritchie Ng, Miquel Farré, Joe Spisak, Geeta Chauhan, and Suraj Subramanian who shared updates from the latest release of PyTorch, exciting work being done with partners, use case example from Disney, the growth and development of the PyTorch community in Asia Pacific, and latest contributor highlights. - -If you missed the opening talks, you rewatch them here: -* [Morning/EMEA Opening Talks](https://www.youtube.com/watch?v=MYE01-XaSZA) -* [Evening/APAC Opening Talks](https://www.youtube.com/watch?v=CjU_6OaYKpw) - -In addition to the talks, we had 71 posters covering various topics such as multimodal, NLP, compiler, distributed training, researcher productivity tools, AI accelerators, and more. From the event, it was clear that an underlying thread that ties all of these different projects together is the cross-collaboration of the PyTorch community. Thank you for continuing to push the state of the art with PyTorch! - -To view the full catalogue of poster, please visit **[PyTorch Ecosystem Day 2021 Event Page](https://pytorch.org/ecosystem/pted/2021)**. - -### New Contributor Resources -Today, we are also sharing new contributor resources that we are trying out to give you the most access to up-to-date news, networking opportunities and more. -* [Contributor Newsletter](https://pytorch.org/newsletter) - Includes curated news including RFCs, feature roadmaps, notable PRs, editorials from developers, and more to support keeping track of everything that’s happening in our community. -* [Contributors Discussion Forum](https://dev-discuss.pytorch.org/) - Designed for contributors to learn and collaborate on the latest development across PyTorch. -* [PyTorch Developer Podcast (Beta)](https://pytorch-dev-podcast.simplecast.com/) - Edward Yang, PyTorch Research Scientist, at Facebook AI shares bite-sized (10 to 20 mins) podcast episodes discussing topics about all sorts of internal development topics in PyTorch. - -Thank you, - -Team PyTorch diff --git a/_posts/2021-5-25-announcing-pytorch-enterprise.md b/_posts/2021-5-25-announcing-pytorch-enterprise.md deleted file mode 100644 index b902bbd0d445..000000000000 --- a/_posts/2021-5-25-announcing-pytorch-enterprise.md +++ /dev/null @@ -1,27 +0,0 @@ ---- -layout: blog_detail -title: 'Announcing the PyTorch Enterprise Support Program' -author: Team PyTorch ---- - -Today, we are excited to announce the PyTorch Enterprise Support Program, a participatory program that enables service providers to develop and offer tailored enterprise-grade support to their customers. This new offering, built in collaboration between Facebook and Microsoft, was created in direct response to feedback from PyTorch enterprise users who are developing models in production at scale for mission-critical applications. - -The PyTorch Enterprise Support Program is available to any service provider. It is designed to mutually benefit all program Participants by sharing and improving PyTorch long-term support (LTS), including contributions of hotfixes and other improvements found while working closely with customers and on their systems. - -To benefit the open source community, all hotfixes developed by Participants will be tested and fed back to the LTS releases of PyTorch regularly through PyTorch’s standard pull request process. To participate in the program, a service provider must apply and meet a set of program terms and certification requirements. 
Once accepted, the service provider becomes a program Participant and can offer a packaged PyTorch Enterprise support service with LTS, prioritized troubleshooting, useful integrations, and more. - -
        - -As one of the founding members and an inaugural member of the PyTorch Enterprise Support Program, Microsoft is launching [PyTorch Enterprise on Microsoft Azure](https://Aka.ms/PyTorchEnterpriseHeroBlog) to deliver a reliable production experience for PyTorch users. Microsoft will support each PyTorch release for as long as it is current. In addition, it will support selected releases for two years, enabling a stable production experience. Microsoft Premier and Unified Support customers can access prioritized troubleshooting for hotfixes, bugs, and security patches at no additional cost. Microsoft will extensively test PyTorch releases for performance regression. The latest release of PyTorch will be integrated with [Azure Machine Learning](https://azure.microsoft.com/en-us/services/machine-learning/) and other PyTorch add-ons including [ONNX Runtime](https://www.onnxruntime.ai/) for faster inference. - -PyTorch Enterprise on Microsoft Azure not only benefits its customers, but also the PyTorch community users. All improvements will be tested and fed back to the future release for PyTorch so everyone in the community can use them. - -As an organization or PyTorch user, the standard way of researching and deploying with different release versions of PyTorch does not change. If your organization is looking for the managed long-term support, prioritized patches, bug fixes, and additional enterprise-grade support, then you should reach out to service providers participating in the program. - -To learn more and participate in the program as a service provider, visit the PyTorch Enterprise Support Program. If you want to learn more about Microsoft’s offering, visit [PyTorch Enterprise on Microsoft Azure](https://Aka.ms/PyTorchEnterpriseHeroBlog). - -Thank you, - -Team PyTorch diff --git a/_posts/2021-5-26-torchvision-mobilenet-v3-implementation.md b/_posts/2021-5-26-torchvision-mobilenet-v3-implementation.md deleted file mode 100644 index 2dfe3bba49c1..000000000000 --- a/_posts/2021-5-26-torchvision-mobilenet-v3-implementation.md +++ /dev/null @@ -1,218 +0,0 @@ ---- -layout: blog_detail -title: 'Everything you need to know about TorchVision’s MobileNetV3 implementation' -author: Vasilis Vryniotis and Francisco Massa ---- - -In TorchVision v0.9, we released a series of [new mobile-friendly models](https://pytorch.org/blog/ml-models-torchvision-v0.9/) that can be used for Classification, Object Detection and Semantic Segmentation. In this article, we will dig deep into the code of the models, share notable implementation details, explain how we configured and trained them, and highlight important tradeoffs we made during their tuning. Our goal is to disclose technical details that typically remain undocumented in the original papers and repos of the models. - -### Network Architecture - -The implementation of the [MobileNetV3 architecture](https://github.com/pytorch/vision/blob/11bf27e37190b320216c349e39b085fb33aefed1/torchvision/models/mobilenetv3.py) follows closely the [original paper](https://arxiv.org/abs/1905.02244). It is customizable and offers different configurations for building Classification, Object Detection and Semantic Segmentation backbones. It was designed to follow a similar structure to MobileNetV2 and the two share [common building blocks](https://github.com/pytorch/vision/blob/cac8a97b0bd14eddeff56f87a890d5cc85776e18/torchvision/models/mobilenetv2.py#L32). 
- -Off-the-shelf, we offer the two variants described on the paper: the [Large](https://github.com/pytorch/vision/blob/11bf27e37190b320216c349e39b085fb33aefed1/torchvision/models/mobilenetv3.py#L196-L214) and the [Small](https://github.com/pytorch/vision/blob/11bf27e37190b320216c349e39b085fb33aefed1/torchvision/models/mobilenetv3.py#L215-L229). Both are constructed using the same code with the only difference being their configuration which describes the number of blocks, their sizes, their activation functions etc. - -### Configuration parameters - -Even though one can write a [custom InvertedResidual setting](https://github.com/pytorch/vision/blob/11bf27e37190b320216c349e39b085fb33aefed1/torchvision/models/mobilenetv3.py#L105) and pass it to the MobileNetV3 class directly, for the majority of applications we can adapt the existing configs by passing parameters to the [model building methods](https://github.com/pytorch/vision/blob/11bf27e37190b320216c349e39b085fb33aefed1/torchvision/models/mobilenetv3.py#L253). Some of the key configuration parameters are the following: - -- The `width_mult` [parameter](https://github.com/pytorch/vision/blob/11bf27e37190b320216c349e39b085fb33aefed1/torchvision/models/mobilenetv3.py#L188) is a multiplier that affects the number of channels of the model. The default value is 1 and by increasing or decreasing it one can change the number of filters of all convolutions, including the ones of the first and last layers. The implementation ensures that the number of filters is always a [multiple of 8](https://github.com/pytorch/vision/blob/11bf27e37190b320216c349e39b085fb33aefed1/torchvision/models/mobilenetv3.py#L56-L57). This is a hardware optimization trick which allows for faster vectorization of operations. - -- The `reduced_tail` [parameter](https://github.com/pytorch/vision/blob/11bf27e37190b320216c349e39b085fb33aefed1/torchvision/models/mobilenetv3.py#L188) halves the number of channels on the [last blocks](https://github.com/pytorch/vision/blob/11bf27e37190b320216c349e39b085fb33aefed1/torchvision/models/mobilenetv3.py#L210-L214) of the network. This version is used by some Object Detection and Semantic Segmentation models. It’s a speed optimization which is described on the [MobileNetV3 paper](https://arxiv.org/abs/1905.02244) and reportedly leads to a 15% latency reduction without a significant negative effect on accuracy. - -- The `dilated` [parameter](https://github.com/pytorch/vision/blob/11bf27e37190b320216c349e39b085fb33aefed1/torchvision/models/mobilenetv3.py#L188) affects the [last 3](https://github.com/pytorch/vision/blob/11bf27e37190b320216c349e39b085fb33aefed1/torchvision/models/mobilenetv3.py#L210-L212) InvertedResidual blocks of the model and turns their normal depthwise Convolutions to Atrous Convolutions. This is used to control the output stride of these blocks and has a [significant positive effect](https://arxiv.org/abs/1706.05587) on the accuracy of Semantic Segmentation models. - -### Implementation details - -Below we provide additional information on some notable implementation details of the architecture. -The [MobileNetV3 class](https://github.com/pytorch/vision/blob/11bf27e37190b320216c349e39b085fb33aefed1/torchvision/models/mobilenetv3.py#L101) is responsible for building a network out of the provided configuration. 
Here are some implementation details of the class: - -- The last convolution block expands the output of the last InvertedResidual block by a [factor of 6](https://github.com/pytorch/vision/blob/11bf27e37190b320216c349e39b085fb33aefed1/torchvision/models/mobilenetv3.py#L149). The implementation is aligned with the Large and Small configurations described on the paper and can adapt to different values of the multiplier parameter. - -- Similarly to other models such as MobileNetV2, a dropout layer is placed just before the final Linear layer of the classifier. - -The [InvertedResidual class](https://github.com/pytorch/vision/blob/11bf27e37190b320216c349e39b085fb33aefed1/torchvision/models/mobilenetv3.py#L60) is the main building block of the network. Here are some notable implementation details of the block along with its visualization which comes from Figure 4 of the paper: - -- There is no [expansion step](https://github.com/pytorch/vision/blob/11bf27e37190b320216c349e39b085fb33aefed1/torchvision/models/mobilenetv3.py#L73-L76) if the input channels and the expanded channels are the same. This happens on the first convolution block of the network. - -- There is always a [projection step](https://github.com/pytorch/vision/blob/11bf27e37190b320216c349e39b085fb33aefed1/torchvision/models/mobilenetv3.py#L86-L88) even when the expanded channels are the same as the output channels. - -- The activation method of the depthwise block is placed [before](https://github.com/pytorch/vision/blob/11bf27e37190b320216c349e39b085fb33aefed1/torchvision/models/mobilenetv3.py#L82-L84) the Squeeze-and-Excite layer as this improves marginally the accuracy. - -
<!-- Figure: InvertedResidual block, as visualized in Figure 4 of the MobileNetV3 paper -->
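To connect these implementation details to the code, here is a hedged sketch of inspecting the resulting blocks; the attribute names (`features`, `classifier`) follow the TorchVision implementation linked above:

```python
import torchvision

model = torchvision.models.mobilenet_v3_large(pretrained=False)

# The configured blocks live in model.features: the stem convolution, the
# InvertedResidual blocks, and the final expansion convolution.
for idx, block in enumerate(model.features):
    print(idx, block.__class__.__name__)

# The classifier places a dropout layer just before the final Linear layer.
print(model.classifier)
```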
        - -### Classification - -In this section we provide benchmarks of the pre-trained models and details on how they were configured, trained and quantized. - -**Benchmarks** - -Here is how to initialize the pre-trained models: -``` -large = torchvision.models.mobilenet_v3_large(pretrained=True, width_mult=1.0, reduced_tail=False, dilated=False) -small = torchvision.models.mobilenet_v3_small(pretrained=True) -quantized = torchvision.models.quantization.mobilenet_v3_large(pretrained=True) -``` - -Below we have the detailed benchmarks between new and selected previous models. As we can see MobileNetV3-Large is a viable replacement of ResNet50 for users who are willing to sacrifice a bit of accuracy for a roughly 6x speed-up: - -| Model | Acc@1 | Acc@5 | Inference on CPU (sec) | # Params (M) | -|-----------------------------|--------:|--------:|------------------------:|--------------:| -| MobileNetV3-Large | 74.042 | 91.340 | 0.0411 | 5.48 | -| MobileNetV3-Small | 67.668 | 87.402 | 0.0165 | 2.54 | -| Quantized MobileNetV3-Large | 73.004 | 90.858 | 0.0162 | 2.96 | -| MobileNetV2 | 71.880 | 90.290 | 0.0608 | 3.50 | -| ResNet50 | 76.150 | 92.870 | 0.2545 | 25.56 | -| ResNet18 | 69.760 | 89.080 | 0.1032 | 11.69 | - -Note that the inference times are measured on CPU. They are not absolute benchmarks, but they allow for relative comparisons between models. - -**Training process** - -All pre-trained models are configured with a width multiplier of 1, have full tails, are non-dilated, and were fitted on ImageNet. Both the Large and Small variants were trained using the same hyper-parameters and scripts which can be found in our [references](https://github.com/pytorch/vision/tree/c2ab0c59f42babf9ad01aa616cd8a901daac86dd/references/classification#mobilenetv3-large--small) folder. Below we provide details on the most notable aspects of the training process. - - **Achieving fast and stable training** - -[Configuring RMSProp](https://github.com/pytorch/vision/blob/c2ab0c59f42babf9ad01aa616cd8a901daac86dd/references/classification/train.py#L172-L173) correctly was crucial to achieve fast training with numerical stability. The authors of the paper used TensorFlow in their experiments and in their runs they reported using [quite high](https://github.com/tensorflow/models/tree/master/research/slim/nets/mobilenet#v3) `rmsprop_epsilon` comparing to the default. Typically this hyper-parameter takes small values as it’s used to avoid zero denominators, but in this specific model choosing the right value seems important to avoid numerical instabilities in the loss. - -Another important detail is that though PyTorch’s and TensorFlow’s RMSProp implementations typically behave similarly, there are [a few differences](https://github.com/pytorch/pytorch/issues/32545) with the most notable in our setup being how the epsilon hyperparameter is handled. More specifically, PyTorch adds the epsilon [outside of the square root calculation](https://github.com/tensorflow/tensorflow/blob/v2.5.0/tensorflow/python/training/rmsprop.py#L25) while TensorFlow [adds it inside](https://github.com/tensorflow/tensorflow/blob/v2.5.0/tensorflow/python/training/rmsprop.py#L25). The result of this implementation detail is that one needs to adjust the epsilon value while porting the hyper parameter of the paper. A reasonable approximation can be taken with the formula `PyTorch_eps = sqrt(TF_eps)`. 
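As a hedged illustration of that adjustment (the TensorFlow epsilon and the other hyper-parameters below are placeholders for the example, not the published recipe):

```python
import math
import torch

model = torch.nn.Linear(10, 10)   # stand-in model for the example

tf_eps = 1e-3                     # epsilon used in a hypothetical TensorFlow RMSProp run
pytorch_eps = math.sqrt(tf_eps)   # PyTorch applies eps outside the square root, so adjust it

optimizer = torch.optim.RMSprop(
    model.parameters(),
    lr=0.064, alpha=0.9, momentum=0.9, weight_decay=1e-5,  # placeholder values
    eps=pytorch_eps)
```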
- -**Increasing our accuracy by tuning hyperparameters & improving our training recipe** - -After configuring the optimizer to achieve fast and stable training, we turned to optimizing the accuracy of the model. There are a few techniques that helped us achieve this. First of all, to avoid overfitting, we augmented our data using the AutoAugment algorithm, followed by RandomErasing. Additionally, we tuned parameters such as the weight decay using cross validation. We also found it beneficial to perform [weight averaging](https://github.com/pytorch/vision/blob/674e8140042c2a3cbb1eb9ebad1fa49501599130/references/classification/utils.py#L259) across different epoch checkpoints after the end of the training. Finally, though not used in our published training recipe, we found that using Label Smoothing, Stochastic Depth and LR noise injection improves the overall accuracy by over 1.5 points. - -The graph and table depict a simplified summary of the most important iterations for improving the accuracy of the MobileNetV3 Large variant. Note that the actual number of iterations done while training the model was significantly larger and that the progress in accuracy was not always monotonically increasing. Also note that the Y-axis of the graph starts at 70% instead of 0% to make the difference between iterations more visible: -
<!-- Figure: Acc@1 of the MobileNetV3-Large variant across training recipe iterations (Y-axis starts at 70%) -->
        - -| Iteration | Acc@1 | Acc@5 | -|-------------------------------------------------|--------:|--------:| -| Baseline with "MobileNetV2-style" Hyperparams | 71.542 | 90.068 | -| + RMSProp with default eps | 70.684 | 89.38 | -| + RMSProp with adjusted eps & LR scheme | 71.764 | 90.178 | -| + Data Augmentation & Tuned Hyperparams | 73.86 | 91.292 | -| + Checkpoint Averaging | 74.028 | 91.382 | -| + Label Smoothing & Stochastic Depth & LR noise | 75.536 | 92.368 | - -Note that once we’ve achieved an acceptable accuracy, we verified the model performance on the hold-out test dataset which hasn't been used before for training or hyper-parameter tuning. This process helps us detect overfitting and is always performed for all pre-trained models prior their release. - -**Quantization** - -We currently offer quantized weights for the QNNPACK backend of the [MobileNetV3-Large variant](https://github.com/pytorch/vision/blob/b94a4014a68d08f37697f4672729571a46f0042d/torchvision/models/quantization/mobilenetv3.py#L115) which provides a speed-up of 2.5x. To quantize the model, Quantized Aware Training (QAT) was used. The hyper parameters and the scripts used to train the model can be found in our [references](https://github.com/pytorch/vision/tree/c2ab0c59f42babf9ad01aa616cd8a901daac86dd/references/classification#quantized) folder. - -Note that QAT allows us to model the effects of quantization and adjust the weights so that we can improve the model accuracy. This translates to an accuracy increase of 1.8 points comparing to simple post-training quantization: - -| Quantization Status | Acc@1 | Acc@5 | -|----------------------------|--------:|--------:| -| Non-quantized | 74.042 | 91.340 | -| Quantized Aware Training | 73.004 | 90.858 | -| Post-training Quantization | 71.160 | 89.834 | - -### Object Detection - -In this section, we will first provide benchmarks of the released models, and then discuss how the MobileNetV3-Large backbone was used in a Feature Pyramid Network along with the FasterRCNN detector to perform Object Detection. We will also explain how the network was trained and tuned alongside with any tradeoffs we had to make. We will not cover details about how it was used with [SSDlite](https://github.com/pytorch/vision/blob/b94a4014a68d08f37697f4672729571a46f0042d/torchvision/models/detection/ssdlite.py) as this will be discussed on a future article. - -**Benchmarks** - -Here is how the models are initialized: -``` -high_res = torchvision.models.detection.fasterrcnn_mobilenet_v3_large_fpn(pretrained=True) -low_res = torchvision.models.detection.fasterrcnn_mobilenet_v3_large_320_fpn(pretrained=True) -``` - -Below are some benchmarks between new and selected previous models. As we can see the high resolution Faster R-CNN with MobileNetV3-Large FPN backbone seems a viable replacement of the equivalent ResNet50 model for those users who are willing to sacrifice few accuracy points for a 5x speed-up: - -| Model | mAP | Inference on CPU (sec) | # Params (M) | -|--------------------------------------------------|------:|------------------------:|--------------:| -| Faster R-CNN MobileNetV3-Large FPN (High-Res) | 32.8 | 0.8409 | 19.39 | -| Faster R-CNN MobileNetV3-Large 320 FPN (Low-Res) | 22.8 | 0.1679 | 19.39 | -| Faster R-CNN ResNet-50 FPN | 37.0 | 4.1514 | 41.76 | -| RetinaNet ResNet-50 FPN | 36.4 | 4.8825 | 34.01 | - -**Implementation details** - -The Detector uses a FPN-style backbone which extracts features from different convolutions of the MobileNetV3 model. 
[By default](https://github.com/pytorch/vision/blob/eca37cf735064702189ff5d5b1428cbe25ab2bcf/torchvision/models/detection/backbone_utils.py#L165-L166) the pre-trained model uses the output of the 13th InvertedResidual block and the output of the Convolution prior to the pooling layer, but the implementation supports using the outputs of [more stages](https://github.com/pytorch/vision/blob/eca37cf735064702189ff5d5b1428cbe25ab2bcf/torchvision/models/detection/backbone_utils.py#L147-L150).

All feature maps extracted from the network have their output projected down to [256 channels](https://github.com/pytorch/vision/blob/eca37cf735064702189ff5d5b1428cbe25ab2bcf/torchvision/models/detection/backbone_utils.py#L160) by the FPN block, as this greatly improves the speed of the network. These feature maps provided by the FPN backbone are used by the FasterRCNN detector to provide box and class predictions at [different scales](https://github.com/pytorch/vision/blob/7af30ee9ab64039d04150d118e8b72473184fd6e/torchvision/models/detection/faster_rcnn.py#L382-L389).

**Training & Tuning process**

We currently offer two pre-trained models capable of doing object detection at different resolutions. Both models were trained on the COCO dataset using the same hyper-parameters and scripts, which can be found in our [references](https://github.com/pytorch/vision/tree/e35793a1a4000db1f9f99673437c514e24e65451/references/detection#faster-r-cnn-mobilenetv3-large-fpn) folder.

The [High Resolution detector](https://github.com/pytorch/vision/blob/7af30ee9ab64039d04150d118e8b72473184fd6e/torchvision/models/detection/faster_rcnn.py#L398-L399) was trained with images of 800-1333px, while the mobile-friendly [Low Resolution detector](https://github.com/pytorch/vision/blob/7af30ee9ab64039d04150d118e8b72473184fd6e/torchvision/models/detection/faster_rcnn.py#L398-L399) was trained with images of 320-640px. The reason we provide two separate sets of pre-trained weights is that training a detector directly on the smaller images leads to a 5 mAP increase in precision compared to passing small images to the pre-trained high-res model. Both backbones were initialized with weights fitted on ImageNet and the [last 3 stages](https://github.com/pytorch/vision/blob/7af30ee9ab64039d04150d118e8b72473184fd6e/torchvision/models/detection/faster_rcnn.py#L377-L378) of their weights were fine-tuned during the training process.

An additional speed optimization can be applied on the mobile-friendly model by [tuning the RPN NMS thresholds](https://github.com/pytorch/vision/blob/7af30ee9ab64039d04150d118e8b72473184fd6e/torchvision/models/detection/faster_rcnn.py#L423-L424). By sacrificing only 0.2 mAP of precision, we were able to improve the CPU speed of the model by roughly 45%. The details of the optimization can be seen below:

| Tuning Status | mAP | Inference on CPU (sec) |
|---------------|------:|------------------------:|
| Before | 23.0 | 0.2904 |
| After | 22.8 | 0.1679 |

Below we provide some examples of visualizing the predictions of the Faster R-CNN MobileNetV3-Large FPN model:
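For reference, here is a minimal sketch of how predictions like these can be generated and drawn with torchvision utilities; the image path and the 0.5 score threshold below are arbitrary choices for illustration:

```python
import torch
import torchvision
from torchvision.io import read_image
from torchvision.utils import draw_bounding_boxes

img = read_image("example.jpg")  # any RGB image; returns a uint8 CxHxW tensor

model = torchvision.models.detection.fasterrcnn_mobilenet_v3_large_fpn(pretrained=True)
model.eval()

with torch.no_grad():
    # the detection models expect a list of 0-1 float images
    prediction = model([img / 255.0])[0]

keep = prediction["scores"] > 0.5  # arbitrary confidence threshold
visualization = draw_bounding_boxes(img, prediction["boxes"][keep], width=3)
# `visualization` is a uint8 tensor that can be saved or displayed with any image library
```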
### Semantic Segmentation

In this section we will start by providing some benchmarks of the released pre-trained models. Then we will discuss how a MobileNetV3-Large backbone was combined with segmentation heads such as [LR-ASPP](https://arxiv.org/abs/1905.02244), [DeepLabV3](https://arxiv.org/abs/1706.05587) and the [FCN](https://arxiv.org/abs/1411.4038) to conduct Semantic Segmentation. We will also explain how the network was trained and propose a few optional optimization techniques for speed-critical applications.

**Benchmarks**

This is how to initialize the pre-trained models:

```python
import torchvision

lraspp = torchvision.models.segmentation.lraspp_mobilenet_v3_large(pretrained=True)
deeplabv3 = torchvision.models.segmentation.deeplabv3_mobilenet_v3_large(pretrained=True)
```

Below are the detailed benchmarks between new and selected existing models. As we can see, the DeepLabV3 with a MobileNetV3-Large backbone is a viable replacement for FCN with ResNet50 for the majority of applications, as it achieves similar accuracy with an 8.5x speed-up. We also observe that the LR-ASPP network supersedes the equivalent FCN in all metrics:

| Model | mIoU | Global Pixel Acc | Inference on CPU (sec) | # Params (M) |
|--------------------------------------|------:|------------------:|------------------------:|--------------:|
| LR-ASPP MobileNetV3-Large | 57.9 | 91.2 | 0.3278 | 3.22 |
| DeepLabV3 MobileNetV3-Large | 60.3 | 91.2 | 0.5869 | 11.03 |
| FCN MobileNetV3-Large (not released) | 57.8 | 90.9 | 0.3702 | 5.05 |
| DeepLabV3 ResNet50 | 66.4 | 92.4 | 6.3531 | 39.64 |
| FCN ResNet50 | 60.5 | 91.4 | 5.0146 | 32.96 |

### Implementation details

In this section we will discuss important implementation details of the tested segmentation heads. Note that all models described in this section use a dilated MobileNetV3-Large backbone.

**LR-ASPP**

The LR-ASPP is the Lite variant of the Reduced Atrous Spatial Pyramid Pooling model proposed by the authors of the MobileNetV3 paper. Unlike the other segmentation models in TorchVision, it does not make use of an [auxiliary loss](https://github.com/pytorch/vision/blob/b94a4014a68d08f37697f4672729571a46f0042d/torchvision/models/segmentation/segmentation.py#L185-L186). Instead it uses [low and high-level features](https://github.com/pytorch/vision/blob/b94a4014a68d08f37697f4672729571a46f0042d/torchvision/models/segmentation/segmentation.py#L92-L100) with output strides of 8 and 16 respectively.

Unlike the paper, where a 49x49 AveragePooling layer with variable strides is used, [our implementation](https://github.com/pytorch/vision/blob/e2db2eddbb1699a59fbb5ccbec912979048ef3bf/torchvision/models/segmentation/lraspp.py#L53) uses an `AdaptiveAvgPool2d` layer to process the global features. This is because the authors of the paper tailored the head to the Cityscapes dataset, while our focus is to provide a general-purpose implementation that can work on multiple datasets. Finally, our implementation always has a bilinear interpolation [before returning the output](https://github.com/pytorch/vision/blob/e2db2eddbb1699a59fbb5ccbec912979048ef3bf/torchvision/models/segmentation/lraspp.py#L35) to ensure that the sizes of the input and output images match exactly.

**DeepLabV3 & FCN**

The combination of MobileNetV3 with DeepLabV3 and FCN follows closely that of other models, and the stage estimation for these methods is identical to LR-ASPP.
The only notable difference is that instead of using high and low level features, [we attach](https://github.com/pytorch/vision/blob/b94a4014a68d08f37697f4672729571a46f0042d/torchvision/models/segmentation/segmentation.py#L37-L45) the normal loss to the feature map with output stride 16 and an auxiliary loss on the feature map with output stride 8. - -Finally we should note that the FCN version of the model was not released because it was completely superseded by the LR-ASPP both in terms of speed and accuracy. The [pre-trained weights](https://github.com/pytorch/vision/pull/3276/commits/1641d5f4c7d41f534444fab340c598d61a91bd12#diff-ccff7af514d99eeb40416c8b9ec30f032d1a3f450aaa4057958ca39ab174452eL17) are still available and can be used with minimal changes to the code. - -### Training & Tuning process - -We currently offer two MobileNetV3 pre-trained models capable of doing semantic segmentation: the LR-ASPP and the DeepLabV3. The backbones of the models were [initialized with ImageNet weights](https://github.com/pytorch/vision/blob/b94a4014a68d08f37697f4672729571a46f0042d/torchvision/models/segmentation/segmentation.py#L89-L90) and trained end-to-end. Both architectures were trained on the COCO dataset using the same scripts with similar hyper-parameters. Their details can be found in our [references](https://github.com/pytorch/vision/tree/a78d0d83d0a499fe8480d7a9f493676e746c4699/references/segmentation#deeplabv3_mobilenet_v3_large) folder. - -Normally, during inference the images are [resized to 520 pixels](https://github.com/pytorch/vision/blob/a78d0d83d0a499fe8480d7a9f493676e746c4699/references/segmentation/train.py#L30-L33). An optional speed optimization is to construct a Low Res configuration of the model by using the High-Res pre-trained weights and reducing the inference resizing to 320 pixels. This will improve the CPU execution times by roughly 60% while sacrificing a couple of mIoU points. The detailed numbers of this optimization can be found on the table below: - -| Low-Res Configuration | mIoU Difference | Speed Improvement | mIoU | Global Pixel Acc | Inference on CPU (sec) | -|--------------------------------------|-----------------:|-------------------:|------:|------------------:|------------------------:| -| LR-ASPP MobileNetV3-Large| -2.1 | 65.26% | 55.8 | 90.3 | 0.1139 | -| DeepLabV3 MobileNetV3-Large | -3.8 | 63.86% | 56.5 | 90.3 | 0.2121 | -| FCN MobileNetV3-Large (not released) | -3.0 | 57.57% | 54.8 | 90.1 | 0.1571 | - -Here are some examples of visualizing the predictions of the LR-ASPP MobileNetV3-Large model: - -
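As a rough sketch of how such segmentation predictions can be produced (the file name is a placeholder, and the normalization constants are the commonly used ImageNet statistics):

```python
import torch
import torchvision
from torchvision import transforms as T
from torchvision.io import read_image

model = torchvision.models.segmentation.lraspp_mobilenet_v3_large(pretrained=True)
model.eval()

preprocess = T.Compose([
    T.ConvertImageDtype(torch.float),
    T.Resize(520),  # 320 gives the faster low-res configuration discussed above
    T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

img = read_image("example.jpg")  # uint8 CxHxW tensor
with torch.no_grad():
    out = model(preprocess(img).unsqueeze(0))["out"]
classes = out.argmax(1)  # 1xHxW tensor of predicted class indices per pixel
```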
        - -We hope that you found this article interesting. We are looking forward to your feedback to see if this is the type of content you would like us to publish more often. If the community finds that such posts are useful, we will be happy to publish more articles that cover the implementation details of newly introduced Machine Learning models. diff --git a/_posts/2021-6-15-pytorch-1.9-new-library-releases.md b/_posts/2021-6-15-pytorch-1.9-new-library-releases.md deleted file mode 100644 index 6ed505185db8..000000000000 --- a/_posts/2021-6-15-pytorch-1.9-new-library-releases.md +++ /dev/null @@ -1,204 +0,0 @@ ---- -layout: blog_detail -title: 'New PyTorch Library Releases in PyTorch 1.9, including TorchVision, TorchAudio, and more' -author: Team PyTorch ---- - -Today, we are announcing updates to a number of PyTorch libraries, alongside the [PyTorch 1.9 release](https://pytorch.org/blog/pytorch-1.9-released/). The updates include new releases for the domain libraries including TorchVision, TorchText and TorchAudio. These releases, along with the PyTorch 1.9 release, include a number of new features and improvements that will provide a broad set of updates for the PyTorch community. - -Some highlights include: - -* **TorchVision** - Added new SSD and SSDLite models, quantized kernels for object detection, GPU Jpeg decoding, and iOS support. See [release notes](https://github.com/pytorch/vision/releases) here. -* **TorchAudio** - Added wav2vec 2.0 model deployable in non-Python environments (including C++, Android, and iOS). Many performance improvements in lfilter, spectral operations, resampling. Added options for quality control in sampling (i.e. Kaiser window support). Initiated the migration of complex tensors operations. Improved autograd support. See [release notes](https://github.com/pytorch/audio/releases) here. -* **TorchText** - Added a new high-performance Vocab module that provides common functional APIs for NLP workflows. See [release notes](https://github.com/pytorch/text/releases) here. - -We’d like to thank the community for their support and work on this latest release. - -Features in PyTorch releases are classified as Stable, Beta, and Prototype. You can learn more about the definitions in [this blog post](https://pytorch.org/blog/pytorch-feature-classification-changes/). - -# TorchVision 0.10 - -### (Stable) Quantized kernels for object detection -The forward pass of the nms and roi_align operators now support tensors with a quantized dtype, which can help lower the memory footprint of object detection models, particularly on mobile environments. For more details, refer to [the documentation](https://pytorch.org/vision/stable/ops.html#torchvision.ops.roi_align). - -### (Stable) Speed optimizations for Tensor transforms -The resize and flip transforms have been optimized and its runtime improved by up to 5x on the CPU. - -### (Stable) Documentation improvements -Significant improvements were made to the documentation. In particular, a new gallery of examples is available. These examples visually illustrate how each transform acts on an image, and also properly documents and illustrates the output of the segmentation models. - -The example gallery will be extended in the future to provide more comprehensive examples and serve as a reference for common torchvision tasks. For more details, refer to [the documentation](https://pytorch.org/vision/stable/auto_examples/index.html). 
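Relatedly, the optimized resize and flip transforms mentioned above can be applied directly to (batched) image tensors; a small sketch with random data standing in for real images:

```python
import torch
import torchvision.transforms as T

imgs = torch.rand(8, 3, 224, 224)  # a random float batch standing in for real images

tfm = T.Compose([
    T.Resize(256),
    T.RandomHorizontalFlip(p=0.5),
])
out = tfm(imgs)  # still a tensor batch, resized (and possibly flipped) on the CPU
```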
### (Beta) New models for detection
[SSD](https://arxiv.org/abs/1512.02325) and [SSDlite](https://arxiv.org/abs/1801.04381) are two popular object detection architectures that are efficient in terms of speed and provide good results for low resolution pictures. In this release, we provide implementations for the original SSD model with a VGG16 backbone and for its mobile-friendly variant SSDlite with a MobileNetV3-Large backbone.

The models were pre-trained on COCO train2017 and can be used as follows:

```python
import torch
import torchvision

# Original SSD variant
x = [torch.rand(3, 300, 300), torch.rand(3, 500, 400)]
m_detector = torchvision.models.detection.ssd300_vgg16(pretrained=True)
m_detector.eval()
predictions = m_detector(x)

# Mobile-friendly SSDlite variant
x = [torch.rand(3, 320, 320), torch.rand(3, 500, 400)]
m_detector = torchvision.models.detection.ssdlite320_mobilenet_v3_large(pretrained=True)
m_detector.eval()
predictions = m_detector(x)
```

The following accuracies can be obtained on COCO val2017 (full results available in [#3403](https://github.com/pytorch/vision/pull/3403) and [#3757](https://github.com/pytorch/vision/pull/3757)):

{:.table.table-striped.table-bordered}
| Model | mAP | mAP@50 | mAP@75 |
| ------------- | ------------- | ------------- | ------------- |
| SSD300 VGG16 | 25.1 | 41.5 | 26.2 |
| SSDlite320 MobileNetV3-Large | 21.3 | 34.3 | 22.1 |

For more details, refer to [the documentation](https://pytorch.org/vision/stable/models.html#id37).

### (Beta) JPEG decoding on the GPU
Decoding JPEGs is now possible on GPUs with the use of [nvjpeg](https://developer.nvidia.com/nvjpeg), which should be readily available in your CUDA setup. The decoding time of a single image should be about 2 to 3 times faster than with libjpeg on CPU. While the resulting tensor will be stored on the GPU device, the input raw tensor still needs to reside on the host (CPU), because the first stages of the decoding process take place on the host:

```python
from torchvision.io.image import read_file, decode_jpeg

data = read_file('path_to_image.jpg')  # raw data is on CPU
img = decode_jpeg(data, device='cuda')  # decoded image is on GPU
```

For more details, see [the documentation](https://pytorch.org/vision/stable/io.html#torchvision.io.decode_jpeg).

### (Beta) iOS support
TorchVision 0.10 now provides pre-compiled iOS binaries for its C++ operators, which means you can run Faster R-CNN and Mask R-CNN on iOS. An example app showing how to build a program leveraging those ops can be found [here](https://github.com/pytorch/vision/tree/master/ios/VisionTestApp).

# TorchAudio 0.9.0

### (Stable) Complex Tensor Migration
TorchAudio has functions that handle complex-valued tensors. These functions follow a convention to use an extra dimension to represent real and imaginary parts. In PyTorch 1.6, the native complex type was introduced. As its API is getting stable, torchaudio has started to migrate to the native complex type.

In this release, we added support for native complex tensors, and you can opt in to use them. Using the native complex types, we have verified that affected functions continue to support autograd and TorchScript; moreover, switching to native complex types improves their performance. For more details, refer to [pytorch/audio#1337](https://github.com/pytorch/audio/issues/1337).
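As a minimal illustration of the convention change (using core PyTorch ops rather than any specific torchaudio function), converting between the extra-dimension layout and native complex tensors looks roughly like this:

```python
import torch

# "Extra dimension" convention: real and imaginary parts stacked in a trailing dim of size 2
x = torch.randn(4, 3, 2)

z = torch.view_as_complex(x)     # native complex64 tensor of shape (4, 3), shares memory
x_again = torch.view_as_real(z)  # back to the (..., 2) layout

# Native complex tensors work directly with complex math ops
magnitude, phase = z.abs(), z.angle()
```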
- -### (Stable) Filtering Improvement -In release 0.8, we added the C++ implementation of the core part of ```lfilter``` for CPU, which improved the performance. In this release, we optimized some internal operations of the CPU implementation for further performance improvement. We also added autograd support to both CPU and GPU. Now ```lfilter``` and all the ```biquad``` filters (```biquad```, ```band_biquad```, ```bass_biquad```, ```treble_biquad```, ```allpass_biquad```, ```lowpass_biquad```, ```highpass_biquad```, ```bandpass_biquad```, ```equalizer_biquad``` and ```bandrefect_biquad```) benefit from the performance improvement and support autograd. We also moved the implementation of overdrive to C++ for performance improvement. For more details, refer to [the documentation](https://pytorch.org/audio/0.9.0/functional.html#lfilter). - -### (Stable) Improved Autograd Support -Along with the work of Complex Tensor Migration and Filtering Improvement, we also added autograd tests to transforms. `lfilter`, `biquad` and its variants, and most transforms are now guaranteed to support autograd. For more details, refer to [the release note](https://github.com/pytorch/audio/releases). - -### (Stable) Improved Windows Support -Torchaudio implements some operations in C++ for reasons such as performance and integration with third-party libraries. These C++ components were only available on Linux and macOS. In this release, we have added support to Windows. With this, the efficient filtering implementation mentioned above is also available on Windows. - -However, please note that not all the C++ components are available for Windows. “sox_io” backend and ```torchaudio.functional.compute_kaldi_pitch``` are not supported. - -### (Stable) I/O Functions Migration -Since the 0.6 release, we have continuously improved I/O functionality. Specifically, in 0.8 we changed the default backend from “sox” to “sox_io” and applied the same switch to API of the “soundfile” backend. The 0.9 release concludes this migration by removing the deprecated backends. For more details, please refer to [#903](https://github.com/pytorch/audio/issues/903). - -### (Beta) Wav2Vec2.0 Model -We have added the model architectures from [Wav2Vec2.0](https://arxiv.org/abs/2006.11477). You can import fine-tuned models parameters published on [fairseq](https://github.com/pytorch/fairseq/tree/master/examples/wav2vec) and [Hugging Face Hub](https://huggingface.co/models?filter=wav2vec2). Our model definition supports TorchScript, and it is possible to deploy the model to non-Python environments, such as C++, [Android](https://github.com/pytorch/android-demo-app/tree/master/SpeechRecognition) and [iOS](https://github.com/pytorch/ios-demo-app/tree/master/SpeechRecognition). - -The following code snippet illustrates such a use case. Please check out our [c++ example directory](https://github.com/pytorch/audio/tree/master/examples/libtorchaudio) for the complete example. Currently, it is designed for running inference. If you would like more support for training, please file a feature request. 
```python
# Import fine-tuned model from Hugging Face Hub
from transformers import Wav2Vec2ForCTC
from torchaudio.models.wav2vec2.utils import import_huggingface_model

original = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")
imported = import_huggingface_model(original)
```

```python
# Import fine-tuned model from fairseq
import fairseq
from torchaudio.models.wav2vec2.utils import import_fairseq_model

original, _, _ = fairseq.checkpoint_utils.load_model_ensemble_and_task(
    ["wav2vec_small_960h.pt"], arg_overrides={'data': ""})
imported = import_fairseq_model(original[0].w2v_encoder)
```

```python
# Build uninitialized model and load state dict
import torch
from torch.utils.mobile_optimizer import optimize_for_mobile
from torchaudio.models import wav2vec2_base

model = wav2vec2_base(num_out=32)
model.load_state_dict(imported.state_dict())

# Quantize / script / optimize for mobile
quantized_model = torch.quantization.quantize_dynamic(
    model, qconfig_spec={torch.nn.Linear}, dtype=torch.qint8)
scripted_model = torch.jit.script(quantized_model)
optimized_model = optimize_for_mobile(scripted_model)
optimized_model.save("model_for_deployment.pt")
```

For more details, see [the documentation](https://pytorch.org/audio/0.9.0/models.html#wav2vec2-0).

### (Beta) Resampling Improvement
In release 0.8, we vectorized the operation in ```torchaudio.compliance.kaldi.resample_waveform```, which improved the performance of ```resample_waveform``` and ```torchaudio.transforms.Resample```. In this release, we have further revised the way the resampling algorithm is implemented.

We have:
* Added Kaiser window support for a wider range of resampling quality.
* Added the ```rolloff``` parameter for anti-aliasing control.
* Added a mechanism to precompute the kernel and cache it in ```torchaudio.transforms.Resample``` for even faster operation.
* Moved the implementation from ```torchaudio.compliance.kaldi.resample_waveform``` to ```torchaudio.functional.resample``` and deprecated ```torchaudio.compliance.kaldi.resample_waveform```.

For more details, see [the documentation](https://pytorch.org/audio/0.9.0/transforms.html#resample).

### (Prototype) RNN Transducer Loss
The RNN transducer loss is used in training RNN transducer models, a popular architecture for speech recognition tasks. The prototype loss in torchaudio currently supports autograd, TorchScript, float16 and float32, and can also be run on both CPU and CUDA. For more details, please refer to [the documentation](https://pytorch.org/audio/stable/index.html).

# TorchText 0.10.0

### (Beta) New Vocab Module
In this release, we introduce a new Vocab module that replaces the current Vocab class. The new Vocab provides common functional APIs for NLP workflows. This module is backed by an efficient C++ implementation that reduces batch look-up time by up to ~85% (refer to the summary of [#1248](https://github.com/pytorch/text/pull/1248) and [#1290](https://github.com/pytorch/text/pull/1290) for further information on benchmarks), and provides support for TorchScript. We provide accompanying factory functions that can be used to build the Vocab object either through a Python ordered dictionary or an iterator that yields lists of tokens.
- -```python -#creating Vocab from text file -import io -from torchtext.vocab import build_vocab_from_iterator -#generator that yield list of tokens -def yield_tokens(file_path): - with io.open(file_path, encoding = 'utf-8') as f: - for line in f: - yield line.strip().split() -#get Vocab object -vocab_obj = build_vocab_from_iterator(yield_tokens(file_path), specials=[""]) - -#creating Vocab through ordered dict -from torchtext.vocab import vocab -from collections import Counter, OrderedDict -counter = Counter(["a", "a", "b", "b", "b"]) -sorted_by_freq_tuples = sorted(counter.items(), key=lambda x: x[1], reverse=True) -ordered_dict = OrderedDict(sorted_by_freq_tuples) -vocab_obj = vocab(ordered_dict) - -#common API usage - -#look-up index -vocab_obj["a"] - -#batch look-up indices -vocab_obj.looup_indices(["a","b"]) -#support forward API of PyTorch nn Modules -vocab_obj(["a","b"]) - -#batch look-up tokens -vocab_obj.lookup_tokens([0,1]) - -#set default index to return when token not found -vocab_obj.set_default_index(0) -vocab_obj["out_of_vocabulary"] #prints 0 -``` - -For more details, refer to [the documentation](https://pytorch.org/text/stable/vocab.html). - -Thanks for reading. If you’re interested in these updates and want to join the PyTorch community, we encourage you to join [the discussion](https://discuss.pytorch.org/) forums and [open GitHub issues](https://github.com/pytorch/pytorch/issues). To get the latest news from PyTorch, follow us on [Facebook](https://www.facebook.com/pytorch/), [Twitter](https://twitter.com/PyTorch), [Medium](https://medium.com/pytorch), [YouTube](https://www.youtube.com/pytorch) or [LinkedIn](https://www.linkedin.com/company/pytorch). - -Cheers! - --Team PyTorch diff --git a/_posts/2021-6-15-pytorch-1.9-released.md b/_posts/2021-6-15-pytorch-1.9-released.md deleted file mode 100644 index 9394a036efca..000000000000 --- a/_posts/2021-6-15-pytorch-1.9-released.md +++ /dev/null @@ -1,177 +0,0 @@ ---- -layout: blog_detail -title: 'PyTorch 1.9 Release, including torch.linalg and Mobile Interpreter' -author: Team PyTorch ---- - -We are excited to announce the release of PyTorch 1.9. The release is composed of more than 3,400 commits since 1.8, made by 398 contributors. The release notes are available [here](https://github.com/pytorch/pytorch/releases). Highlights include: -1. Major improvements to support scientific computing, including *torch.linalg*, *torch.special*, and Complex Autograd -2. Major improvements in on-device binary size with Mobile Interpreter -3. Native support for elastic-fault tolerance training through the upstreaming of TorchElastic into PyTorch Core -4. Major updates to the PyTorch RPC framework to support large scale distributed training with GPU support -5. New APIs to optimize performance and packaging for model inference deployment -6. Support for Distributed training, GPU utilization and SM efficiency in the PyTorch Profiler - -Along with 1.9, we are also releasing major updates to the PyTorch libraries, which you can read about in [this blog post](https://pytorch.org/blog/pytorch-1.9-new-library-releases/). - -We’d like to thank the community for their support and work on this latest release. We’d especially like to thank Quansight and Microsoft for their contributions. - -Features in PyTorch releases are classified as Stable, Beta, and Prototype. You can learn more about the definitions in [this blog post](https://pytorch.org/blog/pytorch-feature-classification-changes/). 
- -# Frontend APIs - -### (Stable) *torch.linalg* - -In 1.9, the *torch.linalg* module is moving to a stable release. Linear algebra is essential to deep learning and scientific computing, and the *torch.linalg* module extends PyTorch’s support for it with implementations of every function from [NumPy’s linear algebra module](https://numpy.org/doc/stable/reference/routines.linalg.html) (now with support for accelerators and autograd) and more, like [*torch.linalg.matrix_norm*](https://pytorch.org/docs/1.9.0/generated/torch.linalg.matrix_norm.html?highlight=matrix_norm#torch.linalg.matrix_norm) and [*torch.linalg.householder_product*](https://pytorch.org/docs/1.9.0/generated/torch.linalg.householder_product.html?highlight=householder_product#torch.linalg.householder_product). This makes the module immediately familiar to users who have worked with NumPy. Refer to [the documentation](https://pytorch.org/docs/1.9.0/linalg.html?highlight=linalg#module-torch.linalg) here. - -We plan to publish another blog post with more details on the *torch.linalg* module next week! - -### (Stable) Complex Autograd - -The Complex Autograd feature, released as a beta in PyTorch 1.8, is now stable. Since the beta release, we have extended support for Complex Autograd for over 98% operators in PyTorch 1.9, improved testing for complex operators by adding more OpInfos, and added greater validation through TorchAudio migration to native complex tensors (refer to [this issue](https://github.com/pytorch/audio/issues/1337)). - -This feature provides users the functionality to calculate complex gradients and optimize real valued loss functions with complex variables. This is a required feature for multiple current and downstream prospective users of complex numbers in PyTorch like TorchAudio, ESPNet, Asteroid, and FastMRI. Refer to [the documentation](https://pytorch.org/docs/1.9.0/notes/autograd.html#autograd-for-complex-numbers) for more details. - -### (Stable) torch.use_deterministic_algorithms() - -To help with debugging and writing reproducible programs, PyTorch 1.9 includes a *torch.use_determinstic_algorithms* option. When this setting is enabled, operations will behave deterministically, if possible, or throw a runtime error if they might behave nondeterministically. Here are a couple examples: - -```python ->>> a = torch.randn(100, 100, 100, device='cuda').to_sparse() ->>> b = torch.randn(100, 100, 100, device='cuda') - -# Sparse-dense CUDA bmm is usually nondeterministic ->>> torch.bmm(a, b).eq(torch.bmm(a, b)).all().item() -False - ->>> torch.use_deterministic_algorithms(True) - -# Now torch.bmm gives the same result each time, but with reduced performance ->>> torch.bmm(a, b).eq(torch.bmm(a, b)).all().item() -True - -# CUDA kthvalue has no deterministic algorithm, so it throws a runtime error ->>> torch.zeros(10000, device='cuda').kthvalue(1) -RuntimeError: kthvalue CUDA does not have a deterministic implementation... -``` - -PyTorch 1.9 adds deterministic implementations for a number of indexing operations, too, including *index_add*, *index_copy*, and *index_put with accum=False*. For more details, refer to the [documentation](https://pytorch.org/docs/1.9.0/generated/torch.use_deterministic_algorithms.html?highlight=use_deterministic#torch.use_deterministic_algorithms) and [reproducibility note](https://pytorch.org/docs/1.9.0/notes/randomness.html?highlight=reproducibility). 
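As a small sketch of the deterministic indexing operations mentioned above (assuming a CUDA device is available; drop the `device` arguments to run the same snippet on CPU):

```python
import torch

torch.use_deterministic_algorithms(True)

out = torch.zeros(5, device="cuda")
index = torch.tensor([0, 1, 1, 3], device="cuda")
src = torch.ones(4, device="cuda")

# index_add_ now has a deterministic implementation,
# so repeated runs produce identical results
out.index_add_(0, index, src)
```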
- -### (Beta) *torch.special* - -A *torch.special* module, analogous to [SciPy’s special module](https://docs.scipy.org/doc/scipy/reference/special.html), is now available in beta. This module contains many functions useful for scientific computing and working with distributions such as *iv*, *ive*, *erfcx*, *logerfc*, and *logerfcx*. Refer to [the documentation](https://pytorch.org/docs/master/special.html) for more details. - -### (Beta) nn.Module parameterization - -```nn.Module``` parameterization allows users to parametrize any parameter or buffer of an ```nn.Module``` without modifying the ```nn.Module``` itself. It allows you to constrain the space in which your parameters live without the need for special optimization methods. - -This also contains a new implementation of the ```spectral_norm``` parametrization for PyTorch 1.9. More parametrization will be added to this feature (weight_norm, matrix constraints and part of pruning) for the feature to become stable in 1.10. For more details, refer to the [documentation](https://pytorch.org/docs/1.9.0/generated/torch.nn.utils.parametrizations.spectral_norm.html?highlight=parametrize) and [tutorial](https://pytorch.org/tutorials/intermediate/parametrizations.html). - -# PyTorch Mobile - -### (Beta) Mobile Interpreter - -We are releasing Mobile Interpreter, a streamlined version of the PyTorch runtime, in beta. The Interpreter will execute PyTorch programs in edge devices, with reduced binary size footprint. - -Mobile Interpreter is one of the top requested features for PyTorch Mobile. This new release will significantly reduce binary size compared with the current on-device runtime. In order for you to get the binary size improvements with our interpreter (which can reduce the binary size up to ~75% for a typical application) follow these instructions. As an example, using Mobile Interpreter, we can reach 2.6 MB compressed with MobileNetV2 in arm64-v7a Android. With this latest release we are making it much simpler to integrate the interpreter by providing pre-built libraries for iOS and Android. - -### TorchVision Library - -Starting from 1.9, users can use the TorchVision library on their iOS/Android apps. The Torchvision library contains the C++ TorchVision ops and needs to be linked together with the main PyTorch library for iOS, for Android it can be added as a gradle dependency. This allows using TorchVision prebuilt MaskRCNN operators for object detections and segmentation. To learn more about the library, please refer to our tutorials and [demo apps](https://github.com/pytorch/android-demo-app/tree/master/D2Go). - -### Demo apps - -We are releasing a new video app based on [PyTorch Video](https://pytorchvideo.org/) library and an updated speech recognition app based on the latest torchaudio, wave2vec model. Both are available on [iOS](https://github.com/pytorch/ios-demo-app) and [Android](https://github.com/pytorch/android-demo-app). In addition, we have updated the seven Computer Vision and three Natural Language Processing demo apps, including the HuggingFace DistilBERT, and the DeiT vision transformer models, with PyTorch Mobile v1.9. With the addition of these two apps, we now offer a full suite of demo apps covering image, text, audio, and video. To get started check out our [iOS demo apps](https://github.com/pytorch/ios-demo-app) and [Android demo apps](https://github.com/pytorch/android-demo-app). - -
        - -# Distributed Training - -### (Beta) TorchElastic is now part of core - -[TorchElastic](https://github.com/pytorch/pytorch/issues/50621), which was open sourced over a year ago in the [pytorch/elastic](https://github.com/pytorch/elastic) github repository, is a runner and coordinator for PyTorch worker processes. Since then, it has been adopted by various distributed torch use-cases: 1) [deepspeech.pytorch](https://medium.com/pytorch/training-deepspeech-using-torchelastic-ad013539682) 2) pytorch-lightning 3) [Kubernetes CRD](https://github.com/pytorch/elastic/blob/master/kubernetes/README.md). Now, it is part of PyTorch core. - -As its name suggests, the core function of TorcheElastic is to gracefully handle scaling events. A notable corollary of elasticity is that peer discovery and rank assignment are built into TorchElastic enabling users to run distributed training on preemptible instances without requiring a gang scheduler. As a side note, [etcd](https://etcd.io/) used to be a hard dependency of TorchElastic. With the upstream, this is no longer the case since we have added a “standalone” rendezvous based on c10d::Store. For more details, refer to the [documentation](https://pytorch.org/docs/1.9.0/distributed.elastic.html). - -### (Beta) Distributed Training Updates - -In addition to TorchElastic, there are a number of beta features available in the distributed package: - -* **(Beta) CUDA support is available in RPC**: Compared to CPU RPC and general-purpose RPC frameworks, CUDA RPC is a much more efficient way for P2P Tensor communication. It is built on top of TensorPipe which can automatically choose a communication channel for each Tensor based on Tensor device type and channel availability on both the caller and the callee. Existing TensorPipe channels cover NVLink, InfiniBand, SHM, CMA, TCP, etc. See [this recipe](https://pytorch.org/tutorials/recipes/cuda_rpc.html) for how CUDA RPC helps to attain 34x speedup compared to CPU RPC. - -* **(Beta) ZeroRedundancyOptimizer**: ZeroRedundancyOptimizer can be used in conjunction with DistributedDataParallel to reduce the size of per-process optimizer states. The idea of ZeroRedundancyOptimizer comes from [DeepSpeed/ZeRO project](https://github.com/microsoft/DeepSpeed) and [Marian](https://github.com/marian-nmt/marian-dev), where the optimizer in each process owns a shard of model parameters and their corresponding optimizer states. When running `step()`, each optimizer only updates its own parameters, and then uses collective communication to synchronize updated parameters across all processes. Refer to [this documentation](https://pytorch.org/docs/master/distributed.optim.html) and this [tutorial](https://pytorch.org/tutorials/recipes/zero_redundancy_optimizer.html) to learn more. - -* **(Beta) Support for profiling distributed collectives**: PyTorch’s profiler tools, *torch.profiler* and *torch.autograd.profiler*, are able to profile distributed collectives and point to point communication primitives including allreduce, alltoall, allgather, send/recv, etc. This is enabled for all backends supported natively by PyTorch: gloo, mpi, and nccl. This can be used to debug performance issues, analyze traces that contain distributed communication, and gain insight into performance of applications that use distributed training. To learn more, refer to [this documentation](https://pytorch.org/docs/1.9.0/distributed.html#profiling-collective-communication). 
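As a brief sketch of the ZeroRedundancyOptimizer usage described above, assuming the process group has already been initialized and `local_rank` identifies this worker's GPU (the model and learning rate are placeholders):

```python
import torch
from torch.distributed.optim import ZeroRedundancyOptimizer
from torch.nn.parallel import DistributedDataParallel as DDP

def build_model_and_optimizer(model: torch.nn.Module, local_rank: int):
    ddp_model = DDP(model.cuda(local_rank), device_ids=[local_rank])
    optimizer = ZeroRedundancyOptimizer(
        ddp_model.parameters(),
        optimizer_class=torch.optim.Adam,  # each rank stores only its shard of the Adam state
        lr=1e-3,
    )
    return ddp_model, optimizer
```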
- -# Performance Optimization and Tooling - -### (Stable) Freezing API - -Module Freezing is the process of inlining module parameters and attributes values as constants into the TorchScript internal representation. This allows further optimization and specialization of your program, both for TorchScript optimizations and lowering to other backends. It is used by [optimize_for_mobile API](https://github.com/pytorch/pytorch/blob/master/torch/utils/mobile_optimizer.py), ONNX, and others. - -Freezing is recommended for model deployment. It helps TorchScript JIT optimizations optimize away overhead and bookkeeping that is necessary for training, tuning, or debugging PyTorch models. It enables graph fusions that are not semantically valid on non-frozen graphs - such as fusing Conv-BN. For more details, refer to the [documentation](https://pytorch.org/docs/1.9.0/generated/torch.jit.freeze.html). - -### (Beta) PyTorch Profiler - -
        - -The new PyTorch Profiler graduates to beta and leverages [Kineto](https://github.com/pytorch/kineto/) for GPU profiling, TensorBoard for visualization and is now the standard across our tutorials and documentation. - -PyTorch 1.9 extends support for the new *torch.profiler* API to more builds, including Windows and Mac and is recommended in most cases instead of the previous *torch.autograd.profiler* API. The new API supports existing profiler features, integrates with CUPTI library (Linux-only) to trace on-device CUDA kernels and provides support for long-running jobs, e.g.: - -```python -def trace_handler(p): - output = p.key_averages().table(sort_by="self_cuda_time_total", row_limit=10) - print(output) - p.export_chrome_trace("/tmp/trace_" + str(p.step_num) + ".json") - -with profile( - activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA], - # schedule argument specifies the iterations on which the profiler is active - schedule=torch.profiler.schedule( - wait=1, - warmup=1, - active=2), - # on_trace_ready argument specifies the handler for the traces - on_trace_ready=trace_handler -) as p: - for idx in range(8): - model(inputs) - # profiler will trace iterations 2 and 3, and then 6 and 7 (counting from zero) - p.step() -``` - -More usage examples can be found on the [profiler recipe page](https://pytorch.org/tutorials/recipes/recipes/profiler_recipe.html). - -The PyTorch Profiler Tensorboard plugin has new features for: -* Distributed Training summary view with communications overview for NCCL -* GPU Utilization and SM Efficiency in Trace view and GPU operators view -* Memory Profiling view -* Jump to source when launched from Microsoft VSCode -* Ability for load traces from cloud object storage systems - -### (Beta) Inference Mode API - -Inference Mode API allows significant speed-up for inference workloads while remaining safe and ensuring no incorrect gradients can ever be computed. It offers the best possible performance when no autograd is required. For more details, refer to [the documentation for inference mode itself](https://pytorch.org/docs/1.9.0/generated/torch.inference_mode.html?highlight=inference%20mode#torch.inference_mode) and [the documentation explaining when to use it and the difference with no_grad mode](https://pytorch.org/docs/1.9.0/notes/autograd.html#locally-disabling-gradient-computation). - -### (Beta) *torch.package* - -*torch.package* is a new way to package PyTorch models in a self-contained, stable format. A package will include both the model’s data (e.g. parameters, buffers) and its code (model architecture). Packaging a model with its full set of Python dependencies, combined with a description of a conda environment with pinned versions, can be used to easily reproduce training. Representing a model in a self-contained artifact will also allow it to be published and transferred throughout a production ML pipeline while retaining the flexibility of a pure-Python representation. For more details, refer to [the documentation](https://pytorch.org/docs/1.9.0/package.html). - -### (Prototype) prepare_for_inference - -prepare_for_inference is a new prototype feature that takes in a module and performs graph-level optimizations to improve inference performance, depending on the device. It is meant to be a PyTorch-native option that requires minimal changes to user’s workflows. 
For more details, see [the documentation](https://github.com/pytorch/pytorch/blob/master/torch/jit/_freeze.py#L168) for the Torchscript version [here](https://github.com/pytorch/pytorch/blob/master/torch/jit/_freeze.py#L168) or the FX version [here](https://github.com/pytorch/pytorch/blob/master/torch/fx/experimental/optimization.py#L234). - -### (Prototype) Profile-directed typing in TorchScript - -TorchScript has a hard requirement for source code to have type annotations in order for compilation to be successful. For a long time, it was only possible to add missing or incorrect type annotations through trial and error (i.e., by fixing the type-checking errors generated by *torch.jit.script* one by one), which was inefficient and time consuming. Now, we have enabled profile directed typing for *torch.jit.script* by leveraging existing tools like MonkeyType, which makes the process much easier, faster, and more efficient. For more details, refer to [the documentation](https://pytorch.org/docs/1.9.0/jit.html). - -Thanks for reading. If you’re interested in these updates and want to join the PyTorch community, we encourage you to join the [discussion forums](https://discuss.pytorch.org/) and [open GitHub issues](https://github.com/pytorch/pytorch/issues). To get the latest news from PyTorch, follow us on [Facebook](https://www.facebook.com/pytorch/), [Twitter](https://twitter.com/PyTorch), [Medium](https://medium.com/pytorch), [YouTube](https://www.youtube.com/pytorch), or [LinkedIn](https://www.linkedin.com/company/pytorch). - -Cheers! - -Team PyTorch diff --git a/_posts/2021-6-16-torchvision-ssd-implementation.md b/_posts/2021-6-16-torchvision-ssd-implementation.md deleted file mode 100644 index 3f55188b4847..000000000000 --- a/_posts/2021-6-16-torchvision-ssd-implementation.md +++ /dev/null @@ -1,125 +0,0 @@ ---- -layout: blog_detail -title: 'Everything You Need To Know About Torchvision’s SSD Implementation' -author: Vasilis Vryniotis ---- - -In TorchVision v0.10, we’ve released two new Object Detection models based on the SSD architecture. Our plan is to cover the key implementation details of the algorithms along with information on how they were trained in a two-part article. - -In part 1 of the series, we will focus on the original implementation of the SSD algorithm as described on the [Single Shot MultiBox Detector paper](https://arxiv.org/abs/1512.02325). We will briefly give a high-level description of how the algorithm works, then go through its main components, highlight key parts of its code, and finally discuss how we trained the released model. Our goal is to cover all the necessary details to reproduce the model including those optimizations which are not covered on the paper but are part on the [original implementation](https://github.com/weiliu89/caffe/tree/ssd). - -# How Does SSD Work? - -Reading the aforementioned paper is highly recommended but here is a quick oversimplified refresher. Our target is to detect the locations of objects in an image along with their categories. Here is the Figure 5 from the [SSD paper](https://arxiv.org/abs/1512.02325) with prediction examples of the model: - -
        - -The SSD algorithm uses a CNN backbone, passes the input image through it and takes the convolutional outputs from different levels of the network. The list of these outputs are called feature maps. These feature maps are then passed through the Classification and Regression heads which are responsible for predicting the class and the location of the boxes. - -Since the feature maps of each image contain outputs from different levels of the network, their size varies and thus they can capture objects of different dimensions. On top of each, we tile several default boxes which can be thought as our rough prior guesses. For each default box, we predict whether there is an object (along with its class) and its offset (correction over the original location). During training time, we need to first match the ground truth to the default boxes and then we use those matches to estimate our loss. During inference, similar prediction boxes are combined to estimate the final predictions. - -# The SSD Network Architecture - -In this section, we will discuss the key components of SSD. Our code follows closely [the paper](https://arxiv.org/abs/1512.02325) and makes use of many of the undocumented optimizations included in [the official implementation](https://github.com/weiliu89/caffe/tree/ssd). - -### DefaultBoxGenerator - -The [DefaultBoxGenerator class](https://github.com/pytorch/vision/blob/33db2b3ebfdd2f73a9228f430fa7dd91c3b18078/torchvision/models/detection/anchor_utils.py#L134) is responsible for generating the default boxes of SSD and operates similarly to the [AnchorGenerator](https://github.com/pytorch/vision/blob/33db2b3ebfdd2f73a9228f430fa7dd91c3b18078/torchvision/models/detection/anchor_utils.py#L9) of FasterRCNN (for more info on their differences see pages 4-6 of the paper). It produces a set of predefined boxes of specific width and height which are tiled across the image and serve as the first rough prior guesses of where objects might be located. Here is Figure 1 from the SSD paper with a visualization of ground truths and default boxes: - -
        - -The class is parameterized by a set of hyperparameters that control [their shape](https://github.com/pytorch/vision/blob/33db2b3ebfdd2f73a9228f430fa7dd91c3b18078/torchvision/models/detection/anchor_utils.py#L139) and [tiling](https://github.com/pytorch/vision/blob/33db2b3ebfdd2f73a9228f430fa7dd91c3b18078/torchvision/models/detection/anchor_utils.py#L140-L149). The implementation will provide [automatically good guesses](https://github.com/pytorch/vision/blob/33db2b3ebfdd2f73a9228f430fa7dd91c3b18078/torchvision/models/detection/anchor_utils.py#L162-L171) with the default parameters for those who want to experiment with new backbones/datasets but one can also pass [optimized custom values](https://github.com/pytorch/vision/blob/33db2b3ebfdd2f73a9228f430fa7dd91c3b18078/torchvision/models/detection/anchor_utils.py#L144-L147). - -### SSDMatcher - -The [SSDMatcher class](https://github.com/pytorch/vision/blob/33db2b3ebfdd2f73a9228f430fa7dd91c3b18078/torchvision/models/detection/_utils.py#L348) extends the standard [Matcher](https://github.com/pytorch/vision/blob/33db2b3ebfdd2f73a9228f430fa7dd91c3b18078/torchvision/models/detection/_utils.py#L227) used by FasterRCNN and it is responsible for matching the default boxes to the ground truth. After estimating the [IoUs of all combinations](https://github.com/pytorch/vision/blob/33db2b3ebfdd2f73a9228f430fa7dd91c3b18078/torchvision/models/detection/ssd.py#L349), we use the matcher to find for each default box the best [candidate](https://github.com/pytorch/vision/blob/33db2b3ebfdd2f73a9228f430fa7dd91c3b18078/torchvision/models/detection/_utils.py#L296) ground truth with overlap higher than the [IoU threshold](https://github.com/pytorch/vision/blob/33db2b3ebfdd2f73a9228f430fa7dd91c3b18078/torchvision/models/detection/_utils.py#L350-L351). The SSD version of the matcher has an extra step to ensure that each ground truth is matched with the default box that has the [highest overlap](https://github.com/pytorch/vision/blob/33db2b3ebfdd2f73a9228f430fa7dd91c3b18078/torchvision/models/detection/_utils.py#L356-L360). The results of the matcher are used in the loss estimation during the training process of the model. - -### Classification and Regression Heads - -The [SSDHead class](https://github.com/pytorch/vision/blob/33db2b3ebfdd2f73a9228f430fa7dd91c3b18078/torchvision/models/detection/ssd.py#L38) is responsible for initializing the Classification and Regression parts of the network. Here are a few notable details about their code: - -* Both the [Classification](https://github.com/pytorch/vision/blob/33db2b3ebfdd2f73a9228f430fa7dd91c3b18078/torchvision/models/detection/ssd.py#L90) and the [Regression](https://github.com/pytorch/vision/blob/33db2b3ebfdd2f73a9228f430fa7dd91c3b18078/torchvision/models/detection/ssd.py#L99) head inherit from the [same class](https://github.com/pytorch/vision/blob/33db2b3ebfdd2f73a9228f430fa7dd91c3b18078/torchvision/models/detection/ssd.py#L51) which is responsible for making the predictions for each feature map. -* Each level of the feature map uses a separate 3x3 Convolution to estimate the [class logits](https://github.com/pytorch/vision/blob/33db2b3ebfdd2f73a9228f430fa7dd91c3b18078/torchvision/models/detection/ssd.py#L92-L94) and [box locations](https://github.com/pytorch/vision/blob/33db2b3ebfdd2f73a9228f430fa7dd91c3b18078/torchvision/models/detection/ssd.py#L101-L103). 
-* The [number of predictions](https://github.com/pytorch/vision/blob/33db2b3ebfdd2f73a9228f430fa7dd91c3b18078/torchvision/models/detection/ssd.py#L79) that each head makes per level depends on the number of default boxes and the sizes of the feature maps. - -### Backbone Feature Extractor - -The [feature extractor](https://github.com/pytorch/vision/blob/33db2b3ebfdd2f73a9228f430fa7dd91c3b18078/torchvision/models/detection/ssd.py#L413) reconfigures and enhances a standard VGG backbone with extra layers as depicted on the Figure 2 of the SSD paper: - -
        - -The class supports all VGG models of TorchVision and one can create a similar extractor class for other types of CNNs (see [this example for ResNet](https://github.com/pytorch/vision/blob/644bdcdc438c1723714950d0771da76333b53954/torchvision/models/detection/ssd.py#L600)). Here are a few implementation details of the class: - -* [Patching](https://github.com/pytorch/vision/blob/33db2b3ebfdd2f73a9228f430fa7dd91c3b18078/torchvision/models/detection/ssd.py#L419-L420) the ```ceil_mode parameter``` of the 3rd Maxpool layer is necessary to get the same feature map sizes as the paper. This is due to small differences between PyTorch and the original Caffe implementation of the model. -* It adds a series of [extra feature layers](https://github.com/pytorch/vision/blob/33db2b3ebfdd2f73a9228f430fa7dd91c3b18078/torchvision/models/detection/ssd.py#L430-L456)on top of VGG. If the highres parameter is ```True``` during its construction, it will append an [extra convolution](https://github.com/pytorch/vision/blob/33db2b3ebfdd2f73a9228f430fa7dd91c3b18078/torchvision/models/detection/ssd.py#L457-L464). This is useful for the SSD512 version of the model. -* As discussed on section 3 of the paper, the fully connected layers of the original VGG are converted to convolutions with the [first one using Atrous](https://github.com/pytorch/vision/blob/33db2b3ebfdd2f73a9228f430fa7dd91c3b18078/torchvision/models/detection/ssd.py#L469). Moreover maxpool5’s stride and kernel size is [modified](https://github.com/pytorch/vision/blob/33db2b3ebfdd2f73a9228f430fa7dd91c3b18078/torchvision/models/detection/ssd.py#L468). -* As described on section 3.1, L2 normalization is used on the [output of conv4_3](https://github.com/pytorch/vision/blob/33db2b3ebfdd2f73a9228f430fa7dd91c3b18078/torchvision/models/detection/ssd.py#L484) and a set of [learnable weights](https://github.com/pytorch/vision/blob/33db2b3ebfdd2f73a9228f430fa7dd91c3b18078/torchvision/models/detection/ssd.py#L422-L423) are introduced to control its scaling. - -### SSD Algorithm - -The final key piece of the implementation is on the [SSD class](https://github.com/pytorch/vision/blob/33db2b3ebfdd2f73a9228f430fa7dd91c3b18078/torchvision/models/detection/ssd.py#L108). Here are some notable details: - -* The algorithm is [parameterized](https://github.com/pytorch/vision/blob/33db2b3ebfdd2f73a9228f430fa7dd91c3b18078/torchvision/models/detection/ssd.py#L167-L176) by a set of arguments similar to other detection models. The mandatory parameters are: the backbone which is responsible for [estimating the feature maps](https://github.com/pytorch/vision/blob/33db2b3ebfdd2f73a9228f430fa7dd91c3b18078/torchvision/models/detection/ssd.py#L137-L139), the ```anchor_generator``` which should be a [configured instance](https://github.com/pytorch/vision/blob/33db2b3ebfdd2f73a9228f430fa7dd91c3b18078/torchvision/models/detection/ssd.py#L140-L141) of the ```DefaultBoxGenerator``` class, the size to which the [input images](https://github.com/pytorch/vision/blob/33db2b3ebfdd2f73a9228f430fa7dd91c3b18078/torchvision/models/detection/ssd.py#L142-L143) will be resized and the ```num_classes``` for classification [excluding](https://github.com/pytorch/vision/blob/33db2b3ebfdd2f73a9228f430fa7dd91c3b18078/torchvision/models/detection/ssd.py#L144) the background. 
-* If a [head](https://github.com/pytorch/vision/blob/33db2b3ebfdd2f73a9228f430fa7dd91c3b18078/torchvision/models/detection/ssd.py#L150-L151) is not provided, the constructor will [initialize](https://github.com/pytorch/vision/blob/33db2b3ebfdd2f73a9228f430fa7dd91c3b18078/torchvision/models/detection/ssd.py#L194) the default ```SSDHead```. To do so, we need to know the number of output channels for each feature map produced by the backbone. Initially we try to [retrieve this information](https://github.com/pytorch/vision/blob/33db2b3ebfdd2f73a9228f430fa7dd91c3b18078/torchvision/models/detection/ssd.py#L186) from the backbone but if not available we will [dynamically estimate it](https://github.com/pytorch/vision/blob/33db2b3ebfdd2f73a9228f430fa7dd91c3b18078/torchvision/models/detection/ssd.py#L189). -* The algorithm [reuses](https://github.com/pytorch/vision/blob/33db2b3ebfdd2f73a9228f430fa7dd91c3b18078/torchvision/models/detection/ssd.py#L183) the standard [BoxCoder class](https://github.com/pytorch/vision/blob/33db2b3ebfdd2f73a9228f430fa7dd91c3b18078/torchvision/models/detection/_utils.py#L129) used by other Detection models. The class is responsible for [encoding and decoding](https://leimao.github.io/blog/Bounding-Box-Encoding-Decoding/) the bounding boxes and is configured to use the same prior variances as the [original implementation](https://github.com/weiliu89/caffe/blob/2c4e4c2899ad7c3a997afef2c1fbac76adca1bad/examples/ssd/ssd_coco.py#L326). -* Though we reuse the standard [GeneralizedRCNNTransform class](https://github.com/pytorch/vision/blob/33db2b3ebfdd2f73a9228f430fa7dd91c3b18078/torchvision/models/detection/transform.py#L64) to resize and normalize the input images, the SSD algorithm [configures](https://github.com/pytorch/vision/blob/33db2b3ebfdd2f73a9228f430fa7dd91c3b18078/torchvision/models/detection/ssd.py#L203-L204) it to ensure that the image size will remain fixed. - -Here are the two core methods of the implementation: - -* The ```compute_loss``` method estimates the standard Multi-box loss as described on page 5 of the SSD paper. It uses the [smooth L1 loss](https://github.com/pytorch/vision/blob/33db2b3ebfdd2f73a9228f430fa7dd91c3b18078/torchvision/models/detection/ssd.py#L244) for regression and the standard [cross-entropy loss](https://github.com/pytorch/vision/blob/33db2b3ebfdd2f73a9228f430fa7dd91c3b18078/torchvision/models/detection/ssd.py#L262-L266) with [hard-negative sampling](https://github.com/pytorch/vision/blob/33db2b3ebfdd2f73a9228f430fa7dd91c3b18078/torchvision/models/detection/ssd.py#L268-L276) for classification. -* As in all detection models, the forward method currently has different behaviour depending on whether the model is on training or eval mode. It starts by [resizing & normalizing the input images](https://github.com/pytorch/vision/blob/33db2b3ebfdd2f73a9228f430fa7dd91c3b18078/torchvision/models/detection/ssd.py#L309-L310) and then [passes them through the backbone](https://github.com/pytorch/vision/blob/33db2b3ebfdd2f73a9228f430fa7dd91c3b18078/torchvision/models/detection/ssd.py#L324-L325) to get the feature maps. The feature maps are then [passed through the head](https://github.com/pytorch/vision/blob/33db2b3ebfdd2f73a9228f430fa7dd91c3b18078/torchvision/models/detection/ssd.py#L331-L332) to get the predictions and then the method [generates the default boxes](https://github.com/pytorch/vision/blob/33db2b3ebfdd2f73a9228f430fa7dd91c3b18078/torchvision/models/detection/ssd.py#L334-L335). 
- * If the model is on [training mode](https://github.com/pytorch/vision/blob/33db2b3ebfdd2f73a9228f430fa7dd91c3b18078/torchvision/models/detection/ssd.py#L339-L352), the forward will estimate the [IoUs of the default boxes with the ground truth](https://github.com/pytorch/vision/blob/33db2b3ebfdd2f73a9228f430fa7dd91c3b18078/torchvision/models/detection/ssd.py#L349), use the ```SSDmatcher``` to [produce matches](https://github.com/pytorch/vision/blob/33db2b3ebfdd2f73a9228f430fa7dd91c3b18078/torchvision/models/detection/ssd.py#L350) and finally [estimate the losses](https://github.com/pytorch/vision/blob/33db2b3ebfdd2f73a9228f430fa7dd91c3b18078/torchvision/models/detection/ssd.py#L352) by calling the ```compute_loss method```. - * If the model is on [eval mode](https://github.com/pytorch/vision/blob/33db2b3ebfdd2f73a9228f430fa7dd91c3b18078/torchvision/models/detection/ssd.py#L353-L355), we first select the best detections by keeping only the ones that [pass the score threshold](https://github.com/pytorch/vision/blob/33db2b3ebfdd2f73a9228f430fa7dd91c3b18078/torchvision/models/detection/ssd.py#L384), select the [most promising boxes](https://github.com/pytorch/vision/blob/33db2b3ebfdd2f73a9228f430fa7dd91c3b18078/torchvision/models/detection/ssd.py#L388-L391) and run NMS to [clean up and select](https://github.com/pytorch/vision/blob/33db2b3ebfdd2f73a9228f430fa7dd91c3b18078/torchvision/models/detection/ssd.py#L401-L403) the best predictions. Finally we [postprocess the predictions](https://github.com/pytorch/vision/blob/33db2b3ebfdd2f73a9228f430fa7dd91c3b18078/torchvision/models/detection/ssd.py#L355) to resize them to the original image size. - -# The SSD300 VGG16 Model - -The SSD is a family of models because it can be configured with different backbones and different Head configurations. In this section, we will focus on the provided [SSD pre-trained model](https://github.com/pytorch/vision/blob/33db2b3ebfdd2f73a9228f430fa7dd91c3b18078/torchvision/models/detection/ssd.py#L522-L523). We will discuss the details of its configuration and the training process used to reproduce the reported results. - -### Training process - -The model was trained using the COCO dataset and all of its hyper-parameters and scripts can be found in our [references](https://github.com/pytorch/vision/blob/e35793a1a4000db1f9f99673437c514e24e65451/references/detection/README.md#ssd300-vgg16) folder. Below we provide details on the most notable aspects of the training process. - -### Paper Hyperparameters - -In order to achieve the best possible results on COCO, we adopted the hyperparameters described on the section 3 of the paper concerning the optimizer configuration, the weight regularization etc. Moreover we found it useful to adopt the optimizations that appear in the [official implementation](https://github.com/weiliu89/caffe/blob/ssd/examples/ssd/ssd_coco.py#L310-L321) concerning the [tiling configuration](https://github.com/pytorch/vision/blob/33db2b3ebfdd2f73a9228f430fa7dd91c3b18078/torchvision/models/detection/ssd.py#L579-L581) of the DefaultBox generator. This optimization was not described in the paper but it was crucial for improving the detection precision of smaller objects. - -### Data Augmentation - -Implementing the [SSD Data Augmentation strategy](https://github.com/pytorch/vision/blob/33db2b3ebfdd2f73a9228f430fa7dd91c3b18078/references/detection/transforms.py#L20-L239) as described on page 6 and page 12 of the paper was critical to reproducing the results. 
More specifically the use of random “Zoom In” and “Zoom Out” transformations make the model robust to various input sizes and improve its precision on the small and medium objects. Finally since the VGG16 has quite a few parameters, the photometric distortions [included in the augmentations](https://github.com/pytorch/vision/blob/43d772067fe77965ec8fc49c799de5cea44b8aa2/references/detection/presets.py#L11-L18) have a regularization effect and help avoid the overfitting. - -### Weight Initialization & Input Scaling - -Another aspect that we found beneficial was to follow the [weight initialization scheme](https://github.com/intel/caffe/blob/master/models/intel_optimized_models/ssd/VGGNet/coco/SSD_300x300/train.prototxt) proposed by the paper. To do that, we had to adapt our input scaling method by [undoing the 0-1 scaling](https://github.com/pytorch/vision/blob/33db2b3ebfdd2f73a9228f430fa7dd91c3b18078/torchvision/models/detection/ssd.py#L583-L587) performed by ```ToTensor()``` and use [pre-trained ImageNet weights](https://github.com/pytorch/vision/blob/33db2b3ebfdd2f73a9228f430fa7dd91c3b18078/torchvision/models/detection/ssd.py#L24-L26) fitted with this scaling (shoutout to [Max deGroot](https://github.com/amdegroot) for providing them in his repo). All the weights of new convolutions were [initialized using Xavier](https://github.com/pytorch/vision/blob/33db2b3ebfdd2f73a9228f430fa7dd91c3b18078/torchvision/models/detection/ssd.py#L30-L35) and their biases were set to zero. After initialization, the network was [trained end-to-end](https://github.com/pytorch/vision/blob/33db2b3ebfdd2f73a9228f430fa7dd91c3b18078/torchvision/models/detection/ssd.py#L571-L572). - -### LR Scheme - -As reported on the paper, after applying aggressive data augmentations it’s necessary to train the models for longer. Our experiments confirm this and we had to tweak the Learning rate, batch sizes and overall steps to achieve the best results. Our [proposed learning scheme](https://github.com/pytorch/vision/blob/e35793a1a4000db1f9f99673437c514e24e65451/references/detection/README.md#ssd300-vgg16) is configured to be rather on the safe side, showed signs of plateauing between the steps and thus one is likely to be able to train a similar model by doing only 66% of our epochs. - -# Breakdown of Key Accuracy Improvements - -It is important to note that implementing a model directly from a paper is an iterative process that circles between coding, training, bug fixing and adapting the configuration until we match the accuracies reported on the paper. Quite often it also involves simplifying the training recipe or enhancing it with more recent methodologies. It is definitely not a linear process where incremental accuracy improvements are achieved by improving a single direction at a time but instead involves exploring different hypothesis, making incremental improvements in different aspects and doing a lot of backtracking. - -With that in mind, below we try to summarize the optimizations that affected our accuracy the most. We did this by grouping together the various experiments in 4 main groups and attributing the experiment improvements to the closest match. Note that the Y-axis of the graph starts from 18 instead from 0 to make the difference between optimizations more visible: - -
        - -| Model Configuration | mAP delta | mAP | -| ------------- | ------------- | ------------- | -| Baseline with "FasterRCNN-style" Hyperparams | - | 19.5 | -| + Paper Hyperparams | 1.6 | 21.1 | -| + Data Augmentation | 1.8 | 22.9 | -| + Weight Initialization & Input Scaling | 1 | 23.9 | -| + LR scheme | 1.2 | 25.1 | - -Our final model achieves an mAP of 25.1 and reproduces exactly the COCO results reported on the paper. Here is a [detailed breakdown](https://github.com/pytorch/vision/pull/3403) of the accuracy metrics. - - -We hope you found the part 1 of the series interesting. On the part 2, we will focus on the implementation of SSDlite and discuss its differences from SSD. Until then, we are looking forward to your feedback. diff --git a/_posts/2021-6-18-mobile-demo-apps-overview.md b/_posts/2021-6-18-mobile-demo-apps-overview.md deleted file mode 100644 index 7f3392ca20bb..000000000000 --- a/_posts/2021-6-18-mobile-demo-apps-overview.md +++ /dev/null @@ -1,152 +0,0 @@ ---- -layout: blog_detail -title: 'An Overview of the PyTorch Mobile Demo Apps' -author: Jeff Tang and Mark Saroufim -featured-img: 'assets/images/android-demo-app.png' -date: 2021-06-18 12:00:00 -0500 ---- - - -PyTorch Mobile provides a runtime environment to execute state-of-the-art machine learning models on mobile devices. Latency is reduced, privacy preserved, and models can run on mobile devices anytime, anywhere. - -In this blog post, we provide a quick overview of 10 currently available PyTorch Mobile powered demo apps running various state-of-the-art PyTorch 1.9 machine learning models spanning images, video, audio and text. - -It’s never been easier to deploy a state-of-the-art ML model to a phone. You don’t need any domain knowledge in Machine Learning and we hope one of the below examples resonates enough with you to be the starting point for your next project. - -
        - -## Computer Vision -### Image Classification -This app demonstrates how to use PyTorch C++ libraries on iOS and Android to classify a static image with the MobileNetv2/3 model. - - [iOS #1](https://github.com/pytorch/ios-demo-app/tree/master/HelloWorld) [iOS #2](https://github.com/pytorch/workshops/tree/master/PTMobileWalkthruIOS) [Android #1](https://github.com/pytorch/android-demo-app/tree/master/HelloWorldApp) [Android #2](https://github.com/pytorch/workshops/tree/master/PTMobileWalkthruAndroid) - - [iOS](https://youtu.be/amTepUIR93k) [Android](https://youtu.be/5Lxuu16_28o) - -
        - - -### Live Image Classification -This app demonstrates how to run a quantized MobileNetV2 and Resnet18 models to classify images in real time with an iOS and Android device camera. - - [iOS](https://github.com/pytorch/ios-demo-app/tree/master/PyTorchDemo) [Android](https://github.com/pytorch/android-demo-app/tree/master/PyTorchDemoApp) - -
        - - -### Image Segmentation -This app demonstrates how to use the PyTorch DeepLabV3 model to segment images. The updated app for PyTorch 1.9 also demonstrates how to create the model using the Mobile Interpreter and load the model with the LiteModuleLoader API. - - [iOS](https://github.com/pytorch/ios-demo-app/tree/master/ImageSegmentation) [Android](https://github.com/pytorch/android-demo-app/tree/master/ImageSegmentation) - - [iOS](https://pytorch.org/tutorials/beginner/deeplabv3_on_ios.html) [Android](https://github.com/pytorch/android-demo-app/tree/master/ImageSegmentation) - -
        - - -### Vision Transformer for Handwritten Digit Recognition -This app demonstrates how to use Facebook's latest optimized Vision Transformer DeiT model to do image classification and handwritten digit recognition. - - [iOS](https://github.com/pytorch/ios-demo-app/tree/master/ViT4MNIST) [Android](https://github.com/pytorch/android-demo-app/tree/master/ViT4MNIST) - - [Android](https://drive.google.com/file/d/11L5mIjrLn7B7VdwjQl5vJv3ZVK4hcYut/view?usp=sharing) - -
        - - -### Object Detection -This app demonstrates how to convert the popular YOLOv5 model and use it on an iOS app that detects objects from pictures in your photos, taken with camera, or with live camera. - - [iOS](https://github.com/pytorch/ios-demo-app/tree/master/ObjectDetection) [Android](https://github.com/pytorch/android-demo-app/tree/master/ObjectDetection) - - [iOS](https://drive.google.com/file/d/1pIDrUDnCD5uF-mIz8nbSlZcXxPlRBKhl/view) [Android](https://drive.google.com/file/d/1-5AoRONUqZPZByM-fy0m7r8Ct11OnlIT/view) - -
        - - -### D2Go -This app demonstrates how to create and use a much lighter and faster Facebook D2Go model to detect objects from pictures in your photos, taken with camera, or with live camera. - - [iOS](https://github.com/pytorch/ios-demo-app/tree/master/D2Go) [Android](https://github.com/pytorch/android-demo-app/tree/master/D2Go) - - [iOS](https://drive.google.com/file/d/1GO2Ykfv5ut2Mfoc06Y3QUTFkS7407YA4/view) [Android](https://drive.google.com/file/d/18-2hLc-7JAKtd1q00X-5pHQCAdyJg7dZ/view?usp=sharing) - - -
        - - -## Video -### Video Classification -This app demonstrates how to use a pre-trained PyTorchVideo model to perform video classification on tested videos, videos from the Photos library, or even real-time videos. - - [iOS](https://github.com/pytorch/ios-demo-app/tree/master/TorchVideo) [Android](https://github.com/pytorch/android-demo-app/tree/master/TorchVideo) - - [iOS](https://drive.google.com/file/d/1ijb4UIuF2VQiab4xfAsBwrQXCInvb9wd/view) [Android](https://drive.google.com/file/d/193tkZgt5Rlk7u-EQPcvkoFtmOQ14-zCC/view) [Deep Dive](https://www.youtube.com/watch?v=Qb4vDm-ruwI) - -
        - - - -## Natural Language Processing -### Text Classification -This app demonstrates how to use a pre-trained Reddit model to perform text classification. - - [iOS](https://github.com/pytorch/ios-demo-app/tree/master/PyTorchDemo) [Android](https://github.com/pytorch/android-demo-app/tree/master/PyTorchDemoApp) - -
        - - -### Machine Translation -This app demonstrates how to convert a sequence-to-sequence neural machine translation model trained with the code in the PyTorch NMT tutorial for french to english translation. - - [iOS](https://github.com/pytorch/ios-demo-app/tree/master/Seq2SeqNMT) [Android](https://github.com/pytorch/android-demo-app/tree/master/Seq2SeqNMT) - - [iOS](https://drive.google.com/file/d/17Edk-yAyfzijHPR_2ZDAIX7VY-TkQnLf/view) [Android](https://drive.google.com/file/d/110KN3Pa9DprkBWnzj8Ppa8KMymhmBI61/view?usp=sharing) - -
        - -### Question Answering -This app demonstrates how to use the DistilBERT Hugging Face transformer model to answer questions about Pytorch Mobile itself. - - [iOS](https://github.com/pytorch/ios-demo-app/tree/master/QuestionAnswering) [Android](https://github.com/pytorch/android-demo-app/tree/master/QuestionAnswering) - - [iOS](https://drive.google.com/file/d/1QIB3yoP4I3zUU0bLCpvUqPV5Kv8f8JvB/view) [Android](https://drive.google.com/file/d/10hwGNFo5tylalKwut_CWFPJmV7JRdDKF/view?usp=sharing) - -
        - - -## Audio -### Speech Recognition -This app demonstrates how to convert Facebook AI's torchaudio-powered wav2vec 2.0, one of the leading models in speech recognition to TorchScript before deploying it. - - [iOS](https://github.com/pytorch/ios-demo-app/tree/master/SpeechRecognition) [Android](https://github.com/pytorch/android-demo-app/tree/master/SpeechRecognition) - -
        - - -We really hope one of these demo apps stood out for you. For the full list, make sure to visit the [iOS](https://github.com/pytorch/ios-demo-app) and [Android](https://github.com/pytorch/android-demo-app) demo app repos. You should also definitely check out the video [An Overview of the PyTorch Mobile Demo Apps](https://www.youtube.com/watch?v=Qb4vDm-ruwI) which provides both an overview of the PyTorch mobile demo apps and a deep dive into the PyTorch Video app for iOS and Android. diff --git a/_posts/2021-6-23-torch-linalg-autograd.md b/_posts/2021-6-23-torch-linalg-autograd.md deleted file mode 100644 index d5e03993aa31..000000000000 --- a/_posts/2021-6-23-torch-linalg-autograd.md +++ /dev/null @@ -1,164 +0,0 @@ ---- -layout: blog_detail -title: 'The torch.linalg module: Accelerated Linear Algebra with Autograd in PyTorch' -author: Mike Ruberry, Ivan Yashchuk, Xiao Wang, Mario Lezcano and Natalia Gimelshein -featured-img: 'assets/images/cholesky-decomposition.png' ---- - -Linear algebra is essential to deep learning and scientific computing, and it’s always been a core part of PyTorch. PyTorch 1.9 extends PyTorch’s support for linear algebra operations with the ```torch.linalg``` module. This module, documented [here](https://pytorch.org/docs/master/linalg.html?highlight=linalg#module-torch.linalg), has 26 operators, including faster and easier to use versions of older PyTorch operators, every function from [NumPy’s linear algebra module](https://numpy.org/doc/stable/reference/routines.linalg.html) extended with accelerator and autograd support, and a few operators that are completely new. This makes the ```torch.linalg``` immediately familiar to NumPy users and an exciting update to PyTorch’s linear algebra support. - -# NumPy-like linear algebra in PyTorch - -If you’re familiar with NumPy’s linear algebra module then it’ll be easy to start using ```torch.linalg```. In most cases it’s a drop-in replacement. 
Let’s looking at drawing samples from a [multivariate normal distribution](https://en.wikipedia.org/wiki/Multivariate_normal_distribution) using the [Cholesky decomposition](https://en.wikipedia.org/wiki/Cholesky_decomposition) as a motivating example to demonstrate this: - -```python -import numpy as np - -# Creates inputs -np.random.seed(0) -mu_np = np.random.rand(4) -L = np.random.rand(4, 4) -# Covariance matrix sigma is positive-definite -sigma_np = L @ L.T + np.eye(4) -normal_noise_np = np.random.standard_normal(mu_np.size) - -def multivariate_normal_sample_np(mu, sigma, normal_noise): - return mu + np.linalg.cholesky(sigma) @ normal_noise - -print("Random sample: ", - multivariate_normal_sample_np(mu_np, sigma_np, normal_noise_np)) -: Random sample: [2.9502426 1.78518077 1.83168697 0.90798228] -``` - -Now let’s see the same sampler implemented in PyTorch: - -```python -import torch - -def multivariate_normal_sample_torch(mu, sigma, normal_noise): - return mu + torch.linalg.cholesky(sigma) @ normal_noise -``` - -The two functions are identical, and we can validate their behavior by calling the function with the same arguments wrapped as PyTorch tensors: - -```python -# NumPy arrays are wrapped as tensors and share their memory -mu_torch = torch.from_numpy(mu_np) -sigma_torch = torch.from_numpy(sigma_np) -normal_noise_torch = torch.from_numpy(normal_noise_np) - -multivariate_normal_sample_torch(mu_torch, sigma_torch, normal_noise_torch) -: tensor([2.9502, 1.7852, 1.8317, 0.9080], dtype=torch.float64) -``` - -The only difference is in how PyTorch prints tensors by default. - -The Cholesky decomposition can also help us quickly compute the probability density function of the non-degenerate multivariate normal distribution. One of the expensive terms in that computation is the square root of the determinant of the covariance matrix. Using [properties of the determinant](https://en.wikipedia.org/wiki/Determinant#Properties_of_the_determinant) and the Cholesky decomposition we can calculate the same result faster than the naive computation, however. Here’s the NumPy program that demonstrates this: - -```python -sqrt_sigma_det_np = np.sqrt(np.linalg.det(sigma_np)) -sqrt_L_det_np = np.prod(np.diag(np.linalg.cholesky(sigma_np))) - -print("|sigma|^0.5 = ", sqrt_sigma_det_np) -: |sigma|^0.5 = 4.237127491242027 - -print("|L| = ", sqrt_L_det_np) -: |L| = 4.237127491242028 -``` - -And here’s the same validation in PyTorch: - -```python -sqrt_sigma_det_torch = torch.sqrt(torch.linalg.det(sigma_torch)) -sqrt_L_det_torch = torch.prod(torch.diag(torch.linalg.cholesky(sigma_torch))) - -print("|sigma|^0.5 = ", sqrt_sigma_det_torch) -: |sigma|^0.5 = tensor(4.2371, dtype=torch.float64) - -print("|L| = ", sqrt_L_det_torch) -: |L| = tensor(4.2371, dtype=torch.float64) -``` - -We can measure the difference in run time using PyTorch’s built-in benchmark utility: - -```python -import torch.utils.benchmark as benchmark - -t0 = benchmark.Timer( - stmt='torch.sqrt(torch.linalg.det(sigma))', - globals={'sigma': sigma_torch}) - -t1 = benchmark.Timer( - stmt='torch.prod(torch.diag(torch.linalg.cholesky(sigma)))', - globals={'sigma': sigma_torch}) - -print(t0.timeit(100)) -: torch.sqrt(torch.linalg.det(sigma)) - 80.80 us - 1 measurement, 100 runs , 1 thread - - -print(t1.timeit(100)) -: torch.prod(torch.diag(torch.linalg.cholesky(sigma))) - 11.56 us - 1 measurement, 100 runs , 1 thread - ``` - -Demonstrating that the approach using the Cholesky decomposition can be significantly faster. 
Behind the scenes, PyTorch’s linear algebra module uses OpenBLAS or MKL implementations of the LAPACK standard to maximize its CPU performance. - -# Autograd Support - -PyTorch’s linear algebra module doesn’t just implement the same functions as NumPy’s linear algebra module (and a few more), it also extends them with autograd and CUDA support. - -Let’s look at a very simple program that just computes an inverse and the gradient of that operation to show how autograd works: - -```python -t = torch.tensor(((1, 2), (3, 4)), dtype=torch.float32, requires_grad=True) - -inv = torch.linalg.inv(t) -inv.backward(torch.ones_like(inv)) - -print(t.grad) -: tensor([[-0.5000, 0.5000], - [ 0.5000, -0.5000]]) -``` - -We can mimic the same computation in NumPy by defining the autograd formula ourselves: - -```python -a = np.array(((1, 2), (3, 4)), dtype=np.float32) - -inv_np = np.linalg.inv(a) - -def inv_backward(result, grad): - return -(result.transpose(-2, -1) @ (grad @ result.transpose(-2, -1))) -grad_np = inv_backward(inv_np, np.ones_like(inv_np)) - -print(grad_np) -: [[-0.5 0.5] - [ 0.5 -0.5]] -``` - -Of course, as programs become more complicated it’s convenient to have builtin autograd support, and PyTorch’s linear algebra module supports both real and complex autograd. - -# CUDA Support - -Support for autograd and accelerators, like CUDA devices, is a core part of PyTorch. The ```torch.linalg``` module was developed with NVIDIA’s PyTorch and cuSOLVER teams, who helped optimize its performance on CUDA devices with the cuSOLVER, cuBLAS, and MAGMA libraries. These improvements make PyTorch’s CUDA linear algebra operations faster than ever. For example, let’s look at the performance of PyTorch 1.9’s ```torch.linalg.cholesky``` vs. PyTorch 1.8’s (now deprecated) ```torch.cholesky```: - -
        - -(The above charts were created using an Ampere A100 GPU with CUDA 11.3, cuSOLVER 11.1.1.58, and MAGMA 2.5.2. Matrices are in double precision.) - -These charts show that performance has increased significantly on larger matrices, and that batched performance is better across the board. Other linear algebra operations, including ```torch.linalg.qr``` and ```torch.linalg.lstsq```, have also had their CUDA performance improved. - -# Beyond NumPy - -In addition to offering all the functions in NumPy’s linear algebra module with support for autograd and accelerators, ```torch.linalg``` has a few new functions of its own. NumPy’s ```linalg.norm``` does not allow users to compute vector norms over arbitrary subsets of dimensions, so to enable this functionality we added ```torch.linalg.vector_norm```. We’ve also started modernizing other linear algebra functionality in PyTorch, so we created ```torch.linalg.householder_product``` to replace the older ```torch.orgqr```, and we plan to continue adding more linear algebra functionality in the future, too. - -# The Future of Linear Algebra in PyTorch - -The ```torch.linalg``` module is fast and familiar with great support for autograd and accelerators. It’s already being used in libraries like [botorch](https://github.com/pytorch/botorch), too. But we’re not stopping here. We plan to continue updating more of PyTorch’s existing linear algebra functionality (like ```torch.lobpcg```) and offering more support for low rank and sparse linear algebra. We also want to hear your feedback on how we can improve, so start a conversation on the [forum](https://discuss.pytorch.org/) or file an issue on our [Github](https://github.com/pytorch/pytorch) and share your thoughts. - -We look forward to hearing from you and seeing what the community does with PyTorch’s new linear algebra functionality! diff --git a/_posts/2021-6-27-torchvision-ssdlite-implementation.md b/_posts/2021-6-27-torchvision-ssdlite-implementation.md deleted file mode 100644 index d5d7f9e4df2d..000000000000 --- a/_posts/2021-6-27-torchvision-ssdlite-implementation.md +++ /dev/null @@ -1,105 +0,0 @@ ---- -layout: blog_detail -title: 'Everything You Need To Know About Torchvision’s SSDlite Implementation' -author: Vasilis Vryniotis -featured-img: 'assets/images/mAP-of-SSD320-MobileNetV3-Large.png' ---- - -In the [previous article](https://pytorch.org/blog/torchvision-ssd-implementation/), we’ve discussed how the SSD algorithm works, covered its implementation details and presented its training process. If you have not read the previous blog post, I encourage you to check it out before continuing. - -In this part 2 of the series, we will focus on the mobile-friendly variant of SSD called SSDlite. Our plan is to first go through the main components of the algorithm highlighting the parts that differ from the original SSD, then discuss how the released model was trained and finally provide detailed benchmarks for all the new Object Detection models that we explored. - -# The SSDlite Network Architecture - -The SSDlite is an adaptation of SSD which was first briefly introduced on the [MobileNetV2 paper](https://arxiv.org/abs/1801.04381) and later reused on the [MobileNetV3 paper](https://arxiv.org/abs/1905.02244). Because the main focus of the two papers was to introduce novel CNN architectures, most of the implementation details of SSDlite were not clarified. 
Our code follows all the details presented on the two papers and where necessary fills the gaps from the [official implementation](https://github.com/tensorflow/models/tree/238922e98dd0e8254b5c0921b241a1f5a151782f/research/object_detection). - -As noted before, the SSD is a family of models because one can configure it with different backbones (such as VGG, MobileNetV3 etc) and different Heads (such as using regular convolutions, separable convolutions etc). Thus many of the SSD components remain the same in SSDlite. Below we discuss only those that are different - -## Classification and Regression Heads - -Following the Section 6.2 of the MobileNetV2 paper, SSDlite replaces the regular convolutions used on the original Heads with separable convolutions. Consequently, our implementation introduces [new heads](https://github.com/pytorch/vision/blob/b6f733046c9259f354d060cd808241a558d7d596/torchvision/models/detection/ssdlite.py#L65-L95) that use [3x3 Depthwise convolutions and 1x1 projections](https://github.com/pytorch/vision/blob/b6f733046c9259f354d060cd808241a558d7d596/torchvision/models/detection/ssdlite.py#L26-L36). Since all other components of the SSD method remain the same, to create an SSDlite model our implementation [initializes the SSDlite head](https://github.com/pytorch/vision/blob/b6f733046c9259f354d060cd808241a558d7d596/torchvision/models/detection/ssdlite.py#L222-L223) and passes it directly to the SSD constructor. - -## Backbone Feature Extractor - -Our implementation introduces a new class for building MobileNet [feature extractors](https://github.com/pytorch/vision/blob/b6f733046c9259f354d060cd808241a558d7d596/torchvision/models/detection/ssdlite.py#L98). Following the Section 6.3 of the MobileNetV3 paper, the backbone returns the [output of the expansion layer](https://github.com/pytorch/vision/blob/b6f733046c9259f354d060cd808241a558d7d596/torchvision/models/detection/ssdlite.py#L106) of the Inverted Bottleneck block which has an output stride of 16 and the [output of the layer just before the pooling](https://github.com/pytorch/vision/blob/b6f733046c9259f354d060cd808241a558d7d596/torchvision/models/detection/ssdlite.py#L107) which has an output stride of 32. Moreover, all [extra blocks](https://github.com/pytorch/vision/blob/b6f733046c9259f354d060cd808241a558d7d596/torchvision/models/detection/ssdlite.py#L111-L116) of the backbone are replaced with [lightweight equivalents](https://github.com/pytorch/vision/blob/b6f733046c9259f354d060cd808241a558d7d596/torchvision/models/detection/ssdlite.py#L39-L54) which use a 1x1 compression, a separable 3x3 convolution with stride 2 and a 1x1 expansion. Finally to ensure that the heads have enough prediction power even when small [width multipliers](https://github.com/pytorch/vision/blob/b6f733046c9259f354d060cd808241a558d7d596/torchvision/models/detection/ssdlite.py#L99) are used, the [minimum depth](https://github.com/pytorch/vision/blob/b6f733046c9259f354d060cd808241a558d7d596/torchvision/models/detection/ssdlite.py#L110) size of all convolutions is controlled by the ```min_depth``` hyperparameter. - -# The SSDlite320 MobileNetV3-Large model - -
        - -This section discusses the configuration of the provided [SSDlite pre-trained](https://github.com/pytorch/vision/blob/b6f733046c9259f354d060cd808241a558d7d596/torchvision/models/detection/ssdlite.py#L159-L162) model along with the training processes followed to replicate the paper results as closely as possible. - -## Training process - -All of the hyperparameters and scripts used to train the model on the COCO dataset can be found in our [references](https://github.com/pytorch/vision/blob/e35793a1a4000db1f9f99673437c514e24e65451/references/detection/README.md#ssdlite320-mobilenetv3-large) folder. Here we discuss the most notable details of the training process. - -### Tuned Hyperparameters - -Though the papers don’t provide any information on the hyperparameters used for training the models (such as regularization, learning rate and the batch size), the parameters listed in the [configuration files](https://github.com/tensorflow/models/blob/238922e98dd0e8254b5c0921b241a1f5a151782f/research/object_detection/samples/configs/ssdlite_mobilenet_v3_large_320x320_coco.config) on the official repo were good starting points and using cross validation we adjusted them to their optimal values. All the above gave us a significant boost over the baseline SSD configuration. - -### Data Augmentation - -Key important difference of SSDlite comparing to SSD is that the backbone of the first has only a fraction of the weights of the latter. This is why in SSDlite, the Data Augmentation focuses more on making the model robust to objects of variable sizes than trying to avoid overfitting. Consequently, SSDlite [uses only a subset](https://github.com/pytorch/vision/blob/43d772067fe77965ec8fc49c799de5cea44b8aa2/references/detection/presets.py#L19-L24) of the SSD transformations and this way it avoids the over-regularization of the model. - -### LR Scheme - -Due to the reliance on Data Augmentation to make the model robust to small and medium sized objects, we found that it is particularly beneficial for the training recipe to use large number of epochs. More specifically by using roughly 3x more epochs than SSD we are able to increase our precision by 4.2mAP points and by using a 6x multiplier we improve by 4.9mAP. Increasing further the epochs seems to yield diminishing returns and makes the training too slow and impractical, nevertheless based on the [model configuration](https://github.com/tensorflow/models/blob/238922e98dd0e8254b5c0921b241a1f5a151782f/research/object_detection/samples/configs/ssdlite_mobilenet_v3_large_320x320_coco.config#L154) it seems that the authors of the paper used an equivalent *16x multiplier*. - -### Weight Initialization & Input Scaling & ReLU6 - -A set of final optimizations that brought our implementation very close to the official one and helped us bridge the accuracy gap was training the backbone [from scratch](https://github.com/pytorch/vision/blob/b6f733046c9259f354d060cd808241a558d7d596/torchvision/models/detection/ssdlite.py#L139-L141) instead of initializing from ImageNet, adapting our [weight initialization scheme](https://github.com/pytorch/vision/blob/b6f733046c9259f354d060cd808241a558d7d596/torchvision/models/detection/ssdlite.py#L57-L62), changing our [Input Scaling](https://github.com/pytorch/vision/blob/b6f733046c9259f354d060cd808241a558d7d596/torchvision/models/detection/ssdlite.py#L216-L219) and replacing all standard ReLUs added on the SSDlite heads with ReLU6. 
Note that since we trained the model from random weights, we additionally applied the speed optimization described on the paper of using a [reduced tail](https://github.com/pytorch/vision/blob/b6f733046c9259f354d060cd808241a558d7d596/torchvision/models/detection/ssdlite.py#L196-L197) on the backbone. - -### Implementation Differences - -Comparing the above implementation with the one on the official repo, we’ve identified a few differences. Most of them are minor and they are related to how we initialize the weights (for example [Normal initialization](https://github.com/pytorch/vision/blob/b6f733046c9259f354d060cd808241a558d7d596/torchvision/models/detection/ssdlite.py#L57-L62) vs [Truncated Normal](https://github.com/tensorflow/models/blob/238922e98dd0e8254b5c0921b241a1f5a151782f/research/object_detection/samples/configs/ssdlite_mobilenet_v3_large_320x320_coco.config#L104-L107)), how we parameterize the LR Scheduling (for example [smaller](https://github.com/pytorch/vision/blob/b6f733046c9259f354d060cd808241a558d7d596/references/detection/engine.py#L21-L22) vs [larger](https://github.com/tensorflow/models/blob/238922e98dd0e8254b5c0921b241a1f5a151782f/research/object_detection/samples/configs/ssdlite_mobilenet_v3_large_320x320_coco.config#L169-L170) warmup rate, [shorter](https://github.com/pytorch/vision/tree/master/references/detection#ssdlite320-mobilenetv3-large) vs [longer](https://github.com/tensorflow/models/blob/238922e98dd0e8254b5c0921b241a1f5a151782f/research/object_detection/samples/configs/ssdlite_mobilenet_v3_large_320x320_coco.config#L154) training) etc. The biggest known difference lies in the way we compute the Classification loss. More specifically the implementation of SSDlite with MobileNetV3 backbone on the official repo [doesn’t use the SSD’s Multibox loss](https://github.com/tensorflow/models/blob/238922e98dd0e8254b5c0921b241a1f5a151782f/research/object_detection/samples/configs/ssdlite_mobilenet_v3_large_320x320_coco.config#L121-L124) but instead uses RetinaNet’s [focal loss](https://arxiv.org/abs/1708.02002). This is a rather significant deviation from the paper and since TorchVision already offers a full implementation of RetinaNet, we decided to implement SSDlite using the normal Multi-box SSD loss. - -## Break down of key accuracy improvements - -As discussed in previous articles, reproducing research papers and porting them to code is not a journey of monotonically increasing accuracies, especially in cases where the full training and implementation details are not known. Typically the process involves lots of backtracking as one needs to identify those implementation details and parameters that have significant impact on the accuracy from those that don’t. Below we try to visualize the most important iterations that improved our accuracy from the baseline: - -
        - - - {:.table.table-striped.table-bordered} -| **Iteration** | **mAP** | -| ------------- | ------------- | -| Baseline with "SSD-style" Hyperparams | 10.6 | -| + Tuned Hyperparams | 14.2 | -| + SSDlite Data Augmentation | 15.2 | -| + 3x LR Scheme | 19.4 | -| + 6x LR Scheme | 20.1 | -| + Weight Initialization & Input Scaling & ReLU6 | 21.3 | - -The order of optimizations presented above is accurate, though a bit idealized in some cases. For example, though different schedulers were tested during the Hyperparameter tuning phase, none of them provided significant improvements and thus we maintained the MultiStepLR which was used in the baseline. Nevertheless while later experimenting with different LR Schemes, we found it beneficial to switch to CosineAnnealingLR, as it required less configuration. Consequently, we believe that the main takeaway from the above summary should be that even by starting with a correct implementation and a set of optimal hyperparams from a model of the same family, there is always accuracy points to be found by optimizing the training recipe and tuning the implementation. Admittedly the above is a rather extreme case where the accuracy doubled, but still in many cases there is a large number of optimizations that can help us push the accuracy significantly. - -# Benchmarks - -Here is how to initialize the two pre-trained models: - -```python -ssdlite = torchvision.models.detection.ssdlite320_mobilenet_v3_large(pretrained=True) -ssd = torchvision.models.detection.ssd300_vgg16(pretrained=True) -``` - -Below are the benchmarks between the new and selected previous detection models: - - {:.table.table-striped.table-bordered} -| **Model** | **mAP** | **Inference on CPU (sec)** | **# Params (M)** | -| ------------- | ------------- | ------------- | ------------- | -| SSDlite320 MobileNetV3-Large | 21.3 | 0.0911 | 3.44 | -| SSD300 VGG16 | 25.1 | 0.8303 | 35.64 | -| SSD512 VGG16 (not released) | 28.8| 2.2494 | 37.08 | -| SSD512 ResNet50 (not released) | 30.2 | 1.1137 | 42.70 | -| Faster R-CNN MobileNetV3-Large 320 FPN (Low-Res) | 22.8 | 0.1679 | 19.39| -| Faster R-CNN MobileNetV3-Large FPN (High-Res) | 32.8 | 0.8409 | 19.39 | - -As we can see, the SSDlite320 MobileNetV3-Large model is by far the fastest and smallest model and thus it’s an excellent candidate for real-world mobile applications. Though its accuracy is lower than the pre-trained low-resolution Faster R-CNN equivalent, the SSDlite framework is adaptable and one can boost its accuracy by introducing heavier heads with more convolutions. - -On the other hand, the SSD300 VGG16 model is rather slow and less accurate. This is mainly because of its VGG16 backbone. Though extremely important and influential, the VGG architecture is nowadays quite outdated. Thus though the specific model has historical and research value and hence it’s included in TorchVision, we recommend to users who want high-resolution detectors for real world applications to either combine SSD with alternative backbones (see this [example](https://github.com/pytorch/vision/pull/3760) on how to create one) or use one of the Faster R-CNN pre-trained models. - - -We hope you enjoyed the 2nd and final part of the SSD series. We are looking forward to your feedback. 
diff --git a/_posts/2021-6-8-overview-of-pytorch-autograd-engine.md b/_posts/2021-6-8-overview-of-pytorch-autograd-engine.md deleted file mode 100644 index ffc8ce0e962c..000000000000 --- a/_posts/2021-6-8-overview-of-pytorch-autograd-engine.md +++ /dev/null @@ -1,165 +0,0 @@ ---- -layout: blog_detail -title: 'Overview of PyTorch Autograd Engine' -author: Preferred Networks, Inc. ---- - -This blog post is based on PyTorch version 1.8, although it should apply for older versions too, since most of the mechanics have remained constant. - -To help understand the concepts explained here, it is recommended that you read the awesome blog post by [@ezyang](https://twitter.com/ezyang): [PyTorch internals](http://blog.ezyang.com/2019/05/pytorch-internals/) if you are not familiar with PyTorch architecture components such as ATen or c10d. - -### What is autograd? - -**Background** - -PyTorch computes the gradient of a function with respect to the inputs by using automatic differentiation. Automatic differentiation is a technique that, given a computational graph, calculates the gradients of the inputs. Automatic differentiation can be performed in two different ways; forward and reverse mode. Forward mode means that we calculate the gradients along with the result of the function, while reverse mode requires us to evaluate the function first, and then we calculate the gradients starting from the output. While both modes have their pros and cons, the reverse mode is the de-facto choice since the number of outputs is smaller than the number of inputs, which allows a much more efficient computation. Check [3] to learn more about this. - -Automatic differentiation relies on a classic calculus formula known as the chain-rule. The chain rule allows us to calculate very complex derivatives by splitting them and recombining them later. - -Formally speaking, given a composite function , we can calculate its derivative as . This result is what makes automatic differentiation work. -By combining the derivatives of the simpler functions that compose a larger one, such as a neural network, it is possible to compute the exact value of the gradient at a given point rather than relying on the numerical approximation, which would require multiple perturbations in the input to obtain a value. - -To get the intuition of how the reverse mode works, let’s look at a simple function . Figure 1 shows its computational graph where the inputs x, y in the left, flow through a series of operations to generate the output z. - -
-Figure 1: Computational graph of f(x, y) = log(x*y)
-The automatic differentiation engine will normally execute this graph. It will also extend it to calculate the derivatives of w with respect to the inputs x, y, and the intermediate result v.
-
-The example function can be decomposed into f and g, where \(f(x, y) = x \cdot y\) and \(g(v) = \log(v)\). Every time the engine executes an operation in the graph, the derivative of that operation is added to the graph to be executed later in the backward pass. Note that the engine knows the derivatives of the basic functions.
-
-In the example above, when multiplying x and y to obtain v, the engine will extend the graph to calculate the partial derivatives of the multiplication by using the multiplication derivative definition that it already knows: \(\frac{\partial v}{\partial x} = y\) and \(\frac{\partial v}{\partial y} = x\). The resulting extended graph is shown in Figure 2, where the *MultDerivative* node also calculates the product of the resulting gradients by an input gradient to apply the chain rule; this will be explicitly seen in the following operations. Note that the backward graph (green nodes) will not be executed until all the forward steps are completed.
-Figure 2: Computational graph extended after executing the logarithm
-Continuing, the engine now calculates the operation \(w = \log(v)\) and extends the graph again with the log derivative, which it knows to be \(\frac{\partial w}{\partial v} = \frac{1}{v}\). This is shown in Figure 3. This operation generates the result \(\frac{\partial w}{\partial v}\) that, when propagated backward and multiplied by the multiplication derivative as the chain rule dictates, generates the derivatives \(\frac{\partial w}{\partial x}\) and \(\frac{\partial w}{\partial y}\).
-Figure 3: Computational graph extended after executing the logarithm
-The original computation graph is extended with a new dummy variable z that is the same as w. The derivative of z with respect to w is 1, as they are the same variable; this trick allows us to apply the chain rule to calculate the derivatives of the inputs. After the forward pass is complete, we start the backward pass by supplying the initial value of 1.0 for \(\frac{\partial z}{\partial w}\). This is shown in Figure 4.
-Figure 4: Computational graph extended for reverse auto differentiation
-Then, following the green graph, we execute the *LogDerivative* operation that the auto differentiation engine introduced, and multiply its result by \(\frac{\partial z}{\partial w}\) to obtain the gradient \(\frac{\partial z}{\partial v}\), as the chain rule states. Next, the multiplication derivative is executed in the same way, and the desired derivatives \(\frac{\partial z}{\partial x}\) and \(\frac{\partial z}{\partial y}\) are finally obtained.
-
-Formally, what we are doing here, and what the PyTorch autograd engine also does, is computing a Jacobian-vector product (Jvp) to calculate the gradients of the model parameters, since the model parameters and inputs are vectors.
-
-**The Jacobian-vector product**
-
-When we calculate the gradient of a vector-valued function (a function whose inputs and outputs are vectors), we are essentially constructing a Jacobian matrix \(J\).
-
-Thanks to the chain rule, multiplying the Jacobian matrix of a function by a vector \(v\) with the previously calculated gradients of a scalar function results in the gradients of the scalar output with respect to the vector-valued function inputs.
-
-As an example, let’s look at some functions in Python notation to show how the chain rule applies.
-```python
-from math import log, sin
-
-def f(x1, x2):
-    a = x1 * x2
-    y1 = log(a)
-    y2 = sin(x2)
-    return (y1, y2)
-
-def g(y1, y2):
-    return y1 * y2
-```
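-Before deriving anything by hand, here is a small sketch of our own (not part of the original example) that uses ```torch.autograd.functional.jacobian``` to materialize the full Jacobian of ```f``` at a point, so that the identities derived below can be checked numerically; ```f_torch``` is simply a tensor version of ```f```:
-
-```python
-import torch
-from torch.autograd.functional import jacobian
-
-def f_torch(x):
-    # Tensor version of f above: x = (x1, x2) -> (y1, y2)
-    return torch.stack((torch.log(x[0] * x[1]), torch.sin(x[1])))
-
-x = torch.tensor([0.5, 0.75])
-y1, y2 = f_torch(x)
-
-J = jacobian(f_torch, x)   # full 2x2 Jacobian of f at x
-v = torch.stack((y2, y1))  # gradients of z = g(y1, y2) = y1 * y2 w.r.t. (y1, y2)
-print(J.T @ v)             # tensor([1.3633, 0.1912])
-```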
-Now, if we derive this by hand using the chain rule and the definition of the derivatives, we obtain the following set of identities that we can directly plug into the Jacobian matrix of \(f\):
-\[
-\frac{\partial y_1}{\partial x_1} = \frac{1}{x_1} \qquad
-\frac{\partial y_1}{\partial x_2} = \frac{1}{x_2} \qquad
-\frac{\partial y_2}{\partial x_1} = 0 \qquad
-\frac{\partial y_2}{\partial x_2} = \cos(x_2)
-\]
-Next, let’s consider the gradients for the scalar function \(z = g(y_1, y_2)\):
-\[
-\frac{\partial z}{\partial y_1} = y_2 \qquad
-\frac{\partial z}{\partial y_2} = y_1
-\]
        - -If we now calculate the transpose-Jacobian vector product obeying the chain rule, we obtain the following expression: -
-\[
-J^T v =
-\begin{bmatrix} \frac{1}{x_1} & 0 \\ \frac{1}{x_2} & \cos(x_2) \end{bmatrix}
-\begin{bmatrix} y_2 \\ y_1 \end{bmatrix}
-=
-\begin{bmatrix} \frac{y_2}{x_1} \\ \frac{y_2}{x_2} + y_1 \cos(x_2) \end{bmatrix}
-\]
-Evaluating the Jvp for \(x_1 = 0.5\) and \(x_2 = 0.75\) yields the result:
-
-\[
-J^T v = \begin{bmatrix} 1.3633 \\ 0.1912 \end{bmatrix}
-\]
-
-We can execute the same expression in PyTorch and calculate the gradient of the input:
-```python
->>> import torch
->>> x = torch.tensor([0.5, 0.75], requires_grad=True)
->>> y = torch.log(x[0] * x[1]) * torch.sin(x[1])
->>> y.backward(torch.tensor(1.0))
->>> x.grad
-tensor([1.3633, 0.1912])
-```
-The result is the same as our hand-calculated Jacobian-vector product!
-However, PyTorch never constructed the matrix \(J\), as it could grow prohibitively large; instead, it created a graph of operations that it traversed backward while applying the Jacobian-vector products defined in [tools/autograd/derivatives.yaml](https://github.com/pytorch/pytorch/blob/master/tools/autograd/derivatives.yaml).
-
-**Going through the graph**
-
-Every time PyTorch executes an operation, the autograd engine constructs the graph to be traversed backward.
-The reverse mode auto differentiation starts by adding a scalar variable at the end of the graph, equal to the output, so that its derivative with respect to the output is 1, as we saw in the introduction. This is the initial gradient value that is supplied to the Jvp engine calculation as we saw in the section above.
-
-In PyTorch, the initial gradient is explicitly set by the user when calling the backward method.
-
-Then, the Jvp calculation starts but it never constructs the matrix. Instead, when PyTorch records the computational graph, the derivatives of the executed forward operations are added (Backward Nodes). Figure 5 shows a backward graph generated by the execution of the functions \(f\) and \(g\) seen before.
-Figure 5: Computational Graph extended with the backward pass
        - -Once the forward pass is done, the results are used in the backward pass where the derivatives in the computational graph are executed. The basic derivatives are stored in the [tools/autograd/derivatives.yaml](https://github.com/pytorch/pytorch/blob/master/tools/autograd/derivatives.yaml) file and they are not regular derivatives but the Jvp versions of them [3]. They take their primitive function inputs and outputs as parameters along with the gradient of the function outputs with respect to the final outputs. By repeatedly multiplying the resulting gradients by the next Jvp derivatives in the graph, the gradients up to the inputs will be generated following the chain rule. - -
-Figure 6: How the chain rule is applied in backward differentiation
-Figure 6 represents the process by showing the chain rule. We started with a value of 1.0, as detailed before, which is the already calculated gradient highlighted in green, and we move to the next node in the graph. The *backward* function registered in [derivatives.yaml](https://github.com/pytorch/pytorch/blob/a0a7a2d648f05b0192e6943c9684406cdf404fbf/tools/autograd/derivatives.yaml#L635-L636) will calculate the associated local derivative value, highlighted in red, and multiply it by the incoming gradient. By the chain rule, this product becomes the already calculated gradient (green) when we process the next backward node in the graph.
-
-You may also have noticed that in Figure 5 there is a gradient (the one with respect to \(x_2\)) generated from two different sources. When two different functions share an input, the gradients with respect to the output are aggregated for that input, and calculations using that gradient can’t proceed unless all the paths have been aggregated together.
-
-Let’s see an example of how the derivatives are stored in PyTorch.
-
-Suppose that we are currently processing the backward propagation of the \(\log\) function, in the *LogBackward* node in Figure 2. The derivative of \(\log\) in [`derivatives.yaml`](https://github.com/pytorch/pytorch/blob/a0a7a2d648f05b0192e6943c9684406cdf404fbf/tools/autograd/derivatives.yaml#L635-L636) is specified as `grad.div(self.conj())`. `grad` is the already calculated gradient and `self.conj()` is the complex conjugate of the input vector. For complex numbers PyTorch calculates a special derivative called the conjugate Wirtinger derivative [6]. This derivative takes the complex number and its conjugate and, by operating some magic that is described in [6], they are the direction of steepest descent when plugged into optimizers.
-
-This code translates to the product of the incoming gradient and the reciprocal of the input, the corresponding green and red squares in Figure 3. Continuing, the autograd engine will execute the next operation, the backward of the multiplication. As before, the inputs are the original function’s inputs and the gradient calculated from the backward step. This step will keep repeating until we reach the gradient with respect to the inputs and the computation will be finished. The gradient of \(x_2\) is only completed once the multiplication and sin gradients are added together. As you can see, we computed the equivalent of the Jvp but without constructing the matrix.
-
-In the next post we will dive inside PyTorch code to see how this graph is constructed and where the relevant pieces are, should you want to experiment with it!
-
-### References
-1. https://pytorch.org/tutorials/beginner/blitz/autograd_tutorial.html
-2. https://web.stanford.edu/class/cs224n/readings/gradient-notes.pdf
-3. https://www.cs.toronto.edu/~rgrosse/courses/csc321_2018/slides/lec10.pdf
-4. https://mustafaghali11.medium.com/how-pytorch-backward-function-works-55669b3b7c62
-5. https://indico.cern.ch/event/708041/contributions/3308814/attachments/1813852/2963725/automatic_differentiation_and_deep_learning.pdf
-6. https://pytorch.org/docs/stable/notes/autograd.html#complex-autograd-doc
-7. https://cs.ubc.ca/~fwood/CS340/lectures/AD1.pdf (recommended: shows why the backprop is formally expressed with the Jacobian)
        diff --git a/_posts/2021-8-18-pipetransformer-automated-elastic-pipelining.md b/_posts/2021-8-18-pipetransformer-automated-elastic-pipelining.md deleted file mode 100644 index 02c73d77541b..000000000000 --- a/_posts/2021-8-18-pipetransformer-automated-elastic-pipelining.md +++ /dev/null @@ -1,353 +0,0 @@ ---- -layout: blog_detail -title: 'PipeTransformer: Automated Elastic Pipelining for Distributed Training of Large-scale Models' -author: Chaoyang He, Shen Li, Mahdi Soltanolkotabi, and Salman Avestimehr -featured-img: 'assets/images/pipetransformer_overview.png' ---- - -In this blog post, we describe the first peer-reviewed research paper that explores accelerating the hybrid of PyTorch DDP (`torch.nn.parallel.DistributedDataParallel`) [1] and Pipeline (`torch.distributed.pipeline`) - [PipeTransformer: Automated Elastic Pipelining for Distributed Training of Large-scale Models](http://proceedings.mlr.press/v139/he21a.html) (Transformers such as BERT [2] and ViT [3]), published at ICML 2021. - -PipeTransformer leverages automated elastic pipelining for efficient distributed training of Transformer models. In PipeTransformer, we designed an adaptive on-the-fly freeze algorithm that can identify and freeze some layers gradually during training and an elastic pipelining system that can dynamically allocate resources to train the remaining active layers. More specifically, PipeTransformer automatically excludes frozen layers from the pipeline, packs active layers into fewer GPUs, and forks more replicas to increase data-parallel width. We evaluate PipeTransformer using Vision Transformer (ViT) on ImageNet and BERT on SQuAD and GLUE datasets. Our results show that compared to the state-of-the-art baseline, PipeTransformer attains up to 2.83-fold speedup without losing accuracy. We also provide various performance analyses for a more comprehensive understanding of our algorithmic and system-wise design. - -Next, we will introduce the background, motivation, our idea, design, and how we implement the algorithm and system with PyTorch Distributed APIs. - -* Paper: [http://proceedings.mlr.press/v139/he21a.html](http://proceedings.mlr.press/v139/he21a.html) -* Source Code: [https://DistML.ai](https://distml.ai). -* Slides: [https://docs.google.com/presentation/d/1t6HWL33KIQo2as0nSHeBpXYtTBcy0nXCoLiKd0EashY/edit?usp=sharing](https://docs.google.com/presentation/d/1t6HWL33KIQo2as0nSHeBpXYtTBcy0nXCoLiKd0EashY/edit?usp=sharing) - -# Introduction -

-Figure 1: The number of parameters in Transformer models has increased dramatically.

        - - -Large Transformer models [4][5] have powered accuracy breakthroughs in both natural language processing and computer vision. GPT-3 [4] hit a new record high accuracy for nearly all NLP tasks. Vision Transformer (ViT) [3] also achieved 89\% top-1 accuracy in ImageNet, outperforming state-of-the-art convolutional networks ResNet-152 and EfficientNet. To tackle the growth in model sizes, researchers have proposed various distributed training techniques, including parameter servers [6][7][8], pipeline parallelism [9][10][11][12], intra-layer parallelism [13][14][15], and zero redundancy data-parallel [16]. - - -Existing distributed training solutions, however, only study scenarios where all model weights are required to be optimized throughout the training (i.e., computation and communication overhead remains relatively static over different iterations). Recent works on progressive training suggest that parameters in neural networks can be trained dynamically: - -* Freeze Training: Singular Vector Canonical Correlation Analysis for Deep Learning Dynamics and Interpretability. NeurIPS 2017 -* Efficient Training of BERT by Progressively Stacking. ICML 2019 -* Accelerating Training of Transformer-Based Language Models with Progressive Layer Dropping. NeurIPS 2020. -* On the Transformer Growth for Progressive BERT Training. NACCL 2021 - - -


        -Figure 2. Interpretable Freeze Training: DNNs converge bottom-up (Results on CIFAR10 using ResNet). Each pane shows layer-by-layer similarity using SVCCA [17][18] - -For example, in freeze training [17][18], neural networks usually converge from the bottom-up (i.e., not all layers need to be trained all the way through training). Figure 2 shows an example of how weights gradually stabilize during training in this approach. This observation motivates us to utilize freeze training for distributed training of Transformer models to accelerate training by dynamically allocating resources to focus on a shrinking set of active layers. Such a layer freezing strategy is especially pertinent to pipeline parallelism, as excluding consecutive bottom layers from the pipeline can reduce computation, memory, and communication overhead. - -
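-As a minimal sketch of the freeze-training idea itself (our own illustration, not PipeTransformer's API), one can progressively exclude the bottom-most layers from backward computation by disabling their gradients:
-
-```python
-import torch.nn as nn
-
-def freeze_bottom_layers(model: nn.Sequential, num_frozen: int) -> None:
-    # Exclude the first `num_frozen` layers from gradient computation and updates.
-    for layer in list(model.children())[:num_frozen]:
-        for param in layer.parameters():
-            param.requires_grad = False
-```
-
-A freeze schedule would then call this with a growing ```num_frozen``` as training progresses; PipeTransformer derives that schedule adaptively and, crucially, also reconfigures the pipeline and data-parallel layout around it.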

        -Figure 3. The process of PipeTransformer’s automated and elastic pipelining to accelerate distributed training of Transformer models -

        - - -We propose PipeTransformer, an elastic pipelining training acceleration framework that automatically reacts to frozen layers by dynamically transforming the scope of the pipelined model and the number of pipeline replicas. To the best of our knowledge, this is the first paper that studies layer freezing in the context of both pipeline and data-parallel training. Figure 3 demonstrates the benefits of such a combination. First, by excluding frozen layers from the pipeline, the same model can be packed into fewer GPUs, leading to both fewer cross-GPU communications and smaller pipeline bubbles. Second, after packing the model into fewer GPUs, the same cluster can accommodate more pipeline replicas, increasing the width of data parallelism. More importantly, the speedups acquired from these two benefits are multiplicative rather than additive, further accelerating the training. - -The design of PipeTransformer faces four major challenges. First, the freeze algorithm must make on-the-fly and adaptive freezing decisions; however, existing work [17][18] only provides a posterior analysis tool. Second, the efficiency of pipeline re-partitioning results is influenced by multiple factors, including partition granularity, cross-partition activation size, and the chunking (the number of micro-batches) in mini-batches, which require reasoning and searching in a large solution space. Third, to dynamically introduce additional pipeline replicas, PipeTransformer must overcome the static nature of collective communications and avoid potentially complex cross-process messaging protocols when onboarding new processes (one pipeline is handled by one process). Finally, caching can save time for repeated forward propagation of frozen layers, but it must be shared between existing pipelines and newly added ones, as the system cannot afford to create and warm up a dedicated cache for each replica. - -

        -Figure 4: An Animation to Show the Dynamics of PipeTransformer -

-As shown in the animation (Figure 4), PipeTransformer is designed with four core building blocks to address the aforementioned challenges. First, we design a tunable and adaptive algorithm to generate signals that guide the selection of layers to freeze over different iterations (Freeze Algorithm). Once triggered by these signals, our elastic pipelining module (AutoPipe) then packs the remaining active layers into fewer GPUs by taking both activation sizes and variances of workloads across heterogeneous partitions (frozen layers and active layers) into account. It then splits a mini-batch into an optimal number of micro-batches based on prior profiling results for different pipeline lengths. Our next module, AutoDP, spawns additional pipeline replicas to occupy freed-up GPUs and maintains hierarchical communication process groups to attain dynamic membership for collective communications. Our final module, AutoCache, efficiently shares activations across existing and new data-parallel processes and automatically replaces stale caches during transitions.
-
-Overall, PipeTransformer combines the Freeze Algorithm, AutoPipe, AutoDP, and AutoCache modules to provide a significant training speedup.
-We evaluate PipeTransformer using Vision Transformer (ViT) on ImageNet and BERT on GLUE and SQuAD datasets. Our results show that PipeTransformer attains up to 2.83-fold speedup without losing accuracy. We also provide various performance analyses for a more comprehensive understanding of our algorithmic and system-wise design.
-Finally, we have also developed open-source flexible APIs for PipeTransformer, which offer a clean separation among the freeze algorithm, model definitions, and training accelerations, allowing for transferability to other algorithms that require similar freezing strategies.
-
-# Overall Design
-
-Suppose we aim to train a massive model in a distributed training system where the hybrid of pipelined model parallelism and data parallelism is used, targeting scenarios where either the memory of a single GPU device cannot hold the model or, if loaded, the batch size must be kept small enough to avoid running out of memory. More specifically, we define our settings as follows:
-
-Training task and model definition. We train Transformer models (e.g., Vision Transformer, BERT) on large-scale image or text datasets. The Transformer model \(\mathcal{F}\) has \(L\) layers, in which the \(i\)-th layer is composed of a forward computation function \(f_i\) and a corresponding set of parameters.
-
-Training infrastructure. Assume the training infrastructure contains a GPU cluster that has \(N\) GPU servers (i.e., nodes). Each node has \(I\) GPUs. Our cluster is homogeneous, meaning that each GPU and server has the same hardware configuration. Each GPU's memory capacity is \(M_{\text{GPU}}\). Servers are connected by a high-bandwidth network interface such as an InfiniBand interconnect.
-
-Pipeline parallelism. In each machine, we load a model \(\mathcal{F}\) into a pipeline \(\mathcal{P}\) which has \(K\) partitions (\(K\) also represents the pipeline length). The \(k\)-th partition \(p_k\) consists of consecutive layers. We assume each partition is handled by a single GPU device. \(1 \leq K \leq I\), meaning that we can build multiple pipelines for multiple model replicas in a single machine. We assume all GPU devices in a pipeline belong to the same machine. Our pipeline is a synchronous pipeline, which does not involve stale gradients, and the number of micro-batches is \(M\). In the Linux OS, each pipeline is handled by a single process.
We refer the reader to GPipe [10] for more details. - -Data parallelism. DDP is a cross-machine distributed data-parallel process group within R parallel workers. Each worker is a pipeline replica (a single process). The rth worker's index (ID) is rank r. For any two pipelines in DDP, they can belong to either the same GPU server or different GPU servers, and they can exchange gradients with the AllReduce algorithm. - -Under these settings, our goal is to accelerate training by leveraging freeze training, which does not require all layers to be trained throughout the duration of the training. Additionally, it may help save computation, communication, memory cost, and potentially prevent overfitting by consecutively freezing layers. However, these benefits can only be achieved by overcoming the four challenges of designing an adaptive freezing algorithm, dynamical pipeline re-partitioning, efficient resource reallocation, and cross-process caching, as discussed in the introduction. - - -
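To keep the notation straight, the setting above can be summarized in a small configuration object. This is only an illustrative sketch; the class and field names are ours and not part of the PipeTransformer API:

```python
from dataclasses import dataclass


@dataclass
class ClusterConfig:
    """Maps the symbols defined above onto a plain config object (illustrative only)."""
    num_nodes: int          # N: number of GPU servers
    gpus_per_node: int      # I: GPUs per server
    pipeline_len: int       # K: partitions per pipeline, with 1 <= K <= I
    num_micro_batches: int  # M: micro-batches per mini-batch
    gpu_memory_gb: float    # M_GPU: per-GPU memory capacity

    @property
    def num_pipeline_replicas(self) -> int:
        # R: data-parallel width, i.e., how many K-GPU pipelines fit in the cluster
        return (self.num_nodes * self.gpus_per_node) // self.pipeline_len


# Example: 2 servers x 8 GPUs with an 8-GPU pipeline gives R = 2 DDP workers.
cfg = ClusterConfig(num_nodes=2, gpus_per_node=8, pipeline_len=8,
                    num_micro_batches=32, gpu_memory_gb=16.0)
print(cfg.num_pipeline_replicas)
```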

        -Overview -
        -Figure 5. Overview of PipeTransformer Training System -

        - -PipeTransformer co-designs an on-the-fly freeze algorithm and an automated elastic pipelining training system that can dynamically transform the scope of the pipelined model and the number of pipeline replicas. The overall system architecture is illustrated in Figure 5. To support PipeTransformer’s elastic pipelining, we maintain a customized version of PyTorch Pipeline. For data parallelism, we use PyTorch DDP as a baseline. Other libraries are standard mechanisms of an operating system (e.g., multi-processing) and thus avoid specialized software or hardware customization requirements. To ensure the generality of our framework, we have decoupled the training system into four core components: freeze algorithm, AutoPipe, AutoDP, and AutoCache. The freeze algorithm (grey) samples indicators from the training loop and makes layer-wise freezing decisions, which will be shared with AutoPipe (green). AutoPipe is an elastic pipeline module that speeds up training by excluding frozen layers from the pipeline and packing the active layers into fewer GPUs (pink), leading to both fewer cross-GPU communications and smaller pipeline bubbles. Subsequently, AutoPipe passes pipeline length information to AutoDP (purple), which then spawns more pipeline replicas to increase data-parallel width, if possible. The illustration also includes an example in which AutoDP introduces a new replica (purple). AutoCache (orange edges) is a cross-pipeline caching module, as illustrated by connections between pipelines. The source code architecture is aligned with Figure 5 for readability and generality. - - -# Implementation Using PyTorch APIs - -As can be seen from Figure 5, PipeTransformer contains four components: Freeze Algorithm, AutoPipe, AutoDP, and AutoCache. Among them, AutoPipe and AutoDP rely on PyTorch Pipeline (`torch.distributed.pipeline`) and PyTorch DDP (`torch.nn.parallel.DistributedDataParallel`) [1], respectively. In this blog, we only highlight the key implementation details of AutoPipe and AutoDP. For details of the Freeze Algorithm and AutoCache, please refer to our paper. - -## AutoPipe: Elastic Pipelining - -AutoPipe can accelerate training by excluding frozen layers from the pipeline and packing the active layers into fewer GPUs. This section elaborates on the key components of AutoPipe that dynamically 1) partition pipelines, 2) minimize the number of pipeline devices, and 3) optimize mini-batch chunk size accordingly. - -### Basic Usage of PyTorch Pipeline - -Before diving into the details of AutoPipe, let us warm up with the basic usage of PyTorch Pipeline (`torch.distributed.pipeline.sync.Pipe`, see [this tutorial](https://pytorch.org/docs/stable/pipeline.html)). More specifically, we present a simple example to understand the design of Pipeline in practice: - -```python -# Imports needed to run this example; Pipe relies on the RPC framework being initialized -import os -import torch -from torch import nn -from torch.distributed import rpc -from torch.distributed.pipeline.sync import Pipe - -os.environ['MASTER_ADDR'] = 'localhost' -os.environ['MASTER_PORT'] = '29500' -rpc.init_rpc('worker', rank=0, world_size=1) - -# Step 1: build a model including two linear layers -fc1 = nn.Linear(16, 8).cuda(0) -fc2 = nn.Linear(8, 4).cuda(1) - -# Step 2: wrap the two layers with nn.Sequential -model = nn.Sequential(fc1, fc2) - -# Step 3: build Pipe (torch.distributed.pipeline.sync.Pipe) -model = Pipe(model, chunks=8) - -# do training/inference -input = torch.rand(16, 16).cuda(0) -output_rref = model(input) -``` - -In this basic example, we can see that before initializing `Pipe`, we need to partition the model `nn.Sequential` into multiple GPU devices and set the optimal chunk number (`chunks`). 
Balancing computation time across partitions is critical to pipeline training speed, as skewed workload distributions across stages can lead to stragglers, forcing devices with lighter workloads to wait. The chunk number may also have a non-trivial influence on the throughput of the pipeline. - - -### Balanced Pipeline Partitioning - -In a dynamic training system such as PipeTransformer, maintaining optimally balanced partitions in terms of parameter numbers does not guarantee the fastest training speed because other factors also play a crucial role: - -

        - -
        -Figure 6. The partition boundary is in the middle of a skip connection -

        - -1. Cross-partition communication overhead. Placing a partition boundary in the middle of a skip connection leads to additional communications since tensors in the skip connection must now be copied to a different GPU. For example, with BERT partitions in Figure 6, partition k must take intermediate outputs from both partition k-2 and partition k-1. In contrast, if the boundary is placed after the addition layer, the communication overhead between partition k-1 and k is visibly smaller. Our measurements show that having cross-device communication is more expensive than having slightly imbalanced partitions (see the Appendix in our paper). Therefore, we do not consider breaking skip connections (highlighted separately as an entire attention layer and MLP layer in green color at line 7 in Algorithm 1). - -2. Frozen layer memory footprint. During training, AutoPipe must recompute partition boundaries several times to balance two distinct types of layers: frozen layers and active layers. The frozen layer's memory cost is a fraction of that of an active layer, given that the frozen layer does not need backward activation maps, optimizer states, and gradients. Instead of launching intrusive profilers to obtain thorough metrics on memory and computational cost, we define a tunable cost factor \lambda_{\text{frozen}} to estimate the memory footprint ratio of a frozen layer over the same active layer. Based on empirical measurements on our experimental hardware, we set it to \frac{1}{6}. - - - -
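To make the cost factor concrete, here is a minimal sketch of a parameter-count proxy that discounts frozen layers by \lambda_{\text{frozen}}; the helper name and structure are ours, not the actual AutoPipe implementation:

```python
from torch import nn

LAMBDA_FROZEN = 1 / 6  # tunable cost factor discussed above

def partition_cost(layers, num_frozen):
    """Parameter-count proxy for a partition's memory footprint:
    the first `num_frozen` layers are discounted by LAMBDA_FROZEN."""
    cost = 0.0
    for i, layer in enumerate(layers):
        num_params = sum(p.numel() for p in layer.parameters())
        cost += num_params * (LAMBDA_FROZEN if i < num_frozen else 1.0)
    return cost

# Toy example: 12 identical layers, the first 4 frozen.
layers = [nn.Linear(256, 256) for _ in range(12)]
print(partition_cost(layers, num_frozen=4))
```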

        - -
        -

        - -Based on the above two considerations, AutoPipe balances pipeline partitions based on parameter sizes. More specifically, AutoPipe uses a greedy algorithm to allocate all frozen and active layers so that the partitioned sublayers are evenly distributed across K GPU devices. Pseudocode is described as the `load_balance()` function in Algorithm 1. The frozen layers are extracted from the original model and kept in a separate model instance \mathcal{F}_{\text{frozen}} in the first device of a pipeline. - -Note that the partition algorithm employed in this paper is not the only option; PipeTransformer is modularized to work with any alternatives. - - -### Pipeline Compression - -Pipeline compression helps to free up GPUs to accommodate more pipeline replicas and reduce the number of cross-device communications between partitions. To determine the timing of compression, we can estimate the memory cost of the largest partition after compression, and then compare it with that of the largest partition of a pipeline at timestep T=0. To avoid extensive memory profiling, the compression algorithm uses the parameter size as a proxy for the training memory footprint. Based on this simplification, the criterion of pipeline compression is as follows: - -

        - -
        -

        - -Once the freeze notification is received, AutoPipe will always attempt to divide the pipeline length K by 2 (e.g., from 8 to 4, then 2). By using \frac{K}{2} as the input, the compression algorithm can verify if the result satisfies the criterion in Equation (1). Pseudocode is shown in lines 25-33 in Algorithm 1. Note that this compression makes the acceleration ratio exponentially increase during training, meaning that if a GPU server has a larger number of GPUs (e.g., more than 8), the acceleration ratio will be further amplified. - -
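The halving logic can be sketched in a few lines. The code below only illustrates the idea under the parameter-size proxy described earlier; `repartition` stands in for the `load_balance()` step of Algorithm 1, and the check is a simplified stand-in for Equation (1):

```python
def max_partition_params(partitions):
    """Largest per-partition parameter count, used as the memory proxy."""
    return max(
        sum(p.numel() for layer in part for p in layer.parameters())
        for part in partitions
    )

def compress_pipeline(active_layers, K, initial_max_params, repartition):
    """Repeatedly try to halve the pipeline length K while the largest
    partition stays within the proxy measured at timestep T=0."""
    while K > 1:
        candidate = repartition(active_layers, K // 2)
        if max_partition_params(candidate) > initial_max_params:
            break
        K //= 2
    return K
```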

        - -
        -Figure 7. Pipeline Bubble: F_{d,b}, B_{d,b}, and U_d denote forward, backward, and the optimizer update of micro-batch b on device d, respectively. The total bubble size in each iteration is (K-1) times the per-micro-batch forward and backward cost. -

        - -Additionally, such a technique can also speed up training by shrinking the size of pipeline bubbles. To explain bubble sizes in a pipeline, Figure 7 depicts how 4 micro-batches run through a 4-device pipeline (K = 4). In general, the total bubble size is (K-1) times the per-micro-batch forward and backward cost. Therefore, it is clear that shorter pipelines have smaller bubble sizes. - -### Dynamic Number of Micro-Batches - -Prior pipeline parallel systems use a fixed number of micro-batches per mini-batch (M). GPipe suggests M \geq 4 \times K, where K is the number of partitions (pipeline length). However, given that PipeTransformer dynamically configures K, we find it to be sub-optimal to maintain a static M during training. Moreover, when integrated with DDP, the value of M also has an impact on the efficiency of DDP gradient synchronizations. Since DDP must wait for the last micro-batch to finish its backward computation on a parameter before launching its gradient synchronization, finer micro-batches lead to a smaller overlap between computation and communication. Hence, instead of using a static value, PipeTransformer searches for the optimal M on the fly in the hybrid DDP environment by enumerating M values ranging from K to 6K. For a specific training environment, the profiling only needs to be done once (see Algorithm 1 line 35). - -For the complete source code, please refer to `https://github.com/Distributed-AI/PipeTransformer/blob/master/pipe_transformer/pipe/auto_pipe.py`. - -## AutoDP: Spawning More Pipeline Replicas -As AutoPipe compresses the same pipeline into fewer GPUs, AutoDP can automatically spawn new pipeline replicas to increase data-parallel width. - -Despite the conceptual simplicity, subtle dependencies on communications and states require careful design. The challenges are threefold: - -1. DDP Communication: Collective communication in PyTorch DDP requires static membership, which prevents new pipelines from connecting with existing ones; - -2. State Synchronization: newly activated processes must be consistent with existing pipelines in the training progress (e.g., epoch number and learning rate), weights and optimizer states, the boundary of frozen layers, and pipeline GPU range; - -3. Dataset Redistribution: the dataset should be re-balanced to match a dynamic number of pipelines. This not only avoids stragglers but also ensures that gradients from all DDP processes are equally weighted. - -

        - -
        -Figure 8. AutoDP: handling dynamic data parallelism with messaging between double process groups (processes 0-7 belong to machine 0, while processes 8-15 belong to machine 1) -

        - - - -To tackle these challenges, we create double communication process groups for DDP. As in the example shown in Figure 8, the message process group (purple) is responsible for lightweight control messages and covers all processes, while the active training process group (yellow) only contains active processes and serves as a vehicle for heavyweight tensor communications during training. The message group remains static, whereas the training group is dismantled and reconstructed to match the active processes. -At T0, only processes 0 and 8 are active. During the transition to T1, process 0 activates processes 1 and 9 (newly added pipeline replicas) and synchronizes the necessary information mentioned above using the message group. The four active processes then form a new training group, allowing static collective communications to adapt to dynamic memberships. -To redistribute the dataset, we implement a variant of DistributedSampler that can seamlessly adjust data samples to match the number of active pipeline replicas. - -The above design also naturally helps to reduce DDP communication overhead. More specifically, when transitioning from T0 to T1, processes 0 and 8 destroy the existing DDP instances, and the active processes construct a new DDP training group using a cached pipelined model (AutoPipe stores the frozen model and the cached model separately). - -We use the following APIs to implement the design above. - -```python -import torch.distributed as dist -from torch.nn.parallel import DistributedDataParallel as DDP -# imports needed by the calls below -from torch.distributed import Backend -from datetime import timedelta - -# initialize the process group (this must be called in the initialization of PyTorch DDP) -dist.init_process_group(init_method='tcp://' + str(self.config.master_addr) + ':' + -str(self.config.master_port), backend=Backend.GLOO, rank=self.global_rank, world_size=self.world_size) -... - -# create active training process group (yellow color) -self.active_process_group = dist.new_group(ranks=self.active_ranks, backend=Backend.NCCL, timeout=timedelta(days=365)) -... - -# create message process group (purple color) -self.comm_broadcast_group = dist.new_group(ranks=[i for i in range(self.world_size)], backend=Backend.GLOO, timeout=timedelta(days=365)) -... - -# create the DDP-enabled model when the number of data-parallel workers is changed. Note: -# 1. process_group is the process group to be used for distributed data all-reduction. -#    If None, the default process group, which is created by torch.distributed.init_process_group, will be used. -#    In our case, we set it to self.active_process_group. -# 2. device_ids should be set when the pipeline length = 1 (the model resides on a single CUDA device). - -self.pipe_len = gpu_num_per_process -if gpu_num_per_process > 1: - model = DDP(model, process_group=self.active_process_group, find_unused_parameters=True) -else: - model = DDP(model, device_ids=[self.local_rank], process_group=self.active_process_group, find_unused_parameters=True) - -# to broadcast a message among processes, we use dist.broadcast_object_list -def dist_broadcast(object_list, src, group): - """Broadcasts a given object to all parties.""" - dist.broadcast_object_list(object_list, src, group=group) - return object_list -``` -For the complete source code, please refer to `https://github.com/Distributed-AI/PipeTransformer/blob/master/pipe_transformer/dp/auto_dp.py`. - -# Experiments - -This section first summarizes experiment setups and then evaluates PipeTransformer using computer vision and natural language processing tasks. - -Hardware. 
Experiments were conducted on 2 identical machines connected by InfiniBand CX353A (5GB/s), where each machine is equipped with 8 NVIDIA Quadro RTX 5000 (16GB GPU memory). GPU-to-GPU bandwidth within a machine (PCI 3.0, 16 lanes) is 15.754GB/s. - -Implementation. We used PyTorch Pipe as a building block. The BERT model definition, configuration, and related tokenizer are from HuggingFace 3.5.0. We implemented Vision Transformer using PyTorch by following its TensorFlow implementation. More details can be found in our source code. - -Models and Datasets. Experiments employ two representative Transformers in CV and NLP: Vision Transformer (ViT) and BERT. ViT was run on an image classification task, initialized with pre-trained weights on ImageNet21K and fine-tuned on ImageNet and CIFAR-100. BERT was run on two tasks, text classification on the SST-2 dataset from the General Language Understanding Evaluation (GLUE) benchmark, and question answering on the SQuAD v1.1 Dataset (Stanford Question Answering), which is a collection of 100k crowdsourced question/answer pairs. - -Training Schemes. Given that large models would normally require thousands of GPU-days (e.g., GPT-3) if trained from scratch, fine-tuning downstream tasks using pre-trained models has become a trend in the CV and NLP communities. Moreover, PipeTransformer is a complex training system that involves multiple core components. Thus, for the first version of PipeTransformer system development and algorithmic research, it is not cost-efficient to develop and evaluate from scratch using large-scale pre-training. Therefore, the experiments presented in this section focus on pre-trained models. Note that since the model architectures in pre-training and fine-tuning are the same, PipeTransformer can serve both. We discuss pre-training results in the Appendix. - -Baseline. Experiments in this section compare PipeTransformer to the state-of-the-art framework, a hybrid scheme of PyTorch Pipeline (PyTorch’s implementation of GPipe) and PyTorch DDP. Since this is the first paper that studies accelerating distributed training by freezing layers, there are no perfectly aligned counterpart solutions yet. - -Hyper-parameters. Experiments use ViT-B/16 (12 transformer layers, 16 \times 16 input patch size) for ImageNet and CIFAR-100, BERT-large-uncased (24 layers) for SQuAD 1.1, and BERT-base-uncased (12 layers) for SST-2. With PipeTransformer, ViT and BERT training can set the per-pipeline batch size to around 400 and 64, respectively. Other hyperparameters (e.g., epoch, learning rate) for all experiments are presented in the Appendix. - -## Overall Training Acceleration -

        - -
        -

        - -We summarize the overall experimental results in the table above. Note that the speedup we report is based on a conservative \alpha (\frac{1}{3}) value that can obtain comparable or even higher accuracy. A more aggressive \alpha (\frac{2}{5}, \frac{1}{2}) can obtain a higher speedup but may lead to a slight loss in accuracy. Note that the model size of BERT (24 layers) is larger than ViT-B/16 (12 layers), thus it takes more time for communication. - -## Performance Analysis - -### Speedup Breakdown - -This section presents evaluation results and analyzes the performance of different components in AutoPipe. More experimental results can be found in the Appendix. - -

        - -
        -Figure 9. Speedup Breakdown (ViT on ImageNet) -

        - -To understand the efficacy of all four components and their impacts on training speed, we experimented with different combinations and used their training sample throughput (samples/second) and speedup ratio as metrics. Results are illustrated in Figure 9. Key takeaways from these experimental results are: - -1. the main speedup is the result of elastic pipelining which is achieved through the joint use of AutoPipe and AutoDP; -2. AutoCache's contribution is amplified by AutoDP; -3. freeze training alone without system-wise adjustment even downgrades the training speed. - -### Tuning \alpha in Freezing Algorithm - -

        - -
        -Figure 10. Tuning \alpha in Freezing Algorithm -

        - -We ran experiments to show how the \alpha in the freeze algorithm influences training speed. The result clearly demonstrates that a larger \alpha (excessive freezing) leads to a greater speedup but suffers from a slight performance degradation. In the case shown in Figure 10, where \alpha=1/5, freeze training outperforms normal training and obtains a 2.04-fold speedup. We provide more results in the Appendix. - -### Optimal Chunks in the Elastic Pipeline - -

        - -
        -Figure 11. Optimal chunk number in the elastic pipeline -

        - -We profiled the optimal number of micro-batches M for different pipeline lengths K. Results are summarized in Figure 11. As we can see, different K values lead to different optimal M, and the throughput gaps across different M values are large (as shown when K=8), which confirms the necessity of an up-front profiler in elastic pipelining. - -### Understanding the Timing of Caching - -

        - -
        -Figure 12. The timing of caching -

        - -To evaluate AutoCache, we compared the sample throughput of training that activates AutoCache from epoch 0 (blue) with the training job without AutoCache (red). Figure 12 shows that enabling caching too early can slow down training, as caching can be more expensive than the forward propagation on a small number of frozen layers. After more layers are frozen, caching activations clearly outperform the corresponding forward propagation. As a result, AutoCache uses a profiler to determine the proper timing to enable caching. In our system, for ViT (12 layers), caching starts from 3 frozen layers, while for BERT (24 layers), caching starts from 5 frozen layers. - -For more detailed experimental analysis, please refer to our paper. - -# Summarization -This blog introduces PipeTransformer, a holistic solution that combines elastic pipeline-parallel and data-parallel for distributed training using PyTorch Distributed APIs. More specifically, PipeTransformer incrementally freezes layers in the pipeline, packs remaining active layers into fewer GPUs, and forks more pipeline replicas to increase the data-parallel width. Evaluations on ViT and BERT models show that compared to the state-of-the-art baseline, PipeTransformer attains up to 2.83× speedups without accuracy loss. - - -# Reference - -[1] Li, S., Zhao, Y., Varma, R., Salpekar, O., Noordhuis, P., Li,T., Paszke, A., Smith, J., Vaughan, B., Damania, P., et al. Pytorch Distributed: Experiences on Accelerating Dataparallel Training. Proceedings of the VLDB Endowment,13(12), 2020 - -[2] Devlin, J., Chang, M. W., Lee, K., and Toutanova, K. BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding. In NAACL-HLT, 2019 - -[3] Dosovitskiy, A., Beyer, L., Kolesnikov, A., Weissenborn, D., Zhai, X., Unterthiner, T., Dehghani, M., Minderer, M., Heigold, G., Gelly, S., et al. An image is Worth 16x16 words: Transformers for Image Recognition at Scale. - -[4] Brown, T. B., Mann, B., Ryder, N., Subbiah, M., Kaplan, J., Dhariwal, P., Neelakantan, A., Shyam, P., Sastry, G., Askell, A., et al. Language Models are Few-shot Learners. - -[5] Lepikhin, D., Lee, H., Xu, Y., Chen, D., Firat, O., Huang, Y., Krikun, M., Shazeer, N., and Chen, Z. Gshard: Scaling Giant Models with Conditional Computation and Automatic Sharding. - -[6] Li, M., Andersen, D. G., Park, J. W., Smola, A. J., Ahmed, A., Josifovski, V., Long, J., Shekita, E. J., and Su, B. Y. Scaling Distributed Machine Learning with the Parameter Server. In 11th {USENIX} Symposium on Operating Systems Design and Implementation ({OSDI} 14), pp. 583–598, 2014. - -[7] Jiang, Y., Zhu, Y., Lan, C., Yi, B., Cui, Y., and Guo, C. A Unified Architecture for Accelerating Distributed DNN Training in Heterogeneous GPU/CPU Clusters. In 14th USENIX Symposium on Operating Systems Design and Implementation (OSDI 20), pp. 463–479. USENIX Association, November 2020. ISBN 978-1-939133-19- 9. - -[8] Kim, S., Yu, G. I., Park, H., Cho, S., Jeong, E., Ha, H., Lee, S., Jeong, J. S., and Chun, B. G. Parallax: Sparsity-aware Data Parallel Training of Deep Neural Networks. In Proceedings of the Fourteenth EuroSys Conference 2019, pp. 1–15, 2019. - -[9] Kim, C., Lee, H., Jeong, M., Baek, W., Yoon, B., Kim, I., Lim, S., and Kim, S. TorchGPipe: On-the-fly Pipeline Parallelism for Training Giant Models. - -[10] Huang, Y., Cheng, Y., Bapna, A., Firat, O., Chen, M. X., Chen, D., Lee, H., Ngiam, J., Le, Q. V., Wu, Y., et al. Gpipe: Efficient Training of Giant Neural Networks using Pipeline Parallelism. 
- -[11] Park, J. H., Yun, G., Yi, C. M., Nguyen, N. T., Lee, S., Choi, J., Noh, S. H., and ri Choi, Y. Hetpipe: Enabling Large DNN Training on (whimpy) Heterogeneous GPU Clusters through Integration of Pipelined Model Parallelism and Data Parallelism. In 2020 USENIX Annual Technical Conference (USENIX ATC 20), pp. 307–321. USENIX Association, July 2020. ISBN 978-1-939133- 14-4. - -[12] Narayanan, D., Harlap, A., Phanishayee, A., Seshadri, V., Devanur, N. R., Ganger, G. R., Gibbons, P. B., and Zaharia, M. Pipedream: Generalized Pipeline Parallelism for DNN Training. In Proceedings of the 27th ACM Symposium on Operating Systems Principles, SOSP ’19, pp. 1–15, New York, NY, USA, 2019. Association for Computing Machinery. ISBN 9781450368735. doi: 10.1145/3341301.3359646. - -[13] Lepikhin, D., Lee, H., Xu, Y., Chen, D., Firat, O., Huang, Y., Krikun, M., Shazeer, N., and Chen, Z. Gshard: Scaling Giant Models with Conditional Computation and Automatic Sharding. - -[14] Shazeer, N., Cheng, Y., Parmar, N., Tran, D., Vaswani, A., Koanantakool, P., Hawkins, P., Lee, H., Hong, M., Young, C., Sepassi, R., and Hechtman, B. Mesh-Tensorflow: Deep Learning for Supercomputers. In Bengio, S., Wallach, H., Larochelle, H., Grauman, K., Cesa-Bianchi, N., and Garnett, R. (eds.), Advances in Neural Information Processing Systems, volume 31, pp. 10414–10423. Curran Associates, Inc., 2018. - -[15] Shoeybi, M., Patwary, M., Puri, R., LeGresley, P., Casper, J., and Catanzaro, B. Megatron-LM: Training Multi-billion Parameter Language Models using Model Parallelism. - -[16] Rajbhandari, S., Rasley, J., Ruwase, O., and He, Y. ZERO: Memory Optimization towards Training a Trillion Parameter Models. - -[17] Raghu, M., Gilmer, J., Yosinski, J., and Sohl Dickstein, J. Svcca: Singular Vector Canonical Correlation Analysis for Deep Learning Dynamics and Interpretability. In NIPS, 2017. - -[18] Morcos, A., Raghu, M., and Bengio, S. Insights on Representational Similarity in Neural Networks with Canonical Correlation. In Bengio, S., Wallach, H., Larochelle, H., Grauman, K., Cesa-Bianchi, N., and Garnett, R. (eds.), Advances in Neural Information Processing Systems 31, pp. 5732–5741. Curran Associates, Inc., 2018. diff --git a/_posts/2021-8-3-pytorch-profiler-1.9-released.md b/_posts/2021-8-3-pytorch-profiler-1.9-released.md deleted file mode 100644 index 9b820ff60416..000000000000 --- a/_posts/2021-8-3-pytorch-profiler-1.9-released.md +++ /dev/null @@ -1,217 +0,0 @@ ---- -layout: blog_detail -title: 'What’s New in PyTorch Profiler 1.9?' -author: Sabrina Smai, Program Manager on the AI Framework team at Microsoft ---- - -PyTorch Profiler v1.9 has been released! The goal of this new release (previous [PyTorch Profiler release](https://pytorch.org/blog/introducing-pytorch-profiler-the-new-and-improved-performance-tool/)) is to provide you with new state-of-the-art tools to help diagnose and fix machine learning performance issues regardless of whether you are working on one or numerous machines. The objective is to target the execution steps that are the most costly in time and/or memory, and visualize the work load distribution between GPUs and CPUs. - -Here is a summary of the five major features being released: - -1. **Distributed Training View**: This helps you understand how much time and memory is consumed in your distributed training job. Many issues occur when you take a training model and split the load into worker nodes to be run in parallel as it can be a black box. The overall model goal is to speed up model training. 
This distributed training view will help you diagnose and debug issues within individual nodes. -2. **Memory View**: This view allows you to understand your memory usage better. This tool will help you avoid the famously pesky Out of Memory error by showing active memory allocations at various points of your program run. -3. **GPU Utilization Visualization**: This tool helps you make sure that your GPU is being fully utilized. -4. **Cloud Storage Support**: The TensorBoard plugin can now read profiling data from Azure Blob Storage, Amazon S3, and Google Cloud Platform. -5. **Jump to Source Code**: This feature allows you to visualize stack tracing information and jump directly into the source code. This helps you quickly optimize and iterate on your code based on your profiling results. - -## Getting Started with PyTorch Profiling Tool -PyTorch includes a profiling functionality called "PyTorch Profiler". The PyTorch Profiler tutorial can be found [here](https://pytorch.org/tutorials/intermediate/tensorboard_profiler_tutorial.html). - -To instrument your PyTorch code for profiling, you must: - -$ pip install torch-tb-profiler - -```python -import torch.profiler as profiler -with profiler.profile(XXXX): -``` - -**Comments**: - -• For CUDA and CPU profiling, see [below](https://github.com/pytorch/kineto/blob/master/tb_plugin/examples/resnet50_profiler_api.py): -``` -with torch.profiler.profile( -    activities=[ -        torch.profiler.ProfilerActivity.CPU, -        torch.profiler.ProfilerActivity.CUDA], -) as prof: -    ... -``` - -• `with profiler.record_function("$NAME"):` lets you label a block of code with a name (a tag) so it can be identified in the trace - -• The `profile_memory=True` parameter of `profiler.profile` allows you to profile the CPU and GPU memory footprint - -## Visualizing PyTorch Model Performance using PyTorch Profiler - -### Distributed Training - -Recent advances in deep learning argue for the value of large datasets and large models, which requires scaling out model training to more computational resources. Distributed Data Parallel (DDP) and the NVIDIA Collective Communications Library (NCCL) are the widely adopted paradigms in PyTorch for accelerating deep learning training. - -In this release of PyTorch Profiler, DDP with NCCL backend is now supported. - -
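Putting the instrumentation pieces from the Getting Started section above together, a minimal end-to-end sketch could look like this (the model, data, and log directory are placeholders of our own, and a CUDA device is assumed):

```python
import torch
import torch.profiler
from torch import nn

model = nn.Linear(512, 512).cuda()                          # placeholder model
batches = [torch.randn(32, 512).cuda() for _ in range(10)]  # placeholder data

with torch.profiler.profile(
    activities=[torch.profiler.ProfilerActivity.CPU,
                torch.profiler.ProfilerActivity.CUDA],
    schedule=torch.profiler.schedule(wait=1, warmup=1, active=3),
    on_trace_ready=torch.profiler.tensorboard_trace_handler("./log/example"),
    record_shapes=True,
    profile_memory=True,
) as prof:
    for batch in batches:
        with torch.profiler.record_function("forward"):
            out = model(batch)
        out.sum().backward()
        prof.step()  # tell the profiler that one training step has finished
```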
        - -
        - -### Computation/Communication Overview - -In the Computation/Communication overview under the Distributed training view, you can observe the computation-to-communication ratio of each worker and how well [load is balanced](https://en.wikipedia.org/wiki/Load_balancing_(computing)) across worker nodes at the selected granularity. - -**Scenario 1**: - -If the computation and overlapping time of one worker is much larger than the others, this may suggest an issue in the workload balance, or that this worker is a straggler. Computation is the sum of kernel time on GPU minus the overlapping time. The overlapping time is the time saved by interleaving communications during computation. More overlapping time indicates better parallelism between computation and communication. Ideally, computation and communication would completely overlap with each other. Communication is the total communication time minus the overlapping time. The example image below displays how this scenario appears in TensorBoard. - -
        - -

        Figure: A straggler example

        -
        - -**Scenario 2**: - -If there is a small batch size (i.e. less computation on each worker) or the data to be transferred is large, the computation-to-communication ratio may also be small, which shows up in the profiler as low GPU utilization and long waiting times. This computation/communication view will allow you to diagnose your code to reduce communication by adopting gradient accumulation, or to decrease the communication proportion by increasing the batch size. DDP communication time depends on model size, and batch size has no relationship with model size. So increasing the batch size makes computation time longer and the computation-to-communication ratio bigger. - -### Synchronizing/Communication Overview - -In the Synchronizing/Communication view, you can observe the efficiency of communication. This is done by taking the step time minus the computation and communication time. Synchronizing time is the part of the total communication time spent waiting for and synchronizing with other workers. The Synchronizing/Communication view includes initialization, data loader, CPU computation, and so on. Insights such as what fraction of the total communication time is really used for exchanging data, and how long workers sit idle waiting for data from other workers, can be drawn from this view. - -
        - -
        - -For example, if there is an inefficient workload balance or straggler issue, you’ll be able to identify it in this Synchronizing/Communication view. This view will show several workers’ waiting time being longer than others. - -
        - -
        - -The table view above allows you to see the detailed statistics of all communication ops in each node. This allows you to see what operation types are being called, how many times each op is called, what is the size of the data being transferred by each op, etc. - -### Memory View: - -This memory view tool helps you understand the hardware resource consumption of the operators in your model. Understanding time and memory consumption at the operator level allows you to resolve performance bottlenecks and, in turn, allows your model to execute faster. Given limited GPU memory size, optimizing the memory usage can: - -1. Allow a bigger model, which can potentially generalize better on end-level tasks. -2. Allow a bigger batch size. Bigger batch sizes can increase training speed. - -The profiler records all memory allocations during the profiling interval. Selecting the "Device" will allow you to see each operator's memory usage on the GPU side or host side. You must enable ```profile_memory=True``` to generate the below memory data as shown [here](https://github.com/pytorch/kineto/blob/master/tb_plugin/examples/resnet50_profiler_api.py#L39). - -``` -with torch.profiler.profile( -    profile_memory=True  # this will take 1 – 2 minutes to complete. -) as prof: -    ... -``` - -**Important Definitions**: - -• "Size Increase" displays the sum of all allocation bytes minus all the memory release bytes. - -• "Allocation Size" shows the sum of all allocation bytes without considering the memory release. - -• "Self" means the allocated memory does not come from any child operators, but from the operator itself. - -
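Expanding the fragment above into something runnable, a minimal sketch might look like the following; the model and input are placeholders, and the sort key assumes a CUDA run:

```python
import torch
import torch.profiler
from torch import nn

model = nn.Sequential(nn.Linear(1024, 1024), nn.ReLU(), nn.Linear(1024, 10)).cuda()
x = torch.randn(64, 1024).cuda()

with torch.profiler.profile(
    activities=[torch.profiler.ProfilerActivity.CPU,
                torch.profiler.ProfilerActivity.CUDA],
    profile_memory=True,   # record allocations so the Memory View has data
    record_shapes=True,
) as prof:
    model(x).sum().backward()

# Operator-level memory statistics, largest GPU consumers first.
print(prof.key_averages().table(sort_by="self_cuda_memory_usage", row_limit=10))
```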
        - -
        - - -### GPU Metric on Timeline: - -This feature will help you debug performance issues when one or more GPUs are underutilized. Ideally, your program should have high GPU utilization (aiming for 100% GPU utilization), minimal CPU-to-GPU communication, and no overhead. - -**Overview**: -The overview page highlights the results of three important GPU usage metrics at different levels (i.e., GPU Utilization, Est. SM Efficiency, and Est. Achieved Occupancy). Essentially, each GPU has many SMs, and each SM has many warps that can execute many threads concurrently; how many depends on the GPU. At a high level, this GPU Metric on Timeline tool lets you see the whole stack, which is useful. - -If the GPU utilization result is low, this suggests a potential bottleneck is present in your model. Common reasons: - -• Insufficient parallelism in kernels (i.e., low batch size) - -• Small kernels called in a loop, meaning the launch overheads are not amortized - -• CPU or I/O bottlenecks that leave the GPU without enough work to keep busy - -The performance recommendation section of the overview page is where you'll find potential suggestions on how to increase GPU utilization. In this example, GPU utilization was low, so the performance recommendation was to increase the batch size. Increasing the batch size from 4 to 32, as per the performance recommendation, increased the GPU Utilization by 60.68%. - -GPU Utilization: the fraction of the step interval in the profiler during which a GPU engine was executing a workload. The higher the utilization %, the better. The drawback of using GPU utilization alone to diagnose performance bottlenecks is that it is too high-level and coarse. It won't be able to tell you how many Streaming Multiprocessors are in use. Note that while this metric is useful for detecting periods of idleness, a high value does not indicate efficient use of the GPU, only that it is doing something at all. For instance, a kernel with a single thread running continuously will get a GPU Utilization of 100%. - -Estimated Stream Multiprocessor Efficiency (Est. SM Efficiency) is a finer-grained metric: it indicates what percentage of SMs are in use at any point in the trace. This metric reports the percentage of time where there is at least one active warp on an SM, including warps that are stalled (NVIDIA [doc](https://forums.developer.nvidia.com/t/nvprof-question-about-the-sm-efficiency-metric/72640#:~:text=My%20understanding%20from%20the%20profiler%20documentation%20is%20that,that%20%E2%80%9Cactive%20warps%E2%80%9D%20include%20warps%20that%20are%20stalled.)). Est. SM Efficiency also has its limitations. For instance, a kernel with only one thread per block can't fully use each SM. SM Efficiency does not tell us how busy each SM is, only that it is doing something at all, which can include stalling while waiting on the result of a memory load. To keep an SM busy, it is necessary to have a sufficient number of ready warps that can be run whenever a stall occurs. - -Estimated Achieved Occupancy (Est. Achieved Occupancy) is a layer deeper than Est. SM Efficiency and GPU Utilization for diagnosing performance issues. Estimated Achieved Occupancy indicates how many warps can be active at once per SM. Having a sufficient number of active warps is usually key to achieving good throughput. Unlike GPU Utilization and SM Efficiency, it is not a goal to make this value as high as possible. 
As a rule of thumb, good throughput gains can be had by improving this metric to 15% and above. But at some point you will hit diminishing returns. If the value is already at 30%, for example, further gains will be uncertain. This metric reports the average values of all warp schedulers for the kernel execution period (NVIDIA [doc](https://docs.nvidia.com/gameworks/content/developertools/desktop/analysis/report/cudaexperiments/kernellevel/achievedoccupancy.htm)). The larger the Est. Achieved Occupancy value, the better. -
        - -

        Overview details: Resnet50_batchsize4

        -
        - -
        - -

        Overview details: Resnet50_batchsize32

        -
        - -_Kernel View_ -The kernel has “Blocks per SM” and “Est. Achieved Occupancy” which is a great tool to compare model runs. - -
        - -
        - -Mean Blocks per SM: -Blocks per SM = Blocks of this kernel / SM number of this GPU. If this number is less than 1, it indicates the GPU multiprocessors are not fully utilized. “Mean Blocks per SM” is weighted average of all runs of this kernel name, using each run’s duration as weight. - -Mean Est. Achieved Occupancy: -Est. Achieved Occupancy is defined as above in overview. “Mean Est. Achieved Occupancy” is weighted average of all runs of this kernel name, using each run’s duration as weight. - -_Trace View_ -This trace view displays a timeline that shows the duration of operators in your model and which system executed the operation. This view can help you identify whether the high consumption and long execution is because of input or model training. Currently, this trace view shows GPU Utilization and Est. SM Efficiency on a timeline. - -
        - -
        - -GPU utilization is calculated independently and divided into multiple 10 millisecond buckets. The buckets’ GPU utilization values are drawn alongside the timeline between 0 – 100%. In the above example, the “ProfilerStep5” GPU utilization during thread 28022’s busy time is higher than the following the one during “Optimizer.step”. This is where you can zoom-in to investigate why that is. - -
        - -
        - -From above, we can see the former's kernels are longer than the latter's kernels. The latter's kernels are too short in execution, which results in lower GPU utilization. - -Est. SM Efficiency: Each kernel has a calculated est. SM efficiency between 0 – 100%. For example, the below kernel has only 64 blocks, while this GPU has 80 SMs. Then its "Est. SM Efficiency" is 64/80, which is 0.8. - -
        - -
        - -### Cloud Storage Support - -After running `pip install tensorboard`, install the plugin with the extra that matches your cloud provider to have profiling data read from it: - -``` sh -pip install torch-tb-profiler[blob]   # Azure Blob Storage -pip install torch-tb-profiler[gs]     # Google Cloud Storage -pip install torch-tb-profiler[s3]     # Amazon S3 -``` - -For more information, please refer to this [README](https://github.com/pytorch/kineto/tree/main/tb_plugin). - -### Jump to Source Code: - -One of the great benefits of having both TensorBoard and the PyTorch Profiler integrated directly in Visual Studio Code (VS Code) is the ability to jump directly to the source code (file and line) from the profiler stack traces. The VS Code Python Extension now [supports TensorBoard Integration](https://devblogs.microsoft.com/python/python-in-visual-studio-code-february-2021-release/). - -Jump to source is ONLY available when TensorBoard is launched within VS Code. Stack traces will appear in the plugin UI if profiling was run with `with_stack=True`. When you click on a stack trace from the PyTorch Profiler, VS Code will automatically open the corresponding file side by side and jump directly to the line of code of interest for you to debug. This allows you to quickly make actionable optimizations and changes to your code based on the profiling results and suggestions. - -
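For jump-to-source to have something to show, the profiling run has to record Python stack traces; a minimal sketch (model and log path are placeholders of our own):

```python
import torch
import torch.profiler
from torch import nn

model = nn.Linear(128, 128)
x = torch.randn(8, 128)

with torch.profiler.profile(
    on_trace_ready=torch.profiler.tensorboard_trace_handler("./log/jump_to_source"),
    with_stack=True,  # record stack traces so VS Code can open the file and line
) as prof:
    model(x).sum().backward()
```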
        - -

        Gify: Jump to Source using Visual Studio Code Plug In UI

        -
        - -For how to optimize batch size performance, check out the step-by-step tutorial [here](https://opendatascience.com/optimizing-pytorch-performance-batch-size-with-pytorch-profiler/). PyTorch Profiler is also integrated with PyTorch Lightning and you can simply launch your lightning training jobs with --```trainer.profiler=pytorch``` flag to generate the traces. - -## What’s Next for the PyTorch Profiler? -You just saw how PyTorch Profiler can help optimize a model. You can now try the Profiler by ```pip install torch-tb-profiler``` to optimize your PyTorch model. - -Look out for an advanced version of this tutorial in the future. We are also thrilled to continue to bring state-of-the-art tool to PyTorch users to improve ML performance. We'd love to hear from you. Feel free to open an issue [here](https://github.com/pytorch/kineto/issues). - -For new and exciting features coming up with PyTorch Profiler, follow @PyTorch on Twitter and check us out on pytorch.org. - -## Acknowledgements - -The author would like to thank the contributions of the following individuals to this piece. From the Facebook side: Geeta Chauhan, Gisle Dankel, Woo Kim, Sam Farahzad, and Mark Saroufim. On the Microsoft side: AI Framework engineers (Teng Gao, Mike Guo, and Yang Gu), Guoliang Hua, and Thuy Nguyen. - diff --git a/_posts/2021-8-31-computational-graphs-constructed-in-pytorch.md b/_posts/2021-8-31-computational-graphs-constructed-in-pytorch.md deleted file mode 100644 index c185cd6b00f4..000000000000 --- a/_posts/2021-8-31-computational-graphs-constructed-in-pytorch.md +++ /dev/null @@ -1,482 +0,0 @@ ---- -layout: blog_detail -title: 'How Computational Graphs are Constructed in PyTorch' -author: Preferred Networks -featured-img: 'assets/images/augmented_computational_graph.png' ---- - -In the previous [post](https://pytorch.org/blog/overview-of-pytorch-autograd-engine/) we went over the theoretical foundations of automatic differentiation and reviewed the implementation in PyTorch. In this post, we will be showing the parts of PyTorch involved in creating the graph and executing it. In order to understand the following contents, please read @ezyang’s wonderful [blog post](http://blog.ezyang.com/2019/05/pytorch-internals/) about PyTorch internals. - -# Autograd components - -First of all, let’s look at where the different components of autograd live: - -[tools/autograd](https://github.com/pytorch/pytorch/tree/release/1.9/tools/autograd): Here we can find the definition of the derivatives as we saw in the previous post [derivatives.yaml](https://github.com/pytorch/pytorch/blob/release/1.9/tools/autograd/derivatives.yaml), several python scripts and a folder called [templates](https://github.com/pytorch/pytorch/tree/release/1.9/tools/autograd/templates). These scripts and the templates are used at building time to generate the C++ code for the derivatives as specified in the yaml file. Also, the scripts here generate wrappers for the regular ATen functions so that the computational graph can be constructed. - -[torch/autograd](https://github.com/pytorch/pytorch/tree/release/1.9/torch/autograd): This folder is where the autograd components that can be used directly from python are located. In [function.py](https://github.com/pytorch/pytorch/blob/release/1.9/torch/autograd/function.py) we find the actual definition of `torch.autograd.Function`, a class used by users to write their own differentiable functions in python as per the documentation. 
[functional.py](https://github.com/pytorch/pytorch/blob/release/1.9/torch/autograd/functional.py) holds components for functionally computing the jacobian vector product, hessian, and other gradient related computations of a given function. -The rest of the files have additional components such as gradient checkers, anomaly detection, and the autograd profiler. - -[torch/csrc/autograd](https://github.com/pytorch/pytorch/tree/release/1.9/torch/csrc/autograd): This is where the graph creation and execution-related code lives. -All this code is written in C++, since it is a critical part that is required to be extremely performant. Here we have several files that implement the engine, metadata storage, and all the needed components. Alongside this, we have several files whose names start with `python_`, and their main responsibility is to allow python objects to be used in the autograd engine. - -# Graph Creation - -[Previously](https://pytorch.org/blog/overview-of-pytorch-autograd-engine/), we described the creation of a computational graph. Now, we will see how PyTorch creates these graphs with references to the actual codebase. - -

        - -
        -Figure 1: Example of an augmented computational graph -

        - -It all starts in our Python code, when we request that a tensor require gradients. - -```py ->>> x = torch.tensor([0.5, 0.75], requires_grad=True) -``` - -When the `requires_grad` flag is set in tensor creation, c10 will [allocate](https://github.com/pytorch/pytorch/blob/e7cd59c7a061c78d8d0265e4308b5933e44f9176/c10/core/TensorImpl.cpp#L382-L406) an `AutogradMeta` object that is used to hold the graph information. - -```c++ - -void TensorImpl::set_requires_grad(bool requires_grad) { - ... - if (!autograd_meta_) - autograd_meta_ = impl::GetAutogradMetaFactory()->make(); - autograd_meta_->set_requires_grad(requires_grad, this); -} -``` - - -The `AutogradMeta` object is defined in [torch/csrc/autograd/variable.h](https://github.com/pytorch/pytorch/blob/release/1.9/torch/csrc/autograd/variable.h#L190-L286) as follows: - -```c++ - -struct TORCH_API AutogradMeta : public c10::AutogradMetaInterface { - std::string name_; - - Variable grad_; - std::shared_ptr<Node> grad_fn_; - std::weak_ptr<Node> grad_accumulator_; - // other fields and methods - ... -}; -``` - -The most important fields in this structure are the computed gradient in `grad_` and a pointer to the function `grad_fn` that will be called by the engine to produce the actual gradient. Also, there is a gradient accumulator object that is used to add together all the different gradients where this tensor is involved, as we will see in the graph execution. - -### Graphs, Nodes and Edges. - -Now, when we call a differentiable function that takes this tensor as an argument, the associated metadata will be populated. Let’s suppose that we call a regular torch function that is implemented in ATen. Let it be the multiplication as in our previous blog post example. The resulting tensor has a field called `grad_fn` that is essentially a pointer to the function that will be used to compute the gradient of that operation. - -```py ->>> x = torch.tensor([0.5, 0.75], requires_grad=True) ->>> v = x[0] * x[1] ->>> v -tensor(0.3750, grad_fn=<MulBackward0>) -``` - -Here we see that the tensor’s `grad_fn` has a `MulBackward0` value. This is the same function that was written in the [derivatives.yaml](https://github.com/pytorch/pytorch/blob/e7cd59c7a061c78d8d0265e4308b5933e44f9176/tools/autograd/derivatives.yaml#L840-L843) file, and its C++ code was generated automatically by all the scripts in `tools/autograd`. Its auto-generated source code can be seen in `torch/csrc/autograd/generated/Functions.cpp`. - -```c++ -variable_list MulBackward0::apply(variable_list&& grads) { - std::lock_guard<std::mutex> lock(mutex_); - - IndexRangeGenerator gen; - auto self_ix = gen.range(1); - auto other_ix = gen.range(1); - variable_list grad_inputs(gen.size()); - auto& grad = grads[0]; - auto self = self_.unpack(); - auto other = other_.unpack(); - bool any_grad_defined = any_variable_defined(grads); - if (should_compute_output({ other_ix })) { - auto grad_result = any_grad_defined ? (mul_tensor_backward(grad, self, other_scalar_type)) : Tensor(); - copy_range(grad_inputs, other_ix, grad_result); - } - if (should_compute_output({ self_ix })) { - auto grad_result = any_grad_defined ? 
(mul_tensor_backward(grad, other, self_scalar_type)) : Tensor(); - copy_range(grad_inputs, self_ix, grad_result); - } - return grad_inputs; -} -``` - -The `grad_fn` objects inherit from the [`TraceableFunction`](https://github.com/pytorch/pytorch/blob/e7cd59c7a061c78d8d0265e4308b5933e44f9176/torch/csrc/autograd/function.h#L535-L541) class, a descendant of `Node` with just a property set to enable tracing for debugging and optimization purposes. A graph by definition has nodes and edges, so these functions are indeed the nodes of the computational graph that are linked together by using `Edge` objects to enable the graph traversal later on. - -The `Node` definition can be found in the [torch/csrc/autograd/function.h](https://github.com/pytorch/pytorch/blob/e7cd59c7a061c78d8d0265e4308b5933e44f9176/torch/csrc/autograd/function.h#L50-L533) file. - -```c++ -struct TORCH_API Node : std::enable_shared_from_this { - ... - /// Evaluates the function on the given inputs and returns the result of the - /// function call. - variable_list operator()(variable_list&& inputs) { - ... - } - -protected: - /// Performs the `Node`'s actual operation. - virtual variable_list apply(variable_list&& inputs) = 0; - … - edge_list next_edges_; -``` - -Essentially we see that it has an override of the `operator ()` that performs the call to the actual function, and a pure virtual function called `apply`. The automatically generated functions override this `apply` method as we saw in the `MulBackward0` example above. Finally, the node also has a list of edges to enable graph connectivity. - -The [Edge](https://github.com/pytorch/pytorch/blob/e7cd59c7a061c78d8d0265e4308b5933e44f9176/torch/csrc/autograd/edge.h#L14-L39) object is used to link `Node`s together and its implementation is straightforward. - -```c++ -struct Edge { - ... - /// The function this `Edge` points to. - std::shared_ptr function; - /// The identifier of a particular input to the function. - uint32_t input_nr; -}; -``` - -It only requires a function pointer (the actual `grad_fn` objects that the edges link together), and an input number that acts as an id for the edge. - -### Linking nodes together - -When we invoke the product operation of two tensors, we enter into the realm of autogenerated code. All the scripts that we saw in `tools/autograd` fill a series of templates that wrap the differentiable functions in ATen. These functions have code to construct the backward graph during the forward pass. - -The [gen_variable_type.py](https://github.com/pytorch/pytorch/blob/release/1.9/tools/autograd/gen_variable_type.py) script is in charge of writing all this wrapping code. This script is called from the [tools/autograd/gen_autograd.py](https://github.com/pytorch/pytorch/blob/release/1.9/tools/autograd/gen_autograd.py) during the pytorch build process and it will output the automatically generated function wrappers to `torch/csrc/autograd/generated/`. - - - -Let’s take a look at how the tensor multiplication generated function looks like. The code has been simplified, but it can be found in the `torch/csrc/autograd/generated/VariableType_4.cpp` file when compiling pytorch from source. - -```c++ -at::Tensor mul_Tensor(c10::DispatchKeySet ks, const at::Tensor & self, const at::Tensor & other) { - ... 
- auto _any_requires_grad = compute_requires_grad( self, other ); - std::shared_ptr grad_fn; - if (_any_requires_grad) { - // Creates the link to the actual grad_fn and links the graph for backward traversal - grad_fn = std::shared_ptr(new MulBackward0(), deleteNode); - grad_fn->set_next_edges(collect_next_edges( self, other )); - ... - } - … - // Does the actual function call to ATen - auto _tmp = ([&]() { - at::AutoDispatchBelowADInplaceOrView guard; - return at::redispatch::mul(ks & c10::after_autograd_keyset, self_, other_); - })(); - - auto result = std::move(_tmp); - if (grad_fn) { - // Connects the result to the graph - set_history(flatten_tensor_args( result ), grad_fn); - } - ... - return result; -} -``` - -Let’s walk through the most important lines of this code. -First of all, the `grad_fn` object is created with: ` grad_fn = std::shared_ptr(new MulBackward0(), deleteNode);`. - -After the `grad_fn` object is created, the edges used to link the nodes together are created by using the `grad_fn->set_next_edges(collect_next_edges( self, other ));` calls. - -```c++ -struct MakeNextFunctionList : IterArgs { - edge_list next_edges; - using IterArgs::operator(); - void operator()(const Variable& variable) { - if (variable.defined()) { - next_edges.push_back(impl::gradient_edge(variable)); - } else { - next_edges.emplace_back(); - } - } - void operator()(const c10::optional& variable) { - if (variable.has_value() && variable->defined()) { - next_edges.push_back(impl::gradient_edge(*variable)); - } else { - next_edges.emplace_back(); - } - } -}; - -template -edge_list collect_next_edges(Variables&&... variables) { - detail::MakeNextFunctionList make; - make.apply(std::forward(variables)...); - return std::move(make.next_edges); -} -``` - -Given an input variable (it’s just a regular tensor), [`collect_next_edges`]( -https://github.com/pytorch/pytorch/blob/e7cd59c7a061c78d8d0265e4308b5933e44f9176/torch/csrc/autograd/function.h#L597-L603) - will create an `Edge` object by calling [`impl::gradient_edge`](https://github.com/pytorch/pytorch/blob/e7cd59c7a061c78d8d0265e4308b5933e44f9176/torch/csrc/autograd/variable.cpp#L228-L240.) - -```c++ - Edge gradient_edge(const Variable& self) { - // If grad_fn is null (as is the case for a leaf node), we instead - // interpret the gradient function to be a gradient accumulator, which will - // accumulate its inputs into the grad property of the variable. These - // nodes get suppressed in some situations, see "suppress gradient - // accumulation" below. Note that only variables which have `requires_grad = - // True` can have gradient accumulators. - if (const auto& gradient = self.grad_fn()) { - return Edge(gradient, self.output_nr()); - } else { - return Edge(grad_accumulator(self), 0); - } - } -``` - -To understand how edges work, let’s assume that an early executed function produced two output tensors, both with their `grad_fn` set, each tensor also has an `output_nr` property with the order in which they were returned. When creating the edges for the current `grad_fn`, an `Edge` object per input variable will be created. The edges will point to the variable’s grad_fn and will also track the `output_nr` to establish ids used when traversing the graph. In the case that the input variables are “leaf”, i.e. they were not produced by any differentiable function, they don’t have a `grad_fn` attribute set. A special function called a gradient accumulator is set by default as seen in the above code snippet. 
- -After the edges are created, the `grad_fn` graph Node object that is being currently created will hold them using the [`set_next_edges`](https://github.com/pytorch/pytorch/blob/e7cd59c7a061c78d8d0265e4308b5933e44f9176/torch/csrc/autograd/function.h#L258-L263) function. This is what connects `grad_fn`s together, producing the computational graph. - -```c++ - void set_next_edges(edge_list&& next_edges) { - next_edges_ = std::move(next_edges); - for(const auto& next_edge : next_edges_) { - update_topological_nr(next_edge); - } - } -``` - -Now, the forward pass of the function will execute, and after the execution `set_history` will connect the output tensors to the `grad_fn` Node. - -```c++ -inline void set_history( - at::Tensor& variable, - const std::shared_ptr& grad_fn) { - AT_ASSERT(grad_fn); - if (variable.defined()) { - // If the codegen triggers this, you most likely want to add your newly added function - // to the DONT_REQUIRE_DERIVATIVE list in tools/autograd/gen_variable_type.py - TORCH_INTERNAL_ASSERT(isDifferentiableType(variable.scalar_type())); - auto output_nr = - grad_fn->add_input_metadata(variable); - impl::set_gradient_edge(variable, {grad_fn, output_nr}); - } else { - grad_fn->add_input_metadata(Node::undefined_input()); - } -} -``` - -[`set_history`](https://github.com/pytorch/pytorch/blob/e7cd59c7a061c78d8d0265e4308b5933e44f9176/torch/csrc/autograd/functions/utils.h#L58-L72) calls [`set_gradient_edge`](https://github.com/pytorch/pytorch/blob/e7cd59c7a061c78d8d0265e4308b5933e44f9176/torch/csrc/autograd/variable.cpp#L242-L255), which just copies the grad_fn and the `output_nr` to the `AutogradMeta` object that the tensor has. - -```c++ - void set_gradient_edge(const Variable& self, Edge edge) { - auto* meta = materialize_autograd_meta(self); - meta->grad_fn_ = std::move(edge.function); - meta->output_nr_ = edge.input_nr; - // For views, make sure this new grad_fn_ is not overwritten unless it is necessary - // in the VariableHooks::grad_fn below. - // This logic is only relevant for custom autograd Functions for which multiple - // operations can happen on a given Tensor before its gradient edge is set when - // exiting the custom Function. - auto diff_view_meta = get_view_autograd_meta(self); - if (diff_view_meta && diff_view_meta->has_bw_view()) { - diff_view_meta->set_attr_version(self._version()); - } - } -``` - -This tensor now will be the input to another function and the above steps will be all repeated. Check the animation below to see how the graph is created. - -

- Figure 2: Animation that shows the graph creation

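The same chaining can also be followed from Python by walking `grad_fn.next_functions` from an output back to the leaf's accumulator. This is only an illustrative sketch (values and the printed node names may vary slightly between releases), but it mirrors the graph construction shown in the animation above.

```python
import torch

x = torch.tensor(1.0, requires_grad=True)
y = x * 2   # creates a MulBackward0 node; its edge points at x's accumulator
z = y + 3   # creates an AddBackward0 node; its edge points at MulBackward0

node = z.grad_fn
while node is not None:
    print(type(node).__name__)   # AddBackward0, MulBackward0, AccumulateGrad
    # Follow the first outgoing edge; accumulator nodes have no further edges.
    node = node.next_functions[0][0] if node.next_functions else None
```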
        - -### Registering Python Functions in the graph - -We have seen how autograd creates the graph for the functions included in ATen. However, when we define our differentiable functions in Python, they are also included in the graph! - -An autograd python defined function looks like the following: - -```python -class Exp(torch.autograd.Function): - @staticmethod - def forward(ctx, i): - result = i.exp() - ctx.save_for_backward(result) - return result - - @staticmethod - def backward(ctx, grad_output): - result, = ctx.saved_tensors - return grad_output * result - -# Call the function -Exp.apply(torch.tensor(0.5, requires_grad=True)) -# Outputs: tensor(1.6487, grad_fn=) -``` - -In the above snippet autograd detected our python function when creating the graph. All of this is possible thanks to the [`Function`](https://github.com/pytorch/pytorch/blob/release/1.9/torch/autograd/function.py#L106) class. Let’s take a look at what happens when we call `apply`. - -`apply` is defined in the [`torch._C._FunctionBase`](https://github.com/pytorch/pytorch/blob/e7cd59c7a061c78d8d0265e4308b5933e44f9176/torch/csrc/autograd/python_function.cpp#L859-L908) class, but this class is not present in the python source. `_FunctionBase` is defined in C++ by using the python C API to hook C functions together into a single python class. We are looking for a function named [`THPFunction_apply`](https://github.com/pytorch/pytorch/blob/e7cd59c7a061c78d8d0265e4308b5933e44f9176/torch/csrc/autograd/python_function.cpp#L577-L633). - -```c++ - -PyObject *THPFunction_apply(PyObject *cls, PyObject *inputs) -{ - - // Generates the graph node - THPObjectPtr backward_cls(PyObject_GetAttrString(cls, "_backward_cls")); - if (!backward_cls) return nullptr; - THPObjectPtr ctx_obj(PyObject_CallFunctionObjArgs(backward_cls, nullptr)); - if (!ctx_obj) return nullptr; - THPFunction* ctx = (THPFunction*)ctx_obj.get(); - - auto cdata = std::shared_ptr(new PyNode(std::move(ctx_obj)), deleteNode); - ctx->cdata = cdata; - - // Prepare inputs and allocate context (grad fn) - // Unpack inputs will collect the edges - auto info_pair = unpack_input(inputs); - UnpackedInput& unpacked_input = info_pair.first; - InputFlags& input_info = info_pair.second; - - // Initialize backward function (and ctx) - bool is_executable = input_info.is_executable; - cdata->set_next_edges(std::move(input_info.next_edges)); - ctx->needs_input_grad = input_info.needs_input_grad.release(); - ctx->is_variable_input = std::move(input_info.is_variable_input); - - // Prepend ctx to input_tuple, in preparation for static method call - auto num_args = PyTuple_GET_SIZE(inputs); - THPObjectPtr ctx_input_tuple(PyTuple_New(num_args + 1)); - if (!ctx_input_tuple) return nullptr; - Py_INCREF(ctx); - PyTuple_SET_ITEM(ctx_input_tuple.get(), 0, (PyObject*)ctx); - for (int i = 0; i < num_args; ++i) { - PyObject *arg = PyTuple_GET_ITEM(unpacked_input.input_tuple.get(), i); - Py_INCREF(arg); - PyTuple_SET_ITEM(ctx_input_tuple.get(), i + 1, arg); - } - - // Call forward - THPObjectPtr tensor_outputs; - { - AutoGradMode grad_mode(false); - THPObjectPtr forward_fn(PyObject_GetAttrString(cls, "forward")); - if (!forward_fn) return nullptr; - tensor_outputs = PyObject_CallObject(forward_fn, ctx_input_tuple); - if (!tensor_outputs) return nullptr; - } - - // Here is where the outputs gets the tensors tracked - return process_outputs(cls, cdata, ctx, unpacked_input, inputs, std::move(tensor_outputs), - is_executable, node); - END_HANDLE_TH_ERRORS -} -``` - -Although this code is hard to 
read at first due to all the python API calls, it essentially does the same thing as the auto-generated forward functions that we saw for ATen: - -Create a `grad_fn` object. -Collect the edges to link the current `grad_fn` with the input tensors one. -Execute the function `forward`. -Assign the created `grad_fn` to the output tensors metadata. - -The `grad_fn` object is created in: - -```c++ - // Generates the graph node - THPObjectPtr backward_cls(PyObject_GetAttrString(cls, "_backward_cls")); - if (!backward_cls) return nullptr; - THPObjectPtr ctx_obj(PyObject_CallFunctionObjArgs(backward_cls, nullptr)); - if (!ctx_obj) return nullptr; - THPFunction* ctx = (THPFunction*)ctx_obj.get(); - - auto cdata = std::shared_ptr(new PyNode(std::move(ctx_obj)), deleteNode); - ctx->cdata = cdata; -``` - -Basically, it asks the python API to get a pointer to the Python object that can execute the user-written function. Then it wraps it into a [`PyNode`](https://github.com/pytorch/pytorch/blob/e7cd59c7a061c78d8d0265e4308b5933e44f9176/torch/csrc/autograd/python_function.h#L24-L58) object that is a specialized `Node` object that calls the python interpreter with the provided python function when `apply` is executed during the forward pass. Note that in the code `cdata` is the actual `Node` object that is part of the graph. `ctx` is the object that is passed to the python `forward`/`backward` functions and it is used to store autograd related information by both, the user’s function and PyTorch. - -As in the regular C++ functions we also call `collect_next_edges` to track the inputs `grad_fn` objects, but this is done in [`unpack_input`](https://github.com/pytorch/pytorch/blob/e7cd59c7a061c78d8d0265e4308b5933e44f9176/torch/csrc/autograd/python_function.cpp#L413-L448): - -```c++ -template -std::pair unpack_input(PyObject *args) { - ... - flags.next_edges = (flags.is_executable ? collect_next_edges(unpacked.input_vars) : edge_list()); - return std::make_pair(std::move(unpacked), std::move(flags)); -} -``` - -After this, the edges are assigned to the `grad_fn` by just doing `cdata->set_next_edges(std::move(input_info.next_edges));` and the forward function is called through the python interpreter C API. - -Once the output tensors are returned from the forward pass, they are processed and converted to variables inside the [`process_outputs`](https://github.com/pytorch/pytorch/blob/e7cd59c7a061c78d8d0265e4308b5933e44f9176/torch/csrc/autograd/python_function.cpp#L519-L562) function. - -```c++ -PyObject* process_outputs(PyObject *op_obj, const std::shared_ptr& cdata, - THPFunction* grad_fn, const UnpackedInput& unpacked, - PyObject *inputs, THPObjectPtr&& raw_output, bool is_executable, - torch::jit::Node* node) { - ... - _wrap_outputs(cdata, grad_fn, unpacked.input_vars, raw_output, outputs, is_executable); - _trace_post_record(node, op_obj, unpacked.input_vars, outputs, is_inplace, unpack_output); - if (is_executable) { - _save_variables(cdata, grad_fn); - } ... - return outputs.release(); -} -``` - -Here, [`_wrap_outputs`](https://github.com/pytorch/pytorch/blob/e7cd59c7a061c78d8d0265e4308b5933e44f9176/torch/csrc/autograd/python_function.cpp#L302-L346) is in charge of setting the forward outputs `grad_fn` to the newly created one. For this, it calls another `_wrap_outputs` function defined in a different [file](https://github.com/pytorch/pytorch/blob/e7cd59c7a061c78d8d0265e4308b5933e44f9176/torch/csrc/autograd/custom_function.cpp#L28-L105), so the process here gets a little confusing. 
- -```c++ -static void _wrap_outputs(const std::shared_ptr& cdata, THPFunction *self, - const variable_list &input_vars, PyObject *raw_output, PyObject *outputs, bool is_executable) -{ - auto cdata_if_executable = is_executable ? cdata : nullptr; - ... - - // Wrap only the tensor outputs. - // This calls csrc/autograd/custom_function.cpp - auto wrapped_outputs = _wrap_outputs(input_vars, non_differentiable, dirty_inputs, raw_output_vars, cdata_if_executable); -... -} -``` - -The called `_wrap_outputs` is the one in charge of setting the autograd metadata in the output tensors: - -```c++ -std::vector> _wrap_outputs(const variable_list &input_vars, - const std::unordered_set &non_differentiable, - const std::unordered_set &dirty_inputs, - const at::ArrayRef> raw_outputs, - const std::shared_ptr &cdata) { - - - std::unordered_set inputs; - … - // Sets the grad_fn and output_nr of an output Variable. - auto set_history = [&](Variable& var, uint32_t output_nr, bool is_input, bool is_modified, - bool is_differentiable) { - // Lots of checks - if (!is_differentiable) { - ... - } else if (is_input) { - // An input has been returned, but it wasn't modified. Return it as a view - // so that we can attach a new grad_fn to the Variable. - // Run in no_grad mode to mimic the behavior of the forward. - { - AutoGradMode grad_mode(false); - var = var.view_as(var); - } - impl::set_gradient_edge(var, {cdata, output_nr}); - } else if (cdata) { - impl::set_gradient_edge(var, {cdata, output_nr}); - } - }; -``` - -And this is where `set_gradient_edge` was called and this is how a user-written python function gets included in the computational graph with its associated backward function! - -# Closing remarks - -This blog post is intended to be a code overview on how PyTorch constructs the actual computational graphs that we discussed in the previous post. The next entry will deal with how the autograd engine executes these graphs. diff --git a/_posts/2022-10-13-scaling-pytorch-models-on-cloud-tpus-with-fsdp.md b/_posts/2022-10-13-scaling-pytorch-models-on-cloud-tpus-with-fsdp.md deleted file mode 100644 index 4f07564cbc86..000000000000 --- a/_posts/2022-10-13-scaling-pytorch-models-on-cloud-tpus-with-fsdp.md +++ /dev/null @@ -1,127 +0,0 @@ ---- -layout: blog_detail -title: "Scaling PyTorch models on Cloud TPUs with FSDP" -author: Ronghang Hu, Vaibhav Singh, Jack Cao, Milad Mohammadi, Yeounoh Chung, Shauheen Zahirazami, Ross Girshick -featured-img: "/assets/images/scaling-pytorch-models-on-cloud-tpus-with-fsdp.jpg" ---- - -## Introduction - -The research community has witnessed a lot of successes with large models across NLP, computer vision, and other domains in recent years. Many of these successes were enabled by Cloud TPUs -- which are powerful hardware for distributed training. To support TPUs in PyTorch, the PyTorch/XLA library provides a backend for XLA devices (most notably TPUs) and lays the groundwork for scaling large PyTorch models on TPUs. - -However, most existing modeling scaling tools in the PyTorch ecosystem assume GPU (or CPU) devices, often depend on specific features in CUDA, and do not work directly on TPUs. The lack of scaling tools makes it challenging to build large models that cannot fit into the memory of a single TPU chip. - -To support model scaling on TPUs, we implemented the widely-adopted [Fully Sharded Data Parallel (FSDP)](https://engineering.fb.com/2021/07/15/open-source/fsdp/) algorithm for XLA devices as part of the PyTorch/XLA 1.12 release. 
We provide an FSDP interface with a similar high-level design to the CUDA-based PyTorch FSDP class while also handling several restrictions in XLA (see Design Notes below for more details). This FSDP interface allowed us to easily build models with e.g. 10B+ parameters on TPUs and has enabled many research explorations. - -## Using Fully Sharded Data Parallel (FSDP) in PyTorch/XLA - -We provide a wrapper class `XlaFullyShardedDataParallel` over a given PyTorch model to shard its parameters across data-parallel workers. An example usage is as follows: - -```python -import torch -import torch_xla.core.xla_model as xm -from torch_xla.distributed.fsdp import XlaFullyShardedDataParallel as FSDP - -model = FSDP(my_module) -optim = torch.optim.Adam(model.parameters(), lr=0.0001) -output = model(x, y) -loss = output.sum() -loss.backward() -optim.step() -``` - -Wrapping an `nn.Module` instance with `XlaFullyShardedDataParallel` enables the [ZeRO-2](https://arxiv.org/abs/1910.02054) algorithm on it, where its gradients and the optimizer states are sharded for the entire training process. During its forward and backward passes, the full parameters of the wrapped module are first reconstructed from their corresponding shards for computation. - -**Nested FSDP** wrapping can be used to further save memory. This allows the model to store only the full parameters of one individual layer at any given time. For nested FSDP, one should first wrap its individual submodules with an inner FSDP before wrapping the base model with an outer FSDP. This allows the model to store only the full parameters of one individual layer at any given time. And having an outer wrapper ensures to handle any leftover parameters, corresponding to the [ZeRO-3](https://arxiv.org/abs/1910.02054) algorithm. Nested FSDP wrapping can be applied at any depth of submodules and there can be more than 2 layers of nesting. - -**Model checkpoint saving and loading** for models and optimizers can be done like before by saving and loading their `.state_dict()`. Meanwhile, each training process should save its own checkpoint file of the sharded model parameters and optimizer states, and load the checkpoint file for the corresponding rank when resuming (regardless of ZeRO-2 or ZeRO-3, i.e. nested wrapping or not). A command line tool and a Python interface are provided to consolidate the sharded model checkpoint files together into a full/unshareded model checkpoint file. - -**Gradient checkpointing** (also referred to as "activation checkpointing" or "rematerialization") is another common technique for model scaling and can be used in conjunction with FSDP. We provide `checkpoint_module`, a wrapper function over a given `nn.Module` instance for gradient checkpointing (based on `torch_xla.utils.checkpoint.checkpoint`). - -The MNIST and ImageNet examples below provide illustrative usages of (plain or nested) FSDP, saving and consolidation of model checkpoints, as well as gradient checkpointing. - -## Starting examples of FSDP in PyTorch/XLA - -### Training MNIST and ImageNet with FSDP - -MNIST and ImageNet classification can often be used as starting points to build more complicated deep learning models. 
We provide the following FSDP examples on these two datasets: - -- MNIST: [test/test_train_mp_mnist_fsdp_with_ckpt.py](https://github.com/pytorch/xla/blob/master/test/test_train_mp_mnist_fsdp_with_ckpt.py) (it also illustrates checkpoint saving and consolidation) -- ImageNet: [test/test_train_mp_imagenet_fsdp.py](https://github.com/pytorch/xla/blob/master/test/test_train_mp_imagenet_fsdp.py) - -A comparison of them with the vanilla data-parallel examples of [MNIST](https://github.com/pytorch/xla/blob/master/test/test_train_mp_mnist.py) and [ImageNet](https://github.com/pytorch/xla/blob/master/test/test_train_mp_imagenet.py) illustrates how to adapt a training script to use FSDP. A major distinction to keep in mind is that when stepping the optimizer on an FSDP-wrapped model, one should directly call `optimizer.step()` instead of `xm.optimizer_step(optimizer)`. The latter reduces the gradients across ranks, which is not what we need in FSDP, where the gradients are already reduced and sharded (from a reduce-scatter op in its backward pass). - -#### Installation - -FSDP is available from the PyTorch/XLA 1.12 and newer nightly releases. Please refer to [https://github.com/pytorch/xla#-available-images-and-wheels](https://github.com/pytorch/xla#-available-images-and-wheels) for a guide on installation as well as Cloud TPU allocation. Then clone PyTorch/XLA repo on a TPU VM as follows - -```python -mkdir -p ~/pytorch && cd ~/pytorch -git clone --recursive https://github.com/pytorch/xla.git -cd ~/ -``` - -#### Train MNIST on v3-8 TPU - -It gets around 98.9 accuracy for 2 epochs: - -```python -python3 ~/pytorch/xla/test/test_train_mp_mnist_fsdp_with_ckpt.py \ - --batch_size 16 --drop_last --num_epochs 2 \ - --use_nested_fsdp -``` - -The script above automatically tests consolidation of the sharded model checkpoints at the end. You can also manually consolidate the sharded checkpoint files via - -```python -python3 -m torch_xla.distributed.fsdp.consolidate_sharded_ckpts \ - --ckpt_prefix /tmp/mnist-fsdp/final_ckpt \ - --ckpt_suffix "_rank-*-of-*.pth" -``` - -#### Train ImageNet with ResNet-50 on v3-8 TPU - -It gets around 75.9 accuracy for 100 epochs, same as what one would get without using FSDP; download and preprocess the [ImageNet-1k](https://github.com/pytorch/examples/tree/master/imagenet#requirements) dataset to `/datasets/imagenet-1k`: - -```python -python3 ~/pytorch/xla/test/test_train_mp_imagenet_fsdp.py \ - --datadir /datasets/imagenet-1k --drop_last \ - --model resnet50 --test_set_batch_size 64 --eval_interval 10 \ - --lr 0.4 --batch_size 128 --num_warmup_epochs 5 \ - --lr_scheduler_divide_every_n_epochs 30 --lr_scheduler_divisor 10 \ - --num_epochs 100 \ - --use_nested_fsdp -``` - -You can also explore other options in these two examples, such as `--use_gradient_checkpointing` to apply gradient checkpointing (i.e. activation checkpointing) on the ResNet blocks, or `--compute_dtype bfloat16` to perform forward and backward passes in bfloat16 precision. - -### Examples on large-scale models - -When building large models on TPUs, we often need to be aware of the memory constraints (e.g. 16 GB per core in TPU v3 and 32 GB per chip in TPU v4). For large models that cannot fit into a single TPU memory or the host CPU memory, one should use nested FSDP to implement the ZeRO-3 algorithm interleave submodule construction with inner FSDP wrapping, so that the full model never needs to be stored in memory during construction. 
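As a rough sketch of that construction pattern (the block count, layer sizes, and module names below are made up, and on a real TPU VM each block would live on the XLA device), inner wrapping happens while the model is being built and an outer wrap picks up whatever is left:

```python
import torch.nn as nn
from torch_xla.distributed.fsdp import XlaFullyShardedDataParallel as FSDP

blocks = []
for _ in range(48):
    # Wrap each block as soon as it is constructed so its full parameters
    # never need to coexist in memory with the rest of the model.
    block = nn.Sequential(nn.Linear(4096, 16384), nn.GELU(), nn.Linear(16384, 4096))
    blocks.append(FSDP(block))

# The outer wrap shards any remaining (unwrapped) parameters, giving ZeRO-3 behavior.
model = FSDP(nn.Sequential(*blocks))
```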
- -We illustrate these cases in [https://github.com/ronghanghu/ptxla_scaling_examples](https://github.com/ronghanghu/ptxla_scaling_examples), which provides examples of training a Vision Transformer (ViT) model with 10B+ parameters on a TPU v3 pod (with 128 cores) as well as other cases. - -### Design Notes - -One might wonder why we need to develop a separate FSDP class in PyTorch/XLA instead of directly reusing [PyTorch's FSDP class](https://pytorch.org/docs/stable/fsdp.html) or extending it to the XLA backend. The main motivation behind a separate FSDP class in PyTorch/XLA is that the native PyTorch's FSDP class heavily relies on CUDA features that are not supported by XLA devices, while XLA also has several unique characteristics that need special handling. These distinctions require a different implementation of FSDP that would be much easier to build in a separate class. - -#### Changes in API calls -One prominent distinction is that the native PyTorch FSDP is built upon separate CUDA streams for asynchronous execution in eager mode, while PyTorch/XLA runs in lazy mode and also does not support streams. In addition, TPU requires that all devices homogeneously run the same program. As a result, in the PyTorch/XLA FSDP implementation, CUDA calls and per-process heterogeneity need to be replaced by XLA APIs and alternative homogeneous implementations. - -#### Tensor Storage Handling - -Another prominent distinction is how to free a tensor's storage, which is much harder in XLA than in CUDA. To implement ZeRO-3, one needs to free the storage of full parameters after a module's forward pass, so that the next module can reuse this memory buffer for subsequent computation. PyTorch's FSPD accomplishes this on CUDA by freeing the actual storage of a parameter `p` via `p.data.storage().resize_(0)`. However, XLA tensors do not have this `.storage()` handle given that the XLA HLO IRs are completely functional and do not provide any ops to deallocate a tensor or resize its storage. Below the PyTorch interface, only the XLA compiler can decide when to free a TPU device memory corresponding to an XLA tensor, and a prerequisite is that the memory can only be released when the tensor object gets deallocated in Python -- which cannot happen in FSDP because these parameter tensors are referenced as module attributes and also saved by PyTorch autograd for the backward pass. - -Our solution to this issue is to split a tensor's value properties from its autograd Variable properties, and to free a `nn.Parameter` tensor by setting its `.data` attribute to a dummy scalar of size 1. This way the actual data tensor for the full parameter gets dereferenced in Python so that XLA can recycle its memory for other computation, while autograd can still trace the base `nn.Parameter` as a weak reference to the parameter data. To get this to work, one also needs to handle views over the parameters as views in PyTorch also hold references to its actual data (this required fixing a shape-related issue with views in PyTorch/XLA). - -#### Working with XLA compiler - -The solution above should be enough to free full parameters if the XLA compiler faithfully preserves the operations and their execution order in our PyTorch program. But there is another problem -- XLA attempts to optimize the program to speed up its execution by applying common subexpression elimination (CSE) to the HLO IRs. 
In a naive implementation of FSDP, the XLA compiler typically eliminates the 2nd all-gather in the backward pass to reconstruct the full parameters when it sees that it is a repeated computation from the forward pass, and directly holds and reuses the full parameters we want to free up after the forward pass. To guard against this undesired compiler behavior, we introduced the [optimization barrier op](https://www.tensorflow.org/xla/operation_semantics#optimizationbarrier) into PyTorch/XLA and used it to stop eliminating the 2nd all-gather. This optimization barrier is also applied to a similar case of gradient checkpointing to prevent CSE between forward and backward passes that could eliminate the rematerialization. - -In the future, if the distinctions between CUDA and XLA become not as prominent as mentioned above, it could be worth considering a merge of the PyTorch/XLA FSDP with the native PyTorch FSDP to have a unified interface. - -## Acknowledgments - -Thanks to Junmin Hao from AWS for reviewing the PyTorch/XLA FSDP pull request. Thanks to Brian Hirsh from the Meta PyTorch team for support on the PyTorch core issues. Thanks to Isaack Karanja, Will Cromar, and Blake Hechtman from Google for support on GCP, XLA, and TPU issues. - -Thanks to Piotr Dollar, Wan-Yen Lo, Alex Berg, Ryan Mark, Kaiming He, Xinlei Chen, Saining Xie, Shoubhik Debnath, Min Xu, and Vaibhav Aggarwal from Meta FAIR for various TPU-related discussions. \ No newline at end of file diff --git a/_posts/2022-10-17-pytorchs-tracing-based-selective-build.md b/_posts/2022-10-17-pytorchs-tracing-based-selective-build.md deleted file mode 100644 index 4501714057ec..000000000000 --- a/_posts/2022-10-17-pytorchs-tracing-based-selective-build.md +++ /dev/null @@ -1,256 +0,0 @@ ---- -layout: blog_detail -title: "PyTorch’s Tracing Based Selective Build" -author: Dhruv Matani, Suraj Subramanian -featured-img: "/assets/images/pytorchs-tracing-based-selective-build_Figure_4.png" ---- - -## Introduction - -**TL;DR**: It can be challenging to run PyTorch on mobile devices, SBCs (Single Board Computers), and IOT devices. When compiled, the PyTorch library is huge and includes dependencies that might not be needed for the on-device use case. - -To run a specific set of models on-device, we actually require only a small subset of the features in the PyTorch library. We found that using a PyTorch runtime generated using **selective build** can achieve up to 90% reduction in binary size (for the CPU and QuantizedCPU backends on an x86-64 build on Linux). In this blog, we share our experience of generating model-specific minimal runtimes using Selective Build and show you how to do the same. - -## Why is this important for app developers? - -Using a PyTorch runtime generated by **selective build** can reduce the size of AI-powered apps by 30+ MB - a significant reduction for a typical mobile app! Making mobile applications more lightweight has many benefits - they are runnable on a wider variety of devices, consume less cellular data, and can be downloaded and updated faster on user’s devices. - -## What does the Developer Experience look like? - -This method can work seamlessly with any existing PyTorch Mobile deployment workflows. All you need to do is replace the general PyTorch runtime library with a runtime customized for the specific models you wish to use in your application. The general steps in this process are: - -1. Build the PyTorch Runtime in **instrumentation mode** (this is called an **instrumentation build** of PyTorch). 
This will record the used operators, kernels and features. -2. Run your models through this instrumentation build by using the provided **model_tracer** binary. This will generate a single YAML file that stores all the features used by your model. These features will be preserved in the minimal runtime. -3. Build PyTorch using this YAML file as input. This is the **selective build** technique, and it greatly reduces the size of the final PyTorch binary. -4. Use this selectively-built PyTorch library to reduce the size of your mobile application! - - -Building the PyTorch Runtime in a special **“instrumentation” mode** ( by passing the `TRACING_BASED=1` build option) generates an **instrumentation build** runtime of PyTorch, along with a **model_tracer** binary. Running a model with this build allows us to trace the parts of PyTorch used by the model. - -

- Figure 1: Instrumentation build of PyTorch

        - -```python -# Clone the PyTorch repo -git clone https://github.com/pytorch/pytorch.git -cd pytorch - -# Build the model_tracer -USE_NUMPY=0 USE_DISTRIBUTED=0 USE_CUDA=0 TRACING_BASED=1 \ - python setup.py develop -``` - -Now this instrumentation build is used to run a model inference with representative inputs. The **model_tracer** binary observes parts of the instrumentation build that were activated during the inference run, and dumps it to a YAML file. - -

- Figure 2: YAML file generated by running model(s) on an instrumentation build

        - -```python -# Generate YAML file -./build/bin/model_tracer \ - --model_input_path /tmp/path_to_model.ptl \ - --build_yaml_path /tmp/selected_ops.yaml -``` - -Now we build the PyTorch Runtime again, but this time using the YAML file generated by the tracer. The runtime now only includes those parts that are needed for this model. This is called **“Selectively built PyTorch runtime”** in the diagram below. - -```python -# Clean out cached configuration -make clean - -# Build PyTorch using Selected Operators (from the YAML file) -# using the host toolchain, and use this generated library -BUILD_PYTORCH_MOBILE_WITH_HOST_TOOLCHAIN=1 \ -USE_LIGHTWEIGHT_DISPATCH=0 \ -BUILD_LITE_INTERPRETER=1 \ -SELECTED_OP_LIST=/tmp/selected_ops.yaml \ -TRACING_BASED=1 \ - ./scripts/build_mobile.sh -``` - -

- Figure 3: Selective Build of PyTorch and model execution on a selectively built PyTorch runtime

        - -### Show me the code! - -We’ve put together a [notebook](https://gist.github.com/dhruvbird/65fd800983f362a72d78afe68031568c) to illustrate what the process above looks like in code using a simple PyTorch model. - -For a more hands-on tutorial to deploy this on Android/iOS [this tutorial](https://pytorch.org/tutorials/prototype/tracing_based_selective_build.html) should be helpful. - -## Technical FAQs - -### Why is Tracing needed for a Selective Build of PyTorch? - -In PyTorch, CPU kernels can call other operators via the [PyTorch Dispatcher](http://blog.ezyang.com/2020/09/lets-talk-about-the-pytorch-dispatcher/). Simply including the set of root operators called directly by the model is not sufficient as there might be many more being called under-the-hood transitively. Running the model on representative inputs and observing the actual list of operators called (aka “tracing”) is the most accurate way of determining what parts of PyTorch are used. - -Additionally, factors such as which dtypes a kernel should handle are also runtime features that depend on actual input provided to the model. Hence, the tracing mechanism is extremely suitable for this purpose. - -### Which features can be selected (in or out) by using Tracing Based Selective Build? - -The following features can be selected for the PyTorch runtime during the tracing based selective build process: - -1. [CPU/QuantizedCPU](https://codebrowser.bddppq.com/pytorch/pytorch/build/aten/src/ATen/) kernels for [PyTorch’s ATen Operators](https://pytorch.org/cppdocs/): If a PyTorch Operator is not needed by a model targeted at a selectively built runtime, then the registration of that CPU kernel is omitted in the runtime. This is controlled via [Torchgen code-gen](https://github.com/pytorch/pytorch/blob/master/torchgen/gen.py). -2. [Primary Operators](https://github.com/pytorch/pytorch/blob/master/torch/csrc/jit/runtime/register_prim_ops.cpp): This is controlled by a macro named [TORCH_SELECTIVE_SCHEMA](https://codebrowser.bddppq.com/pytorch/pytorch/torch/library.h.html) (via templated selective build) that either selects a primary operator or de-selects it based on information in a generated header file. -3. Code that handles [specific dtypes](https://codebrowser.bddppq.com/pytorch/pytorch/aten/src/ATen/Dispatch.h.html) in CPU kernels: This is performed by generating exception throws in specific case statements in the switch case generated by the macro [AT_PRIVATE_CHECK_SELECTIVE_BUILD](https://codebrowser.bddppq.com/pytorch/pytorch/aten/src/ATen/Dispatch.h.html#_M/AT_PRIVATE_CHECK_SELECTIVE_BUILD). -4. Registration of [Custom C++ Classes](https://pytorch.org/tutorials/advanced/torch_script_custom_classes.html) that extend PyTorch: This is controlled by the macro [TORCH_SELECTIVE_CLASS](https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native/quantized/cpu/fbgemm_utils.cpp#L385-L386), which can be used when registering Custom C++ Classes. The [torch::selective_class_<>](https://github.com/pytorch/pytorch/blob/master/torch/custom_class.h#L443-L460) helper is to be used in conjunction with the macro [TORCH_SELECTIVE_CLASS](https://codebrowser.bddppq.com/pytorch/pytorch/torch/library.h.html#_M/TORCH_SELECTIVE_CLASS). - -### What is the structure of the YAML file used during the build? - -The YAML file generated after tracing looks like the example below. It encodes all the elements of the “selectable” build feature as specified above. 
- -```python -include_all_non_op_selectives: false -build_features: [] -operators: - aten::add.Tensor: - is_used_for_training: false - is_root_operator: true - include_all_overloads: false - aten::len.t: - is_used_for_training: false - is_root_operator: true - include_all_overloads: false -kernel_metadata: - _local_scalar_dense_cpu: - - Float - add_stub: - - Float - copy_: - - Bool - - Byte - mul_cpu: - - Float -custom_classes: [] -``` - -### How exactly is code eliminated from the generated binary? - -Depending on the specific scenario, there are 2 main techniques that are used to hint the compiler and linker about unused and unreachable code. This code is then cleaned up by the compiler or linker as unreachable code. - -#### [1] Unreferenced functions removed by the Linker - -When a function that isn’t transitively referenced from any visible function is present in the compiled object files that are being linked together, the linker will remove it (if the right build flags are provided). This is leveraged in 2 scenarios by the selective build system. - -##### Kernel Registration in the Dispatcher - -If an operator’s kernel isn’t needed, then it isn’t registered with the dispatcher. An unregistered kernel means that the function is unreachable, and it will be removed by the linker. - -##### Templated Selective Build - -The general idea here is that a class template specialization is used to select a class that either captures a reference to a function or not (depending on whether it’s used) and the linker can come along and clean out the unreferenced function. - -For example, in the code below, there’s no reference to the function “`fn2`”, so it will be cleaned up by the linker since it’s not referenced anywhere. - -```python -#include -#include - -template -struct FunctionSelector { - T fn_; - FunctionSelector(T fn): fn_(fn) {} - T get() { return this->fn_; } -}; - -// The "false" specialization of this class does NOT retain the argument passed -// to the class constructor, which means that the function pointer passed in -// is considered to be unreferenced in the program (unless it is referenced -// elsewhere). -template -struct FunctionSelector { - FunctionSelector(T) {} -}; - -template -FunctionSelector make_function_selector_true(T fn) { - return FunctionSelector(fn); -} - -template -FunctionSelector make_function_selector_false(T fn) { - return FunctionSelector(fn); -} - -typedef void(*fn_ptr_type)(); - -std::vector fns; - -template -void add_fn(FunctionSelector fs) { - fns.push_back(fs.get()); -} - -template -void add_fn(FunctionSelector) { - // Do nothing. -} - -// fn1 will be kept by the linker since it is added to the vector "fns" at -// runtime. -void fn1() { - printf("fn1\n"); -} - -// fn2 will be removed by the linker since it isn't referenced at all. -void fn2() { - printf("fn2\n"); -} - -int main() { - add_fn(make_function_selector_true(fn1)); - add_fn(make_function_selector_false(fn2)); -} -``` - -#### [2] Dead Code Eliminated by the Compiler - -C++ Compilers can detect dead ([unreachable](https://en.wikipedia.org/wiki/Unreachable_code)) code by analyzing the code’s control flow statically. For example, if there’s a code-path that comes after an **unconditional exception throw**, then all the code after it will be marked as dead code and not converted to object code by the compiler. Typically, compilers require the use of the `-fdce` flag to eliminate dead code. 
- -In the example below, you can see that the C++ code on the left (in the red boxes) doesn’t have any corresponding generated object code on the right. - -

- Figure 4: Dead Code Elimination by C++ Compilers

        - -This property is leveraged in the bodies of PyTorch kernel implementations that have a lot of repeated code to handle multiple dtypes of a Tensor. A [dtype](https://pytorch.org/docs/stable/tensor_attributes.html) is the underlying data-type that the Tensor stores elements of. This can be one of float, double, int64, bool, int8, etc… - -Almost every PyTorch CPU kernel uses a macro of the form AT_DISPATCH_ALL_TYPES* that is used to substitute some code specialized for every dtype that the kernel needs to handle. For example: - -```python -AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3( - kBool, kHalf, kBFloat16, dtype, "copy_kernel", [&] { - cpu_kernel_vec( - iter, - [=](scalar_t a) -> scalar_t { return a; }, - [=](Vectorized a) -> Vectorized { return a; }); -}); -``` - -The macro `AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3` internally has a switch-case statement that looks like the code in Figure-4 above. The tracing process records the dtypes triggered for the kernel tag "`copy_kernel`" and the build process processes these tags and inserts `throw` statements in every `case` statement that is handling the dtype that isn’t required for this kernel tag. - -This is how dtype selectivity is implemented in PyTorch’s Tracing Based Selective Build. - -## Conclusion - -Tracing Based Selective Build is a practical and scalable approach to selecting only the used parts of an application to retain code that static analysis can not detect. This code is usually extremely data/input dependent in nature. - -This article provides detailed insights into how Tracing Based Selective Build works under the hood, and the technical details related to its implementation. These techniques can also be applied to other applications and situations that can benefit from reduced binary size. \ No newline at end of file diff --git a/_posts/2022-10-28-PyTorch-1.13-release.md b/_posts/2022-10-28-PyTorch-1.13-release.md deleted file mode 100644 index 23660fb4900e..000000000000 --- a/_posts/2022-10-28-PyTorch-1.13-release.md +++ /dev/null @@ -1,168 +0,0 @@ ---- -layout: blog_detail -title: "PyTorch 1.13 release, including beta versions of functorch and improved support for Apple’s new M1 chips." -author: Team PyTorch -featured-img: "/assets/images/blog-2022-10-25-Pytorch-1.13-Release.png" ---- - -We are excited to announce the release of PyTorch® 1.13 ([release note](https://github.com/pytorch/pytorch/releases/tag/v1.13.0))! This includes Stable versions of BetterTransformer. We deprecated CUDA 10.2 and 11.3 and completed migration of CUDA 11.6 and 11.7. Beta includes improved support for Apple M1 chips and functorch, a library that offers composable vmap (vectorization) and autodiff transforms, being included in-tree with the PyTorch release. This release is composed of over 3,749 commits and 467 contributors since 1.12.1. We want to sincerely thank our dedicated community for your contributions. - -Summary: - -- The [BetterTransformer](#stable-features) feature set supports fastpath execution for common Transformer models during Inference out-of-the-box, without the need to modify the model. Additional improvements include accelerated add+matmul linear algebra kernels for sizes commonly used in Transformer models and Nested Tensors is now enabled by default. 
- -- Timely [deprecating older CUDA versions](#introduction-of-cuda-116-and-117-and-deprecation-of-cuda-102-and-113) allows us to proceed with introducing the latest CUDA version as they are introduced by Nvidia®, and hence allows support for C++17 in PyTorch and new NVIDIA Open GPU Kernel Modules. - -- Previously, [functorch](#beta-features) was released out-of-tree in a separate package. After installing PyTorch, a user will be able to `import functorch` and use functorch without needing to install another package. - -- PyTorch is offering native builds for Apple® silicon machines that use Apple's new [M1 chip](#beta-support-for-m1-devices) as a beta feature, providing improved support across PyTorch's APIs. - - - -Along with 1.13, we are also releasing major updates to the PyTorch libraries, more details can be found in this [blog](https://pytorch.org/blog/new-library-updates-in-pytorch-1.13/). - -## Stable Features - -### (Stable) BetterTransformer API - -The [BetterTransformer](https://pytorch.org/blog/a-better-transformer-for-fast-transformer-encoder-inference/) feature set, first released in PyTorch 1.12, is stable. PyTorch BetterTransformer supports fastpath execution for common Transformer models during Inference out-of-the-box, without the need to modify the model. To complement the improvements in Better Transformer, we have also accelerated add+matmul linear algebra kernels for sizes commonly used in Transformer models. - -Reflecting the performance benefits for many NLP users, Nested Tensors use for Better Transformer is now enabled by default. To ensure compatibility, a mask check is performed to ensure a contiguous mask is supplied. In Transformer Encoder, the mask check for src_key_padding_mask may be suppressed by setting mask_check=False. This accelerates processing for users than can guarantee that only aligned masks are provided. Finally, better error messages are provided to diagnose incorrect inputs, together with improved diagnostics why fastpath execution cannot be used. - -Better Transformer is directly integrated into the PyTorch TorchText library, enabling TorchText users to transparently and automatically take advantage of BetterTransformer speed and efficiency performance. ([Tutorial](https://pytorch.org/tutorials/beginner/bettertransformer_tutorial.html)) - -

- Figure: BetterTransformer fastpath execution is now stable and enables sparsity optimization using Nested Tensor representation as default

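To make the default fastpath behaviour above concrete, a sketch along the following lines (dimensions and sequence lengths are arbitrary) runs a stock encoder in inference mode with a padding mask, which is the situation where the Nested Tensor path can skip work on padded positions:

```python
import torch
import torch.nn as nn

layer = nn.TransformerEncoderLayer(d_model=256, nhead=8, batch_first=True)
encoder = nn.TransformerEncoder(layer, num_layers=6)  # Nested Tensor use is on by default

src = torch.rand(32, 40, 256)                          # (batch, sequence, features)
padding_mask = torch.zeros(32, 40, dtype=torch.bool)   # True marks padded positions
padding_mask[:, 30:] = True                            # e.g. the last 10 tokens are padding

encoder.eval()
with torch.inference_mode():
    out = encoder(src, src_key_padding_mask=padding_mask)
```

If you can guarantee that only aligned masks are supplied, the contiguity check can be suppressed by constructing the encoder with `mask_check=False`, as noted above.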
        - -### Introduction of CUDA 11.6 and 11.7 and deprecation of CUDA 10.2 and 11.3 - -Timely deprecating older CUDA versions allows us to proceed with introducing the latest CUDA version as they are introduced by Nvidia®, and hence allows developers to use the latest features of CUDA and benefit from correctness fixes provided by the latest version. - -Decommissioning of CUDA 10.2. CUDA 11 is the first CUDA version to support C++17. Hence decommissioning legacy CUDA 10.2 was a major step in adding support for C++17 in PyTorch. It also helps to improve PyTorch code by eliminating legacy CUDA 10.2 specific instructions. - -Decommissioning of CUDA 11.3 and introduction of CUDA 11.7 brings compatibility support for the new NVIDIA Open GPU Kernel Modules and another significant highlight is the lazy loading support. CUDA 11.7 is shipped with cuDNN 8.5.0 which contains a number of optimizations accelerating transformer-based models, 30% reduction in library size , and various improvements in the runtime fusion engine. Learn more on CUDA 11.7 with our [release notes](https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html). - -## Beta Features - -### (Beta) functorch - -Inspired by [Google® JAX](https://github.com/google/jax), functorch is a library that offers composable vmap (vectorization) and autodiff transforms. It enables advanced autodiff use cases that would otherwise be tricky to express in PyTorch. Examples include: - - - - -We’re excited to announce that, as a first step towards closer integration with PyTorch, functorch has moved to inside the PyTorch library and no longer requires the installation of a separate functorch package. After installing PyTorch via conda or pip, you’ll be able to `import functorch’ in your program. Learn more with our [detailed instructions](https://pytorch.org/functorch/1.13/install.html), [nightly](https://pytorch.org/functorch/nightly/) and [release notes](https://github.com/pytorch/pytorch/releases). - -### (Beta) Intel® VTune™ Profiler's Instrumentation and Tracing Technology APIs (ITT) integration - -PyTorch users are able to visualize op-level timeline of PyTorch scripts execution in Intel® VTune™ Profiler when they need to analyze per-op performance with low-level performance metrics on Intel platforms. - -```python -with torch.autograd.profiler.emit_itt(): - for i in range(10): - torch.itt.range_push('step_{}'.format(i)) - model(input) - torch.itt.range_pop() -``` - - -Learn more with our [tutorial](https://pytorch.org/tutorials/recipes/profile_with_itt.html). - -### (Beta) NNC: Add BF16 and Channels last support - -TorchScript graph-mode inference performance on x86 CPU is boosted by adding channels last and BF16 support to NNC. PyTorch users may benefit from channels last optimization on most popular x86 CPUs and benefit from BF16 optimization on Intel Cooper Lake Processor and Sapphire Rapids Processor. >2X geomean performance boost is observed on broad vision models with these two optimizations on Intel Cooper Lake Processor. - -The performance benefit can be obtained with existing TorchScript, channels last and BF16 Autocast APIs. See code snippet below. We will migrate the optimizations in NNC to the new PyTorch DL Compiler TorchInductor. 
- - - -```python -import torch -import torchvision.models as models -model = models.resnet50(pretrained=True) -# Convert the model to channels-last -model = model.to(memory_format=torch.channels_last) -model.eval() -data = torch.rand(1, 3, 224, 224) -# Convert the data to channels-lastdata = data.to(memory_format=torch.channels_last) -# Enable autocast to run with BF16 -with torch.cpu.amp.autocast(), torch.no_grad(): -# Trace the model -model = torch.jit.trace(model, torch.rand(1, 3, 224, 224)) - model = torch.jit.freeze(model) - # Run the traced model - model(data) -``` - -### (Beta) Support for M1 Devices - -Since v1.12, PyTorch has been offering native builds for Apple® silicon machines that use Apple's new M1 chip as a prototype feature. In this release, we bring this feature to beta, providing improved support across PyTorch's APIs. - -We now run tests for all submodules except `torch.distributed` on M1 macOS 12.6 instances. With this improved testing, we were able to fix features such as cpp extension and convolution correctness for certain inputs. - -To get started, just install PyTorch v1.13 on your Apple silicon Mac running macOS 12 or later with a native version (arm64) of Python. Learn more with our [release notes](https://github.com/pytorch/pytorch/releases). - -## Prototype Features - - - -### (Prototype) Arm® Compute Library (ACL) backend support for AWS Graviton - -We achieved substantial improvements for CV and NLP inference on aarch64 cpu with Arm Compute Library (acl) to enable acl backend for pytorch and torch-xla modules. Highlights include: - - -- Enabled mkldnn + acl as the default backend for aarch64 torch wheel. -- Enabled mkldnn matmul operator for aarch64 bf16 device. -- Brought TensorFlow xla+acl feature into torch-xla. We enhanced the TensorFlow xla with Arm Compute Library runtime for aarch64 cpu. These changes are included in TensorFlow master and then the upcoming TF 2.10. Once the torch-xla repo is updated for the tensorflow commit, it will have compiling support for torch-xla. We observed ~2.5-3x improvement for MLPerf Bert inference compared to the torch 1.12 wheel on Graviton3. - -### (Prototype) CUDA Sanitizer - -When enabled, the sanitizer begins to analyze low-level CUDA operations invoked as a result of the user’s PyTorch code to detect data race errors caused by unsynchronized data access from different CUDA streams. The errors found are then printed along with stack traces of faulty accesses, much like [Thread Sanitizer](https://clang.llvm.org/docs/ThreadSanitizer.html) does. An example of a simple error and the output produced by the sanitizer can be viewed [here](https://gist.github.com/sypneiwski/5989d634f7090913b80012be835e811d). It will be especially useful for machine learning applications, where corrupted data can be easy to miss for a human and the errors may not always manifest themselves; the sanitizer will always be able to detect them. - -### (Prototype) Limited Python 3.11 support - -Binaries for Linux with Python 3.11 support are available to download via pip. Please follow the instructions on the [get started page](https://pytorch.org/get-started/locally/). Please note that Python 3.11 support is only a preview. In particular, features including Distributed, Profiler, FX and JIT might not be fully functional yet. 
diff --git a/_posts/2022-10-28-new-library-updates-in-pytorch-1.13.md b/_posts/2022-10-28-new-library-updates-in-pytorch-1.13.md deleted file mode 100644 index 4882d1538079..000000000000 --- a/_posts/2022-10-28-new-library-updates-in-pytorch-1.13.md +++ /dev/null @@ -1,358 +0,0 @@ ---- -layout: blog_detail -title: "New Library Updates in PyTorch 1.13" -author: Team PyTorch -featured-img: "assets/images/new-library-updates-in-pytorch-1.13-2.jpg" ---- - -## Summary - -We are bringing a number of improvements to the current PyTorch libraries, alongside the PyTorch 1.13 [release](https://github.com/pytorch/pytorch/releases). These updates demonstrate our focus on developing common and extensible APIs across all domains to make it easier for our community to build ecosystem projects on PyTorch. - -Along with **1.13**, we are releasing updates to the PyTorch Libraries, please find them below. - -### TorchAudio - -#### (Beta) Hybrid Demucs Model and Pipeline - -Hybrid Demucs is a music source separation model that uses both spectrogram and time domain features. It has demonstrated state-of-the-art performance in the Sony® Music DeMixing Challenge. (citation: [https://arxiv.org/abs/2111.03600](https://arxiv.org/abs/2111.03600)) - -The TorchAudio v0.13 release includes the following features - -- MUSDB_HQ Dataset, which is used in Hybrid Demucs training ([docs](https://pytorch.org/audio/0.13.0/generated/torchaudio.datasets.MUSDB_HQ.html#torchaudio.datasets.MUSDB_HQ)) -- Hybrid Demucs model architecture ([docs](https://pytorch.org/audio/0.13.0/generated/torchaudio.models.HDemucs.html#torchaudio.models.HDemucs)) -- Three factory functions suitable for different sample rate ranges -- Pre-trained pipelines ([docs](https://pytorch.org/audio/0.13.0/pipelines.html#id46)) -- SDR Results of pre-trained pipelines on MUSDB_HQ test set -- Tutorial that steps through music source separation using the pretrained pipeline ([docs](https://pytorch.org/audio/0.13.0/tutorials/hybrid_demucs_tutorial.html)) - -| Pipeline | All | Drums | Bass | Other | Vocals | -|----------------------------------------|-------|-------|--------|-------|--------| -| HDEMUCS_HIGH_MUSDB* | 6.42 | 7.76 | 6.51 | 4.47 | 6.93 | -| HDEMUCS_HIGH_MUSDB_PLUS** | 9.37 | 11.38 | 10.53 | 7.24 | 8.32 | - -

        * Trained on the training data of the MUSDB-HQ dataset.
        ** Trained on both the training and test sets of MUSDB-HQ, plus 150 extra songs from an internal database that were specifically produced for Meta.

        - -```python -from torchaudio.pipelines import HDEMUCS_HIGH_MUSDB_PLUS - -bundle = HDEMUCS_HIGH_MUSDB_PLUS -model = bundle.get_model() -sources_list = model.sources - -mixture, samplerate = torchaudio.load("song.wav") -sources = model(mixture) -audios = dict(zip(sources_list, sources) -``` - -Special thanks to Alexandre Defossez for the guidance. - -#### (Beta) Datasets and Metadata Mode for SUPERB Benchmark - -TorchAudio adds support for various audio-related datasets used in downstream tasks for benchmarking self-supervised learning models. With the addition of several new datasets, there is now support for the downstream tasks in version 1 of the [SUPERB benchmark](https://superbbenchmark.org/), which can be found in the [s3prl repository](https://github.com/s3prl/s3prl/blob/master/s3prl/downstream/docs/superb.md). - -For these datasets, we also add metadata support through a `get_metadata` function, enabling faster dataset iteration or preprocessing without the need to load waveforms. The function returns the same features as `__getitem__`, except it returns the relative waveform path rather than the loaded waveform. - -Datasets with metadata functionality - -- LIBRISPEECH ([docs](https://pytorch.org/audio/0.13.0/generated/torchaudio.datasets.LIBRISPEECH.html#torchaudio.datasets.LIBRISPEECH)) -- LibriMix ([docs](https://pytorch.org/audio/0.13.0/generated/torchaudio.datasets.LibriMix.html#torchaudio.datasets.LibriMix)) -- QUESST14 ([docs](https://pytorch.org/audio/0.13.0/generated/torchaudio.datasets.QUESST14.html#torchaudio.datasets.QUESST14)) -- SPEECHCOMMANDS ([docs](https://pytorch.org/audio/0.13.0/generated/torchaudio.datasets.SPEECHCOMMANDS.html#torchaudio.datasets.SPEECHCOMMANDS)) -- (new) FluentSpeechCommands ([docs](https://pytorch.org/audio/0.13.0/generated/torchaudio.datasets.FluentSpeechCommands.html#torchaudio.datasets.FluentSpeechCommands)) -- (new) Snips ([docs](https://pytorch.org/audio/0.13.0/generated/torchaudio.datasets.Snips.html#torchaudio.datasets.Snips)) -- (new) IEMOCAP ([docs](https://pytorch.org/audio/0.13.0/generated/torchaudio.datasets.IEMOCAP.html#torchaudio.datasets.IEMOCAP)) -- (new) VoxCeleb1 ([Identification](https://pytorch.org/audio/0.13.0/generated/torchaudio.datasets.VoxCeleb1Identification.html#torchaudio.datasets.VoxCeleb1Identification), [Verification](https://pytorch.org/audio/0.13.0/generated/torchaudio.datasets.VoxCeleb1Verification.html#torchaudio.datasets.VoxCeleb1Verification)) - -#### (Beta) Custom Language Model support in CTC Beam Search Decoding - -TorchAudio released a CTC beam search decoder in release 0.12, with KenLM language model support. This release, there is added functionality for creating custom Python language models that are compatible with the decoder, using the `torchaudio.models.decoder.CTCDecoderLM` wrapper. - -For more information on using a custom language model, please refer to the [documentation](https://pytorch.org/audio/0.13.0/generated/torchaudio.models.decoder.CTCDecoder.html#ctcdecoderlm) and [tutorial](https://pytorch.org/audio/0.13.0/tutorials/asr_inference_with_ctc_decoder_tutorial.html#custom-language-model). - -#### (Beta) StreamWriter - -torchaudio.io.StreamWriter is a class for encoding media including audio and video. This can handle a wide variety of codecs, chunk-by-chunk encoding and GPU encoding. 
- -```python -writer = StreamWriter("example.mp4") -writer.add_audio_stream( - sample_rate=16_000, - num_channels=2, -) -writer.add_video_stream( - frame_rate=30, - height=96, - width=128, - format="rgb24", -) -with writer.open(): - writer.write_audio_chunk(0, audio) - writer.write_video_chunk(1, video) -``` - -For more information, refer to [the documentation](https://pytorch.org/audio/0.13.0/generated/torchaudio.io.StreamWriter.html) and the following tutorials -- [StreamWriter Basic Usage](https://pytorch.org/audio/0.13.0/tutorials/streamwriter_basic_tutorial.html) -- [StreamWriter Advanced Usage](https://pytorch.org/audio/0.13.0/tutorials/streamwriter_advanced.html) -- [Hardware-Accelerated Video Decoding and Encoding](https://pytorch.org/audio/0.13.0/hw_acceleration_tutorial.html) - -### TorchData - -For a complete list of changes and new features, please visit [our repository’s 0.5.0 release note](https://github.com/pytorch/data/releases). - -#### (Prototype) DataLoader2 - -`DataLoader2` was introduced in the last release to execute `DataPipe` graph, with support for dynamic sharding for multi-process/distributed data loading, multiple backend ReadingServices, and `DataPipe` graph in-place modification (e.g. shuffle control). - -In this release, we further consolidated the API for `DataLoader2` and a [detailed documentation is now available here](https://pytorch.org/data/0.5/dataloader2.html). We continue to welcome early adopters and feedback, as well as potential contributors. If you are interested in trying it out, we encourage you to install the nightly version of TorchData. - -#### (Beta) Data Loading from Cloud Service Providers - -We extended our support to load data from additional cloud storage providers via DataPipes, now covering AWS, Google Cloud Storage, and Azure. A [tutorial is also available](https://pytorch.org/data/0.5/tutorial.html#working-with-cloud-storage-providers). We are open to feedback and feature requests. - -We also performed a simple benchmark, comparing the performance of data loading from AWS S3 and attached volume on an AWS EC2 instance. - -### torch::deploy (Beta) - -torch::deploy is now in Beta! torch::deploy is a C++ library for Linux based operating systems that allows you to run multiple Python interpreters in a single process. You can run your existing eager PyTorch models without any changes for production inference use cases. Highlights include: - -- Existing models work out of the box–no need to modify your python code to support tracing. -- Full support for your existing Python environment including C extensions. -- No need to cross process boundaries to load balance in multi-GPU serving environments. -- Model weight can be shared between multiple Python interpreters. -- A vastly improved installation and setup process. - -```Python -torch::deploy::InterpreterManager manager(4); - -// access one of the 4 interpreters -auto I = manager.acquireOne(); - -// run infer from your_model.py -I.global("your_model", "infer")({at::randn({10, 240, 320})}); -``` - -Learn more [here](https://github.com/pytorch/multipy). - -#### (Beta) CUDA/ROCm/CPU Backends - -torch::deploy now links against standard PyTorch Python distributions so all accelerators that PyTorch core supports such as CUDA and AMD/HIP work out of the box. - -- Can install any device variant of PyTorch via pip/conda like normal. 
-- [https://pytorch.org/get-started/locally/](https://pytorch.org/get-started/locally/) - -#### (Prototype) aarch64/arm64 support - -torch::deploy now has basic support for aarch64 Linux systems. - -- We're looking to gather feedback on it and learn more about arm use cases for eager PyTorch models. -- Learn more / share your use case at [https://github.com/pytorch/multipy/issues/64](https://github.com/pytorch/multipy/issues/64) - -### TorchEval - -#### (Prototype) Introducing Native Metrics Support for PyTorch - -TorchEval is a library built for users who want highly performant implementations of common metrics to evaluate machine learning models. It also provides an easy to use interface for building custom metrics with the same toolkit. Building your metrics with TorchEval makes running distributed training loops with [torch.distributed](https://pytorch.org/docs/stable/distributed.html) a breeze. - -Learn more with our [docs](https://pytorch.org/torcheval), see our [examples](https://pytorch.org/torcheval/stable/metric_example.html), or check out our [GitHub repo](http://github.com/pytorch/torcheval). - -### TorchMultimodal Release (Beta) - -Please watch for upcoming blogs in early November that will introduce TorchMultimodal, a PyTorch domain library for training SoTA multi-task multimodal models at scale, in more details; in the meantime, play around with the library and models through our [tutorial](https://pytorch.org/tutorials/beginner/flava_finetuning_tutorial.html). - -### TorchRec - -#### (Prototype) Simplified Optimizer Fusion APIs - -We’ve provided a simplified and more intuitive API for setting fused optimizer settings via apply_optimizer_in_backward. This new approach enables the ability to specify optimizer settings on a per-parameter basis and sharded modules will configure [FBGEMM’s TableBatchedEmbedding modules accordingly](https://github.com/pytorch/FBGEMM/blob/main/fbgemm_gpu/fbgemm_gpu/split_table_batched_embeddings_ops.py#L181). Additionally, this now let's TorchRec’s planner account for optimizer memory usage. This should alleviate reports of sharding jobs OOMing after using Adam using a plan generated from planner. - -#### (Prototype) Simplified Sharding APIs - -We’re introducing the shard API, which now allows you to shard only the embedding modules within a model, and provides an alternative to the current main entry point - DistributedModelParallel. This lets you have a finer grained control over the rest of the model, which can be useful for customized parallelization logic, and inference use cases (which may not require any parallelization on the dense layers). We’re also introducing construct_module_sharding_plan, providing a simpler interface to the TorchRec sharder. - -#### (Beta) Quantized Comms - -Applying [quantization or mixed precision](https://dlp-kdd.github.io/assets/pdf/a11-yang.pdf) to tensors in a collective call during model parallel training greatly improves training efficiency, with little to no effect on model quality. TorchRec now integrates with the [quantized comms library provided by FBGEMM GPU](https://github.com/pytorch/FBGEMM/blob/main/fbgemm_gpu/fbgemm_gpu/quantize_comm.py) and provides an interface to construct encoders and decoders (codecs) that surround the all_to_all, and reduce_scatter collective calls in the output_dist of a sharded module. We also allow you to construct your own codecs to apply to your sharded module. 
The codecs provided by FBGEMM allow FP16, BF16, FP8, and INT8 compression, and you may use different quantizations for the forward and backward passes.

### TorchSnapshot (Beta)

Along with PyTorch 1.13, we are releasing the beta version of TorchSnapshot, which is a performant, memory-efficient checkpointing library for PyTorch applications, designed with large, complex distributed workloads in mind. Highlights include:

- Performance: TorchSnapshot provides a fast checkpointing implementation employing various optimizations, including zero-copy serialization for most tensor types, overlapped device-to-host copy and storage I/O, and parallelized storage I/O.
- Memory Use: TorchSnapshot's memory usage adapts to the host's available resources, greatly reducing the chance of out-of-memory issues when saving and loading checkpoints.
- Usability: Simple APIs that are consistent between distributed and non-distributed workloads.

Learn more with our [tutorial](https://pytorch.org/torchsnapshot/main/getting_started.html).

### TorchVision

We are happy to introduce torchvision v0.14 [(release note)](https://github.com/pytorch/vision/releases). This version introduces a new [model registration API](https://pytorch.org/blog/easily-list-and-initialize-models-with-new-apis-in-torchvision/) to help users retrieve and list models and weights. It also includes new image and video classification models such as MViT, S3D, Swin Transformer V2, and MaxViT. Last but not least, we also have new primitives and augmentations such as the PolynomialLR scheduler and SimpleCopyPaste.

#### (Beta) Model Registration API

Following up on the [multi-weight support API](https://pytorch.org/blog/introducing-torchvision-new-multi-weight-support-api/) that was released in the previous version, we have added a new [model registration API](https://pytorch.org/blog/easily-list-and-initialize-models-with-new-apis-in-torchvision/) to help users retrieve models and weights. There are now 4 new methods under the torchvision.models module: get_model, get_model_weights, get_weight, and list_models. Here are examples of how we can use them:

```Python
import torchvision
from torchvision.models import get_model, get_model_weights, list_models


max_params = 5000000

tiny_models = []
for model_name in list_models(module=torchvision.models):
    weights_enum = get_model_weights(model_name)
    if len([w for w in weights_enum if w.meta["num_params"] <= max_params]) > 0:
        tiny_models.append(model_name)

print(tiny_models)
# ['mnasnet0_5', 'mnasnet0_75', 'mnasnet1_0', 'mobilenet_v2', ...]

model = get_model(tiny_models[0], weights="DEFAULT")
print(sum(x.numel() for x in model.state_dict().values()))
# 2239188
```

#### (Beta) New Video Classification Models

We added two new video classification models, MViT and S3D. MViT is a state-of-the-art video classification transformer model which has 80.757% accuracy on the Kinetics400 dataset, while S3D is a relatively small model with good accuracy for its size. These models can be used as follows:

```Python
import torch
from torchvision.models.video import *

# Video inputs are expected in (batch, channels, frames, height, width) layout.
video = torch.rand(1, 3, 16, 224, 224)
model = mvit_v2_s(weights="DEFAULT")
# model = s3d(weights="DEFAULT")
model.eval()
prediction = model(video)
```

Here is the table showing the accuracy of the new video classification models tested on the Kinetics400 dataset.
| **Model** | **Acc@1** | **Acc@5** |
|--------------------------------|-----------|-----------|
| mvit_v1_b | 81.474 | 95.776 |
| mvit_v2_s | 83.196 | 96.36 |
| s3d | 83.582 | 96.64 |

We would like to thank Haoqi Fan, Yanghao Li, Christoph Feichtenhofer and Wan-Yen Lo for their work on [PyTorchVideo](https://github.com/facebookresearch/pytorchvideo/) and their support during the development of the MViT model. We would like to thank Sophia Zhi for her contribution implementing the S3D model in torchvision.

#### (Stable) New Architecture and Model Variants

For Classification Models, we've added the Swin Transformer V2 architecture along with pre-trained weights for its tiny/small/base variants. In addition, we have added support for the MaxViT transformer. Here is an example of how to use the models:

```Python
import torch
from torchvision.models import *

image = torch.rand(1, 3, 224, 224)
model = swin_v2_t(weights="DEFAULT").eval()
# model = maxvit_t(weights="DEFAULT").eval()
prediction = model(image)
```

Here is the table showing the accuracy of the models tested on the ImageNet1K dataset.

| **Model** | **Acc@1** | **Acc@1 change over V1** | **Acc@5** | **Acc@5 change over V1** |
|---------------|-----------|--------------------------|-----------|--------------------------|
| swin_v2_t | 82.072 | + 0.598 | 96.132 | + 0.356 |
| swin_v2_s | 83.712 | + 0.516 | 96.816 | + 0.456 |
| swin_v2_b | 84.112 | + 0.530 | 96.864 | + 0.224 |
| maxvit_t | 83.700 | - | 96.722 | - |

We would like to thank [Ren Pang](https://github.com/ain-soph) and [Teodor Poncu](https://github.com/TeodorPoncu) for contributing these two models to torchvision.

### (Stable) New Primitives & Augmentations

In this release we've added the [SimpleCopyPaste](https://arxiv.org/abs/2012.07177) augmentation in our reference scripts and we upstreamed the PolynomialLR scheduler to PyTorch Core. We would like to thank [Lezwon Castelino](https://github.com/lezwon) and [Federico Pozzi](https://github.com/federicopozzi33) for their contributions. We are continuing our efforts to modernize TorchVision by adding more SoTA primitives, augmentations and architectures with the help of our community. If you are interested in contributing, have a look at the following [issue](https://github.com/pytorch/vision/issues/6323).

### Torch-TensorRT

#### (Prototype) TensorRT with FX2TRT frontend

Torch-TensorRT is the PyTorch integration for TensorRT, providing high-performance inference on NVIDIA GPUs. Torch-TRT allows for optimizing models directly in PyTorch for deployment, providing up to a 6x performance improvement.

Torch-TRT is an AoT compiler which ingests an nn.Module or TorchScript module, optimizes compatible subgraphs in TensorRT and leaves the rest to run in PyTorch. This gives users the performance of TensorRT, but the usability and familiarity of Torch.

Torch-TensorRT is part of the PyTorch ecosystem, and was released as v1.0 in November '21. There are currently two distinct front-ends: TorchScript and FX. Each provides the same value proposition and underlying operation, with the primary difference being the input and output formats (TS vs FX / Python).

The TorchScript front-end was included in v1.0 and should be considered stable. The FX front-end was first released in v1.2 and should be considered beta.
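To make the TorchScript path concrete, here is a minimal sketch of compiling a model with Torch-TensorRT. It is illustrative rather than canonical: the model (a torchvision ResNet-18), the input shape, and the precision settings are our own choices, and running it requires an NVIDIA GPU with TensorRT and the torch_tensorrt package installed.

```python
import torch
import torch_tensorrt
import torchvision.models as models

# Any TorchScript-able module can be used here; ResNet-18 is just a stand-in.
model = models.resnet18(weights="DEFAULT").eval().cuda()

# Compile via the TorchScript (TS) front-end; unsupported subgraphs fall back to PyTorch.
trt_model = torch_tensorrt.compile(
    model,
    inputs=[torch_tensorrt.Input((1, 3, 224, 224))],
    enabled_precisions={torch.float32, torch.half},  # allow FP32 and FP16 kernels
)

with torch.no_grad():
    prediction = trt_model(torch.randn(1, 3, 224, 224, device="cuda"))
```

The FX path exposes a similar workflow; see the FX getting started guide linked below.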
Relevant Links:

- [Github](https://github.com/pytorch/TensorRT)
- [Documentation](https://pytorch.org/TensorRT/)
- [Generic (TS) getting started guide](https://pytorch.org/TensorRT/getting_started/getting_started_with_python_api.html)
- [FX getting started guide](https://pytorch.org/TensorRT/tutorials/getting_started_with_fx_path.html)

#### (Stable) Introducing Torch-TensorRT

Torch-TensorRT is an integration for PyTorch that leverages the inference optimizations of TensorRT on NVIDIA GPUs. It takes advantage of TensorRT optimizations, such as FP16 and INT8 reduced precision, graph optimization, and operation fusion, while offering a fallback to native PyTorch when TensorRT does not support the model subgraphs. Currently, the library offers two frontend paths for converting a PyTorch model into a TensorRT engine: one through TorchScript (TS) and the other through the FX frontend. In both cases, the model is traced into an IR graph, which is then converted to a TensorRT engine.

Learn more with our [tutorial](https://pytorch.org/TensorRT/).

### TorchX

TorchX 0.3 updates include a new list API, experiment tracking, elastic training and improved scheduler support. There's also a new Multi-Objective NAS [tutorial](https://pytorch.org/tutorials/intermediate/ax_multiobjective_nas_tutorial.html) using TorchX + Ax.

#### (Prototype) List

The newly added list command and API allow you to list recently launched jobs and their statuses for a given scheduler directly from within TorchX.

- This removes the need for using secondary tools to list the jobs.
- Full programmatic access to recent jobs for integration with custom tools.

```bash
$ torchx list -s kubernetes
APP HANDLE                                       APP STATUS
-----------------------------------------------  ----------------
kubernetes://torchx/default:train-f2nx4459p5crr  SUCCEEDED
```

Learn more with our [documentation](https://pytorch.org/torchx/main/schedulers.html#torchx.schedulers.Scheduler.list).

#### (Prototype) Tracker

TorchX Tracker is a new prototype library that provides a flexible and customizable experiment and artifact tracking interface. This allows you to track inputs and outputs for jobs across multiple steps to make it easier to use TorchX with pipelines and other external systems.

```Python
from torchx import tracker

app_run = tracker.app_run_from_env()
app_run.add_metadata(lr=lr, gamma=gamma) # hyper parameters
app_run.add_artifact("model", "storage://path/mnist_cnn.pt") # logs / checkpoints
app_run.add_source(parent_run_id, "model") # lineage
```

Example:

- [https://github.com/pytorch/torchx/tree/main/torchx/examples/apps/tracker](https://github.com/pytorch/torchx/tree/main/torchx/examples/apps/tracker)
- [https://pytorch.org/torchx/main/tracker.html](https://pytorch.org/torchx/main/tracker.html)

#### (Prototype) Elastic Training and Autoscaling

Elasticity on Ray and Kubernetes: automatic scale-up of distributed training jobs when using a supported scheduler. Learn more with our [documentation](https://pytorch.org/torchx/main/components/distributed.html).

#### (Prototype) Scheduler Improvements: IBM® Spectrum LSF

Added prototype support for the IBM Spectrum LSF scheduler.

#### (Beta) AWS Batch Scheduler

The AWS Batch scheduler integration is now in beta.

- Log fetching and listing jobs are now supported.
- Added configs for job priorities and queue policies.
- Easily access the job UI via ui_url.

[https://pytorch.org/torchx/main/schedulers/aws_batch.html](https://pytorch.org/torchx/main/schedulers/aws_batch.html)

#### (Prototype) AnyPrecision Optimizer

AnyPrecision is a drop-in replacement for the AdamW optimizer that reduces GPU memory usage and enables two main features:

- The ability to train the entire model pipeline in full BFloat16, with Kahan summation ensuring precision. This can improve training throughput, especially on huge models, through reduced memory use and increased computation speed.
- The ability to change the variance state to BFloat16. This can reduce the overall memory required for model training, with additional speed improvements.

Find more information [here](https://github.com/pytorch/torchdistx/pull/52).

diff --git a/_posts/2022-11-03-extending-torchvisions-transforms-to-object-detection-segmentation-and-video-tasks.md b/_posts/2022-11-03-extending-torchvisions-transforms-to-object-detection-segmentation-and-video-tasks.md
deleted file mode 100644
index e4c15dd87834..000000000000
--- a/_posts/2022-11-03-extending-torchvisions-transforms-to-object-detection-segmentation-and-video-tasks.md
+++ /dev/null
@@ -1,172 +0,0 @@
---
layout: blog_detail
title: "Extending TorchVision's Transforms to Object Detection, Segmentation & Video tasks"
author: Philip Meier, Victor Fomin, Vasilis Vryniotis, Nicolas Hug
featured-img: "assets/images/Transforms-v2-feature-image.png"
---

**Note**: A previous version of this post was published in November 2022. We have updated this post with the most up-to-date info, in view of the upcoming 0.15 release of torchvision in March 2023, jointly with PyTorch 2.0.

TorchVision is extending its Transforms API! Here is what's new:

- You can use them not only for Image Classification but also for Object Detection, Instance & Semantic Segmentation and Video Classification.
- You can use new functional transforms for transforming Videos, Bounding Boxes and Segmentation Masks.

The API is completely backward compatible with the previous one, and remains the same to assist the migration and adoption. We are now releasing this new API as Beta in the torchvision.transforms.v2 namespace, and we would love to get early feedback from you to improve its functionality. Please [_reach out to us_](https://github.com/pytorch/vision/issues/6753) if you have any questions or suggestions.

## Limitations of current Transforms

The existing Transforms API of TorchVision (aka V1) only supports single images. As a result, it can only be used for classification tasks:

```python
from torchvision import transforms
trans = transforms.Compose([
    transforms.ColorJitter(contrast=0.5),
    transforms.RandomRotation(30),
    transforms.CenterCrop(480),
])
imgs = trans(imgs)
```

The above approach doesn't support Object Detection or Segmentation. This limitation made any non-classification Computer Vision tasks second-class citizens, as one couldn't use the Transforms API to perform the necessary augmentations. Historically this made it difficult to train high-accuracy models using TorchVision's primitives, and thus our Model Zoo lagged several points behind SoTA.

To circumvent this limitation, TorchVision offered [_custom implementations_](https://github.com/pytorch/vision/blob/main/references/detection/transforms.py) in its reference scripts that showcased how one could perform augmentations in each task.
Though this practice enabled us to train high-accuracy [_classification_](https://pytorch.org/blog/how-to-train-state-of-the-art-models-using-torchvision-latest-primitives/), [_object detection & segmentation_](https://pytorch.org/blog/pytorch-1.12-new-library-releases/#beta-object-detection-and-instance-segmentation) models, it was a hacky approach which made those transforms impossible to import from the TorchVision binary.

## The new Transforms API

The Transforms V2 API supports videos, bounding boxes, and segmentation masks, meaning that it offers native support for many Computer Vision tasks. The new solution is a drop-in replacement:

```python
import torchvision.transforms.v2 as transforms

# Exactly the same interface as V1:
trans = transforms.Compose([
    transforms.ColorJitter(contrast=0.5),
    transforms.RandomRotation(30),
    transforms.CenterCrop(480),
])
imgs, bboxes, labels = trans(imgs, bboxes, labels)
```

The new Transform Classes can receive any arbitrary number of inputs without enforcing specific order or structure:

```python
# Already supported:
trans(imgs)  # Image Classification
trans(videos)  # Video Tasks
trans(imgs, bboxes, labels)  # Object Detection
trans(imgs, bboxes, masks, labels)  # Instance Segmentation
trans(imgs, masks)  # Semantic Segmentation
trans({"image": imgs, "box": bboxes, "tag": labels})  # Arbitrary Structure

# Future support:
trans(imgs, bboxes, labels, keypoints)  # Keypoint Detection
trans(stereo_images, disparities, masks)  # Depth Perception
trans(image1, image2, optical_flows, masks)  # Optical Flow
trans(imgs_or_videos, labels)  # MixUp/CutMix-style Transforms
```

The Transform Classes make sure that they apply the same random transforms to all the inputs to ensure consistent results.

The functional API has been updated to support all necessary signal processing kernels (resizing, cropping, affine transforms, padding, etc.) for all inputs:

```python
from torchvision.transforms.v2 import functional as F


# High-level dispatcher, accepts any supported input type, fully BC
F.resize(inpt, size=[224, 224])
# Image tensor kernel
F.resize_image_tensor(img_tensor, size=[224, 224], antialias=True)
# PIL image kernel
F.resize_image_pil(img_pil, size=[224, 224], interpolation=BILINEAR)
# Video kernel
F.resize_video(video, size=[224, 224], antialias=True)
# Mask kernel
F.resize_mask(mask, size=[224, 224])
# Bounding box kernel
F.resize_bounding_box(bbox, size=[224, 224], spatial_size=[256, 256])
```

Under the hood, the API uses Tensor subclassing to wrap the input, attach useful meta-data and dispatch to the right kernel. For your data to be compatible with these new transforms, you can either use the provided dataset wrapper, which should work with most of the torchvision built-in datasets, or you can wrap your data manually into Datapoints:

```python
from torchvision.datasets import wrap_dataset_for_transforms_v2
ds = CocoDetection(..., transforms=v2_transforms)
ds = wrap_dataset_for_transforms_v2(ds)  # data is now compatible with transforms v2!

# Or wrap your data manually using the lower-level Datapoint classes:
from torchvision import datapoints

imgs = datapoints.Image(images)
vids = datapoints.Video(videos)
masks = datapoints.Mask(target["masks"])
bboxes = datapoints.BoundingBox(target["boxes"], format="XYXY", spatial_size=imgs.shape[-2:])
```
In addition to the new API, we now provide importable implementations for several data augmentations that are used in SoTA research, such as [_Large Scale Jitter_](https://github.com/pytorch/vision/blob/928b05cad36eadb13e169f03028767c8bcd1f21d/torchvision/transforms/v2/_geometry.py#L1109), [_AutoAugmentation_](https://github.com/pytorch/vision/blob/main/torchvision/transforms/v2/_auto_augment.py) methods and [_several_](https://github.com/pytorch/vision/blob/main/torchvision/transforms/v2/__init__.py) new Geometric, Color and Type Conversion transforms.

The API continues to support both PIL and Tensor backends for Images, single or batched input, and maintains JIT-scriptability on both the functional and class APIs. The new API has been [_verified_](https://github.com/pytorch/vision/pull/6433#issuecomment-1256741233) to achieve the same accuracy as the previous implementation.

## An end-to-end example

Here is an example of the new API using the following [_image_](https://user-images.githubusercontent.com/5347466/195350223-8683ef25-1367-4292-9174-c15f85c7358e.jpg). It works both with PIL images and Tensors. For more examples and tutorials, [_take a look at our gallery!_](https://pytorch.org/vision/0.15/auto_examples/index.html)

```python
from torchvision import io, utils
from torchvision import datapoints
from torchvision.transforms import v2 as T
from torchvision.transforms.v2 import functional as F

# Defining and wrapping input to appropriate Tensor Subclasses
path = "COCO_val2014_000000418825.jpg"
img = datapoints.Image(io.read_image(path))
# img = PIL.Image.open(path)
bboxes = datapoints.BoundingBox(
    [[2, 0, 206, 253], [396, 92, 479, 241], [328, 253, 417, 332],
     [148, 68, 256, 182], [93, 158, 170, 260], [432, 0, 438, 26],
     [422, 0, 480, 25], [419, 39, 424, 52], [448, 37, 456, 62],
     [435, 43, 437, 50], [461, 36, 469, 63], [461, 75, 469, 94],
     [469, 36, 480, 64], [440, 37, 446, 56], [398, 233, 480, 304],
     [452, 39, 463, 63], [424, 38, 429, 50]],
    format=datapoints.BoundingBoxFormat.XYXY,
    spatial_size=F.get_spatial_size(img),
)
labels = [59, 58, 50, 64, 76, 74, 74, 74, 74, 74, 74, 74, 74, 74, 50, 74, 74]
# Defining and applying Transforms V2
trans = T.Compose(
    [
        T.ColorJitter(contrast=0.5),
        T.RandomRotation(30),
        T.CenterCrop(480),
    ]
)
img, bboxes, labels = trans(img, bboxes, labels)
# Visualizing results
viz = utils.draw_bounding_boxes(F.to_image_tensor(img), boxes=bboxes)
F.to_pil_image(viz).show()
```

## Development milestones and future work

Here is where we are in development:

- [x] Design API
- [x] Write Kernels for transforming Videos, Bounding Boxes, Masks and Labels
- [x] Rewrite all existing Transform Classes (stable + references) on the new API:
  - [x] Image Classification
  - [x] Video Classification
  - [x] Object Detection
  - [x] Instance Segmentation
  - [x] Semantic Segmentation
- [x] Verify the accuracy of the new API for all supported Tasks and Backends
- [x] Speed Benchmarks and Performance Optimizations (in progress - planned for Dec)
- [x] Graduate from Prototype (planned for Q1)
- [ ] Add support for Depth Perception, Keypoint Detection, Optical Flow and more (future)
- [ ] Add smooth support for batch-wise transforms like MixUp and CutMix
We would love to get [_feedback_](https://github.com/pytorch/vision/issues/6753) from you to improve its functionality. Please reach out to us if you have any questions or suggestions.

diff --git a/_posts/2022-11-10-pytorch-enterprise-support-update.md b/_posts/2022-11-10-pytorch-enterprise-support-update.md
deleted file mode 100644
index 7d7241c4e6b2..000000000000
--- a/_posts/2022-11-10-pytorch-enterprise-support-update.md
+++ /dev/null
@@ -1,18 +0,0 @@
---
layout: blog_detail
title: "PyTorch Enterprise Support Program Update"
author: Team PyTorch
featured-img: ""
---

On May 25, 2021, we announced the [PyTorch Enterprise Support Program](https://pytorch.org/blog/announcing-pytorch-enterprise/) (ESP) that enabled providers to develop and offer tailored enterprise-grade support to their customers.

The program enabled Program-certified service providers to develop and offer tailored enterprise-grade support to their customers through the contribution of hotfixes and other improvements requested by PyTorch enterprise users who were developing models in production at scale for mission-critical applications. However, after evaluating community feedback, we found that ongoing ESP support was not necessary at this time, and we will immediately divert these resources to other areas to improve the user experience for the entire community.

Today, we are removing the PyTorch long-term support (LTS 1.8.2) download link from the "[Start Locally](https://pytorch.org/get-started/locally/)" download option on the "Get Started" page in order to simplify the user experience. You can still download PyTorch v1.8.2 from the [previous versions](/get-started/previous-versions/#v182-with-lts-support) page. Please note that while it is being deprecated, it is only supported for Python. If there are any updates to ESP/LTS, we will update future blogs.

        - -Please reach out to [marketing@pytorch.org](mailto:marketing@pytorch.org) with any questions. diff --git a/_posts/2022-11-17-introducing-torchmultimodal.md b/_posts/2022-11-17-introducing-torchmultimodal.md deleted file mode 100644 index 43359f58829b..000000000000 --- a/_posts/2022-11-17-introducing-torchmultimodal.md +++ /dev/null @@ -1,89 +0,0 @@ ---- -layout: blog_detail -title: "Introducing TorchMultimodal - a library for accelerating exploration in Multimodal AI" -author: Kartikay Khandelwal, Ankita De -featured-img: "assets/images/torch-multimodal-feature-image.png" ---- - -We are announcing TorchMultimodal Beta, a PyTorch domain library for training SoTA multi-task multimodal models at scale. The library provides composable building blocks (modules, transforms, loss functions) to accelerate model development, SoTA model architectures (FLAVA, MDETR, Omnivore) from published research, training and evaluation scripts, as well as notebooks for exploring these models. The library is under active development, and we’d love to hear your feedback! You can find more details on how to get started [here](https://github.com/facebookresearch/multimodal#installation). - -## Why TorchMultimodal? - -Interest is rising around AI models that understand multiple input types (text, images, videos and audio signals), and optionally use this understanding to generate different forms of outputs (sentences, pictures, videos). Recent work from FAIR such as [FLAVA](https://arxiv.org/abs/2112.04482), [Omnivore](https://arxiv.org/pdf/2201.08377.pdf) and [data2vec](https://arxiv.org/abs/2202.03555) have shown that [multimodal models for understanding](https://ai.facebook.com/blog/advances-in-multimodal-understanding-research-at-meta-ai/) are competitive with unimodal counterparts, and in some cases are establishing the new state-of-the art. Generative models such as [Make-a-video](https://ai.facebook.com/blog/generative-ai-text-to-video/) and [Make-a-scene](https://ai.facebook.com/blog/greater-creative-control-for-ai-image-generation/) are redefining what modern AI systems can do. - -As interest in multimodal AI has grown, researchers are looking for tools and libraries to quickly experiment with ideas, and build on top of the latest research in the field. While the PyTorch ecosystem has a rich repository of libraries and frameworks, it’s not always obvious how components from these interoperate with each other, or how they can be stitched together to build SoTA multimodal models. - -TorchMultimodal solves this problem by providing: - -- **Composable and easy-to-use building blocks** which researchers can use to accelerate model development and experimentation in their own workflows. These are designed to be modular, and can be easily extended to handle new modalities. - -- **End-to-end examples for training and evaluating the latest models from research.** These should serve as starting points for ongoing/future research, as well as examples for using advanced features such as integrating with FSDP and activation checkpointing for scaling up model and batch sizes. - -## Introducing TorchMultimodal - -TorchMultimodal is a PyTorch domain library for training multi-task multimodal models at scale. In the repository, we provide: - -- **[Building Blocks](https://github.com/facebookresearch/multimodal/tree/main/torchmultimodal)**. A collection of modular and composable building blocks like models, fusion layers, loss functions, datasets and utilities. 
Some examples include: - - - [Contrastive Loss with Temperature](https://github.com/facebookresearch/multimodal/blob/4d2236877467ff8f56aa1935dd92d7782751b135/torchmultimodal/modules/losses/contrastive_loss_with_temperature.py#L145). Commonly used function for training models like CLIP and FLAVA. We also include variants such as [ImageTextContrastiveLoss](https://github.com/facebookresearch/multimodal/blob/4d2236877467ff8f56aa1935dd92d7782751b135/torchmultimodal/modules/losses/albef.py#L14) used in models like ALBEF. - - - [Codebook layers](https://github.com/facebookresearch/multimodal/blob/main/torchmultimodal/modules/layers/codebook.py#L31) which compresses high dimensional data by nearest neighbor lookup in an embedding space and is a vital component of VQVAEs (provided as a [model](https://github.com/facebookresearch/multimodal/blob/4d2236877467ff8f56aa1935dd92d7782751b135/torchmultimodal/models/vqvae.py#L26) in the repository). - - - [Shifted-window Attention](https://github.com/facebookresearch/multimodal/blob/main/torchmultimodal/modules/encoders/swin_transformer_3d_encoder.py#L76) window based multi-head self attention which is a vital component of encoders like Swin 3D Transformers. - - - [Components for CLIP.](https://github.com/facebookresearch/multimodal/tree/4d2236877467ff8f56aa1935dd92d7782751b135/torchmultimodal/models/clip) A popular model published by OpenAI which has proven to be extremely effective at learning text and image representations. - - - [Multimodal GPT.](https://github.com/facebookresearch/multimodal/blob/4d2236877467ff8f56aa1935dd92d7782751b135/torchmultimodal/models/gpt.py) An abstraction that extends OpenAI’s [GPT](https://cdn.openai.com/research-covers/language-unsupervised/language_understanding_paper.pdf) architecture for multimodal generation when combined with the [generation utility](https://github.com/facebookresearch/multimodal/blob/main/torchmultimodal/utils/generate.py#L33). - - - [MultiHeadAttention](https://github.com/facebookresearch/multimodal/blob/main/torchmultimodal/modules/layers/attention.py#L134). A critical component for attention-based models with support for fast auto-regressive decoding. - -- **[Examples](https://github.com/facebookresearch/multimodal/tree/main/examples)**. A collection of examples that show how to combine these building blocks with components and common infrastructure (Lightning, TorchMetrics) from across the PyTorch Ecosystem to replicate state-of-the-art models published in literature. We currently provide five examples, which include. - - - [FLAVA](https://github.com/facebookresearch/multimodal/tree/main/examples/flava) \[[paper](https://arxiv.org/abs/2112.04482)\]. Official code for the paper accepted at CVPR, including a tutorial on finetuning FLAVA. - - - [MDETR](https://github.com/facebookresearch/multimodal/tree/main/examples/mdetr) \[[paper](https://arxiv.org/abs/2104.12763)\]. Collaboration with authors from NYU to provide an example which alleviates interoperability pain points in the PyTorch ecosystem, including a [notebook](https://github.com/facebookresearch/multimodal/blob/main/examples/mdetr/MDETRTutorial.ipynb) on using MDETR for phrase grounding and visual question answering. - - - [Omnivore](https://github.com/facebookresearch/multimodal/tree/main/examples/omnivore) \[[paper](https://arxiv.org/abs/2204.08058)\]. 
First example in TorchMultimodal of a model which deals with Video and 3D data, including a [notebook](https://github.com/facebookresearch/multimodal/blob/main/examples/omnivore/omnivore_inference_demo.ipynb) for exploring the model.

  - [MUGEN](https://github.com/facebookresearch/multimodal/tree/main/examples/mugen) \[[paper](https://arxiv.org/abs/2204.08058)\]. Foundational work for auto-regressive [generation](https://colab.research.google.com/drive/1C3ZbH_l19g_KqW3CPeX2-8Q2sOUCpmZo?usp=sharing) and [retrieval](https://colab.research.google.com/drive/1gZfz1jsy79CNCK9t2_r43yt3z7v-w4HS?usp=sharing), including demos for text-video generation and retrieval with a large-scale synthetic dataset enriched from OpenAI [coinrun](https://github.com/openai/coinrun).

  - [ALBEF](https://github.com/facebookresearch/multimodal/tree/main/examples/albef) \[[paper](https://arxiv.org/abs/2107.07651)\]. Code for the model, including a [notebook](https://github.com/facebookresearch/multimodal/blob/main/examples/albef/vqa_with_albef.ipynb) for using this model for Visual Question Answering.

The following code snippet showcases an example usage of several TorchMultimodal components related to CLIP:

```python
# instantiate clip transform
clip_transform = CLIPTransform()

# pass the transform to your dataset. Here we use coco captions
dataset = CocoCaptions(root=..., annFile=..., transforms=clip_transform)
dataloader = DataLoader(dataset, batch_size=16)

# instantiate model. Here we use clip with vit-L as the image encoder
model = clip_vit_l14()

# define loss and other things needed for training
clip_loss = ContrastiveLossWithTemperature()
optim = torch.optim.AdamW(model.parameters(), lr=1e-5)
epochs = 1

# write your train loop
for _ in range(epochs):
    for batch_idx, batch in enumerate(dataloader):
        image, text = batch
        image_embeddings, text_embeddings = model(image, text)
        loss = clip_loss(image_embeddings, text_embeddings)
        optim.zero_grad()
        loss.backward()
        optim.step()
```

Apart from the code, we are also **releasing a tutorial for fine-tuning multimodal foundation models, and a blog post (with code pointers) on how to scale up such models using techniques from PyTorch Distributed (FSDP and activation checkpointing)**. We hope such examples and tutorials will serve to demystify a number of advanced features available in the PyTorch ecosystem.

## What's Next?

While this is an exciting launch, there's a lot more to come. The library is under development and we are working on adding some of the exciting developments in the space of diffusion models, and examples to showcase common trends from research. As you explore and use the library, we'd love to hear any feedback you might have! You can find more details on how to get started [here](https://github.com/facebookresearch/multimodal#installation).

## Team

The primary contributors and developers of TorchMultimodal include Ankita De, Evan Smothers, Kartikay Khandelwal, Lan Gong, Laurence Rouesnel, Nahiyan Malik, Rafi Ayub and Yosua Michael Maranatha.
diff --git a/_posts/2022-11-21-scaling-multimodal-foundation-models-in-torchmultimodal-with-pytorch-distributed.md b/_posts/2022-11-21-scaling-multimodal-foundation-models-in-torchmultimodal-with-pytorch-distributed.md deleted file mode 100644 index 90f9a67a0c1b..000000000000 --- a/_posts/2022-11-21-scaling-multimodal-foundation-models-in-torchmultimodal-with-pytorch-distributed.md +++ /dev/null @@ -1,183 +0,0 @@ ---- -layout: blog_detail -title: "Scaling Multimodal Foundation Models in TorchMultimodal with Pytorch Distributed" -author: Ankita De, Edward Wang (EcoF), Rohan Varma, Anjali Sridhar, Kartikay Khandelwal -featured-img: "/assets/images/scaling-multimodal-image1-diagram-of-multimodal-flava-new.png" ---- - -## Introduction - -In recent years, scaling model sizes has become a promising area of research. In the field of NLP, language models have gone from hundreds of millions of parameters (BERT) to hundreds of billions of parameters (GPT-3) demonstrating significant improvements on downstream tasks. The [scaling laws](https://arxiv.org/pdf/2001.08361.pdf) for large scale language models have also been studied extensively in the industry. A similar trend can be observed in the vision field, with the community moving to transformer based models (like [Vision Transformer](https://arxiv.org/pdf/2010.11929.pdf), [Masked Auto Encoders](https://arxiv.org/pdf/2111.06377.pdf)) as well. It is clear that individual modalities - text, image, video - have benefited massively from recent advancements in scale, and frameworks have quickly adapted to accommodate larger models. - -At the same time, multimodality is becoming increasingly important in research with tasks like image-text retrieval, visual question-answering, visual dialog and text to image generation gaining traction in real world applications. Training large scale multimodal models is the natural next step and we already see several efforts in this area like [CLIP](https://openai.com/blog/clip/) from OpenAI, [Parti](https://parti.research.google/) from Google and [CM3](https://arxiv.org/pdf/2201.07520.pdf) from Meta. - -In this blog, we present a case study demonstrating the scaling of [FLAVA](https://flava-model.github.io/) to 10B params using techniques from PyTorch Distributed. FLAVA is a vision and language foundation model, available in [TorchMultimodal](https://github.com/facebookresearch/multimodal/tree/main/torchmultimodal/models/flava), which has shown competitive performance on both unimodal and multimodal benchmarks. We also give the relevant code pointers in this blog. The instructions for running an example script to scale FLAVA can be found [here](https://github.com/facebookresearch/multimodal/tree/main/examples/flava/native). - -## Scaling FLAVA Overview - -FLAVA is a foundation multimodal model which consists of transformer based image and text encoders followed by a transformer-based multimodal fusion module. It is pretrained on both unimodal and multimodal data with a diverse set of losses. This includes masked language, image and multimodal modeling losses that require the model to reconstruct the original input from its context (self-supervised learning). It also uses image text matching loss over positive and negative examples of aligned image-text pairs as well as CLIP style contrastive loss. In addition to multimodal tasks (like image-text retrieval), FLAVA demonstrated competitive performance on unimodal benchmarks as well (GLUE tasks for NLP and image classification for vision). - -

        - -The original FLAVA model has ~350M parameters and uses ViT-B16 configurations (from the [Vision Transformer paper](https://arxiv.org/pdf/2010.11929.pdf)) for image and text encoders. The multimodal fusion transformer follows the unimodal encoders but with half the number of layers. We explore increasing the size of each encoder to larger ViT variants. - -Another aspect of scaling is adding the ability to increase the batch size. FLAVA makes use of contrastive loss over in-batch negatives, which typically benefits from large batch size (as studied [here](https://openreview.net/pdf?id=U2exBrf_SJh)). The largest training efficiency or throughput is also generally achieved when operating near maximum possible batch sizes as determined by the amount of GPU memory available (also see the experiments section). - -The following table displays the different model configurations we experimented with. We also determine the maximum batch size that was able to fit in memory for each configuration in the experiments section. - -| Approx Model params | Hidden size | MLP size | Heads | Unimodal layers | Multimodal layers | Model size (fp32) | -|-----------------------|---------------|----------|---------|-------------------|---------------------|---------------------| -| 350M (original) | 768 | 3072 | 12 | 12 | 6 | 1.33GB | -| 900M | 1024 | 4096 | 16 | 24 | 12 | 3.48GB | -| 1.8B | 1280 | 5120 | 16 | 32 | 16 | 6.66GB | -| 2.7B | 1408 | 6144 | 16 | 40 | 20 | 10.3GB | -| 4.8B | 1664 | 8192 | 16 | 48 | 24 | 18.1GB | -| 10B | 2048 | 10240 | 16 | 64 | 40 | 38GB | - -## Optimization overview - -PyTorch offers several native techniques to efficiently scale models. In the following sections, we go over some of these techniques and show how they can be applied to scale up a FLAVA model to 10 billion parameters. - -## Distributed Data Parallel - -A common starting point for distributed training is data parallelism. Data parallelism replicates the model across each worker (GPU), and partitions the dataset across the workers. Different workers process different data partitions in parallel and synchronize their gradients (via all reduce) before model weights are updated. The figure below showcases the flow (forward, backward, and weight update steps) for processing a single example for data parallelism: - -

Source: https://engineering.fb.com/2021/07/15/open-source/fsdp/
PyTorch provides a native API, [DistributedDataParallel](https://pytorch.org/docs/stable/generated/torch.nn.parallel.DistributedDataParallel.html) (DDP), to enable data parallelism, which can be used as a module wrapper as showcased below. Please see the PyTorch Distributed [documentation](https://pytorch.org/docs/stable/distributed.html#) for more details.

```Python
from torchmultimodal.models.flava.model import flava_model_for_pretraining
import torch
import torch.distributed as dist

model = flava_model_for_pretraining().cuda()
# Initialize PyTorch Distributed process groups
# Please see https://pytorch.org/tutorials/intermediate/dist_tuto.html for details
dist.init_process_group(backend="nccl")
# Wrap model in DDP
model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[torch.cuda.current_device()])
```

## Fully Sharded Data Parallel

GPU memory usage of a training application can roughly be broken down into model inputs, intermediate activations (needed for gradient computation), model parameters, gradients, and optimizer states. Scaling a model will typically increase each of these elements. Scaling a model with DDP can eventually result in out-of-memory issues when a single GPU's memory becomes insufficient, since it replicates the parameters, gradients, and optimizer states on all workers.

To reduce this replication and save GPU memory, we can shard the model parameters, gradients, and optimizer states across all workers, with each worker only managing a single shard. This technique was popularized by the [ZeRO-3](https://arxiv.org/abs/1910.02054) approach developed by Microsoft. A PyTorch-native implementation of this approach is available as the [FullyShardedDataParallel](https://pytorch.org/docs/stable/fsdp.html) (FSDP) API, released as a beta feature in PyTorch 1.12. During a module's forward and backward passes, FSDP unshards the model parameters as needed for computation (using all-gather) and reshards them after computation. It synchronizes gradients using the reduce-scatter collective to ensure sharded gradients are globally averaged. The forward and backward pass flow of a model wrapped in FSDP is detailed below:

Source: https://engineering.fb.com/2021/07/15/open-source/fsdp/
To use FSDP, the submodules of a model need to be wrapped with the API to control when specific submodules are sharded or unsharded. FSDP provides an auto-wrapping API (see the [auto_wrap_policy](https://pytorch.org/docs/stable/fsdp.html#torch.distributed.fsdp.FullyShardedDataParallel) argument) that can be used out of the box, as well as several [wrapping policies](https://github.com/pytorch/pytorch/blob/master/torch/distributed/fsdp/wrap.py) and the ability to [write your own policy](https://github.com/pytorch/pytorch/blob/75c0e3a471c19b883feca15fd4ecfabedf746691/torch/distributed/fsdp/fully_sharded_data_parallel.py#L858).

The following example demonstrates wrapping the FLAVA model with FSDP. We specify the auto-wrapping policy as `transformer_auto_wrap_policy`. This will wrap individual transformer layers (`TransformerEncoderLayer`), the image transformer (`ImageTransformer`), the text encoder (`BERTTextEncoder`) and the multimodal encoder (`FLAVATransformerWithoutEmbeddings`) as individual FSDP units. This uses a recursive wrapping approach for efficient memory management. For example, after an individual transformer layer's forward or backward pass is finished, its parameters are discarded, freeing up memory and thereby reducing peak memory usage.

FSDP also provides a number of configurable options to tune the performance of applications. For example, in our use case, we illustrate the use of the new `limit_all_gathers` flag, which prevents all-gathering model parameters too early, thereby alleviating memory pressure on the application. We encourage users to experiment with this flag, which can potentially improve the performance of applications with high active memory usage.

```Python
from functools import partial

import torch
import torch.distributed as dist
from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
from torch.distributed.fsdp.wrap import transformer_auto_wrap_policy
from torchmultimodal.models.flava.model import flava_model_for_pretraining
from torchmultimodal.models.flava.text_encoder import BERTTextEncoder
from torchmultimodal.models.flava.image_encoder import ImageTransformer
from torchmultimodal.models.flava.transformer import FLAVATransformerWithoutEmbeddings
from torchmultimodal.modules.layers.transformer import TransformerEncoderLayer

model = flava_model_for_pretraining().cuda()
dist.init_process_group(backend="nccl")

model = FSDP(
    model,
    device_id=torch.cuda.current_device(),
    auto_wrap_policy=partial(
        transformer_auto_wrap_policy,
        transformer_layer_cls={
            TransformerEncoderLayer,
            ImageTransformer,
            BERTTextEncoder,
            FLAVATransformerWithoutEmbeddings,
        },
    ),
    limit_all_gathers=True,
)
```

## Activation Checkpointing

As discussed above, intermediate activations, model parameters, gradients, and optimizer states contribute to the overall GPU memory usage. FSDP can reduce memory consumption due to the latter three but does not reduce memory consumed by activations. Memory used by activations increases with increases in batch size or the number of hidden layers. Activation checkpointing is a technique to decrease this memory usage by recomputing the activations during the backward pass instead of holding them in memory for a specific checkpointed module. For example, we observed a ~4x reduction in peak active memory after the forward pass by applying activation checkpointing to the 2.7B parameter model.

PyTorch offers a wrapper-based activation checkpointing API.
In particular, `checkpoint_wrapper` allows users to wrap an individual module with checkpointing, and `apply_activation_checkpointing` allows users to specify a policy with which to wrap modules within an overall module with checkpointing. Both these APIs can be applied to most models as they do not require any modifications to the model definition code. However, if more granular control over checkpointed segments, such as checkpointing specific functions within a module, is required, the functional `torch.utils.checkpoint` [API](https://pytorch.org/docs/stable/checkpoint.html) can be leveraged, although this requires modification to the model code. The application of the activation checkpointing wrapper to individual FLAVA transformer layers (denoted by `TransformerEncoderLayer`) is shown below. For a thorough description of activation checkpointing, please see the description in the [PyTorch documentation](https://pytorch.org/docs/stable/checkpoint.html).

```Python
from torchmultimodal.models.flava.model import flava_model_for_pretraining
from torch.distributed.algorithms._checkpoint.checkpoint_wrapper import apply_activation_checkpointing, checkpoint_wrapper, CheckpointImpl
from torchmultimodal.modules.layers.transformer import TransformerEncoderLayer

model = flava_model_for_pretraining()
checkpoint_tformer_layers_policy = lambda submodule: isinstance(submodule, TransformerEncoderLayer)

apply_activation_checkpointing(
    model,
    checkpoint_wrapper_fn=checkpoint_wrapper,
    check_fn=checkpoint_tformer_layers_policy,
)
```

By wrapping FLAVA transformer layers with activation checkpointing and wrapping the overall model with FSDP as demonstrated above, we are able to scale FLAVA to 10B parameters.

## Experiments

We conduct an empirical study of the impact of the different optimizations from the previous section on system performance. For all our experiments, we use a single node with 8 A100 40GB GPUs and run the pretraining for 1000 iterations. All runs also used PyTorch's [automatic mixed precision](https://pytorch.org/docs/stable/amp.html) with the bfloat16 data type. The [TensorFloat32](https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices) format is also enabled to improve matmul performance on the A100. We define throughput as the average number of items (text or image) processed per second (we ignore the first 100 iterations while measuring throughput to account for warmup). We leave training to convergence and its impact on downstream task metrics as an area for future study.

Figure 1 plots the throughput for each model configuration and optimization, both with a local batch size of 8 and then with the maximum batch size possible on 1 node. The absence of a data point for a model variant for an optimization indicates that the model could not be trained on a single node.

Figure 2 plots the maximum possible batch size per worker for each optimization. We observe a few things:

1. Scaling model size: DDP is only able to fit the 350M and 900M models on a node. With FSDP, due to memory savings, we are able to train ~3x bigger models compared to DDP (i.e. the 1.8B and 2.7B variants). Combining activation checkpointing (AC) with FSDP enables training even bigger models, on the order of ~10x compared to DDP (i.e. the 4.8B and 10B variants).
2.
Throughput: - - For smaller model sizes, at a constant batch size of 8, the throughput for DDP is slightly higher than or equal to FSDP, explainable by the additional communication required by FSDP. It is lowest for FSDP and AC combined together. This is because AC re-runs checkpointed forward passes during the backwards pass, trading off additional computation for memory savings. However, in the case of the 2.7B model, FSDP + AC actually has higher throughput compared to FSDP alone. This is because the 2.7B model with FSDP is operating close to the memory limit even at batch size 8 triggering CUDA malloc retries which tend to slow down training. AC helps with reducing the memory pressure and leads to no retries. - - For DDP and FSDP + AC, the throughput increases with an increase in batch size for each model. For FSDP alone, this is true for smaller variants. However, with the 1.8B and 2.7B parameter models, we observe throughput degradation when increasing batch size. A potential reason for this, as noted above also, is that at the memory limit, PyTorch’s CUDA memory management may have to retry cudaMalloc calls and/or run expensive defragmentation steps to find free memory blocks to handle the workload’s memory requirements which can result in training slowdown. - - For larger models that can only be trained with FSDP (1.8B, 2.7B, 4.8B) the setting with highest throughput achieved is with FSDP + AC scaling to the maximum batch size. For 10B, we observe nearly equal throughput for smaller and maximum batch size. This might be counterintuitive as AC results in increased computation and maxing out batch size potentially leads to expensive defragmentation operations due to operating at CUDA memory limit. However, for these large models, the increase in batch size is large enough to mask this overhead. - -

Figure 1: Training throughput for different configurations
3. Batch size: FSDP alone enables slightly higher batch sizes compared to DDP. Using FSDP + AC enables ~3x the batch size compared to DDP for the 350M param model and ~5.5x for the 900M param model. Even for the 10B model, a max batch size of ~20 is possible, which is fairly decent. This essentially enables a larger global batch size using fewer GPUs, which is especially useful for contrastive learning tasks.
Figure 2: Max local batch size possible for different configurations
        - -## Conclusion - -As the world moves towards multimodal foundation models, scaling model parameters and efficient training is becoming an area of focus. The PyTorch ecosystem aims to accelerate innovation in this field by providing different tools to the research community, both for training and scaling multimodal models. With FLAVA, we laid out an example of scaling a model for multimodal understanding. In the future, we plan to add support for other kinds of models like the ones for multimodal generation and demonstrate their scaling factors. We also hope to automate many of these scaling and memory saving techniques (such as sharding and activation checkpointing) to reduce the amount of user experimentation needed to achieve the desired scale and maximum training throughput. - -## References - -- [Introducing TorchMultimodal - a library for accelerating exploration in Multimodal AI](https://pytorch.org/blog/introducing-torchmultimodal/) -- [FLAVA paper](https://deploy-preview-1186--pytorch-dot-org-preview.netlify.app/blog/introducing-torchmultimodal/) -- [Introducing Pytorch FSDP](https://pytorch.org/blog/introducing-pytorch-fully-sharded-data-parallel-api/) \ No newline at end of file diff --git a/_posts/2022-11-22-effective-multi-objective-nueral-architecture.md b/_posts/2022-11-22-effective-multi-objective-nueral-architecture.md deleted file mode 100644 index ab3643a4873b..000000000000 --- a/_posts/2022-11-22-effective-multi-objective-nueral-architecture.md +++ /dev/null @@ -1,123 +0,0 @@ ---- -layout: blog_detail -title: "Efficient Multi-Objective Neural Architecture Search with Ax" -author: David Eriksson, Max Balandat -featured-img: "/assets/images/MOO-NAS-blog-img2-pareto_frontier_plot.png" ---- - -## tl;dr - -Multi-Objective Optimization in Ax enables efficient exploration of tradeoffs (e.g. between model performance and model size or latency) in Neural Architecture Search. This method has been successfully applied at Meta for a variety of products such as On-Device AI. In this post, we provide an [end-to-end](https://pytorch.org/tutorials/intermediate/ax_multiobjective_nas_tutorial.html) tutorial that allows you to try it out yourself. - -## Introduction - -Neural networks continue to grow in both size and complexity. Developing state-of-the-art architectures is often a cumbersome and time-consuming process that requires both domain expertise and large engineering efforts. In an attempt to overcome these challenges, several Neural Architecture Search (NAS) approaches have been proposed to automatically design well-performing architectures without requiring a human in-the-loop. - -Despite being very sample-inefficient, naïve approaches like random search and grid search are still popular for both hyperparameter optimization and NAS (a [study](https://hal.archives-ouvertes.fr/hal-02447823/document) conducted at NeurIPS 2019 and ICLR 2020 found that 80% of NeurIPS papers and 88% of ICLR papers tuned their ML model hyperparameters using manual tuning, random search, or grid search). But as models are often time-consuming to train and may require large amounts of computational resources, minimizing the number of configurations that are evaluated is important. - -[Ax](https://ax.dev/) is a general tool for black-box optimization that allows users to explore large search spaces in a sample-efficient manner using [state-of-the art algorithms such as Bayesian Optimization](http://proceedings.mlr.press/v133/turner21a/turner21a.pdf). 
At Meta, Ax is used in a variety of domains, including hyperparameter tuning, NAS, identifying optimal product settings through large-scale A/B testing, infrastructure optimization, and designing cutting-edge AR/VR hardware. - -In many NAS applications, there is a natural tradeoff between multiple metrics of interest. For instance, when deploying models on-device we may want to maximize model performance (e.g., accuracy), while simultaneously minimizing competing metrics such as power consumption, inference latency, or model size, in order to satisfy deployment constraints. In many cases, we have been able to reduce computational requirements or latency of predictions substantially by accepting a small degradation in model performance (in some cases we were able to both increase accuracy and reduce latency!). Principled methods for exploring such tradeoffs efficiently are key enablers of [Sustainable AI](https://arxiv.org/abs/2111.00364). - -At Meta, we have successfully used [multi-objective Bayesian NAS](https://research.facebook.com/blog/2021/07/optimizing-model-accuracy-and-latency-using-bayesian-multi-objective-neural-architecture-search/) in Ax to explore such tradeoffs. Our methodology is being used routinely for optimizing AR/VR on-device ML models. Beyond NAS applications, we have also developed [MORBO](https://arxiv.org/pdf/2109.10964.pdf) which is a method for high-dimensional multi-objective optimization that can be used to optimize optical systems for augmented reality (AR). - -## Fully automated Multi-Objective NAS with Ax - -Ax’s Scheduler allows running experiments asynchronously in a closed-loop fashion by continuously deploying trials to an external system, polling for results, leveraging the fetched data to generate more trials, and repeating the process until a stopping condition is met. No human intervention or oversight is required. Features of the Scheduler include: - -- Customizability of parallelism, failure tolerance, and many other settings; - -- A large selection of state-of-the-art optimization algorithms; - -- Saving in-progress experiments (to a SQL DB or json) and resuming an experiment from storage; - -- Easy extensibility to new backends for running trial evaluations remotely. - -The following illustration from the [Ax scheduler tutorial](https://ax.dev/tutorials/scheduler.html) summarizes how the scheduler interacts with any external system used to run trial evaluations: - - - -

        - -To run automated NAS with the Scheduler, the main things we need to do are: - -- Define a [Runner](https://github.com/facebook/Ax/blob/main/ax/core/runner.py#L21), which is responsible for sending off a model with a particular architecture to be trained on a platform of our choice (like Kubernetes, or maybe just a Docker image on our local machine). In the tutorial below, we use TorchX for handling deployment of training jobs. - -- Define a [Metric](https://github.com/facebook/Ax/blob/main/ax/core/metric.py#L21), which is responsible for fetching the objective metrics (such as accuracy, model size, latency) from the training job. In our tutorial, we use Tensorboard to log data, and so can use the Tensorboard metrics that come bundled with Ax. - -## Tutorial - -In our tutorial we show how to use Ax to run multi-objective NAS for a simple neural network model on the popular MNIST dataset. While the underlying methodology can be used for more complicated models and larger datasets, we opt for a tutorial that is easily runnable end-to-end on a laptop in less than an hour. In our example, we will tune the widths of two hidden layers, the learning rate, the dropout probability, the batch size, and the number of training epochs. The goal is to trade off performance (accuracy on the validation set) and model size (the number of model parameters) using [multi-objective Bayesian optimization](https://proceedings.neurips.cc/paper/2021/file/11704817e347269b7254e744b5e22dac-Paper.pdf). - -The tutorial makes use of the following PyTorch libraries: - -- [PyTorch Lightning](https://github.com/PyTorchLightning/pytorch-lightning) (specifying the model and training loop) - -- [TorchX](https://github.com/pytorch/torchx) (for running training jobs remotely / asynchronously) - -- [BoTorch](https://github.com/pytorch/botorch) (the Bayesian optimization library that powers Ax’s algorithms) - -The complete runnable example is available as a **[PyTorch Tutorial](https://pytorch.org/tutorials/intermediate/ax_multiobjective_nas_tutorial.html)**. - -### Results - -The final results from the NAS optimization performed in the tutorial can be seen in the tradeoff plot below. Here, each point corresponds to the result of a trial, with the color representing its iteration number, and the star indicating the reference point defined by the thresholds we imposed on the objectives. We see that our method was able to successfully explore the trade-offs between validation accuracy and number of parameters and found both large models with high validation accuracy as well as small models with lower validation accuracy. Depending on the performance requirements and model size constraints, the decision maker can now choose which model to use or analyze further. - -
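To make the multi-objective setup concrete, here is a minimal, self-contained sketch using Ax's Service API on a toy problem. The parameter names, bounds, objective thresholds, and the stand-in evaluation function are purely illustrative and are not taken from the tutorial, which instead drives real training jobs through the Scheduler and TorchX:

```python
from ax.service.ax_client import AxClient, ObjectiveProperties

ax_client = AxClient()
ax_client.create_experiment(
    name="toy_multi_objective_nas",
    parameters=[
        {"name": "hidden_size", "type": "range", "bounds": [16, 128]},
        {"name": "learning_rate", "type": "range", "bounds": [1e-4, 1e-1], "log_scale": True},
    ],
    objectives={
        "val_acc": ObjectiveProperties(minimize=False, threshold=0.7),
        "num_params": ObjectiveProperties(minimize=True, threshold=100_000.0),
    },
)

def evaluate(params):
    # Stand-in for training a model and measuring its accuracy and size:
    # bigger networks are "more accurate" but have more parameters.
    num_params = float(params["hidden_size"] * 1_000)
    val_acc = 0.70 + 2.0e-6 * num_params - abs(params["learning_rate"] - 1e-2)
    return {"val_acc": (val_acc, 0.0), "num_params": (num_params, 0.0)}

for _ in range(12):
    params, trial_index = ax_client.get_next_trial()
    ax_client.complete_trial(trial_index=trial_index, raw_data=evaluate(params))

# Inspect the configurations on the accuracy / size Pareto frontier.
print(ax_client.get_pareto_optimal_parameters())
```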


        - -### Visualizations - -Ax provides a number of visualizations that make it possible to analyze and understand the results of an experiment. Here, we will focus on the performance of the Gaussian process models that model the unknown objectives, which are used to help us discover promising configurations faster. Ax makes it easy to better understand how accurate these models are and how they perform on unseen data via leave-one-out cross-validation. In the figures below, we see that the model fits look quite good - predictions are close to the actual outcomes, and predictive 95% confidence intervals cover the actual outcomes well. Additionally, we observe that the model size `(num_params)` metric is much easier to model than the validation accuracy `(val_acc)` metric. - - - - - -
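If you want to reproduce this kind of diagnostic yourself, a minimal sketch using Ax's cross-validation utilities looks roughly like the following (here `model` is assumed to be the fitted model bridge from your experiment, and exact module paths may vary across Ax versions):

```python
from ax.modelbridge.cross_validation import cross_validate
from ax.plot.diagnostic import interact_cross_validation
from ax.utils.notebook.plotting import render

# `model` is assumed to be a fitted Ax ModelBridge (e.g., the Gaussian process
# model used by the generation strategy once several trials have completed).
cv_results = cross_validate(model=model)          # leave-one-out cross-validation
render(interact_cross_validation(cv_results))     # predicted vs. observed outcomes with 95% CIs
```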
        - -## Takeaways - -- We showed how to run a fully automated multi-objective Neural Architecture Search using Ax. - -- Using the Ax Scheduler, we were able to run the optimization automatically in a fully asynchronous fashion - this can be done locally (as done in the tutorial) or by deploying trials remotely to a cluster (simply by changing the TorchX scheduler configuration). - -- The state-of-the-art multi-objective Bayesian optimization algorithms available in Ax allowed us to efficiently explore the tradeoffs between validation accuracy and model size. - -## Advanced Functionality - -Ax has a number of other advanced capabilities that we did not discuss in our tutorial. Among these are the following: - -### Early Stopping - -When evaluating a new candidate configuration, partial learning curves are typically available while the NN training job is running. We can use the information contained in the partial curves to identify under-performing trials to stop early in order to free up computational resources for more promising candidates. While not demonstrated in the above tutorial, Ax supports early stopping out-of-the-box. - -### High-dimensional search spaces - -In our tutorial, we used Bayesian optimization with a standard Gaussian process in order to keep the runtime low. However, these models typically scale to only about 10-20 tunable parameters. Our new SAASBO method ([paper](https://proceedings.mlr.press/v161/eriksson21a/eriksson21a.pdf), [Ax tutorial](https://ax.dev/tutorials/saasbo.html), [BoTorch tutorial](https://botorch.org/tutorials/saasbo)) is very sample-efficient and enables tuning hundreds of parameters. SAASBO can easily be enabled by passing `use_saasbo=True` to `choose_generation_strategy`. - -## Acknowledgements - -We thank the TorchX team (in particular Kiuk Chung and Tristan Rice) for their help with integrating TorchX with Ax, and the Adaptive Experimentation team @ Meta for their contributions to Ax and BoTorch. - -## References - -[D. Eriksson, P. Chuang, S. Daulton, M. Balandat. Optimizing model accuracy and latency using Bayesian multi-objective neural architecture search. Meta Research blog, July 2021.](https://research.facebook.com/blog/2021/07/optimizing-model-accuracy-and-latency-using-bayesian-multi-objective-neural-architecture-search/) diff --git a/_posts/2022-11-28-optimizing-production-pytorch-performance-with-graph-transformations.md b/_posts/2022-11-28-optimizing-production-pytorch-performance-with-graph-transformations.md deleted file mode 100644 index 93fda1037bd4..000000000000 --- a/_posts/2022-11-28-optimizing-production-pytorch-performance-with-graph-transformations.md +++ /dev/null @@ -1,198 +0,0 @@ ---- -layout: blog_detail -title: "Optimizing Production PyTorch Models’ Performance with Graph Transformations" -author: Jade Nie, CK Luk, Xiaodong Wang, Jackie (Jiaqi) Xu ---- - -## 1. Introduction - -PyTorch supports two execution modes [1]: eager mode and graph mode. In eager mode, operators in a model are immediately executed as they are encountered. In contrast, in graph mode, operators are first synthesized into a graph, which will then be compiled and executed as a whole. Eager mode is easier to use, more suitable for ML researchers, and hence is the default mode of execution. On the other hand, graph mode typically delivers higher performance and hence is heavily used in production. 
- -Specifically, graph mode enables operator fusion [2], wherein one operator is merged with another to reduce/localize memory reads as well as total kernel launch overhead. Fusion can be horizontal—taking a single operation (e.g., BatchNorm) that is independently applied to many operands and merging those operands into an array; and vertical—merging a kernel with another kernel that consumes the output of the first kernel (e.g., Convolution followed by ReLU). - -Torch.FX [3, 4] (abbreviated as FX) is a publicly available toolkit as part of the PyTorch package that supports graph mode execution. In particular, it (1) captures the graph from a PyTorch program and (2) allows developers to write transformations on the captured graph. It is used inside Meta to optimize the training throughput of production models. By introducing a number of FX-based optimizations developed at Meta, we demonstrate the approach of using graph transformation to optimize PyTorch’s performance for production. - -## 2. Background - -Embedding tables are ubiquitous in recommendation systems. Section 3 will discuss three FX transformations that optimize accesses to embedding tables. In this section, we provide some background on FX (Section 2.1) and embedding tables (Section 2.2). - -### 2.1 FX - -Figure 1 is a simple example adopted from [3] which illustrates using FX to transform a PyTorch program. It contains three steps: (1) capturing the graph from a program, (2) modifying the graph (in this example, all uses of RELU are replaced by GELU), and (3) generating a new program from the modified graph. - -
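As a concrete, simplified illustration of these three steps (not the exact code from [3]), the sketch below traces a small module, replaces ReLU with GELU in the captured graph, and regenerates the program:

```python
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.fx import symbolic_trace

class M(nn.Module):
    def forward(self, x):
        return torch.relu(x) + 1.0

# Step 1: capture the graph from the program.
traced = symbolic_trace(M())

# Step 2: modify the graph, replacing every call to torch.relu with F.gelu.
for node in traced.graph.nodes:
    if node.op == "call_function" and node.target == torch.relu:
        node.target = F.gelu
traced.graph.lint()

# Step 3: generate a new program from the modified graph.
traced.recompile()
print(traced.code)              # the regenerated forward() now calls gelu
print(traced(torch.randn(4)))   # and it runs
```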


        - -**Figure 1: A FX example which replaces all uses of RELU by GELU in a PyTorch module.** - -The FX API [4] provides many more functionalities for inspecting and transforming PyTorch program graphs. - -### 2.2 Embedding Tables - -


        - -**Figure 2: Illustration of an embedding table for a sparse feature with batch size = 1** - -In a recommendation system, sparse features (e.g., User ID, Story ID) are represented by embedding tables. An embedding table E is an HxD matrix, where H is the hash size, D is the embedding dimension. Each row of E is a vector of floats. Feature hashing [5] is used to map a sparse feature to a list of indices to E, say [S1,S2, …, Sk], where 0<=Si<H. Its output value is computed as f(E[S1], E[S2], …, E[Sk]), where E[Si] is the vector at row Si, and f is called the pooling function, which is typically one of the following functions: sum, average, maximum. See Figure 2 for an illustration. - -To fully utilize the GPU, sparse features are usually processed in a batch. Each entity in a batch has its own list of indices. If a batch has B entities, a naive representation has B lists of indices. A more compact representation is to combine the B lists of indices into a single list of indices and add a list of the lengths of indices (one length for each entity in the batch). For example, if a batch has 3 entities whose lists of indices are as follows: - -- Entity 1: indices = [10, 20] -- Entity 2: indices = [5, 9, 77, 81] -- Entity 3: indices = [15, 20, 45] - -Then the indices and lengths for the entire batch will be: - -- Indices = [10, 20, 5, 9, 77, 81, 15, 20, 45] -- Lengths = [2, 4, 3] - -And the output of the embedding table lookup for the whole batch is a BxD matrix. - -## 3. Three FX Transformations - -We have developed three FX transformations that accelerate accesses to embedding tables. Section 3.1 discusses a transformation that combines multiple small input tensors into a single big tensor; Section 3.2 a transformation that fuses multiple, parallel compute chains into a single compute chain; and Section 3.3 a transformation that overlaps communication with computation. - -### 3.1 Combining Input Sparse Features - -Recall that an input sparse feature in a batch is represented by two lists: a list of indices and a list of B lengths, where B is the batch size. In PyTorch, these two lists are implemented as two tensors. When a PyTorch model is run on a GPU, embedding tables are commonly stored in the GPU memory (which is closer to the GPU and has much higher read/write bandwidth than the CPU memory). To use an input sparse feature, its two tensors need to be first copied from CPU to GPU. Nevertheless, per host-to-device memory copying requires a kernel launch, which is relatively expensive compared to the actual data transfer time. If a model uses many input sparse features, this copying could become a performance bottleneck (e.g., 1000 input sparse features would require copying 2000 tensors from host to device). - -An optimization that reduces the number of host-to-device memcpy is to combine multiple input sparse features before sending them to the device. For instance, given the following three input features: - -- Feature_A: indices = [106, 211, 7], lengths = [2, 1] -- Feature_B: indices = [52, 498, 616, 870, 1013], lengths = [3, 2] -- Feature_C: indices = [2011, 19, 351, 790], lengths = [1, 3] - -The combined form is: - -- Features_A_B_C: indices = [106, 211, 7, 52, 498, 616, 870, 1013, 2011, 19, 351, 790], lengths = [2, 1, 3, 2, 1, 3] - -So, instead of copying 3x2=6 tensors from host to device, we only need to copy 2 tensors. 
- -Figure 3(b) describes an implementation of this optimization, which has two components: - -- On the CPU side: The input pipeline is modified to combine all the indices of sparse features into a single tensor and similarly all the lengths into another tensor. Then the two tensors are copied to the GPU. -- On the GPU side: Using FX, we insert a Permute_and_Split op into the model graph to recover the indices and lengths tensors of individual features from the combined tensors, and route them to the corresponding nodes downstream. - -
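The core idea can be sketched with plain tensor ops; the helper names below (`combine_features`, `split_features`) are illustrative stand-ins for the input-pipeline change and the FX-inserted Permute_and_Split op:

```python
import torch

def combine_features(features):
    """Concatenate the (indices, lengths) tensors of many sparse features into a
    single pair of tensors, so only 2 host-to-device copies are needed."""
    indices = torch.cat([idx for idx, _ in features])
    lengths = torch.cat([lens for _, lens in features])
    return indices, lengths

def split_features(indices, lengths, lengths_per_feature):
    """Recover per-feature (indices, lengths) tensors on the device.
    `lengths_per_feature` is the number of length entries (the batch size B) per feature."""
    lengths_split = torch.split(lengths, lengths_per_feature)
    index_counts = [int(l.sum()) for l in lengths_split]
    indices_split = torch.split(indices, index_counts)
    return list(zip(indices_split, lengths_split))

# Feature_A, Feature_B, Feature_C from the example above (batch size B = 2).
feats = [
    (torch.tensor([106, 211, 7]), torch.tensor([2, 1])),
    (torch.tensor([52, 498, 616, 870, 1013]), torch.tensor([3, 2])),
    (torch.tensor([2011, 19, 351, 790]), torch.tensor([1, 3])),
]
indices, lengths = combine_features(feats)           # 2 tensors instead of 6
# indices, lengths = indices.cuda(), lengths.cuda()  # only 2 host-to-device copies
recovered = split_features(indices, lengths, [2, 2, 2])
```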


        - -(a). **Without the optimization** - -


        - -(b). **With the optimization** - -**Figure 3: Combining input sparse features** - -### 3.2 Horizontal fusion of computation chains started with accesses to embedding tables - -In a production model, it is fairly common to have 10s of embedding tables residing on each GPU. For performance reasons, lookups to these tables are grouped together so that their outputs are concatenated in a single big tensor (see the red part in Figure 4(a)). To apply computations to individual feature outputs, a Split op is used to divide the big tensors into N smaller tensors (where N is the number of features) and then the desired computations are applied to each tensor. This is shown in Figure 4(a), where the computation applied to each feature output O is Tanh(LayerNorm(O)). All the computation results are concatenated back to a big tensor, which is then passed to downstream ops (Op1 in Figure 4(a)). - -The main runtime cost here is the GPU kernel launch overhead. For instance, the number of GPU kernel launches in Figure 4(a) is 2\*N + 3 (each oval in the figure is a GPU kernel). This could become a performance issue because execution times of LayerNorm and Tanh on the GPU are short compared to their kernel launch times. In addition, the Split op may create an extra copy of the embedding output tensor, consuming additional GPU memory. - -We use FX to implement an optimization called horizontal fusion which dramatically reduces the number of GPU kernel launches (in this example, the optimized number of GPU kernel launches is 5, see Figure 4(b)). Instead of doing an explicit Split, we use the Add_middle_dim op to reshape the 2D embedding tensor of shape (B, NxD) to a 3D tensor of shape (B, N, D). Then a single LayerNorm is applied to the last dimension of it. Then a single Tanh is applied to the result of the LayerNorm. At the end, we use the Remove_middle_dim op to reshape the Tanh’s result back to a 2D tensor. In addition, since Add_middle_dim and Remove_middle_dim only reshape the tensor without creating an extra copy, the amount of GPU memory consumption could be reduced as well. - -
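The effect of this fusion can be sketched directly in PyTorch; here `view` plays the role of the Add_middle_dim and Remove_middle_dim ops, and the sizes are arbitrary:

```python
import torch
import torch.nn as nn

B, N, D = 32, 16, 64               # batch size, number of features, embedding dimension
pooled = torch.randn(B, N * D)     # concatenated embedding lookups, shape (B, N*D)
layer_norm = nn.LayerNorm(D)

# Unfused: split into N tensors and launch LayerNorm + Tanh per feature (2*N kernels plus a concat).
outs = [torch.tanh(layer_norm(t)) for t in torch.split(pooled, D, dim=1)]
unfused = torch.cat(outs, dim=1)

# Fused: reshape to (B, N, D), apply a single LayerNorm over the last dim and a single Tanh,
# then reshape back. The reshapes do not create an extra copy of the tensor.
fused = torch.tanh(layer_norm(pooled.view(B, N, D))).view(B, N * D)

print(torch.allclose(unfused, fused, atol=1e-6))  # True, up to numerics
```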


        - -(a). **Without the optimization** - -


        - -(b). **With the optimization** - -**Figure 4: Horizontal fusion** - -### 3.3 Overlapping Computation with Communication - -Training of a production recommendation model is typically done on a distributed GPU system. Since the capacity of the device memory per GPU is not big enough to hold all the embedding tables in the model, they need to be distributed among the GPUs. - -Within a training step, a GPU needs to read/write feature values from/to the embedding tables on the other GPUs. This is known as all-to-all communication [6] and can be a major performance bottleneck. - -We use FX to implement a transformation that can overlap computation with all-to-all communication. Figure 5(a) shows the example of a model graph which has embedding table accesses (EmbeddingAllToAll) and other ops. Without any optimization, they are sequentially executed on a GPU stream, as shown in Figure 5(b). Using FX, we break EmbeddingAllToAll into EmbeddingAllToAll_Request and EmbeddingAllToAll_Wait, and schedule independent ops in between them. - -
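Outside of FX, the same request/wait pattern can be expressed with PyTorch's asynchronous collectives. A minimal sketch (it assumes a process group is already initialized, equal split sizes, and generic tensors in place of real embedding lookups):

```python
import torch
import torch.distributed as dist

def forward_with_overlap(local_shard, independent_input, dense_net):
    """Issue the all-to-all (the 'Request'), run ops that do not depend on its
    result, then wait for completion (the 'Wait') before using the output."""
    output = torch.empty_like(local_shard)
    work = dist.all_to_all_single(output, local_shard, async_op=True)  # Request

    dense_out = dense_net(independent_input)  # independent compute overlaps with communication

    work.wait()                               # Wait: `output` is now valid
    return dense_out, output
```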


        - -**(a) Model graph** - -


        - -**(b) Original execution order** - -


        - -**(c)Optimized execution order** - -**Figure 5: Overlapping Computation with Communication** - -### 3.4 Summary - -Table 1 summarizes the optimizations discussed in this section and the corresponding performance bottlenecks addressed. - - - - - - - - - - - - - - - - - - -
| Optimization | Performance Bottleneck Addressed |
| --- | --- |
| Combining Input Sparse Features | Host-to-device memory copy |
| Horizontal fusion | GPU kernel launch overhead |
| Overlapping Computation with Communication | Embedding all-to-all access time |
        - -**Table 1: Summary of the optimizations and the performance bottlenecks addressed** - -We have also developed other FX transformations which are not discussed in this section due to space limitations. - -To discover which models would benefit from these transformations, we analyzed the performance data collected by MAIProf [7] from the models that run at Meta’s data centers. Altogether, these transformations provide up to 2-3x of speedups compared to eager mode on a set of production models. - -## 4. Concluding Remarks - -The graph mode in PyTorch is preferred over the eager mode for production use for performance reasons. FX is a powerful tool for capturing and optimizing the graph of a PyTorch program. We demonstrate three FX transformations that are used to optimize production recommendation models inside Meta. We hope that this blog can motivate other PyTorch model developers to use graph transformations to boost their models’ performance. - -References - -[1] [End-to-end Machine Learning Framework](https://pytorch.org/features/) - -[2] [DNNFusion: Accelerating Deep Neural Networks Execution with Advanced Operator Fusion](https://arxiv.org/abs/2108.13342) - -[3] [Torch.FX: Practical Program Capture and Transformation for Deep Learning In Python](https://arxiv.org/pdf/2112.08429.pdf), MLSys 2022. - -[4] [Torch.fx—PyTorch 1.12 documentation](https://pytorch.org/docs/stable/fx.html) - -[5] [Feature Hashing for Large Scale Multitask Learning](https://alex.smola.org/papers/2009/Weinbergeretal09.pdf) - -[6] [NVIDIA Collective Communication Library Documentation](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/) - -[7] [Performance Debugging of Production PyTorch Models at Meta](https://pytorch.org/blog/performance-debugging-of-production-pytorch-models-at-meta/) diff --git a/_posts/2022-12-02-Accelerating-Hugging-Face-and-TIMM-models.md b/_posts/2022-12-02-Accelerating-Hugging-Face-and-TIMM-models.md deleted file mode 100644 index e7f78b39d8c2..000000000000 --- a/_posts/2022-12-02-Accelerating-Hugging-Face-and-TIMM-models.md +++ /dev/null @@ -1,184 +0,0 @@ ---- -layout: blog_detail -title: "Accelerating Hugging Face and TIMM models with PyTorch 2.0" -author: Mark Saroufim -featured-img: "assets/images/pytorch-2.0-feature-img.png" ---- - -`torch.compile()` makes it easy to experiment with different compiler backends to make PyTorch code faster with a single line decorator `torch.compile()`. It works either directly over an nn.Module as a drop-in replacement for `torch.jit.script()` but without requiring you to make any source code changes. We expect this one line code change to provide you with between 30%-2x training time speedups on the vast majority of models that you’re already running. - -```python - -opt_module = torch.compile(module) - -``` - -torch.compile supports arbitrary PyTorch code, control flow, mutation and comes with experimental support for dynamic shapes. We’re so excited about this development that we call it PyTorch 2.0. - -What makes this announcement different for us is we’ve already benchmarked some of the most popular open source PyTorch models and gotten substantial speedups ranging from 30% to 2x [https://github.com/pytorch/torchdynamo/issues/681](https://github.com/pytorch/torchdynamo/issues/681). 
There are no tricks here, we’ve pip installed popular libraries like [https://github.com/huggingface/transformers](https://github.com/huggingface/transformers), [https://github.com/huggingface/accelerate](https://github.com/huggingface/accelerate) and [https://github.com/rwightman/pytorch-image-models](https://github.com/rwightman/pytorch-image-models) and then ran torch.compile() on them, and that’s it.

It’s rare to get both performance and convenience, but this is why the core team finds PyTorch 2.0 so exciting. The Hugging Face team is also excited, in their words:

Ross Wightman, the primary maintainer of TIMM: “PT 2.0 works out of the box with majority of timm models for inference and train workloads and no code changes”

Sylvain Gugger, the primary maintainer of transformers and accelerate: "With just one line of code to add, PyTorch 2.0 gives a speedup between 1.5x and 2.x in training Transformers models. This is the most exciting thing since mixed precision training was introduced!"

This tutorial will show you exactly how to replicate those speedups so you can be as excited about PyTorch 2.0 as we are.

## Requirements and Setup

For GPU (newer generation GPUs will see drastically better performance):

```
pip3 install numpy --pre torch --force-reinstall --extra-index-url https://download.pytorch.org/whl/nightly/cu117
```

For CPU:

```
pip3 install --pre torch --extra-index-url https://download.pytorch.org/whl/nightly/cpu
```

Optional: Verify Installation

```
git clone https://github.com/pytorch/pytorch
cd pytorch/tools/dynamo
python verify_dynamo.py
```

Optional: Docker installation

We also provide all the required dependencies in the PyTorch nightly binaries, which you can download with

```
docker pull ghcr.io/pytorch/pytorch-nightly
```

And for ad hoc experiments just make sure that your container has access to all your GPUs:

```
docker run --gpus all -it ghcr.io/pytorch/pytorch-nightly:latest /bin/bash
```

## Getting started

### a toy example

Let’s start with a simple example and make things more complicated step by step. Please note that you’re likely to see more significant speedups the newer your GPU is.

```python
import torch

def fn(x, y):
    a = torch.sin(x).cuda()
    b = torch.sin(y).cuda()
    return a + b

new_fn = torch.compile(fn, backend="inductor")
input_tensor = torch.randn(10000).to(device="cuda:0")
a = new_fn(input_tensor, input_tensor)
```

This example won’t actually run faster, but it’s educational.

The example features pointwise ops like `torch.sin()` and `torch.cos()`, meaning they operate element by element on a vector. A more famous pointwise op you might actually want to use would be something like `torch.relu()`.

Pointwise ops in eager mode are suboptimal because each one would need to read a tensor from memory, make some changes, and then write back those changes.

The single most important optimization that PyTorch 2.0 does for you is fusion.

So back to our example: we can turn 2 reads and 2 writes into 1 read and 1 write, which is crucial especially for newer GPUs where the bottleneck is memory bandwidth (how quickly you can send data to a GPU) rather than compute (how quickly your GPU can crunch floating point operations).

The second most important optimization that PyTorch 2.0 does for you is CUDA graphs.

CUDA graphs help eliminate the overhead from launching individual kernels from a Python program.
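In the released torch.compile API, CUDA graphs are exposed through the `mode` argument; a small sketch (the exact behavior of the modes may differ in early nightlies):

```python
import torch

def fn(x, y):
    return torch.sin(x) + torch.sin(y)

# "reduce-overhead" asks the compiler to use CUDA graphs where possible,
# which helps most when kernel launch overhead dominates (e.g., small batches).
compiled_fn = torch.compile(fn, mode="reduce-overhead")

if torch.cuda.is_available():
    x = torch.randn(10000, device="cuda")
    out = compiled_fn(x, x)
```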
- -torch.compile() supports many different backends but one that we’re particularly excited about is Inductor which generates Triton kernels [https://github.com/openai/triton](https://github.com/openai/triton) which are written in Python yet outperform the vast majority of handwritten CUDA kernels. Suppose our example above was called trig.py we can actually inspect the code generated triton kernels by running. - -``` -TORCH_COMPILE_DEBUG=1 python trig.py -``` - -```python - -@pointwise(size_hints=[16384], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]}) -@triton.jit -def kernel(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr): - xnumel = 10000 - xoffset = tl.program_id(0) * XBLOCK - xindex = xoffset + tl.reshape(tl.arange(0, XBLOCK), [XBLOCK]) - xmask = xindex < xnumel - x0 = xindex - tmp0 = tl.load(in_ptr0 + (x0), xmask) - tmp1 = tl.sin(tmp0) - tmp2 = tl.sin(tmp1) - tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp2, xmask) - -``` - -And you can verify that fusing the two `sins` did actually occur because the two `sin` operations occur within a single Triton kernel and the temporary variables are held in registers with very fast access. - -### a real model - -As a next step let’s try a real model like resnet50 from the PyTorch hub. - -```python -import torch -model = torch.hub.load('pytorch/vision:v0.10.0', 'resnet18', pretrained=True) -opt_model = torch.compile(model, backend="inductor") -model(torch.randn(1,3,64,64)) - -``` - -If you actually run you may be surprised that the first run is slow and that’s because the model is being compiled. Subsequent runs will be faster so it's common practice to warm up your model before you start benchmarking it. - -You may have noticed how we also passed in the name of a compiler explicitly here with “inductor” but it’s not the only available backend, you can run in a REPL `torch._dynamo.list_backends()` to see the full list of available backends. For fun you should try out `aot_cudagraphs` or `nvfuser`. - -### Hugging Face models - -Let’s do something a bit more interesting now, our community frequently -uses pretrained models from transformers [https://github.com/huggingface/transformers](https://github.com/huggingface/transformers) or TIMM [https://github.com/rwightman/pytorch-image-models](https://github.com/rwightman/pytorch-image-models) and one of our design goals for PyTorch 2.0 was that any new compiler stack needs to work out of the box with the vast majority of models people actually run. - -So we’re going to directly download a pretrained model from the Hugging Face hub and optimize it - -```python - -import torch -from transformers import BertTokenizer, BertModel -# Copy pasted from here https://huggingface.co/bert-base-uncased -tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') -model = BertModel.from_pretrained("bert-base-uncased").to(device="cuda:0") -model = torch.compile(model) # This is the only line of code that we changed -text = "Replace me by any text you'd like." -encoded_input = tokenizer(text, return_tensors='pt').to(device="cuda:0") -output = model(**encoded_input) - -``` - -If you remove the `to(device="cuda:0")` from the model and `encoded_input` then PyTorch 2.0 will generate C++ kernels that will be optimized for running on your CPU. 
You can inspect both Triton or C++ kernels for BERT, they’re obviously more complex than the trigonometry example we had above but you can similarly skim it and understand if you understand PyTorch. - -The same code also works just fine if used with [https://github.com/huggingface/accelerate](https://github.com/huggingface/accelerate) and DDP - -Similarly let’s try out a TIMM example - -```python -import timm -import torch -model = timm.create_model('resnext101_32x8d', pretrained=True, num_classes=2) -opt_model = torch.compile(model, backend="inductor") -opt_model(torch.randn(64,3,7,7)) -``` - -Our goal with PyTorch was to build a breadth-first compiler that would speed up the vast majority of actual models people run in open source. The Hugging Face Hub ended up being an extremely valuable benchmarking tool for us, ensuring that any optimization we work on actually helps accelerate models people want to run. - -So please try out PyTorch 2.0, enjoy the free perf and if you’re not seeing it then please open an issue and we will make sure your model is supported [https://github.com/pytorch/torchdynamo/issues](https://github.com/pytorch/torchdynamo/issues) - -After all, we can’t claim we’re created a breadth-first unless YOUR models actually run faster. diff --git a/_posts/2022-12-02-getting-started-with-pytorch-2.0.md b/_posts/2022-12-02-getting-started-with-pytorch-2.0.md deleted file mode 100644 index 5cb11d223e1d..000000000000 --- a/_posts/2022-12-02-getting-started-with-pytorch-2.0.md +++ /dev/null @@ -1,16 +0,0 @@ ---- -layout: blog_detail -title: "Get Started with PyTorch 2.0 Summary and Overview" -author: Team PyTorch -featured-img: "assets/images/Pytorch_2_0_Animation_AdobeExpress.gif" ---- - -Introducing PyTorch 2.0, our first steps toward the next generation 2-series release of PyTorch. Over the last few years we have innovated and iterated from PyTorch 1.0 to the most recent 1.13 and moved to the newly formed PyTorch Foundation, part of the Linux Foundation. - -To complement the PyTorch 2.0 announcement and conference, we have also posted a comprehensive introduction and technical overview within the Get Started menu at [https://pytorch.org/get-started/pytorch-2.0](https://pytorch.org/get-started/pytorch-2.0). - -We also wanted to ensure you had all the information to quickly leverage PyTorch 2.0 in your models so we added the technical requirements, tutorial, user experience, Hugging Face benchmarks and FAQs to get you started today! - -Finally we are launching a new “Ask the Engineers: 2.0 Live Q&A” series that allows you to go deeper on a range of topics with PyTorch subject matter experts. We hope this content is helpful for the entire community and level of users/contributors. 
- -[https://pytorch.org/get-started/pytorch-2.0](https://pytorch.org/get-started/pytorch-2.0) diff --git a/_posts/2022-12-15-scaling-pytorch-fsdp-for-training-foundation-models-on-ibm-cloud.md b/_posts/2022-12-15-scaling-pytorch-fsdp-for-training-foundation-models-on-ibm-cloud.md deleted file mode 100644 index a9bd24a96eb5..000000000000 --- a/_posts/2022-12-15-scaling-pytorch-fsdp-for-training-foundation-models-on-ibm-cloud.md +++ /dev/null @@ -1,89 +0,0 @@ ---- -layout: blog_detail -title: "Scaling PyTorch FSDP for Training Foundation Models on IBM Cloud" -author: Linsong Chu, Less Wright, Hamid Shojanazeri, Sophia Wen, Raghu Ganti, Geeta Chauhan -featured-img: "/assets/images/scaling-pytorch-fsdp-image1-IBM_scaling_FSDP_visual_new.png" ---- - -Large model training using a cloud native approach is of growing interest for many enterprises given the emergence and success of [foundation models](https://research.ibm.com/blog/what-are-foundation-models). Some AI practitioners may assume that the only way they can achieve high GPU utilization for distributed training jobs is to run them on HPC systems, such as those inter-connected with Infiniband and may not consider Ethernet connected systems. We demonstrate how the latest distributed training technique, Fully Sharded Data Parallel (FSDP) from PyTorch, successfully scales to models of size 10B+ parameters using commodity Ethernet networking in IBM Cloud. - -## PyTorch FSDP Scaling - -As models get larger, the standard techniques for data parallel training work only if the GPU can hold a full replica of the model, along with its training state (optimizer, activations, etc.). However, GPU memory increases have not kept up with the model size increases and new techniques for training such models have emerged (e.g., Fully Sharded Data Parallel, [DeepSpeed](https://www.deepspeed.ai/)), which allow us to efficiently distribute the model and data over multiple GPUs during training. In this blog post, we demonstrate a path to achieve remarkable scaling of model training to 64 nodes (512 GPUs) using PyTorch native FSDP APIs as we increase model sizes to 11B. - -### What is Fully Sharded Data Parallel? - -FSDP extends the distributed data parallel training (DDP) approach by sharding model parameters, gradient and optimizer states into K FSDP units, determined by using a wrapping policy. FSDP achieves large model training efficiency in terms of resources and performance by significantly reducing the memory footprint on each GPU and overlapping computation and communication. - -Resource efficiency is achieved with memory footprint reduction by having all GPUs own a portion of each FSDP unit. To process a given FSDP unit, all GPUs share their locally owned portion via all_gather communication calls. - -Performance efficiency is accomplished by overlapping all_gather communication calls for upcoming FSDP units with computation of the current FSDP unit. Once the current FSDP unit has been processed, the non-locally owned parameters are dropped, freeing memory for the upcoming FSDP units. This process achieves training efficiency by the overlap of computation and communication, while also reducing the peak memory needed by each GPU. - -In what follows, we demonstrate how FSDP allows us to keep hundreds of GPUs highly utilized throughout a distributed training job, while running over standard Ethernet networking (system description towards the end of the blog). 
We chose the T5 architecture for our experiments and leveraged the code from the [FSDP workshop](https://github.com/pytorch/workshops/tree/master/FSDP_Workshop). In each of our experiments, we start with a single node experiment to create a baseline and report the metric seconds/iteration normalized by the batch size as well as compute the teraflops based on the [Megatron-LM paper](https://cs.stanford.edu/~matei/papers/2021/sc_megatron_lm.pdf) (see Appendix for details of teraflop computation for T5). Our experiments aim to maximize the batch size (while avoiding cudaMalloc retries) to take full advantage of overlap in computation and communications, as discussed below. Scaling is defined as the ratio of the seconds/iteration normalized by batch size for N nodes versus a single node, representing how well we can utilize the additional GPUs as more nodes are added. - -### Experimental Results - -Our first set of experiments using the T5-3B configuration (mixed precision with BF16, activation checkpointing, and transformer wrapping policy) demonstrated scaling efficiency of 95% as we increased the number of GPUs from 8 to 512 (1 to 64 nodes, respectively). We achieved these results without any modifications to the existing FSDP APIs. We observed that, for this scale, over Ethernet based network, there is sufficient bandwidth to enable continuous overlap of communication and computation. - -However, when we increased the T5 model size to 11B, the scaling efficiency declined substantially to 20%. The PyTorch profiler shows that overlap of communication and computation was very limited. Further investigation into the network bandwidth usage revealed that the poor overlap is being caused by latency in the communication of individual packets and not the bandwidth required (in fact, our peak bandwidth utilization is 1/4th of that available). This led us to hypothesize that if we can increase the compute time by increasing the batch size, we can better overlap communication and computation. However, given we are already at maximum GPU memory allocation, we must identify opportunities to rebalance the memory allocation to allow for increase in batch size. We identified that the model state was being allocated a lot more memory than was needed. The primary function of these reservations is to have pre-reserved memory ready to aggressively send/receive tensors during the communication periods and too few buffers can result in increased wait times, whereas too many buffers result in smaller batch sizes. - -To achieve better efficiency, the PyTorch distributed team introduced a new control knob, the rate_limiter which controls how much memory is allocated for send/receive of tensors, alleviating the memory pressure and providing room for higher batch sizes. In our case, the rate_limiter could increase the batch size from 20 to 50, thus increasing compute time by 2.5x and allowing for much greater overlap of communication and computation. With this fix, we increased the scaling efficiency to >75% (at 32 nodes)! - -Continued investigation into the factors limiting scaling efficiency uncovered that the rate limiter was creating a recurring pipeline bubble of GPU idle time. This was due to the rate limiter using a block and flush approach for the allocation and release of each set of memory buffers. By waiting for the entire block to complete before initiating a new all_gather, the GPU was idling at the start of each block, while waiting for the new set of all_gather parameters to arrive. 
This bubble was alleviated by moving to a sliding window approach. Upon the completion of a single all_gather step and its computation (rather than a block of them), the memory is freed and the next all_gather is immediately issued in a much more uniform manner. This improvement eliminated the pipeline bubble and boosted the scaling efficiencies to >90% (at 32 nodes). - -
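In recent PyTorch releases this rate limiter is surfaced on the FSDP constructor as the `limit_all_gathers` flag. A minimal sketch (flag name and defaults may differ across versions, and a process group must already be initialized):

```python
import torch
import torch.nn as nn
from torch.distributed.fsdp import FullyShardedDataParallel as FSDP

# Assumes torch.distributed.init_process_group(...) has already been called
# and that this rank has a GPU assigned.
model = nn.Linear(1024, 1024).cuda()   # stand-in for the real T5 model

sharded_model = FSDP(
    model,
    device_id=torch.cuda.current_device(),
    limit_all_gathers=True,   # rate-limit prefetched all_gathers to cap buffer memory
)
```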

Figure 1: Scaling of T5-XL (3B) and T5-XXL (11B) from 1 node to 64 nodes

Figure 2: TFLOPs/sec usage for T5-XL (3B) and T5-XXL (11B) as we increase the number of nodes
        - -## IBM Cloud AI System and Middleware - -The AI infrastructure used for this work is a large-scale AI system on IBM Cloud consisting of nearly 200 nodes, each node with 8 NVIDIA A100 80GB cards, 96 vCPUs, and 1.2TB CPU RAM. The GPU cards within a node are connected via NVLink with a card-to-card bandwidth of 600GBps. Nodes are connected by 2 x 100Gbps Ethernet links with SRIOV based TCP/IP stack, providing a usable bandwidth of 120Gbps. - -The IBM Cloud AI System has been production-ready since May of 2022 and is configured with the OpenShift container platform to run AI workloads. We also built a software stack for production AI workloads that provide end-to-end tools for training workloads. The middleware leverages Ray for pre and post processing workloads and PyTorch for training of models. We also integrate a Kubernetes native scheduler, MCAD, that manages multiple jobs with job queuing, gang scheduling, prioritization, and quota management. A multi-NIC CNI discovers all available network interfaces and handles them as a single NIC pool enabling optimized use of the network interfaces in Kubernetes. Finally, CodeFlare CLI supports a single pane for observability of the full stack using a desktop CLI (e.g., GPU utilization, application metrics like loss, gradient norm). - -

Figure 3: Foundation Model Middleware Stack
### Conclusion and Future Work

In conclusion, we demonstrated how we can achieve remarkable scaling of FSDP APIs over non-InfiniBand networks. We identified the bottleneck that had limited scaling to less than 20% efficiency for 11B parameter model training. After identifying the issue, we were able to correct this with a new rate limiter control to ensure a more optimal balance of reserved memory and communication overlap relative to compute time. With this improvement, we were able to achieve 90% scaling efficiency (a 4.5x improvement) at 256 GPUs and 80% at 512 GPUs for training of the 11B parameter model. In addition, the 3B parameter model scales extremely well with 95% efficiency even as we increase the number of GPUs to 512.

This is a first in the industry to achieve such scaling efficiencies for up to 11B parameter models using Kubernetes with vanilla Ethernet and PyTorch native FSDP APIs. This improvement enables users to train huge models on a Hybrid Cloud platform in a cost efficient and sustainable manner.

We plan on continuing to investigate scaling with decoder-only models and increasing the size of these models to 100B+ parameters. From a system design perspective, we are exploring capabilities such as RoCE and GDR that can improve latencies of communications over Ethernet networks.

## Acknowledgements

This blog was possible because of contributions from both the PyTorch Distributed and IBM Research teams.

From the PyTorch Distributed team, we would like to thank Less Wright, Hamid Shojanazeri, Geeta Chauhan, Shen Li, Rohan Varma, Yanli Zhao, Andrew Gu, Anjali Sridhar, Chien-Chin Huang, and Bernard Nguyen.

From the IBM Research team, we would like to thank Linsong Chu, Sophia Wen, Lixiang (Eric) Luo, Marquita Ellis, Davis Wertheimer, Supriyo Chakraborty, Raghu Ganti, Mudhakar Srivatsa, Seetharami Seelam, Carlos Costa, Abhishek Malvankar, Diana Arroyo, Alaa Youssef, and Nick Mitchell.

## Appendix

#### Teraflop computation

The T5-XXL (11B) architecture has two types of T5 blocks: one is an encoder and the second is a decoder. We follow the approach of Megatron-LM, where each matrix multiplication of an m×k matrix by a k×n matrix requires 2mkn FLOPs. The encoder block consists of self-attention and feed forward layers, whereas the decoder block consists of self-attention, cross-attention, and feed forward layers.

The attention (both self and cross) block consists of a QKV projection, which requires 6Bsh^2 operations, an attention matrix computation requiring 2Bs^2h operations, an attention over values which needs 2Bs^2h computations, and a post-attention linear projection which requires 2Bsh^2 operations. Finally, the feed forward layer requires 15Bsh^2 operations.

The total for an encoder block is 23Bsh^2 + 4Bs^2h, whereas for a decoder block it comes to 31Bsh^2 + 8Bs^2h. With a total of 24 encoder and 24 decoder blocks, 2 forward passes (as we discard the activations), and one backward pass (equivalent to two forward passes), the final FLOPs computation comes to 96 × (54Bsh^2 + 12Bs^2h) + 6BshV. Here, B is the batch size per GPU, s is the sequence length, h is the hidden state size, and V is the vocabulary size.
We repeat a similar computation for the T5-XL (3B) architecture, which is slightly different.
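The formula above is easy to turn into a small helper for reporting TFLOPs/sec; the values below are placeholders rather than the configuration used in our experiments:

```python
def t5_xxl_train_flops(B, s, h, V):
    """FLOPs for one T5-XXL training iteration per GPU, following the formula above:
    96 * (54*B*s*h^2 + 12*B*s^2*h) + 6*B*s*h*V."""
    return 96 * (54 * B * s * h**2 + 12 * B * s**2 * h) + 6 * B * s * h * V

# Placeholder values: substitute your batch size per GPU, sequence length,
# hidden size, vocabulary size, and measured seconds per iteration.
flops = t5_xxl_train_flops(B=50, s=512, h=1024, V=32128)
step_time_s = 10.0
print(f"{flops / step_time_s / 1e12:.2f} TFLOPs/sec per GPU")
```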
\ No newline at end of file diff --git a/_posts/2022-12-16-efficient-large-scale-training-with-pytorch.md b/_posts/2022-12-16-efficient-large-scale-training-with-pytorch.md deleted file mode 100644 index 6cc3f27a7f00..000000000000 --- a/_posts/2022-12-16-efficient-large-scale-training-with-pytorch.md +++ /dev/null @@ -1,463 +0,0 @@ ---- -layout: blog_detail -title: "Efficient Large-Scale Training with Pytorch FSDP and AWS" -author: Less Wright, Hamid Shojanazeri, Geeta Chauhan -featured-img: "assets/images/largeblog_index_1.png" ---- - -Cutting-edge AI models are becoming extremely large. The cost and overhead of training these models is increasing rapidly, and involves large amounts of engineering and guesswork to find the right training regime. FSDP reduces these costs significantly by enabling you to train much larger models with the same amount of resources. FSDP lowers the memory footprint on your GPUs, and is usable via a lightweight configuration that requires substantially less effort, typically with just a few lines of code. - -The main performance gains in FSDP come from maximizing the overlap between network communication and model computation, and eliminating the memory redundancy inherent in traditional data parallel training (DDP). PyTorch FSDP can train models approximately 4x larger on the same server resources as DDP and 20x larger if we combine activation checkpointing and activation offloading. - -Since PyTorch 1.12, FSDP is now in beta status, and has added a number of new features that can be tuned to further accelerate your model training. - -In this series of blog posts, we will explain multiple performance optimizations you can run with FSDP to boost your distributed training speed and model sizes within the context of your available server resources. We use the HuggingFace T5 3B, 11B and DeepVit, in fine-tuning mode, as the running examples throughout the series. - -As a preview of some of the optimizations discussed in this series, we show the before and after performance scaled in Flops below (Note that these results can vary based on your server resources and model architecture). - -


*T5 3B Performance measured on AWS A100 and A10 servers. Original with no optimizations and Tuned with the applied optimization*


        - - *T5 11B Performance measured on A100 servers. Original with no optimizations and Tuned with the applied optimization - -In this first post, we will provide a quick overview of FSDP and how it can make training large- scale AI models more efficient. We will highlight briefly the multiple performance options available, and dive deeper into the details on these in upcoming posts. We will then conclude with an overview on how to leverage AWS parallel cluster for large- scale training with FSDP. - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
| Optimization | T5 Model | Throughput Improvement |
| --- | --- | --- |
| Mixed Precision | 3 B | 5x |
| Mixed Precision | 11 B | 10x |
| Activation Checkpointing (AC) | 3 B | 10x |
| Activation Checkpointing (AC) | 11 B | 100x |
| Transformer Wrapping Policy | 3 B | 2x |
| Transformer Wrapping Policy | 11 B | Unable to run the experiment without the Transformer wrapping policy. |
| Full Shard Strategy | 3 B | 1.5x |
| Full Shard Strategy | 11 B | Not able to run with Zero2 |
        - -_Performance optimization gains on T5 models over non-optimized._ - -In our experiments with the T5 3B model, using the [transformer wrapping policy](https://www.youtube.com/watch?v=HQeKwCsnH4k&list=PL_lsbAsL_o2BT6aerEKgIoufVD_fodnuT&index=2) resulted in >2x higher throughput measured in TFLOPS versus the default wrapping policy. [Activation checkpointing](https://www.youtube.com/watch?v=5B4d0FuxSQc&list=PL_lsbAsL_o2BT6aerEKgIoufVD_fodnuT&index=3) resulted in 10x improvement by reinvesting the freed memory from the checkpoints into larger batch size. [Mixed precision](https://www.youtube.com/watch?v=-caN92JtKqA&list=PL_lsbAsL_o2BT6aerEKgIoufVD_fodnuT&index=4) with BFloat16 resulted in ~5x improvement versus FP32 and finally the [full sharding strategy](https://www.youtube.com/watch?v=a3iW6Cggccw&list=PL_lsbAsL_o2BT6aerEKgIoufVD_fodnuT&index=5) versus zero2 (DDP) resulted in 1.5x improvement. - -We ran similar experiments for a larger model, T5 11B, but the larger model size resulted in some changes to the experiment space. Specifically, we found that two optimizations, transformer wrapping policy and activation checkpointing, were needed to enable us to run these experiments on 3 nodes (each node had 8 A100 gpus with 80 GB of memory). With these optimizations, we could fit a batch size of 50 and get higher throughput compared to removing each one of them. Thus rather than running on/off solely for a single optimization test as with the 3B model, the larger model experiments were done with 1 of 3 optimizations turned on/off while always running the other two in order to allow a usable batch size for both test states for each item. - -Based on TFLOP comparisons, with the 11B model, we saw even more payoff from the optimizations. Mixed precision(~10x improvement) and activation checkpointing (~100x improvement) had a much larger impact with the 11B model compared to the 3B parameter model. With mixed precision we could fit ~2x larger batch sizes and with activation checkpointing >15x batch sizes (from 3 with no activation checkpointing to 50 with activation checkpointing) which translated into large throughput improvements. - -We also have observed that for these larger models > 3B, using Zero2 sharding strategy would result in minimal room left in memory for the batch data, and had to go with very small batch sizes (e.g 1-2) that essentially makes full sharding strategy a necessity to enable fitting larger batches sizes. - -_Note - this tutorial assumes a basic understanding of FSDP. To learn more about basics of FSDP please refer to the [getting started](https://pytorch.org/tutorials/intermediate/FSDP_tutorial.html) and [advanced FSDP ](https://pytorch.org/tutorials/intermediate/FSDP_adavnced_tutorial.html)tutorials._ - -**What is FSDP? How does it make Large-Scale Training More Efficient** - -**FSDP** expands upon distributed data parallel, by parallelizing not just data, but the model parameters, the optimizer states and gradients associated with the model. Specifically - **each** **GPU only stores a subset of the entire model** **and the associated subset of optimizer states and gradients.** - -_To show the evolution of distributed training, we can start from the beginning, where AI models were simply trained on a single GPU._ - -DDP (Distributed Data Parallel) was the initial step up from training with only a single GPU, and was an effort to address the data and model size growth, where multiple GPUs each housed their own copy of the same model. 
The gain here is that the data for each batch could be split and processed independently on each GPU, all at the same time,thus parallelizing the processing of the data set and increasing training speed by the increasing number of GPUs. The tradeoff is the need to communicate the gradients between each GPU to synchronize the models after the backward pass. - -FSDP expands on scaling models by removing the redundancy of optimizer calculations and state storage, as well as gradient and memory storage of model parameters that are present in DDP (DDP = Distributed Data Parallel). This redundancy reduction, along with increased communication overlap where model parameter communication takes place at the same time as model computation, is what allows FSDP to train much larger models with the same resources as DDP. - -A key point is that this efficiency also allows for AI models that are larger than a single GPU to be trained. The model size available for training is now increased to the aggregate memory of all GPUs, rather than the size of a single GPU. (And as a point of note, FSDP can go beyond aggregated GPU memory by leveraging CPU memory as well, though we will not directly cover this aspect here). - -As discussed in a previous [blog post](https://medium.com/pytorch/pytorch-data-parallel-best-practices-on-google-cloud-6c8da2be180d), with DDP the largest model that we could train on 32, A100 gpus with 40 GB memory (4 nodes) was up to 3B parameters, and batch size of 128, with the help of activation checkpointing. By contrast, using FSDP we were able to train up to 81B model size, combining activation checkpointing, along with activation and parameter offloading. In another [experiment](https://medium.com/pytorch/training-a-1-trillion-parameter-model-with-pytorch-fully-sharded-data-parallel-on-aws-3ac13aa96cff), we benchmarked a 1T parameter model with FSDP using 512 gpus. - -


        - -For intuition on the parameter level workings of FSDP, below we show an animation detailing how the model parameters are sharded and communicated assuming a two GPU scenario and a simple 8 parameter model: - -


        - - -_Above - the animations walk through the steps involved with the initial sharding of the model amongst ranks, and we start the `all_gathers` and forward pass_ - -


        - -_We continue through the model with the forward pass. After each FSDP unit completes, non-locally owned params are dropped to free memory, and optionally activations can be checkpointed. This continues until we finish the forward pass and compute the loss._ - -


        - -_During the backward pass, another `all_gather` is used to load the parameters and the gradients are computed. These gradients are then `reduce_scattered` so that the local owners of each param can aggregate and prepare to update the weights._ - -


        - -_Finally, each rank passes the summed gradients through the optimizer states and updates the weights to complete the mini-batch._ - -With the model now distributed across the entire set of available GPUs, the logical question is how data moves through the model given this sharding of model parameters. - -This is accomplished by FSDP coordinating with all GPUs to effectively share (communicate) the respective parts of the model. The model is decomposed into FSDP units and parameters within each unit are flattened and then sharded across all GPUs. Within each FSDP unit, GPU’s are assigned interleaving ownership of individual model parameters. - -By interleaving, we mean the following - assuming 2 gpus with an id of 1 and 2, the FSDP unit ownership pattern would be [12121212], rather than a contiguous chunk of [111222]. - -During training, an `all_gather` is initiated and the locally owned model parameters within a FSDP unit are shared by the owner GPU with the other non-owners, when they need it, on a ‘just in time’ type basis. FSDP prefetches parameters to overlap `all_gather` communication with computation. - -When those requested parameters arrive, the GPU uses the delivered parameters, in combination with the parameters it already owns, to create a fully populated FSDP unit. Thus there is a moment where each GPU hits peak memory usage while holding a fully populated FSDP unit. - -It then processes the data through the FSDP unit, and drops the parameters it received from other GPU’s to free up memory for the next unit…the process continues over and over proceeding through the entire model to complete the forward pass.The process is then repeated (in general) for the backward pass.(note - this is a simplified version for understanding..there is additional complexity but this should help construct a basic mental model of the FSDP process). - -This eliminates much of the memory redundancy present in DDP, but imposes the cost of higher amounts of network communication to shuttle these requested parameters back and forth amongst all the GPUs.**Overlapping the communication timing with the computation taking place is the basis of many of the performance improvements we’ll discuss in this series.** The key gains are frequently based on the fact that communication can often take place at the same time as computation.As you can surmise, **having high communication speed is vital for FSDP performance.** - - -### **How do I optimize my training with FSDP?** - -There are four main performance improvements we will cover - the transformer wrapper, activation checkpointing, mixed precision, and selecting the proper sharding strategy. The flowchart below will help as a checklist for tuning options that we will discuss in this post. - -


        - -**Wrapping policy - _for transformers, use Transformer wrapping policy_** - -The first performance optimization is leveraging the FSDP transformer wrapper for transformer models. - -One of the pre-defined wrapping policy is `size_based_autowrap_policy`. With `size_based_autowrap_policy`, FSDP will traverse the module structure from bottom to top, a new FSDP unit will be created once the current unit has at least the `min_num_params` specified within the size policy (this defaults to 1e8, or 100M). If the module can not be created as an FSDP unit, FSDP will continue to check its parent module. This size based wrapping policy may not be ideal for some model structures, PyTorch distributed team is actively working on a new default wrapping policy in the next release which is based on size and also module execution order, users can simply tune the size and achieve the optimized performance. - -In the current release, you can greatly improve your performance when running Transformer models by using the ‘transformer wrapper’. You will need to provide the appropriate layer class for your model. Here, layer class is the class that houses the Multi-Head Attention and Feed Forward Network. - -FSDP will then form the FSDP units around the layer class rather than arbitrary breaks based on parameter size. By sharding the model around layer classes that are uniformly repeated within the transformer, FSDP can create uniform FSDP units that better balance the overlap of computation and communication. By contrast, size based wrapping can produce very uneven or skewed shards for models, which then have uneven matching of compute vs communication overlap. As discussed earlier, the main driver of FSDP high performance is the overlap of communication and computation, and hence why the Transformer wrapper provides improved performance. Note that the Transformer wrapper can also be used for non-transformer models if these models have a list of uniform layers. - -Let’s compare the performance difference on a T5, 3B parameter model when running under the default wrapper and the transformer wrapper. - -For default wrapping, we don’t need to take any action - we simply pass the model to FSDP as shown: - -```python -model = FSDP( - model, - device_id=torch.cuda.current_device(), - ) -``` - - -In this case FSDP will simply wrap the whole model in a single FSDP unit. - -Running on an [NVIDIA A100-SXM4–40GB](https://www.nvidia.com/content/dam/en-zz/Solutions/Data-Center/a100/pdf/nvidia-a100-datasheet-us-nvidia-1758950-r4-web.pdf) with 8 GPUs, we are able to reach 2.3 TFlops and 95% GPU memory utilization with a batch size of 14. - -However, since T5 is a transformer model, we are better served to leverage the transformer wrapper for this model. - -To use that, we need to isolate the layer class for the transformer, and then pass it in to create our transformer wrapper. 
- -```python -from transformers.models.t5.modeling_t5 import T5Block -``` - -And now we can create our Transformer wrapper: - -```python -transformer_auto_wrapper_policy = functools.partial( - transformer_auto_wrap_policy, - transformer_layer_cls={ - T5Block, # < ---- Your Transformer layer class - }, - ) -``` - -With our model aware wrapper ready, we can initialize FSDP: - -```python -# invoke FSDP with your transformer wrapper policy: -model = FSDP( - model, - auto_wrap_policy=transformer_auto_wrapper_policy, - device_id=torch.cuda.current_device(), # streaming init - ) -``` - -Running this wrapped model, we can see some substantial performance gains.We can fit nearly double the batch size, going to 28, and with better memory and communication efficiency, we see a TFlops increase to 5.07 from 2.3. - -Thus, we’ve increased our training throughput by over 200% (2.19x) due to providing greater model info to FSDP! The transformer wrapping policy results in more fine-grained and balanced FSDP units each holding a layer class, which leads to a more effective communication-computation overlap. - -


        - -_Above: Graphical comparison of TFlops based on wrapper type_ - -If you are training a Transformer model, it pays to configure your training with FSDP using the transformer wrapper. For more information on how to isolate your layer class, please see our in depth video on Transformer wrapping [here](https://www.youtube.com/watch?v=HQeKwCsnH4k), where we walk through a number of transformers showing where the layer class can be found. - -**Mixed precision - _use BF16 if you have an Ampere architecture GPU_** - -FSDP supports a flexible mixed precision policy that gives you granular control over parameters, gradients and buffer data types. This lets you easily leverage BFloat16 or FP16 to increase your training speed by up to 70%. - -*Note that BFloat 16 is only available on Ampere type GPUs. On AWS this is available with p4dn and g5 instances. - -By way of comparison, we can show a 77% speed improvement when comparing fully tuned BFloat16 vs FP32 on an 8B DeepVit model. - -
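If you are unsure whether your GPU supports BF16, a quick runtime check (a small sketch using `torch.cuda.is_bf16_supported()`, available in recent PyTorch versions) is:

```python
import torch

# Pick BF16 only when the hardware supports it (e.g., Ampere-class GPUs such as A100/A10);
# otherwise fall back to FP32 (or FP16 with a gradient scaler).
if torch.cuda.is_available() and torch.cuda.is_bf16_supported():
    mp_dtype = torch.bfloat16
else:
    mp_dtype = torch.float32
print(f"Using mixed precision dtype: {mp_dtype}")
```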

        - -

        - -We have obtained even greater acceleration using BFloat16 in fine-tuning a 3B HuggingFace T5 model as shown in the figures below. We observed that because of the lower precision the validation loss of BFloat16 is slightly behind in the first few epochs, but it is able to catch up and results in the same final accuracy as FP32. - -
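Before enabling a BFloat16 policy like the one created below, it can be worth confirming that the GPU and the communication stack actually support it. The following check is a sketch rather than part of the original training script; it assumes a CUDA build of PyTorch with the NCCL backend available.

```python
import torch
import torch.distributed as dist

# Rough BFloat16 readiness check (assumes a CUDA build of PyTorch with NCCL available).
bf16_ready = (
    torch.cuda.is_available()
    and torch.cuda.is_bf16_supported()        # True on Ampere-class GPUs
    and dist.is_nccl_available()
    and torch.cuda.nccl.version() >= (2, 10)  # NCCL release with BF16 collectives
)
print(f"BFloat16 ready: {bf16_ready}")
```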

        - -

To use mixed precision, we create a policy with our desired data types, and pass it in during the FSDP initialization.

To create our policy, we need to import the MixedPrecision class and then define our custom policy with our desired data types:

```python
import torch
from torch.distributed.fsdp import MixedPrecision

bfSixteen = MixedPrecision(
    # Parameter precision.
    param_dtype=torch.bfloat16,
    # Gradient communication precision.
    reduce_dtype=torch.bfloat16,
    # Buffer precision.
    buffer_dtype=torch.bfloat16,
)

model = FSDP(
    model,
    auto_wrap_policy=transformer_auto_wrapper_policy,
    mixed_precision=bfSixteen,
)
```

You can mix and match the precision for parameters, gradients and buffers as you prefer:

```python
comboPolicy = MixedPrecision(
    # Parameter precision.
    param_dtype=torch.bfloat16,
    # Gradient communication precision.
    reduce_dtype=torch.float32,
    # Buffer precision.
    buffer_dtype=torch.float32,
)
```

For training with FP16, you will also need to use the ShardedGradScaler, which we will cover in subsequent posts. For BFloat16, no gradient scaler is needed and the policy is a drop-in change.

**AnyPrecision Optimizer - _going beyond mixed precision with full BF16 training_**

Mixed precision training, both in FSDP and elsewhere, maintains the working weights in the reduced data type (BF16 or FP16) while keeping the master weights in full FP32. The reason for keeping the master weights in FP32 is that running in pure BF16 will result in ‘weight stagnation’, where very small weight updates are lost due to the lower precision and accuracy flatlines over time, while FP32 weights can continue to improve from these small updates.

In order to resolve this dilemma, we can use the new AnyPrecision optimizer available in [TorchDistX](https://github.com/pytorch/torchdistx) (Torch Distributed Experimental), which allows you to successfully train and keep the master weights in pure BF16 instead of FP32. In addition, unlike the typical storage of optimizer states in FP32, AnyPrecision is able to maintain states in pure BF16 as well.

AnyPrecision enables pure BF16 training by maintaining an extra buffer that tracks the precision lost during the weight updates and re-applies it during the next update, effectively resolving the weight stagnation issue without requiring FP32.

As a comparison of the throughput gains available with pure BF16 training using AnyPrecision, we ran experiments using FSDP with the T5 11B model with regular FP32 training, mixed precision training with BF16, and pure BF16 training using the AnyPrecision optimizer on 3 nodes with A100 GPUs, as mentioned previously.
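For reference, here is a minimal sketch of dropping AnyPrecision in place of AdamW. The import path and keyword arguments reflect the TorchDistX repository at the time of writing and should be treated as assumptions; check the repository for the current API, and note that the learning rate is a placeholder.

```python
import torch
from torchdistx.optimizers import AnyPrecisionAdamW  # import path assumed from the TorchDistX repo

optimizer = AnyPrecisionAdamW(
    model.parameters(),
    lr=4e-4,                        # placeholder learning rate
    weight_decay=0.0,
    momentum_dtype=torch.bfloat16,  # keep the first moment in BF16
    variance_dtype=torch.bfloat16,  # keep the second moment in BF16
    use_kahan_summation=True,       # compensated summation to counteract weight stagnation
)
```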

        - -

As shown above, training with AnyPrecision and pure BF16 resulted in 2x the throughput vs. mixed precision, and over a 20x improvement vs. FP32.

The potential tradeoff is the impact on final accuracy - in the cases we tested, the accuracy was equal to or better than FP32, due to a regularization effect from the slightly reduced precision, but your results may vary.

The AnyPrecision optimizer is available for you to test with [here](https://github.com/pytorch/torchdistx), and is a drop-in replacement for the AdamW optimizer.

**Activation checkpointing - _increasing throughput by trading compute for memory_**

        - -

**FSDP supports activation checkpointing once the model has been sharded**, and makes it easy to implement. The graph above shows a ~4x throughput improvement from using activation checkpointing.

With activation checkpointing, intermediate activations are freed during the forward pass and a checkpoint is left as a placeholder. This generally increases available GPU memory by over 30%.

The tradeoff is that during the backward pass, these previously removed intermediate activations must be re-calculated using information in the checkpoint (duplicate compute), but by leveraging the freed GPU memory, one can increase the batch size such that the net throughput can increase substantially.

```python
# verify we have FSDP activation support ready by importing:
from torch.distributed.algorithms._checkpoint.checkpoint_wrapper import (
    checkpoint_wrapper,
    CheckpointImpl,
    apply_activation_checkpointing_wrapper,
)
```

To implement activation checkpointing, we first import the FSDP checkpointing functions as shown above. We then declare our checkpoint wrapper type, which is non-reentrant, and create a check function to identify which layers to wrap, as follows:

```python
from functools import partial

non_reentrant_wrapper = partial(
    checkpoint_wrapper,
    offload_to_cpu=False,
    checkpoint_impl=CheckpointImpl.NO_REENTRANT,
)
# T5Block is the same transformer layer class used for the wrapping policy earlier.
check_fn = lambda submodule: isinstance(submodule, T5Block)
```

```python
apply_activation_checkpointing_wrapper(
    model, checkpoint_wrapper_fn=non_reentrant_wrapper, check_fn=check_fn
)
```

_Important note - this must be run after the model has been initialized with FSDP._

Hopefully you’ve seen how some initial tuning with FSDP options can have a large impact on your training performance.

With that, we turn our attention from how to scale within FSDP to how to scale your server hardware for FSDP using AWS.

**Large Scale Training with FSDP on AWS - _for multi-node, prioritize a high speed network_**

AWS provides several services that can be used to run distributed training with FSDP: [Amazon EC2 Accelerated Computing instances](https://aws.amazon.com/ec2/instance-types/#Accelerated_Computing), AWS [ParallelCluster](https://aws.amazon.com/hpc/parallelcluster/), and Amazon [SageMaker](https://aws.amazon.com/sagemaker/features/?nc=sn&loc=2).

In this series of blog posts, we used [Amazon EC2 p4d](https://aws.amazon.com/ec2/instance-types/p4/) instances in a single-instance multi-GPU configuration and in a multi-instance configuration using AWS [ParallelCluster](https://aws.amazon.com/hpc/parallelcluster/) and SageMaker in order to run our training jobs.

Here, we’ll focus specifically on AWS ParallelCluster and provide an overview of how to utilize it for training purposes.

**AWS ParallelCluster Setup**

AWS ParallelCluster is an open-source cluster management tool that makes it easy for you to deploy and manage High Performance Computing (HPC) clusters on AWS. AWS ParallelCluster uses YAML configuration files to provision all the necessary resources. It also supports multiple instance types, job submission queues, shared file systems like Amazon EFS (NFS) or Amazon FSx for Lustre, and job schedulers like AWS Batch and Slurm.

        - -

        - -

**Workflow on Clusters**

The high-level idea is to have a cluster with a head node that controls the compute nodes. The actual training job runs on the compute nodes. The overall steps to run a training job on a cluster are as follows:

1. Set up an AWS ParallelCluster (discussed below).
2. Connect to the head node, import the training code and set up the environment.
3. Pull the data and place it in a shared folder that compute nodes can access (an FSx for Lustre drive).
4. Run the training job using a job scheduler (in this case Slurm).

**Setup AWS ParallelCluster**

To set up AWS ParallelCluster:

1. **Deploy a network stack.** This step is optional since you could use your account's default VPC and let AWS ParallelCluster create your subnets and security groups. However, we prefer to compartmentalize our desired network infrastructure and do this deployment via a CloudFormation stack.

    Since we deploy a public and a private subnet, we want to create them in an Availability Zone that contains our target instances, in this case p4d. We consult their availability in the region we use (us-east-1) through the following AWS CLI command:

    `aws ec2 describe-instance-type-offerings --location-type availability-zone --filters Name=instance-type,Values=p4d.24xlarge --region us-east-1 --output table`

    We see three Availability Zones containing p4d instances; we pick one of them (`us-east-1c`, yours may be different) when deploying our network stack. This can be done with the AWS Console or the AWS CLI. In our case we use the latter as follows:

    `aws cloudformation create-stack --stack-name VPC-Large-Scale --capabilities CAPABILITY_IAM --template-body file://VPC-Large-Scale.yaml --parameters ParameterKey=SubnetsAZ,ParameterValue=us-east-1c`

    CloudFormation will deploy our new VPC, subnets, security groups and endpoints on our behalf. Once done, you can retrieve the IDs of the public and private subnets by querying the stack outputs for the values `PublicSubnet` and `PrivateSubnet`.

    For example, using the AWS CLI for the private subnet:

    `aws cloudformation describe-stacks --stack-name VPC-Large-Scale --query "Stacks[0].Outputs[?OutputKey=='PrivateSubnet'].OutputValue" --output text`

2. **Create the ParallelCluster.** The cluster configuration file specifies the resources for our cluster. These resources include the instance type for the head node, the compute nodes, access to S3 buckets, and the shared storage where our data will be located. We will use Amazon FSx for Lustre, which offers a fully managed shared storage service based on Lustre.

    [Here](https://github.com/lessw2020/t5_11/blob/main/hpc-cluster/cluster.yaml) is an example of a cluster configuration file. We can use the AWS ParallelCluster CLI to create the cluster. Please note that the private and public subnet IDs will need to be replaced by the ones you retrieved earlier. You will be able to control the cluster using the AWS ParallelCluster CLI to start, stop, pause, etc.

    ```
    pcluster create-cluster --cluster-name my-hpc-cluster --cluster-configuration cluster.yaml
    ```

3. **SSH to the head node -** once the cluster is ready, we can connect to the head node over SSH, pull our training code, and place the data in the shared storage specified in the cluster configuration file.

    ```
    pcluster ssh --cluster-name cluster -i your-key_pair
    ```

4. **Launch the training job -** now that we have the data and training code, we can launch the Slurm job for training.
Here is an [example](https://github.com/lessw2020/t5_11/blob/main/hpc-cluster/modified-bert.slurm) of a slurm script to launch the job using torchrun. - -More details on how to set up the cluster is out of the scope of this post, however we will have a separate post on it. - -**What’s next?** - -With this post we provided a high level overview of FSDP and how it efficiently scales distributed AI training. The flowchart included will help provide a checklist for you to review tuning options discussed such as the transformer wrapper and activation checkpointing. - -In the next posts, we will continue with the T5 model and go deeper into each of the topics above, specifically with sharding strategy and other optimizations to provide more insight and details. For now, a good reference for the sharding strategy is in our video tutorial [here](https://www.youtube.com/watch?v=a3iW6Cggccw&list=PL_lsbAsL_o2BT6aerEKgIoufVD_fodnuT&index=5): - -If you have questions or find an issue, please find the authors [Less](https://www.linkedin.com/in/less-wright-22b59017/), [Hamid](https://www.linkedin.com/in/hamid-nazeri/) and [Geeta](https://www.linkedin.com/in/geetachauhan/) or open an issue on[ PyTorch github](https://github.com/pytorch/pytorch). - -**Special thanks to:** - -Pytorch Distributed team, Shen Li, Rohan Varma, Yanli Zhao, Andrew Gu, Anjali Sridhar, Ana Simoes, Pierre-Yves Aquilanti, Sundar Ranganathan, and the broader AWS team for supporting us with providing infrastructure and technical support for running the large scale experiments. - -**Resources:** - -_[FSDP video series](https://www.youtube.com/playlist?list=PL_lsbAsL_o2BT6aerEKgIoufVD_fodnuT)_ - -_[Getting started with FSDP](https://pytorch.org/tutorials/intermediate/FSDP_tutorial.html)_ - -_[Advanced tutorial on FSDP](https://pytorch.org/tutorials/intermediate/FSDP_adavnced_tutorial.html)_ - -_[API documentation](https://pytorch.org/docs/stable/fsdp.html?highlight=fsdp#module-torch.distributed.fsdp)_ - - \ No newline at end of file diff --git a/_posts/2022-12-22-scaling-vision-model-training-platforms-with-pytorch.md b/_posts/2022-12-22-scaling-vision-model-training-platforms-with-pytorch.md deleted file mode 100644 index fdf4058321b2..000000000000 --- a/_posts/2022-12-22-scaling-vision-model-training-platforms-with-pytorch.md +++ /dev/null @@ -1,161 +0,0 @@ ---- -layout: blog_detail -title: "Scaling Vision Model Training Platforms with PyTorch" -author: Vaibhav Aggarwal, Mannat Singh, Anjali Sridhar, Yanghao Li, Shoubhik Debnath, Ronghang Hu, Will Feng, Xinlei Chen, Tingting Markstrum, Diana Liskovich, Anupam Bhatnagar, Chay Ryali, Haoqi Fan, Tete Xiao, Min Xu, Rahul Iyer, Christoph Feichtenhofer, Ross Girshick, Piotr Dollar, Aaron Adcock, Wan-Yen Lo, CK Luk -featured-img: "/assets/images/scaling-vision-figure_1-solutions-to-the-challenges.png" ---- - -*TL;DR: We demonstrate the use of PyTorch with FairScale’s FullyShardedDataParallel (FSDP) API in writing large vision transformer models. We discuss our techniques for scaling and optimizing these models on a GPU cluster. The goal of this platform scaling effort is to enable research at scale. This blog does not discuss model accuracy, new model architectures, or new training recipes.* - -## 1. Introduction - -Latest vision research [1, 2] demonstrates model scaling as a promising research direction. In this project, we aim to enable our platforms to train massive vision transformer (ViT) [3] models. 
We present our work on scaling the largest trainable ViT from 1B to 120B parameters in FAIR vision platforms. We wrote ViT in PyTorch and leveraged its support for large-scale, distributed training on a GPU cluster. - -In the rest of this blog, we will first discuss the main challenges, namely *scalability*, *optimization*, and *numerical stability*. Then we will discuss how we tackle them with techniques including *data and model parallelism*, *automatic mixed precision*, *kernel fusion*, and *bfloat16*. Finally, we present our results and conclude. - -## 2. Main Challenges - -### 2.1 Scalability - -The key scalability challenge is to efficiently shard a model’s operations and state across multiple GPUs. A 100B parameter model requires ~200GB of RAM just for parameters, assuming fp16 representation. So, it is impossible to fit the model on a single GPU (A100 has at most 80GB RAM). Therefore, we need some way to efficiently shard a model’s data (input, parameters, activations, and optimizer state) across multiple GPUs. - -Another aspect of this problem is to scale without significantly changing the training recipe. E.g. Certain representation learning recipes use a global batch size of up to 4096 beyond which we start to see accuracy degradation. We cannot scale to more than 4096 GPUs without using some form of tensor or pipeline parallelism. - -### 2.2 Optimization - -The key optimization challenge is to maintain high GPU utilization even as we scale the number of model parameters and flops. When we scale models to teraflops and beyond, we start to hit major bottlenecks in our software stack that super-linearly increase training time and reduce accelerator utilization. We require hundreds or thousands of GPUs to run just a single experiment. Improvements in accelerator utilization can lead to significant reductions in cost and improve fleet utilization. It enables us to fund more projects and run more experiments in parallel. - -### 2.3 Numerical Stability - -The key stability challenge is to avoid numerical instability and divergence at large scale. We empirically observed in our experiments that the training instability gets severe and hard to deal with when we scale up model sizes, data, batch sizes, learning rate, etc. Vision Transformers particularly face training instability even at a lower parameter threshold. E.g., we find it challenging to train even ViT-H (with just 630M parameters) in mixed-precision mode without using strong data augmentation. We need to study the model properties and training recipes to make sure that the models train stably and converge. - -## 3. Our Solutions - -**Figure 1** depicts our solutions to each of the challenges. - -

        - -

        - -### 3.1 Addressing scaling challenges with data parallelism and model parallelism - -We apply various forms of data and model parallelism to enable fitting very large models in GPU memory. - -We use FairScale’s *FullyShardedDataParallel (FSDP)* API [4], based on PyTorch, to shard parameters, gradients, and optimizer state across multiple GPUs, thereby reducing the memory footprint per GPU. This process consists of the following three steps: - -- Step 1: We wrapped the entire model in a single FSDP instance. This shards the model parameters at the end of a forward pass and gathers parameters at the beginning of a forward pass. This enabled us to scale ~3x from 1.5B to 4.5B parameters. - -- Step 2: We experimented with wrapping individual model layers in separate FSDP instances. This nested wrapping further reduced the memory footprint by sharding and gathering parameters of individual model layers instead of an entire model. The peak memory is then determined by an individually wrapped transformer block in GPU memory in this mode instead of the entire model. - -- Step 3: We used *activation-checkpoint* to reduce the memory consumption by activations. It saves the input tensors and discards the intermediate activation tensors during the forward pass. These are recomputed during the backward pass. - -In addition, we experimented with model-parallelism techniques such as pipeline parallelism [5], which allow us to scale to more GPUs without increasing the batch size. - -### 3.2 Addressing optimization challenges with advanced AMP and kernel fusion - -#### Advanced AMP - -Automatic Mixed Precision (AMP) [6] training refers to training models using a lower precision of bits than FP32 or the default but still maintaining accuracy. We experimented with three levels of AMP as described below: - -- AMP O1: This refers to training in mixed precision where weights are in FP32 and some operations are in FP16. With AMP O1, the ops that might impact accuracy remain in FP32 and are not autocasted to FP16. - -- AMP O2: This refers to training in mixed precision but with more weights and ops in FP16 than in O1. Weights do not implicitly remain in FP32 and are cast to FP16. A copy of the master weights is maintained in the FP32 precision that is used by the optimizer. If we want the normalization layer weights in FP32 then we need to explicitly use layer wrapping to ensure that. - -- Full FP16: This refers to training in full FP16 where weights and operations are in FP16. FP16 is challenging to enable for training due to convergence issues. - -We found that AMP O2 with LayerNorm wrapping in FP32 leads to the best performance without sacrificing accuracy. - -#### Kernel Fusion - -- To reduce GPU kernel launch overhead and increase GPU work granularity, we experimented with kernel fusions, including fused dropout and fused layer-norm, using the [xformers library](https://github.com/facebookresearch/xformers) [7]. - -### 3.3 Addressing stability challenges by studying ops numerical stability and training recipes - -#### BFloat16 in general but with LayerNorm in FP32 - -The [bfloat16](https://cloud.google.com/tpu/docs/bfloat16) (BF16) [8] floating-point format provides the same dynamic range as FP32 with a memory footprint identical to FP16. We found that we could train models in the BF16 format using the same set of hyperparameters as in FP32, without special parameter tuning. Nevertheless, we found that we need to keep LayerNorm in FP32 mode in order for the training to converge. 
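To make the nested wrapping and activation checkpointing from Section 3.1 concrete, here is a minimal sketch using FairScale's wrapping utilities. It is illustrative only: the `vit.blocks` attribute and the flag values are assumptions about the model implementation, not the exact production setup.

```python
import torch.nn as nn
from fairscale.nn import FullyShardedDataParallel as FSDP
from fairscale.nn.checkpoint import checkpoint_wrapper
from fairscale.nn.wrap import enable_wrap, wrap

fsdp_params = dict(
    mixed_precision=True,     # train in mixed precision inside FSDP
    flatten_parameters=True,  # flatten each wrapped unit's parameters
)

def shard_vit(vit: nn.Module) -> nn.Module:
    with enable_wrap(wrapper_cls=FSDP, **fsdp_params):
        # Nested wrapping: each transformer block becomes its own FSDP unit,
        # with activation checkpointing applied to the block first.
        for i, block in enumerate(vit.blocks):
            vit.blocks[i] = wrap(checkpoint_wrapper(block))
        # Outer FSDP instance around the whole model.
        return wrap(vit)
```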
### 3.4 Final training recipe

A summary of the final training recipe:

1. Wrap the outer model in an FSDP instance. Enable parameter sharding after the forward pass.
2. Wrap individual ViT blocks with activation checkpointing, nested FSDP wrapping, and parameter flattening.
3. Enable mixed precision mode (AMP O2) with bfloat16 representation. Maintain the optimizer state in FP32 precision to enhance numerical stability.
4. Wrap normalization layers like LayerNorm in FP32 for better numerical stability.
5. Maximize Nvidia TensorCore utilization by keeping matrix dimensions multiples of 8. For more details, check the [Nvidia Tensor Core Performance Guide](https://developer.download.nvidia.com/video/gputechconf/gtc/2019/presentation/s9926-tensor-core-performance-the-ultimate-guide.pdf).

## 4. Results

In this section, we show the scaling results of ViT on three types of tasks: (1) image classification, (2) object detection, and (3) video understanding. **Our key result is that we are able to train massive ViT backbones across these vision tasks after applying the discussed scaling and optimization techniques. This enables vision research at a much larger scale.** We trained the models to convergence to verify that we maintain the current baselines even with all the optimizations. A common trend in Figures 2, 3, and 4 is that we are able to train up to 25B-parameter models with an epoch time of less than 4 hours on 128 A100 GPUs. The 60B and 120B models are relatively slower to train.

**Figure 2** shows the *image-classification* scaling result. It plots the epoch time for training ViTs on ImageNet using 128 A100-80GB GPUs with different model sizes.

        - -

        - -

        -Figure 2: Image-classification scaling result. -

        - -**Figure 3** shows the *object-detection* scaling result. It plots the epoch time for training [ViTDet](https://arxiv.org/abs/2203.16527) [9] with different ViT backbones on COCO using 128 A100-80GB GPUs. - -

        - -

        - -

        -Figure 3: Object-detection scaling result. -

        - -**Figure 4** shows the *video-understanding* scaling result. It plots the epoch time for training [MViTv2](https://arxiv.org/abs/2112.01526) [10] models on [Kinetics 400](https://www.deepmind.com/open-source/kinetics) [11] using 128 V100 (32 GB) GPUs in FP32. - -

        - -

        - -

        -Figure 4: Video-understanding scaling result. -

        - -**Figure 5** shows the optimization result with the ViT-H model in Figure 2 on 8 A100-40GB GPUs. -Three versions are used: (1) the baseline uses PyTorch’s DDP [12] with AMP O1, (2) FSDP + AMP-O2 + other optimizations, and (3) FSDP + FP16 + other optimizations. These optimizations altogether speed up the training by up to 2.2x. - -

        - -

        - -

        -Figure 5: Training speedups from various optimizations. -

        - -## 5. Concluding Remarks - -We have demonstrated the use of PyTorch with FairScale’s FullyShardedDataParallel (FSDP) API in writing large vision transformer models. We discuss our techniques for scaling and optimizing these models on a GPU cluster. We hope that this article can motivate others to develop large-scale ML models with PyTorch and its ecosystem. - -## References - -[1] [Masked Autoencoders Are Scalable Vision Learners](https://arxiv.org/abs/2111.06377) - -[2] [Revisiting Weakly Supervised Pre-Training of Visual Perception Models](https://arxiv.org/abs/2201.08371) - -[3] [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929v2) - -[4] [fairscale.nn.FullyShardedDataParallel](https://fairscale.readthedocs.io/en/stable/api/nn/fsdp.html) - -[5] [Pipeline parallelism in PyTorch](https://pytorch.org/docs/stable/pipeline.html) - -[6] [Automatic Mixed Precision (AMP) in PyTorch](https://pytorch.org/docs/stable/amp.html#module-torch.amp) - -[7] [xformers](https://github.com/facebookresearch/xformers) - -[8] [The bfloat16 numerical format](https://cloud.google.com/tpu/docs/bfloat16) - -[9] [Exploring Plain Vision Transformer Backbones for Object Detection](https://arxiv.org/abs/2203.16527) - -[10] [MViTv2: Improved Multiscale Vision Transformers for Classification and Detection](https://arxiv.org/abs/2112.01526) - -[11] [https://www.deepmind.com/open-source/kinetics](https://www.deepmind.com/open-source/kinetics) - -[12] [Getting Started with Distributed Data Parallel (DDP)](https://pytorch.org/tutorials/intermediate/ddp_tutorial.html) \ No newline at end of file diff --git a/_posts/2022-12-28-torchserve-performance-tuning.md b/_posts/2022-12-28-torchserve-performance-tuning.md deleted file mode 100644 index e7b4e84f66a7..000000000000 --- a/_posts/2022-12-28-torchserve-performance-tuning.md +++ /dev/null @@ -1,445 +0,0 @@ ---- -layout: blog_detail -title: "Torchserve Performance Tuning, Animated Drawings Case-Study" -author: Hamid Shojanazeri, Geeta Chauhan, Mark Saroufim, Jesse Smith -featured-img: "assets/images/sketch_animator.png" ---- - -In this post we discuss performance tuning of Torchserve for serving your models in production. One of the biggest challenges in the life cycle of a ML project is deploying models in production. This requires a reliable serving solution along with solutions that address the MLOps needs. A robust serving solution needs to provide support for multi model serving, model versioning, metric logging, monitoring and scaling to serve the peak traffic. In this post, we will have an overview of Torchserve and how to tune its performance for production use-cases. We discuss the [Animated Drawings app](https://ai.facebook.com/blog/using-ai-to-bring-childrens-drawings-to-life/) from Meta that can turn your human figure sketches to animations and how it could serve the peak traffic with Torchserve. The Animated Drawing’s workflow is below. - -

        - -

[https://ai.facebook.com/blog/using-ai-to-bring-childrens-drawings-to-life/](https://ai.facebook.com/blog/using-ai-to-bring-childrens-drawings-to-life/)

Many AI systems and tools are designed to handle realistic images of humans; children's drawings add a level of complexity and unpredictability as they are often constructed in abstract, fanciful ways. These types of morphological and stylistic variations can confuse even state-of-the-art AI systems that excel at spotting objects in photorealistic images and drawings.
Meta AI researchers are working to overcome this challenge so that AI systems will be better able to recognize drawings of human figures in the wildly varied ways that children create them. The blog post linked above provides more details about Animated Drawings and the approach taken.

## Torchserve

        - -

Fig 1. Overall flow of Torchserve performance tuning
        -

Once you have trained your model, it needs to be integrated into a larger system to have a full-fledged application; we use the term “model serving” to refer to this integration. Basically, model serving is making your trained model available to run inference and be used subsequently.

Torchserve is the PyTorch-preferred solution for serving models in production. It is a performant and scalable tool that wraps your model in an HTTP or HTTPS API. It has a frontend implemented in Java that handles multiple tasks, from assigning workers for serving models to handling the connection between client and server. Torchserve has a Python backend that is responsible for handling the inference service.

Torchserve supports multi-model serving and model versioning for A/B testing, dynamic batching, logging and metrics. It exposes four APIs for [inference](https://github.com/pytorch/serve/blob/master/docs/inference_api.md), [explanations](https://github.com/pytorch/serve/blob/master/docs/inference_api.md#explanations-api), [management](https://github.com/pytorch/serve/blob/master/docs/management_api.md) and [metrics](https://github.com/pytorch/serve/blob/master/docs/metrics_api.md).

The [Inference](https://github.com/pytorch/serve/blob/master/docs/inference_api.md) API listens on port 8080 and is accessible through localhost by default; this can be changed in the [Torchserve configuration](https://github.com/pytorch/serve/blob/master/docs/configuration.md). It enables getting predictions from the model.

The [Explanation](https://github.com/pytorch/serve/blob/master/docs/inference_api.md#explanations-api) API uses Captum under the hood to provide explanations of the model that is being served, and listens on port 8080 as well.

The [Management](https://github.com/pytorch/serve/blob/master/docs/management_api.md#management-api) API allows you to register, unregister and describe a model. It also enables users to scale up or down the number of workers that serve the model.

The [Metric](https://github.com/pytorch/serve/blob/master/docs/metrics_api.md) API by default listens on port 8082 and enables us to monitor the model that is being served.

Torchserve lets you scale your model serving and handle peak traffic by supporting [batch inference](https://github.com/pytorch/serve/blob/master/docs/batch_inference_with_ts.md) and multiple workers that serve your model. Scaling can be done through the [management](https://github.com/pytorch/serve/blob/master/docs/management_api.md) API and settings through a [configuration](https://github.com/pytorch/serve/blob/master/docs/configuration.md) file. Also, the metric API helps you to monitor your model serving through default and customizable metrics.

Other advanced settings, such as the length of the queue for received requests, the maximum wait time for a batch of inputs and many other properties, are configurable through a [config file](https://github.com/pytorch/serve/blob/master/docs/configuration.md) that can be passed to Torchserve when it is started.

**Steps to serve your model with Torchserve**

1. [Install Torchserve, the model archiver](https://github.com/pytorch/serve/blob/master/docs/getting_started.md#install-torchserve-and-torch-model-archiver) and their requirements.
2. Choose a default handler that fits your task (e.g. image classification, etc.) or author a [custom handler](https://github.com/pytorch/serve/blob/master/docs/custom_service.md#custom-handlers).
3. [Package your model](https://github.com/pytorch/serve/tree/master/examples/Huggingface_Transformers#create-model-archive-eager-mode) artifacts (the trained model checkpoint and all other files necessary for loading and running your model) together with the handler into a “.mar” file using the [Torch model archiver](https://github.com/pytorch/serve/blob/master/model-archiver/README.md), and place it in the model store.
4. [Start serving your model](https://github.com/pytorch/serve/blob/master/docs/getting_started.md).
5. [Run inference](https://github.com/pytorch/serve/blob/master/docs/getting_started.md#get-predictions-from-a-model).

We will discuss model handlers and metrics in more detail here.

## Model handlers

Torchserve uses a handler in the backend to load the models, preprocess the received data, run inference and post-process the response. A handler in Torchserve is a **Python script** that contains all the model initialization, preprocessing, inference and post-processing logic.

Torchserve provides an out-of-the-box handler for a number of applications like image classification, segmentation, object detection and text classification. It also supports custom handlers, in case your use case is not supported by the default handlers.

Custom handlers provide great flexibility, which potentially makes Torchserve a **multi-framework** serving tool. Custom handlers let you define your own logic to initialize a model, which can also be used to load models from other frameworks such as ONNX.

A Torchserve **handler** is made of four main **functions**, **initialize**, **preprocess**, **inference** and **postprocess**, that each return a list. The code snippet below shows an example of a custom handler. **Custom handlers inherit** from **BaseHandler** in Torchserve and can **overwrite** any of the **main** **functions**. Here is an example of the handler used for loading the [Detectron2](https://github.com/facebookresearch/detectron2) model for figure detection. This model has been exported to Torchscript and uses model.half() to run the inference with FP16; details are explained in another [section]() in this post.
```python
import io
import json
import os

import cv2
import numpy as np
import torch
from ts.torch_handler.base_handler import BaseHandler


class MyModelHandler(BaseHandler):
    def initialize(self, context):
        self.manifest = context.manifest
        properties = context.system_properties
        model_dir = properties.get("model_dir")
        serialized_file = self.manifest["model"]["serializedFile"]
        model_pt_path = os.path.join(model_dir, serialized_file)

        self.device = torch.device(
            "cuda:" + str(properties.get("gpu_id"))
            if torch.cuda.is_available() and properties.get("gpu_id") is not None
            else "cpu"
        )
        # Load the Torchscript model and switch to FP16 for faster inference.
        self.model = torch.jit.load(model_pt_path, map_location=self.device)
        self.model = self.model.half()

    def preprocess(self, data):
        inputs = []
        for request in data:
            request_body = request.get("body")

            input_ = io.BytesIO(request_body)
            image = cv2.imdecode(np.fromstring(input_.read(), np.uint8), 1)
            input = torch.Tensor(image).permute(2, 0, 1)
            input = input.to(self.device)
            input = input.half()
            inputs.append({"image": input})

        return inputs

    def inference(self, inputs):
        # The scripted Detectron2 model takes the list of {"image": tensor} dicts built above.
        predictions = self.model(inputs)
        return predictions

    def postprocess(self, inference_outputs):
        responses = []
        for inference_output in inference_outputs:
            responses_json = {
                'classes': inference_output['pred_classes'].tolist(),
                'scores': inference_output['scores'].tolist(),
                "boxes": inference_output['pred_boxes'].tolist()
            }
            responses.append(json.dumps(responses_json))

        return responses
```

## Metrics

An essential component in serving models in production is the ability to monitor them. **Torchserve collects system-level** [metrics](https://github.com/pytorch/serve/blob/master/docs/metrics.md) regularly and **allows** adding **custom metrics** as well.

**[System-level metrics](https://github.com/pytorch/serve/blob/master/docs/metrics.md#system-metrics)** consist of CPU utilization, available and used disk space and memory on the host machine, along with the number of requests with different response codes (e.g. 200-300, 400-500 and above 500). **Custom metrics** can be **added** to the metrics as explained [here](https://github.com/pytorch/serve/blob/master/docs/metrics.md#custom-metrics-api). TorchServe logs these two sets of metrics to different log files. Metrics are collected by default at:

* System metrics - log_directory/ts_metrics.log
* Custom metrics - log_directory/model_metrics.log

As mentioned before, Torchserve also exposes the [metric API](https://github.com/pytorch/serve/blob/master/docs/metrics_api.md), which by default listens on port 8082 and enables users to query and monitor the collected metrics. The default metrics endpoint returns Prometheus-formatted metrics. You can query metrics using curl requests or point a [Prometheus Server](https://github.com/pytorch/serve/blob/master/docs/metrics_api.md#prometheus-server) to the endpoint and use [Grafana](https://github.com/pytorch/serve/blob/master/docs/metrics_api.md#grafana) for dashboards.

While serving a model you can query metrics using a curl request as follows:

```
curl http://127.0.0.1:8082/metrics
```

In case you are looking into exporting the logged metrics, please refer to this [example](https://github.com/google/mtail) that uses mtail to export metrics to Prometheus. Tracking these metrics in a dashboard allows you to monitor performance regressions that may have been sporadic or hard to spot during an offline benchmark run.
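To illustrate the custom metrics API mentioned above, the sketch below shows a handler that emits a counter and a timing metric around inference. It is a hypothetical example rather than the handler used for Animated Drawings; the metric names are made up, and the `add_counter`/`add_time` calls follow the custom metrics API linked above.

```python
import time

from ts.torch_handler.base_handler import BaseHandler


class TimedHandler(BaseHandler):
    """Hypothetical handler that reports custom metrics around inference."""

    def initialize(self, context):
        super().initialize(context)  # default model loading
        self.context = context       # keep the context so the metrics object stays reachable

    def inference(self, data, *args, **kwargs):
        metrics = self.context.metrics
        metrics.add_counter("InferenceRequestCount", 1)  # made-up metric name
        start = time.time()
        results = super().inference(data, *args, **kwargs)
        # Record the forward-pass time; add_time defaults to milliseconds.
        metrics.add_time("ForwardPassTime", round((time.time() - start) * 1000))
        return results
```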
## What to consider for tuning the performance of a model in production

The workflow suggested in Fig 1 is the general idea of how to approach model deployment in production with Torchserve.

In many cases, serving models in production is **optimized based on throughput or latency** service level agreements (**SLAs**). Usually **real-time applications** are more concerned about **latency**, whereas **off-line applications** may care more about higher **throughput**.

There are a number of main factors contributing to the performance of a serving model in production. In particular, we are focusing on serving PyTorch models with Torchserve here; however, most of these factors generalize to models from other frameworks as well.

* **Model optimizations**: this is a pre-step for deploying models into production. This is a very broad discussion that we will get into in a series of future blogs. It includes techniques like quantization and pruning to decrease the size of the model, using intermediate representations (IR graphs) such as Torchscript in PyTorch, fusing kernels and many others. Currently [torchprep](https://github.com/msaroufim/torchprep) provides many of these techniques as a CLI tool.
* **Batch inference:** this refers to feeding multiple inputs into a model. While it is essential during training, it can be very helpful for managing cost at inference time as well. Hardware accelerators are optimized for parallelism, and batching helps to saturate the compute capacity and often leads to higher throughput. The main difference at inference time is that you can’t wait too long to get a batch filled from clients, something we call dynamic batching.
* **Number of workers:** Torchserve uses workers to serve models. Torchserve workers are Python processes that hold a copy of the model weights for running inference. Too few workers means you’re not benefiting from enough parallelism, but too many can cause worker contention and degrade end-to-end performance.

- **Hardware:** choosing the appropriate hardware based on the model, the application and the latency and throughput budget. This could be one of the **supported** hardware options in Torchserve: **CPU, GPU, AWS Inferentia**. Some hardware configurations are intended for best-in-class performance and others are better suited for cost-effective inference. From our experiments we’ve found that GPUs shine best at larger batch sizes, whereas the right CPUs and AWS Inferentia can be far more cost effective for lower batch sizes and low latency.

## Best Practices for Performance tuning on Torchserve

To get the best performance out of your model while serving it with Torchserve, we are sharing some of the best practices here. Torchserve provides a [benchmark](https://github.com/pytorch/serve/tree/c87bfec8916d340de5de5810b14a016049b0e395/benchmarks#benchmarking-with-apache-bench) suite that provides helpful insight for making informed decisions on the different choices detailed below.

* **Optimize your model** as the first step; see the PyTorch model optimization [tutorials](https://pytorch.org/tutorials/). **Model optimization** choices are also closely **tied** to the **hardware** of choice. We will discuss this in more detail in another blog post.
* **Deciding on the hardware** for model deployment is closely related to the latency and throughput budget and the cost per inference. Depending on the size of the model and the application, it can vary: for some models, like computer vision models, it has historically not been affordable to run them in production on CPU. However, with optimizations such as [IPEX](https://github.com/pytorch/serve/blob/c87bfec8916d340de5de5810b14a016049b0e395/examples/intel_extension_for_pytorch/README.md) recently added to Torchserve, this has become much more affordable and cost-beneficial, and you can learn more in this investigative [case study](https://pytorch.org/tutorials/intermediate/torchserve_with_ipex.html).
* **Workers** in Torchserve are Python processes that provide parallelism; the number of workers should be set carefully. By default, Torchserve launches a number of workers equal to the VCPUs or available GPUs on the host, which can add a considerable amount of time to the Torchserve start-up.

    Torchserve exposes a [config property](https://github.com/pytorch/serve/blob/c87bfec8916d340de5de5810b14a016049b0e395/docs/configuration.md#config-model) to set the number of workers. To provide **efficient parallelism** through **multiple workers** and avoid having them compete over resources, as a baseline we **recommend** the following settings on CPU and GPU:

    - **CPU**: In the handler, `torch.set_num_threads(1)`, then set the number of workers to `num physical cores / 2`. The best threading configuration can be achieved by leveraging the Intel CPU launcher script.

    - **GPU**: the number of available GPUs can be set through [number_gpus](https://github.com/pytorch/serve/blob/c87bfec8916d340de5de5810b14a016049b0e395/docs/configuration.md#limit-gpu-usage) in config.properties. Torchserve uses round robin to assign workers to GPUs. We recommend setting the number of workers as follows: `Number of workers = (Number of available GPUs) / (Number of Unique Models)`. Note that GPUs that are pre-Ampere do not provide any resource isolation with Multi-Instance GPU.

* **Batch size** can directly affect the latency and the throughput. To better utilize the compute resources, the batch size needs to be increased. However, there is a tradeoff between latency and throughput: **larger batch sizes** can **increase the throughput but also result in higher latency**. The batch size can be set in Torchserve in two ways: either through the [model config](https://github.com/pytorch/serve/blob/c87bfec8916d340de5de5810b14a016049b0e395/docs/configuration.md#config-model) in config.properties, or while registering the model using the [Management API](https://github.com/pytorch/serve/blob/c87bfec8916d340de5de5810b14a016049b0e395/docs/management_api.md#scale-workers).

In the next section, we are going to use the Torchserve benchmark suite to decide the best combination of model optimization, hardware, workers, and batch size.

## Animated Drawings Performance Tuning

To use the Torchserve benchmark suite, we first need an archived “.mar” file, as discussed above, that contains the model, handler and all other artifacts to load and run inference. Animated Drawings uses Detectron2’s implementation of Mask-RCNN for its object detection model.

### How to run the benchmark suite

The [Automated benchmark suite](https://github.com/pytorch/serve/tree/master/benchmarks#auto-benchmarking-with-apache-bench) in Torchserve lets you benchmark multiple models with different settings, including batch size and number of workers, and finally generates a report for you.
To get started:

```
git clone https://github.com/pytorch/serve.git

cd serve/benchmarks

pip install -r requirements-ab.txt

apt-get install apache2-utils
```

Model-level settings can be configured in a yaml file similar to:

```yaml
Model_name:
  eager_mode:
    benchmark_engine: "ab"
    url: "Path to .mar file"
    workers:
      - 1
      - 4
    batch_delay: 100
    batch_size:
      - 1
      - 2
      - 4
      - 8
    requests: 10000
    concurrency: 10
    input: "Path to model input"
    backend_profiling: False
    exec_env: "local"
    processors:
      - "cpu"
      - "gpus": "all"
```

This yaml file will be referenced in the [benchmark_config_template](https://github.com/pytorch/serve/blob/master/benchmarks/benchmark_config_template.yaml#L12).yaml file that includes other settings for generating reports; this can optionally work with AWS CloudWatch for logs as well.

```
python benchmarks/auto_benchmark.py --input benchmark_config_template.yaml
```

Running the **benchmarks**, the results will be written to a csv file that can be found in “_/tmp/benchmark/ab_report.csv_” and a full report in “/tmp/ts_benchmark/report.md”. The report includes items such as Torchserve average latency, model P99 latency, throughput, number of concurrency, number of requests, handler time, and some other metrics. Here we focus on some of the important ones that we track to tune the performance: **concurrency**, **model P99 latency** and **throughput**. We look at these numbers specifically in **combination** with the **batch size**, the **device** used, the **number of workers** and whether any **model optimization** has been done.

The **latency SLA** for this model has been set to **100 ms**; this is a real-time application, and as we discussed earlier, latency is more of a concern, while **throughput** ideally should be as high as possible without **violating** the **latency SLA**.

Through searching the space over different batch sizes (1-32), numbers of workers (1-16) and devices (CPU, GPU), we have run a set of experiments, with the best results summarized in the table below.
| Device | Concurrency | # Requests | # Workers | Batch size | Payload/image | Optimization | Throughput | Latency P99 |
|--------|-------------|------------|-----------|------------|---------------|--------------|------------|-------------|
| CPU | 10 | 1000 | 1 | 1 | small | N/A | 3.45 | 305.3 ms |
| CPU | 1 | 1000 | 1 | 1 | small | N/A | 3.45 | 291.8 ms |
| GPU | 10 | 1000 | 1 | 1 | small | N/A | 41.05 | 25.48 ms |
| GPU | 1 | 1000 | 1 | 1 | small | N/A | 42.21 | 23.6 ms |
| GPU | 10 | 1000 | 1 | 4 | small | N/A | 54.78 | 73.62 ms |
| GPU | 10 | 1000 | 1 | 4 | small | model.half() | 78.62 | 50.69 ms |
| GPU | 10 | 1000 | 1 | 8 | small | model.half() | 85.29 | 94.4 ms |
The latency of this model on CPU with all of the tried settings in terms of batch size, concurrency and number of workers did not meet the SLA; in fact, it was ~13x higher.

**Moving** the model serving **to GPU** immediately **improved** the **latency ~13x**, from 305 ms down to 23.6 ms.

One of the **simplest optimizations** that we could apply to the model was lowering its precision to **fp16**. It is a one-liner (**model.half()**) and reduced the **model P99 latency** by **32%** while increasing the throughput by almost the same amount.

Other optimizations, such as Torchscripting the model and using [optimize_for_inference](https://github.com/pytorch/pytorch/blob/master/torch/jit/_freeze.py#L168), or other tricks including ONNX or TensorRT runtime optimizations that leverage aggressive fusions, are out of the scope of this post. We will discuss model optimizations in a separate post.

We found that, both on CPU and GPU, setting the **number of workers to 1** worked best in this case.

* Moving the model to GPU, using **number of workers = 1** and **batch size = 1**, increased the **throughput ~12x** and reduced the **latency ~13x compared to CPU**.
* Moving the model to GPU, using **model.half()**, **number of workers = 1** and **batch size = 8**, yielded the **best** results in terms of **throughput** and tolerable latency: **throughput** increased **~25x compared to CPU, with the latency still meeting the SLA (94.4 ms)**.

_Note: if you are running the benchmark suite, make sure you are setting a proper `batch_delay` and set the concurrency of the requests to a number proportional to your batch size. Concurrency here means the number of concurrent requests being sent to the server._

## Conclusion

In this post, we have discussed the considerations and knobs that Torchserve exposes to tune performance in production. We have discussed the Torchserve benchmark suite as a means to tune the performance and get insights into possible choices for model optimizations, hardware choice and cost in general. We used the Animated Drawings app, which uses Detectron2’s Mask-RCNN model, as a case study to showcase performance tuning with the benchmark suite.

For more details on performance tuning in Torchserve please refer to our documentation [here](https://github.com/pytorch/serve/blob/master/docs/performance_guide.md).
Also feel free to open a ticket on the [Torchserve repo](https://github.com/pytorch/serve/issues) for any further questions and feedback.

### Acknowledgement

We would like to thank Somya Jain (Meta) and Christopher Gustave (Meta) for their great support and guidance throughout many steps of this blog and for providing insights into the Sketch Animator workflow. Also, special thanks to [Li Ning](https://www.linkedin.com/in/li-ning-7274604/) from AWS for the great efforts to make performance tuning much easier on Torchserve with the automated benchmark suite.

diff --git a/_posts/2022-12-31-compromised-nightly-dependency.md b/_posts/2022-12-31-compromised-nightly-dependency.md deleted file mode 100644 index 78fc7f3b7b69..000000000000 --- a/_posts/2022-12-31-compromised-nightly-dependency.md +++ /dev/null @@ -1,67 +0,0 @@ ---- -layout: blog_detail -title: "Compromised PyTorch-nightly dependency chain between December 25th and December 30th, 2022."
-author: The PyTorch Team ---- - -If you installed PyTorch-nightly on Linux via pip between December 25, 2022 and December 30, 2022, please uninstall it and torchtriton immediately, and use the latest nightly binaries (newer than Dec 30th 2022). - -```bash -$ pip3 uninstall -y torch torchvision torchaudio torchtriton -$ pip3 cache purge -``` - -PyTorch-nightly Linux packages installed via pip during that time installed a dependency, torchtriton, which was compromised on the Python Package Index (PyPI) code repository and ran a malicious binary. This is what is known as a supply chain attack and directly affects dependencies for packages that are hosted on public package indices. - -**NOTE:** Users of the PyTorch **stable** packages **are not** affected by this issue. - - -## How to check if your Python environment is affected - -The following command searches for the malicious binary in the torchtriton package (`PYTHON_SITE_PACKAGES/triton/runtime/triton`) and prints out whether your current Python environment is affected or not. - -```bash -python3 -c "import pathlib;import importlib.util;s=importlib.util.find_spec('triton'); affected=any(x.name == 'triton' for x in (pathlib.Path(s.submodule_search_locations[0] if s is not None else '/' ) / 'runtime').glob('*'));print('You are {}affected'.format('' if affected else 'not '))" -``` - -The malicious binary is executed when the triton package is imported, which requires explicit code to do and is not PyTorch’s default behavior. - -## The Background - -At around 4:40pm GMT on December 30 (Friday), we learned about a malicious dependency package (`torchtriton`) that was uploaded to the Python Package Index (PyPI) code repository with the same package name as the one we ship on the [PyTorch nightly package index](https://download.pytorch.org/whl/nightly). Since the [PyPI index takes precedence](https://github.com/pypa/pip/issues/8606), this malicious package was being installed instead of the version from our official repository. This design enables somebody to register a package by the same name as one that exists in a third party index, and pip will install their version by default. - -This malicious package has the same name `torchtriton` but added in code that uploads sensitive data from the machine. - - -## What we know - -torchtriton on PyPI contains a malicious triton binary which is installed at `PYTHON_SITE_PACKAGES/triton/runtime/triton`. Its SHA256 hash is listed below. - -`SHA256(triton)= 2385b29489cd9e35f92c072780f903ae2e517ed422eae67246ae50a5cc738a0e` - -The binary’s main function does the following: - -- Get system information - - nameservers from `/etc/resolv.conf` - - hostname from `gethostname()` - - current username from `getlogin()` - - current working directory name from `getcwd()` - - environment variables -- Read the following files - - `/etc/hosts` - - `/etc/passwd` - - The first 1,000 files in `$HOME/*` - - `$HOME/.gitconfig` - - `$HOME/.ssh/*` -- Upload all of this information, including file contents, via encrypted DNS queries to the domain *.h4ck[.]cfd, using the DNS server wheezy[.]io - -The binary’s file upload functionality is limited to files less than 99,999 bytes in size. It also uploads only the first 1,000 files in $HOME (but all files < 99,999 bytes in the .ssh directory). 
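As an additional check, one could compare the SHA256 of any installed triton runtime binary against the hash listed above. The sketch below is an expanded illustration using only the path and hash given in this post, not an official remediation script.

```python
import hashlib
import importlib.util
import pathlib

MALICIOUS_SHA256 = "2385b29489cd9e35f92c072780f903ae2e517ed422eae67246ae50a5cc738a0e"

spec = importlib.util.find_spec("triton")
if spec is not None and spec.submodule_search_locations:
    candidate = pathlib.Path(spec.submodule_search_locations[0]) / "runtime" / "triton"
    if candidate.is_file():
        digest = hashlib.sha256(candidate.read_bytes()).hexdigest()
        if digest == MALICIOUS_SHA256:
            print("Known-malicious triton binary found; follow the uninstall steps above.")
        else:
            print("A triton runtime binary is present but does not match the known-bad hash.")
    else:
        print("No triton runtime binary found.")
else:
    print("triton is not installed.")
```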
- -## Steps taken towards mitigation - -- torchtriton has been removed as a dependency for our nightly packages and replaced with pytorch-triton ([pytorch/pytorch#91539](https://github.com/pytorch/pytorch/pull/91539)) and a dummy package registered on PyPI (so that this issue doesn’t repeat) -- All nightly packages that depend on torchtriton have been removed from our package indices at https://download.pytorch.org until further notice -- We have reached out to the PyPI security team to get proper ownership of the `torchtriton` package on PyPI and to delete the malicious version - - - diff --git a/_posts/2022-2-23-introducing-torchrec.md b/_posts/2022-2-23-introducing-torchrec.md deleted file mode 100644 index fbc41c9fa456..000000000000 --- a/_posts/2022-2-23-introducing-torchrec.md +++ /dev/null @@ -1,86 +0,0 @@ ---- -layout: blog_detail -title: 'Introducing TorchRec, a library for modern production recommendation systems' -author: Meta AI - Donny Greenberg, Colin Taylor, Dmytro Ivchenko, Xing Liu, Anirudh Sudarshan -featured-img: '' ---- - -We are excited to announce [TorchRec](https://github.com/pytorch/torchrec), a PyTorch domain library for Recommendation Systems. This new library provides common sparsity and parallelism primitives, enabling researchers to build state-of-the-art personalization models and deploy them in production. - -

        - -

        - -## How did we get here? -Recommendation Systems (RecSys) comprise a large footprint of production-deployed AI today, but you might not know it from looking at Github. Unlike areas like Vision and NLP, much of the ongoing innovation and development in RecSys is behind closed company doors. For academic researchers studying these techniques or companies building personalized user experiences, the field is far from democratized. Further, RecSys as an area is largely defined by learning models over sparse and/or sequential events, which has large overlaps with other areas of AI. Many of the techniques are transferable, particularly for scaling and distributed execution. A large portion of the global investment in AI is in developing these RecSys techniques, so cordoning them off blocks this investment from flowing into the broader AI field. - -By mid-2020, the PyTorch team received a lot of feedback that there hasn't been a large-scale production-quality recommender systems package in the open-source PyTorch ecosystem. While we were trying to find a good answer, a group of engineers at Meta wanted to contribute Meta’s production RecSys stack as a PyTorch domain library, with a strong commitment to growing an ecosystem around it. This seemed like a good idea that benefits researchers and companies across the RecSys domain. So, starting from Meta’s stack, we began modularizing and designing a fully-scalable codebase that is adaptable for diverse recommendation use-cases. Our goal was to extract the key building blocks from across Meta’s software stack to simultaneously enable creative exploration and scale. After nearly two years, a battery of benchmarks, migrations, and testing across Meta, we’re excited to finally embark on this journey together with the RecSys community. We want this package to open a dialogue and collaboration across the RecSys industry, starting with Meta as the first sizable contributor. - - -## Introducing TorchRec -TorchRec includes a scalable low-level modeling foundation alongside rich batteries-included modules. We initially target “two-tower” ([[1]], [[2]]) architectures that have separate submodules to learn representations of candidate items and the query or context. Input signals can be a mix of floating point “dense” features or high-cardinality categorical “sparse” features that require large embedding tables to be trained. Efficient training of such architectures involves combining data parallelism that replicates the “dense” part of computation and model parallelism that partitions large embedding tables across many nodes. - -In particular, the library includes: -- Modeling primitives, such as embedding bags and jagged tensors, that enable easy authoring of large, performant multi-device/multi-node models using hybrid data-parallelism and model-parallelism. -- Optimized RecSys kernels powered by [FBGEMM](https://github.com/pytorch/FBGEMM) , including support for sparse and quantized operations. -- A sharder which can partition embedding tables with a variety of different strategies including data-parallel, table-wise, row-wise, table-wise-row-wise, and column-wise sharding. -- A planner which can automatically generate optimized sharding plans for models. -- Pipelining to overlap dataloading device transfer (copy to GPU), inter-device communications (input_dist), and computation (forward, backward) for increased performance. -- GPU inference support. -- Common modules for RecSys, such as models and public datasets (Criteo & Movielens). 
- -To showcase the flexibility of this tooling, let’s look at the following code snippet, pulled from our DLRM Event Prediction example: -```python -# Specify the sparse embedding layers -eb_configs = [ - EmbeddingBagConfig( - name=f"t_{feature_name}", - embedding_dim=64, - num_embeddings=100_000, - feature_names=[feature_name], - ) - for feature_idx, feature_name in enumerate(DEFAULT_CAT_NAMES) -] - -# Import and instantiate the model with the embedding configuration -# The "meta" device indicates lazy instantiation, with no memory allocated -train_model = DLRM( - embedding_bag_collection=EmbeddingBagCollection( - tables=eb_configs, device=torch.device("meta") - ), - dense_in_features=len(DEFAULT_INT_NAMES), - dense_arch_layer_sizes=[512, 256, 64], - over_arch_layer_sizes=[512, 512, 256, 1], - dense_device=device, -) - -# Distribute the model over many devices, just as one would with DDP. -model = DistributedModelParallel( - module=train_model, - device=device, -) - -optimizer = torch.optim.SGD(params, lr=args.learning_rate) -# Optimize the model in a standard loop just as you would any other model! -# Or, you can use the pipeliner to synchronize communication and compute -for epoch in range(epochs): - # Train -``` - - -## Scaling Performance -TorchRec has state-of-the-art infrastructure for scaled Recommendations AI, powering some of the largest models at Meta. It was used to train a 1.25 trillion parameter model, pushed to production in January, and a 3 trillion parameter model which will be in production soon. This should be a good indication that PyTorch is fully capable of the largest scale RecSys problems in industry. We’ve heard from many in the community that sharded embeddings are a pain point. TorchRec cleanly addresses that. Unfortunately it is challenging to provide large-scale benchmarks with public datasets, as most open-source benchmarks are too small to show performance at scale. - - -## Looking ahead -Open-source and open-technology have universal benefits. Meta is seeding the PyTorch community with a state-of-the-art RecSys package, with the hope that many join in on building it forward, enabling new research and helping many companies. The team behind TorchRec plan to continue this program indefinitely, building up TorchRec to meet the needs of the RecSys community, to welcome new contributors, and to continue to power personalization at Meta. We’re excited to begin this journey and look forward to contributions, ideas, and feedback! - - -## References -[[1]] Sampling-Bias-Corrected Neural Modeling for Large Corpus Item Recommendations - -[[2]] DLRM: An advanced, open source deep learning recommendation model - - -[1]: https://research.google/pubs/pub48840/ -[2]: https://ai.facebook.com/blog/dlrm-an-advanced-open-source-deep-learning-recommendation-model/ diff --git a/_posts/2022-2-24-amazon-ads-case-study.md b/_posts/2022-2-24-amazon-ads-case-study.md deleted file mode 100644 index 8c78c043039e..000000000000 --- a/_posts/2022-2-24-amazon-ads-case-study.md +++ /dev/null @@ -1,163 +0,0 @@ ---- -layout: blog_detail -title: "Case Study: Amazon Ads Uses PyTorch and AWS Inferentia to Scale Models for Ads Processing" -author: Yashal Kanungo – Applied Scientist, Kamran Khan - Sr. Technical Product Manager, Shubha Kumbadakone – Sr. Specialist, ML Frameworks -featured-img: "" ---- - -Amazon Ads uses PyTorch, TorchServe, and AWS Inferentia to reduce inference costs by 71% and drive scale out. 
- -Amazon Ads helps companies build their brand and connect with shoppers through ads shown both within and beyond Amazon’s store, including websites, apps, and streaming TV content in more than 15 countries. Businesses and brands of all sizes, including registered sellers, vendors, book vendors, Kindle Direct Publishing (KDP) authors, app developers, and agencies can upload their own ad creatives, which can include images, video, audio, and, of course, products sold on Amazon. - -

        - -To promote an accurate, safe, and pleasant shopping experience, these ads must comply with content guidelines. For example, ads cannot flash on and off, products must be featured in an appropriate context, and images and text should be appropriate for a general audience. To help ensure that ads meet the required policies and standards, we needed to develop scalable mechanisms and tools. - -As a solution, we used machine learning (ML) models to surface ads that might need revision. As deep neural networks flourished over the past decade, our data science team began exploring more versatile deep learning (DL) methods capable of processing text, images, audio, or video with minimal human intervention. To that end, we’ve used PyTorch to build computer vision (CV) and natural language processing (NLP) models that automatically flag potentially non-compliant ads. PyTorch is intuitive, flexible, and user-friendly, and has made our transition to using DL models seamless. Deploying these new models on [AWS Inferentia-based Amazon EC2 Inf1 instances](https://aws.amazon.com/ec2/instance-types/inf1/), rather than on GPU-based instances, reduced our inference latency by 30 percent and our inference costs by 71 percent for the same workloads. - -## Transition to deep learning - -Our ML systems paired classical models with word embeddings to evaluate ad text. But our requirements evolved, and as the volume of submissions continued to expand, we needed a method nimble enough to scale along with our business. In addition, our models must be fast and serve ads within milliseconds to provide an optimal customer experience. - -Over the last decade, DL has become very popular in numerous domains, including natural language, vision, and audio. Because deep neural networks channel data sets through many layers — extracting progressively higher-level features — they can make more nuanced inferences than classical ML models. Rather than simply detecting prohibited language, for example, a DL model can reject an ad for making false claims. - -In addition, DL techniques are transferable– a model trained for one task can be adapted to carry out a related task. For instance, a pre-trained neural network can be optimized to detect objects in images and then fine-tuned to identify specific objects that are not allowed to be displayed in an ad. - -Deep neural networks can automate two of classical ML’s most time-consuming steps: feature engineering and data labeling. Unlike traditional supervised learning approaches, which require exploratory data analysis and hand-engineered features, deep neural networks learn the relevant features directly from the data. DL models can also analyze unstructured data, like text and images, without the preprocessing necessary in ML. Deep neural networks scale effectively with more data and perform especially well in applications involving large data sets. - -We chose PyTorch to develop our models because it helped us maximize the performance of our systems. With PyTorch, we can serve our customers better while taking advantage of Python’s most intuitive concepts. The programming in PyTorch is object-oriented: it groups processing functions with the data they modify. As a result, our codebase is modular, and we can reuse pieces of code in different applications. In addition, PyTorch’s eager mode allows loops and control structures and, therefore, more complex operations in the model. 
Eager mode makes it easy to prototype and iterate upon our models, and we can work with various data structures. This flexibility helps us update our models quickly to meet changing business requirements. - -“Before this, we experimented with other frameworks that were “Pythonic,” but PyTorch was the clear winner for us here.” said Yashal Kanungo, Applied Scientist. “Using PyTorch was easy because the structure felt native to Python programming, which the data scientists were very familiar with”. - -### Training pipeline - -Today, we build our text models entirely in PyTorch. To save time and money, we often skip the early stages of training by fine-tuning a pre-trained NLP model for language analysis. If we need a new model to evaluate images or video, we start by browsing PyTorch’s [torchvision](https://pytorch.org/vision/stable/index.html) library, which offers pretrained options for image and video classification, object detection, instance segmentation, and pose estimation. For specialized tasks, we build a custom model from the ground up. PyTorch is perfect for this, because eager mode and the user-friendly front end make it easy to experiment with different architectures. - -_To learn how to finetune neural networks in PyTorch, head to [this tutorial](https://pytorch.org/tutorials/intermediate/torchvision_tutorial.html#finetuning-from-a-pretrained-model)._ - -Before we begin training, we optimize our model’s [hyperparameters](https://docs.aws.amazon.com/sagemaker/latest/dg/automatic-model-tuning-define-ranges.html), the variables that define the network architecture (for example, the number of hidden layers) and training mechanics (such as learning rate and batch size). Choosing appropriate hyperparameter values is essential, because they will shape the training behavior of the model. We rely on the [Bayesian search feature in SageMaker](https://docs.aws.amazon.com/sagemaker/latest/dg/automatic-model-tuning-how-it-works.html#automatic-tuning-bayesian-search.title), AWS’s ML platform, for this step. Bayesian search treats hyperparameter tuning as a regression problem: It proposes the hyperparameter combinations that are likely to produce the best results and runs training jobs to test those values. After each trial, a regression algorithm determines the next set of hyperparameter values to test, and performance improves incrementally. - -We prototype and iterate upon our models using SageMaker Notebooks. Eager mode lets us prototype models quickly by building a new computational graph for each training batch; the sequence of operations can change from iteration to iteration to accommodate different data structures or to jibe with intermediate results. That frees us to adjust the network during training without starting over from scratch. These dynamic graphs are particularly valuable for recursive computations based on variable sequence lengths, such as the words, sentences, and paragraphs in an ad that are analyzed with NLP. - -When we’ve finalized the model architecture, we deploy training jobs on [SageMaker](https://aws.amazon.com/sagemaker/). PyTorch helps us develop large models faster by running numerous training jobs at the same time. PyTorch’s [Distributed Data Parallel](https://sagemaker.readthedocs.io/en/stable/api/training/sdp_versions/v1.0.0/smd_data_parallel_pytorch.html) (DDP) module replicates a single model across multiple interconnected machines within SageMaker, and all the processes run forward passes simultaneously on their own unique portion of the data set. 
During the backward pass, the module averages the gradients of all the processes, so each local model is updated with the same parameter values. - -### Model deployment pipeline - -When we deploy the model in production, we want to ensure lower inference costs without impacting prediction accuracy. Several PyTorch features and AWS services have helped us address the challenge. - -The flexibility of a dynamic graph enriches training, but in deployment we want to maximize performance and portability. An advantage of developing NLP models in PyTorch is that out of the box, they can be traced into a static sequence of operations by [TorchScript](https://pytorch.org/docs/stable/jit.html), a subset of Python specialized for ML applications. Torchscript converts PyTorch models to a more efficient, production-friendly intermediate representation (IR) graph that is easily compiled. We run a sample input through the model, and TorchScript records the operations executed during the forward pass. The resulting IR graph can run in high-performance environments, including C++ and other multithreaded Python-free contexts, and optimizations such as operator fusion can speed up the runtime. - -### Neuron SDK and AWS Inferentia powered compute - -We deploy our models on [Amazon EC2 Inf1 instances](https://aws.amazon.com/ec2/instance-types/inf1/) powered by AWS Inferentia, Amazon's first ML silicon designed to accelerate deep learning inference workloads. Inferentia has shown to reduce inference costs by up to 70% compared to Amazon EC2 GPU-based instances. -We used the [AWS Neuron](https://aws.amazon.com/machine-learning/neuron/) SDK — a set of software tools used with Inferentia — to compile and optimize our models for deployment on EC2 Inf1 instances. - -The code snippet below shows how to compile a Hugging Face BERT model with Neuron. Like torch.jit.trace(), neuron.trace() records the model’s operations on an example input during the forward pass to build a static IR graph. - -```python -import torch -from transformers import BertModel, BertTokenizer -import torch.neuron -tokenizer = BertTokenizer.from_pretrained("path to saved vocab") -model = BertModel.from_pretrained("path to the saved model", returned_dict=False) -inputs = tokenizer ("sample input", return_tensor="pt") -neuron_model = torch.neuron.trace(model, - example_inputs = (inputs['input_ids'], inputs['attention_mask']), - verbose = 1) -output = neuron_model(*(inputs['input_ids'], inputs['attention_mask'])) -``` - -### Autocasting and recalibration - -Under the hood, Neuron optimizes our models for performance by autocasting them to a smaller data type. As a default, most applications represent neural network values in the 32-bit single-precision floating point (FP32) number format. Autocasting the model to a 16-bit format — half-precision floating point (FP16) or Brain Floating Point (BF16) — reduces a model’s memory footprint and execution time. In our case, we decided to use FP16 to optimize for performance while maintaining high accuracy. - -Autocasting to a smaller data type can, in some cases, trigger slight differences in the model’s predictions. To ensure that the model’s accuracy is not affected, Neuron compares the performance metrics and predictions of the FP16 and FP32 models. When autocasting diminishes the model’s accuracy, we can tell the Neuron compiler to convert only the weights and certain data inputs to FP16, keeping the rest of the intermediate results in FP32. 
In addition, we often run a few iterations with the training data to recalibrate our autocasted models. This process is much less intensive than the original training. - -### Deployment - -To analyze multimedia ads, we run an ensemble of DL models. All ads uploaded to Amazon are run through specialized models that assess every type of content they include: images, video and audio, headlines, texts, backgrounds, and even syntax, grammar, and potentially inappropriate language. The signals we receive from these models indicate whether or not an advertisement complies with our criteria. - -Deploying and monitoring multiple models is significantly complex, so we depend on [TorchServe](https://github.com/pytorch/serve), SageMaker’s default PyTorch model serving library. Jointly developed by Facebook’s PyTorch team and AWS to streamline the transition from prototyping to production, TorchServe helps us deploy trained PyTorch models at scale without having to write custom code. It provides a secure set of REST APIs for inference, management, metrics, and explanations. With features such as multi-model serving, model versioning, ensemble support, and automatic batching, TorchServe is ideal for supporting our immense workload. You can read more about deploying your Pytorch models on SageMaker with native TorchServe integration in this [blog post](https://aws.amazon.com/blogs/machine-learning/serving-pytorch-models-in-production-with-the-amazon-sagemaker-native-torchserve-integration/). - -In some use cases, we take advantage of PyTorch’s object-oriented programming paradigm to wrap multiple DL models into one parent object — a PyTorch nn.Module — and serve them as a single ensemble. In other cases, we use TorchServe to serve individual models on separate SageMaker endpoints, running on AWS Inf1 instances. - -### Custom handlers - -We particularly appreciate that TorchServe allows us to embed our model initialization, preprocessing, inferencing, and post processing code in a single Python script, handler.py, which lives on the server. This script — the handler —preprocesses the un-labeled data from an ad, runs that data through our models, and delivers the resulting inferences to downstream systems. TorchServe provides several default handlers that load weights and architecture and prepare the model to run on a particular device. We can bundle all the additional required artifacts, such as vocabulary files or label maps, with the model in a single archive file. - -When we need to deploy models that have complex initialization processes or that originated in third-party libraries, we design custom handlers in TorchServe. These let us load any model, from any library, with any required process. The following snippet shows a simple handler that can serve Hugging Face BERT models on any SageMaker hosting endpoint instance. 
```python
import os

import torch
import torch.neuron
from ts.torch_handler.base_handler import BaseHandler
from transformers import AutoModelForSequenceClassification, AutoTokenizer


class MyModelHandler(BaseHandler):
    def initialize(self, context):
        # Locate the serialized model artifacts packaged in the model archive
        self.manifest = context.manifest
        properties = context.system_properties
        model_dir = properties.get("model_dir")
        serialized_file = self.manifest["model"]["serializedFile"]
        model_pt_path = os.path.join(model_dir, serialized_file)

        self.max_length = 128  # example sequence length; tune for your payloads
        self.tokenizer = AutoTokenizer.from_pretrained(
            model_dir, do_lower_case=True
        )
        self.model = AutoModelForSequenceClassification.from_pretrained(
            model_dir
        )

    def preprocess(self, data):
        input_text = data.get("data")
        if input_text is None:
            input_text = data.get("body")
        inputs = self.tokenizer.encode_plus(
            input_text,
            max_length=self.max_length,
            pad_to_max_length=True,
            add_special_tokens=True,
            return_tensors="pt",
        )
        return inputs

    def inference(self, inputs):
        predictions = self.model(**inputs)
        return predictions

    def postprocess(self, output):
        return output
```

### Batching

Hardware accelerators are optimized for parallelism, and batching — feeding a model multiple inputs in a single step — helps saturate all available capacity, typically resulting in higher throughput. Excessively high batch sizes, however, can increase latency with minimal improvement in throughput. Experimenting with different batch sizes helps us identify the sweet spot for our models and hardware accelerator. We run experiments to determine the best batch size for our model size, payload size, and request traffic patterns.

The Neuron compiler now supports variable batch sizes. Previously, tracing a model hardcoded the predefined batch size, so we had to pad our data, which can waste compute, slow throughput, and exacerbate latency. Inferentia is optimized to maximize throughput for small batches, reducing latency by easing the load on the system.

### Parallelism

Model parallelism on multi-cores also improves throughput and latency, which is crucial for our heavy workloads. Each Inferentia chip contains four NeuronCores that can either run separate models simultaneously or form a pipeline to stream a single model. In our use case, the data parallel configuration offers the highest throughput at the lowest cost, because it scales out concurrent processing requests.

Data Parallel:


Model Parallel:

        - -### Monitoring - -It is critical that we monitor the accuracy of our inferences in production. Models that initially make good predictions can eventually degrade in deployment as they are exposed to a wider variety of data. This phenomenon, called model drift, usually occurs when the input data distributions or the prediction targets change. - -We use [SageMaker Model Monitor](https://aws.amazon.com/sagemaker/model-monitor/) to track parity between the training and production data. Model Monitor notifies us when predictions in production begin to deviate from the training and validation results. Thanks to this early warning, we can restore accuracy — by retraining the model if necessary — before our advertisers are affected. To track performance in real time, Model Monitor also sends us metrics about the quality of predictions, such as accuracy, F-scores, and the distribution of the predicted classes. - -To determine if our application needs to scale, TorchServe logs resource utilization metrics for the CPU, Memory, and Disk at regular intervals; it also records the number of requests received versus the number served. For custom metrics, TorchServe offers a [Metrics API](https://github.com/pytorch/serve/blob/master/docs/metrics_api.md). - -### A rewarding result - -Our DL models, developed in PyTorch and deployed on Inferentia, sped up our ads analysis while cutting costs. Starting with our first explorations in DL, programming in PyTorch felt natural. Its user-friendly features helped smooth the course from our early experiments to the deployment of our multimodal ensembles. PyTorch lets us prototype and build models quickly, which is vital as our advertising service evolves and expands. For an added benefit, PyTorch works seamlessly with Inferentia and our AWS ML stack. We look forward to building more use cases with PyTorch, so we can continue to serve our clients accurate, real-time results. diff --git a/_posts/2022-2-8-quantization-in-practice.md b/_posts/2022-2-8-quantization-in-practice.md deleted file mode 100644 index 43c9aeb1f73f..000000000000 --- a/_posts/2022-2-8-quantization-in-practice.md +++ /dev/null @@ -1,485 +0,0 @@ ---- -layout: blog_detail -title: 'Practical Quantization in PyTorch' -author: Suraj Subramanian, Mark Saroufim, Jerry Zhang -featured-img: '' ---- - -Quantization is a cheap and easy way to make your DNN run faster and with lower memory requirements. PyTorch offers a few different approaches to quantize your model. In this blog post, we'll lay a (quick) foundation of quantization in deep learning, and then take a look at how each technique looks like in practice. Finally we'll end with recommendations from the literature for using quantization in your workflows. - -

Fig 1. PyTorch <3 Quantization

**Contents**
* TOC
{:toc}

## Fundamentals of Quantization

> If someone asks you what time it is, you don't respond "10:14:34:430705", but you might say "a quarter past 10".

Quantization has roots in information compression; in deep networks it refers to reducing the numerical precision of its weights and/or activations.

Overparameterized DNNs have more degrees of freedom and this makes them good candidates for information compression [[1]]. When you quantize a model, two things generally happen - the model gets smaller and runs with better efficiency. Hardware vendors explicitly allow for faster processing of 8-bit data (than 32-bit data), resulting in higher throughput. A smaller model has a lower memory footprint and power consumption [[2]], crucial for deployment at the edge.

### Mapping function
The mapping function is what you might guess - a function that maps values from floating-point to integer space. A commonly used mapping function is a linear transformation given by `Q(r) = round(r/S + Z)`, where `r` is the input and `S, Z` are **quantization parameters**.

To reconvert to floating point space, the inverse function is given by `r' = (Q(r) - Z) * S`.

`r' != r`, and their difference constitutes the *quantization error*.

### Quantization Parameters
The mapping function is parameterized by the **scaling factor** `S` and **zero-point** `Z`.

`S` is simply the ratio of the input range to the output range: `S = (beta - alpha) / (beta_q - alpha_q)`,

where `[alpha, beta]` is the clipping range of the input, i.e. the boundaries of permissible inputs, and `[alpha_q, beta_q]` is the range in quantized output space that it is mapped to. For 8-bit quantization, the output range `beta_q - alpha_q <= 2^8 - 1 = 255`.

`Z` acts as a bias to ensure that a 0 in the input space maps perfectly to a 0 in the quantized space: `Z = -(alpha/S - alpha_q)`, rounded to an integer.

### Calibration
The process of choosing the input clipping range is known as **calibration**. The simplest technique (also the default in PyTorch) is to record the running minimum and maximum values and assign them to `alpha` and `beta`. TensorRT also uses entropy minimization (KL divergence), mean-square-error minimization, or percentiles of the input range.

In PyTorch, `Observer` modules ([code](https://github.com/PyTorch/PyTorch/blob/748d9d24940cd17938df963456c90fa1a13f3932/torch/ao/quantization/observer.py#L88)) collect statistics on the input values and calculate the qparams `S` and `Z`. Different calibration schemes result in different quantized outputs, and it's best to empirically verify which scheme works best for your application and architecture (more on that later).
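Before looking at how these qparams are chosen, a quick round trip with hand-picked values makes the mapping above concrete (a minimal sketch; the scale and zero-point below are arbitrary example values):

```python
import torch

x = torch.randn(4)

# Quantize with hand-picked qparams (scale S, zero-point Z), then dequantize.
xq = torch.quantize_per_tensor(x, scale=0.05, zero_point=128, dtype=torch.quint8)
x_hat = xq.dequantize()

print(x)
print(x_hat)
print((x - x_hat).abs().max())  # the quantization error
```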
- -```python -from torch.quantization.observer import MinMaxObserver, MovingAverageMinMaxObserver, HistogramObserver -C, L = 3, 4 -normal = torch.distributions.normal.Normal(0,1) -inputs = [normal.sample((C, L)), normal.sample((C, L))] -print(inputs) - -# >>>>> -# [tensor([[-0.0590, 1.1674, 0.7119, -1.1270], -# [-1.3974, 0.5077, -0.5601, 0.0683], -# [-0.0929, 0.9473, 0.7159, -0.4574]]]), - -# tensor([[-0.0236, -0.7599, 1.0290, 0.8914], -# [-1.1727, -1.2556, -0.2271, 0.9568], -# [-0.2500, 1.4579, 1.4707, 0.4043]])] - -observers = [MinMaxObserver(), MovingAverageMinMaxObserver(), HistogramObserver()] -for obs in observers: - for x in inputs: obs(x) - print(obs.__class__.__name__, obs.calculate_qparams()) - -# >>>>> -# MinMaxObserver (tensor([0.0112]), tensor([124], dtype=torch.int32)) -# MovingAverageMinMaxObserver (tensor([0.0101]), tensor([139], dtype=torch.int32)) -# HistogramObserver (tensor([0.0100]), tensor([106], dtype=torch.int32)) -``` - -### Affine and Symmetric Quantization Schemes -**Affine or asymmetric quantization** schemes assign the input range to the min and max observed values. Affine schemes generally offer tighter clipping ranges and are useful for quantizing non-negative activations (you don't need the input range to contain negative values if your input tensors are never negative). The range is calculated as -. Affine quantization leads to more computationally expensive inference when used for weight tensors [[3]]. - -**Symmetric quantization** schemes center the input range around 0, eliminating the need to calculate a zero-point offset. The range is calculated as -. For skewed signals (like non-negative activations) this can result in bad quantization resolution because the clipping range includes values that never show up in the input (see the pyplot below). - -```python -act = torch.distributions.pareto.Pareto(1, 10).sample((1,1024)) -weights = torch.distributions.normal.Normal(0, 0.12).sample((3, 64, 7, 7)).flatten() - -def get_symmetric_range(x): - beta = torch.max(x.max(), x.min().abs()) - return -beta.item(), beta.item() - -def get_affine_range(x): - return x.min().item(), x.max().item() - -def plot(plt, data, scheme): - boundaries = get_affine_range(data) if scheme == 'affine' else get_symmetric_range(data) - a, _, _ = plt.hist(data, density=True, bins=100) - ymin, ymax = np.quantile(a[a>0], [0.25, 0.95]) - plt.vlines(x=boundaries, ls='--', colors='purple', ymin=ymin, ymax=ymax) - -fig, axs = plt.subplots(2,2) -plot(axs[0, 0], act, 'affine') -axs[0, 0].set_title("Activation, Affine-Quantized") - -plot(axs[0, 1], act, 'symmetric') -axs[0, 1].set_title("Activation, Symmetric-Quantized") - -plot(axs[1, 0], weights, 'affine') -axs[1, 0].set_title("Weights, Affine-Quantized") - -plot(axs[1, 1], weights, 'symmetric') -axs[1, 1].set_title("Weights, Symmetric-Quantized") -plt.show() -``` - -

Fig 2. Clipping ranges (in purple) for affine and symmetric schemes

        - - -In PyTorch, you can specify affine or symmetric schemes while initializing the Observer. Note that not all observers support both schemes. - -```python -for qscheme in [torch.per_tensor_affine, torch.per_tensor_symmetric]: - obs = MovingAverageMinMaxObserver(qscheme=qscheme) - for x in inputs: obs(x) - print(f"Qscheme: {qscheme} | {obs.calculate_qparams()}") - -# >>>>> -# Qscheme: torch.per_tensor_affine | (tensor([0.0101]), tensor([139], dtype=torch.int32)) -# Qscheme: torch.per_tensor_symmetric | (tensor([0.0109]), tensor([128])) -``` - -### Per-Tensor and Per-Channel Quantization Schemes -Quantization parameters can be calculated for the layer's entire weight tensor as a whole, or separately for each channel. In per-tensor, the same clipping range is applied to all the channels in a layer - -

Fig 3. Per-Channel uses one set of qparams for each channel. Per-tensor uses the same qparams for the entire tensor.

        - -For weights quantization, symmetric-per-channel quantization provides better accuracies; per-tensor quantization performs poorly, possibly due to high variance in conv weights across channels from batchnorm folding [[3]]. - -```python -from torch.quantization.observer import MovingAveragePerChannelMinMaxObserver -obs = MovingAveragePerChannelMinMaxObserver(ch_axis=0) # calculate qparams for all `C` channels separately -for x in inputs: obs(x) -print(obs.calculate_qparams()) - -# >>>>> -# (tensor([0.0090, 0.0075, 0.0055]), tensor([125, 187, 82], dtype=torch.int32)) -``` - -### Backend Engine -Currently, quantized operators run on x86 machines via the [FBGEMM backend](https://github.com/pytorch/FBGEMM), or use [QNNPACK](https://github.com/pytorch/QNNPACK) primitives on ARM machines. Backend support for server GPUs (via TensorRT and cuDNN) is coming soon. Learn more about extending quantization to custom backends: [RFC-0019](https://github.com/pytorch/rfcs/blob/master/RFC-0019-Extending-PyTorch-Quantization-to-Custom-Backends.md). - -```python -backend = 'fbgemm' if x86 else 'qnnpack' -qconfig = torch.quantization.get_default_qconfig(backend) -torch.backends.quantized.engine = backend -``` - - -### QConfig - -The `QConfig` ([code](https://github.com/PyTorch/PyTorch/blob/d6b15bfcbdaff8eb73fa750ee47cef4ccee1cd92/torch/ao/quantization/qconfig.py#L165)) NamedTuple stores the Observers and the quantization schemes used to quantize activations and weights. - -Be sure to pass the Observer class (not the instance), or a callable that can return Observer instances. Use `with_args()` to override the default arguments. - -```python -my_qconfig = torch.quantization.QConfig( - activation=MovingAverageMinMaxObserver.with_args(qscheme=torch.per_tensor_affine), - weight=MovingAveragePerChannelMinMaxObserver.with_args(qscheme=torch.qint8) -) -# >>>>> -# QConfig(activation=functools.partial(, qscheme=torch.per_tensor_affine){}, weight=functools.partial(, qscheme=torch.qint8){}) -``` - - -## In PyTorch - -PyTorch allows you a few different ways to quantize your model depending on -- if you prefer a flexible but manual, or a restricted automagic process (*Eager Mode* v/s *FX Graph Mode*) -- if qparams for quantizing activations (layer outputs) are precomputed for all inputs, or calculated afresh with each input (*static* v/s *dynamic*), -- if qparams are computed with or without retraining (*quantization-aware training* v/s *post-training quantization*) - -FX Graph Mode automatically fuses eligible modules, inserts Quant/DeQuant stubs, calibrates the model and returns a quantized module - all in two method calls - but only for networks that are [symbolic traceable](https://PyTorch.org/docs/stable/fx.html#torch.fx.symbolic_trace). The examples below contain the calls using Eager Mode and FX Graph Mode for comparison. - -In DNNs, eligible candidates for quantization are the FP32 weights (layer parameters) and activations (layer outputs). Quantizing weights reduces the model size. Quantized activations typically result in faster inference. - -As an example, the 50-layer ResNet network has ~26 million weight parameters and computes ~16 million activations in the forward pass. - -### Post-Training Dynamic/Weight-only Quantization -Here the model's weights are pre-quantized; the activations are quantized on-the-fly ("dynamic") during inference. The simplest of all approaches, it has a one line API call in `torch.quantization.quantize_dynamic`. 
Currently only Linear and Recurrent (`LSTM`, `GRU`, `RNN`) layers are supported for dynamic quantization. - - **(+)** Can result in higher accuracies since the clipping range is exactly calibrated for each input [[1]]. - - **(+)** Dynamic quantization is preferred for models like LSTMs and Transformers where writing/retrieving the model's weights from memory dominate bandwidths [[4]]. - - **(-)** Calibrating and quantizing the activations at each layer during runtime can add to the compute overhead. - -```python -import torch -from torch import nn - -# toy model -m = nn.Sequential( - nn.Conv2d(2, 64, (8,)), - nn.ReLU(), - nn.Linear(16,10), - nn.LSTM(10, 10)) - -m.eval() - -## EAGER MODE -from torch.quantization import quantize_dynamic -model_quantized = quantize_dynamic( - model=m, qconfig_spec={nn.LSTM, nn.Linear}, dtype=torch.qint8, inplace=False -) - -## FX MODE -from torch.quantization import quantize_fx -qconfig_dict = {"": torch.quantization.default_dynamic_qconfig} # An empty key denotes the default applied to all modules -model_prepared = quantize_fx.prepare_fx(m, qconfig_dict) -model_quantized = quantize_fx.convert_fx(model_prepared) -``` - -### Post-Training Static Quantization (PTQ) -PTQ also pre-quantizes model weights but instead of calibrating activations on-the-fly, the clipping range is pre-calibrated and fixed ("static") using validation data. Activations stay in quantized precision between operations during inference. About 100 mini-batches of representative data are sufficient to calibrate the observers [[2]]. The examples below use random data in calibration for convenience - using that in your application will result in bad qparams. - - -

Fig 4. Steps in Post-Training Static Quantization

        - - -[Module fusion](https://pytorch.org/tutorials/recipes/fuse.html) combines multiple sequential modules (eg: `[Conv2d, BatchNorm, ReLU]`) into one. Fusing modules means the compiler needs to only run one kernel instead of many; this speeds things up and improves accuracy by reducing quantization error. - -**(+)** Static quantization has faster inference than dynamic quantization because it eliminates the float<->int conversion costs between layers. - -**(-)** Static quantized models may need regular re-calibration to stay robust against distribution-drift. - - -```python -# Static quantization of a model consists of the following steps: - -# Fuse modules -# Insert Quant/DeQuant Stubs -# Prepare the fused module (insert observers before and after layers) -# Calibrate the prepared module (pass it representative data) -# Convert the calibrated module (replace with quantized version) - -import torch -from torch import nn -import copy - -backend = "fbgemm" # running on a x86 CPU. Use "qnnpack" if running on ARM. - -model = nn.Sequential( - nn.Conv2d(2,64,3), - nn.ReLU(), - nn.Conv2d(64, 128, 3), - nn.ReLU() -) - -## EAGER MODE -m = copy.deepcopy(model) -m.eval() -"""Fuse -- Inplace fusion replaces the first module in the sequence with the fused module, and the rest with identity modules -""" -torch.quantization.fuse_modules(m, ['0','1'], inplace=True) # fuse first Conv-ReLU pair -torch.quantization.fuse_modules(m, ['2','3'], inplace=True) # fuse second Conv-ReLU pair - -"""Insert stubs""" -m = nn.Sequential(torch.quantization.QuantStub(), - *m, - torch.quantization.DeQuantStub()) - -"""Prepare""" -m.qconfig = torch.quantization.get_default_qconfig(backend) -torch.quantization.prepare(m, inplace=True) - -"""Calibrate -- This example uses random data for convenience. Use representative (validation) data instead. -""" -with torch.inference_mode(): - for _ in range(10): - x = torch.rand(1,2, 28, 28) - m(x) - -"""Convert""" -torch.quantization.convert(m, inplace=True) - -"""Check""" -print(m[[1]].weight().element_size()) # 1 byte instead of 4 bytes for FP32 - - -## FX GRAPH -from torch.quantization import quantize_fx -m = copy.deepcopy(model) -m.eval() -qconfig_dict = {"": torch.quantization.get_default_qconfig(backend)} -# Prepare -model_prepared = quantize_fx.prepare_fx(m, qconfig_dict) -# Calibrate - Use representative (validation) data. -with torch.inference_mode(): - for _ in range(10): - x = torch.rand(1,2,28, 28) - model_prepared(x) -# quantize -model_quantized = quantize_fx.convert_fx(model_prepared) -``` - -### Quantization-aware Training (QAT) -

Fig 5. Steps in Quantization-Aware Training

        - -The PTQ approach is great for large models, but accuracy suffers in smaller models [[6]]. This is of course due to the loss in numerical precision when adapting a model from FP32 to the INT8 realm *(Figure 6(a))*. QAT tackles this by including this quantization error in the training loss, thereby training an INT8-first model. - -

Fig 6. Comparison of PTQ and QAT convergence [3]

        - -All weights and biases are stored in FP32, and backpropagation happens as usual. However in the forward pass, quantization is internally simulated via `FakeQuantize` modules. They are called fake because they quantize and immediately dequantize the data, adding quantization noise similar to what might be encountered during quantized inference. The final loss thus accounts for any expected quantization errors. Optimizing on this allows the model to identify a wider region in the loss function *(Figure 6(b))*, and identify FP32 parameters such that quantizing them to INT8 does not significantly affect accuracy. - -
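The underlying op is exposed directly in PyTorch, so a minimal sketch of what a `FakeQuantize` module does internally might look like this (the qparams below are arbitrary example values):

```python
import torch

x = torch.randn(4)
scale, zero_point = 0.1, 128

# Quantize-then-dequantize in one step: the result stays in FP32 but carries the
# rounding noise the network would see after INT8 quantization.
x_fq = torch.fake_quantize_per_tensor_affine(x, scale, zero_point, quant_min=0, quant_max=255)

print(x)
print(x_fq)
```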

Fig 7. Fake Quantization in the forward and backward pass
Image source: https://developer.nvidia.com/blog/achieving-fp32-accuracy-for-int8-inference-using-quantization-aware-training-with-tensorrt

        - -**(+)** QAT yields higher accuracies than PTQ. - -**(+)** Qparams can be learned during model training for more fine-grained accuracy (see [LearnableFakeQuantize](https://github.com/pytorch/pytorch/blob/master/torch/ao/quantization/_learnable_fake_quantize.py)) - -**(-)** Computational cost of retraining a model in QAT can be several hundred epochs [[1]] - - -```python -# QAT follows the same steps as PTQ, with the exception of the training loop before you actually convert the model to its quantized version - -import torch -from torch import nn - -backend = "fbgemm" # running on a x86 CPU. Use "qnnpack" if running on ARM. - -m = nn.Sequential( - nn.Conv2d(2,64,8), - nn.ReLU(), - nn.Conv2d(64, 128, 8), - nn.ReLU() -) - -"""Fuse""" -torch.quantization.fuse_modules(m, ['0','1'], inplace=True) # fuse first Conv-ReLU pair -torch.quantization.fuse_modules(m, ['2','3'], inplace=True) # fuse second Conv-ReLU pair - -"""Insert stubs""" -m = nn.Sequential(torch.quantization.QuantStub(), - *m, - torch.quantization.DeQuantStub()) - -"""Prepare""" -m.train() -m.qconfig = torch.quantization.get_default_qconfig(backend) -torch.quantization.prepare_qat(m, inplace=True) - -"""Training Loop""" -n_epochs = 10 -opt = torch.optim.SGD(m.parameters(), lr=0.1) -loss_fn = lambda out, tgt: torch.pow(tgt-out, 2).mean() -for epoch in range(n_epochs): - x = torch.rand(10,2,24,24) - out = m(x) - loss = loss_fn(out, torch.rand_like(out)) - opt.zero_grad() - loss.backward() - opt.step() - -"""Convert""" -m.eval() -torch.quantization.convert(m, inplace=True) -``` - - - -## Sensitivity Analysis -Not all layers respond to quantization equally, some are more sensitive to precision drops than others. Identifying the optimal combination of layers that minimizes accuracy drop is time-consuming, so [[3]] suggest a one-at-a-time sensitivity analysis to identify which layers are most sensitive, and retaining FP32 precision on those. In their experiments, skipping just 2 conv layers (out of a total 28 in MobileNet v1) give them near-FP32 accuracy. Using FX Graph Mode, we can create custom qconfigs to do this easily: - -```python -# ONE-AT-A-TIME SENSITIVITY ANALYSIS - -for quantized_layer, _ in model.named_modules(): - print("Only quantizing layer: ", quantized_layer) - - # The module_name key allows module-specific qconfigs. - qconfig_dict = {"": None, - "module_name":[(quantized_layer, torch.quantization.get_default_qconfig(backend))]} - - model_prepared = quantize_fx.prepare_fx(model, qconfig_dict) - # calibrate - model_quantized = quantize_fx.convert_fx(model_prepared) - # evaluate(model) -``` - -Another approach is to compare statistics of the FP32 and INT8 layers; commonly used metrics for these are SQNR (Signal to Quantized Noise Ratio) and Mean-Squre-Error. Such a comparative analysis may also help in guiding further optimizations. - -

Fig 8. Comparing model weights and activations

PyTorch provides tools to help with this analysis under the Numeric Suite. Learn more about using Numeric Suite from the [full tutorial](https://pytorch.org/tutorials/prototype/numeric_suite_tutorial.html).

```python
# extract from https://pytorch.org/tutorials/prototype/numeric_suite_tutorial.html
import torch
import torch.quantization._numeric_suite as ns

def SQNR(x, y):
    # Higher is better
    Ps = torch.norm(x)
    Pn = torch.norm(x - y)
    return 20 * torch.log10(Ps / Pn)

# fp32_model, int8_model and input_data are assumed to be defined earlier

# Compare layer weights of the FP32 and INT8 models
wt_compare_dict = ns.compare_weights(fp32_model.state_dict(), int8_model.state_dict())
for key in wt_compare_dict:
    print(key, SQNR(wt_compare_dict[key]['float'], wt_compare_dict[key]['quantized'].dequantize()))

# Compare layer activations of the FP32 and INT8 models on the same input
act_compare_dict = ns.compare_model_outputs(fp32_model, int8_model, input_data)
for key in act_compare_dict:
    print(key, SQNR(act_compare_dict[key]['float'][0], act_compare_dict[key]['quantized'][0].dequantize()))
```


## Recommendations for your workflow

Fig 9. Suggested quantization workflow

        - Click for larger image - -### Points to note - - Large (10M+ parameters) models are more robust to quantization error. [[2]] - - Quantizing a model from a FP32 checkpoint provides better accuracy than training an INT8 model from scratch.[[2]] - - Profiling the model runtime is optional but it can help identify layers that bottleneck inference. - - Dynamic Quantization is an easy first step, especially if your model has many Linear or Recurrent layers. - - Use symmetric-per-channel quantization with `MinMax` observers for quantizing weights. Use affine-per-tensor quantization with `MovingAverageMinMax` observers for quantizing activations[[2], [3]] - - Use metrics like SQNR to identify which layers are most suscpetible to quantization error. Turn off quantization on these layers. - - Use QAT to fine-tune for around 10% of the original training schedule with an annealing learning rate schedule starting at 1% of the initial training learning rate. [[3]] - - If the above workflow didn't work for you, we want to know more. Post a thread with details of your code (model architecture, accuracy metric, techniques tried). Feel free to cc me [@suraj.pt](https://discuss.pytorch.org/u/suraj.pt/). - - -That was a lot to digest, congratulations for sticking with it! Next, we'll take a look at quantizing a "real-world" model that uses dynamic control structures (if-else, loops). These elements disallow symbolic tracing a model, which makes it a bit tricky to directly quantize the model out of the box. In the next post of this series, we'll get our hands dirty on a model that is chock full of loops and if-else blocks, and even uses third-party libraries in the `forward` call. - -We'll also cover a cool new feature in PyTorch Quantization called Define-by-Run, that tries to ease this constraint by needing only subsets of the model's computational graph to be free of dynamic flow. Check out the [Define-by-Run poster at PTDD'21](https://s3.amazonaws.com/assets.pytorch.org/ptdd2021/posters/C8.png) for a preview. - - -## References -[[1]] Gholami, A., Kim, S., Dong, Z., Yao, Z., Mahoney, M. W., & Keutzer, K. (2021). A survey of quantization methods for efficient neural network inference. arXiv preprint arXiv:2103.13630. - -[[2]] Krishnamoorthi, R. (2018). Quantizing deep convolutional networks for efficient inference: A whitepaper. arXiv preprint arXiv:1806.08342. - -[[3]] Wu, H., Judd, P., Zhang, X., Isaev, M., & Micikevicius, P. (2020). Integer quantization for deep learning inference: Principles and empirical evaluation. arXiv preprint arXiv:2004.09602. - -[[4]] PyTorch Quantization Docs - - -[1]: https://arxiv.org/pdf/2103.13630.pdf -[2]: https://arxiv.org/pdf/1806.08342.pdf -[3]: https://arxiv.org/abs/2004.09602 -[4]: https://pytorch.org/docs/stable/quantization.html#prototype-fx-graph-mode-quantization diff --git a/_posts/2022-3-10-pytorch-1.11-new-library-releases.md b/_posts/2022-3-10-pytorch-1.11-new-library-releases.md deleted file mode 100644 index 4c5f9c328f71..000000000000 --- a/_posts/2022-3-10-pytorch-1.11-new-library-releases.md +++ /dev/null @@ -1,273 +0,0 @@ ---- -layout: blog_detail -title: "Introducing TorchRec, and other domain library updates in PyTorch 1.11" -author: Team PyTorch -featured-img: "assets/images/pytorch-logo.jpg" ---- - -We are introducing the beta release of TorchRec and a number of improvements to the current PyTorch domain libraries, alongside the [PyTorch 1.11 release](https://pytorch.org/blog/pytorch-1.11-released/). 
These updates demonstrate our focus on developing common and extensible APIs across all domains to make it easier for our community to build ecosystem projects on PyTorch. Highlights include: - -- **TorchRec**, a PyTorch domain library for Recommendation Systems, is available in beta. [View it on GitHub](https://github.com/pytorch/torchrec). -- **TorchAudio** - Added Enformer- and RNN-T-based models and recipes to support the full development lifecycle of a streaming ASR model. See the release notes [here](https://github.com/pytorch/audio/releases). -- **TorchText** - Added beta support for RoBERTa and XLM-R models, byte-level BPE tokenizer, and text datasets backed by TorchData. See the release notes [here](https://github.com/pytorch/text/releases). -- **TorchVision** - Added 4 new model families and 14 new classification datasets such as CLEVR, GTSRB, FER2013. See the release notes [here](https://github.com/pytorch/vision/releases). - -## TorchRec 0.1 - -We [announced TorchRec](https://pytorch.org/blog/introducing-torchrec/) a few weeks ago and we are excited to release the beta version today. To recap, TorchRec is a PyTorch domain library for Recommendation Systems. This new library provides common sparsity and parallelism primitives, enabling researchers to build state-of-the-art personalization models and deploy them in production. TorchRec was used to train a 1.25 trillion parameter model, pushed to production in January 2022. - -In particular, the library includes: - -- Modeling primitives, such as embedding bags and jagged tensors, that enable easy authoring of large, performant multi-device/multi-node models using hybrid data-parallelism and model-parallelism. -- Optimized RecSys kernels powered by [FBGEMM](https://github.com/pytorch/FBGEMM), including support for sparse and quantized operations. -- A sharder which can partition embedding tables with a variety of different strategies including data-parallel, table-wise, row-wise, table-wise-row-wise, and column-wise sharding. -- A planner which can automatically generate optimized sharding plans for models. -- Pipelining to overlap dataloading device transfer (copy to GPU), inter-device communications (input_dist), and computation (forward, backward) for increased performance. -- GPU inference support. -- Common modules for RecSys, such as models and public datasets (Criteo & Movielens). - -Please check the TorchRec announcement post [here](https://pytorch.org/blog/introducing-torchrec/), [video tutorial](https://www.youtube.com/watch?v=cjgj41dvSeQ), install instructions [here](https://github.com/pytorch/torchrec#readme), test drive the feature through this tutorial [here](https://pytorch.org/tutorials/intermediate/torchrec_tutorial.html), and refer to the reference document [here](https://pytorch.org/torchrec/). - -## TorchAudio 0.11 - -#### TorchAudio: Building Blocks for Audio and Speech Processing - -We published a paper, [TorchAudio: Building Blocks for Audio and Speech Processing](https://arxiv.org/abs/2110.15018), describing the overview of the TorchAudio library. If you find TorchAudio useful for your research, please help us share with the community by citing our paper. - -#### (Beta) RNN-T & (Prototype) Emformer Models and Recipes - -

        - -Emformer is an efficient memory-transformer-based streaming acoustic model that has demonstrated state-of-the-art streaming automatic speech recognition (ASR) performance in low-latency, resource-constrained scenarios, such as on-device applications (citation: [https://arxiv.org/abs/2010.10759](https://arxiv.org/abs/2010.10759)). - -The TorchAudio v0.11 release includes the following beta features: - -- Implementation of Emformer ([docs](https://pytorch.org/audio/main/models.html#emformer)) -- Recurrent neural network transducer (RNN-T) streaming ASR model that uses Emformer for its transcription network ([docs](https://pytorch.org/audio/main/models.html#rnn-t)) -- RNN-T beam search decoder with TorchScript support ([docs](https://pytorch.org/audio/main/models.html#rnntbeamsearch)) -- LibriSpeech Emformer RNN-T training recipe ([GitHub](https://github.com/pytorch/audio/tree/release/0.11/examples/asr/librispeech_emformer_rnnt)) and corresponding pre-trained streaming ASR inference pipeline ([docs](https://pytorch.org/audio/main/pipelines.html#emformer-rnnt-base-librispeech)) - -Also there are prototype features that are available from nightly builds or the main branch. - -- Training recipes trained on MuST-C and TED-LIUM3 datasets. ([GitHub](https://github.com/pytorch/audio/tree/main/examples/asr/emformer_rnnt)) -- Pre-trained pipelines corresponding to the recipes. ([docs](https://pytorch.org/audio/main/prototype.pipelines.html)) -- Tutorial that steps through performing online speech recognition with RNN-T Emformer model. ([docs](https://pytorch.org/audio/main/tutorials/online_asr_tutorial.html)) - -Collectively, these features cover the full development lifecycle of a streaming ASR model, from definition through training and inference, and enable users to easily develop their own Emformer- and RNN-T-based models. - -Special thanks to Yangyang Shi, Jay Mahadeokar, and Gil Keren for their code contributions and guidance. - -#### (Beta) HuBERT Pretrain Model - -The masked prediction training of HuBERT model requires the masked logits, unmasked logits, and feature norm as the outputs. The logits are for cross-entropy losses and the feature norm is for penalty loss. The release adds HuBERTPretrainModel and corresponding factory functions (hubert_pretrain_base, hubert_pretrain_large, and hubert_pretrain_xlarge) to enable training from scratch. - -#### (Prototype) CTC Beam Search Decoder - -In recent releases, TorchAudio has added support for ASR models fine-tuned on CTC loss. The addition of an inference time CTC beam search decoder enables running end-to-end ASR evaluation using TorchAudio utils. - -The CTC decoder in TorchAudio supports customizable beam search decoding with lexicon constraint. It also has optional KenLM language model support. - -For more details, please check out the [API tutorial](https://pytorch.org/audio/main/tutorials/asr_inference_with_ctc_decoder_tutorial.html). This prototype feature is available through nightly builds. - -#### (Prototype) Streaming API - -TorchAudio started as simple audio I/O APIs that supplement PyTorch. With the recent addition of ASR models and training recipes, the project has received requests to support high-level application development. - -Streaming API makes it easy to develop and test the model in online inference. It utilizes ffmpeg under the hood, and enables reading media from online services and hardware devices, decoding media in an incremental manner, and applying filters and preprocessing. 
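As a rough sketch of what incremental decoding looks like, the snippet below uses the interface as it later stabilized in `torchaudio.io.StreamReader`; the prototype module path and argument names in the 0.11 nightlies may differ, and the input file is a placeholder:

```python
from torchaudio.io import StreamReader

# Open a local file (a URL or device can be passed instead) and pull decoded,
# resampled audio in fixed-size chunks.
streamer = StreamReader(src="example.wav")  # placeholder input
streamer.add_basic_audio_stream(frames_per_chunk=16000, sample_rate=16000)

for (chunk,) in streamer.stream():
    # Each chunk is a Tensor of up to 16000 frames at 16 kHz; feed it to a
    # streaming ASR model here.
    print(chunk.shape)
```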
- -Please checkout the [API tutorial](https://pytorch.org/audio/main/) and [the documentation](https://pytorch.org/audio/main/). There are also the [streaming ASR](https://pytorch.org/audio/main/tutorials/online_asr_tutorial.html) tutorial and the [device streaming ASR tutorial](https://pytorch.org/audio/main/tutorials/device_asr.html). This feature is available from nightly releases. Please refer to [pytorch.org](https://pytorch.org/get-started/locally/) for how to install nightly builds. - -## TorchText 0.12 - -#### (Beta) RoBERTa and XLM-R Models - -TorchText has added support for pre-trained RoBERTa and XLM-R models. It would allow users to train end-2-end Transformer Encoder based models on standard NLP tasks using TorchText. - -More specifically: - -- The models are torchscriptable and hence can be employed for production use-cases. -- The model APIs let users to easily attach custom task-specific heads with pre-trained encoders. -- The API also comes equipped with data pre-processing transforms to match the pre-trained weights and model configuration. - -We have added a [tutorial](https://pytorch.org/text/main/tutorials/sst2_classification_non_distributed.html) to demonstrate SST-2 binary text classification task with pre-trained XLM-R base architecture. - -For additional details on model APIs and usage examples, please refer to the [documentation](https://pytorch.org/text/main/models.html). - -#### (Beta) byte-level BPE tokenizer - -TorchText has added support for a Byte-Level BPE tokenizer, as used in GPT-2. This tokenizer is also used for tokenizing inputs to the pre-trained RoBERTa models described previously. In addition to the RoBERTa vocab, users can also load their own custom BPE vocab to use the tokenizer. Furthermore, the tokenizer is fully torchscriptable and hence can be employed for production use-cases. For additional details on model APIs and usage examples, please refer to the [documentation](https://pytorch.org/text/main/transforms.html#gpt2bpetokenizer). - -#### (Beta) Text datasets backed by TorchData - -TorchText has modernized its datasets by migrating from older-style Iterable Datasets to [TorchData’s](https://github.com/pytorch/data#readme) DataPipes. TorchData is a library that provides modular/composable primitives, allowing users to load and transform data in performant data pipelines. - -These DataPipes work out-of-the-box with PyTorch DataLoader and would enable new functionalities like auto-sharding. Users can now easily do data manipulation and pre-processing using user-defined functions and transformations in a functional style programming. Datasets backed by DataPipes also enable standard flow-control like batching, collation, shuffling and bucketizing. - -Collectively, DataPipes provides a comprehensive experience for data preprocessing and tensorization needs in a pythonic and flexible way for model training. We have added a [tutorial](https://pytorch.org/text/main/tutorials/sst2_classification_non_distributed.html) to demonstrate data-processing pipelining using the modernized dataset for binary text-classification. - -You can learn more about TorchData DataPipe APIs in its [official documentation](https://pytorch.org/data). - -## TorchVision 0.12 - -### New Models - -Four new model families have been released in the latest version along with pre-trained weights for their variants. - -#### #1 Object Detection - -[FCOS](https://arxiv.org/pdf/1904.01355.pdf) is a popular, fully convolutional, anchor-free model for object detection. 
In this release we include a community-contributed model implementation as well as pre-trained weights. The model was trained on COCO train2017 and can be used as follows: - -```python -import torch -from torchvision import models - -x = [torch.rand(3, 224, 224)] -fcos = models.detection.fcos_resnet50_fpn(pretrained=True).eval() -predictions = fcos(x) -``` - -The box AP of the pre-trained model on COCO val2017 is 39.2 (see [#4961](https://github.com/pytorch/vision/pull/4961) for more details). - -We would like to thank [Hu Ye](https://github.com/xiaohu2015) and [Zhiqiang Wang](https://github.com/zhiqwang) for contributing to the model implementation and initial training. This was the first community-contributed model in a long while, and given its success, we decided to use the learnings from this process and create a new [model contribution guidelines](https://github.com/pytorch/vision/blob/main/CONTRIBUTING_MODELS.md). - -#### #2 Optical Flow support and RAFT model - -TorchVision now supports optical flow! Optical Flow models try to predict movement in a video: given two consecutive frames, the model predicts where each pixel of the first frame ends up in the second frame. Check out our [new tutorial on Optical Flow](https://pytorch.org/vision/0.12/auto_examples/plot_optical_flow.html#sphx-glr-auto-examples-plot-optical-flow-py)! - -We implemented a torchscript-compatible [RAFT](https://arxiv.org/abs/2003.12039) model with pre-trained weights (both normal and “small” versions), and added support for [training and evaluating](https://github.com/pytorch/vision/tree/main/references/optical_flow) optical flow models. Our training scripts support distributed training across processes and nodes, leading to much faster training time than the original implementation. We also added 5 new [optical flow datasets](https://pytorch.org/vision/0.12/datasets.html#optical-flow): Flying Chairs, Flying Things, Sintel, Kitti, and HD1K. - -
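As a minimal sketch of how the new optical flow builders can be called (shapes and inputs here are illustrative; see the linked tutorial for a complete example with real frames and preprocessing):

```python
import torch
from torchvision.models.optical_flow import raft_large

# Two consecutive frames, batched; RAFT expects values roughly in [-1, 1]
# and spatial dimensions divisible by 8.
img1 = torch.rand(1, 3, 224, 224) * 2 - 1
img2 = torch.rand(1, 3, 224, 224) * 2 - 1

model = raft_large(pretrained=True).eval()
with torch.no_grad():
    flow_predictions = model(img1, img2)  # list of iteratively refined flow fields
flow = flow_predictions[-1]               # final estimate, shape (1, 2, 224, 224)
```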

        - -

        - -#### #3. Image Classification - -[Vision Transformer](https://arxiv.org/abs/2010.11929) (ViT) and [ConvNeXt](https://arxiv.org/abs/2201.03545) are two popular architectures which can be used as image classifiers or as backbones for downstream vision tasks. In this release we include 8 pre-trained weights for their classification variants. The models were trained on ImageNet and can be used as follows: - -```python -import torch -from torchvision import models - -x = torch.rand(1, 3, 224, 224) -vit = models.vit_b_16(pretrained=True).eval() -convnext = models.convnext_tiny(pretrained=True).eval() -predictions1 = vit(x) -predictions2 = convnext(x) -``` - -The accuracies of the pre-trained models obtained on ImageNet val are seen below: - -| **Model** | **Acc@1** | **Acc@5** | -| -------------- | --------: | --------: | -| vit_b_16 | 81.072 | 95.318 | -| vit_b_32 | 75.912 | 92.466 | -| vit_l_16 | 79.662 | 94.638 | -| vit_l_32 | 76.972 | 93.07 | -| convnext_tiny | 82.52 | 96.146 | -| convnext_small | 83.616 | 96.65 | -| convnext_base | 84.062 | 96.87 | -| convnext_large | 84.414 | 96.976 | - -The above models have been trained using an adjusted version of our new [training recipe](https://pytorch.org/blog/how-to-train-state-of-the-art-models-using-torchvision-latest-primitives/) and this allows us to offer models with accuracies significantly higher than the ones on the original papers. - -#### #4. GPU Video Decoding - -In this release, we add support for GPU video decoding in the video reading API. To use hardware-accelerated decoding, we just need to pass a cuda device to the video reading API as shown below: - -```python -import torchvision - -reader = torchvision.io.VideoReader(file_name, device="cuda:0") -for frame in reader: - print(frame) -``` - -We also support seeking to anyframe or a keyframe in the video before reading, as shown below: - -```python -reader.seek(seek_time) -``` - -### New Datasets - -We have implemented 14 new [classification datasets](https://pytorch.org/vision/0.12/datasets.html#image-classification): CLEVR, GTSRB, FER2013, SUN397, Country211, Flowers102, fvgc_aircraft, OxfordIIITPet, DTD, Food 101, Rendered SST2, Stanford cars, PCAM, and EuroSAT. - -As part of our work on Optical Flow support (see above for more details), we also added 5 new [optical flow datasets](https://pytorch.org/vision/0.12/datasets.html#optical-flow): Flying Chairs, Flying Things, Sintel, Kitti, and HD1K. - -### Other Updates - -- **New documentation layout**: Each function / class is now documented in a separate page, clearing up some space in the per-module pages, and easing the discovery of the proposed APIs. Compare e.g. our [previous docs](https://pytorch.org/vision/0.11/transforms.html) vs the [new ones](https://pytorch.org/vision/0.12/transforms.html). Please let us know if you have any [feedback](https://github.com/pytorch/vision/issues/5511)! -- **New [model contribution guidelines](https://github.com/pytorch/vision/blob/main/CONTRIBUTING_MODELS.md)** have been published following the success of the [FCOS](https://github.com/pytorch/vision/pull/4961) model which was contributed by the community. These guidelines aim to be an overview of the model contribution process for anyone who would like to suggest, implement and train a new model. -- **Upcoming Prototype API** - We are currently working on a prototype API which adds Multi-weight support on all of our model builder methods. 
This will enable us to offer multiple pre-trained weights, associated with their meta-data and inference transforms. The API is still under review and thus was not included in the release but you can read more about it on our [blogpost](https://pytorch.org/blog/introducing-torchvision-new-multi-weight-support-api/) and provide your feedback on the dedicated [Github issue](https://github.com/pytorch/vision/issues/5088). -- **Changes in our deprecation policy** - Up until now, torchvision would almost never remove deprecated APIs. In order to be more aligned and consistent with pytorch core, we are updating our deprecation policy. We are now following a 2-release deprecation cycle: deprecated APIs will raise a warning for 2 versions, and will be removed after that. To reflect these changes and to smooth the transition, we have decided to: - - Remove all APIs that had been deprecated before or on v0.8, released 1.5 years ago. - - Update the removal timeline of all other deprecated APIs to v0.14, to reflect the new 2-cycle policy starting now in v0.12. - -### Captum 0.5 - -[Captum](https://captum.ai/) is a PyTorch library for model interpretability. For this release, we expanded Captum with influential instances and added support for both similarity based influences and novel algorithms, [TracIn](https://arxiv.org/abs/2002.08484) and its variants. TracIn variants offer faster approximation of influence scores based on random projections for fully connected layers. - -More specifically the new, influence, subsection of Captum includes: - -- **[SimilarityInfluence](https://captum.ai/api/influence.html#similarityinfluence)** computes similarity scores between test and training examples using default (cosine or euclidean) or custom user definite metrics w.r.t. given input model layers. -- **[TracInCP](https://captum.ai/api/influence.html#tracincp)** approximates the influential score of each training example on a given test example based on the dot-product similarity between loss gradients w.r.t. model parameters for test and training examples. Note that if we use training examples as test examples then we compute self influence. This method and its variants described below also return top-k proponents and opponents which are the top-k largest positive and negative influential examples respectively. -- **[TracInCPFast](https://captum.ai/api/influence.html#tracincpfast)** is an approximation of TracInCP that avoids computing the gradients w.r.t. large parameter matrices. It approximates influence score based on the dot products between last fully connected layer activations and loss gradients w.r.t. that layer for training and test examples. -- **[TracInCPFastRandProj](https://captum.ai/api/influence.html#tracincpfastrandproj)** uses a nearest neighbor approximation library such as annoy to compute the dot product between the training and test quantities. In order to reduce the dimensionality of layer activations and corresponding gradients this method, in addition, allows to project those vectors into a lower dimensional space using random projection matrices. - -More about the implementation of influential instances can be found on our [GitHub](https://github.com/pytorch/captum/tree/master/captum/influence) page and [tutorials](https://captum.ai/tutorials/TracInCP_Tutorial). 
- -Thanks for reading, If you’re interested in these updates and want to join the PyTorch community, we encourage you to join the [discussion forums](https://discuss.pytorch.org/) and [open GitHub issues](https://github.com/pytorch/pytorch/issues). To get the latest news from PyTorch, follow us on [Twitter](https://twitter.com/PyTorch), [Medium](https://medium.com/pytorch), [YouTube](https://www.youtube.com/pytorch), and [LinkedIn](https://www.linkedin.com/company/pytorch). - -Cheers! - -Team PyTorch - -
        -
        - -
        -
        - diff --git a/_posts/2022-3-10-pytorch-1.11-released.md b/_posts/2022-3-10-pytorch-1.11-released.md deleted file mode 100644 index 179cdffc350a..000000000000 --- a/_posts/2022-3-10-pytorch-1.11-released.md +++ /dev/null @@ -1,61 +0,0 @@ ---- -layout: blog_detail -title: "PyTorch 1.11, TorchData, and functorch are now available" -author: Team PyTorch -featured-img: "assets/images/pytorch-logo.jpg" ---- - -We are excited to announce the release of PyTorch 1.11 ([release notes](https://github.com/pytorch/pytorch/releases/tag/v1.11.0)). This release is composed of over 3,300 commits since 1.10, made by 434 contributors. Along with 1.11, we are releasing beta versions of TorchData and functorch. - -Summary: - -* **TorchData** is a new library for common modular data loading primitives for easily constructing flexible and performant data pipelines. [View it on GitHub](https://github.com/pytorch/data). -* **functorch**, a library that adds composable function transforms to PyTorch, is now available in beta. [View it on GitHub](https://github.com/pytorch/functorch). -* Distributed Data Parallel (DDP) static graph optimizations available in stable. - -## Introducing TorchData - -We are delighted to present the Beta release of [TorchData](https://github.com/pytorch/data). This is a library of common modular data loading primitives for easily constructing flexible and performant data pipelines. Based on community feedback, we have found that the existing DataLoader bundled too many features together and can be difficult to extend. Moreover, different use cases often have to rewrite the same data loading utilities over and over again. The goal here is to enable composable data loading through Iterable-style and Map-style building blocks called “[DataPipes](https://github.com/pytorch/data#what-are-datapipes)” that work well out of the box with the [PyTorch’s DataLoader](https://pytorch.org/docs/stable/data.html#torch.utils.data.DataLoader). - -A `DataPipe` takes in some access function over Python data structures, `__iter__` for `IterDataPipe` and `__getitem__` for `MapDataPipe`, and returns a new access function with a slight transformation applied. You can chain multiple DataPipes together to form a data pipeline that performs all the necessary data transformation. - -We have implemented over 50 DataPipes that provide different core functionalities, such as opening files, parsing texts, transforming samples, caching, shuffling, and batching. For users who are interested in connecting to cloud providers (such as Google Drive or AWS S3), the fsspec and iopath DataPipes will allow you to do so. The documentation provides detailed explanations and usage examples of each IterDataPipe and MapDataPipe. - -In this release, some of the PyTorch domain libraries have migrated their datasets to use DataPipes. In TorchText, the [popular datasets provided by the library](https://github.com/pytorch/text/tree/release/0.12/torchtext/datasets) are implemented using DataPipes and a [section of its SST-2 binary text classification tutorial](https://pytorch.org/text/0.12.0/tutorials/sst2_classification_non_distributed.html#dataset) demonstrates how you can use DataPipes to preprocess data for your model. There also are other prototype implementations of datasets with DataPipes in [TorchVision (available in nightly releases)](https://github.com/pytorch/vision/tree/main/torchvision/prototype/datasets/_builtin) and in [TorchRec](https://pytorch.org/torchrec/torchrec.datasets.html). 
- -The [documentation for TorchData](https://pytorch.org/data) is now live. It contains a tutorial that covers how to use DataPipes, use them with DataLoader, and implement custom ones. FAQs and future plans related to DataLoader are described in [our project’s README file](https://github.com/pytorch/data#readme). - -## Introducing functorch - -We’re excited to announce the first beta release of [functorch](https://github.com/pytorch/functorch). Heavily inspired by [Google JAX](https://github.com/google/jax), functorch is a library that adds composable function transforms to PyTorch. It aims to provide composable vmap (vectorization) and autodiff transforms that work with PyTorch modules and PyTorch autograd with good eager-mode performance. - -Composable function transforms can help with a number of use cases that are tricky to do in PyTorch today: - -* computing per-sample-gradients (or other per-sample quantities) -* running ensembles of models on a single machine -* efficiently batching together tasks in the inner-loop of MAML -* efficiently computing Jacobians and Hessians as well as batched ones - -Composing vmap (vectorization), vjp (reverse-mode AD), and jvp (forward-mode AD) transforms allows us to effortlessly express the above without designing a separate library for each. - -For more details, please see our [documentation](https://pytorch.org/functorch/), [tutorials](https://pytorch.org/functorch), and [installation instructions](https://pytorch.org/functorch/stable/install.html). - -## Distributed Training - -### (Stable) DDP static graph - -DDP static graph assumes that your model employs the same set of used/unused parameters in every iteration, so that it can deterministically know states like which hooks will fire, how many times the hooks will fire and gradients computation ready order after the first iteration. Static graph caches these states in the first iteration, and thus it could support features that DDP can not support in previous releases, e.g., support multiple activation checkpoints on the same parameters regardless of whether there are unused parameters or not. The static graph feature also applies performance optimizations when there are unused parameters, e.g., it avoids traversing graphs to search unused parameters every iteration, and enables dynamic bucketing order. These optimizations in the DDP static graph brought 10% QPS gain for some recommendation models. - -To enable static graph, just simply set static_graph=True in the DDP API like this: - -``` -ddp_model = DistributedDataParallel(model, static_graph=True) -``` - -For more details, please see our [documentation](https://pytorch.org/docs/master/generated/torch.nn.parallel.DistributedDataParallel.html) and [tutorials](https://pytorch.org/tutorials/intermediate/ddp_tutorial.html). - -Thanks for reading, If you’re interested in these updates and want to join the PyTorch community, we encourage you to join the [discussion forums](https://discuss.pytorch.org/) and [open GitHub issues](https://github.com/pytorch/pytorch/issues). To get the latest news from PyTorch, follow us on [Twitter](https://twitter.com/PyTorch), [Medium](https://medium.com/pytorch), [YouTube](https://www.youtube.com/pytorch), and [LinkedIn](https://www.linkedin.com/company/pytorch). - -Cheers! 
- -Team PyTorch diff --git a/_posts/2022-3-14-introducing-pytorch-fully-sharded-data-parallel-api.md b/_posts/2022-3-14-introducing-pytorch-fully-sharded-data-parallel-api.md deleted file mode 100644 index 35431b17716d..000000000000 --- a/_posts/2022-3-14-introducing-pytorch-fully-sharded-data-parallel-api.md +++ /dev/null @@ -1,154 +0,0 @@ ---- -layout: blog_detail -title: "Introducing PyTorch Fully Sharded Data Parallel (FSDP) API" -author: Yanli Zhao, Rohan Varma, Chien-Chin Huang, Shen Li, Min Xu, Alban Desmaison -featured-img: "assets/images/pytorch-logo.jpg" ---- - -Recent studies have shown that large model training will be beneficial for improving model quality. During the last 3 years, model size grew 10,000 times from [BERT](https://arxiv.org/abs/1810.04805) with 110M parameters to [Megatron-2](https://arxiv.org/abs/2104.04473) with one trillion. However, training large AI models is not easy—aside from the need for large amounts of computing resources, software engineering complexity is also challenging. PyTorch has been working on building tools and infrastructure to make it easier. - -PyTorch Distributed data parallelism is a staple of scalable deep learning because of its robustness and simplicity. It however requires the model to fit on one GPU. Recent approaches like DeepSpeed ZeRO and FairScale’s Fully Sharded Data Parallel allow us to break this barrier by sharding a model’s parameters, gradients and optimizer states across data parallel workers while still maintaining the simplicity of data parallelism. - -With PyTorch 1.11 we’re adding native support for Fully Sharded Data Parallel (FSDP), currently available as a prototype feature. Its implementation heavily borrows from FairScale’s version while bringing more streamlined APIs and additional performance improvements. - -Scaling tests of PyTorch FSDP on AWS show it can scale up to train dense models with 1T parameters. Realized performance in our experiments reached 84 TFLOPS per A100 GPU for GPT 1T model and 159 TFLOPS per A100 GPU for GPT 175B model on AWS cluster. Native FSDP implementation also dramatically improved model initialization time compared to FairScale’s original when CPU offloading was enabled. - -In future PyTorch versions, we’re going to enable users to seamlessly switch between DDP, ZeRO-1, ZeRO-2 and FSDP flavors of data parallelism, so that users can train different scales of models with simple configurations in the unified API. - -### How FSDP Works - -FSDP is a type of data-parallel training, but unlike traditional data-parallel, which maintains a per-GPU copy of a model’s parameters, gradients and optimizer states, it shards all of these states across data-parallel workers and can optionally offload the sharded model parameters to CPUs. - -The figure below shows how FSDP works for 2 data-parallel processes: - - -

        - -

        - -

        -Figure 1. FSDP workflow -

        - -Usually, model layers are wrapped with FSDP in a nested way, so that only layers in a single FSDP instance need to gather the full parameters to a single device during forward or backward computations. The gathered full parameters will be freed immediately after computation, and the freed memory can be used for the next layer’s computation. In this way, peak GPU memory could be saved and thus training can be scaled to use a larger model size or larger batch size. To further maximize memory efficiency, FSDP can offload the parameters, gradients and optimizer states to CPUs when the instance is not active in the computation. - -### Using FSDP in PyTorch - -There are two ways to wrap a model with PyTorch FSDP. Auto wrapping is a drop-in replacement for DDP; manual wrapping needs minimal changes of model definition code with the ability to explore complex sharding strategies. - - -#### Auto Wrapping - -Model layers should be wrapped in FSDP in a nested way to save peak memory and enable communication and computation overlapping. The simplest way to do it is auto wrapping, which can serve as a drop-in replacement for DDP without changing the rest of the code. - -fsdp_auto_wrap_policy argument allows specifying a callable function to recursively wrap layers with FSDP. default_auto_wrap_policy function provided by the PyTorch FSDP recursively wraps layers with the number of parameters larger than 100M. You can supply your own wrapping policy as needed. The example of writing a customized wrapping policy is shown in the [FSDP API doc](https://pytorch.org/docs/stable/fsdp.html). - -In addition, cpu_offload could be configured optionally to offload wrapped parameters to CPUs when these parameters are not used in computation. This can further improve memory efficiency at the cost of data transfer overhead between host and device. - -The example below shows how FSDP is wrapped using auto wrapping. - -```python -from torch.distributed.fsdp import ( - FullyShardedDataParallel, - CPUOffload, -) -from torch.distributed.fsdp.wrap import ( - default_auto_wrap_policy, -) -import torch.nn as nn - -class model(nn.Module): - def __init__(self): - super().__init__() - self.layer1 = nn.Linear(8, 4) - self.layer2 = nn.Linear(4, 16) - self.layer3 = nn.Linear(16, 4) - -model = DistributedDataParallel(model()) -fsdp_model = FullyShardedDataParallel( - model(), - fsdp_auto_wrap_policy=default_auto_wrap_policy, - cpu_offload=CPUOffload(offload_params=True), -) -``` - -#### Manual Wrapping - -Manual wrapping can be useful to explore complex sharding strategies by applying `wrap` selectively to some parts of the model. Overall settings can be passed to the enable_wrap() context manager. 
- -```python -from torch.distributed.fsdp import ( - FullyShardedDataParallel, - CPUOffload, -) -from torch.distributed.fsdp.wrap import ( - enable_wrap, - wrap, -) -import torch.nn as nn -from typing import Dict - - -class model(nn.Module): - def __init__(self): - super().__init__() - self.layer1 = wrap(nn.Linear(8, 4)) - self.layer2 = nn.Linear(4, 16) - self.layer3 = wrap(nn.Linear(16, 4)) - -wrapper_kwargs = Dict(cpu_offload=CPUOffload(offload_params=True)) -with enable_wrap(wrapper_cls=FullyShardedDataParallel, **wrapper_kwargs): - fsdp_model = wrap(model()) -``` - -After wrapping the model with FSDP using one of the two above approaches, the model can be trained in a similar way as local training, like this: - -```python -optim = torch.optim.Adam(fsdp_model.parameters(), lr=0.0001) -for sample, label in next_batch(): - out = fsdp_model(input) - loss = criterion(out, label) - loss.backward() - optim.step() -``` - -### Benchmark Results - -We ran extensive scaling tests for 175B and 1T GPT models on AWS clusters using PyTorch FSDP. Each cluster node is an instance with 8 [NVIDIA A100-SXM4-40GB](https://www.nvidia.com/content/dam/en-zz/Solutions/Data-Center/a100/pdf/nvidia-a100-datasheet-us-nvidia-1758950-r4-web.pdf) GPUs, and inter-nodes are connected via AWS Elastic Fabric Adapter (EFA) with 400 Gbps network bandwidth. - -GPT models are implemented using [minGPT](https://github.com/karpathy/minGPT). A randomly generated input dataset is used for benchmarking purposes. All experiments ran with 50K vocabulary size, fp16 precision and [SGD](https://pytorch.org/docs/stable/generated/torch.optim.SGD.html) optimizer. - -| Model | Number of layers | Hidden size | Attention heads | Model size, billions of parameters | -|----------|------------------|-------------|-----------------|------------------------------------| -| GPT 175B | 96 | 12288 | 96 | 175 | -| GPT 1T | 128 | 25600 | 160 | 1008 | - -In addition to using FSDP with parameters CPU offloading in the experiments, the [activation checkpointing feature](https://pytorch.org/docs/stable/checkpoint.html) in PyTorch is also applied in the tests. - -The maximum per-GPU throughput of 159 teraFLOP/s (51% of NVIDIA A100 peak theoretical performance 312 teraFLOP/s/GPU) is achieved with batch size 20 and sequence length 512 on 128 GPUs for the GPT 175B model; further increase of the number of GPUs leads to per-GPU throughput degradation because of growing communication between the nodes. - -For the GPT 1T model, the maximum per-GPU throughput of 84 teraFLOP/s (27% of the peak teraFLOP/s) is achieved with batch size 4 and sequence length 2048 on 128 GPUs. However, further increase of the number of GPUs doesn’t affect the per-GPU throughput too much because we observed that the largest bottleneck in the 1T model training is not from communication but from the slow CUDA cache allocator when peak GPU memory is reaching the limit. The use of A100 80G GPUs with larger memory capacity will mostly resolve this issue and also help scale the batch size to achieve much larger throughput. - -

        - -

        - -

        - -

        - -### Future Work - -In the next beta release, we are planning to add efficient distributed model/states checkpointing APIs, meta device support for large model materialization, and mixed-precision support inside FSDP computation and communication. We’re also going to make it easier to switch between [DDP](https://pytorch.org/tutorials/intermediate/ddp_tutorial.html), [ZeRO1, ZeRO2](https://arxiv.org/abs/1910.02054) and FSDP flavors of data parallelism in the new API. To further improve FSDP performance, memory fragmentation reduction and communication efficiency improvements are also planned. - -### A Bit of History of 2 Versions of FSDP - -[FairScale FSDP](https://engineering.fb.com/2021/07/15/open-source/fsdp/) was released in early 2021 as part of the FairScale library. And then we started the effort to upstream FairScale FSDP to PyTorch in PT 1.11, making it production-ready. We have selectively upstreamed and refactored key features from FairScale FSDP, redesigned user interfaces and made performance improvements. - -In the near future, FairScale FSDP will stay in the FairScale repository for research projects, while generic and widely adopted features will be upstreamed to PyTorch incrementally and hardened accordingly. - -Meanwhile, PyTorch FSDP will focus more on production readiness and long-term support. This includes better integration with ecosystems and improvements on performance, usability, reliability, debuggability and composability. - -### Acknowledgments - -We would like to thank the authors of FairScale FSDP: Myle Ott, Sam Shleifer, Min Xu, Priya Goyal, Quentin Duval, Vittorio Caggiano, Tingting Markstrum, Anjali Sridhar. Thanks to the Microsoft DeepSpeed ZeRO team for developing and popularizing sharded data parallel techniques. Thanks to Pavel Belevich, Jessica Choi, Sisil Mehta for running experiments using PyTorch FSDP on different clusters. Thanks to Geeta Chauhan, Mahesh Yadav, Pritam Damania, Dmytro Dzhulgakov for supporting this effort and insightful discussions. diff --git a/_posts/2022-3-16-running-pytorch-models-on-jetson-nano.md b/_posts/2022-3-16-running-pytorch-models-on-jetson-nano.md deleted file mode 100644 index 25e3c905943b..000000000000 --- a/_posts/2022-3-16-running-pytorch-models-on-jetson-nano.md +++ /dev/null @@ -1,271 +0,0 @@ ---- -layout: blog_detail -title: 'Running PyTorch Models on Jetson Nano' -author: Jeff Tang, Hamid Shojanazeri, Geeta Chauhan -featured-img: 'assets/images/pytorch-logo.jpg' ---- - -### Overview -NVIDIA [Jetson Nano](https://developer.nvidia.com/embedded/jetson-nano-developer-kit), part of the [Jetson family of products](https://developer.nvidia.com/embedded/jetson-modules) or Jetson modules, is a small yet powerful Linux (Ubuntu) based embedded computer with 2/4GB GPU. With it, you can run many PyTorch models efficiently. This document summarizes our experience of running different deep learning models using 3 different mechanisms on Jetson Nano: - - 1. Jetson Inference the higher-level NVIDIA API that has built-in support for running most common computer vision models which can be transfer-learned with PyTorch on the Jetson platform. - - 2. TensorRT, an SDK for high-performance inference from NVIDIA that requires the conversion of a PyTorch model to ONNX, and then to the TensorRT engine file that the TensorRT runtime can run. - - 3. PyTorch with the direct PyTorch API `torch.nn` for inference. 
- -### Setting up Jetson Nano -After purchasing a Jetson Nano [here](https://developer.nvidia.com/buy-jetson?product=jetson_nano&location=US), simply follow the clear step-by-step [instructions](https://developer.nvidia.com/embedded/learn/get-started-jetson-nano-devkit) to download and write the Jetson Nano Developer Kit SD Card Image to a microSD card, and complete the setup. After the setup is done and the Nano is booted, you’ll see the standard Linux prompt along with the username and the Nano name used in the setup. - -To check the GPU status on Nano, run the following commands: - -``` -sudo pip3 install jetson-stats -sudo jtop -``` - -You’ll see information, including: - -
        - -
        - -You can also see the installed CUDA version: - -``` -$ ls -lt /usr/local -lrwxrwxrwx 1 root root 22 Aug 2 01:47 cuda -> /etc/alternatives/cuda -lrwxrwxrwx 1 root root 25 Aug 2 01:47 cuda-10 -> /etc/alternatives/cuda-10 -drwxr-xr-x 12 root root 4096 Aug 2 01:47 cuda-10.2 -``` - -To use a camera on Jetson Nano, for example, Arducam 8MP IMX219, follow the instructions [here](https://www.arducam.com/docs/camera-for-jetson-nano/mipi-camera-modules-for-jetson-nano/driver-installation/) or run the commands below after [installing a camera module](https://developer.nvidia.com/embedded/learn/jetson-nano-2gb-devkit-user-guide#id-.JetsonNano2GBDeveloperKitUserGuidevbatuu_v1.0-Camera): - -``` -cd ~ -wget https://github.com/ArduCAM/MIPI_Camera/releases/download/v0.0.3/install_full.sh -chmod +x install_full.sh -./install_full.sh -m arducam -``` - -Another way to do this is to use the original Jetson Nano camera driver: - -``` -sudo dpkg -r arducam-nvidia-l4t-kernel -sudo shutdown -r now -``` - -Then, use ls /dev/video0 to confirm the camera is found: - -``` -$ ls /dev/video0 -/dev/video0 -``` - -And finally, the following command to see the camera in action: - -``` -nvgstcapture-1.0 --orientation=2 -``` - -### Using Jetson Inference -NVIDIA [Jetson Inference](https://github.com/dusty-nv/jetson-inference) API offers the easiest way to run image recognition, object detection, semantic segmentation, and pose estimation models on Jetson Nano. Jetson Inference has TensorRT built-in, so it’s very fast. - -To test run Jetson Inference, first clone the repo and download the models: - -``` -git clone --recursive https://github.com/dusty-nv/jetson-inference -cd jetson-inference -``` - -Then use the pre-built [Docker Container](https://github.com/dusty-nv/jetson-inference/blob/master/docs/jetpack-setup-2.md) that already has PyTorch installed to test run the models: - -``` -docker/run.sh --volume ~/jetson_inference:/jetson_inference -``` - -To run image recognition, object detection, semantic segmentation, and pose estimation models on test images, use the following: - -``` -cd build/aarch64/bin -./imagenet.py images/jellyfish.jpg /jetson_inference/jellyfish.jpg -./segnet.py images/dog.jpg /jetson_inference/dog.jpeg -./detectnet.py images/peds_0.jpg /jetson_inference/peds_0.jpg -./posenet.py images/humans_0.jpg /jetson_inference/pose_humans_0.jpg -``` - -Four result images from running the four different models will be generated. Exit the docker image to see them: - -``` -$ ls -lt ~/jetson_inference/ --rw-r--r-- 1 root root 68834 Oct 15 21:30 pose_humans_0.jpg --rw-r--r-- 1 root root 914058 Oct 15 21:30 peds_0.jpg --rw-r--r-- 1 root root 666239 Oct 15 21:30 dog.jpeg --rw-r--r-- 1 root root 179760 Oct 15 21:29 jellyfish.jpg -``` - - -
        - Using jest interface example 1 - Using jest interface example 2 -
        - - -
        - Using jest interface example 3 - Using jest interface example 4 -
        - -You can also use the docker image to run PyTorch models because the image has PyTorch, torchvision and torchaudio installed: - -``` -# pip list|grep torch -torch (1.9.0) -torchaudio (0.9.0a0+33b2469) -torchvision (0.10.0a0+300a8a4) -``` - -Although Jetson Inference includes models already converted to the TensorRT engine file format, you can fine-tune the models by following the steps in Transfer Learning with PyTorch (for Jetson Inference) [here](https://github.com/dusty-nv/jetson-inference/blob/master/docs/pytorch-transfer-learning.md). - -### Using TensorRT -[TensorRT](https://docs.nvidia.com/deeplearning/tensorrt/) is an SDK for high-performance inference from NVIDIA. Jetson Nano supports TensorRT via the Jetpack SDK, included in the SD Card image used to set up Jetson Nano. To confirm that TensorRT is already installed in Nano, `run dpkg -l|grep -i tensorrt`: - - -
        - -
        - -Theoretically, TensorRT can be used to “take a trained PyTorch model and optimize it to run more efficiently during inference on an NVIDIA GPU.” Follow the instructions and code in the [notebook](https://github.com/NVIDIA/TensorRT/blob/master/quickstart/IntroNotebooks/4.%20Using%20PyTorch%20through%20ONNX.ipynb) to see how to use PyTorch with TensorRT through ONNX on a torchvision Resnet50 model: - -1. How to convert the model from PyTorch to ONNX; - -2. How to convert the ONNX model to a TensorRT engine file; - -3. How to run the engine file with the TensorRT runtime for performance improvement: inference time improved from the original 31.5ms/19.4ms (FP32/FP16 precision) to 6.28ms (TensorRT). - -You can replace the Resnet50 model in the notebook code with another PyTorch model, go through the conversion process above, and run the finally converted model TensorRT engine file with the TensorRT runtime to see the optimized performance. But be aware that due to the Nano GPU memory size, models larger than 100MB are likely to fail to run, with the following error information: - -`Error Code 1: Cuda Runtime (all CUDA-capable devices are busy or unavailable)` - -You may also see an error when converting a PyTorch model to ONNX model, which may be fixed by replacing: - -`torch.onnx.export(resnet50, dummy_input, "resnet50_pytorch.onnx", verbose=False)` - -with: - -`torch.onnx.export(model, dummy_input, "deeplabv3_pytorch.onnx", opset_version=11, verbose=False)` - -### Using PyTorch -First, to download and install PyTorch 1.9 on Nano, run the following commands (see [here](https://forums.developer.nvidia.com/t/pytorch-for-jetson-version-1-10-now-available/72048) for more information): - -``` -wget https://nvidia.box.com/shared/static/p57jwntv436lfrd78inwl7iml6p13fzh.whl -O torch-1.8.0-cp36-cp36m-linux_aarch64.whl -O torch-1.9.0-cp36-cp36m-linux_aarch64.whl -sudo apt-get install python3-pip libopenblas-base libopenmpi-dev -pip3 install Cython -pip3 install numpy torch-1.9.0-cp36-cp36m-linux_aarch64.whl -``` - -To download and install torchvision 0.10 on Nano, run the commands below: - -``` -https://drive.google.com/uc?id=1tU6YlPjrP605j4z8PMnqwCSoP6sSC91Z -pip3 install torchvision-0.10.0a0+300a8a4-cp36-cp36m-linux_aarch64.whl -``` - -After the steps above, run this to confirm: -``` -$ pip3 list|grep torch -torch (1.9.0) -torchvision (0.10.0) -``` - -You can also use the docker image described in the section *Using Jetson Inference* (which also has PyTorch and torchvision installed), to skip the manual steps above. - -The official [YOLOv5](https://github.com/ultralytics/yolov5) repo is used to run the PyTorch YOLOv5 model on Jetson Nano. After logging in to Jetson Nano, follow the steps below: - -* Get the repo and install what’s required: - -``` -git clone https://github.com/ultralytics/yolov5 -cd yolov5 -pip install -r requirements.txt -``` - -* Run `python3 detect.py`, which by default uses the PyTorch yolov5s.pt model. You should see something like: - -``` -detect: weights=yolov5s.pt, source=data/images, imgsz=[640, 640], conf_thres=0.25, iou_thres=0.45, max_det=1000, device=, view_img=False, save_txt=False, save_conf=False, save_crop=False, nosave=False, classes=None, agnostic_nms=False, augment=False, visualize=False, update=False, project=runs/detect, name=exp, exist_ok=False, line_thickness=3, hide_labels=False, hide_conf=False, half=False -YOLOv5 🚀 v5.0-499-g48b00db torch 1.9.0 CUDA:0 (NVIDIA Tegra X1, 3956.1015625MB) - -Fusing layers... 
-Model Summary: 224 layers, 7266973 parameters, 0 gradients -image 1/5 /home/jeff/repos/yolov5-new/yolov5/data/images/bus.jpg: 640x480 4 persons, 1 bus, 1 fire hydrant, Done. (0.142s) -... -``` - -**The inference time on Jetson Nano GPU is about 140ms, more than twice as fast as the inference time on iOS or Android (about 330ms).** - -If you get an error `“ImportError: The _imagingft C module is not installed.”` then you need to reinstall pillow: -``` -sudo apt-get install libpng-dev -sudo apt-get install libfreetype6-dev -pip3 uninstall pillow -pip3 install --no-cache-dir pillow -``` - -After successfully completing the `python3 detect.py` run, the object detection results of the test images located in `data/images` will be in the `runs/detect/exp` directory. To test the detection with a live webcam instead of local images, use the `--source 0` parameter when running `python3 detect.py`): - -``` -~/repos/yolov5$ ls -lt runs/detect/exp10 -total 1456 --rw-rw-r-- 1 jeff jeff 254895 Oct 15 16:12 zidane.jpg --rw-rw-r-- 1 jeff jeff 202674 Oct 15 16:12 test3.png --rw-rw-r-- 1 jeff jeff 217117 Oct 15 16:12 test2.jpg --rw-rw-r-- 1 jeff jeff 305826 Oct 15 16:12 test1.png --rw-rw-r-- 1 jeff jeff 495760 Oct 15 16:12 bus.jpg -``` - -Using the same test files used in the PyTorch iOS YOLOv5 demo app or Android YOLOv5 demo app, you can compare the results generated with running the YOLOv5 PyTorch model on mobile devices and Jetson Nano: - -
        - PyTorch YOLOv5 on Jetson Nano, example with a dog - PyTorch YOLOv5 on Jetson Nano, example with a horse and a rider -
        -Figure 1. PyTorch YOLOv5 on Jetson Nano. - -
        - PyTorch YOLOv5 on iOS, example with a dog - PyTorch YOLOv5 on iOS, example with a horse and a rider -
        -Figure 2. PyTorch YOLOv5 on iOS. - -
        - PyTorch YOLOv5 on Android, example with a dog - PyTorch YOLOv5 on Android, example with a horse and a rider -
        -Figure 3. PyTorch YOLOv5 on Android. - -### Summary -Based on our experience of running different PyTorch models for potential demo apps on Jetson Nano, we see that even Jetson Nano, a lower-end of the Jetson family of products, provides a powerful GPU and embedded system that can directly run some of the latest PyTorch models, pre-trained or transfer learned, efficiently. - -Building PyTorch demo apps on Jetson Nano can be similar to building PyTorch apps on Linux, but you can also choose to use TensorRT after converting the PyTorch models to the TensorRT engine file format. - -But if you just need to run some common computer vision models on Jetson Nano using NVIDIA’s Jetson Inference which supports image recognition, object detection, semantic segmentation, and pose estimation models, then this is the easiest way. - - -### References -Torch-TensorRT, a compiler for PyTorch via TensorRT: -[https://github.com/NVIDIA/Torch-TensorRT/](https://github.com/NVIDIA/Torch-TensorRT/) - -Jetson Inference docker image details: -[https://github.com/dusty-nv/jetson-inference/blob/master/docs/aux-docker.md](https://github.com/dusty-nv/jetson-inference/blob/master/docs/aux-docker.md) - -A guide to using TensorRT on the NVIDIA Jetson Nano: -[https://docs.donkeycar.com/guide/robot_sbc/tensorrt_jetson_nano/](https://docs.donkeycar.com/guide/robot_sbc/tensorrt_jetson_nano/) -including: - -1. Use Jetson as a portable GPU device to run an NN chess engine model: -[https://medium.com/@ezchess/jetson-lc0-running-leela-chess-zero-on-nvidia-jetson-a-portable-gpu-device-a213afc9c018](https://medium.com/@ezchess/jetson-lc0-running-leela-chess-zero-on-nvidia-jetson-a-portable-gpu-device-a213afc9c018) - -2. A MaskEraser app using PyTorch and torchvision, installed directly with pip: -[https://github.com/INTEC-ATI/MaskEraser#install-pytorch](https://github.com/INTEC-ATI/MaskEraser#install-pytorch) diff --git a/_posts/2022-3-2-understanding-lazytensor-system-performance-with-pytorch-xla-on-cloud-tpu.md b/_posts/2022-3-2-understanding-lazytensor-system-performance-with-pytorch-xla-on-cloud-tpu.md deleted file mode 100644 index c6c5eecb9443..000000000000 --- a/_posts/2022-3-2-understanding-lazytensor-system-performance-with-pytorch-xla-on-cloud-tpu.md +++ /dev/null @@ -1,195 +0,0 @@ ---- -layout: blog_detail -title: "Understanding LazyTensor System Performance with PyTorch/XLA on Cloud TPU" -author: Vaibhav Singh -featured-img: "" ---- - -## Introduction - -Ease of use, expressivity, and debuggability are among the core principles of PyTorch. One of the key drivers for the ease of use is that PyTorch execution is by default “eager, i.e. op by op execution preserves the imperative nature of the program. However, eager execution does not offer the compiler based optimization, for example, the optimizations when the computation can be expressed as a graph. - -LazyTensor [[1]], first introduced with PyTorch/XLA, helps combine these seemingly disparate approaches. While PyTorch eager execution is widely used, intuitive, and well understood, lazy execution is not as prevalent yet. - -In this post we will explore some of the basic concepts of the LazyTensor System with the goal of applying these concepts to understand and debug performance of LazyTensor based implementations in PyTorch. Although we will use PyTorch/XLA on Cloud TPU as the vehicle for exploring these concepts, we hope that these ideas will be useful to understand other system(s) built on LazyTensors. 
- -## LazyTensor - -Any operation performed on a PyTorch tensor is by default dispatched as a kernel or a composition of kernels to the underlying hardware. These kernels are executed asynchronously on the underlying hardware. The program execution is not blocked until the value of a tensor is fetched. This approach scales extremely well with massively parallel programmed hardware such as GPUs. - -The starting point of a LazyTensor system is a custom tensor type. In PyTorch/XLA, this type is called XLA tensor. In contrast to PyTorch’s native tensor type, operations performed on XLA tensors are recorded into an IR graph. Let’s examine an example that sums the product of two tensors: - -```python -import torch -import torch_xla -import torch_xla.core.xla_model as xm - -dev = xm.xla_device() - -x1 = torch.rand((3, 3)).to(dev) -x2 = torch.rand((3, 8)).to(dev) - -y1 = torch.einsum('bs,st->bt', x1, x2) -print(torch_xla._XLAC._get_xla_tensors_text([y1])) -``` - -You can execute [this](https://github.com/ultrons/xla/blob/lazy-tensor-post/contrib/colab/LazyTensor_Basics.ipynb) colab notebook to examine the resulting graph for y1. Notice that no computation has been performed yet. - -```python -y1 = y1 + x2 -print(torch_xla._XLAC._get_xla_tensors_text([y1])) -``` - -The operations will continue until PyTorch/XLA encounters a barrier. This barrier can either be a [mark step()](https://github.com/pytorch/xla/blob/ff079bb48744e5aa6696201ccf34057f15fc7cac/torch_xla/core/xla_model.py#L751) api call or any other event which forces the execution of the graph recorded so far. - -```python -xm.mark_step() -print(torch_xla._XLAC._get_xla_tensors_text([y1])) -``` - -Once the mark_step() is called, the graph is compiled and then executed on TPU, i.e. the tensors have been materialized. Therefore, the graph is now reduced to a single line y1 tensor which holds the result of the computation. - -### Compile Once, Execute Often - -XLA compilation passes offer optimizations (e.g. op-fusion, which reduces HBM pressure by using scratch-pad memory for multiple ops, [ref](https://arxiv.org/pdf/2004.13336.pdf) ) and leverages lower level XLA infrastructure to optimally use the underlying hardware. However, there is one caveat, compilation passes are expensive, i.e. can add to the training step time. Therefore, this approach scales well if and only if we can **compile once and execute often** (compilation cache helps, such that the same graph is not compiled more than once). - -In the following example, we create a small computation graph and time the execution: - -```python -y1 = torch.rand((3, 8)).to(dev) -def dummy_step() : - y1 = torch.einsum('bs,st->bt', y1, x) - xm.mark_step() - return y1 -``` - -```python -%timeit dummy_step -``` - -```python -The slowest run took 29.74 times longer than the fastest. This could mean that an intermediate result is being cached. -10000000 loops, best of 5: 34.2 ns per loop -``` - -You notice that the slowest step is quite longer than the fastest. This is because of the graph compilation overhead which is incurred only once for a given shape of graph, input shape, and output shape. Subsequent steps are faster because no graph compilation is necessary. - -This also implies that we expect to see performance cliffs when the “compile once and execute often” assumption breaks. Understanding when this assumption breaks is the key to understanding and optimizing the performance of a LazyTensor system. Let’s examine what triggers the compilation. 
- -### Graph Compilation and Execution and LazyTensor Barrier - -We saw that the computation graph is compiled and executed when a LazyTensor barrier is encountered. There are three scenarios when the LazyTensor barrier is automatically or manually introduced. The first is the explicit call of mark_step() api as shown in the preceding example. mark_step() is also called implicitly at every step when you wrap your dataloader with MpDeviceLoader (highly recommended to overlap compute and data upload to TPU device). The [Optimizer step](https://github.com/pytorch/xla/blob/master/torch_xla/core/xla_model.py#L804) method of xla_model also allows to implicitly call mark_step (when you set barrier=True). - -The second scenario where a barrier is introduced is when PyTorch/XLA finds an op with no mapping (lowering) to equivalent XLA HLO ops. PyTorch has [2000+](https://dev-discuss.pytorch.org/t/where-do-the-2000-pytorch-operators-come-from-more-than-you-wanted-to-know/373) operations. Although most of these operations are composite (i.e. can be expressed in terms of other fundamental operations), some of these operations do not have corresponding lowering in XLA. - -

        - -

        - -What happens when an op with no XLA lowering is used? PyTorch XLA stops the operation recording and cuts the graph(s) leading to the input(s) of the unlowered op. This cut graph is then compiled and dispatched for execution. The results (materialized tensor) of execution are sent back from device to host, the unlowered op is then executed on the host (cpu), and then downstream LazyTensor operations creating a new graph(s) until a barrier is encountered again. - -The third and final scenario which results in a LazyTensor barrier is when there is a control structure/statement or another method which requires the value of a tensor. This statement would at the minimum cause the execution of the computation graph leading to the tensor (if the graph has already been seen) or cause compilation and execution of both. - -Other examples of such methods include .item(), isEqual(). In general, any operation that maps Tensor -> Scalar will cause this behavior. - -### Dynamic Graph - -As illustrated in the preceding section, graph compilation cost is amortized if the same shape of the graph is executed many times. It’s because the compiled graph is cached with a hash derived from the graph shape, input shape, and the output shape. If these shapes change it will trigger compilation, and too frequent compilation will result in training time degradation. - -Let’s consider the following example: - -```python -def dummy_step(x, y, loss, acc=False): - z = torch.einsum('bs,st->bt', y, x) - step_loss = z.sum().view(1,) - if acc: - loss = torch.cat((loss, step_loss)) - else: - loss = step_loss - xm.mark_step() - return loss - - -import time -def measure_time(acc=False): - exec_times = [] - iter_count = 100 - x = torch.rand((512, 8)).to(dev) - y = torch.rand((512, 512)).to(dev) - loss = torch.zeros(1).to(dev) - for i in range(iter_count): - tic = time.time() - loss = dummy_step(x, y, loss, acc=acc) - toc = time.time() - exec_times.append(toc - tic) - return exec_times - -dyn = measure_time(acc=True) # acc= True Results in dynamic graph -st = measure_time(acc=False) # Static graph, computation shape, inputs and output shapes don't change - -import matplotlib.pyplot as plt -plt.plot(st, label = 'static graph') -plt.plot(dyn, label = 'dynamic graph') -plt.legend() -plt.title('Execution time in seconds') -``` - -

        - -

        - -Note that static and dynamic cases have the same computation but dynamic graph compiles every time, leading to the higher overall run-time. In practice, the training step with recompilation can sometimes be an order of magnitude or slower. In the next section we discuss some of the PyTorch/XLA tools to debug training degradation. - -### Profiling Training Performance with PyTorch/XLA - -PyTorch/XLA profiling consists of two major components. First is the client side profiling. This feature is turned on by simply setting the environment variable PT_XLA_DEBUG to 1. Client side profiling points to unlowered ops or device-to-host transfer in your source code. Client side profiling also reports if there are too frequent compilations happening during the training. You can explore some metrics and counters provided by PyTorch/XLA in conjunction with the profiler in [this](https://github.com/ultrons/xla/blob/lazy-tensor-post/contrib/colab/Exploring_LazyTensor_with_Debug_Metrics.ipynb) notebook. - -The second component offered by PyTorch/XLA profiler is the inline trace annotation. For example: - -```python -import torch_xla.debug.profiler as xp - -def train_imagenet(): - print('==> Preparing data..') - img_dim = get_model_property('img_dim') - .... - server = xp.start_server(3294) - def train_loop_fn(loader, epoch): - .... - model.train() - for step, (data, target) in enumerate(loader): - with xp.StepTrace('Train_Step', step_num=step): - .... - if FLAGS.amp: - .... - else: - with xp.Trace('build_graph'): - output = model(data) - loss = loss_fn(output, target) - loss.backward() - xm.optimizer_step(optimizer) -``` - -Notice the start_server api call. The port number that you have used here is the same port number you will use with the tensorboard profiler in order to view the op trace similar to: - -

        - -

        - -Op trace along with the client-side debugging function is a powerful set of tools to debug and optimize your training performance with PyTorch/XLA. For more detailed instructions on the profiler usage, the reader is encouraged to explore blogs [part-1](https://cloud.google.com/blog/topics/developers-practitioners/pytorchxla-performance-debugging-tpu-vm-part-1), [part-2](https://cloud.google.com/blog/topics/developers-practitioners/pytorchxla-performance-debugging-cloud-tpu-vm-part-ii), and [part-3](https://cloud.google.com/blog/topics/developers-practitioners/pytorchxla-performance-debugging-cloud-tpu-vm-part-iii) of the blog series on PyTorch/XLA performance debugging. - -### Summary - -In this article we have reviewed the fundamentals of the LazyTensor system. We built on those fundamentals with PyTorch/XLA to understand the potential causes of training performance degradation. We discussed why “compile once and execute often” helps to get the best performance on LazyTensor systems, and why training slows down when this assumption breaks. - -We hope that PyTorch users will find these insights helpful for their novel works with LazyTensor systems. - -### Acknowledgements - -A big thank you to my outstanding colleagues Jack Cao, Milad Mohammedi, Karl Weinmeister, Rajesh Thallam, Jordan Tottan (Google) and Geeta Chauhan (Meta) for their meticulous reviews and feedback. And thanks to the extended PyTorch/XLA development team from Google, Meta, and the open source community to make PyTorch possible on TPUs. And finally, thanks to the authors of the [LazyTensor paper](https://arxiv.org/pdf/2102.13267.pdf) not only for developing LazyTensor but also for writing such an accessible paper. - -## Refrences - -[[1]] LazyTensor: combining eager execution with domain-specific compilers - -[1]: https://arxiv.org/pdf/2102.13267.pdf diff --git a/_posts/2022-5-12-ambient-clinical-intelligence-generating-medical-reports-with-pytorch.md b/_posts/2022-5-12-ambient-clinical-intelligence-generating-medical-reports-with-pytorch.md deleted file mode 100644 index 0952301db8bd..000000000000 --- a/_posts/2022-5-12-ambient-clinical-intelligence-generating-medical-reports-with-pytorch.md +++ /dev/null @@ -1,274 +0,0 @@ ---- -layout: blog_detail -title: "Ambient Clinical Intelligence: Generating Medical Reports with PyTorch" -author: Miguel Del-Agua, Principal Research Scientist, Nuance and Jeremy Jancsary, Senior Principal Research Scientist, Nuance -featured-img: "" ---- - -## Introduction - -Complete and accurate clinical documentation is an essential tool for tracking patient care. It allows for treatment plans to be shared among care teams to aid in continuity of care and ensures a transparent and effective process for reimbursement. - -Physicians are responsible for documenting patient care. Traditional clinical documentation methods have resulted in a sub-par patient-provider experience, less time interacting with patients, and decreased work-life balance. A significant amount of physicians’ time is spent in front of the computer doing administrative tasks. As a result, patients are less satisfied with the overall experience, and physicians, who prepare for years studying medicine, cannot practice at the top of their license and are burned out. Every hour physicians provide direct clinical face time to patients results in nearly two additional hours spent on EHR and desk work within the clinic day. 
Outside office hours, physicians [spend another 1 to 2 hours of personal](https://www.acpjournals.org/doi/10.7326/m16-0961) time each night doing additional computer and other clerical work. - -* [42% of all physicians reported having burnout. – Medscape](https://www.medscape.com/slideshow/2020-lifestyle-burnout-6012460) -* [The problem has grown worse due to the pandemic with 64% of U.S. physicians now reporting burnout. - AAFP](https://www.aafp.org/journals/fpm/blogs/inpractice/entry/covid_burnout_survey.html#:~:text=Physician%20burnout%20was%20already%20a,5%2C000%20%E2%80%94%20practice%20in%20the%20U.S.) -* ["Too many bureaucratic tasks e.g., charting and paperwork" is the leading contribution to burnout, increased computerization ranks 4th.](https://login.medscape.com/login/sso/getlogin?urlCache=aHR0cHM6Ly93d3cubWVkc2NhcGUuY29tL3NsaWRlc2hvdy8yMDIwLWxpZmVzdHlsZS1idXJub3V0LTYwMTI0NjA%3D&ac=401) - Medscape -* [75% of U.S. Consumers Wish Their Healthcare Experiences Were More Personalized,](https://www.businesswire.com/news/home/20200218005006/en/75-of-U.S.-Consumers-Wish-Their-Healthcare-Experiences-Were-More-Personalized-Redpoint-Global-Survey-Reveals)- Business Wire -* [61% of patients would visit their healthcare provider more often if the communication experience felt more personalized.](https://www.businesswire.com/news/home/20200218005006/en/75-of-U.S.-Consumers-Wish-Their-Healthcare-Experiences-Were-More-Personalized-Redpoint-Global-Survey-Reveals) – Business Wire - -Physician burnout is one of the primary causes for increased [medical errors](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6175626/), malpractice suits, turnover, and decreased access to care. Burnout leads to an increase in healthcare costs and a decrease in overall patient satisfaction. [Burnout costs the United States $4.6 billion a year.](https://www.nejm.org/doi/full/10.1056/NEJMp2003149) - -What can we do to bring back trust, joy, and humanity to the delivery of healthcare? A significant portion of the administrative work consists of entering patient data into Electronic Health Records (EHRs) and creating clinical documentation. Clinical documentation is created from information already in the EHR as well as from the patient-provider encounter conversation. - -This article will showcase how the Nuance Dragon Ambient eXperience (DAX), an AI-powered, voice-enabled, ambient clinical intelligence solution, automatically documents patient encounters accurately and efficiently at the point of care and the technologies that enable it. - -Nuance DAX enhances the quality of care and patient experience, increases provider efficiency and satisfaction, and improves financial outcomes. It can be used in office and telehealth settings in all ambulatory specialties, including primary and urgent care. - -

        - -

        - -## Natural Language Processing - -Natural Language Processing (NLP) is one of the most challenging fields in Artificial Intelligence (AI). It comprehends a set of algorithms that allow computers to understand or generate the language used by humans. These algorithms can process and analyze vast amounts of natural language data from different sources (either sound or text) to build models that can understand, classify, or even generate natural language as humans would. Like other fields in AI, NLP has significantly progressed thanks to the advent of Deep Learning (DL), which has resulted in models that can obtain results on par with humans in some tasks. - -These advanced NLP techniques are being applied in healthcare. During a typical patient-provider encounter, a conversation ensues where the doctor constructs, through questions and answers, a chronological description of the development of the patient's presenting illness or symptoms. A physician examines the patient and makes clinical decisions to establish a diagnosis and determine a treatment plan. This conversation, and data in the EHR, provide the required information for physicians to generate the clinical documentation, referred to as medical reports. - -Two main NLP components play a role in automating the creation of clinical documentation. The first component, Automatic Speech Recognition (ASR), is used to translate speech into text. It takes the audio recording of the encounter and generates a conversation transcription (cf. Figure 2). The second component, Automatic Text Summarization, helps generate summaries from large text documents. This component is responsible for understanding and capturing the nuances and most essential aspects from the transcribed conversation into a final report in narrative form (cf. Figure 3), structured form, or a combination of both. - -We will focus on this second component, Automatic Text Summarization, which is a difficult task with many challenges: - -* Its performance is tied to the ASR quality from multiple speakers (noisy input). -* The input is conversational in nature and contains layman's terms. -* Protected Health Information (PHI) regulations limit medical data access. -* The information for one output sentence is potentially spread across multiple conversation turns. -* There is no explicit sentence alignment between input and output. -* Various medical specialties, encounter types, and EHR systems constitute a broad and complex output space. -* Physicians have different styles of conducting encounters and have their preferences for medical reports; there is no standard. -* Standard summarization metrics might differ from human judgment of quality. - -

Figure 2: Transcript of a patient-doctor conversation

Figure 3: Excerpt of an AI-generated medical report. HPI stands for History of present illness.

        - -## Text Summarization with PyTorch and Fairseq - -[PyTorch](https://pytorch.org/) is an open-source machine learning framework developed by Facebook that helps researchers prototype Deep Learning models. The [Fairseq](https://github.com/pytorch/fairseq) toolkit is built on top of PyTorch and focuses on sequence generation tasks, such as Neural Machine Translation (NMT) or Text Summarization. Fairseq features an active community that is continuously providing reference implementations of state-of-the-art models. It contains many built-in components (model architectures, modules, loss functions, and optimizers) and is easily extendable with plugins. - -Text summarization constitutes a significant challenge in NLP. We need models capable of generating a short version of a document while retaining the key points and avoiding uninformative content. These challenges can be addressed with different approaches. 1). Abstractive text summarization aimed at training models that can generate a summary in narrative form. 2). Extractive methods where the models are trained to select the most important parts from the input text. 3). A combination of the two, where the essential parts from the input are selected and then summarized in an abstractive fashion. Hence, summarization can be accomplished via a single end-to-end network or as a pipeline of extractive and abstractive components. To that end, Fairseq provides all the necessary tools to be successful in our endeavor. It features either end-to-end models such as the classical Transformer, different types of Language Models and pre-trained versions that enable researchers to focus on what matters most—to build state-of-the-art models that generate valuable reports. - -However, we are not just summarizing the transcribed conversation; we generate high-quality medical reports, which have many considerations. - -* Every section of a medical report is different in terms of content, structure, fluency, etc. -* All medical facts mentioned in the conversation should be present in the report, for example, a particular treatment or dosage. -* In the healthcare domain, the vocabulary is extensive, and models need to deal with medical terminology. -* Patient-doctor conversations are usually much longer than the final report. - -All these challenges require our researchers to run a battery of extensive experiments. Thanks to the flexibility of PyTorch and Fairseq, their productivity has greatly increased. Further, the ecosystem offers an easy path from ideation, implementation, experimentation, and final roll-out to production. Using multiple GPUs or CPUs is as simple as providing an additional argument to the tools, and because of the tight Python integration, PyTorch code can be easily debugged. - -In our continuous effort to contribute to the open-source community, features have been developed at Nuance and pushed to the Fairseq GitHub repository. These try to overcome some of the challenges mentioned such as, facilitating copying of, especially rare or unseen, words from the input to summary, training speedups by improving Tensor Core utilization, and ensuring TorchScript compatibility of different Transformer configurations. Following, we will show an example of how to train a Transformer model with a Pointer Generator mechanism (Transformer-PG), which can copy words from the input. 
## How to build a Transformer model with a Pointer Generator mechanism

In this step-by-step guide, it is assumed the user has already installed PyTorch and Fairseq.

### 1. Create a vocabulary and extend it with source position markers:

These markers will allow the model to point to any word in the input sequence.

```bash
vocab_size=<vocab_size>   # number of words to keep in the vocabulary
position_markers=512
export LC_ALL=C
cat train.src train.tgt |
  tr -s '[:space:]' '\n' |
  sort |
  uniq -c |
  sort -k1,1bnr -k2 |
  head -n "$((vocab_size - 4))" |
  awk '{ print $2 " " $1 }' > dict.pg.txt
python3 -c "[print('<unk-{}> 0'.format(n)) for n in range($position_markers)]" >> dict.pg.txt
```

This will create a file "dict.pg.txt" that contains the \<vocab_size\> most frequent words followed by 512 position markers named from "\<unk-0\>" to "\<unk-511\>".

In case we have an input like

```python
src = "Hello, I'm The Dogtor"
```

it could happen that our model has been trained without the word "Dogtor" in its vocabulary. Therefore, when we feed this sequence into the model, it should be converted to:

```python
src = "Hello, I'm The <unk-3>"
```

Now, "\<unk-3\>" is part of our vocabulary and could be predicted by the model (this is where the pointer-generator comes in). In such a case, we will only need to post-process the output to replace "\<unk-3\>" by the word at input position 3.

### 2. Preprocess the text data to replace unknown words by their positional markers:

We can use the scripts from [https://github.com/pytorch/fairseq/tree/master/examples/pointer_generator](https://github.com/pytorch/fairseq/tree/master/examples/pointer_generator).

```bash
# Considering we have our data in:
# train_src = /path/to/train.src
# train_tgt = /path/to/train.tgt
# valid_src = /path/to/valid.src
# valid_tgt = /path/to/valid.tgt
./preprocess.py --source /path/to/train.src \
                --target /path/to/train.tgt \
                --vocab <(cut -d' ' -f1 dict.pg.txt) \
                --source-out /path/to/train.pg.src \
                --target-out /path/to/train.pg.tgt

./preprocess.py --source /path/to/valid.src \
                --target /path/to/valid.tgt \
                --vocab <(cut -d' ' -f1 dict.pg.txt) \
                --source-out /path/to/valid.pg.src \
                --target-out /path/to/valid.pg.tgt

./preprocess.py --source /path/to/test.src \
                --vocab <(cut -d' ' -f1 dict.pg.txt) \
                --source-out /path/to/test.pg.src
```

### 3. Now let's binarize the data, so that it can be processed faster:

```bash
fairseq-preprocess --task "translation" \
                   --source-lang "pg.src" \
                   --target-lang "pg.tgt" \
                   --trainpref /path/to/train \
                   --validpref /path/to/valid \
                   --srcdict dict.pg.txt \
                   --cpu \
                   --joined-dictionary \
                   --destdir <data_dir>
```

You might notice the type of task is "translation". This is because there is no "summarization" task available; we could understand it as a kind of NMT task where the input and output languages are shared and the output (summary) is shorter than the input.
### 4. Now we can train the model:

```bash
fairseq-train <data_dir> \
    --save-dir <model_dir> \
    --task "translation" \
    --source-lang "src" \
    --target-lang "tgt" \
    --arch "transformer_pointer_generator" \
    --max-source-positions 512 \
    --max-target-positions 128 \
    --truncate-source \
    --max-tokens 2048 \
    --required-batch-size-multiple 1 \
    --required-seq-len-multiple 8 \
    --share-all-embeddings \
    --dropout 0.1 \
    --criterion "cross_entropy" \
    --optimizer adam \
    --adam-betas '(0.9, 0.98)' \
    --adam-eps 1e-9 \
    --update-freq 4 \
    --lr 0.004 \
    --alignment-layer -1 \
    --alignment-heads 1 \
    --source-position-markers 512   # Pointer Generator options: --alignment-layer, --alignment-heads, --source-position-markers
```

This configuration makes use of features Nuance has contributed back to Fairseq:

* Transformer with a Pointer Generator mechanism to facilitate copying of words from the input.
* Sequence length padded to a multiple of 8 to better use tensor cores and reduce training time.

### 5. Now let's take a look at how to generate a summary with our new medical report generation system:

```python
import torch
from examples.pointer_generator.pointer_generator_src.transformer_pg import TransformerPointerGeneratorModel

# Patient-Doctor conversation
input = "[doctor] Lisa Simpson, thirty six year old female, presents to the clinic today because " \
        "she has severe right wrist pain"

# Load the model
model = TransformerPointerGeneratorModel.from_pretrained(data_name_or_path="<data_dir>",
                                                         model_name_or_path="<model_dir>",
                                                         checkpoint_file="checkpoint_best.pt")

result = model.translate([input], beam=2)

print(result[0])
# Ms. is a 36-year-old female who presents to the clinic today for evaluation of her right wrist.
```

### 6. Alternatively, we can use fairseq-interactive and a postprocessing tool to substitute positional unknown tokens with the corresponding words from the input:

```bash
fairseq-interactive <data_dir> \
    --batch-size <batch_size> \
    --task translation \
    --source-lang src \
    --target-lang tgt \
    --path <model_dir>/checkpoint_last.pt \
    --input /path/to/test.pg.src \
    --buffer-size 20 \
    --max-len-a 0 \
    --max-len-b 128 \
    --beam 2 \
    --skip-invalid-size-inputs-valid-test | tee generate.out

grep "^H-" generate.out | cut -f 3- > generate.hyp

./postprocess.py \
    --source <(awk 'NF<512' /path/to/test.pg.src) \
    --target generate.hyp \
    --target-out generate.hyp.processed
```

Now we have the final set of reports in "generate.hyp.processed", with "\<unk-N\>" replaced by the original word from the input sequence.

## Model Deployment

PyTorch offers great flexibility in modeling and a rich surrounding ecosystem. However, while several recent articles have suggested that the use of PyTorch in research and academia may be close to surpassing TensorFlow, there seems to be an overall sense of TensorFlow being the preferred platform for deployment to production. Is this still the case in 2021? Teams looking to serve their PyTorch models in production have a few options.

Before describing our journey, let's take a brief detour and define the term model.

### Models as computation graphs

A few years back, it was still common for machine learning toolkits to support only particular classes of models of a rather fixed and rigid structure, with only a few degrees of freedom (like the kernel of a support vector machine or the number of hidden layers of a neural network).
Inspired by foundational work in Theano, toolkits like Microsoft's CNTK or Google's TensorFlow were among the first to popularize a more flexible view on models, as computation graphs with associated parameters that can be estimated from data. This view blurred the boundaries between popular types of models (such as DNNs or SVMs), as it became easy to blend the characteristics of each into your type of graph. Still, such a graph had to be defined upfront before estimating its parameters, and it was pretty static. This made it easy to save models to a self-contained bundle, like a TensorFlow SavedModel (such a bundle simply contains the structure of the graph, as well as the concrete values of the estimated parameters). However, debugging such models can be difficult because the statements in the Python code that build the graph are logically separate from the lines that execute it. Researchers also long for easier ways of expressing dynamic behavior, such as the computation steps of the forward pass of a model being conditionally dependent on its input data (or its previous output). - -Most recently, the above limitations have led to a second revolution spearheaded by PyTorch and TensorFlow 2. The computation graph is no longer defined explicitly. Instead, it will be populated implicitly as the Python code executes operations on tensor arguments. An essential technique that powers this development is automatic differentiation. As the computation graph is being built implicitly while executing the steps of the forward pass, all the necessary data will be tracked for later computation of the gradient concerning the model parameters. This allows for great flexibility in training a model, but it raises an important question. If the computation happening inside a model is only implicitly defined through our Python code's steps as it executes concrete data, what is it that we want to save as a model? The answer – at least initially – was the Python code with all its dependencies, along with the estimated parameters. This is undesirable for practical reasons. For instance, there is a danger that the team working on model deployment does not exactly reproduce the Python code dependencies used during training, leading to subtly divergent behavior. The solution typically consists of combining two techniques, scripting and tracing, that is, extra annotations in your Python code and execution of your code on exemplary input data, allowing PyTorch to define and save the graph that should be executed during later inference on new, unseen data. This requires some discipline by whoever creates the model code (arguably voiding some of the original flexibility of eager execution), but it results in a self-contained model bundle in TorchScript format. The solution in TensorFlow 2 is remarkably similar. - -### Serving our report generation models - -Our journey in deploying the report generation models reflects the above discussion. We started out serving our models by deploying the model code and its dependencies along with the parameter checkpoints in a custom Docker image exposing a gRPC service interface. However, we soon noticed that it became error-prone to replicate the exact code and environment used by the modeling team while estimating the parameters. Moreover, this approach prevented us from leveraging high-performance model serving frameworks like NVIDIA's Triton, which is written in C++ and requires self-contained models that can be used without a Python interpreter. 
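As a minimal sketch of that export step, scripting a toy module and saving it produces exactly such a self-contained bundle (the module below is illustrative and not one of our report generation models):

```python
import torch
import torch.nn as nn

class TinySummarizer(nn.Module):
    """Illustrative stand-in for a sequence model with data-dependent control flow."""
    def __init__(self, vocab_size: int = 1000, dim: int = 64):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, dim)
        self.proj = nn.Linear(dim, vocab_size)

    def forward(self, tokens: torch.Tensor) -> torch.Tensor:
        x = self.embed(tokens)
        # Conditional logic like this is why scripting (rather than only tracing)
        # is needed to capture the model faithfully.
        if x.size(1) > 512:
            x = x[:, :512]
        return self.proj(x.mean(dim=1))

# Scripting records the code together with the parameters in a single archive.
scripted = torch.jit.script(TinySummarizer())
scripted.save("summarizer.pt")

# Later, e.g. inside an inference server, no Python model code is required:
restored = torch.jit.load("summarizer.pt")
out = restored(torch.randint(0, 1000, (1, 8)))
```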
At this stage, we were facing a choice between attempting to export our PyTorch models to ONNX or TorchScript format. ONNX is an open specification for representing machine learning models that increasingly finds adoption. It is powered by a high-performance runtime developed by Microsoft (ONNX Runtime). While we were able to achieve performance acceleration for our TensorFlow BERT-based model using ONNX Runtime, at the time one of our PyTorch model required some operators that weren’t yet supported in ONNX. Rather than implement these using custom operators, we decided to look into TorchScript for the time being. - -### A maturing ecosystem - -Is it all roses? No, it has been a rockier journey than we expected. We encountered what seems to be a memory leak in the MKL libraries used by PyTorch while serving the PyTorch code directly. We encountered deadlocks in trying to load multiple models from multiple threads. We had difficulties exporting our models to ONNX and TorchScript formats. Models would not work out-of-the-box on hardware with multiple GPUs, they always accessed the particular GPU device on which they were exported. We encountered excessive memory usage in the Triton inference server while serving TorchScript models, which we found out was due to automatic differentiation accidentally being enabled during the forward pass. However, the ecosystem keeps improving, and there is a helpful and vibrant open-source community eager to work with us to mitigate such issues. - -Where to go from here? For those that require the flexibility of serving PyTorch code directly, without going through the extra step of exporting self-contained models, it is worth pointing out that the TorchServe project now provides a way of bundling the code together with parameter checkpoints into a single servable archive, greatly reducing the risk of code and parameters running apart. To us, however, exporting models to TorchScript has proven beneficial. It provides a clear interface between modeling and deployment teams, and TorchScript further reduces the latency when serving models on GPU via its just-in-time compilation engine. - -### Scaling at large and the future - -Finally, efficient deployment to the cloud is about more than just computing the response of a single model instance efficiently. Flexibility is needed in managing, versioning and updating models. High-level scalability must be achieved via techniques such as load-balancing, horizontal scaling and vertical scaling. If many models are involved, scale-to-zero quickly becomes a topic as it is unacceptable to pay for serving models that do not answer any requests. Providing such extra functionality on top of a low-level inference server like Triton is the job of an orchestration framework. After gaining some first experience with KubeFlow, to that end, we decided to turn our attention to Azure ML, which provides similar functionality but integrates more deeply with the Azure platform, on which we crucially rely for large parts of our technology stack already. This part of our journey has just begun. - -## Conclusion - -Academia has long recognized that we are "standing on the shoulders of giants." As Artificial Intelligence is maturing from a scientific discipline into technology, the same spirit of collaboration that originally fueled its scientific foundation has carried over into the world of software engineering. 
Open-source enthusiasts join technology companies worldwide to build open software ecosystems that allow for new angles at solving some of the most pressing challenges of modern society. In this article, we've taken a look at Nuance's [Dragon Ambient eXperience](http://www.nuance.com/ambient), an AI-powered, voice-enabled solution that automatically documents patient care, reducing healthcare providers' administrative burdens. Nuance DAX improves the patient-provider experience, reduces physician burnout, and improves financial outcomes. It brings back trust, joy, and humanity to the delivery of healthcare. Fairseq and PyTorch have proven to be an incredible platform for powering this AI technology, and in turn, Nuance has contributed back some of its innovations in this space. For further reading, we invite you to take a look at our recent [ACL publication](https://www.aclweb.org/anthology/2020.nlpmc-1.4/) and the Nuance "What's Next" blog. diff --git a/_posts/2022-5-18-introducing-accelerated-pytorch-training-on-mac.md b/_posts/2022-5-18-introducing-accelerated-pytorch-training-on-mac.md deleted file mode 100644 index baf1a087d89c..000000000000 --- a/_posts/2022-5-18-introducing-accelerated-pytorch-training-on-mac.md +++ /dev/null @@ -1,39 +0,0 @@ ---- -layout: blog_detail -title: "Introducing Accelerated PyTorch Training on Mac" -author: PyTorch -featured-img: "/assets/images/METAPT-002-BarGraph-02-static.png" ---- - -In collaboration with the Metal engineering team at Apple, we are excited to announce support for GPU-accelerated PyTorch training on Mac. Until now, PyTorch training on Mac only leveraged the CPU, but with the upcoming PyTorch v1.12 release, developers and researchers can take advantage of Apple silicon GPUs for significantly faster model training. This unlocks the ability to perform machine learning workflows like prototyping and fine-tuning locally, right on Mac. - -
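Once the backend is available, opting in follows the familiar PyTorch device pattern. A minimal sketch (with a CPU fallback for machines without MPS support) looks like this:

```python
import torch
import torch.nn as nn

# Use the Apple silicon GPU when the MPS backend is available.
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")

model = nn.Linear(128, 10).to(device)
x = torch.randn(32, 128, device=device)

# Forward and backward passes now run on the GPU.
loss = model(x).sum()
loss.backward()
```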


        - -## Metal Acceleration - -Accelerated GPU training is enabled using Apple’s Metal Performance Shaders (MPS) as a backend for PyTorch. The MPS backend extends the PyTorch framework, providing scripts and capabilities to set up and run operations on Mac. MPS optimizes compute performance with kernels that are fine-tuned for the unique characteristics of each Metal GPU family. The new device maps machine learning computational graphs and primitives on the MPS Graph framework and tuned kernels provided by MPS. - -## Training Benefits on Apple Silicon - -Every Apple silicon Mac has a unified memory architecture, providing the GPU with direct access to the full memory store. This makes Mac a great platform for machine learning, enabling users to train larger networks or batch sizes locally. This reduces costs associated with cloud-based development or the need for additional local GPUs. The Unified Memory architecture also reduces data retrieval latency, improving end-to-end performance. - -In the graphs below, you can see the performance speedup from accelerated GPU training and evaluation compared to the CPU baseline: - -

Accelerated GPU training and evaluation speedups over CPU-only (times faster)

        - - -## Getting Started - -To get started, just install the latest [Preview (Nightly) build](https://pytorch.org/get-started/locally/) on your Apple silicon Mac running macOS 12.3 or later with a native version (arm64) of Python. - -You can also learn more about Metal and MPS on [Apple’s Metal page](https://developer.apple.com/metal/). - -\* _Testing conducted by Apple in April 2022 using production Mac Studio systems with Apple M1 Ultra, 20-core CPU, 64-core GPU 128GB of RAM, and 2TB SSD. Tested with macOS Monterey 12.3, prerelease PyTorch 1.12, ResNet50 (batch size=128), HuggingFace BERT (batch size=64), and VGG16 (batch size=64). Performance tests are conducted using specific computer systems and reflect the approximate performance of Mac Studio._ diff --git a/_posts/2022-6-16-how-disney-improved-activity-recognition-with-multimodal-approaches-with-pytorch.md b/_posts/2022-6-16-how-disney-improved-activity-recognition-with-multimodal-approaches-with-pytorch.md deleted file mode 100644 index 5f0b3b6ba72c..000000000000 --- a/_posts/2022-6-16-how-disney-improved-activity-recognition-with-multimodal-approaches-with-pytorch.md +++ /dev/null @@ -1,174 +0,0 @@ ---- -layout: blog_detail -title: "How Disney Improved Activity Recognition Through Multimodal Approaches with PyTorch" -author: Monica Alfaro, Albert Aparicio, Francesc Guitart, Marc Junyent, Pablo Pernias, Marcel Porta, and Miquel Àngel Farré (former Senior Technology Manager) -featured-img: 'assets/images/disney_media_logo.jpg' ---- - -# Introduction - -Among the many things Disney Media & Entertainment Distribution (DMED) is responsible for, is the management and distribution of a huge array of media assets including news, sports, entertainment and features, episodic programs, marketing and advertising and more. - - - -


        - - - -Our team focuses on media annotation as part of DMED Technology’s content platforms group. In our day-to-day work, we automatically analyze a variety of content that constantly challenges the efficiency of our machine learning workflow and the accuracy of our models. - -Several of our colleagues recently discussed the workflow efficiencies that we achieved by switching to an end-to-end video analysis pipeline using PyTorch, as well as how we approach animated character recognition. We invite you to read more about both in this previous post. - -While the conversion to an end-to-end PyTorch pipeline is a solution that any company might benefit from, animated character recognition was a uniquely-Disney concept and solution. - -In this article we will focus on activity recognition, which is a general challenge across industries — but with some specific opportunities when leveraged in the media production field, because we can combine audio, video, and subtitles to provide a solution. - -# Experimenting with Multimodality - -Working on a multimodal problem adds more complexity to the usual training pipelines. Having multiple information modes for each example means that the multimodal pipeline has to have specific implementations to process each mode in the dataset. Usually after this processing step, the pipeline has to merge or fuse the outputs. - -Our initial experiments in multimodality were completed using the [MMF framework](https://github.com/facebookresearch/mmf). MMF is a modular framework for vision and language multimodal research. MMF contains reference implementations of state-of-the-art vision and language models and has also powered multiple research projects at Meta AI Research (as seen in this [poster](https://s3.amazonaws.com/assets.pytorch.org/pted2021/posters/A3.png) presented in PyTorch Ecosystem Day 2020). Along with the recent release of TorchMultimodal, a PyTorch library for training state-of-the-art multimodal models at scale, MMF highlights the growing interest in Multimodal understanding. - -MMF tackles this complexity with modular management of all the elements of the pipeline through a wide set of different implementations for specific modules, ranging from the processing of the modalities to the fusion of the processed information. - -In our scenario, MMF was a great entry point to experiment with multimodality. It allowed us to iterate quickly by combining audio, video and closed captioning and experiment at different levels of scale with certain multimodal models, shifting from a single GPU to TPU Pods. - -# Multimodal Transformers - -With a workbench based on MMF, our initial model was based on a concatenation of features from each modality evolving to a pipeline that included a Transformer-based fusion module to combine the different input modes. - -Specifically, we made use of the fusion module called MMFTransformer, developed in collaboration with the Meta AI Research team. This is an implementation based on [VisualBERT](https://arxiv.org/abs/1908.03557) for which the necessary modifications were added to be able to work with text, audio and video. - -Despite having decent results with the out-of-box implementation MMFTransformer, we were still far from our goal, and the Transformers-based models required more data than we had available. - -# Searching for less data-hungry solutions - -Searching for less data-hungry solutions, our team started studying [MLP-Mixer](https://arxiv.org/abs/2105.01601). 
This new architecture has been proposed by the Google Brain team and it provides an alternative to well established de facto architectures like convolutions or self-attention for computer vision tasks. - -## MLP-Mixer - -The core idea behind mixed variations consists of replacing the convolutions or self-attention mechanisms used in transformers with Multilayer Perceptrons. This change in architecture favors the performance of the model in high data regimes (especially with respect to the Transformers), while also opening some questions regarding the inductive biases hidden in the convolutions and the self-attention layers. - -Those proposals perform great in solving image classification tasks by splitting the image in chunks, flattening those chunks into 1D vectors and passing them through a sequence of Mixer Layers. - - - -
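As a rough illustration of what one of those Mixer Layers looks like, here is a minimal block in PyTorch; the token count, width, and hidden size are placeholders chosen only for the example:

```python
import torch
import torch.nn as nn

class MixerBlock(nn.Module):
    """One MLP-Mixer layer: a token-mixing MLP followed by a channel-mixing MLP."""
    def __init__(self, num_tokens: int, dim: int, hidden: int = 256):
        super().__init__()
        self.norm1 = nn.LayerNorm(dim)
        self.token_mlp = nn.Sequential(
            nn.Linear(num_tokens, hidden), nn.GELU(), nn.Linear(hidden, num_tokens)
        )
        self.norm2 = nn.LayerNorm(dim)
        self.channel_mlp = nn.Sequential(
            nn.Linear(dim, hidden), nn.GELU(), nn.Linear(hidden, dim)
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # x: (batch, num_tokens, dim)
        y = self.norm1(x).transpose(1, 2)           # mix across tokens
        x = x + self.token_mlp(y).transpose(1, 2)   # residual connection
        x = x + self.channel_mlp(self.norm2(x))     # mix across channels
        return x

blocks = nn.Sequential(*[MixerBlock(num_tokens=16, dim=512) for _ in range(4)])
out = blocks(torch.randn(2, 16, 512))               # (2, 16, 512)
```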


        - - - -Inspired by the advantages of Mixer based architectures, our team searched for parallelisms with the type of problems we try to solve in video classification: specifically, instead of a single image, we have a set of frames that need to be classified, along with audio and closed captioning in the shape of new modalities. - -# Activity Recognition reinterpreting the MLP-Mixer - -Our proposal takes the core idea of the [MLP-Mixer](https://arxiv.org/abs/2105.01601) — using multiple multi-layer perceptrons on a sequence and transposed sequence and extends it into a Multi Modal framework that allows us to process video, audio & text with the same architecture. - -For each of the modalities, we use different extractors that will provide embeddings describing the content. Given the embeddings of each modality, the MLP-Mixer architecture solves the problem of deciding which of the modalities might be the most important, while also weighing how much each modality contributes to the final labeling. - -For example, when it comes to detecting laughs, sometimes the key information is in audio or in the frames, and in some of the cases we have a strong signal in the closed caption. - -We tried processing each frame separately with a ResNet34 and getting a sequence of embeddings and by using a video-specific model called R3D, both pre-trained on ImageNet and Kinetics400 respectively. - - - -
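A simplified sketch of those two extractors with torchvision is shown below; the backbone choices follow the text, while the input shapes and preprocessing are illustrative only:

```python
import torch
import torch.nn as nn
from torchvision.models import resnet34
from torchvision.models.video import r3d_18

# Per-frame embeddings: ResNet34 with its classification head removed.
frame_backbone = nn.Sequential(*list(resnet34(pretrained=True).children())[:-1])
frames = torch.randn(16, 3, 224, 224)                  # 16 video frames
frame_embeddings = frame_backbone(frames).flatten(1)   # (16, 512)

# Clip-level embedding: R3D-18 pretrained on Kinetics-400.
video_backbone = r3d_18(pretrained=True)
video_backbone.fc = nn.Identity()                      # keep the pooled features
clip = torch.randn(1, 3, 16, 112, 112)                 # (batch, channels, frames, H, W)
clip_embedding = video_backbone(clip)                  # (1, 512)
```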


        - - - -To process the audio, we use the pretrained ResNet34, and we remove the final layers to be able to extract 2D embeddings from the audio spectrograms (for 224x224 images we end up with 7x7 embeddings). - - - -
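A similarly simplified sketch of that audio branch, using torchaudio for the spectrogram (the exact preprocessing used in production may differ):

```python
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchaudio
from torchvision.models import resnet34

# Log-mel spectrogram of four seconds of mono 16 kHz audio.
waveform = torch.randn(1, 16000 * 4)
spec = torchaudio.transforms.MelSpectrogram(sample_rate=16000, n_mels=224)(waveform)
spec = torch.log(spec + 1e-6)

# Resize to 224x224 and repeat to 3 channels so it fits an ImageNet backbone.
spec = F.interpolate(spec.unsqueeze(0), size=(224, 224), mode="bilinear")
spec = spec.repeat(1, 3, 1, 1)                              # (1, 3, 224, 224)

# ResNet34 truncated before global pooling keeps a 7x7 spatial map of embeddings.
backbone = nn.Sequential(*list(resnet34(pretrained=True).children())[:-2])
audio_embedding = backbone(spec)                            # (1, 512, 7, 7)
```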


        - - - -For closed captioning, we are using a pre-trained BERT-large, with all layers frozen, except for the Embeddings & LayerNorms. - - - -
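With the Hugging Face transformers library, that freezing scheme can be sketched as follows; the checkpoint name is an assumption, and the point is the `requires_grad` pattern rather than the exact model:

```python
from transformers import BertModel

model = BertModel.from_pretrained("bert-large-uncased")

# Freeze everything except the embedding tables and the LayerNorm parameters.
for name, param in model.named_parameters():
    param.requires_grad = ("embeddings" in name) or ("LayerNorm" in name)

trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"Trainable parameters: {trainable:,}")
```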


        - - - -Once we have extracted the embedding from each modality, we concatenate them into a single sequence and pass it through a set of MLP-Mixer blocks; next we use average pooling & a classification head to get predictions. - - - -
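Putting the pieces together, a minimal sketch of that fusion step is shown below; the per-modality widths, token counts, class count, and the identity stand-in for the Mixer trunk are all illustrative:

```python
import torch
import torch.nn as nn

class MultimodalMixerHead(nn.Module):
    """Concatenate per-modality token embeddings, mix them, pool, and classify."""
    def __init__(self, mixer: nn.Module, dims: dict, common_dim: int, num_classes: int):
        super().__init__()
        # One projection per modality so every token ends up with the same width.
        self.proj = nn.ModuleDict({m: nn.Linear(d, common_dim) for m, d in dims.items()})
        self.mixer = mixer                          # e.g. a stack of Mixer blocks
        self.head = nn.Linear(common_dim, num_classes)

    def forward(self, tokens: dict) -> torch.Tensor:
        # tokens[m]: (batch, num_tokens_m, dims[m])
        seq = torch.cat([self.proj[m](t) for m, t in tokens.items()], dim=1)
        mixed = self.mixer(seq)                     # (batch, total_tokens, common_dim)
        return self.head(mixed.mean(dim=1))         # average pooling + classification

dims = {"video": 512, "audio": 512, "text": 1024}
model = MultimodalMixerHead(nn.Identity(), dims, common_dim=256, num_classes=15)
batch = {m: torch.randn(2, 8, d) for m, d in dims.items()}
logits = model(batch)                               # (2, 15)
```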


        - - - -Our experiments have been performed on a custom, manually labeled dataset for activity recognition with 15 classes, which we know from experiments are hard and cannot all be predicted accurately using a single modality. - -These experiments have shown a significant increase in performance using our approach, especially in a low/mid-data regime (75K training samples). - -When it comes to using only Text and Audio, our experiments showed a 15 percent improvement in accuracy over using a classifier on top of the features extracted by state-of-the-art backbones. - -Using Text, Audio and Video we have seen a 17 percent improvement in accuracy over using Meta AIFacebook’s MMF Framework, which uses a VisualBERT-like model to combine modalities using more powerful state of the art backbones. - -Currently, we extended the initial model to cover up to 55 activity classes and 45 event classes. One of the challenges we expect to improve upon in the future is to include all activities and events, even those that are less frequent. - -## Interpreting the MLP-Mixer mode combinations - -An MLP-Mixer is a concatenation of MultiLayer Perceptrons. This can be, very roughly, approximated to a linear operation, in the sense that, once trained, the weights are fixed and the input will directly affect the output. - -Once we assume that approximation, we also assume that for an input consisting of NxM numbers, we could find a NxM matrix that (when multiplied elementwise) could approximate the predictions of the MLP-Mixer for a class. - - - -


        - - - -We will call this matrix a stencil, and if we have access to it, we can find what parts of the input embeddings are responsible for a specific prediction. - -You can think of it as a punch card with holes in specific positions. Only information in those positions will pass and contribute to a specific prediction. So we can measure the intensity of the input at those positions. - - - -


        - - - -Of course, this is an oversimplification, and there won't exist a unique stencil that perfectly represents all of the contributions of the input to a class (otherwise that would mean that the problem could be solved linearly). So this should be used for visualization purposes only, not as an accurate predictor. - -Once we have a set of stencils for each class, we can effortlessly measure input contribution without relying on any external visualization techniques. - -To find a stencil, we can start from a "random noise" stencil and optimize it to maximize the activations for a specific class by just back-propagating through the MLP-Mixer. - - - -
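One plausible reading of that procedure, sketched in PyTorch; the elementwise use of the stencil, the optimizer, and the number of steps are illustrative choices rather than the exact production recipe:

```python
import torch

def find_stencil(model, inputs, target_class, steps=200, lr=0.05):
    """Optimize an N x M stencil so that stencil * input maximizes one class logit.

    model:  a trained MLP-Mixer classifier taking (batch, N, M) inputs,
            assumed frozen (its own parameters are not updated here).
    inputs: a batch of embedding sequences, shape (batch, N, M).
    """
    stencil = torch.randn(inputs.shape[1:], requires_grad=True)  # start from noise
    optimizer = torch.optim.Adam([stencil], lr=lr)

    for _ in range(steps):
        optimizer.zero_grad()
        logits = model(inputs * stencil)            # stencil broadcasts over the batch
        loss = -logits[:, target_class].mean()      # maximize the class activation
        loss.backward()                             # back-propagate through the Mixer
        optimizer.step()

    return stencil.detach()
```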


        - - - -By doing this we can end up with many valid stencils, and we can reduce them to a few by using K-means to cluster them into similar stencils and averaging each cluster. - -# Using the Mixer to get the best of each world - -MLP-Mixer, used as an image classification model without convolutional layers, requires a lot of data, since the lack of inductive bias – one of the model's good points overall – is a weakness when it comes to working in low data domains. - -When used as a way to combine information previously extracted by large pretrained backbones (as opposed to being used as a full end-to-end solution), they shine. The Mixer’s strength lies in finding temporal or structural coherence between different inputs. For example, in video-related tasks we could extract embeddings from the frames using a powerful, pretrained model that understands what is going on at frame level and use the mixer to make sense of it in a sequential manner. - -This way of using the Mixer allows us to work with limited amounts of data and still get better results than what was achieved with Transformers. This is because Mixers seem to be more stable during training and seem to pay attention to all the inputs, while Transformers tend to collapse and pay attention only to some modalities/parts of the sequence. - -Acknowledgements: We would like to thank the Meta AI Research and Partner Engineering teams for this collaboration. diff --git a/_posts/2022-6-23-geospatial-deep-learning-with-torchgeo.md b/_posts/2022-6-23-geospatial-deep-learning-with-torchgeo.md deleted file mode 100644 index cf94047af42f..000000000000 --- a/_posts/2022-6-23-geospatial-deep-learning-with-torchgeo.md +++ /dev/null @@ -1,247 +0,0 @@ ---- -layout: blog_detail -title: "Geospatial deep learning with TorchGeo" -author: Adam Stewart (University of Illinois at Urbana-Champaign), Caleb Robinson (Microsoft AI for Good Research Lab), Isaac Corley (University of Texas at San Antonio) -featured-img: 'assets/images/torchgeo-hurricane.jpg' ---- - -TorchGeo is a PyTorch domain library providing datasets, samplers, transforms, and pre-trained models specific to geospatial data. - -

https://github.com/microsoft/torchgeo

        - -For decades, Earth observation satellites, aircraft, and more recently UAV platforms have been collecting increasing amounts of imagery of the Earth’s surface. With information about seasonal and long-term trends, remotely sensed imagery can be invaluable for solving some of the greatest challenges to humanity, including climate change adaptation, natural disaster monitoring, water resource management, and food security for a growing global population. From a computer vision perspective, this includes applications like land cover mapping (semantic segmentation), deforestation and flood monitoring (change detection), glacial flow (pixel tracking), hurricane tracking and intensity estimation (regression), and building and road detection (object detection, instance segmentation). By leveraging recent advancements in deep learning architectures, cheaper and more powerful GPUs, and petabytes of freely available satellite imagery datasets, we can come closer to solving these important problems. - -

National Oceanic and Atmospheric Administration satellite image of Hurricane Katrina, taken on August 28, 2005 (source). Geospatial machine learning libraries like TorchGeo can be used to detect, track, and predict future trajectories of hurricanes and other natural disasters.

        - -# The challenges - -In traditional computer vision datasets, such as ImageNet, the image files themselves tend to be rather simple and easy to work with. Most images have 3 spectral bands (RGB), are stored in common file formats like PNG or JPEG, and can be easily loaded with popular software libraries like [PIL](https://pillow.readthedocs.io/en/stable/) or [OpenCV](https://opencv.org/). Each image in these datasets is usually small enough to pass directly into a neural network. Furthermore, most of these datasets contain a finite number of well-curated images that are assumed to be independent and identically distributed, making train-val-test splits straightforward. As a result of this relative homogeneity, the same pre-trained models (e.g., CNNs pretrained on ImageNet) have shown to be effective across a wide range of vision tasks using transfer learning methods. Existing libraries, such as [torchvision](https://github.com/pytorch/vision), handle these simple cases well, and have been used to make large advances in vision tasks over the past decade. - -Remote sensing imagery is not so uniform. Instead of simple RGB images, satellites tend to capture images that are multispectral ([Landsat 8](https://www.usgs.gov/landsat-missions) has 11 spectral bands) or even hyperspectral ([Hyperion](https://www.usgs.gov/centers/eros/science/usgs-eros-archive-earth-observing-one-eo-1-hyperion) has 242 spectral bands). These images capture information at a wider range of wavelengths (400 nm–15 µm), far outside of the visible spectrum. Different satellites also have very different spatial resolutions—[GOES](https://www.goes.noaa.gov/) has a resolution of 4 km/px, [Maxar](https://www.maxar.com/products/satellite-imagery) imagery is 30 cm/px, and drone imagery resolution can be as high as 7 mm/px. These datasets almost always have a temporal component, with satellite revisists that are daily, weekly, or biweekly. Images often have overlap with other images in the dataset, and need to be stitched together based on geographic metadata. These images tend to be very large (e.g., 10K x 10K pixels), so it isn't possible to pass an entire image through a neural network. This data is distributed in hundreds of different raster and vector file formats like GeoTIFF and ESRI Shapefile, requiring specialty libraries like [GDAL](https://gdal.org/) to load. - - -

From left to right: Mercator, Albers Equal Area, and Interrupted Goode Homolosine projections (source). Geospatial data is associated with one of many different types of reference systems that project the 3D Earth onto a 2D representation. Combining data from different sources often involves re-projecting to a common reference system in order to ensure that all layers are aligned.

        - -Although each image is 2D, the Earth itself is 3D. In order to stitch together images, they first need to be projected onto a 2D representation of the Earth, called a coordinate reference system (CRS). Most people are familiar with equal angle representations like Mercator that distort the size of regions (Greenland looks larger than Africa even though Africa is 15x larger), but there are many other CRSs that are commonly used. Each dataset may use a different CRS, and each image within a single dataset may also be in a unique CRS. In order to use data from multiple layers, they must all share a common CRS, otherwise the data won't be properly aligned. For those who aren't familiar with remote sensing data, this can be a daunting task. - -
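For example, re-projecting even a single raster by hand with rasterio, one of the common GDAL-based Python libraries, already takes a fair amount of boilerplate; the snippet below follows the standard rasterio recipe, with file names and the target CRS chosen arbitrarily:

```python
import rasterio
from rasterio.warp import calculate_default_transform, reproject, Resampling

dst_crs = "EPSG:4326"  # target coordinate reference system (illustrative)

with rasterio.open("scene.tif") as src:
    transform, width, height = calculate_default_transform(
        src.crs, dst_crs, src.width, src.height, *src.bounds
    )
    kwargs = src.meta.copy()
    kwargs.update(crs=dst_crs, transform=transform, width=width, height=height)

    with rasterio.open("scene_reprojected.tif", "w", **kwargs) as dst:
        for band in range(1, src.count + 1):
            reproject(
                source=rasterio.band(src, band),
                destination=rasterio.band(dst, band),
                src_transform=src.transform,
                src_crs=src.crs,
                dst_transform=transform,
                dst_crs=dst_crs,
                resampling=Resampling.nearest,
            )
```

TorchGeo performs this kind of alignment automatically when datasets are combined, which is what the following sections build on.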

Even if you correctly georeference images during indexing, if you don't project them to a common CRS, you'll end up with rotated images with nodata values around them, and the images won't be pixel-aligned.

        - -# The solution - -At the moment, it can be quite challenging to work with both deep learning models and geospatial data without having expertise in both of these very different fields. To address these challenges, we've built TorchGeo, a PyTorch domain library for working with geospatial data. TorchGeo is designed to make it simple: - -1. for machine learning experts to work with geospatial data, and -2. for remote sensing experts to explore machine learning solutions. - -TorchGeo is not just a research project, but a production-quality library that uses continuous integration to test every commit with a range of Python versions on a range of platforms (Linux, macOS, Windows). It can be easily installed with any of your favorite package managers, including pip, conda, and [spack](https://spack.io): - -``` -$ pip install torchgeo -``` - -TorchGeo is designed to have the same API as other PyTorch domain libraries like torchvision, torchtext, and torchaudio. If you already use torchvision in your workflow for computer vision datasets, you can switch to TorchGeo by changing only a few lines of code. All TorchGeo datasets and samplers are compatible with the PyTorch ``DataLoader`` class, meaning that you can take advantage of wrapper libraries like [PyTorch Lightning](https://www.pytorchlightning.ai/) for distributed training. In the following sections, we'll explore possible use cases for TorchGeo to show how simple it is to use. - -# Geospatial datasets and samplers - -

Example application in which we combine A) a scene from Landsat 8 and B) Cropland Data Layer labels, even though these files are in different EPSG projections. We want to sample patches C) and D) from these datasets using a geospatial bounding box as an index.

        - -Many remote sensing applications involve working with [*geospatial datasets*](https://torchgeo.readthedocs.io/en/latest/api/datasets.html#geospatial-datasets) —datasets with geographic metadata. In TorchGeo, we define a ``GeoDataset`` class to represent these kinds of datasets. Instead of being indexed by an integer, each ``GeoDataset`` is indexed by a spatiotemporal bounding box, meaning that two or more datasets covering a different geographic extent can be intelligently combined. - -In this example, we show how easy it is to work with geospatial data and to sample small image patches from a combination of Landsat and Cropland Data Layer (CDL) data using TorchGeo. First, we assume that the user has Landsat 7 and 8 imagery downloaded. Since Landsat 8 has more spectral bands than Landsat 7, we'll only use the bands that both satellites have in common. We'll create a single dataset including all images from both Landsat 7 and 8 data by taking the union between these two datasets. - -```c++ -from torch.utils.data import DataLoader -from torchgeo.datasets import CDL, Landsat7, Landsat8, stack_samples -from torchgeo.samplers import RandomGeoSampler - -landsat7 = Landsat7(root="...") -landsat8 = Landsat8(root="...", bands=Landsat8.all_bands[1:-2]) -landsat = landsat7 | landsat8 -``` - -Next, we take the intersection between this dataset and the CDL dataset. We want to take the intersection instead of the union to ensure that we only sample from regions where we have both Landsat and CDL data. Note that we can automatically download and checksum CDL data. Also note that each of these datasets may contain files in different CRSs or resolutions, but TorchGeo automatically ensures that a matching CRS and resolution is used. - -```c++ -cdl = CDL(root="...", download=True, checksum=True) -dataset = landsat & cdl -``` - -This dataset can now be used with a PyTorch data loader. Unlike benchmark datasets, geospatial datasets often include very large images. For example, the CDL dataset consists of a single image covering the entire contiguous United States. In order to sample from these datasets using geospatial coordinates, TorchGeo defines a number of [*samplers*](https://torchgeo.readthedocs.io/en/latest/api/samplers.html). In this example, we'll use a random sampler that returns 256 x 256 pixel images and 10,000 samples per epoch. We'll also use a custom collation function to combine each sample dictionary into a mini-batch of samples. - -```c++ -sampler = RandomGeoSampler(dataset, size=256, length=10000) -dataloader = DataLoader(dataset, batch_size=128, sampler=sampler, collate_fn=stack_samples) -``` - -This data loader can now be used in your normal training/evaluation pipeline. - -```c++ -for batch in dataloader: - image = batch["image"] - mask = batch["mask"] - - # train a model, or make predictions using a pre-trained model -``` - -Many applications involve intelligently composing datasets based on geospatial metadata like this. For example, users may want to: - -- Combine datasets for multiple image sources and treat them as equivalent (e.g., Landsat 7 and 8) -- Combine datasets for disparate geospatial locations (e.g., Chesapeake NY and PA) - -These combinations require that all queries are present in *at least one* dataset, and can be created using a ``UnionDataset``. 
Similarly, users may want to: - -- Combine image and target labels and sample from both simultaneously (e.g., Landsat and CDL) -- Combine datasets for multiple image sources for multimodal learning or data fusion (e.g., Landsat and Sentinel) - -These combinations require that all queries are present in *both* datasets, and can be created using an ``IntersectionDataset``. TorchGeo automatically composes these datasets for you when you use the intersection (``&``) and union \(``|``\) operators. - -# Multispectral and geospatial transforms - -In deep learning, it's common to augment and transform the data so that models are robust to variations in the input space. Geospatial data can have variations such as seasonal changes and warping effects, as well as image processing and capture issues like cloud cover and atmospheric distortion. TorchGeo utilizes augmentations and transforms from the [Kornia](https://kornia.github.io/) library, which supports GPU acceleration and supports multispectral imagery with more than 3 channels. - -Traditional geospatial analyses compute and visualize spectral indices which are combinations of multispectral bands. Spectral indices are designed to highlight areas of interest in a multispectral image relevant to some application, such as vegetation health, areas of man-made change or increasing urbanization, or snow cover. TorchGeo supports numerous [*transforms*](https://torchgeo.readthedocs.io/en/latest/api/transforms.html), which can compute common spectral indices and append them as additional bands to a multispectral image tensor. - -Below, we show a simple example where we compute the Normalized Difference Vegetation Index (NDVI) on a Sentinel-2 image. NDVI measures the presence of vegetation and vegetation health and is computed as the normalized difference between the red and near-infrared (NIR) spectral bands. Spectral index transforms operate on sample dictionaries returned from TorchGeo datasets and append the resulting spectral index to the image channel dimension. - -First, we instantiate a Sentinel-2 dataset and load a sample image. Then, we plot the true color (RGB) representation of this data to see the region we are looking at. - -```c++ -import matplotlib.pyplot as plt -from torchgeo.datasets import Sentinel2 -from torchgeo.transforms import AppendNDVI - -dataset = Sentinel2(root="...") -sample = dataset[...] -fig = dataset.plot(sample) -plt.show() -``` - -Next, we instantiate and compute an NDVI transform, appending this new channel to the end of the image. Sentinel-2 imagery uses index 0 for its red band and index 3 for its NIR band. In order to visualize the data, we also normalize the image. NDVI values can range from -1 to 1, but we want to use the range 0 to 1 for plotting. - -```c++ -transform = AppendNDVI(index_red=0, index_nir=3) -sample = transform(sample) -sample["image"][-1] = (sample["image"][-1] + 1) / 2 -plt.imshow(sample["image"][-1], cmap="RdYlGn_r") -plt.show() -``` - -

True color (left) and NDVI (right) of the Texas Hill Region, taken on November 16, 2018 by the Sentinel-2 satellite. In the NDVI image, red indicates water bodies, yellow indicates barren soil, light green indicates unhealthy vegetation, and dark green indicates healthy vegetation.

        - -# Benchmark datasets - -One of the driving factors behind progress in computer vision is the existence of standardized benchmark datasets like ImageNet and MNIST. Using these datasets, researchers can directly compare the performance of different models and training procedures to determine which perform the best. In the remote sensing domain, there are many such datasets, but due to the aforementioned difficulties of working with this data and the lack of existing libraries for loading these datasets, many researchers opt to use their own custom datasets. - -One of the goals of TorchGeo is to provide easy-to-use data loaders for these existing datasets. TorchGeo includes a number of [*benchmark datasets*](https://torchgeo.readthedocs.io/en/latest/api/datasets.html#non-geospatial-datasets) —datasets that include both input images and target labels. This includes datasets for tasks like image classification, regression, semantic segmentation, object detection, instance segmentation, change detection, and more. - -If you've used torchvision before, these types of datasets should be familiar. In this example, we'll create a dataset for the Northwestern Polytechnical University (NWPU) very-high-resolution ten-class (VHR-10) geospatial object detection dataset. This dataset can be automatically downloaded, checksummed, and extracted, just like with torchvision. - -```c++ -from torch.utils.data import DataLoader -from torchgeo.datasets import VHR10 - -dataset = VHR10(root="...", download=True, checksum=True) -dataloader = DataLoader(dataset, batch_size=128, shuffle=True, num_workers=4) - -for batch in dataloader: - image = batch["image"] - label = batch["label"] - - # train a model, or make predictions using a pre-trained model -``` - -All TorchGeo datasets are compatible with PyTorch data loaders, making them easy to integrate into existing training workflows. The only difference between a benchmark dataset in TorchGeo and a similar dataset in torchvision is that each dataset returns a dictionary with keys for each PyTorch ``Tensor``. - -

Example predictions from a Mask R-CNN model trained on the NWPU VHR-10 dataset. The model predicts sharp bounding boxes and masks for all objects with high confidence scores.

        - -# Reproducibility with PyTorch Lightning - -Another key goal of TorchGeo is reproducibility. For many of these benchmark datasets, there is no predefined train-val-test split, or the predefined split has issues with class imbalance or geographic distribution. As a result, the performance metrics reported in the literature either can't be reproduced, or aren't indicative of how well a pre-trained model would work in a different geographic location. - -In order to facilitate direct comparisons between results published in the literature and further reduce the boilerplate code needed to run experiments with datasets in TorchGeo, we have created PyTorch Lightning [*datamodules*](https://torchgeo.readthedocs.io/en/latest/api/datamodules.html) with well-defined train-val-test splits and [*trainers*](https://torchgeo.readthedocs.io/en/latest/api/trainers.html) for various tasks like classification, regression, and semantic segmentation. These datamodules show how to incorporate augmentations from the kornia library, include preprocessing transforms (with pre-calculated channel statistics), and let users easily experiment with hyperparameters related to the data itself (as opposed to the modeling process). Training a semantic segmentation model on the Inria Aerial Image Labeling dataset is as easy as a few imports and four lines of code. - -```c++ -from pytorch_lightning import Trainer -from torchgeo.datamodules import InriaAerialImageLabelingDataModule -from torchgeo.trainers import SemanticSegmentationTask - -datamodule = InriaAerialImageLabelingDataModule(root_dir="...", batch_size=64, num_workers=6) -task = SemanticSegmentationTask(segmentation_model="unet", encoder_weights="imagenet", learning_rate=0.1) -trainer = Trainer(gpus=1, default_root_dir="...") - -trainer.fit(model=task, datamodule=datamodule) -``` - -

Building segmentations produced by a U-Net model trained on the Inria Aerial Image Labeling dataset. Reproducing these results is as simple as a few imports and four lines of code, making comparison of different models and training techniques simple and easy.

        - -In our [preprint](https://arxiv.org/abs/2111.08872) we show a set of results that use the aforementioned datamodules and trainers to benchmark simple modeling approaches for several of the datasets in TorchGeo. For example, we find that a simple ResNet-50 can achieve state-of-the-art performance on the [So2Sat](https://ieeexplore.ieee.org/document/9014553) dataset. These types of baseline results are important for evaluating the contribution of different modeling choices when tackling problems with remotely sensed data. - -# Future work and contributing - -There is still a lot of remaining work to be done in order to make TorchGeo as easy to use as possible, especially for users without prior deep learning experience. One of the ways in which we plan to achieve this is by expanding our tutorials to include subjects like "writing a custom dataset" and "transfer learning", or tasks like "land cover mapping" and "object detection". - -Another important project we are working on is pre-training models. Most remote sensing researchers work with very small labeled datasets, and could benefit from pre-trained models and transfer learning approaches. TorchGeo is the first deep learning library to provide models pre-trained on multispectral imagery. Our goal is to provide models for different image modalities (optical, SAR, multispectral) and specific platforms (Landsat, Sentinel, MODIS) as well as benchmark results showing their performance with different amounts of training data. Self-supervised learning is a promising method for training such models. Satellite imagery datasets often contain petabytes of imagery, but accurately labeled datasets are much harder to come by. Self-supervised learning methods will allow us to train directly on the raw imagery without needing large labeled datasets. - -Aside from these larger projects, we're always looking to add new datasets, data augmentation transforms, and sampling strategies. If you're Python savvy and interested in contributing to TorchGeo, we would love to see contributions! TorchGeo is open source under an MIT license, so you can use it in almost any project. - -External links: - -- **Homepage**: [https://github.com/microsoft/torchgeo](https://github.com/microsoft/torchgeo) -- **Documentation**: [https://torchgeo.readthedocs.io/](https://torchgeo.readthedocs.io/) -- **PyPI**: [https://pypi.org/project/torchgeo/](https://pypi.org/project/torchgeo/) -- **Paper**: [https://arxiv.org/abs/2111.08872](https://arxiv.org/abs/2111.08872) - -If you like TorchGeo, give us a star on GitHub! And if you use TorchGeo in your work, please cite our paper. - -# Acknowledgments - -*We would like to thank all TorchGeo contributors for their efforts in creating the library, the Microsoft AI for Good program for support, and the PyTorch Team for their guidance. This research is part of the Blue Waters sustained-petascale computing project, which is supported by the National Science Foundation (awards OCI-0725070 and ACI-1238993), the State of Illinois, and as of December, 2019, the National Geospatial-Intelligence Agency. Blue Waters is a joint effort of the University of Illinois at Urbana-Champaign and its National Center for Supercomputing Applications. 
The research was supported in part by NSF grants IIS-1908104, OAC-1934634, and DBI-2021898.* diff --git a/_posts/2022-6-27-how-computational-graphs-are-executed-in-pytorch.md b/_posts/2022-6-27-how-computational-graphs-are-executed-in-pytorch.md deleted file mode 100644 index 05c68da2abdd..000000000000 --- a/_posts/2022-6-27-how-computational-graphs-are-executed-in-pytorch.md +++ /dev/null @@ -1,1094 +0,0 @@ ---- -layout: blog_detail -title: "How Computational Graphs are Executed in PyTorch" -author: Preferred Networks -featured-img: "" ---- - -Welcome to the last entry into understanding the autograd engine of PyTorch series! -If you haven’t read parts [1](https://pytorch.org/blog/overview-of-pytorch-autograd-engine/) & [2](https://pytorch.org/blog/computational-graphs-constructed-in-pytorch/) check them now to understand how PyTorch creates the computational graph for the backward pass! - -This post is based on PyTorch v1.11, so some highlighted parts may differ across versions. - -# PyTorch autograd graph execution - -The last post showed how PyTorch constructs the graph to calculate the outputs' derivatives w.r.t. the inputs when executing the forward pass. Now we will see how the execution of the backward pass is coordinated and done by looking at the whole process, starting from Python down to the lower C++ level internals. - -# What Happens when Calling `backward()`/`grad()` from Python -## Using `variable.backward()` - -After doing all our calculations with an input set to require the gradient, we call `.backward()` on the result to initiate the backward pass execution. - -```python ->>> x = torch.tensor([0.5, 0.75], requires_grad=True) ->>> y = torch.exp(x).sum() ->>> y.backward() -``` - -Calling [`.backward()`](https://github.com/pytorch/pytorch/blob/bc2c6edaf163b1a1330e37a6e34caf8c553e4755/torch/_tensor.py#L307-L363) on a tensor results in a call to [`torch.autograd.backward()`](https://github.com/pytorch/pytorch/blob/bc2c6edaf163b1a1330e37a6e34caf8c553e4755/torch/autograd/__init__.py#L85-L175). -```python -# torch/_tensor.py - -def backward(self, gradient=None, retain_graph=None, create_graph=False, inputs=None): - … - torch.autograd.backward(self, gradient, retain_graph, create_graph, inputs=inputs) - -``` -`torch.autograd.backward()` checks the arguments and calls the autograd engine in the C++ layer. 
- -``` python -def backward( - tensors: _TensorOrTensors, - grad_tensors: Optional[_TensorOrTensors] = None, - retain_graph: Optional[bool] = None, - create_graph: bool = False, - grad_variables: Optional[_TensorOrTensors] = None, - inputs: Optional[_TensorOrTensors] = None, -) -> None: - … - - if inputs is not None and len(inputs) == 0: - raise RuntimeError("'inputs' argument to backward() cannot be empty.") - - tensors = (tensors,) if isinstance(tensors, torch.Tensor) else tuple(tensors) - inputs = (inputs,) if isinstance(inputs, torch.Tensor) else \ - tuple(inputs) if inputs is not None else tuple() - - grad_tensors_ = _tensor_or_tensors_to_tuple(grad_tensors, len(tensors)) - grad_tensors_ = _make_grads(tensors, grad_tensors_) - if retain_graph is None: - retain_graph = create_graph - - Variable._execution_engine.run_backward( - tensors, grad_tensors_, retain_graph, create_graph, inputs, - allow_unreachable=True, accumulate_grad=True) # allow_unreachable flag - -``` -First, whether the `grad_tensors` argument was specified or not, there is a call to the [`_make_grads`](https://github.com/pytorch/pytorch/blob/bc2c6edaf163b1a1330e37a6e34caf8c553e4755/torch/autograd/__init__.py#L30-L74) function. This is used to check the provided `grad_tensors` or to specify the default value for them by looking at the `tensors` argument values’ shapes. Check the first blog post for details on the default value for the `grad_tensors` of the backward pass. This function just provides the vector of the vector jacobian product if it was not initially specified. - -In the above code, `Variable` has an `_execution_engine` attribute that is defined in [`torch.autograd.variable`](https://github.com/pytorch/pytorch/blob/bc2c6edaf163b1a1330e37a6e34caf8c553e4755/torch/autograd/variable.py#L14) to be of type `ImperativeEngine`; the C++ engine exported to python and declared in [`torch/csrc/autograd/python_engine.cpp`](https://github.com/pytorch/pytorch/blob/bc2c6edaf163b1a1330e37a6e34caf8c553e4755/torch/csrc/autograd/python_engine.cpp#L384). In the following sections, we explain in detail how this object executes the backward pass. - -Note that the `torch.autograd.backward` function has an `inputs` optional argument. This argument is used when we want to calculate the `.grad` field of only a subset of input tensors in the forward pass. - -```python ->>> x = torch.tensor([0.5, 0.75], requires_grad=True) ->>> y = torch.tensor([0.1, 0.90], requires_grad=True) ->>> z = torch.exp(x * y).sum() ->>> torch.autograd.backward([z], inputs=[x]) ->>> x.grad -tensor([0.1051, 1.7676]) ->>> y.grad # None ->>> - -``` -## Using `torch.autograd.grad` - -An alternative to `backward()` is to use [`torch.autograd.grad()`](https://github.com/pytorch/pytorch/blob/bc2c6edaf163b1a1330e37a6e34caf8c553e4755/torch/autograd/__init__.py#L177-L277). The main difference to `backward()` is that `grad()` returns a tuple of tensors with the gradients of the `outputs` w.r.t. the `inputs` kwargs instead of storing them in the `.grad` field of the tensors. As you can see, the `grad()` code shown below is very similar to backward. 
- -```python -def grad( - outputs: _TensorOrTensors, - inputs: _TensorOrTensors, - grad_outputs: Optional[_TensorOrTensors] = None, - retain_graph: Optional[bool] = None, - create_graph: bool = False, - only_inputs: bool = True, - allow_unused: bool = False, - is_grads_batched: bool = False -) -> Tuple[torch.Tensor, ...]: - - outputs = (outputs,) if isinstance(outputs, torch.Tensor) else tuple(outputs) - inputs = (inputs,) if isinstance(inputs, torch.Tensor) else tuple(inputs) - overridable_args = outputs + inputs - if has_torch_function(overridable_args): - return handle_torch_function( - grad, - overridable_args, - outputs, - inputs, - grad_outputs=grad_outputs, - retain_graph=retain_graph, - create_graph=create_graph, - only_inputs=only_inputs, - allow_unused=allow_unused, - ) - - grad_outputs_ = _tensor_or_tensors_to_tuple(grad_outputs, len(outputs)) - grad_outputs_ = _make_grads(outputs, grad_outputs_) - - if retain_graph is None: - retain_graph = create_graph - - if is_grads_batched: - # …. It will not be covered here - else: - return Variable._execution_engine.run_backward( - outputs, grad_outputs_, retain_graph, create_graph, inputs, - allow_unused, accumulate_grad=False) # Calls into the C++ engine to run the backward pass - -``` - -Figure 1 shows the computational graph with the `backward()` and `grad()` arguments highlighted in red and blue, respectively: - -

-Figure 1: Correspondence of `backward`/`grad` arguments in the graphs.
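As a quick illustration of that difference (a minimal example of ours, not taken from the post), the same graph can be differentiated with either entry point; what changes is where the gradients end up:

```python
import torch

x = torch.tensor([0.5, 0.75], requires_grad=True)
y = torch.exp(x).sum()

# grad() returns the gradients as a tuple and leaves .grad untouched
(gx,) = torch.autograd.grad(y, (x,))
print(gx)      # tensor([1.6487, 2.1170])
print(x.grad)  # None

# backward() computes the same gradients but stores them in the .grad field
y2 = torch.exp(x).sum()
y2.backward()
print(x.grad)  # tensor([1.6487, 2.1170])
```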

        - -# Going Inside the Autograd Engine - -## Refreshing Concepts: Nodes and Edges - -As we saw in [2](https://pytorch.org/blog/computational-graphs-constructed-in-pytorch/) -The computational graph comprises `Node` and `Edge` objects. Please read that post if you haven’t done it yet. - -### Nodes - -`Node` objects are defined in [`torch/csrc/autograd/function.h`](https://github.com/pytorch/pytorch/blob/bc2c6edaf163b1a1330e37a6e34caf8c553e4755/torch/csrc/autograd/function.h#L105-L176), and they provide an overload of `operator()` for the associated function and a list of edges to do the graph traversal. Note that `Node` is a base class that autograd functions inherit from and override the `apply` method to execute the backward function. -```c++ -struct TORCH_API Node : std::enable_shared_from_this { - ... - /// Evaluates the function on the given inputs and returns the result of the - /// function call. - variable_list operator()(variable_list&& inputs) { - ... - } - -protected: - /// Performs the `Node`'s actual operation. - virtual variable_list apply(variable_list&& inputs) = 0; - … - edge_list next_edges_; - uint64_t topological_nr_ = 0; - … - -``` - -There is an attribute called [`topological_nr_`](https://github.com/pytorch/pytorch/blob/bc2c6edaf163b1a1330e37a6e34caf8c553e4755/torch/csrc/autograd/function.h#L481) in every node object. This number is used to optimize the graph execution as it allows to discard of graph branches under certain conditions. The topological number is the longest distance between this node and any leaf node and it is shown in Figure 2. Its main property is that for any pair of nodes `x`, `y` in a directed graph `topo_nr(x) < topo_nr(y)` means that there is no path from `x` to `y`. So this allows for reducing the number of paths in the graph in need of traversal. Check the [topological_nr](https://github.com/pytorch/pytorch/blob/bc2c6edaf163b1a1330e37a6e34caf8c553e4755/torch/csrc/autograd/function.h#L314-L343) -) method comment for further details. - -

-Figure 2: Example of the Topological Number calculation
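Before moving on to the C++ definitions, note that these `Node` objects are reachable from Python through a tensor's `grad_fn` attribute, and each entry of `next_functions` is the `(node, input_nr)` pair that an `Edge` stores. A small inspection example of ours:

```python
import torch

x = torch.tensor([0.5, 0.75], requires_grad=True)
y = torch.exp(x).sum()

print(y.grad_fn)                 # <SumBackward0 object at ...>
print(y.grad_fn.next_functions)  # ((<ExpBackward0 object at ...>, 0),)

# Each entry mirrors an Edge: the next Node and the input index it feeds
exp_node, input_nr = y.grad_fn.next_functions[0]
print(exp_node.next_functions)   # ((<AccumulateGrad object at ...>, 0),)
```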

        - -### Edges - -The [`Edge`](https://github.com/pytorch/pytorch/blob/bc2c6edaf163b1a1330e37a6e34caf8c553e4755/torch/csrc/autograd/edge.h#L14-L39) object links `Node`s together, and its implementation is straightforward. - -```c++ -struct Edge { - ... - /// The function this `Edge` points to. - std::shared_ptr function; - /// The identifier of a particular input to the function. - uint32_t input_nr; -}; - -``` - -It only requires a function pointer to the `Node` and an input number that is the index of the output from the forward function this edge points to. When preparing the set of gradients before calling "function", we know that what is flowing from this edge should be accumulated in the "input_nr"th argument. Note that the input/output name is flipped here and this is the input to the backward function. - `Edge` objects are constructed using the [`gradient_edge`](https://github.com/pytorch/pytorch/blob/bc2c6edaf163b1a1330e37a6e34caf8c553e4755/torch/csrc/autograd/variable.cpp#L221-L233) function method. - -```c++ - Edge gradient_edge(const Variable& self) { - if (const auto& gradient = self.grad_fn()) { - return Edge(gradient, self.output_nr()); - } else { - return Edge(grad_accumulator(self), 0); - } - } - -``` -## Entering the C++ Realm - -Once that `torch.autograd.backward()` has been invoked, the -[`THPEngine_run_backward`](https://github.com/pytorch/pytorch/blob/bc2c6edaf163b1a1330e37a6e34caf8c553e4755/torch/csrc/autograd/python_engine.cpp#L152-L286) routine starts the graph traversal. Following is a schema of the function body: -```c++ -PyObject *THPEngine_run_backward(PyObject *self, PyObject *args, PyObject *kwargs) -{ - HANDLE_TH_ERRORS - PyObject *tensors = nullptr; - PyObject *grad_tensors = nullptr; - unsigned char keep_graph = 0; - unsigned char create_graph = 0; - PyObject *inputs = nullptr; - - // Convert the python arguments to C++ objects - const char *accepted_kwargs[] = { // NOLINT - "tensors", "grad_tensors", "keep_graph", "create_graph", "inputs", - "allow_unreachable", "accumulate_grad", nullptr - }; - if (!PyArg_ParseTupleAndKeywords(args, kwargs, "OObb|Obb", (char**)accepted_kwargs, - &tensors, &grad_tensors, &keep_graph, &create_graph, &inputs, &allow_unreachable, &accumulate_grad)) - - // Prepare arguments - for(const auto i : c10::irange(num_tensors)) { - // Check that the tensors require gradients - } - - std::vector output_edges; - if (inputs != nullptr) { - // Prepare outputs - } - - { - // Calls the actual autograd engine - pybind11::gil_scoped_release no_gil; - outputs = engine.execute(roots, grads, keep_graph, create_graph, accumulate_grad, output_edges); - } - // Clean up and finish -} - -``` - -First, we prepare the input arguments after converting the `PyObject` arguments to actual C++ objects. The `tensors` list contains the tensors from which we start the backward pass. These tensors are converted to edges using `torch::autograd::impl::gradient_edge` and added to a list called `roots` where the graph traversal starts. 
- - -```c++ - edge_list roots; - roots.reserve(num_tensors); - variable_list grads; - grads.reserve(num_tensors); - for(const auto i : c10::irange(num_tensors)) { - PyObject *_tensor = PyTuple_GET_ITEM(tensors, i); - const auto& variable = THPVariable_Unpack(_tensor); - auto gradient_edge = torch::autograd::impl::gradient_edge(variable); - roots.push_back(std::move(gradient_edge)); - - PyObject *grad = PyTuple_GET_ITEM(grad_tensors, i); - if (THPVariable_Check(grad)) { - const Variable& grad_var = THPVariable_Unpack(grad); - grads.push_back(grad_var); - } - } - -``` - -Now, if the `inputs` argument was specified in `backward` or we used the `torch.autograd.grad` api, the following code creates a list of edges to accumulate the gradients in the specified tensors at the end of the computation. The engine uses this later to optimize the execution as it doesn’t add the gradients in all the leaf nodes, just the specified ones. - -```c++ - std::vector output_edges; - if (inputs != nullptr) { - int num_inputs = PyTuple_GET_SIZE(inputs); - output_edges.reserve(num_inputs); - for (const auto i : c10::irange(num_inputs)) { - PyObject *input = PyTuple_GET_ITEM(inputs, i); - const auto& tensor = THPVariable_Unpack(input); - const auto output_nr = tensor.output_nr(); - auto grad_fn = tensor.grad_fn(); - if (!grad_fn) { - grad_fn = torch::autograd::impl::try_get_grad_accumulator(tensor); - } - if (accumulate_grad) { - tensor.retain_grad(); - } - if (!grad_fn) { - output_edges.emplace_back(std::make_shared(), 0); - } else { - output_edges.emplace_back(grad_fn, output_nr); - } - } - } - -``` - -The next step is the actual graph traversal and node function execution, and finally, the cleanup and return. - -```c++ - { - // Calls the actual autograd engine - pybind11::gil_scoped_release no_gil; - auto& engine = python::PythonEngine::get_python_engine(); - outputs = engine.execute(roots, grads, keep_graph, create_graph, accumulate_grad, output_edges); - } - // Clean up and finish -} - -``` - -# Starting the Real Execution - -`engine.execute`is present in [torch/csrc/autograd/engine.cpp](https://github.com/pytorch/pytorch/blob/bc2c6edaf163b1a1330e37a6e34caf8c553e4755/torch/csrc/autograd/engine.cpp#L969-L1044) - -There are two differentiated steps here: - -Analyze the graph to find dependencies between functions -Create worker threads that traverse the graph - -## Data Structures Used for the Execution - -### GraphTask - -All the execution metadata is managed by the [`GraphTask`](https://github.com/pytorch/pytorch/blob/bc2c6edaf163b1a1330e37a6e34caf8c553e4755/torch/csrc/autograd/engine.h#L51-L196) class in [torch/csrc/autograd/engine.h](https://github.com/pytorch/pytorch/blob/release/1.11/torch/csrc/autograd/engine.h) - -```c++ -struct GraphTask: std::enable_shared_from_this { - std::atomic outstanding_tasks_{0}; - // … - std::unordered_map not_ready_; - std::unordered_map dependencies_; - - struct ExecInfo { - // … - }; - std::unordered_map exec_info_; - std::vector captured_vars_; - // … - std::shared_ptr cpu_ready_queue_; -}; - -``` - -Here we see a series of variables dedicated to maintaining the execution state. -`outstanding_tasks_` tracks the number of tasks left to be executed for the backward pass to complete. `not_ready_` holds the input arguments for the `Node`s that are not ready to be executed. `dependencies_` track the number of predecessors that a `Node` has. As the count reaches `0`, the `Node` is ready for execution; it is placed in a ready queue to be retrieved and executed later. 
- -`exec_info_` and the associated `ExecInfo` struct are used only when the `inputs` argument is specified or it is a call to `autograd.grad()`. They allow filter paths on the graph that are not needeed since only the gradients are calculated only for the variables in the `inputs` list. - - `captured_vars_` is where the results of the graph execution are temporarily stored if we used the `torch.autograd.grad()` api instead of `torch.autograd.backward()` since `grad()` returns the gradients as tensors instead of just filling the `.grad` field of the inputs. - - -### NodeTask - -The [`NodeTask`](https://github.com/pytorch/pytorch/blob/bc2c6edaf163b1a1330e37a6e34caf8c553e4755/torch/csrc/autograd/engine.h#L210-L242) struct is a basic class that holds an `fn_` pointer to the node to execute, and an `inputs_` buffer to store the input arguments to this function. Note that the functions executed by the backward pass are the derivatives specified in the `derivatives.yaml` file. or the user provided backward function when using custom functions as described in the second blog post. - -The `inputs_` buffer is also where the output gradients of the previously executed functions are aggregated, and it is defined as a [`std::vector` container](https://github.com/pytorch/pytorch/blob/release/1.10/torch/csrc/autograd/input_buffer.h) with facilities to accumulate values at a given position. - -```c++ -struct NodeTask { - std::weak_ptr base_; - std::shared_ptr fn_; - // This buffer serves as an implicit "addition" node for all of the - // gradients flowing here. Once all the dependencies are finished, we - // use the contents of this buffer to run the function. - InputBuffer inputs_; -}; - -``` -### GraphRoot - -The [`GraphRoot`](https://github.com/pytorch/pytorch/blob/bc2c6edaf163b1a1330e37a6e34caf8c553e4755/torch/csrc/autograd/functions/basic_ops.h#L72-L89) is a special function used to hold multiple input variables in a single place. The code is pretty simple as it only acts as a container of variables. - -```c++ -struct TORCH_API GraphRoot : public Node { - GraphRoot(edge_list functions, variable_list inputs) - : Node(std::move(functions)), - outputs(std::move(inputs)) { - for (const auto& t : outputs) { - add_input_metadata(t); - } - } - - variable_list apply(variable_list&& inputs) override { - return outputs; - } - -``` - -### AccumulateGrad - -This function is set during the graph creation in `gradient_edge` when the `Variable` object doesn’t have a `grad_fn`. This is, it is a leaf node. - -```c++ - if (const auto& gradient = self.grad_fn()) { - // … - } else { - return Edge(grad_accumulator(self), 0); - } - -``` - -The function body is defined in [`torch/csrc/autograd/functions/accumulate_grad.cpp`](https://github.com/pytorch/pytorch/blob/bc2c6edaf163b1a1330e37a6e34caf8c553e4755/torch/csrc/autograd/functions/accumulate_grad.cpp#L25-L63) and it essentially accumulates the input grads in the object’s `.grad` attribute. 
- -```c++ -auto AccumulateGrad::apply(variable_list&& grads) -> variable_list { - check_input_variables("AccumulateGrad", grads, 1, 0); - … - - at::Tensor new_grad = callHooks(variable, std::move(grads[0])); - std::lock_guard lock(mutex_); - - at::Tensor& grad = variable.mutable_grad(); - accumulateGrad( - variable, - grad, - new_grad, - 1 + !post_hooks().empty() /* num_expected_refs */, - [&grad](at::Tensor&& grad_update) { grad = std::move(grad_update); }); - return variable_list(); -} -}} // namespace torch::autograd - - - -``` - -[`accumulateGrad`](https://github.com/pytorch/pytorch/blob/bc2c6edaf163b1a1330e37a6e34caf8c553e4755/torch/csrc/autograd/functions/accumulate_grad.h#L100) -does several checks on the tensors format and eventually performs the `variable_grad += new_grad;` accumulation. - -## Preparing the graph for execution - -Now, let’s walk through [`Engine::execute`](https://github.com/pytorch/pytorch/blob/bc2c6edaf163b1a1330e37a6e34caf8c553e4755/torch/csrc/autograd/engine.cpp#L969-L1126). The first thing to do besides arguments consistency checks is to create the actual `GraphTask` object we described above. This object keeps all the metadata of the graph execution. - -```c++ -auto Engine::execute(const edge_list& roots, - const variable_list& inputs, - bool keep_graph, - bool create_graph, - bool accumulate_grad, - const edge_list& outputs) -> variable_list { - - validate_outputs(roots, const_cast(inputs), [](const std::string& msg) { - return msg; - }); - - // Checks - - auto graph_task = std::make_shared( - /* keep_graph */ keep_graph, - /* create_graph */ create_graph, - /* depth */ not_reentrant_backward_call ? 0 : total_depth + 1, - /* cpu_ready_queue */ local_ready_queue); - - // If we receive a single root, skip creating extra root node - // … - // Prepare graph by computing dependencies - // … - // Queue the root - // … - // launch execution - // … -} - -``` - -After creating the `GraphTask`, we use its associated function if we only have one root node. If we have multiple root nodes, we create a special `GraphRoot` object as described before. - -```c++ - bool skip_dummy_node = roots.size() == 1; - auto graph_root = skip_dummy_node ? - roots.at(0).function : - std::make_shared(roots, inputs); - -``` - -The next step is to fill the `dependencies_` map in the `GraphTask` object since the engine must know when it can execute a task. The `outputs` here is the `inputs` argument passed to the `torch.autograd.backward()` call in Python. But here, we have reversed the names since the gradients w.r.t. the inputs of the forward pass are now the outputs of the backward pass. And from now on, there is no concept of forward/backward, but only graph traversal and execution. - -```c++ - auto min_topo_nr = compute_min_topological_nr(outputs); - // Now compute the dependencies for all executable functions - compute_dependencies(graph_root.get(), *graph_task, min_topo_nr); - - if (!outputs.empty()) { - graph_task->init_to_execute(*graph_root, outputs, accumulate_grad, min_topo_nr); - } - -``` - -Here we preprocess the graph for the execution of the nodes. First, [`compute_min_topological_nr`](https://github.com/pytorch/pytorch/blob/bc2c6edaf163b1a1330e37a6e34caf8c553e4755/torch/csrc/autograd/engine.cpp#L922-L933) is called to to obtain the minimum topological number of the tensors specified in `outputs` (0 if no `inputs` kwarg was supplied to `.backward` or `input` for `.grad`). 
This computation prunes paths in the graph that lead to input variables for which we don’t want or need to calculate the grads. - -Second is the [`compute_dependencies`](https://github.com/pytorch/pytorch/blob/bc2c6edaf163b1a1330e37a6e34caf8c553e4755/torch/csrc/autograd/engine.cpp#L935-L967) call. This function is a very simple graph traversal that starts at the root `Node` and, for each of the edges in `node.next_edges()`, increments the counter in `dependencies_`. Figure 3 shows the result of the dependencies calculation for the example graph. Note that the number of dependencies of any node is just the number of edges arriving at it.

-Figure 3: Number of dependencies for each node
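The same counts can be reproduced from Python by walking `next_functions` and counting how many edges arrive at each node. The following is only a rough sketch of ours of what `compute_dependencies` does internally; here the output of `exp` is used twice, so its backward node is reached through two edges:

```python
import torch
from collections import Counter, deque

def count_dependencies(root_fn):
    """Count incoming edges per node, analogous to GraphTask::dependencies_."""
    deps, queue, seen = Counter(), deque([root_fn]), {root_fn}
    while queue:
        fn = queue.popleft()
        for next_fn, _input_nr in fn.next_functions:
            if next_fn is None:  # edge to an input that doesn't need gradients
                continue
            deps[next_fn] += 1
            if next_fn not in seen:
                seen.add(next_fn)
                queue.append(next_fn)
    return deps

x = torch.tensor([0.5, 0.75], requires_grad=True)
v = torch.exp(x)
y = (v * v).sum()  # v is used twice, so ExpBackward0 receives two edges

for fn, n in count_dependencies(y.grad_fn).items():
    print(type(fn).__name__, n)  # MulBackward0 1, ExpBackward0 2, AccumulateGrad 1
```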

        - -Finally, the [`init_to_execute`](https://github.com/pytorch/pytorch/blob/bc2c6edaf163b1a1330e37a6e34caf8c553e4755/torch/csrc/autograd/engine.cpp#L1281-L1383) call, this is the one that populates the `GraphTask::exec_info_` map in case that `inputs` were specified in the python `backward` call. It iterates the graph again, starting from the root, and records in the `exec_info_` map the intermediate nodes needed to calculate only the given `inputs` gradients. - -```c++ - // Queue the root - if (skip_dummy_node) { - InputBuffer input_buffer(roots.at(0).function->num_inputs()); - auto input = inputs.at(0); - - - input_buffer.add(roots.at(0).input_nr, - std::move(input), - input_stream, - opt_next_stream); - - execute_with_graph_task(graph_task, graph_root, std::move(input_buffer)); - } else { - execute_with_graph_task(graph_task, graph_root, InputBuffer(variable_list())); - } - // Avoid a refcount bump for the Future, since we check for refcount in - // DistEngine (see TORCH_INTERNAL_ASSERT(futureGrads.use_count() == 1) - // in dist_engine.cpp). - auto& fut = graph_task->future_result_; - fut->wait(); - return fut->value().toTensorVector(); -} - -``` - -And now, we are ready to start the actual execution by creating the `InputBuffer`. In case we only have one root variable, we begin by copying the value of the `inputs` tensor (this is the `gradients` passed to python `backward`) in position 0 of the input_buffer. This is a small optimization that avoids running the `RootNode` for no reason. Also, if the rest of the graph is not on the cpu, we directly start on that worker while the `RootNode` is always placed on the cpu ready queue. Details of the workers and ready queues are explained in the section below. - -On the other hand, if we have multiple roots, the `GraphRoot` object also holds the inputs, so it is enough to pass it an empty `InputBuffer`. - -## Graph Traversal and Node Execution -### Devices, Threads and Queues - -Before diving into the actual execution, we need to see how the engine is structured. - -First of all, the engine is multithreaded with one thread per device. For example, the caller thread is associated with the CPU while additional threads are created and associated with each GPU or other devices available in the system. Each thread tracks its device using thread-local storage in the [`worker_device`](https://github.com/pytorch/pytorch/blob/bc2c6edaf163b1a1330e37a6e34caf8c553e4755/torch/csrc/autograd/engine.cpp#L69) variable. In addition, the threads have a queue of tasks to be executed also located in thread-local storage, the [`local_ready_queue`](https://github.com/pytorch/pytorch/blob/bc2c6edaf163b1a1330e37a6e34caf8c553e4755/torch/csrc/autograd/engine.cpp#L103-L104). This is where work is queued for this thread to execute in the `thread_main` function that is explained later. -You will wonder how the device where a task should be executed is decided. The `InputBuffer` class has a [`device()`](https://github.com/pytorch/pytorch/blob/bc2c6edaf163b1a1330e37a6e34caf8c553e4755/torch/csrc/autograd/input_buffer.cpp#L173-L189) function that returns the first non-cpu device of all its tensors. -This function is used together with [`Engine::ready_queue`](https://github.com/pytorch/pytorch/blob/bc2c6edaf163b1a1330e37a6e34caf8c553e4755/torch/csrc/autograd/engine.cpp#L1181-L1190) to select the queue to queue a task. 
- -```c++ -auto Engine::ready_queue(std::shared_ptr cpu_ready_queue, at::Device device) -> std::shared_ptr{ - if (device.type() == at::kCPU || device.type() == at::DeviceType::Meta) { - return cpu_ready_queue; - } else { - // See Note [Allocating GPUs to autograd threads] - return device_ready_queues_.at(device.index()); - } -} - -``` - -The [`ReadyQueue`](https://github.com/pytorch/pytorch/blob/bc2c6edaf163b1a1330e37a6e34caf8c553e4755/torch/csrc/autograd/engine.h#L245-L283) object is defined in `torch/csrc/autograd/engine.h` and it is a simple wrapper over `std::priority_queue` that allows a thread to [wait for a task](https://github.com/pytorch/pytorch/blob/bc2c6edaf163b1a1330e37a6e34caf8c553e4755/torch/csrc/autograd/engine.cpp#L219) if it’s empty. One interesting property of the `ReadyQueue` is that it increases the [`GraphTask::outstanding_tasks_`](https://github.com/pytorch/pytorch/blob/bc2c6edaf163b1a1330e37a6e34caf8c553e4755/torch/csrc/autograd/engine.cpp#L195) value used to determine if the execution has completed or not. - -```c++ -auto ReadyQueue::push(NodeTask item, bool incrementOutstandingTasks) -> void { - { - std::lock_guard lock(mutex_); - if (incrementOutstandingTasks) { - std::shared_ptr graph_task = item.base_.lock(); - ++graph_task->outstanding_tasks_; - } - heap_.push(std::move(item)); - } - not_empty_.notify_one(); -} - -auto ReadyQueue::pop() -> NodeTask { - std::unique_lock lock(mutex_); - not_empty_.wait(lock, [this]{ return !heap_.empty(); }); - auto task = std::move(const_cast(heap_.top())); heap_.pop(); - return task; -} - -``` - -### Reentrant Backward - -A reentrant backward happens when one of the tasks in a backward pass calls again `backward`. It is not a very common case, but it can be used to reduce memory utilization as it could potentially avoid saving intermediate results. For more information, check this [PyTorch forum post](https://discuss.pytorch.org/t/what-is-the-scenario-of-reentrant-backwards-in-pytorch-source-code/19330/2). - -```python -class ReentrantBackward(torch.autograd.Function): - @staticmethod - def forward(ctx, input): - return input.sum() - - @staticmethod - def backward(ctx, input): - # Let's compute the backward by using autograd - input = input.detach().requires_grad_() - with torch.enable_grad(): - out = input.sum() - out.backward() # REENTRANT CALL!! - return out.detach() - -``` - -Here, we call `backward()` inside `backward()` for a user custom-defined autograd function. -This situation can lead to deadlocks because the first backward needs to wait for the second one to complete. But some internal implementation details can prevent the second backward from completing as it is explained in the dedicated subsection. -## Thread Initialization - -[`execute_with_graph_task`](https://github.com/pytorch/pytorch/blob/bc2c6edaf163b1a1330e37a6e34caf8c553e4755/torch/csrc/autograd/engine.cpp#L1054-L1126) is in charge of initializing the threads taking care of the computation and placing the `root` node in the queue of the device that produced it. - -```c++ -c10::intrusive_ptr Engine::execute_with_graph_task( - const std::shared_ptr& graph_task, - std::shared_ptr graph_root, - InputBuffer&& input_buffer) { - - initialize_device_threads_pool(); - // Lock mutex for GraphTask. 
- std::unique_lock lock(graph_task->mutex_); - - auto queue = ready_queue(graph_task->cpu_ready_queue_, input_buffer.device()); - - if (worker_device == NO_DEVICE) { - set_device(CPU_DEVICE); - graph_task->owner_ = worker_device; - queue->push(NodeTask(graph_task, std::move(graph_root), std::move(input_buffer))); - lock.unlock(); - thread_main(graph_task); - worker_device = NO_DEVICE; - } else { - // This deals with reentrant backwards, we will see it later. - } - return graph_task->future_result_; -} - -``` - -First, this function initializes several threads (one per device) calling [` initialize_device_threads_pool()`](https://github.com/pytorch/pytorch/blob/bc2c6edaf163b1a1330e37a6e34caf8c553e4755/torch/csrc/autograd/engine.cpp#L1046-L1052) where several things happen: -One `ReadyQueue` per device is created. -One thread per non-cpu device is created. -A thread local `worker_device` variable is set to track the current device associated with the thread. -`thread_main` function is called, and threads wait for tasks to be put in their queues. - -Then it retrieves the queue to place the root node based on the device that holds the tensors present in the `input_buffer` using the `ready_queue` function. Now, the main thread (the one also executing the Python interpreter) has its `worker_device` set to `NO_DEVICE`, and it is in charge of executing functions with all its tensors living in the cpu. If `worker_device` is set to any other value, the graph execution is already started, and `.backward()` was called inside a running `Node`, creating a reentrant backward call. This is explained later. For now, -the main thread places the task in the queue and call `thread_main`. -## Where the Magic Happens - -It’s been a long way, but finally, we are ready to traverse the graph and execute the nodes. Each of the spawned threads, and the main thread call [`thread_main`](https://github.com/pytorch/pytorch/blob/bc2c6edaf163b1a1330e37a6e34caf8c553e4755/torch/csrc/autograd/engine.cpp#L377-L464). - -```c++ -auto Engine::thread_main(const std::shared_ptr& graph_task) -> void { - - while (graph_task == nullptr || !graph_task->future_result_->completed()) { - std::shared_ptr local_graph_task; - { - NodeTask task = local_ready_queue->pop(); - - if (task.isShutdownTask_) { - break; - } - - if (!(local_graph_task = task.base_.lock())) { - // GraphTask for function is no longer valid, skipping further - // execution. - continue; - } - - if (task.fn_ && !local_graph_task->has_error_.load()) { - at::ThreadLocalStateGuard tls_guard(local_graph_task->thread_locals_); - - try { - GraphTaskGuard guard(local_graph_task); - NodeGuard ndguard(task.fn_); - { - evaluate_function( - local_graph_task, - task.fn_.get(), - task.inputs_, - local_graph_task->cpu_ready_queue_); - } - } catch (std::exception& e) { - thread_on_exception(local_graph_task, task.fn_, e); - } - } - } - - // Decrement the outstanding tasks. - --local_graph_task->outstanding_tasks_; - - // Check if we've completed execution. - if (local_graph_task->completed()) { - local_graph_task->mark_as_completed_and_run_post_processing(); - auto base_owner = local_graph_task->owner_; - if (worker_device != base_owner) { - std::atomic_thread_fence(std::memory_order_release); - ready_queue_by_index(local_graph_task->cpu_ready_queue_, base_owner) - ->push(NodeTask(local_graph_task, nullptr, InputBuffer(0))); - } - } - } -} - -``` - -The code here is simple, given the `local_ready_queue` assigned to each thread in thread-local storage. 
The threads loop until there are no tasks left to execute in the graph. Note that for device-associated threads, the passed `graph_task` argument is [`nullptr`](https://github.com/pytorch/pytorch/blob/bc2c6edaf163b1a1330e37a6e34caf8c553e4755/torch/csrc/autograd/engine.cpp#L326-L327), and they block in `local_ready_queue->pop()` until a task is pushed in their queue. After some consistency checks (the task type is shutdown, or the graph is still valid). We get to the actual function invocation in `evaluate_function`. - -```c++ - try { - GraphTaskGuard guard(local_graph_task); - NodeGuard ndguard(task.fn_); - { - evaluate_function( - local_graph_task, - task.fn_.get(), - task.inputs_, - local_graph_task->cpu_ready_queue_); - } - } catch (std::exception& e) { - thread_on_exception(local_graph_task, task.fn_, e); - } - } - -``` - -After calling `evaluate_function`, we check if the `graph_task` execution is complete by looking the `outstanding_tasks_` number. This number increases when a task is pushed to a queue and is decreased in `local_graph_task->completed()` when a task is executed. When the execution is done, we return the results that are be in the `captured_vars_` in case we called `torch.autograd.grad()` instead of `torch.autograd.backward()` as this function returns tensors instead of storing them in the `.grad` attribute of the inputs. Finally we wake up the main thread if it’s waiting by sending a dummy task. - -```c++ - // Decrement the outstanding tasks. - --local_graph_task->outstanding_tasks_; - - // Check if we've completed execution. - if (local_graph_task->completed()) { - local_graph_task->mark_as_completed_and_run_post_processing(); - auto base_owner = local_graph_task->owner_; - if (worker_device != base_owner) { - std::atomic_thread_fence(std::memory_order_release); - ready_queue_by_index(local_graph_task->cpu_ready_queue_, base_owner) - ->push(NodeTask(local_graph_task, nullptr, InputBuffer(0))); - } - } - -``` - -## Calling the Function and Unlocking New Tasks - -[`evaluate_function`](https://github.com/pytorch/pytorch/blob/bc2c6edaf163b1a1330e37a6e34caf8c553e4755/torch/csrc/autograd/engine.cpp#L786-L920) serves three purposes: - -Run the function. -Accumulate its results in the next node `InputBuffers`. -Decrease the dependencies counter of the next nodes and enqueues the tasks reaching 0 to be executed. - -```c++ -void Engine::evaluate_function( - std::shared_ptr& graph_task, - Node* func, - InputBuffer& inputs, - const std::shared_ptr& cpu_ready_queue) { - - // If exec_info_ is not empty, we have to instrument the execution - auto& exec_info_ = graph_task->exec_info_; - if (!exec_info_.empty()) { - // Checks if the function needs to be executed - if (!fn_info.needed_) { - // Skip execution if we don't need to execute the function. - return; - } - } - - auto outputs = call_function(graph_task, func, inputs); - - auto& fn = *func; - if (!graph_task->keep_graph_) { - fn.release_variables(); - } - -``` - -Initially, we check the `exec_info_` map of the `GraphTask` structure to determine if the current node needs to be executed. Remember that if this map is empty, all the nodes are executed because we are calculating the grads for all the inputs of the forward pass. - -After this check, the function is executed by running [`call_function`](https://github.com/pytorch/pytorch/blob/bc2c6edaf163b1a1330e37a6e34caf8c553e4755/torch/csrc/autograd/engine.cpp#L735-L784). 
Its implementation is very straightforward and calls the actual derivative function and registered hooks if any. - -```c++ - int num_outputs = outputs.size(); - if (num_outputs == 0) { - // Records leaf stream (if applicable) - return; - } - - if (AnomalyMode::is_enabled()) { - // check for nan values in result - } - -``` - -Next, we check the outputs of the function after `call_function` is done. If the number of outputs is 0, there are no following nodes to be executed so we can safely return. This is the case of the `AccumulateGrad` node associated with the leaf nodes. - - Also, the check for `NaN` values in the gradients is done here if requested. -```c++ - - std::lock_guard lock(graph_task->mutex_); - for (const auto i : c10::irange(num_outputs)) { - auto& output = outputs[i]; - const auto& next = fn.next_edge(i); - - if (!next.is_valid()) continue; - - - -``` - -We have now executed a `grad_fn` that has returned one gradient per each of the associated forward pass function inputs. As we saw in the [previous blog post](https://pytorch.org/blog/computational-graphs-constructed-in-pytorch/#linking-nodes-together), we have an `Edge` object per each of these input tensors, and the `grad_fn` of the function producing them in the forward pass. Essentially, Output[0] of the node in the backward pass, corresponds to the first argument of the forward pass associated function. Figure 4 shows how the outputs of a backward function are related to the inputs of the forward function. See that the outputs of `grad_fn C` are the gradients of `z` w.r.t. the inputs of `Function C` - -

-Figure 4: Correspondence between forward and backward functions' inputs and outputs
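A minimal example of ours of this ordering: a forward function with two inputs yields a backward node with one outgoing edge, and one output gradient, per forward input, in the same order.

```python
import torch

x = torch.tensor(2.0, requires_grad=True)
y = torch.tensor(3.0, requires_grad=True)
z = x * y  # plays the role of "Function C" in the figure

# MulBackward0 has one outgoing edge per forward input, in order:
# two (AccumulateGrad, 0) entries, one for x and one for y
print(z.grad_fn.next_functions)

# and it produces one gradient per forward input, in the same order
gx, gy = torch.autograd.grad(z, (x, y))
print(gx, gy)  # tensor(3.) tensor(2.)   (dz/dx = y, dz/dy = x)
```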

        - -We now iterate through these edges and check if the associated functions are ready to be executed. - -```c++ - // Check if the next function is ready to be computed - bool is_ready = false; - auto& dependencies = graph_task->dependencies_; - auto it = dependencies.find(next.function.get()); - - if (it == dependencies.end()) { - auto name = next.function->name(); - throw std::runtime_error(std::string("dependency not found for ") + name); - } else if (--it->second == 0) { - dependencies.erase(it); - is_ready = true; - } - - auto& not_ready = graph_task->not_ready_; - auto not_ready_it = not_ready.find(next.function.get()); - -``` - -For this, we check the `graph_task->dependencies_` map. We decrement the counter, and if it reaches 0, we mark the function pointed by the edge ready to be executed. Following, we prepare the input buffers of the tasks indicated by the next edges. - -```c++ - if (not_ready_it == not_ready.end()) { - if (!exec_info_.empty()) { - // Skip functions that aren't supposed to be executed - } - - // Creates an InputBuffer and moves the output to the corresponding input position - InputBuffer input_buffer(next.function->num_inputs()); - input_buffer.add(next.input_nr, - std::move(output), - opt_parent_stream, - opt_next_stream); - - if (is_ready) { - auto queue = ready_queue(cpu_ready_queue, input_buffer.device()); - queue->push( - NodeTask(graph_task, next.function, std::move(input_buffer))); - } else { - not_ready.emplace(next.function.get(), std::move(input_buffer)); - } - -``` - -Here, we look for the task in the `graph_task->not_ready_` map. If it is not present, we create a new `InputBuffer` object and set the current output in the `input_nr` position of the buffer associated with the edge. If the task is ready to be executed, we enqueue it in the appropriate device `ready_queue` and complete the execution. However, if the task is not ready and we have seen it before, it is present in the `not_ready_map_`. - -```c++ - } else { - // The function already has a buffer - auto &input_buffer = not_ready_it->second; - // Accumulates into buffer - input_buffer.add(next.input_nr, - std::move(output), - opt_parent_stream, - opt_next_stream); - if (is_ready) { - auto queue = ready_queue(cpu_ready_queue, input_buffer.device()); - queue->push(NodeTask(graph_task, next.function, std::move(input_buffer))); - not_ready.erase(not_ready_it); - } - } - } -} - -``` - -In this case, we accumulate the output in the existing `input_buffer` instead of creating a new one. Once all the tasks are processed, the worker thread exits the loop and complete. -All this process is summarized in the animation in Figure 5. We see how a thread peeks at the tasks in the ready queue and decrements the next nodes' dependencies, unlocking them for execution. - -

-Figure 5: Animation of the execution of the computational graph
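To condense the scheduling logic into something executable, here is a deliberately simplified, single-threaded Python model of the traversal (our sketch, not the engine's actual code; the real engine is multithreaded, accumulates gradients in `InputBuffer`s and calls the C++ `Node::apply`, but the dependency bookkeeping follows the same idea):

```python
from collections import deque

class ToyNode:
    """Stand-in for an autograd Node: a name plus its outgoing edges."""
    def __init__(self, name, next_nodes=()):
        self.name, self.next_nodes = name, list(next_nodes)

    def run(self):  # plays the role of call_function / Node::apply
        print("executing", self.name)

def toy_backward(root):
    # 1) Count incoming edges per node, like compute_dependencies
    deps, stack, seen = {}, [root], {root}
    while stack:
        for child in stack.pop().next_nodes:
            deps[child] = deps.get(child, 0) + 1
            if child not in seen:
                seen.add(child)
                stack.append(child)

    # 2) Dependency-driven traversal: a node runs once all its inputs arrived
    ready = deque([root])  # the ReadyQueue
    while ready:
        node = ready.popleft()
        node.run()
        for child in node.next_nodes:
            deps[child] -= 1
            if deps[child] == 0:
                ready.append(child)

# Diamond-shaped graph: GraphRoot -> (ExpBackward0, MulBackward0) -> AccumulateGrad
leaf = ToyNode("AccumulateGrad")
a, b = ToyNode("ExpBackward0", [leaf]), ToyNode("MulBackward0", [leaf])
toy_backward(ToyNode("GraphRoot", [a, b]))
# executing GraphRoot, ExpBackward0, MulBackward0, AccumulateGrad
```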

        - -## Flow with Reentrant Backward - -As we saw above, the reentrant backward problem is when the currently executed function does a nested call to `backward`. When this happens, the thread running this function goes all the way down to `execute_with_graph_task` as in the non-reentrant case, but here is when things are different. - -```c++ -c10::intrusive_ptr Engine::execute_with_graph_task( - const std::shared_ptr& graph_task, - std::shared_ptr graph_root, - InputBuffer&& input_buffer) { - - initialize_device_threads_pool(); - // Lock mutex for GraphTask. - std::unique_lock lock(graph_task->mutex_); - - auto queue = ready_queue(graph_task->cpu_ready_queue_, input_buffer.device()); - - if (worker_device == NO_DEVICE) { - //Regular case - } else { - // If worker_device is any devices (i.e. CPU, CUDA): this is a re-entrant - // backward call from that device. - graph_task->owner_ = worker_device; - - // Now that all the non-thread safe fields of the graph_task have been populated, - // we can enqueue it. - queue->push(NodeTask(graph_task, std::move(graph_root), std::move(input_buffer))); - - if (current_depth >= max_recursion_depth_) { - // If reached the max depth, switch to a different thread - add_thread_pool_task(graph_task); - } else { - ++total_depth; - ++current_depth; - lock.unlock(); - thread_main(graph_task); - --current_depth; - --total_depth; - } - } - return graph_task->future_result_; -} - -``` - -Here, `execute_with_graph_task` detects this as a reentrant call and then looks for the current number of nested calls. If it exceeds the limit, we create a new thread to take care of the execution of this graph, and if not, we execute this reentrant call regularly. -The limit of nested calls was originally set to avoid stack overflow due to reentrant calls creating very large call stacks. However, the number was further reduced when sanitizer tests were added because of the maximum amount of locks a thread can hold at a given moment. This can be seen in [`torch/csrc/autograd/engine.h`](https://github.com/pytorch/pytorch/blob/bc2c6edaf163b1a1330e37a6e34caf8c553e4755/torch/csrc/autograd/engine.h#L36-L42). - - -When this maximum depth is exceeded, a new thread is created with the [`add_thread_pool_task`](https://github.com/pytorch/pytorch/blob/bc2c6edaf163b1a1330e37a6e34caf8c553e4755/torch/csrc/autograd/engine.cpp#L1239-L1255) function. - -```c++ -void Engine::add_thread_pool_task(const std::weak_ptr& graph_task) { - std::unique_lock lck(thread_pool_shared_->mutex_); - // if we have pending graph_task objects to be processed, create a worker. - bool create_thread = (thread_pool_shared_->num_workers_ <= thread_pool_shared_->graphtasks_queue_.size()); - thread_pool_shared_->graphtasks_queue_.push(graph_task); - - - lck.unlock(); - if (create_thread) { - std::thread t(&Engine::reentrant_thread_init, this); - t.detach(); - } - - thread_pool_shared_->work_.notify_one(); -} - - - -``` - -Before going in-depth, let's look at the `thread_pool_shared_` object in the [`Engine`](https://github.com/pytorch/pytorch/blob/bc2c6edaf163b1a1330e37a6e34caf8c553e4755/torch/csrc/autograd/engine.h#L421) which manages all the information related to the threads associated to the reentrant backward calls. 
- -```c++ - struct ThreadPoolShared { - unsigned int num_workers_; - std::condition_variable work_; - std::mutex mutex_; - std::queue> graphtasks_queue_; - - // NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init) - ThreadPoolShared() : num_workers_(0) {} - }; - - - - ``` - -[`ThreadPoolShared`](https://github.com/pytorch/pytorch/blob/bc2c6edaf163b1a1330e37a6e34caf8c553e4755/torch/csrc/autograd/engine.h#L398-L414) is a simple container holding a queue of `GraphTask` objects with synchronization mechanisms and the number of current workers. - -Now it is easy to understand how `add_thread_pool_task` creates a thread when there are `graph_task` objects enqueued and insufficient workers to process them. - -`add_thread_pool_task` initializes a thread by executing [`reentrant_thread_init`](https://github.com/pytorch/pytorch/blob/bc2c6edaf163b1a1330e37a6e34caf8c553e4755/torch/csrc/autograd/engine.cpp#L471-L493) - -```c++ -void Engine::reentrant_thread_init() { - at::init_num_threads(); - auto tp_shared = thread_pool_shared_; - while(true) { - std::unique_lock lk(tp_shared->mutex_); - ++thread_pool_shared_->num_workers_; - tp_shared->work_.wait(lk, [&tp_shared]{ return !tp_shared->graphtasks_queue_.empty();}); - --thread_pool_shared_->num_workers_; - auto task = tp_shared->graphtasks_queue_.front(); - tp_shared->graphtasks_queue_.pop(); - lk.unlock(); - std::shared_ptr graph_task; - if (!(graph_task = task.lock())) { - continue; - } - set_device(graph_task->owner_); - // set the local_ready_queue to the ready queue on the graph_task->owner_ device - local_ready_queue = ready_queue_by_index(graph_task->cpu_ready_queue_, graph_task->owner_); - total_depth = graph_task->reentrant_depth_; - thread_main(graph_task); - } -} - - - -``` - -The code is straightforward. The newly created thread waits on the `thread_pool_shared->graphtasks_queue_` for reentrant backward graphs to be available and executes them. Notice that this thread uses the task-ready queue associated with the device of the thread that started this call by accessing the `graph_task->owner_` field set in the [`execute_with_graph_task`](https://github.com/pytorch/pytorch/blob/bc2c6edaf163b1a1330e37a6e34caf8c553e4755/torch/csrc/autograd/engine.cpp#L1092) function. - -## Error Handling - -Whenever an error happens in one of the worker threads. It will be propagated to the `backward` calling thread. - -To achieve this, there is a try/catch block in the [`thread_main`](https://github.com/pytorch/pytorch/blob/bc2c6edaf163b1a1330e37a6e34caf8c553e4755/torch/csrc/autograd/engine.cpp#L415-L438) that catches any exception in the `Node` function call and sets it to the associated `GraphTask` object. - - -```c++ - try { - … - GraphTaskGuard guard(local_graph_task); - NodeGuard ndguard(task.fn_); - { - evaluate_function( - … - } - } catch (std::exception& e) { - thread_on_exception(local_graph_task, task.fn_, e); - } - } - } - -``` - -[`thread_on_exception`](https://github.com/pytorch/pytorch/blob/bc2c6edaf163b1a1330e37a6e34caf8c553e4755/torch/csrc/autograd/engine.cpp#L495-L500) and the [functions it calls](https://github.com/pytorch/pytorch/blob/bc2c6edaf163b1a1330e37a6e34caf8c553e4755/torch/csrc/autograd/engine.cpp#L605-L621) end up setting the exception in the `local_graph_task` object. 
- -```c++ -void Engine::thread_on_exception( - std::shared_ptr graph_task, - const std::shared_ptr& fn, - std::exception& e) { - graph_task->set_exception(std::current_exception(), fn); -} - -void GraphTask::set_exception_without_signal(const std::shared_ptr& fn) { - if (!has_error_.exchange(true)) { - if (AnomalyMode::is_enabled() && fn) { - fn->metadata()->print_stack(fn->name()); - } - } -} - -void GraphTask::set_exception( - std::exception_ptr eptr, - const std::shared_ptr& fn) { - set_exception_without_signal(fn); - if (!future_completed_.exchange(true)) { - // NOLINTNEXTLINE(performance-move-const-arg) - future_result_->setError(std::move(eptr)); - } -} - -``` - -In `set_exception` it sets the `has_error_` flag to `true` and it calls the [`setError`]() -function of the [`future_result_`](https://github.com/pytorch/pytorch/blob/bc2c6edaf163b1a1330e37a6e34caf8c553e4755/aten/src/ATen/core/ivalue_inl.h#L770-L1322) object. This will make the error to be re-thrown at the caller thread when `future_result_->value()` is accessed. - -```c++ - IValue value() { - std::unique_lock lock(mutex_); - AT_ASSERT(completed()); - if (eptr_) { - std::rethrow_exception(eptr_); - } - return value_; - } - -``` - -# Closing Remarks - -This has been the last post of this series covering how PyTorch does the auto differentiation. We hope you enjoyed reading it and that now you are familiar enough with PyTorch internals to start contributing in PyTorch development! diff --git a/_posts/2022-6-28-pytorch-1.12-new-library-releases.md b/_posts/2022-6-28-pytorch-1.12-new-library-releases.md deleted file mode 100644 index 0070c1372b15..000000000000 --- a/_posts/2022-6-28-pytorch-1.12-new-library-releases.md +++ /dev/null @@ -1,391 +0,0 @@ ---- -layout: blog_detail -title: "New library updates in PyTorch 1.12" -author: Team PyTorch -featured-img: '' ---- - -We are bringing a number of improvements to the current PyTorch libraries, alongside the [PyTorch 1.12 release](https://github.com/pytorch/pytorch/releases/tag/v1.12.0). These updates demonstrate our focus on developing common and extensible APIs across all domains to make it easier for our community to build ecosystem projects on PyTorch. - -Summary: -- **TorchVision** - Added multi-weight support API, new architectures, model variants, and pretrained weight. See the release notes [here](https://github.com/pytorch/vision/releases). -- **TorchAudio** - Introduced beta features including a streaming API, a CTC beam search decoder, and new beamforming modules and methods. See the release notes [here](https://github.com/pytorch/audio/releases). -- **TorchText** - Extended support for scriptable BERT tokenizer and added datasets for GLUE benchmark. See the release notes [here](https://github.com/pytorch/text/releases). -- **TorchRec** - Added EmbeddingModule benchmarks, examples for TwoTower Retrieval, inference and sequential embeddings, metrics, improved planner and demonstrated integration with production components. See the release notes [here](https://github.com/pytorch/torchrec/releases). -- **TorchX** - Launch PyTorch trainers developed on local workspaces onto five different types of schedulers. See the release notes [here](https://github.com/pytorch/torchx/blob/main/CHANGELOG.md?plain=1#L3). -- **FBGemm** - Added and improved kernels for Recommendation Systems inference workloads, including table batched embedding bag, jagged tensor operations, and other special-case optimizations. 
- -## TorchVision v0.13 - -### Multi-weight support API - -TorchVision v0.13 offers a new [Multi-weight support API](https://pytorch.org/blog/introducing-torchvision-new-multi-weight-support-api/) for loading different weights to the existing model builder methods: - -```python -from torchvision.models import * - -# Old weights with accuracy 76.130% -resnet50(weights=ResNet50_Weights.IMAGENET1K_V1) - -# New weights with accuracy 80.858% -resnet50(weights=ResNet50_Weights.IMAGENET1K_V2) - -# Best available weights (currently alias for IMAGENET1K_V2) -# Note that these weights may change across versions -resnet50(weights=ResNet50_Weights.DEFAULT) - -# Strings are also supported -resnet50(weights="IMAGENET1K_V2") - -# No weights - random initialization -resnet50(weights=None) -``` - -The new API bundles along with the weights important details such as the preprocessing transforms and meta-data such as labels. Here is how to make the most out of it: - -```python -from torchvision.io import read_image -from torchvision.models import resnet50, ResNet50_Weights - -img = read_image("test/assets/encode_jpeg/grace_hopper_517x606.jpg") - -# Step 1: Initialize model with the best available weights -weights = ResNet50_Weights.DEFAULT -model = resnet50(weights=weights) -model.eval() - -# Step 2: Initialize the inference transforms -preprocess = weights.transforms() - -# Step 3: Apply inference preprocessing transforms -batch = preprocess(img).unsqueeze(0) - -# Step 4: Use the model and print the predicted category -prediction = model(batch).squeeze(0).softmax(0) -class_id = prediction.argmax().item() -score = prediction[class_id].item() -category_name = weights.meta["categories"][class_id] -print(f"{category_name}: {100 * score:.1f}%") -``` - -You can read more about the new API in the [docs](https://pytorch.org/vision/0.13/models.html). To provide your feedback, please use this dedicated [Github issue](https://github.com/pytorch/vision/issues/5088). - -### New architectures and model variants - -#### Classification - -The [Swin Transformer](https://arxiv.org/abs/2103.14030) and [EfficienetNetV2](https://arxiv.org/abs/2104.00298) are two popular classification models which are often used for downstream vision tasks. This release includes 6 pre-trained weights for their classification variants. Here is how to use the new models: - -```python -import torch -from torchvision.models import * - -image = torch.rand(1, 3, 224, 224) -model = swin_t(weights="DEFAULT").eval() -prediction = model(image) - -image = torch.rand(1, 3, 384, 384) -model = efficientnet_v2_s(weights="DEFAULT").eval() -prediction = model(image) -``` - -In addition to the above, we also provide new variants for existing architectures such as ShuffleNetV2, ResNeXt and MNASNet. 
The accuracies of all the new pre-trained models obtained on ImageNet-1K are seen below:
-
-| **Model**                      | **Acc@1** | **Acc@5** |
-|--------------------------------|-----------|-----------|
-| swin_t                         | 81.474    | 95.776    |
-| swin_s                         | 83.196    | 96.36     |
-| swin_b                         | 83.582    | 96.64     |
-| efficientnet_v2_s              | 84.228    | 96.878    |
-| efficientnet_v2_m              | 85.112    | 97.156    |
-| efficientnet_v2_l              | 85.808    | 97.788    |
-| resnext101_64x4d               | 83.246    | 96.454    |
-| resnext101_64x4d (quantized)   | 82.898    | 96.326    |
-| shufflenet_v2_x1_5             | 72.996    | 91.086    |
-| shufflenet_v2_x1_5 (quantized) | 72.052    | 90.700    |
-| shufflenet_v2_x2_0             | 76.230    | 93.006    |
-| shufflenet_v2_x2_0 (quantized) | 75.354    | 92.488    |
-| mnasnet0_75                    | 71.180    | 90.496    |
-| mnasnet1_3                     | 76.506    | 93.522    |
-
-We would like to thank Hu Ye for contributing the Swin Transformer implementation to TorchVision.
-
-#### (BETA) Object Detection and Instance Segmentation
-
-We have introduced 3 new model variants for RetinaNet, FasterRCNN and MaskRCNN that include several [post-paper architectural optimizations](https://github.com/pytorch/vision/pull/5444) and improved training recipes. All models can be used similarly:
-
-```python
-import torch
-from torchvision.models.detection import *
-
-images = [torch.rand(3, 800, 600)]
-model = retinanet_resnet50_fpn_v2(weights="DEFAULT")
-# model = fasterrcnn_resnet50_fpn_v2(weights="DEFAULT")
-# model = maskrcnn_resnet50_fpn_v2(weights="DEFAULT")
-model.eval()
-prediction = model(images)
-```
-
-Below we present the metrics of the new variants on COCO val2017. In parentheses we denote the improvement over the old variants:
-
-| **Model**                  | **Box mAP** | **Mask mAP** |
-|----------------------------|-------------|--------------|
-| retinanet_resnet50_fpn_v2  | 41.5 (+5.1) | -            |
-| fasterrcnn_resnet50_fpn_v2 | 46.7 (+9.7) | -            |
-| maskrcnn_resnet50_fpn_v2   | 47.4 (+9.5) | 41.8 (+7.2)  |
-
-We would like to thank Ross Girshick, Piotr Dollar, Vaibhav Aggarwal, Francisco Massa and Hu Ye for their past research and contributions to this work.
-
-### New pre-trained weights
-
-#### SWAG weights
-
-The ViT and RegNet model variants offer new pre-trained [SWAG](https://arxiv.org/abs/2201.08371) (Supervised Weakly from hashtAGs) weights. One of the biggest of these models achieves a whopping 88.6% accuracy on ImageNet-1K. We currently offer two versions of the weights: 1) fine-tuned end-to-end weights on ImageNet-1K (highest accuracy) and 2) frozen trunk weights with a linear classifier fit on ImageNet-1K (great for transfer learning).
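Loading one of these checkpoints follows the multi-weight API shown earlier; here is a short sketch of ours that assumes the weight enum names listed in the table that follows and reuses the preprocessing bundled with the weights:

```python
from torchvision.io import read_image
from torchvision.models import vit_b_16, ViT_B_16_Weights

# End-to-end fine-tuned SWAG weights
weights = ViT_B_16_Weights.IMAGENET1K_SWAG_E2E_V1
model = vit_b_16(weights=weights).eval()

# SWAG checkpoints ship their own preprocessing (e.g. a larger input resolution)
preprocess = weights.transforms()
img = read_image("test/assets/encode_jpeg/grace_hopper_517x606.jpg")
prediction = model(preprocess(img).unsqueeze(0)).squeeze(0).softmax(0)
```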
Below we see the detailed accuracies of each model variant: - -| **Model Weights** | **Acc@1** | **Acc@5** | -|--------------------------------------------------|-----------|-----------| -| RegNet_Y_16GF_Weights.IMAGENET1K_SWAG_E2E_V1 | 86.012 | 98.054 | -| RegNet_Y_16GF_Weights.IMAGENET1K_SWAG_LINEAR_V1 | 83.976 | 97.244 | -| RegNet_Y_32GF_Weights.IMAGENET1K_SWAG_E2E_V1 | 86.838 | 98.362 | -| RegNet_Y_32GF_Weights.IMAGENET1K_SWAG_LINEAR_V1 | 84.622 | 97.48 | -| RegNet_Y_128GF_Weights.IMAGENET1K_SWAG_E2E_V1 | 88.228 | 98.682 | -| RegNet_Y_128GF_Weights.IMAGENET1K_SWAG_LINEAR_V1 | 86.068 | 97.844 | -| ViT_B_16_Weights.IMAGENET1K_SWAG_E2E_V1 | 85.304 | 97.65 | -| ViT_B_16_Weights.IMAGENET1K_SWAG_LINEAR_V1 | 81.886 | 96.18 | -| ViT_L_16_Weights.IMAGENET1K_SWAG_E2E_V1 | 88.064 | 98.512 | -| ViT_L_16_Weights.IMAGENET1K_SWAG_LINEAR_V1 | 85.146 | 97.422 | -| ViT_H_14_Weights.IMAGENET1K_SWAG_E2E_V1 | 88.552 | 98.694 | -| ViT_H_14_Weights.IMAGENET1K_SWAG_LINEAR_V1 | 85.708 | 97.73 | - -The SWAG weights are released under the [Attribution-NonCommercial 4.0 International](https://github.com/facebookresearch/SWAG/blob/main/LICENSE) license. We would like to thank Laura Gustafson, Mannat Singh and Aaron Adcock for their work and support in making the weights available to TorchVision. - -#### Model Refresh - -The release of the Multi-weight support API enabled us to refresh the most popular models and offer more accurate weights. We improved on average each model by ~3 points. The new recipe used was learned on top of ResNet50 and its details were covered on a [previous blog post](https://pytorch.org/blog/how-to-train-state-of-the-art-models-using-torchvision-latest-primitives/). - -| **Model** | **Old weights** | **New weights** | -|------------------------------|-----------------|-----------------| -| efficientnet_b1 | 78.642 | 79.838 | -| mobilenet_v2 | 71.878 | 72.154 | -| mobilenet_v3_large | 74.042 | 75.274 | -| regnet_y_400mf | 74.046 | 75.804 | -| regnet_y_800mf | 76.42 | 78.828 | -| regnet_y_1_6gf | 77.95 | 80.876 | -| regnet_y_3_2gf | 78.948 | 81.982 | -| regnet_y_8gf | 80.032 | 82.828 | -| regnet_y_16gf | 80.424 | 82.886 | -| regnet_y_32gf | 80.878 | 83.368 | -| regnet_x_400mf | 72.834 | 74.864 | -| regnet_x_800mf | 75.212 | 77.522 | -| regnet_x_1_6gf | 77.04 | 79.668 | -| regnet_x_3_2gf | 78.364 | 81.196 | -| regnet_x_8gf | 79.344 | 81.682 | -| regnet_x_16gf | 80.058 | 82.716 | -| regnet_x_32gf | 80.622 | 83.014 | -| resnet50 | 76.13 | 80.858 | -| resnet50 (quantized) | 75.92 | 80.282 | -| resnet101 | 77.374 | 81.886 | -| resnet152 | 78.312 | 82.284 | -| resnext50_32x4d | 77.618 | 81.198 | -| resnext101_32x8d | 79.312 | 82.834 | -| resnext101_32x8d (quantized) | 78.986 | 82.574 | -| wide_resnet50_2 | 78.468 | 81.602 | -| wide_resnet101_2 | 78.848 | 82.51 | - -We would like to thank Piotr Dollar, Mannat Singh and Hugo Touvron for their past research and contributions to this work. - -### New Augmentations, Layers and Losses - -This release brings a bunch of new primitives which can be used to produce SOTA models. Some highlights include the addition of [AugMix](https://arxiv.org/abs/1912.02781) data-augmentation method, the [DropBlock](https://arxiv.org/abs/1810.12890) layer, the [cIoU/dIoU](https://arxiv.org/abs/1911.08287) loss and [many more](https://github.com/pytorch/vision/issues/5410). We would like to thank Aditya Oke, Abhijit Deo, Yassine Alouini and Hu Ye for contributing to the project and for helping us maintain TorchVision relevant and fresh. 
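As a small usage sketch of ours (assuming the transform names shipped in v0.13), AugMix drops into a standard augmentation pipeline and operates on PIL images or `uint8` tensors:

```python
import torch
from torchvision import transforms

augment = transforms.Compose([
    transforms.AugMix(severity=3, mixture_width=3),  # new in v0.13
    transforms.ConvertImageDtype(torch.float),
])

img = torch.randint(0, 256, (3, 224, 224), dtype=torch.uint8)  # stand-in image
augmented = augment(img)
```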
- -### Documentation - -We completely revamped our models documentation to make them easier to browse, and added various key information such as supported image sizes, or image pre-processing steps of pre-trained weights. We now have a [main model page](https://pytorch.org/vision/main/models.html) with various [summary tables](https://pytorch.org/vision/main/models.html#table-of-all-available-classification-weights) of available weights, and each model has a [dedicated page](https://pytorch.org/vision/main/models/resnet.html). Each model builder is also documented in their [own page](https://pytorch.org/vision/main/models/generated/torchvision.models.resnet50.html#torchvision.models.resnet50), with more details about the available weights, including accuracy, minimal image size, link to training recipes, and other valuable info. For comparison, our previous models docs are [here](https://pytorch.org/vision/0.12/models.html). To provide feedback on the new documentation, please use the dedicated [Github issue](https://github.com/pytorch/vision/issues/5511). - -## TorchAudio v0.12 - -### (BETA) Streaming API - -

        - - -StreamReader is TorchAudio’s new I/O API. It is backed by FFmpeg†, and allows users to: -- Decode audio and video formats, including MP4 and AAC -- Handle input forms, such as local files, network protocols, microphones, webcams, screen captures and file-like objects -- Iterate over and decode chunk-by-chunk, while changing the sample rate or frame rate -- Apply audio and video filters, such as low-pass filter and image scaling -- Decode video with Nvidia's hardware-based decoder (NVDEC) - -For usage details, please check out the [documentation](https://pytorch.org/audio/0.12.0/io.html#streamreader) and tutorials: -- [Media Stream API - Pt.1](https://pytorch.org/audio/0.12.0/tutorials/streaming_api_tutorial.html) -- [Media Stream API - Pt.2](https://pytorch.org/audio/0.12.0/tutorials/streaming_api2_tutorial.html) -- [Online ASR with Emformer RNN-T](https://pytorch.org/audio/0.12.0/tutorials/online_asr_tutorial.html) -- [Device ASR with Emformer RNN-T](https://pytorch.org/audio/0.12.0/tutorials/device_asr.html) -- [Accelerated Video Decoding with NVDEC](https://pytorch.org/audio/0.12.0/hw_acceleration_tutorial.html) - -† To use StreamReader, FFmpeg libraries are required. Please install FFmpeg. The coverage of codecs depends on how these libraries are configured. TorchAudio official binaries are compiled to work with FFmpeg 4 libraries; FFmpeg 5 can be used if TorchAudio is built from source. - -### (BETA) CTC Beam Search Decoder - -TorchAudio integrates the wav2letter CTC beam search decoder from [Flashlight](https://arxiv.org/pdf/2201.12465.pdf) ([GitHub](https://github.com/flashlight/flashlight)). The addition of this inference time decoder enables running end-to-end CTC ASR evaluation using TorchAudio utils. - -Customizable lexicon and lexicon-free decoders are supported, and both are compatible with KenLM n-gram language models or without using a language model. TorchAudio additionally supports downloading token, lexicon, and pretrained KenLM files for the LibriSpeech dataset. - -For usage details, please check out the [documentation](https://pytorch.org/audio/0.12.0/models.decoder.html#ctcdecoder) and [ASR inference tutorial](https://pytorch.org/audio/0.12.0/tutorials/asr_inference_with_ctc_decoder_tutorial.html). - -### (BETA) New Beamforming Modules and Methods - -To improve flexibility in usage, the release adds two new beamforming modules under torchaudio.transforms: [SoudenMVDR](https://pytorch.org/audio/0.12.0/transforms.html#soudenmvdr) and [RTFMVDR](https://pytorch.org/audio/0.12.0/transforms.html#rtfmvdr). The main differences from [MVDR](https://pytorch.org/audio/0.11.0/transforms.html#mvdr) are: -- Use power spectral density (PSD) and relative transfer function (RTF) matrices as inputs instead of time-frequency masks. The module can be integrated with neural networks that directly predict complex-valued STFT coefficients of speech and noise -- Add \'reference_channel\' as an input argument in the forward method, to allow users to select the reference channel in model training or dynamically change the reference channel in inference - -Besides the two modules, new function-level beamforming methods are added under torchaudio.functional. 
These include: -- [psd](https://pytorch.org/audio/0.12.0/functional.html#psd) -- [mvdr_weights_souden](https://pytorch.org/audio/0.12.0/functional.html#mvdr-weights-souden) -- [mvdr_weights_rtf](https://pytorch.org/audio/0.12.0/functional.html#mvdr-weights-rtf) -- [rtf_evd](https://pytorch.org/audio/0.12.0/functional.html#rtf-evd) -- [rtf_power](https://pytorch.org/audio/0.12.0/functional.html#rtf-power) -- [apply_beamforming](https://pytorch.org/audio/0.12.0/functional.html#apply-beamforming) - -For usage details, please check out the documentation at [torchaudio.transforms](https://pytorch.org/audio/0.12.0/transforms.html#multi-channel) and [torchaudio.functional](https://pytorch.org/audio/0.12.0/functional.html#multi-channel) and the [Speech Enhancement with MVDR Beamforming tutorial](https://pytorch.org/audio/0.12.0/tutorials/mvdr_tutorial.html). - -## TorchText v0.13 - -### Glue Datasets - -We increased the number of datasets in TorchText from 22 to 30 by adding the remaining 8 datasets from the GLUE benchmark (SST-2 was already supported). The complete list of GLUE datasets is as follows: -- [CoLA](https://nyu-mll.github.io/CoLA/) ([paper](https://arxiv.org/pdf/1805.12471.pdf)): Single sentence binary classification acceptability task -- [SST-2](https://nlp.stanford.edu/sentiment/) ([paper](https://nlp.stanford.edu/~socherr/EMNLP2013_RNTN.pdf)): Single sentence binary classification sentiment task -- [MRPC](https://nlp.stanford.edu/~socherr/EMNLP2013_RNTN.pdf) ([paper](https://www.microsoft.com/en-us/research/wp-content/uploads/2016/02/I05-50025B15D.pdf)): Dual sentence binary classification paraphrase task -- [QQP](https://quoradata.quora.com/First-Quora-Dataset-Release-Question-Pairs): Dual sentence binary classification paraphrase task -- [STS-B](https://ixa2.si.ehu.eus/stswiki/index.php/STSbenchmark) ([paper](https://aclanthology.org/S17-2001.pdf)): Single sentence to float regression sentence similarity task -- [MNLI](https://cims.nyu.edu/~sbowman/multinli/) ([paper](https://cims.nyu.edu/~sbowman/multinli/paper.pdf)): Sentence ternary classification NLI task -- [QNLI](https://gluebenchmark.com/) ([paper](https://arxiv.org/pdf/1804.07461.pdf)): Sentence binary classification QA and NLI tasks -- [RTE](https://aclweb.org/aclwiki/Recognizing_Textual_Entailment) ([paper](https://arxiv.org/pdf/2010.03061.pdf)): Dual sentence binary classification NLI task -- [WNLI](https://cs.nyu.edu/~davise/papers/WinogradSchemas/WS.html) ([paper](http://commonsensereasoning.org/2011/papers/Levesque.pdf)): Dual sentence binary classification coreference and NLI tasks - -### Scriptable BERT Tokenizer - -TorchText has extended support for scriptable tokenizer by adding the WordPiece tokenizer used in BERT. It is one of the commonly used algorithms for splitting input text into sub-words units and was introduced in [Japanese and Korean Voice Search (Schuster et al., 2012)](https://static.googleusercontent.com/media/research.google.com/ja//pubs/archive/37842.pdf). - -TorchScriptabilty support would allow users to embed the BERT text-preprocessing natively in C++ without needing the support of python runtime. As TorchText now supports the CMAKE build system to natively link torchtext binaries with application code, users can easily integrate BERT tokenizers for deployment needs. - -For usage details, please refer to the corresponding [documentation](https://pytorch.org/text/main/transforms.html#torchtext.transforms.BERTTokenizer). 
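As a rough usage sketch (the vocab URL below is only an illustrative stand-in; any BERT WordPiece vocab file can be used):

```python
import torch
import torchtext.transforms as T

# Illustrative location of a WordPiece vocab file
VOCAB_FILE = "https://huggingface.co/bert-base-uncased/resolve/main/vocab.txt"

tokenizer = T.BERTTokenizer(vocab_path=VOCAB_FILE, do_lower_case=True, return_tokens=True)
print(tokenizer("Hello World, How are you!"))  # sub-word tokens

# The transform is TorchScript-able for deployment without a Python runtime
scripted_tokenizer = torch.jit.script(tokenizer)
print(scripted_tokenizer("Hello World, How are you!"))
```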
- -## TorchRec v0.2.0 - -### EmbeddingModule + DLRM benchmarks - -A set of [benchmarking tests](https://github.com/pytorch/torchrec/tree/main/benchmarks), showing performance characteristics of TorchRec’s base modules and research models built out of TorchRec. - -### TwoTower Retrieval Example, with FAISS - -We provide an [example](https://github.com/pytorch/torchrec/tree/main/examples/retrieval) demonstrating training a distributed TwoTower (i.e. User-Item) Retrieval model that is sharded using TorchRec. The projected item embeddings are added to an IVFPQ FAISS index for candidate generation. The retrieval model and KNN lookup are bundled in a Pytorch model for efficient end-to-end retrieval. - -### Integrations - -We demonstrate that TorchRec works out of the box with many components commonly used alongside PyTorch models in production like systems, such as -- [Training](https://github.com/pytorch/torchrec/tree/main/examples/ray) a TorchRec model on Ray Clusters utilizing the Torchx Ray scheduler -- [Preprocessing](https://github.com/pytorch/torchrec/tree/main/torchrec/datasets/scripts/nvt) and DataLoading with NVTabular on DLRM -- [Training](https://github.com/pytorch/torchrec/tree/main/examples/torcharrow) a TorchRec model with on-the-fly preprocessing with TorchArrow showcasing RecSys domain UDFs - -### Sequential Embeddings Example: Bert4Rec - -We provide an [example](https://github.com/pytorch/torchrec/tree/main/examples/bert4rec), using TorchRec, that reimplements the [BERT4REC](https://arxiv.org/abs/1904.06690) paper, showcasing EmbeddingCollection for non-pooled embeddings. Using DistributedModelParallel we see a 35% QPS gain over conventional data parallelism. - -### (Beta) Planner - -The TorchRec library includes a built-in [planner](https://pytorch.org/torchrec/torchrec.distributed.planner.html) that selects near optimal sharding plan for a given model. The planner attempts to identify the best sharding plan by evaluating a series of proposals which are statically analyzed and fed into an integer partitioner. The planner is able to automatically adjust plans for a wide range of hardware setups, allowing users to scale performance seamlessly from local development environment to large scale production hardware. See this [notebook](https://github.com/pytorch/torchrec/blob/main/torchrec/distributed/planner/Planner_Introduction.ipynb) for a more detailed tutorial. - -### (Beta) Inference - -[TorchRec Inference](https://github.com/pytorch/torchrec/tree/main/torchrec/inference) is a C++ library that supports multi-gpu inference. The TorchRec library is used to shard models written and packaged in Python via torch.package (an alternative to TorchScript). The torch.deploy library is used to serve inference from C++ by launching multiple Python interpreters carrying the packaged model, thus subverting the GIL. Two models are provided as examples: [DLRM multi-GPU](https://github.com/pytorch/torchrec/blob/main/examples/inference/dlrm_predict.py) (sharded via TorchRec) and [DLRM single-GPU](https://github.com/pytorch/torchrec/blob/main/examples/inference/dlrm_predict_single_gpu.py). - -### (Beta) RecMetrics - -RecMetrics is a [metrics](https://github.com/pytorch/torchrec/tree/main/torchrec/metrics) library that collects common utilities and optimizations for Recommendation models. It extends [torchmetrics](https://torchmetrics.readthedocs.io/en/stable/). 
-- A centralized metrics module that allows users to add new metrics -- Commonly used metrics, including AUC, Calibration, CTR, MSE/RMSE, NE & Throughput -- Optimization for metrics related operations to reduce the overhead of metric computation -- Checkpointing - -### (Prototype) Single process Batched + Fused Embeddings - -Previously TorchRec’s abstractions (EmbeddingBagCollection/EmbeddingCollection) over FBGEMM kernels, which provide benefits such as table batching, optimizer fusion, and UVM placement, could only be used in conjunction with DistributedModelParallel. We’ve decoupled these notions from sharding, and introduced the [FusedEmbeddingBagCollection](https://github.com/pytorch/torchrec/blob/eb1247d8a2d16edc4952e5c2617e69acfe5477a5/torchrec/modules/fused_embedding_modules.py#L271), which can be used as a standalone module, with all of the above features, and can also be sharded. - -## TorchX v0.2.0 - -TorchX is a job launcher that makes it easier to run PyTorch in distributed training clusters with many scheduler integrations including Kubernetes and Slurm. We're excited to release TorchX 0.2.0 with a number of improvements. TorchX is currently being used in production in both on-premise and cloud environments. - -Check out the [quickstart](https://pytorch.org/torchx/main/quickstart.html) to start launching local and remote jobs. - -### Workspaces - -TorchX [now supports workspaces](https://pytorch.org/torchx/main/workspace.html) which allows users to easily launch training jobs using their local workspace. TorchX can automatically build a patch with your local training code on top of a base image to minimize iteration time and time to training. - -### .torchxconfig - -Specifying options in [.torchxconfig](https://pytorch.org/torchx/latest/runner.config.html) saves you from having to type long CLI commands each time you launch a job. You can also define project level generic configs and drop a config file in your home directory for user-level overrides. - -### Expanded Scheduler Support - -TorchX now supports [AWS Batch](https://pytorch.org/torchx/main/schedulers/aws_batch.html) and [Ray (experimental)](https://pytorch.org/torchx/main/schedulers/ray.html) schedulers in addition to our existing integrations. - -### Distributed Training On All Schedulers - -The TorchX dist.ddp component now works on all schedulers without any configuration. Distributed training workers will automatically discover each other when using [torchelastic](https://pytorch.org/docs/stable/distributed.elastic.html) via [the builtin dist.ddp component](https://pytorch.org/torchx/main/components/distributed.html). - -### Hyper Parameter Optimization - -TorchX [integrates with Ax](https://ax.dev/versions/latest/api/runners.html#module-ax.runners.torchx) to let you scale hyper-parameter optimizations (HPO) by launching the search trials onto remote clusters. - -### File and Device Mounts - -TorchX now supports [remote filesystem mounts and custom devices](https://pytorch.org/torchx/main/specs.html#mounts). This enables your PyTorch jobs to efficiently access cloud storage such as NFS or Lustre. The device mounts enables usage of network accelerators like Infiniband and custom inference/training accelerators. - -## FBGemm v0.2.0 - -The FBGEMM library contains optimized kernels meant to improve the performance of PyTorch workloads. We’ve added a number of new features and optimizations over the last few months that we are excited to report. 
- -### Inference Table Batched Embedding (TBE) - -The [table batched embedding bag](https://github.com/pytorch/FBGEMM/blob/main/fbgemm_gpu/fbgemm_gpu/split_table_batched_embeddings_ops.py#L1541) (TBE) operator is an important base operation for embedding lookup for recommendation system inference on GPU. We added the following enhancements for performance and flexibility: - -Alignment restriction removed -- Embedding dimension \* data type size had to be multiple of 4B before and now, it is 1B. - -Unified Virtual Memory (UVM) caching kernel optimizations -- UVM caching kernels now scale linearly with # of tables using UVM caching. Previously, it was having similar overhead as all tables using UVM caching -- UVM caching kernel overhead is much smaller than before - -### Inference FP8 Table Batched Embedding (TBE) - -The [table batched embedding bag](https://github.com/pytorch/FBGEMM/blob/main/fbgemm_gpu/fbgemm_gpu/split_table_batched_embeddings_ops.py#L1541) (TBE) previously supported FP32, FP16, INT8, INT4, and INT2 embedding weight types. While these weight types work well in many models, we integrate FP8 weight types (in both GPU and CPU operations) to allow for numerical and performance evaluations of FP8 in our models. Compared to INT8, FP8 does not require the additional bias and scale storage and calculations. Additionally, the next generation of H100 GPUs has the FP8 support on Tensor Core (mainly matmul ops). - -### Jagged Tensor Kernels - -We added optimized kernels to speed up [TorchRec JaggedTensor](https://pytorch.org/torchrec/torchrec.sparse.html). The purpose of JaggedTensor is to handle the case where one dimension of the input data is “jagged”, meaning that each consecutive row in a given dimension may be a different length, which is often the case with sparse feature inputs in recommendation systems. The internal representation is shown below: - -

        - -We added ops for converting jagged tensors from sparse to dense formats and back, performing matrix multiplications with jagged tensors, and elementwise ops. - -### Optimized permute102-baddbmm-permute102 - -It is difficult to fuse various matrix multiplications where the batch size is not the batch size of the model, switching the batch dimension is a quick solution. We created the permute102_baddbmm_permute102 operation that switches the first and the second dimension, performs the batched matrix multiplication and then switches back. Currently we only support forward pass with FP16 data type and will support FP32 type and backward pass in the future. - -### Optimized index_select for dim 0 index selection - -index_select is normally used as part of a sparse operation. While PyTorch supports a generic index_select for an arbitrary-dimension index selection, its performance for a special case like the dim 0 index selection is suboptimal. For this reason, we implement a specialized index_select for dim 0. In some cases, we have observed 1.4x performance gain from FBGEMM’s index_select compared to the one from PyTorch (using uniform index distribution). - -More about the implementation of influential instances can be found on our [GitHub](https://github.com/pytorch/captum/tree/master/captum/influence) page and [tutorials](https://captum.ai/tutorials/TracInCP_Tutorial). - -Thanks for reading, If you’re interested in these updates and want to join the PyTorch community, we encourage you to join the [discussion forums](https://discuss.pytorch.org/) and [open GitHub issues](https://github.com/pytorch/pytorch/issues). To get the latest news from PyTorch, follow us on [Twitter](https://twitter.com/PyTorch), [Medium](https://medium.com/pytorch), [YouTube](https://www.youtube.com/pytorch), and [LinkedIn](https://www.linkedin.com/company/pytorch). - -Cheers! - -Team PyTorch diff --git a/_posts/2022-6-28-pytorch-1.12-released.md b/_posts/2022-6-28-pytorch-1.12-released.md deleted file mode 100644 index 434984eac130..000000000000 --- a/_posts/2022-6-28-pytorch-1.12-released.md +++ /dev/null @@ -1,208 +0,0 @@ ---- -layout: blog_detail -title: "PyTorch 1.12: TorchArrow, Functional API for Modules and nvFuser, are now available" -author: Team PyTorch -featured-img: '' ---- - -We are excited to announce the release of PyTorch 1.12 ([release note](https://github.com/pytorch/pytorch/releases/tag/v1.12.0))! This release is composed of over 3124 commits, 433 contributors. Along with 1.12, we are releasing beta versions of AWS S3 Integration, PyTorch Vision Models on Channels Last on CPU, Empowering PyTorch on Intel® Xeon® Scalable processors with Bfloat16 and FSDP API. We want to sincerely thank our dedicated community for your contributions. - -Summary: -- Functional APIs to functionally apply module computation with a given set of parameters -- Complex32 and Complex Convolutions in PyTorch -- DataPipes from TorchData fully backward compatible with DataLoader -- functorch with improved coverage for APIs -- nvFuser a deep learning compiler for PyTorch -- Changes to float32 matrix multiplication precision on Ampere and later CUDA hardware -- TorchArrow, a new beta library for machine learning preprocessing over batch data - -## Frontend APIs - -### Introducing TorchArrow - -We’ve got a new Beta release ready for you to try and use: TorchArrow. This is a library for machine learning preprocessing over batch data. 
It features a performant and Pandas-style, easy-to-use API in order to speed up your preprocessing workflows and development. - -Currently, it provides a Python DataFrame interface with the following features: -- High-performance CPU backend, vectorized and extensible User-Defined Functions (UDFs) with [Velox](https://github.com/facebookincubator/velox) -- Seamless handoff with PyTorch or other model authoring, such as Tensor collation and easily plugging into PyTorch DataLoader and DataPipes -- Zero copy for external readers via Arrow in-memory columnar format - -For more details, please find our [10-min tutorial](https://github.com/pytorch/torcharrow/blob/main/tutorial/tutorial.ipynb), installation [instructions](https://github.com/pytorch/torcharrow#installation), API [documentation](https://pytorch.org/torcharrow/beta/), and a [prototype](https://github.com/pytorch/torchrec/tree/main/examples/torcharrow) for data preprocessing in TorchRec. - -### (Beta) Functional API for Modules - -PyTorch 1.12 introduces a new beta feature to functionally apply Module computation with a given set of parameters. Sometimes, the traditional PyTorch Module usage pattern that maintains a static set of parameters internally is too restrictive. This is often the case when implementing algorithms for meta-learning, where multiple sets of parameters may need to be maintained across optimizer steps. - -The new ``torch.nn.utils.stateless.functional_call()`` API allows for: -- Module computation with full flexibility over the set of parameters used -- No need to reimplement your module in a functional way -- Any parameter or buffer present in the module can be swapped with an externally-defined value for use in the call. Naming for referencing parameters / buffers follows the fully-qualified form in the module’s ``state_dict()`` - -Example: -```python -import torch -from torch import nn -from torch.nn.utils.stateless import functional_call - -class MyModule(nn.Module): - def __init__(self): - super().__init__() - self.fc1 = nn.Linear(3, 3) - self.bn = nn.BatchNorm1d(3) - self.fc2 = nn.Linear(3, 3) - - def forward(self, x): - return self.fc2(self.bn(self.fc1(x))) - -m = MyModule() - -# Define parameter / buffer values to use during module computation. -my_weight = torch.randn(3, 3, requires_grad=True) -my_bias = torch.tensor([1., 2., 3.], requires_grad=True) -params_and_buffers = { - 'fc1.weight': my_weight, - 'fc1.bias': my_bias, - # Custom buffer values can be used too. - 'bn.running_mean': torch.randn(3), -} - -# Apply module computation to the input with the specified parameters / buffers. -inp = torch.randn(5, 3) -output = functional_call(m, params_and_buffers, inp) -``` - -### (Beta) Complex32 and Complex Convolutions in PyTorch - -PyTorch today natively supports complex numbers, complex autograd, complex modules, and numerous complex operations, including linear algebra and Fast Fourier Transform (FFT) operators. Many libraries, including torchaudio and ESPNet, already make use of complex numbers in PyTorch, and PyTorch 1.12 further extends complex functionality with complex convolutions and the experimental complex32 (“complex half”) data type that enables half precision FFT operations. Due to the bugs in CUDA 11.3 package, we recommend using CUDA 11.6 package from wheels if you are using complex numbers. - -### (Beta) Forward-mode Automatic Differentiation - -Forward-mode AD allows the computation of directional derivatives (or equivalently, Jacobian-vector products) eagerly in the forward pass. 
PyTorch 1.12 significantly improves the operator coverage for forward-mode AD. See our [tutorial](https://pytorch.org/tutorials/search.html?q=forward-mode+automatic+differentiation+%28beta%29&check_keywords=yes&area=default#) for more information. - -### TorchData - -#### BC DataLoader + DataPipe - -\`DataPipe\` from TorchData becomes fully backward compatible with the existing \`DataLoader\` regarding shuffle determinism and dynamic sharding in both multiprocessing and distributed environments. - -#### (Beta) AWS S3 Integration - -DataPipes based on [AWSSDK](https://github.com/aws/aws-sdk-cpp) have been integrated into TorchData. It provides the following features backed by native AWSSDK: -- Retrieve list of urls from each S3 bucket based on prefix - - Support timeout to prevent hanging indefinitely - - Support to specify S3 bucket region - -- Load data from S3 urls - - Support buffered and multi-part download - - Support to specify S3 bucket region - -AWS native DataPipes are still in the beta phase. And, we will keep tuning them to improve their performance. - -#### (Prototype) DataLoader2 - -DataLoader2 became available in prototype mode. We are introducing new ways to interact between DataPipes, DataLoading API, and backends (aka ReadingServices). Feature is stable in terms of API, but functionally not complete yet. We welcome early adopters and feedback, as well as potential contributors. - -For more details, please checkout the [link](https://github.com/pytorch/data/tree/main/torchdata/dataloader2). - -### functorch - -Inspired by [Google JAX](https://github.com/google/jax), functorch is a library that offers composable vmap (vectorization) and autodiff transforms. It enables advanced autodiff use cases that would otherwise be tricky to express in PyTorch. Examples of these include: -- [running ensembles of models on a single machine](https://pytorch.org/functorch/stable/notebooks/ensembling.html) -- [efficiently computing Jacobians and Hessians](https://pytorch.org/functorch/stable/notebooks/jacobians_hessians.html) -- [computing per-sample-gradients (or other per-sample quantities)](https://pytorch.org/functorch/stable/notebooks/per_sample_grads.html) - -We’re excited to announce functorch 0.2.0 with a number of improvements and new experimental features. - -#### Significantly improved coverage - -We significantly improved coverage for ``functorch.jvp`` (our forward-mode autodiff API) and other APIs that rely on it (``functorch.{jacfwd, hessian}``). - -#### (Prototype) functorch.experimental.functionalize - -Given a function f, ``functionalize(f)`` returns a new function without mutations (with caveats). This is useful for constructing traces of PyTorch functions without in-place operations. For example, you can use ``make_fx(functionalize(f))`` to construct a mutation-free trace of a pytorch function. To learn more, please see the [documentation](https://pytorch.org/functorch/stable/). - -For more details, please see our [installation instructions](https://pytorch.org/functorch/stable/install.html), [documentation](https://pytorch.org/functorch/), [tutorials](https://pytorch.org/functorch), and [release notes](https://github.com/pytorch/functorch/releases). - -## Performance Improvements - -### Introducing nvFuser, a deep learning compiler for PyTorch - -In PyTorch 1.12, Torchscript is updating its default fuser (for Volta and later CUDA accelerators) to nvFuser, which supports a wider range of operations and is faster than NNC, the previous fuser for CUDA devices. 
A soon to be published blog post will elaborate on nvFuser and show how it speeds up training on a variety of networks. - -See [the nvFuser documentation](https://github.com/pytorch/pytorch/blob/release/1.12/torch/csrc/jit/codegen/cuda/README.md) for more details on usage and debugging. - -### Changes to float32 matrix multiplication precision on Ampere and later CUDA hardware - -PyTorch supports a variety of “mixed precision” techniques, like the torch.amp (Automated Mixed Precision) module and performing float32 matrix multiplications using the TensorFloat32 datatype on Ampere and later CUDA hardware for faster internal computations. In PyTorch 1.12 we’re changing the default behavior of float32 matrix multiplications to always use full IEEE fp32 precision, which is more precise but slower than using the TensorFloat32 datatype for internal computation. For devices with a particularly high ratio of TensorFloat32 to float32 throughput such as A100, this change in defaults can result in a large slowdown. - -If you’ve been using TensorFloat32 matrix multiplications then you can continue to do so by setting ``torch.backends.cuda.matmul.allow_tf32 = True`` - -which is supported since PyTorch 1.7. Starting in PyTorch 1.12 the new matmul precision API can be used, too: ``torch.set_float32_matmul_precision(“highest”|”high”|”medium”)`` - -To reiterate, PyTorch’s new default is “highest” precision for all device types. We think this provides better consistency across device types for matrix multiplications. Documentation for the new precision API can be found [here](https://pytorch.org/docs/master/generated/torch.set_float32_matmul_precision.html?highlight=precision#torch.set_float32_matmul_precision). Setting the “high” or “medium” precision types will enable TensorFloat32 on Ampere and later CUDA devices. If you’re updating to PyTorch 1.12 then to preserve the current behavior and faster performance of matrix multiplications on Ampere devices, set precision to “high”. - -Using mixed precision techniques is essential for training many modern deep learning networks efficiently, and if you’re already using torch.amp this change is unlikely to affect you. If you’re not familiar with mixed precision training then see our soon to be published “What Every User Should Know About Mixed Precision Training in PyTorch” blogpost. - -### (Beta) Accelerating PyTorch Vision Models with Channels Last on CPU - -Memory formats have a significant impact on performance when running vision models, generally Channels Last is more favorable from a performance perspective due to better data locality. 1.12 includes fundamental concepts of memory formats and demonstrates performance benefits using Channels Last on popular PyTorch vision models on Intel® Xeon® Scalable processors. -- Enables Channels Last memory format support for the commonly used operators in CV domain on CPU, applicable for both inference and training -- Provides native level optimization on Channels Last kernels from ATen, applicable for both AVX2 and AVX512 -- Delivers 1.3x to 1.8x inference performance gain over Channels First for TorchVision models on Intel® Xeon® Ice Lake (or newer) CPUs - -### (Beta) Empowering PyTorch on Intel® Xeon® Scalable processors with Bfloat16 - -Reduced precision numeric formats like bfloat16 improves PyTorch performance across multiple deep learning training workloads. 
PyTorch 1.12 includes the latest software enhancements on bfloat16 which applies to a broader scope of user scenarios and showcases even higher performance gains. The main improvements include: -- 2x hardware compute throughput vs. float32 with the new bfloat16 native instruction VDPBF16PS, introduced on Intel® Xeon® Cooper Lake CPUs -- 1/2 memory footprint of float32, faster speed for memory bandwidth intensive operators -- 1.4x to 2.2x inference performance gain over float32 for TorchVision models on Intel® Xeon® Cooper Lake (or newer) CPUs - -### (Prototype) Introducing Accelerated PyTorch Training on Mac - -With the PyTorch 1.12 release, developers and researchers can now take advantage of Apple silicon GPUs for significantly faster model training. This unlocks the ability to perform machine learning workflows like prototyping and fine-tuning locally, right on Mac. Accelerated GPU training is enabled using Apple’s Metal Performance Shaders (MPS) as a backend. The benefits include performance speedup from accelerated GPU training and the ability to train larger networks or batch sizes locally. Learn more [here](https://pytorch.org/blog/introducing-accelerated-pytorch-training-on-mac/). - -
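A minimal sketch of what this looks like in practice (assuming a macOS build of PyTorch 1.12 with MPS support):

```python
import torch
from torch import nn

# Use the Apple silicon GPU when available, otherwise fall back to CPU
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")

model = nn.Sequential(nn.Conv2d(3, 16, kernel_size=3), nn.ReLU()).to(device)
x = torch.rand(8, 3, 64, 64, device=device)
y = model(x)  # the forward pass runs on the MPS device when available
print(y.device)
```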

Accelerated GPU training and evaluation speedups over CPU-only (times faster)
        - -Alongside the new MPS device support, the M1 binaries for Core and Domain libraries that have been available for the last few releases are now an official prototype feature. These binaries can be used to run PyTorch natively on Apple Silicon. - -### (Prototype) BetterTransformer: Fastpath execution for Transformer Encoder Inference - -PyTorch now supports CPU and GPU fastpath implementations (“BetterTransformer”) for several Transformer Encoder modules including TransformerEncoder, TransformerEncoderLayer, and MultiHeadAttention (MHA). The BetterTransformer fastpath architecture Better Transformer is consistently faster – 2x for many common execution scenarios, depending on model and input characteristics. The new BetterTransformer-enabled modules are API compatible with previous releases of the PyTorch Transformer API and will accelerate existing models if they meet fastpath execution requirements, as well as read models trained with previous versions of PyTorch. PyTorch 1.12 includes: -- BetterTransformer integration for Torchtext’s pretrained RoBERTa and XLM-R models -- Torchtext which builds on the PyTorch Transformer API -- Fastpath execution for improved performance by reducing execution overheads with fused kernels which combines multiple operators into a single kernel -- Option to achieve additional speedups by taking advantage of data sparsity during the processing of padding tokens in natural-language processing (by setting enable_nested_tensor=True when creating a TransformerEncoder) -- Diagnostics to help users understand why fastpath execution did not occur - - -
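Putting these pieces together, a hedged inference sketch (assuming PyTorch 1.12) that opts into the sparsity optimization via enable_nested_tensor and a padding mask:

```python
import torch
from torch import nn

encoder_layer = nn.TransformerEncoderLayer(d_model=512, nhead=8, batch_first=True)
encoder = nn.TransformerEncoder(encoder_layer, num_layers=6, enable_nested_tensor=True)
encoder.eval()  # fastpath execution applies to inference

src = torch.rand(32, 128, 512)                         # (batch, sequence, features)
padding_mask = torch.zeros(32, 128, dtype=torch.bool)
padding_mask[:, 100:] = True                           # mark the padded positions
with torch.no_grad():                                  # no gradient tape, so fastpath is eligible
    out = encoder(src, src_key_padding_mask=padding_mask)
```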

        - -## Distributed - -### (Beta) Fully Sharded Data Parallel (FSDP) API - -[FSDP API](https://pytorch.org/blog/introducing-pytorch-fully-sharded-data-parallel-api/) helps easily scale large model training by sharding a model’s parameters, gradients and optimizer states across data parallel workers while maintaining the simplicity of data parallelism. The prototype version was released in PyTorch 1.11 with a minimum set of features that helped [scaling tests of models with up to 1T parameters](https://medium.com/pytorch/training-a-1-trillion-parameter-model-with-pytorch-fully-sharded-data-parallel-on-aws-3ac13aa96cff). - -In this beta release, FSDP API added the following features to support various production workloads. Highlights of the the newly added features in this beta release include: -1. Universal sharding strategy API - Users can easily change between sharding strategies with a single line change, and thus compare and use DDP (only data sharding), FSDP (full model and data sharding), or Zero2 (only sharding of optimizer and gradients) to optimize memory and performance for their specific training needs -2. Fine grained mixed precision policies - Users can specify a mix of half and full data types (bfloat16, fp16 or fp32) for model parameters, gradient communication, and buffers via mixed precision policies. Models are automatically saved in fp32 to allow for maximum portability -3. Transformer auto wrapping policy - allows for optimal wrapping of Transformer based models by registering the models layer class, and thus accelerated training performance -4. Faster model initialization using device_id init - initialization is performed in a streaming fashion to avoid OOM issues and optimize init performance vs CPU init -5. Rank0 streaming for full model saving of larger models - Fully sharded models can be saved by all GPU’s streaming their shards to the rank 0 GPU, and the model is built in full state on the rank 0 CPU for saving - -For more details and example code, please checkout the [documentation](https://pytorch.org/docs/1.11/fsdp.html?highlight=fsdp#module-torch.distributed.fsdp) and the [tutorial](https://pytorch.org/tutorials/intermediate/FSDP_tutorial.html). - - -Thanks for reading, If you’re interested in these updates and want to join the PyTorch community, we encourage you to join the [discussion forums](https://discuss.pytorch.org/) and [open GitHub issues](https://github.com/pytorch/pytorch/issues). To get the latest news from PyTorch, follow us on [Twitter](https://twitter.com/PyTorch), [Medium](https://medium.com/pytorch), [YouTube](https://www.youtube.com/pytorch), and [LinkedIn](https://www.linkedin.com/company/pytorch). - -Cheers! - -Team PyTorch diff --git a/_posts/2022-7-12-a-better-transformer-for-fast-transformer-encoder-inference.md b/_posts/2022-7-12-a-better-transformer-for-fast-transformer-encoder-inference.md deleted file mode 100644 index bbc23f9b0737..000000000000 --- a/_posts/2022-7-12-a-better-transformer-for-fast-transformer-encoder-inference.md +++ /dev/null @@ -1,74 +0,0 @@ ---- -layout: blog_detail -title: "A BetterTransformer for Fast Transformer Inference" -author: Michael Gschwind, Eric Han, Scott Wolchok, Rui Zhu, Christian Puhrsch -featured-img: "/assets/images/2022-7-12-a-better-transformer-for-fast-transformer-encoder-inference-3.png" ---- - -**tl;dr** Transformers achieve state-of-the-art performance for NLP, and are becoming popular for a myriad of other tasks. 
They are computationally expensive, which has been a blocker to their widespread adoption in production. Launching with PyTorch 1.12, BetterTransformer implements a backwards-compatible fast path of `torch.nn.TransformerEncoder` for Transformer Encoder Inference and does not require model authors to modify their models. BetterTransformer improvements can exceed 2x in speedup and throughput for many common execution scenarios. To use BetterTransformer, [install](https://pytorch.org/get-started/locally/) PyTorch 1.12 and start using high-quality, high-performance Transformer models with the PyTorch API today.

Diagram of the Transformer Encoder Architecture (from "Attention Is All You Need"). During Inference, the entire module will execute as a single PyTorch-native function.
        - -In this blog post, we share the following topics — Performance Improvements, Backwards compatibility, and Taking advantage of the FastPath. Learn more about these topics below. - -## Performance Improvements - -BetterTransformer launches with accelerated native implementations of MultiHeadAttention and TransformerEncoderLayer for CPUs and GPUs. These fast paths are integrated in the standard PyTorch Transformer APIs, and will accelerate [TransformerEncoder](https://pytorch.org/docs/stable/generated/torch.nn.TransformerEncoder.html), [TransformerEncoderLayer](https://pytorch.org/docs/stable/generated/torch.nn.TransformerEncoderLayer.html) and [MultiHeadAttention](https://pytorch.org/docs/stable/generated/torch.nn.MultiheadAttention.html) nn.modules. These new modules implement two types of optimizations: (1) fused kernels combine multiple individual operators normally used to implement Transformers to provide a more efficient implementation, and (2) take advantage of sparsity in the inputs to avoid performing unnecessary operations on padding tokens. Padding tokens frequently account for a large fraction of input batches in many Transformer models used for Natural Language Processing. - -## Backwards compatibility - -Advantageously, **no model changes are necessary to benefit from the performance boost offered by BetterTransformer.** To benefit from fast path execution, inputs and operating conditions must satisfy some access conditions (see below). While the internal implementation of Transformer APIs has changed, PyTorch 1.12 maintains strict compatibility with Transformer modules shipped in previous versions, enabling PyTorch users to use models created and trained with previous PyTorch releases while benefiting from BetterTransformer improvements. - -In addition to enabling the PyTorch nn.Modules, BetterTransformer provides improvements for PyTorch libraries. Performance benefits will become available through two different enablement paths: - -1. Transparent acceleration: Current users of PyTorch nn.Modules such as [MultiHeadAttention](https://pytorch.org/docs/stable/generated/torch.nn.MultiheadAttention.html) as well as higher-level Transformer components will benefit from the improved performance of the new nn.Modules automatically. An example of this is the [visual transformer (ViT)](https://arxiv.org/abs/2010.11929) implementation used in the torchvision library ([code link](https://github.com/pytorch/vision/blob/87cde716b7f108f3db7b86047596ebfad1b88380/torchvision/models/vision_transformer.py#L103)). - -2. Torchtext library acceleration: As part of this project, we have optimized Torchtext to build on the PyTorch core API to benefit from BetterTransformer enhancements while maintaining strict and transparent compatibility with previous library versions and models trained with previous Torchtext versions. Using PyTorch Transformers in Torchtext also ensures that Torchtext will benefit from expected future enhancements to the PyTorch Transformer implementation. - -## Taking advantage of the Fastpath - -BetterTransformer is a fastpath for the PyTorch Transformer API. The fastpath is a native, specialized implementation of key Transformer functions for CPU and GPU that applies to common Transformer use cases. - -To take advantage of input sparsity (i.e. 
padding) in accelerating your model (see Figure 2), set the keyword argument `enable_nested_tensor=True` when instantiating a [TransformerEncoder](https://pytorch.org/docs/stable/generated/torch.nn.TransformerEncoder.html) and pass in the `src_key_padding_mask` argument (which denotes padding tokens) during inference. This requires the padding mask to be contiguous, which is the typical case. - -Currently, the BetterTransformer speedup only applies to transformer encoder models used in inference. To benefit from fastpath execution, models must be composed of any of the following components: [TransformerEncoder](https://pytorch.org/docs/stable/generated/torch.nn.TransformerEncoder.html), [TransformerEncoderLayer](https://pytorch.org/docs/stable/generated/torch.nn.TransformerEncoderLayer.html) or [MultiheadAttention](https://pytorch.org/docs/stable/generated/torch.nn.MultiheadAttention.html) (MHA). Fastpath execution is also subject to some criteria. Most importantly, the model must be executed in inference mode and operate on input tensors that do not collect gradient tape information (e.g., running with torch.no_grad). The full list of conditions can be found at these links for [nn.MultiHeadAttention](https://github.com/pytorch/pytorch/blob/29189d2ba8e583b2355cd0e9517a1ee742ba12cf/torch/nn/modules/activation.py#L1060) and [nn.TransformerEncoder](https://github.com/pytorch/pytorch/blob/29189d2ba8e583b2355cd0e9517a1ee742ba12cf/torch/nn/modules/transformer.py#L206), respectively. If the criteria are not met, control flows to the legacy PyTorch 1.11 Transformer implementation which has the same API, but lacks the fastpath performance boost. - -Other transformer models (such as decoder models) which use the PyTorch MultiheadAttention module will benefit from the BetterTransformer fastpath. Planned future work is to expand the end-to-end BetterTransformer fastpath to models based on [TransformerDecoder](https://pytorch.org/docs/stable/generated/torch.nn.TransformerDecoder.html) to support popular seq2seq and decoder-only (e.g., [OPT](https://ai.facebook.com/blog/democratizing-access-to-large-scale-language-models-with-opt-175b/)) model architectures, and to training. - -## Speedups - -The following graphs show the performance achieved for the [BERT](https://arxiv.org/abs/1810.04805)-base model with small and large-scale inputs: - -

Figure 1: PyTorch 1.12 Improvements with BetterTransformer fastpath execution

Figure 2: PyTorch 1.12 Improvements with BetterTransformer fastpath execution with sparsity optimization enabled by enable_nested_tensor=True
        - - -BetterTransformer includes two types of optimization: (1) fused kernels implementing multiple operations more efficiently in a single kernel, and (2) exploiting sparsity by avoiding unnecessary processing on padding tokens. Enhanced performance for small input sizes benefits primarily from the fused kernel implementations, and shows a constant performance improvement regardless of padding amount. While large inputs still benefit from fused kernels, the computation heavy processing limits the benefits that may be obtained by the fused kernels as baseline performance is already closer to the theoretical peak. However, as we increase the amount of padding, performance increases dramatically as increasingly large amounts of computation can be avoided by exploiting the sparsity introduced by padding in NLP workloads. - -## Future Work - -As part of our ongoing work on PyTorch BetterTransformer, we are working on extending BetterTransformer improvements to Transformer Decoders. We aim to expand beyond inference to training as well. - -We are partnering to enable BetterTransformer on additional libraries such as FairSeq, MetaSeq, and HuggingFace to benefit all Transformer-based PyTorch models. We’ll provide future updates on the progress of BetterTransformer accelerations for the larger PyTorch ecosystem as part of this blog series. - -Acknowledgements: The authors would like to thank Lin Qiao, Ajit Mathews, Andrew Tulloch, Dmytro Dzhulgakov, Natalia Gimelshein, Emad El-Haraty, Mark Saroufim, Adnan Aziz, Geeta Chauhan, and Hamid Shojanazeri for their support, contributions and many helpful suggestions throughout the course of this project, and in the preparation of this blog. \ No newline at end of file diff --git a/_posts/2022-7-15-PathAI-Uses-PyTorch-to-Improve-Patient-Outcomes-with-AI-powered-Pathology.md b/_posts/2022-7-15-PathAI-Uses-PyTorch-to-Improve-Patient-Outcomes-with-AI-powered-Pathology.md deleted file mode 100644 index 14f5e733a9b0..000000000000 --- a/_posts/2022-7-15-PathAI-Uses-PyTorch-to-Improve-Patient-Outcomes-with-AI-powered-Pathology.md +++ /dev/null @@ -1,119 +0,0 @@ ---- -layout: blog_detail -title: "Case Study: PathAI Uses PyTorch to Improve Patient Outcomes with AI-powered Pathology" -author: Logan Kilpatrick - Sr. Technology Advocate, Harshith Padigela - ML Engineer, Syed Ashar Javed - ML Technical Lead, Robert Egger - Biomedical Data Scientist -featured-img: "/assets/images/2022-7-15-PathAI-Uses-PyTorch-to-Improve-Patient-Outcomes-with-AI-powered-Pathology-1.png" ---- - -​[​PathAI](https://pathai.com) is the leading provider of AI-powered technology tools and services for pathology (the study of disease). Our platform was built to enable substantial improvements to the accuracy of diagnosis and the measurement of therapeutic efficacy for complex diseases, leveraging modern approaches in machine learning like image segmentation, graph neural networks, and multiple instance learning. - -

        - -Traditional manual pathology is prone to [subjectivity and observer variability](https://www.journal-of-hepatology.eu/article/S0168-8278(20)30399-8/fulltext) that can negatively affect diagnoses and drug development trials. Before we dive into how we use PyTorch to improve our diagnosis workflow, let us first lay out the traditional analog Pathology workflow without machine learning. - -## How Traditional Biopharma Works - -There are many avenues that biopharma companies take to discover novel therapeutics or diagnostics. One of those avenues relies heavily on the analysis of pathology slides to answer a variety of questions: how does a particular cellular communication pathway work? Can a specific disease state be linked to the presence or lack of a particular protein? Why did a particular drug in a clinical trial work for some patients but not others? Might there be an association between patient outcomes and a novel biomarker? - -To help answer these questions, biopharma companies rely on expert pathologists to analyze slides and help evaluate the questions they might have.  - -As you might imagine, it takes an expert board certified pathologist to make accurate interpretations and diagnosis. In [one study](https://www.bmj.com/content/357/bmj.j2813.full), a single biopsy result was given to 36 different pathologists and the outcome was 18 different diagnoses varying in severity from no treatment to aggressive treatment necessary. Pathologists also often solicit feedback from colleagues in difficult edge cases. Given the complexity of the problem, even with expert training and collaboration, pathologists can still have a hard time making a correct diagnosis. This potential variance can be the difference between a drug being approved and it failing the clinical trial. - -## How PathAI utilizes machine learning to power drug development - -PathAI develops machine learning models which provide insights for drug development R&D, for powering clinical trials, and for making diagnoses. To this end, PathAI leverages PyTorch for slide level inference using a variety of methods including graph neural networks (GNN) as well as multiple instance learning. In this context, “slides” refers to full size scanned images of glass slides, which are pieces of glass with a thin slice of tissue between them, stained to show various cell formations. PyTorch enables our teams using these different methodologies to share a common framework which is robust enough to work in all the conditions we need. PyTorch’s high level, imperative, and pythonic syntax allows us to prototype models quickly and then take those models to scale once we have the results we want.  - -## Multi-instance learning on gigabyte images - -One of the uniquely challenging aspects of applying ML to pathology is the immense size of the images. These digital slides can often be 100,000 x 100,000 pixels or more in resolution and gigabytes in size. Loading the full image in GPU memory and applying traditional computer vision algorithms on them is an almost impossible task. It also takes both a considerable amount of time and resources to have a full slide image (100k x 100k) annotated, especially when annotators need to be domain experts (board-certified pathologists). We often build models to predict image-level labels, like the presence of cancer, on a patient slide which covers a few thousand pixels in the whole image. 
The cancerous area is sometimes a tiny fraction of the entire slide, which makes the ML problem similar to finding a needle in a haystack. On the other hand, some problems like the prediction of certain histological biomarkers require an aggregation of information from the whole slide which is again hard due to the size of the images. All these factors add significant algorithmic, computational, and logistical complexity when applying ML techniques to pathology problems. - -Breaking down the image into smaller patches, learning patch representations, and then pooling those representations to predict an image-level label is one way to solve this problem as is depicted in the image below. One popular method for doing this is called [Multiple Instance Learning (MIL)](https://paperswithcode.com/task/multiple-instance-learning). Each patch is considered an ‘instance’ and a set of patches forms a ‘bag’. The individual patch representations are pooled together to predict a final bag-level label. Algorithmically, the individual patch instances in the bag do not require labels and hence allow us to learn bag-level labels in a weakly-supervised way. They also use permutation invariant pooling functions which make the prediction independent of the order of patches and allows for an efficient aggregation of information. Typically, attention based pooling functions are used which not only allow for efficient aggregation but also provide attention values for each patch in the bag. These values indicate the importance of the corresponding patch in the prediction and can be visualized to better understand the model predictions. This element of interpretability can be very important to drive adoption of these models in the real world and we use variations like [Additive MIL models](https://arxiv.org/pdf/2206.01794.pdf) to enable such spatial explainability. Computationally, MIL models circumvent the problem of applying neural networks to large image sizes since patch representations are obtained independently of the size of the image. - -

        - -At PathAI, we use custom MIL models based on deep nets to predict image-level labels. The overview of this process is as follows: - -1. Select patches from a slide using different sampling approaches. -2. Construct a bag of patches based on random sampling or heuristic rules. -3. Generate patch representations for each instance based on pre-trained models or large-scale representation learning models. -4. Apply permutation invariant pooling functions to get the final slide-level score. - -Now that we have walked through some of the high-level details around MIL in PyTorch, let’s look at some code to see how simple it is to go from ideation to code in production with PyTorch. We begin by defining a sampler, transformations, and our MIL dataset: - -```python -# Create a bag sampler which randomly samples patches from a slide -bag_sampler = RandomBagSampler(bag_size=12) - -# Setup the transformations -crop_transform = FlipRotateCenterCrop(use_flips=True) - -# Create the dataset which loads patches for each bag -train_dataset = MILDataset( - bag_sampler=bag_sampler, - samples_loader=sample_loader, - transform=crop_transform, -) -``` - -After we have defined our sampler and dataset, we need to define the model we will actually train with said dataset. PyTorch’s familiar model definition syntax makes this easy to do while also allowing us to create bespoke models at the same time. - -```python -classifier = DefaultPooledClassifier(hidden_dims=[256, 256], input_dims=1024, output_dims=1) - -pooling = DefaultAttentionModule( - input_dims=1024, - hidden_dims=[256, 256], - output_activation=StableSoftmax() -) - -# Define the model which is a composition of the featurizer, pooling module and a classifier -model = DefaultMILGraph(featurizer=ShuffleNetV2(), classifier=classifier, pooling = pooling) -``` - -Since these models are trained end-to-end, they offer a powerful way to go directly from a gigapixel whole slide image to a single label. Due to their wide applicability to different biological problems, two aspects of their implementation and deployment are important: - -1. Configurable control over each part of the pipeline including the data loaders, the modular parts of the model, and their interaction with each other. -2. Ability to rapidly iterate through the ideate-implement-experiment-productionize loop. - -PyTorch has various advantages when it comes to MIL modeling. It offers an intuitive way to create dynamic computational graphs with flexible control flow which is great for rapid research experimentation. The map-style datasets, configurable sampler and batch-samplers allow us to customize how we construct bags of patches, enabling faster experimentation. Since MIL models are IO heavy, data parallelism and pythonic data loaders make the task very efficient and user friendly. Lastly, the object-oriented nature of PyTorch enables building of reusable modules which aid in the rapid experimentation, maintainable implementation and ease of building compositional components of the pipeline. - -## Exploring spatial tissue organization with GNNs in PyTorch - -

        - -In both healthy and diseased tissue, the spatial arrangement and structure of cells can oftentimes be as important as the cells themselves. For example, when assessing lung cancers, pathologists try to look at the overall grouping and structure of tumor cells (do they form solid sheets? Or do they occur in smaller, localized clusters?) to determine if the cancer belongs to specific subtypes which can have vastly [different prognosis](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3369269/). Such spatial relationships between cells and other tissue structures can be modeled using graphs to capture tissue topology and cellular composition at the same time. [Graph Neural Networks](https://openaccess.thecvf.com/content_CVPRW_2020/papers/w16/Lu_Capturing_Cellular_Topology_in_Multi-Gigapixel_Pathology_Images_CVPRW_2020_paper.pdf) (GNNs) allow learning spatial patterns within these graphs that relate to other clinical variables, for example overexpression of genes in certain cancers. - -In late 2020, when PathAI started using GNNs on tissue samples, PyTorch had the best and most mature support for GNN functionality via the [PyG package](https://pytorch-geometric.readthedocs.io/en/latest/). This made PyTorch the natural choice for our team given that GNN models were something that we knew would be an important ML concept we wanted to explore.  - -One of the main value-adds of GNN’s in the context of tissue samples is that the graph itself can uncover spatial relationships that would otherwise be very difficult to find by visual inspection alone. In our recent [AACR publication](https://aacrjournals.org/cancerres/article/82/12_Supplement/1922/701539), we showed that by using GNNs, we can better understand the way the presence of immune cell aggregates (specifically tertiary lymphoid structures, or TLS) in the tumor microenvironment can influence patient prognosis. In this case, the GNN approach was used to predict expression of genes associated with the presence of TLS, and identify histological features beyond the TLS region itself that are relevant to TLS. Such insights into gene expression are difficult to identify from tissue sample images when unassisted by ML models.  - -One of the most promising GNN variations we have had success with is [self attention graph pooling](https://arxiv.org/pdf/1904.08082.pdf). Let’s take a look at how we define our Self Attention Graph Pooling (SAGPool) model using PyTorch and PyG: - -```python -class SAGPool(torch.nn.Module): - def __init__(self, ...): - super().__init__() - self.conv1 = GraphConv(in_features, hidden_features, aggr='mean') - self.convs = torch.nn.ModuleList() - self.pools = torch.nn.ModuleList() - self.convs.extend([GraphConv(hidden_features, hidden_features, aggr='mean') for i in range(num_layers - 1)]) - self.pools.extend([SAGPooling(hidden_features, ratio, GNN=GraphConv, min_score=min_score) for i in range((num_layers) // 2)]) - self.jump = JumpingKnowledge(mode='cat') - self.lin1 = Linear(num_layers * hidden_features, hidden_features) - self.lin2 = Linear(hidden_features, out_features) - self.out_activation = out_activation - self.dropout = dropout -``` - -In the above code, we begin by defining a single convolutional graph layer and then add two [module list layers](https://pytorch.org/docs/stable/generated/torch.nn.ModuleList.html) which allow us to pass in a variable number of layers. 
We then take our [empty module list and append](https://pytorch.org/docs/stable/generated/torch.nn.ModuleList.html?highlight=extend#torch.nn.ModuleList.extend) a variable number of `GraphConv` layers followed by a variable number of `SAGPooling` layers. We finish up our `SAGPool` definition by adding a [JumpingKnowledge Layer](https://pytorch-geometric.readthedocs.io/en/latest/modules/nn.html#torch_geometric.nn.models.JumpingKnowledge), two linear layers, our activation function, and our dropout value. PyTorch’s intuitive syntax allows us to abstract away the complexity of working with state of the art methods like SAG Poolings while also maintaining the common approach to model development we are familiar with. - -Models like our SAG Pool one described above are just one example of how GNNs with PyTorch are allowing us to explore new and novel ideas. We also recently explored [multimodal CNN - GNN hybrid models](https://openaccess.thecvf.com/content/CVPR2022W/CVMI/papers/Dwivedi_Multi_Stain_Graph_Fusion_for_Multimodal_Integration_in_Pathology_CVPRW_2022_paper.pdf) which ended up being 20% more accurate than traditional Pathologist consensus scores. These innovations and interplay between traditional CNNs and GNNs are again enabled by the short research to production model development loop. - -## Improving Patient Outcomes -In order to achieve our mission of improving patient outcomes with AI-powered pathology, PathAI needs to rely on an ML development framework that (1) facilitates quick iteration and easy extension (i.e. Model configuration as code) during initial phases of development and exploration (2) scales model training and inference to massive images (3) easily and robustly serves models for production uses of our products (in clinical trials and beyond). As we’ve demonstrated, PyTorch offers us all of these capabilities and more. We are incredibly excited about the future of PyTorch and cannot wait to see what other impactful challenges we can solve using the framework. \ No newline at end of file diff --git a/_posts/2022-7-19-what-every-user-should-know-about-mixed-precision-training-in-pytorch.md b/_posts/2022-7-19-what-every-user-should-know-about-mixed-precision-training-in-pytorch.md deleted file mode 100644 index b6decedc5e31..000000000000 --- a/_posts/2022-7-19-what-every-user-should-know-about-mixed-precision-training-in-pytorch.md +++ /dev/null @@ -1,117 +0,0 @@ ---- -layout: blog_detail -title: "What Every User Should Know About Mixed Precision Training in PyTorch" -author: Syed Ahmed, Christian Sarofeen, Mike Ruberry, Eddie Yan, Natalia Gimelshein, Michael Carilli, Szymon Migacz, Piotr Bialecki, Paulius Micikevicius, Dusan Stosic, Dong Yang, and Naoya Maruyama -featured-img: '' ---- - -Efficient training of modern neural networks often relies on using lower precision data types. Peak float16 matrix multiplication and convolution performance is 16x faster than peak float32 performance on A100 GPUs. And since the float16 and bfloat16 data types are only half the size of float32 they can double the performance of bandwidth-bound kernels and reduce the memory required to train a network, allowing for larger models, larger batches, or larger inputs. Using a module like [torch.amp](https://pytorch.org/docs/master/amp.html) (short for “Automated Mixed Precision”) makes it easy to get the speed and memory usage benefits of lower precision data types while preserving convergence behavior. 
- -Going faster and using less memory is always advantageous – deep learning practitioners can test more model architectures and hyperparameters, and larger, more powerful models can be trained. Training very large models like those described in [Narayanan et al.](https://arxiv.org/pdf/2104.04473.pdf) and [Brown et al.](https://arxiv.org/pdf/2005.14165.pdf) (which take thousands of GPUs months to train even with expert handwritten optimizations) is infeasible without using mixed precision. - -We’ve talked about mixed precision techniques before ([here](https://pytorch.org/blog/accelerating-training-on-nvidia-gpus-with-pytorch-automatic-mixed-precision/), [here](https://docs.nvidia.com/deeplearning/performance/mixed-precision-training/index.html), and [here](https://developer.nvidia.com/automatic-mixed-precision)), and this blog post is a summary of those techniques and an introduction if you’re new to mixed precision. - -## Mixed Precision Training in Practice - -Mixed precision training techniques – the use of the lower precision float16 or bfloat16 data types alongside the float32 data type – are broadly applicable and effective. See Figure 1 for a sampling of models successfully trained with mixed precision, and Figures 2 and 3 for example speedups using torch.amp. - -

Figure 1: Sampling of DL Workloads Successfully Trained with float16 (Source).

Figure 2: Performance of mixed precision training using torch.amp on NVIDIA 8xV100 vs. float32 training on 8xV100 GPU. Bars represent the speedup factor of torch.amp over float32. (Higher is better.) (Source).

Figure 3: Performance of mixed precision training using torch.amp on NVIDIA 8xA100 vs. 8xV100 GPU. Bars represent the speedup factor of A100 over V100. (Higher is better.) (Source).

        - -See the [NVIDIA Deep Learning Examples repository](https://github.com/NVIDIA/DeepLearningExamples) for more sample mixed precision workloads. - -Similar performance charts can be seen in [3D medical image analysis](https://nvlabs.github.io/eccv2020-mixed-precision-tutorial/files/dong_yang-mixed-precision-training-for-3d-medical-image-analysis.pdf), [gaze estimation](https://nvlabs.github.io/eccv2020-mixed-precision-tutorial/files/shalini_de_mello-mixed-precision-training-for-faze.pdf), [video synthesis](https://nvlabs.github.io/eccv2020-mixed-precision-tutorial/files/tingchun_wang-mixed-precision-vid2vid.pdf), [conditional GANs](https://nvlabs.github.io/eccv2020-mixed-precision-tutorial/files/mingyu_liu-amp-imaginaire.pdf), and [convolutional LSTMs](https://nvlabs.github.io/eccv2020-mixed-precision-tutorial/files/wonmin_byeon-mixed-precision-training-for-convolutional-tensor-train-lstm.pdf). [Huang et al](https://pytorch.org/blog/accelerating-training-on-nvidia-gpus-with-pytorch-automatic-mixed-precision/). showed that mixed precision training is 1.5x to 5.5x faster over float32 on V100 GPUs, and an additional 1.3x to 2.5x faster on A100 GPUs on a variety of networks. On very large networks the need for mixed precision is even more evident. [Narayanan et al](https://arxiv.org/pdf/2104.04473.pdf). reports that it would take 34 days to train GPT-3 175B on 1024 A100 GPUs (with a batch size of 1536), but it’s estimated it would take over a year using float32! - -## Getting Started With Mixed Precision Using torch.amp - -torch.amp, introduced in PyTorch 1.6, makes it easy to leverage mixed precision training using the float16 or bfloat16 dtypes. See this [blog post](https://pytorch.org/blog/accelerating-training-on-nvidia-gpus-with-pytorch-automatic-mixed-precision/), [tutorial](https://pytorch.org/tutorials/recipes/recipes/amp_recipe.html), and [documentation](https://pytorch.org/docs/master/amp.html) for more details. Figure 4 shows an example of applying AMP with grad scaling to a network. - -```console -import torch -# Creates once at the beginning of training -scaler = torch.cuda.amp.GradScaler() - -for data, label in data_iter: - optimizer.zero_grad() - # Casts operations to mixed precision - with torch.amp.autocast(device_type=“cuda”, dtype=torch.float16): - loss = model(data) - - # Scales the loss, and calls backward() - # to create scaled gradients - scaler.scale(loss).backward() - - # Unscales gradients and calls - # or skips optimizer.step() - scaler.step(optimizer) - - # Updates the scale for next iteration - scaler.update() -``` - -

Figure 4: AMP recipe
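Figure 4 shows the float16 recipe with gradient scaling. For bfloat16 a gradient scaler is typically unnecessary, since bfloat16 has the same dynamic range as float32. The following is a minimal sketch of that variant, assuming the same `model`, `optimizer`, and `data_iter` as in Figure 4:

```python
import torch

for data, label in data_iter:
    optimizer.zero_grad()
    # Cast operations to bfloat16 inside the autocast region. No GradScaler is
    # used here because bfloat16 keeps float32's dynamic range, so gradients
    # rarely underflow (still verify convergence for your own network).
    with torch.amp.autocast(device_type="cuda", dtype=torch.bfloat16):
        loss = model(data)
    loss.backward()
    optimizer.step()
```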

        - -### Picking The Right Approach - -Out-of-the-box mixed precision training with either float16 or bfloat16 is effective at speeding up the convergence of many deep learning models, but some models may require more careful numerical accuracy management. Here are some options: - -- Full float32 precision. Floating point tensors and modules are created in float32 precision by default in PyTorch, but this is a historic artifact not representative of training most modern deep learning networks. It’s rare that networks need this much numerical accuracy. -- Enabling TensorFloat32 (TF32) mode. On Ampere and later CUDA devices matrix multiplications and convolutions can use the TensorFloat32 (TF32) mode for faster but slightly less accurate computations. See the [Accelerating AI Training with NVIDIA TF32 Tensor Cores](https://developer.nvidia.com/blog/accelerating-ai-training-with-tf32-tensor-cores/) blog post for more details. By default PyTorch enables TF32 mode for convolutions but not matrix multiplications, and unless a network requires full float32 precision we recommend enabling this setting for matrix multiplications, too (see the documentation [here](https://pytorch.org/docs/master/generated/torch.set_float32_matmul_precision.html?highlight=precision#torch.set_float32_matmul_precision) for how to do so). It can significantly speed up computations with typically negligible loss of numerical accuracy. -- Using torch.amp with bfloat16 or float16. Both these low precision floating point data types are usually comparably fast, but some networks may only converge with one vs the other. If a network requires more precision it may need to use float16, and if a network requires more dynamic range it may need to use bfloat16, whose dynamic range is equal to that of float32. If overflows are observed, for example, then we suggest trying bfloat16. - -There are even more advanced options than those presented here, like using torch.amp’s autocasting for only parts of a model, or managing mixed precision directly. These topics are largely beyond the scope of this blog post, but see the “Best Practices” section below. - -### Best Practices - -We strongly recommend using mixed precision with torch.amp or the TF32 mode (on Ampere and later CUDA devices) whenever possible when training a network. If one of those approaches doesn’t work, however, we recommend the following: - -- High Performance Computing (HPC) applications, regression tasks, and generative networks may simply require full float32 IEEE precision to converge as expected. -- Try selectively applying torch.amp. In particular we recommend first disabling it on regions performing operations from the torch.linalg module or when doing pre- or post-processing. These operations are often especially sensitive. Note that TF32 mode is a global switch and can’t be used selectively on regions of a network. Enable TF32 first to check if a network’s operators are sensitive to the mode, otherwise disable it. -- If you encounter type mismatches while using torch.amp we don’t suggest inserting manual casts to start. This error is indicative of something being off with the network, and it’s usually worth investigating first. -- Figure out by experimentation if your network is sensitive to range and/or precision of a format. 
For example [fine-tuning bfloat16-pretrained models in float16](https://github.com/huggingface/transformers/pull/10956) can easily run into range issues in float16 because of the potentially large range from training in bfloat16, so users should stick with bfloat16 fine-tuning if the model was trained in bfloat16. -- The performance gain of mixed precision training can depend on multiple factors (e.g. compute-bound vs memory-bound problems) and users should use the [tuning guide](https://pytorch.org/tutorials/recipes/recipes/tuning_guide.html) to remove other bottlenecks in their training scripts. Although having similar theoretical performance benefits, BF16 and FP16 can have different speeds in practice. It’s recommended to try the mentioned formats and use the one with best speed while maintaining the desired numeric behavior. - -For more details, refer to the [AMP Tutorial](https://pytorch.org/tutorials/recipes/recipes/amp_recipe.html), [Training Neural Networks with Tensor Cores](https://nvlabs.github.io/eccv2020-mixed-precision-tutorial/.), and see the post “[More In-Depth Details of Floating Point Precision](https://dev-discuss.pytorch.org/t/more-in-depth-details-of-floating-point-precision/654)" on PyTorch Dev Discussion. - -## Conclusion - -Mixed precision training is an essential tool for training deep learning models on modern hardware, and it will become even more important in the future as the performance gap between lower precision operations and float32 continues to grow on newer hardware, as reflected in Figure 5. - -

Figure 5: Relative peak throughput of float16 (FP16) vs. float32 matrix multiplications on Volta and Ampere GPUs. On Ampere, relative peak throughput for the TensorFloat32 (TF32) mode and bfloat16 matrix multiplications are shown, too. The relative peak throughput of low precision data types like float16 and bfloat16 vs. float32 matrix multiplications is expected to grow as new hardware is released.

        - -PyTorch’s torch.amp module makes it easy to get started with mixed precision, and we highly recommend using it to train faster and reduce memory usage. torch.amp supports both float16 and bfloat16 mixed precision. - -There are still some networks that are tricky to train with mixed precision, and for these networks we recommend trying TF32 accelerated matrix multiplications on Ampere and later CUDA hardware. Networks are rarely so precision sensitive that they require full float32 precision for every operation. - -If you have questions or suggestions for torch.amp or mixed precision support in PyTorch then let us know by posting to the [mixed precision category on the PyTorch Forums](https://discuss.pytorch.org/c/mixed-precision/27) or [filing an issue on the PyTorch GitHub page](https://github.com/pytorch/pytorch/issues/new/choose). diff --git a/_posts/2022-7-22-introducing-the-playtorch-app.md b/_posts/2022-7-22-introducing-the-playtorch-app.md deleted file mode 100644 index 6533d8837c96..000000000000 --- a/_posts/2022-7-22-introducing-the-playtorch-app.md +++ /dev/null @@ -1,96 +0,0 @@ ---- -layout: blog_detail -title: "Introducing the PlayTorch app: Rapidly Create Mobile AI Experiences" -author: PlayTorch Team -featured-img: "" ---- - -

        - -In December, we announced PyTorch Live, a toolkit for building AI-powered mobile prototypes in minutes. The initial release included a command-line interface to set up a development environment and an SDK for building AI-powered experiences in React Native. Today, we're excited to share that PyTorch Live will now be known as PlayTorch. This new release provides an improved and simplified developer experience. PlayTorch development is independent from the PyTorch project and the PlayTorch code repository is moving into the Meta Research GitHub organization. - -## A New Workflow: The PlayTorch App - -The PlayTorch team is excited to announce that we have partnered with [Expo](https://expo.dev) to change the way AI powered mobile experiences are built. Our new release simplifies the process of building mobile AI experiences by eliminating the need for a complicated development environment. You will now be able to build cross platform AI powered prototypes from the very browser you are using to read this blog. - -In order to make this happen, we are releasing the [PlayTorch app](https://playtorch.dev/) which is able to run AI-powered experiences built in the [Expo Snack](https://snack.expo.dev/@playtorch/playtorch-starter?supportedPlatforms=my-device) web based code editor. - -

        - -The PlayTorch app can be downloaded from the Apple App Store and Google Play Store. With the app installed, you can head over to [playtorch.dev/snack](https://playtorch.dev/snack) and write the code for your AI-powered PlayTorch Snack. When you want to try what you’ve built, you can use the PlayTorch app’s QR code scanner to scan the QR code on the Snack page and load the code to your device. - -NOTE: PlayTorch Snacks will not work in the Expo Go app. - -## More to Explore in the PlayTorch App - -### AI Demos - -The PlayTorch app comes with several examples of how you can build AI powered experiences with a variety of different machine learning models from object detection to natural language processing. See what can be built with the PlayTorch SDK and be inspired to make something of your own as you play with the examples. - -

        - -### Sharing Your Creations - -Any PlayTorch Snack that you run in the PlayTorch app can be shared with others in an instant. When they open the link on their device, the PlayTorch app will instantly load what you’ve built from the cloud so they can experience it first hand. - -

        - -When you have something you want to share, let us know on [Discord](https://discord.gg/sQkXTqEt33) or [Twitter](https://twitter.com/PlayTorch) or embed the PlayTorch Snack on your own webpage. - -## SDK Overhaul - -We learned a lot from the community after our initial launch in December and have been hard at work over the past several months to make the PlayTorch SDK (formerly known as PyTorch Live) simple, performant, and robust. In our initial version, the SDK relied on config files to define how a model ingested and output data. - -Today, we are happy to announce the next version of our SDK can handle data processing in JavaScript for your prototypes with the new PlayTorch API that leverages the JavaScript Interface (JSI) to directly call C++ code. Not only have we completely redone the way you can interact with models, but we have also greatly expanded the variety of supported model architectures. - -## A New Data Processing API for Prototyping - -With this JSI API, we now allow users direct access to tensors (data format for machine learning). Instead of only having access to predefined transformations, you can now manipulate tensors however you would like for your prototypes. - -

        - -No more switching back and forth between code and config. You will now be able to write everything in JavaScript and have access to all of the type annotations and autocomplete features available to you in those languages. - -Check out our [tutorials](https://playtorch.dev/tutorials) to see the new Data Processing API in action, take a deeper dive in the [API docs](https://playtorch.dev/docs/api/core/), or inspect the code yourself on [GitHub](https://github.com/facebookresearch/playtorch). - -### Expanded Use Cases - -With the new version of the SDK, we have added support for several cutting edge models. - -

        - -Image-to-image transformations are now supported thanks to our robust JSI API, so you can see what your world would look like if it were an anime. - -

        - -Translate French to English with an AI powered translator using the Seq2Seq model. - -

        - -Use DeepLab V3 to segment images! - -## Start Playing - -If you want to start creating AI experiences yourself, head over to [playtorch.dev](https://playtorch.dev) and try out our [tutorials](https://playtorch.dev/tutorials/). Each tutorial will guide you through building a simple AI powered experience that you can instantly run on your phone and share with others. - -## How to Get Support - -Join us on [Discord](https://discord.gg/sQkXTqEt33), collaborate with us on [GitHub](https://github.com/facebookresearch/playtorch), or follow us on [Twitter](https://twitter.com/playtorch). Got questions or feedback? We’d love to hear from you! diff --git a/_posts/2022-8-16-empowering-pytorch-on-intel-xeon-scalable-processors-with-bfloat16.md b/_posts/2022-8-16-empowering-pytorch-on-intel-xeon-scalable-processors-with-bfloat16.md deleted file mode 100644 index 117f586a7d68..000000000000 --- a/_posts/2022-8-16-empowering-pytorch-on-intel-xeon-scalable-processors-with-bfloat16.md +++ /dev/null @@ -1,76 +0,0 @@ ---- -layout: blog_detail -title: "Empowering PyTorch on Intel® Xeon® Scalable processors with Bfloat16" -author: Mingfei Ma (Intel), Vitaly Fedyunin (Meta), Wei Wei (Meta) -featured-img: '\assets\images\empowering-pytorch-on-intel-xeon-scalable-processors-with-bfloat16.png' ---- - -## Overview - -Recent years, the growing complexity of AI models have been posing requirements on hardware for more and more compute capability. Reduced precision numeric format has been proposed to address this problem. Bfloat16 is a custom 16-bit floating point format for AI which consists of one sign bit, eight exponent bits, and seven mantissa bits. With the same dynamic range as float32, bfloat16 doesn’t require a special handling such as loss scaling. Therefore, bfloat16 is a drop-in replacement for float32 when running deep neural networks for both inference and training. - -The 3rd Gen Intel® Xeon® Scalable processor (codenamed Cooper Lake), is the first general purpose x86 CPU with native bfloat16 support. Three new bfloat16 instructions were introduced in Intel® Advanced Vector Extensions-512 (Intel® AVX-512): VCVTNE2PS2BF16, VCVTNEPS2BF16, and VDPBF16PS. The first two instructions perform conversion from float32 to bfloat16, and the last one performs a dot product of bfloat16 pairs. Bfloat16 theoretical compute throughput is doubled over float32 on Cooper Lake. On the next generation of Intel® Xeon® Scalable Processors, bfloat16 compute throughput will be further enhanced through Advanced Matrix Extensions (Intel® AMX) instruction set extension. - -Intel and Meta previously collaborated to enable bfloat16 on PyTorch, and the related work was published in an earlier [blog](https://community.intel.com/t5/Blogs/Tech-Innovation/Artificial-Intelligence-AI/Intel-and-Facebook-Accelerate-PyTorch-Performance-with-3rd-Gen/post/1335659) during launch of Cooper Lake. In that blog, we introduced the hardware advancement for native bfloat16 support and showcased a performance boost of 1.4x to 1.6x of bfloat16 over float32 from DLRM, ResNet-50 and ResNext-101-32x4d. - -In this blog, we will introduce the latest software enhancement on bfloat16 in PyTorch 1.12, which would apply to much broader scope of user scenarios and showcase even higher performance boost. 
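As a quick way to see the format properties described in the overview (16 bits, float32-like dynamic range, fewer mantissa bits), torch.finfo reports them directly; a minimal sketch:

```python
import torch

# Compare the numeric properties of float32, bfloat16 and float16.
for dtype in (torch.float32, torch.bfloat16, torch.float16):
    info = torch.finfo(dtype)
    print(f"{str(dtype):>15}  bits={info.bits:<3} max={info.max:<12.4g} eps={info.eps:.4g}")

# bfloat16 keeps roughly float32's range (max ~3.4e38) with a coarser eps,
# while float16 trades range (max = 65504) for extra mantissa precision.
```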
- -## Native Level Optimization on Bfloat16 - -On PyTorch CPU bfloat16 path, the compute intensive operators, e.g., convolution, linear and bmm, use oneDNN (oneAPI Deep Neural Network Library) to achieve optimal performance on Intel CPUs with AVX512_BF16 or AMX support. The other operators, such as tensor operators and neural network operators, are optimized at PyTorch native level. We have enlarged bfloat16 kernel level optimizations to majority of operators on dense tensors, both inference and training applicable (sparse tensor bfloat16 support will be covered in future work), specifically: - -- **Bfloat16 vectorization**: Bfloat16 is stored as unsigned 16-bit integer, which requires it to be casted to float32 for arithmetic operations such as add, mul, etc. Specifically, each bfloat16 vector will be converted to two float32 vectors, processed accordingly and then converted back. While for non-arithmetic operations such as cat, copy, etc., it is a straight memory copy and no data type conversion will be involved. -- **Bfloat16 reduction**: Reduction on bfloat16 data uses float32 as accumulation type to guarantee numerical stability, e.g., sum, BatchNorm2d, MaxPool2d, etc. -- **Channels Last optimization**: For vision models, Channels Last is the preferable memory format over Channels First from performance perspective. We have implemented fully optimized CPU kernels for all the commonly used CV modules on channels last memory format, taking care of both float32 and bfloat16. - -## Run Bfloat16 with Auto Mixed Precision - -To run model on bfloat16, typically user can either explicitly convert the data and model to bfloat16, for example: - -```console -# with explicit conversion -input = input.to(dtype=torch.bfloat16) -model = model.to(dtype=torch.bfloat16) -``` - -or utilize torch.amp (Automatic Mixed Precision) package. The autocast instance serves as context managers or decorators that allow regions of your script to run in mixed precision, for example: - -```console -# with AMP -with torch.autocast(device_type="cpu", dtype=torch.bfloat16): - output = model(input) -``` - -Generally, the explicit conversion approach and AMP approach have similar performance. Even though, we recommend run bfloat16 models with AMP, because: - -- **Better user experience with automatic fallback**: If your script includes operators that don’t have bfloat16 support, autocast will implicitly convert them back to float32 while the explicit converted model will give a runtime error. - -- **Mixed data type for activation and parameters**: Unlike the explicit conversion which converts all the model parameters to bfloat16, AMP mode will run in mixed data type. To be specific, input/output will be kept in bfloat16 while parameters, e.g., weight/bias, will be kept in float32. The mixed data type of activation and parameters will help improve performance while maintaining the accuracy. - -## Performance Gains - -We benchmarked inference performance of TorchVision models on Intel® Xeon® Platinum 8380H CPU @ 2.90GHz (codenamed Cooper Lake), single instance per socket (batch size = 2 x number of physical cores). Results show that bfloat16 has 1.4x to 2.2x performance gain over float32. - -
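The numbers above come from Intel's benchmark setup. As a rough illustration of how such a comparison can be run (not the reported configuration; the observed gain depends heavily on whether the CPU has AVX512_BF16 or AMX support), here is a minimal sketch assuming torchvision is installed:

```python
import time
import torch
from torchvision.models import resnet50

model = resnet50().eval()
x = torch.randn(32, 3, 224, 224)

def bench(run, warmup=3, iters=10):
    # simple wall-clock average after a few warm-up passes
    for _ in range(warmup):
        run()
    start = time.perf_counter()
    for _ in range(iters):
        run()
    return (time.perf_counter() - start) / iters

with torch.inference_mode():
    fp32_time = bench(lambda: model(x))
    with torch.autocast(device_type="cpu", dtype=torch.bfloat16):
        bf16_time = bench(lambda: model(x))

print(f"float32: {fp32_time:.3f} s/iter, bfloat16 AMP: {bf16_time:.3f} s/iter")
```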

        - -## The performance boost of bfloat16 over float32 primarily comes from 3 aspects: - -- The compute intensive operators take advantage of the new bfloat16 native instruction VDPBF16PS which doubles the hardware compute throughput. -- Bfloat16 have only half the memory footprint of float32, so theoretically the memory bandwidth intensive operators will be twice faster. -- On Channels Last, we intentionally keep the same parallelization scheme for all the memory format aware operators (can’t do this on Channels First though), which increases the data locality when passing each layer’s output to the next. Basically, it keeps the data closer to CPU cores while data would reside in cache anyway. And bfloat16 will have a higher cache hit rate compared with float32 in such scenarios due to smaller memory footprint. - -## Conclusion & Future Work - -In this blog, we introduced recent software optimizations on bfloat16 introduced in PyTorch 1.12. Results on the 3rd Gen Intel® Xeon® Scalable processor show that bfloat16 has 1.4x to 2.2x performance gain over float32 on the TorchVision models. Further improvement is expected on the next generation of Intel® Xeon® Scalable Processors with AMX instruction support. Though the performance number for this blog is collected with TorchVision models, the benefit is broad across all topologies. And we will continue to extend the bfloat16 optimization effort to a broader scope in the future! - -## Acknowledgement - -The results presented in this blog is a joint effort of Meta and Intel PyTorch team. Special thanks to Vitaly Fedyunin and Wei Wei from Meta who spent precious time and gave substantial assistance! Together we made one more step on the path of improving the PyTorch CPU eco system. - -## Reference - -- [The bfloat16 numerical format](https://cloud.google.com/tpu/docs/bfloat16?hl=en) -- [https://pytorch.org/docs/master/amp.html#torch.autocast](https://pytorch.org/docs/master/amp.html#torch.autocast) -- [Intel and Facebook Accelerate PyTorch Performance with 3rd Gen Intel® Xeon® Processors and Intel® Deep Learning Boost’s new BFloat16 capability](https://community.intel.com/t5/Blogs/Tech-Innovation/Artificial-Intelligence-AI/Intel-and-Facebook-Accelerate-PyTorch-Performance-with-3rd-Gen/post/1335659) \ No newline at end of file diff --git a/_posts/2022-8-18-easily-list-and-initialize-models-with-new-apis-in-torchvision.md b/_posts/2022-8-18-easily-list-and-initialize-models-with-new-apis-in-torchvision.md deleted file mode 100644 index c7a32d61ccbe..000000000000 --- a/_posts/2022-8-18-easily-list-and-initialize-models-with-new-apis-in-torchvision.md +++ /dev/null @@ -1,135 +0,0 @@ ---- -layout: blog_detail -title: "Easily list and initialize models with new APIs in TorchVision" -author: Vasilis Vryniotis and Laurence Rouesnel -featured-img: "/assets/images/easily-list-and-initialize-models-with-new-apis-in-torchvision-1.png" ---- - -TorchVision now supports listing and initializing all available built-in models and weights by name. This new API builds upon the recently introduced [Multi-weight support API](https://pytorch.org/blog/introducing-torchvision-new-multi-weight-support-api/), is currently in Beta, and it addresses a long-standing [request](https://github.com/pytorch/vision/issues/1143) from the community. - -

        - -You can try out the new API in the [latest nightly](https://pytorch.org/get-started/locally/) release of TorchVision. We’re looking to collect feedback ahead of finalizing the feature in TorchVision v0.14. We have created a dedicated [Github Issue](https://github.com/pytorch/vision/issues/6365) where you can post your comments, questions and suggestions! - -## Querying and initializing available models - -Before the new model registration API, developers had to query the ``__dict__`` attribute of the modules in order to list all available models or to fetch a specific model builder method by its name: - -```python -# Initialize a model by its name: -model = torchvision.models.__dict__[model_name]() - -# List available models: -available_models = [ - k for k, v in torchvision.models.__dict__.items() - if callable(v) and k[0].islower() and k[0] != "_" -] -``` - -The above approach does not always produce the expected results and is hard to discover. For example, since the [``get_weight()``](https://pytorch.org/vision/main/models.html#using-models-from-hub) method is exposed publicly under the same module, it will be included in the list despite not being a model. In general, reducing the verbosity (less imports, shorter names etc) and being able to initialize models and weights directly from their names (better support of configs, TorchHub etc) was [feedback](https://github.com/pytorch/vision/issues/5088) provided previously by the community. To solve this problem, we have developed a model registration API. - -## A new approach - -We’ve added 4 new methods under the torchvision.models module: - -```python -from torchvision.models import get_model, get_model_weights, get_weight, list_models -``` - -The styles and naming conventions align closely with a prototype mechanism proposed by Philip Meier for the [Datasets V2](https://github.com/pytorch/vision/blob/main/torchvision/prototype/datasets/_api.py) API, aiming to offer a similar user experience. The model registration methods are kept private on purpose as we currently focus only on supporting the built-in models of TorchVision. - -### List models - -Listing all available models in TorchVision can be done with a single function call: - -```python ->>> list_models() -['alexnet', 'mobilenet_v3_large', 'mobilenet_v3_small', 'quantized_mobilenet_v3_large', ...] -``` - -To list the available models of specific submodules: - -```python ->>> list_models(module=torchvision.models) -['alexnet', 'mobilenet_v3_large', 'mobilenet_v3_small', ...] ->>> list_models(module=torchvision.models.quantization) -['quantized_mobilenet_v3_large', ...] -``` - -### Initialize models - -Now that you know which models are available, you can easily initialize a model with pre-trained weights: - -```python ->>> get_model("quantized_mobilenet_v3_large", weights="DEFAULT") -QuantizableMobileNetV3( - (features): Sequential( - .... - ) -) -``` - -### Get weights -Sometimes, while working with config files or using TorchHub, you might have the name of a specific weight entry and wish to get its instance. 
This can be easily done with the following method: - -```python ->>> get_weight("ResNet50_Weights.IMAGENET1K_V2") -ResNet50_Weights.IMAGENET1K_V2 -``` - -To get the enum class with all available weights of a specific model you can use either its name: - -```python ->>> get_model_weights("quantized_mobilenet_v3_large") - -``` - -Or its model builder method: - -```python ->>> get_model_weights(torchvision.models.quantization.mobilenet_v3_large) - -``` - -### TorchHub support -The new methods are also available via TorchHub: - -```python -import torch - -# Fetching a specific weight entry by its name: -weights = torch.hub.load("pytorch/vision", "get_weight", weights="ResNet50_Weights.IMAGENET1K_V2") - -# Fetching the weights enum class to list all available entries: -weight_enum = torch.hub.load("pytorch/vision", "get_model_weights", name="resnet50") -print([weight for weight in weight_enum]) -``` - -## Putting it all together - -For example, if you wanted to retrieve all the small-sized models with pre-trained weights and initialize one of them, it’s a matter of using the above APIs: - -```python -import torchvision -from torchvision.models import get_model, get_model_weights, list_models - - -max_params = 5000000 - -tiny_models = [] -for model_name in list_models(module=torchvision.models): - weights_enum = get_model_weights(model_name) - if len([w for w in weights_enum if w.meta["num_params"] <= max_params]) > 0: - tiny_models.append(model_name) - -print(tiny_models) -# ['mnasnet0_5', 'mnasnet0_75', 'mnasnet1_0', 'mobilenet_v2', ...] - -model = get_model(tiny_models[0], weights="DEFAULT") -print(sum(x.numel() for x in model.state_dict().values())) -# 2239188 -``` - -For more technical details please see the original [RFC](https://github.com/pytorch/vision/pull/6330). Please spare a few minutes to provide your feedback on the new API, as this is crucial for graduating it from beta and including it in the next release. You can do this on the dedicated [Github Issue](https://github.com/pytorch/vision/issues/6365). We are looking forward to reading your comments! \ No newline at end of file diff --git a/_posts/2022-8-24-accelerating-pytorch-vision-models-with-channels-last-on-cpu.md b/_posts/2022-8-24-accelerating-pytorch-vision-models-with-channels-last-on-cpu.md deleted file mode 100644 index 4137d8f5c33d..000000000000 --- a/_posts/2022-8-24-accelerating-pytorch-vision-models-with-channels-last-on-cpu.md +++ /dev/null @@ -1,125 +0,0 @@ ---- -layout: blog_detail -title: "Accelerating PyTorch Vision Models with Channels Last on CPU" -author: Mingfei Ma (Intel), Vitaly Fedyunin (Meta), Wei Wei (Meta) -featured-img: '/assets/images/accelerating-pytorch-vision-models-with-channels-last-on-cpu-2.png' ---- - -## Overview - -Memory formats has significant impact on performance when running vision models, generally Channels Last is a more favorable from performance perspective due to better data locality. - -This blog will introduce fundamental concepts of memory formats and demonstrate performance benefits using Channels Last on popular PyTorch vision models on Intel® Xeon® Scalable processors. - -## Memory Formats Introduction - -Memory format refers to data representation that describes how a multidimensional (nD) array is stored in linear (1D) memory address space. The concept of memory format has two aspects: - -- **Physical Order** is the layout of data storage in physical memory. For vision models, usually we talk about NCHW, NHWC. 
These are the descriptions of physical memory layout, also referred to as Channels First and Channels Last respectively.
- **Logical Order** is a convention on how to describe tensor shape and stride. In PyTorch, this convention is NCHW. No matter what the physical order is, tensor shape and stride will always be depicted in the order of NCHW.

Fig-1 is the physical memory layout of a tensor with shape [1, 3, 4, 4] in both the Channels First and Channels Last memory formats (channels denoted as R, G, B respectively):

Fig-1 Physical memory layout of Channels First and Channels Last
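The Physical vs. Logical Order distinction shows up directly in tensor strides; a minimal sketch for the [1, 3, 4, 4] tensor above:

```python
import torch

x = torch.rand(1, 3, 4, 4)                   # logical order is always N, C, H, W
print(x.stride())                            # (48, 16, 4, 1)  -> NCHW physical layout

y = x.to(memory_format=torch.channels_last)  # same shape and values, NHWC physical layout
print(y.shape)                               # torch.Size([1, 3, 4, 4]) - logical order unchanged
print(y.stride())                            # (48, 1, 12, 3)  -> channels vary fastest in memory
print(torch.equal(x, y))                     # True
```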

        - -## Memory Formats Propagation - -The general rule for PyTorch memory format propagation is to preserve the input tensor’s memory format. Which means a Channels First input will generate a Channels First output and a Channels Last input will generate a Channels Last output. - -For Convolution layers, PyTorch uses oneDNN (oneAPI Deep Neural Network Library) by default to achieve optimal performance on Intel CPUs. Since it is physically impossible to achieve highly optimized performance directly with Channels Frist memory format, input and weight are firstly converted to blocked format and then computed. oneDNN may choose different blocked formats according to input shapes, data type and hardware architecture, for vectorization and cache reuse purposes. The blocked format is opaque to PyTorch, so the output needs to be converted back to Channels First. Though blocked format would bring about optimal computing performance, the format conversions may add overhead and therefore offset the performance gain. - -On the other hand, oneDNN is optimized for Channels Last memory format to use it for optimal performance directly and PyTorch will simply pass a memory view to oneDNN. Which means the conversion of input and output tensor is saved. Fig-2 indicates memory format propagation behavior of convolution on PyTorch CPU (the solid arrow indicates a memory format conversion, and the dashed arrow indicates a memory view): - -

Fig-2 CPU Conv memory format propagation
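The propagation behavior in Fig-2 can be checked directly: a convolution fed a Channels Last input produces a Channels Last output, while a Channels First input stays Channels First. A minimal sketch:

```python
import torch

conv = torch.nn.Conv2d(3, 8, kernel_size=3, padding=1)
x = torch.rand(1, 3, 32, 32)

out = conv(x)                                   # Channels First in -> Channels First out
print(out.is_contiguous())                      # True

x_cl = x.to(memory_format=torch.channels_last)  # Channels Last in -> Channels Last out
out_cl = conv(x_cl)
print(out_cl.is_contiguous(memory_format=torch.channels_last))  # True
```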

        - -On PyTorch, the default memory format is Channels First. In case a particular operator doesn't have support on Channels Last, the NHWC input would be treated as a non-contiguous NCHW and therefore fallback to Channels First, which will consume the previous memory bandwidth on CPU and result in suboptimal performance. - -Therefore, it is very important to extend the scope of Channels Last support for optimal performance. And we have implemented Channels Last kernels for the commonly use operators in CV domain, applicable for both inference and training, such as: - -- Activations (e.g., ReLU, PReLU, etc.) -- Convolution (e.g., Conv2d) -- Normalization (e.g., BatchNorm2d, GroupNorm, etc.) -- Pooling (e.g., AdaptiveAvgPool2d, MaxPool2d, etc.) -- Shuffle (e.g., ChannelShuffle, PixelShuffle) - -Refer to [Operators-with-Channels-Last-support](https://github.com/pytorch/pytorch/wiki/Operators-with-Channels-Last-support) for details. - -## Native Level Optimization on Channels Last - -As mentioned above, PyTorch uses oneDNN to achieve optimal performance on Intel CPUs for convolutions. The rest of memory format aware operators are optimized at PyTorch native level, which doesn’t require any third-party library support. - -- **Cache friendly parallelization scheme:** keep the same parallelization scheme for all the memory format aware operators, this will help increase data locality when passing each layer’s output to the next. -- **Vectorization on multiple archs:** generally, we can vectorize on the most inner dimension on Channels Last memory format. And each of the vectorized CPU kernels will be generated for both AVX2 and AVX512. - -While contributing to Channels Last kernels, we tried our best to optimize Channels First counterparts as well. The fact is some operators are physically impossible to achieve optimal performance on Channels First, such as Convolution, Pooling, etc. - -## Run Vision Models on Channels Last - -The Channels Last related APIs are documented at [PyTorch memory format tutorial](https://pytorch.org/tutorials/intermediate/memory_format_tutorial.html). Typically, we can convert a 4D tensor from Channels First to Channels Last by: - -```python -# convert x to channels last -# suppose x’s shape is (N, C, H, W) -# then x’s stride will be (HWC, 1, WC, C) -x = x.to(memory_format=torch.channels_last) -``` - -To run models on Channels Last memory format, simply need to convert input and model to Channels Last and then you are ready to go. The following is a minimal example showing how to run ResNet50 with TorchVision on Channels Last memory format: - -```python -import torch -from torchvision.models import resnet50 - -N, C, H, W = 1, 3, 224, 224 -x = torch.rand(N, C, H, W) -model = resnet50() -model.eval() - -# convert input and model to channels last -x = x.to(memory_format=torch.channels_last) -model = model.to(memory_format=torch.channels_last) -model(x) -``` - -The Channels Last optimization is implemented at native kernel level, which means you may apply other functionalities such as torch.fx and torch script together with Channels Last as well. - -## Performance Gains - -We benchmarked inference performance of TorchVision models on Intel® Xeon® Platinum 8380 CPU @ 2.3 GHz, single instance per socket (batch size = 2 x number of physical cores). Results show that Channels Last has 1.3x to 1.8x performance gain over Channels First. - -
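The reported numbers come from Intel's multi-core benchmark setup. As a rough way to compare the two memory formats on your own machine (assuming torchvision is installed; results depend heavily on the CPU), here is a minimal sketch using torch.utils.benchmark:

```python
import torch
from torch.utils import benchmark
from torchvision.models import resnet50

model = resnet50().eval()
x = torch.rand(1, 3, 224, 224)

def run(m, inp):
    with torch.inference_mode():
        m(inp)

t_cf = benchmark.Timer("run(model, x)", globals={"run": run, "model": model, "x": x}).blocked_autorange()

# convert both model and input to Channels Last, as shown above
model = model.to(memory_format=torch.channels_last)
x = x.to(memory_format=torch.channels_last)
t_cl = benchmark.Timer("run(model, x)", globals={"run": run, "model": model, "x": x}).blocked_autorange()

print(t_cf)  # Channels First timing
print(t_cl)  # Channels Last timing
```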

        - -The performance gain primarily comes from two aspects: - -- For Convolution layers, Channels Last saved the memory format conversion to blocked format for activations, which improves the overall computation efficiency. -- For Pooling and Upsampling layers, Channels Last can use vectorized logic along the most inner dimension, e.g., “C”, while Channels First can’t. - -For memory format non aware layers, Channels Last and Channels First has the same performance. - -## Conclusion & Future Work - -In this blog we introduced fundamental concepts of Channels Last and demonstrated the performance benefits of CPU using Channels Last on vision models. The current work is limited to 2D models at the current stage, and we will extend the optimization effort to 3D models in near future! - -## Acknowledgement - -The results presented in this blog is a joint effort of Meta and Intel PyTorch team. Special thanks to Vitaly Fedyunin and Wei Wei from Meta who spent precious time and gave substantial assistance! Together we made one more step on the path of improving the PyTorch CPU eco system. - -## References - -- [PyTorch memory format tutorial](https://pytorch.org/tutorials/intermediate/memory_format_tutorial.html) -- [oneDNN guide on memory formats](https://oneapi-src.github.io/oneDNN/dev_guide_understanding_memory_formats.html) -- [PyTorch operators with Channels Last support](https://github.com/pytorch/pytorch/wiki/Operators-with-Channels-Last-support) \ No newline at end of file diff --git a/_posts/2022-8-26-introducing-nvfuser-a-deep-learning-compiler-for-pytorch.md b/_posts/2022-8-26-introducing-nvfuser-a-deep-learning-compiler-for-pytorch.md deleted file mode 100644 index 4407962f430a..000000000000 --- a/_posts/2022-8-26-introducing-nvfuser-a-deep-learning-compiler-for-pytorch.md +++ /dev/null @@ -1,91 +0,0 @@ ---- -layout: blog_detail -title: 'Introducing nvFuser, a deep learning compiler for PyTorch' -author: Christian Sarofeen, Piotr Bialecki, Jie Jiang, Kevin Stephano, Masaki Kozuki, Neal Vaidya, Stas Bekman -featured-img: "/assets/images/introducing-nvfuser-a-deep-learning-compiler-for-pytorch-1.png" ---- - -nvFuser is a Deep Learning Compiler for NVIDIA GPUs that automatically just-in-time compiles fast and flexible kernels to reliably accelerate users' networks. It provides significant speedups for deep learning networks running on Volta and later CUDA accelerators by generating fast custom “fusion” kernels at runtime. nvFuser is specifically designed to meet the unique requirements of the PyTorch community, and it supports diverse network architectures and programs with dynamic inputs of varying shapes and strides. -In this blog post we’ll describe nvFuser and how it’s used today, show the significant performance improvements it can obtain on models from HuggingFace and TIMM, and look ahead to nvFuser in PyTorch 1.13 and beyond. If you would like to know more about how and why fusion improves the speed of training for Deep Learning networks, please see our previous talks on nvFuser from [GTC 2022](https://www.nvidia.com/en-us/on-demand/session/gtcspring22-s41958/) and [GTC 2021](https://www.nvidia.com/en-us/on-demand/session/gtcspring21-s31952/). -nvFuser relies on a graph representation of PyTorch operations to optimize and accelerate. Since PyTorch has an eager execution model, the PyTorch operations users are running are not directly accessible as a whole program that can be optimized by a system like nvFuser. 
Therefore users must utilize systems built on top of nvFuser which are capable of capturing users programs and translating them into a form that is optimizable by nvFuser. These higher level systems then pass these captured operations to nvFuser, so that nvFuser can optimize the execution of the user’s script for NVIDIA GPUs. There are three systems that capture, translate, and pass user programs to nvFuser for optimization: - -- [TorchScript jit.script](https://pytorch.org/docs/stable/generated/torch.jit.script.html#torch.jit.script) - - This system directly parses sections of an annotated python script to translate into its own representation what the user is doing. This system then applies its own version of auto differentiation to the graph, and passes sections of the subsequent forward and backwards graphs to nvFuser for optimization. -- [FuncTorch](https://pytorch.org/functorch/stable/generated/functorch.compile.memory_efficient_fusion.html#functorch.compile.memory_efficient_fusion) - - This system doesn’t directly look at the user python script, instead inserting a mechanism that captures PyTorch operations as they’re being run. We refer to this type of capture system as “trace program acquisition”, since we’re tracing what has been performed. FuncTorch doesn’t perform its own auto differentiation – it simply traces PyTorch’s autograd directly to get backward graphs. -- [TorchDynamo](https://github.com/pytorch/torchdynamo) - - TorchDynamo is another program acquisition mechanism built on top of FuncTorch. TorchDynamo parses the Python bytecode produced from the user script in order to select portions to trace with FuncTorch. The benefit of TorchDynamo is that it’s able to apply decorators to a user’s script, effectively isolating what should be sent to FuncTorch, making it easier for FuncTorch to successfully trace complex Python scripts. - -These systems are available for users to interact with directly while nvFuser automatically and seamlessly optimizes performance critical regions of the user’s code. These systems automatically send parsed user programs to nvFuser so nvFuser can: - -1. Analyze the operations being run on GPUs -2. Plan parallelization and optimization strategies for those operations -3. Apply those strategies in generated GPU code -4. Runtime-compile the generated optimized GPU functions -5. Execute those CUDA kernels on subsequent iterations - -It is important to note nvFuser does not yet support all PyTorch operations, and there are still some scenarios that are actively being improved in nvFuser that are discussed herein. However, nvFuser does support many DL performance critical operations today, and the number of supported operations will grow in subsequent PyTorch releases. nvFuser is capable of generating highly specialized and optimized GPU functions for the operations it does have support for. This means nvFuser is able to power new PyTorch systems like TorchDynamo and FuncTorch to combine the flexibility PyTorch is known for with unbeatable performance. - -## nvFuser Performance - -Before getting into how to use nvFuser, in this section we’ll show the improvements in training speed nvFuser provides for a variety of models from the [HuggingFace Transformers](https://github.com/huggingface/transformers) and [PyTorch Image Models (TIMM)](https://github.com/rwightman/pytorch-image-models) repositories and we will discuss current gaps in nvFuser performance that are under development today. 
All performance numbers in this section were taken using an NVIDIA A100 40GB GPU, and used either FuncTorch alone or FuncTorch with TorchDynamo.

## HuggingFace Transformer Benchmarks

nvFuser can dramatically accelerate training of HuggingFace Transformers when combined with another important optimization (more on that in a moment). As shown in Figure 1, performance improvements range between 1.12x and 1.50x across a subset of popular HuggingFace Transformer networks.

Figure 1: Performance gains of 8 training scenarios from HuggingFace’s Transformer repository. First performance boost in the dark green is due to replacing the optimizer with an NVIDIA Apex fused AdamW optimizer. The light green is due to adding nvFuser. Models were run with batch size and sequence lengths of [64, 128], [8, 512], [2, 1024], [64, 128], [8, 512], [8, src_seql=512, tgt_seql=128], [8, src_seql=1024, tgt_seql=128], and [8, 512] respectively. All networks were run with Automatic Mixed Precision (AMP) enabled with dtype=float16.

        - -While these speedups are significant, it’s important to understand that nvFuser doesn’t (yet) automate everything about running networks quickly. For HuggingFace Transformers, for example, it was important to use the AdamW fused optimizer from [NVIDIA’s Apex repository](https://github.com/NVIDIA/apex) as the optimizer otherwise consumed a large portion of runtime. Using the fused AdamW optimizer to make the network faster exposes the next major performance bottleneck — memory bound operations. These operations are optimized by nvFuser, providing another large performance boost. With the fused optimizer and nvFuser enabled, the training speed of these networks improved between 1.12x to 1.5x. -HuggingFace Transformer models were run with [the torch.amp module](https://pytorch.org/docs/stable/amp.html). (“amp” stands for Automated Mixed Precision, see the [“What Every User Should Know about Mixed Precision in PyTorch”](https://pytorch.org/blog/what-every-user-should-know-about-mixed-precision-training-in-pytorch/) blog post for details.) An option to use nvFuser was added to HuggingFace’sTrainer. If you have [TorchDynamo installed](https://github.com/pytorch/torchdynamo#requirements-and-setup) you can activate it to enable nvFuser in HuggingFace by passing *torchdynamo = ‘nvfuser’* to the Trainer class. -nvFuser has great support for normalization kernels and related fusions frequently found in Natural Language Processing (NLP) models, and it is recommended users try nvFuser in their NLP workloads. - -## PyTorch Image Models (TIMM) Benchmarks -nvFuser, can also significantly reduce the training time of TIMM networks, up to over 1.3x vs. eager PyTorch, and up to 1.44x vs. eager PyTorch when combined with the torch.amp module. Figure 1 shows nvFuser’s speedup without torch.amp, and when torch.amp is used with the NHWC (“channels last”) and NCHW (“channels first”) formats. nvFuser is integrated in TIMM through FuncTorch tracing directly (without TorchDynamo) and can be used by adding the [--aot-autograd command line argument](https://github.com/rwightman/pytorch-image-models/commit/ca991c1fa57373286b9876aa63370fd19f5d6032) when running the TIMM benchmark or training script. - -

Figure 1: The Y-axis is the performance gain nvFuser provides over not using nvFuser. A value of 1.0 means no change in perf, 2.0 would mean nvFuser is twice as fast, 0.5 would mean nvFuser takes twice the time to run. Square markers are with float16 Automatic Mixed Precision (AMP) and channels first contiguous inputs, circle markers are float32 inputs, and triangles are with float16 AMP and channels last contiguous inputs. Missing data points are due to an error being encountered when tracing.

        - -When running with float32 precision nvFuser provides a 1.12x geometric mean (“geomean”) speedup on TIMM networks, and when running with torch.amp and “channels first” it provides a 1.14x geomean speedup. However, nvFuser currently doesn’t speedup torch.amp and “channels last” training (a .9x geomean regression), so we recommend not using it in those cases. We are actively working on improving “channels last” performance now, and soon we will have two additional optimization strategies (grid persistent optimizations for channels-last normalizations and fast transposes) which we expect will provide speedups comparable to “channels first” in PyTorch version 1.13 and later. Many of nvFuser’s optimizations can also help in inference cases. However, in PyTorch when running inference on small batch sizes, the performance is typically limited by CPU overhead, which nvFuser can’t completely remove or fix. Therefore, typically the most important optimization for inference is to enable [CUDA Graphs](https://pytorch.org/blog/accelerating-pytorch-with-cuda-graphs/) when possible. Once CUDA Graphs is enabled, then it can also be beneficial to also enable fusion through nvFuser. Performance of inference is shown in Figure 2 and Figure 3. Inference is only run with float16 AMP as it is uncommon to run inference workloads in full float32 precision. - -

Figure 2: Performance gains of enabling CUDA Graphs, and CUDA Graphs with nvFuser compared to the performance of native PyTorch without CUDA Graphs and nvFuser across TIMM models with float16 AMP, channels first inputs, and a batch size of 1 and 8 respectively. There is a geomean speedup of 2.74x with CUDA Graphs and 2.71x with CUDA Graphs + nvFuser respectively. nvFuser provides a maximum regression of 0.68x and a maximum performance gain of 2.74x (relative to CUDA Graphs without nvFuser). Performance gain is measured relative to the average time per iteration PyTorch takes without CUDA Graphs and without nvFuser. Models are sorted by how much additional performance nvFuser is providing.

Figure 3: Performance gains of enabling CUDA Graphs, and CUDA Graphs with nvFuser compared to the performance of native PyTorch without CUDA Graphs and nvFuser across TIMM models with AMP, channels last inputs, and a batch size of 1 and 8 respectively. There is a geomean speedup of 2.29x with CUDA Graphs and 2.95x with CUDA Graphs + nvFuser respectively. nvFuser provides a maximum regression of 0.86x and a maximum performance gain of 3.82x (relative to CUDA Graphs without nvFuser). Performance gain is measured relative to the average time per iteration PyTorch takes without CUDA Graphs and without nvFuser. Models are sorted by how much additional performance nvFuser is providing.

        - -So far nvFuser performance has not been tuned for inference workloads so its performance benefit is not consistent across all cases. However, there are still many models that benefit significantly from nvFuser during inference and we encourage users to try nvFuser in inference workloads to see if you would benefit today. Performance of nvFuser in inference workloads will improve in the future and if you’re interested in nvFuser in inference workloads please reach out to us on the PyTorch forums. - -## Getting Started - Accelerate Your Scripts with nvFuser - -We’ve created [a tutorial](https://pytorch.org/tutorials/intermediate/nvfuser_intro_tutorial.html) demonstrating how to take advantage of nvFuser to accelerate part of a standard transformer block, and how nvFuser can be used to define fast and novel operations. There are still some rough edges in nvFuser that we’re working hard on improving as we’ve outlined in this blog post. However we’ve also demonstrated some great improvements for training speed on multiple networks in HuggingFace and TIMM and we expect there are opportunities in your networks where nvFuser can help today, and many more opportunities it will help in the future. -If you would like to learn more about nvFuser we recommend watching our presentations from NVIDIA’s GTC conference [GTC 2022](https://www.nvidia.com/en-us/on-demand/session/gtcspring22-s41958/) and [GTC 2021](https://www.nvidia.com/en-us/on-demand/session/gtcspring21-s31952/). diff --git a/_posts/2022-8-29-fast-beam-search-decoding-in-pytorch-with-torchaudio-and-flashlight-text.md b/_posts/2022-8-29-fast-beam-search-decoding-in-pytorch-with-torchaudio-and-flashlight-text.md deleted file mode 100644 index 25c47c41d2bb..000000000000 --- a/_posts/2022-8-29-fast-beam-search-decoding-in-pytorch-with-torchaudio-and-flashlight-text.md +++ /dev/null @@ -1,142 +0,0 @@ ---- -layout: blog_detail -title: "Fast Beam Search Decoding in PyTorch with TorchAudio and Flashlight Text" -author: Caroline Chen, Jacob Kahn (@jacob_d_kahn) -featured-img: "/assets/images/fast-beam-search-decoding-in-pytorch-with-torchaudio-and-flashlight-text-6.png" ---- - -Beam search decoding with industry-leading speed from [Flashlight Text](https://github.com/flashlight/text) (part of the [Flashlight](https://arxiv.org/abs/2201.12465) ML framework) is now available with official support in [TorchAudio](https://pytorch.org/audio/0.12.0/models.decoder.html#ctcdecoder), bringing high-performance beam search and text utilities for speech and text applications built on top of PyTorch. The current integration supports CTC-style decoding, but it can be used for *any modeling setting that outputs token-level probability distributions over time steps*. - -## A brief beam search refresher - -In speech and language settings, *beam search* is an efficient, greedy algorithm that can convert sequences of *continuous values* (i.e. probabilities or scores) into *graphs* or *sequences* (i.e. tokens, word-pieces, words) using *optional constraints* on valid sequences (i.e. a lexicon), *optional external scoring* (i.e. an LM which scores valid sequences), and other *score adjustments* for particular sequences. - -In the example that follows, we'll consider — a token set of {ϵ, a, b}, where ϵ is a special token that we can imagine denotes a space between words or a pause in speech. Graphics here and below are taken from Awni Hannun's excellent [distill.pub writeup](https://distill.pub/2017/ctc/) on CTC and beam search. - -

        - -

- -With a greedy-like approach, beam search considers the next viable token given an existing sequence of tokens — in the example above, a, b, b is a valid sequence, but a, b, a is not. We *rank* each possible next token at each step of the beam search according to a scoring function. A scoring function (s) typically looks something like: -
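The scoring-function equation in the original post is an image that is not reproduced here. As a sketch, one common form for lexicon-based CTC beam search, consistent with the description that follows (the exact formula in the original figure may differ), is:

```latex
s(\hat{y}, x) = \log P(\hat{y} \mid x) + \alpha \, \log P_{\mathrm{LM}}(\hat{y}) + \beta \, |\hat{y}|
```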

        - -

- -Where **ŷ** is a potential path/sequence of tokens, **x** is the input (*P(ŷ|x)* represents the model’s predictions over time), and 𝛼 is a weight on the language model probability (*P(y)*, the probability of the sequence under the language model). Some scoring functions add *𝜷*, which adjusts the score based on the length of the predicted sequence **|ŷ|**. This particular scoring function is used in [FAIR's prior work](https://arxiv.org/pdf/1911.08460.pdf) on end-to-end ASR, and there are many variations on scoring functions across application areas. - -Given a particular sequence, to assess the next viable token in that sequence (perhaps constrained by a set of allowed words or sequences, such as a lexicon of words), the beam search algorithm scores the sequence with each candidate token added, and sorts token candidates based on those scores. For efficiency, and since the number of paths is exponential in the token set size, only the *top-k* highest-scoring candidates are kept — *k* represents the *beam size*. -

        - -

        - -

        There are many other nuances with how beam search can progress: similar hypothesis sequences can be “merged”, for instance. -
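To make the mechanics above concrete, here is a deliberately minimal sketch of the core loop over the {ϵ, a, b} token set (the per-step probabilities and the beam size of 2 are made up for illustration; it omits CTC collapsing, lexicon constraints, LM fusion, and the hypothesis merging just mentioned, all of which the real decoder handles):

```python
import math

# Toy per-step log-probabilities over the token set {ϵ, a, b};
# the numbers are invented purely for illustration.
tokens = ["ϵ", "a", "b"]
step_log_probs = [
    [math.log(0.6), math.log(0.3), math.log(0.1)],  # t = 0
    [math.log(0.2), math.log(0.5), math.log(0.3)],  # t = 1
    [math.log(0.1), math.log(0.2), math.log(0.7)],  # t = 2
]
beam_size = 2  # k: number of hypotheses kept after each step

# Each hypothesis is a (token sequence, accumulated log-score) pair.
beams = [((), 0.0)]
for log_probs in step_log_probs:
    candidates = [
        (seq + (tok,), score + lp)
        for seq, score in beams
        for tok, lp in zip(tokens, log_probs)
    ]
    # Rank all extensions and keep only the top-k highest-scoring ones.
    beams = sorted(candidates, key=lambda c: c[1], reverse=True)[:beam_size]

for seq, score in beams:
    print("".join(seq), round(score, 3))
```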

        - -The scoring function can be further augmented to up/down-weight token insertion or long or short words. Scoring with *stronger external language* models, while incurring computational cost, can also significantly improve performance; this is frequently referred to as *LM fusion*. There are many other knobs to tune for decoding — these are documented in [TorchAudio’s documentation](https://pytorch.org/audio/0.12.0/models.decoder.html#ctcdecoder) and explored further in [TorchAudio’s ASR Inference tutorial](https://pytorch.org/audio/0.12.0/tutorials/asr_inference_with_ctc_decoder_tutorial.html#beam-search-decoder-parameters). Since decoding is quite efficient, parameters can be easily swept and tuned. - -Beam search has been used in ASR extensively over the years in far too many works to cite, and in strong, recent results and systems including [wav2vec 2.0](https://proceedings.neurips.cc/paper/2020/file/92d1e1eb1cd6f9fba3227870bb6d7f07-Paper.pdf) and [NVIDIA's NeMo](https://developer.nvidia.com/nvidia-nemo). - -## Why beam search? - -Beam search remains a fast competitor to heavier-weight decoding approaches such as [RNN-Transducer](https://arxiv.org/pdf/1211.3711.pdf) that Google has invested in putting [on-device](https://ai.googleblog.com/2019/03/an-all-neural-on-device-speech.html) and has shown strong results with on [common benchmarks](https://arxiv.org/pdf/2010.10504.pdf). Autoregressive text models at scale can benefit from beam search as well. Among other things, beam search gives: - -- A flexible performance/latency tradeoff — by adjusting beam size and the external LM, users can sacrifice latency for accuracy or pay for more accurate results with a small latency cost. Decoding with no external LM can improve results at very little performance cost. -- Portability without retraining — existing neural models can benefit from multiple decoding setups and plug-and-play with external LMs without training or fine-tuning. -- A compelling complexity/accuracy tradeoff — adding beam search to an existing modeling pipeline incurs little additional complexity and can improve performance. - -## Performance Benchmarks - -Today's most commonly-used beam search decoding libraries today that support external language model integration include Kensho's [pyctcdecode](https://github.com/kensho-technologies/pyctcdecode), NVIDIA's [NeMo toolkit](https://github.com/NVIDIA/NeMo/tree/stable/scripts/asr_language_modeling). We benchmark the TorchAudio + Flashlight decoder against them with a *wav2vec 2.0* base model trained on 100 hours of audio evaluated on [LibriSpeech](https://www.openslr.org/12) dev-other with the official [KenLM](https://github.com/kpu/kenlm/) 3-gram LM. Benchmarks were run on Intel E5-2698 CPUs on a single thread. All computation was in-memory — KenLM memory mapping was disabled as it wasn't widely supported. - -When benchmarking, we measure the *time-to-WER (word error rate)* — because of subtle differences in the implementation of decoding algorithms and the complex relationships between parameters and decoding speed, some hyperparameters differed across runs. To fairly assess performance, we first sweep for parameters that achieve a baseline WER, minimizing beam size if possible. - -

        - -

        - -

        -Decoding performance on Librispeech dev-other of a pretrained wav2vec 2.0 model. TorchAudio + Flashlight decoding outperforms by an order of magnitude at low WERs. -

        - -

        - -

        - -

        -Time-to-WER results, deferring to smaller beam size, across decoders. The TorchAudio + Flashlight decoder scales far better with larger beam sizes and at lower WERs. -

        - -## TorchAudio API and Usage - -TorchAudio provides a Python API for CTC beam search decoding, with support for the following: - -- lexicon and lexicon-free decoding -- KenLM n-gram language model integration -- character and word-piece decoding -- sample pretrained LibriSpeech KenLM models and corresponding lexicon and token files -- various customizable beam search parameters (beam size, pruning threshold, LM weight...) - -To set up the decoder, use the factory function torchaudio.models.decoder.ctc_decoder - -```python -from torchaudio.models.decoder import ctc_decoder, download_pretrained_files -files = download_pretrained_files("librispeech-4-gram") -decoder = ctc_decoder( - lexicon=files.lexicon, - tokens=files.tokens, - lm=files.lm, - nbest=1, - ... additional optional customizable args ... -) -``` - -Given emissions of shape *(batch, time, num_tokens)*, the decoder will compute and return a List of batch Lists, each consisting of the nbest hypotheses corresponding to the emissions. Each hypothesis can be further broken down into tokens, words (if a lexicon is provided), score, and timesteps components. - -```python -emissions = acoustic_model(waveforms) # (B, T, N) -batch_hypotheses = decoder(emissions) # List[List[CTCHypothesis]] - -# transcript for a lexicon decoder -transcripts = [" ".join(hypo[0].words) for hypo in batch_hypotheses] - -# transcript for a lexicon free decoder, splitting by sil token -batch_tokens = [decoder.idxs_to_tokens(hypo[0].tokens) for hypo in batch_hypotheses] -transcripts = ["".join(tokens) for tokens in batch_tokens] -``` - -Please refer to the [documentation](https://pytorch.org/audio/stable/models.decoder.html#ctcdecoder) for more API details, and the tutorial ([ASR Inference Decoding](https://pytorch.org/audio/main/tutorials/asr_inference_with_ctc_decoder_tutorial.html)) or sample [inference script](https://github.com/pytorch/audio/tree/main/examples/asr/librispeech_ctc_decoder) for more usage examples. - -## Upcoming Improvements - -**Full NNLM support** — decoding with large neural language models (e.g. transformers) remains somewhat unexplored at scale. Already supported in Flashlight, we plan to add support in TorchAudio, allowing users to use custom decoder-compatible LMs. Custom word level language models are already available in the nightly TorchAudio build, and is slated to be released in TorchAudio 0.13. - -**Autoregressive/seq2seq decoding** — Flashlight Text also supports [sequence-to-sequence (seq2seq) decoding](https://github.com/flashlight/text/blob/main/flashlight/lib/text/decoder/LexiconSeq2SeqDecoder.h) for autoregressive models, which we hope to add bindings for and add to TorchAudio and TorchText with efficient GPU implementations as well. - -**Better build support** — to benefit from improvements in Flashlight Text, TorchAudio will directly submodule Flashlight Text to make upstreaming modifications and improvements easier. This is already in effect in the nightly TorchAudio build, and is slated to be released in TorchAudio 0.13. 
- -## Citation - -To cite the decoder, please use the following: - -```python -@inproceedings{kahn2022flashlight, - title={Flashlight: Enabling innovation in tools for machine learning}, - author={Kahn, Jacob D and Pratap, Vineel and Likhomanenko, Tatiana and Xu, Qiantong and Hannun, Awni and Cai, Jeff and Tomasello, Paden and Lee, Ann and Grave, Edouard and Avidov, Gilad and others}, - booktitle={International Conference on Machine Learning}, - pages={10557--10574}, - year={2022}, - organization={PMLR} -} -``` -```python -@inproceedings{yang2022torchaudio, - title={Torchaudio: Building blocks for audio and speech processing}, - author={Yang, Yao-Yuan and Hira, Moto and Ni, Zhaoheng and Astafurov, Artyom and Chen, Caroline and Puhrsch, Christian and Pollack, David and Genzel, Dmitriy and Greenberg, Donny and Yang, Edward Z and others}, - booktitle={ICASSP 2022-2022 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)}, - pages={6982--6986}, - year={2022}, - organization={IEEE} -} -``` \ No newline at end of file diff --git a/_posts/2022-9-12-PyTorchfoundation.md b/_posts/2022-9-12-PyTorchfoundation.md deleted file mode 100644 index 368957576abc..000000000000 --- a/_posts/2022-9-12-PyTorchfoundation.md +++ /dev/null @@ -1,21 +0,0 @@ ---- -layout: blog_detail -title: "PyTorch strengthens its governance by joining the Linux Foundation" -author: Soumith Chintala -featured-img: "/assets/images/pytorch-foundation-blog-image.jpg" ---- - -Today, I am proud to announce that PyTorch is moving to the [Linux Foundation (LF)](https://www.linuxfoundation.org/) as a top-level project under the name PyTorch Foundation. The [core mission](https://www.linuxfoundation.org/about/) of the Linux Foundation is the collaborative development of open source software. With a governing board of leaders from AMD, Amazon Web Services (AWS), Google Cloud, Meta, Microsoft Azure and NVIDIA, this model aligns with where PyTorch stands today and what it needs to travel forward. The creation of the PyTorch Foundation will ensure business decisions are being made in a transparent and open manner by a diverse group of members for years to come. The technical decisions remain in control of individual maintainers. I’m excited that the Linux Foundation will be our new home as they have notable experience supporting large open-source projects like ours such as Kubernetes and NodeJS. At this pivotal moment, I want to take a look back at how we started, share why we are moving, and what’s ahead. - -This January, PyTorch celebrated its 5 year anniversary! I reflected on what it meant to me in this [tweet thread](https://soumith.ch/posts/2022/01/pytorch-retro/), and [this](https://www.youtube.com/watch?v=r7qB7mKJOFk) conversation with my colleagues Mike Schroepfer, Lin Qiao, and Yann LeCun. When we started PyTorch development in 2016, it was a collective effort by a band of people from the [Lua]Torch community with a big chunk of people and funding from Meta and individuals contributing from NVIDIA, Twitter and other entities. - -Since 2017, PyTorch has grown far beyond our initial vision. With over [2,400 contributors](https://github.com/pytorch/pytorch/graphs/contributors) who have built nearly 154,000 projects using PyTorch as a foundation, PyTorch has become one of the primary platforms for AI research, as well as commercial production use. 
We’ve seen its impact across industry and academia, from large companies to numerous university courses at Stanford, NYU, EPFL, Oxford, and other academic institutions. As a maintainer of PyTorch, the journey has been extremely fulfilling, with the impact of the project seen in various fields from self-driving cars to healthcare to aerospace. - -As PyTorch grew, many companies have made foundational investments around it. While Meta remains the largest contributor to PyTorch, companies such as AMD, Amazon Web Services (AWS), Google Cloud, HuggingFace, Lightning AI, Microsoft Azure, Nvidia, and many others have made significant investments, including both technical contributions and community building efforts. They’ve established teams around PyTorch or filled significant voids within the PyTorch community and sent countless contributions to the PyTorch core and to the ecosystem around it — PyTorch is an important part of their future. With PyTorch continuing to grow as a multi-stakeholder project, it’s time to move to a broader open-source foundation. - -The business governance of PyTorch was fairly unstructured for quite some time since launch – we operated like a scrappy startup. Team members at Meta spent the time and energy to structure this properly and organize PyTorch into an organizationally more healthy entity. Meta helped PyTorch with introducing many structures, such as [Contributor License Agreements](https://pytorch.org/blog/a-contributor-license-agreement-for-pytorch/), [Branding Guidelines](https://pytorch.org/assets/brand-guidelines/PyTorch-Brand-Guidelines.pdf), and Trademark registration. Keeping PyTorch’s organizational health up to check is essential and beneficial for the community. The next stage of our organizational progress is to support the interests of multiple stakeholders, hence moving to a foundation is good. We chose the Linux Foundation as it has vast organization experience hosting large multi-stakeholder open-source projects with the right balance of organizational structure and finding specific solutions for these projects. - -Simultaneously, the technical governance of PyTorch has been a loosely structured community model of open-source development — A set of people maintaining PyTorch by area with their responsibility often tied to their individual identity rather than their employment. While we kept a codified list at the [PyTorch - Maintainers](https://pytorch.org/docs/stable/community/persons_of_interest.html) page, the technical governance was not formalized nor codified. As PyTorch scales as a community, the next step is to structure and codify. The [PyTorch Technical Governance](https://pytorch.org/docs/master/community/governance.html) now supports a hierarchical maintainer structure and clear outlining of processes around day to day work and escalations. This doesn’t change how we run things, but it does add discipline and openness that at our scale feels essential and timely. - -It’s been an exciting journey since 2016. I am grateful for the experiences and people I’ve met along the way. PyTorch started with a small group of contributors which have grown and diversified over the years, all bringing in new ideas and innovations that would not have been possible without our community. We want to continue the open-source spirit – for the community and by the community. Thank you to our contributors, maintainers, users, supporters and new foundation members. We look forward to the next chapter of PyTorch with the PyTorch Foundation. 
- diff --git a/_posts/2022-9-26-announcing-pytorch-conference-2022.md b/_posts/2022-9-26-announcing-pytorch-conference-2022.md deleted file mode 100644 index d912ef419d78..000000000000 --- a/_posts/2022-9-26-announcing-pytorch-conference-2022.md +++ /dev/null @@ -1,46 +0,0 @@ ---- -layout: blog_detail -title: "Announcing PyTorch Conference 2022" -author: -featured-img: "/assets/images/pytorch-conference-2022.png" ---- - -We are excited to announce that the PyTorch Conference returns in-person as a satellite event to [NeurlPS](https://l.workplace.com/l.php?u=https%3A%2F%2Fnips.cc%2F&h=AT3cdRwSEhyuNXpH2ptWjk-KxMxcceaYeTfflT6PEezDQ_zeUxRv1gjX7GhTQBgvZxFAR0wlSBwuhpipdMjUknMnhY5oJ5C4HjLNO40-12UnoeYALriwrvdxGfgigo8KYlWu_gRIQwlO-2r0wTnNft0whoSaOdVAxw&__tn__=-UK-R&c[0]=AT3z6QRLu8Uw48lKQ_P6FFq7ncHfjsfI16OGZvWO9kALatCY4sZcMjNzR7a4OiOG25RKVHpDX0TGutZHyM_R8Kl2s71Y3DEbq5QccmUVaSzCbcMUSc5Ms2zXHoeGxUlw1XirihAydPsX4Y1OmF6GRjqH8YFTNTFQRN3I8j2SFhR8LEUDxDmfnZ8Q7c2hXi0HeGc) (Neural Information Processing Systems) in New Orleans on Dec. 2nd. - -

        - -

        - -We changed the name from PyTorch Developer Day to PyTorch Conference to signify the turning of a new chapter as we look to the future of PyTorch, encompassing the entire PyTorch Community. This conference will bring together leading researchers, academics and developers from the Machine Learning (ML) and Deep Learning (DL) communities to join a multiple set of talks and a poster session; covering new software releases on [PyTorch](https://pytorch.org/), use cases in academia and industry, as well as ML/DL development and production trends. - -### EVENT OVERVIEW - -When: Dec 2nd, 2022 (In-Person and Virtual) - -Where: New Orleans, Louisiana (USA) | *Virtual option as well* - -### SCHEDULE - -All times in Central Standard. - -8:00-9:00 am   Registration/Check in - -9:00-11:20 am   Keynote & Technical Talks - -11:30-1:00 pm   Lunch - -1:00-3:00 pm   Poster Session & Breakouts - -3:00-4:00 pm   Community/Partner Talks - -4:00-5:00 pm   Panel Discussion - -Agenda subject to change. - -All talks will be livestreamed and available to the public. The in-person event will be by invitation only as space is limited. If you’d like to apply to attend in person, please submit all requests [here](https://pytorchconference22.splashthat.com/). - -### LINKS - -- [Submit Content for Consideration by Sept. 30th](https://docs.google.com/forms/d/121ptOuhqhmcPev9g5Zt2Ffl-NtB_oeyFk5CWjumUVLQ/edit) -- [Livestream event page](https://www.facebook.com/events/1562940847455759) -- [Apply for an invitation to the in-person event](https://pytorchconference22.splashthat.com/) \ No newline at end of file diff --git a/_posts/2022-9-29-performance-debugging-of-production-pytorch-models-at-meta.md b/_posts/2022-9-29-performance-debugging-of-production-pytorch-models-at-meta.md deleted file mode 100644 index 83ae8f260c56..000000000000 --- a/_posts/2022-9-29-performance-debugging-of-production-pytorch-models-at-meta.md +++ /dev/null @@ -1,121 +0,0 @@ ---- -layout: blog_detail -title: "Performance Debugging of Production PyTorch Models at Meta" -author: CK Luk, Lei Tian -featured-img: "/assets/images/performance-debugging-of-production-pytorch-models-at-meta-1.png" ---- - -## 1. Meta’s AI Performance Profiling (MAIProf) - -

        - -

        - -

-Figure 1: A simplified illustration of Meta’s AI performance profiling (MAIProf) infrastructure. -

- -Figure 1 gives a simplified illustration of the AI performance profiling infrastructure at Meta. Through the User Portal, ML research and performance engineers submit a profiling request for a training job to the Profiling Service, which subsequently broadcasts the request to all the GPU hosts running the training job. When the Monitoring Daemon on a GPU host receives the profiling request, it notifies the Kineto GPU tracer (built on top of NVIDIA’s libcupti) inside the PyTorch program corresponding to the training job. As a result, Kineto traces are collected and uploaded to the Object Store asynchronously (in more detail: one Kineto trace is collected for each individual GPU, each treated and stored as a blob; an example is given in Section 2). Meanwhile, MAIProf also collects a variety of aggregated performance metrics: the Monitoring Daemon on every GPU host continuously reads performance counters from NVIDIA’s DCGM/NVML and logs them to a Time Series DB. - -Once both trace and metric collection are complete, the Profiling Service automatically downloads traces from the Object Store for trace analysis and performance metrics from the Time Series DB for metric analysis. Finally, an overall profiling report with detailed and insightful analysis is delivered to the user. - -To serve production uses, we deliberately made the following design choices for MAIProf: - -- **No source-code change required in the PyTorch models**: profiling is triggered by sampling the execution of an unmodified model for a user-specified amount of time. -- **Provide a holistic view of performance**: MAIProf performs system-wide analysis that covers both CPU and GPU. Under the hood, it invokes various CPU tools (e.g., Python tracer, Autograd Observer) and GPU tools (e.g., Kineto, DCGM) and correlates their results. -- **Provide multiple tools that target a wide range of AI practitioners**: At Meta, there are engineers with different backgrounds who may need to tune their AI workload performance. Some of them are AI experts while others are general software engineers. Therefore, MAIProf provides a variety of tools for different levels of performance debugging, from high-level automatic trace comprehension to low-level trace analysis. -- **Support distributed GPU profiling**: MAIProf can collect profiling data from multiple hosts, each with multiple GPUs. It then shows a combined view/analysis of the entire system. -- **Highly scalable**: MAIProf is built as a service on top of existing infrastructures in Meta data centers, such as a scalable storage system called Manifold. Its profiling capability can be easily scaled by adding more machines to the service pool as workloads increase. - -## 2. Case Study: Optimizing a Protection PyTorch Model - -To be concrete, we use a case study on a protection PyTorch model used in production. First, we discuss our steps for identifying the performance bottlenecks in the model with MAIProf. Then we describe the corresponding optimizations applied and their impacts. - -### 2.1 Performance Bottlenecks - -#### Step 1: - -Inspect the CPU and GPU utilization on the same timeline, as shown in Figure 2. - -

        - -

        - -

        -Figure 2: CPU usage over time (the top) vs. GPU usage over time (the bottom). -

        - -The first performance anomaly we noticed in Figure 2 is the pattern: *“GPU-idle, GPU-active, GPU-idle, GPU-active …”* throughout the training. Overall, the GPU is idle for more than half of the training time (this is bad for performance because the GPU is a higher-performance device and so we want it to be utilized as much as possible). - -#### Step 2: - -Collect a Python function call trace on the CPU with MAIProf while the GPU is idle, which is shown in Figure 3. - -

        - -

        - -

        -Figure 3: A Python call trace. -

- -The Python trace shows that most of the CPU time is spent inside a Python function `sharded_iterrows()`. From the source code of the model, we learned that this function processes a big feature table in parallel. The number of worker threads used is controlled by a configurable parameter (`num_worker_threads`). Also, after investigating how the feature table is generated, we understood the performance anomaly: the training dataset is too large to fit in CPU memory all at once; it needs to be broken into multiple sub-datasets, each with sufficient data for running 10 epochs. Consequently, a new sub-dataset needs to be read from disk to memory every 10 epochs, during which the GPU is totally idle. - -#### Step 3: - -Collect GPU performance metrics, which are shown in Figure 4. - -

        - -

        - -

        -Figure 4: GPU performance metrics in MAIProf. -

        - -We made the following observations from Figure 4: - -- The streaming multiprocessor (SM) runs the model’s CUDA kernels. Its utilization [1] is 9.1%, indicating that the parallel compute units on the GPU are not well utilized. -- Tensor Core utilization is 0, meaning that Tensor Core (the mixed-precision compute unit on GPU) [2] is not used at all. -- Max GPU memory utilization is 47.13%, indicating that half of the GPU memory is left unused. - -#### Step 4: - -Collect a GPU trace (aka Kineto trace) of the training loop as shown in Figure 5. - -

        - -

        - -

        -Figure 5: A GPU trace (aka Kineto trace) of the training loop. -

- -Since commonly used PyTorch functions are already annotated, their names are automatically shown on the trace. With them, we can roughly divide the trace into four phases of a training iteration: (1) data loading, (2) forward pass, (3) backward pass, (4) gradient optimization (note: in Figure 5, the “optimizer” phase is from the previous batch while the other three phases are from the current batch). - -### 2.2 Optimizations - -We performed four simple optimizations that target the bottlenecks identified above, each requiring only a change in a config parameter or at most a few source lines. They are listed in Figure 6. -
-| Optimization | Amount of changes | Bottlenecks addressed |
-| ------------ | ----------------- | --------------------- |
-| Tune `num_worker_threads` by trying a few possible values within the number of CPU cores on each host. | 1 source line | GPU totally idle time |
-| Double the batch sizes | 2 config parameters | GPU memory under-utilization |
-| Use [automatic mixed precision](https://pytorch.org/tutorials/recipes/recipes/amp_recipe.html) in PyTorch | 13 source lines | Zero Tensor Core utilization |
-| Use [multitensor optimizer](https://pytorch.org/docs/stable/generated/torch.optim.AdamW.html#torch.optim.AdamW) in PyTorch | 1 source line | Many small GPU kernels in the optimizer |

        -Figure 6: Four simple optimizations applied. -
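As a rough sketch of what the mixed-precision and multi-tensor-optimizer changes in Figure 6 can look like in code (the tiny model, synthetic data, and hyperparameters below are placeholders rather than the production model, and `foreach=True` assumes a reasonably recent PyTorch release):

```python
import torch
import torch.nn as nn

# Placeholder model and synthetic batches standing in for the production workload.
device = "cuda" if torch.cuda.is_available() else "cpu"
model = nn.Sequential(nn.Linear(512, 512), nn.ReLU(), nn.Linear(512, 10)).to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3, foreach=True)  # multi-tensor path
scaler = torch.cuda.amp.GradScaler(enabled=(device == "cuda"))
loss_fn = nn.CrossEntropyLoss()

for _ in range(10):  # stand-in for iterating over a DataLoader
    x = torch.randn(64, 512, device=device)           # a doubled batch size would go here
    y = torch.randint(0, 10, (64,), device=device)
    optimizer.zero_grad(set_to_none=True)
    with torch.cuda.amp.autocast(enabled=(device == "cuda")):  # mixed precision -> Tensor Cores
        loss = loss_fn(model(x), y)
    scaler.scale(loss).backward()
    scaler.step(optimizer)
    scaler.update()
```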

        - -## 3. Concluding Remarks - -Performance tuning for PyTorch in production environments is increasingly important. A capable performance-debugging tool is a key to this process. We demonstrate with a case study on a production model that MAIProf is a powerful infrastructure for identifying optimization opportunities. - -At Meta, MAIProf has been used by 100s of engineers, from performance novices to experts, to identify many more types of bottlenecks. These include slow data loading, small and/or slow GPU kernels, distributed training issues such as load imbalance and excessive communication. MAIProf covers major classes of models, including recommendation, vision, and natural language processing. In summary, it is now an indispensable tool for tuning the performance of production PyTorch workloads. - -## References - -[1] [https://docs.nvidia.com/gameworks/content/developertools/desktop/analysis/report/ cudaexperiments/kernellevel/achievedoccupancy.htm](https://docs.nvidia.com/gameworks/content/developertools/desktop/analysis/report/cudaexperiments/kernellevel/achievedoccupancy.htm) - -[2] [https://www.nvidia.com/en-us/data-center/tensor-cores/](https://www.nvidia.com/en-us/data-center/tensor-cores/) \ No newline at end of file diff --git a/_posts/2023-01-09-trace-analysis-for-masses.md b/_posts/2023-01-09-trace-analysis-for-masses.md deleted file mode 100644 index 85f26f1b7ba9..000000000000 --- a/_posts/2023-01-09-trace-analysis-for-masses.md +++ /dev/null @@ -1,142 +0,0 @@ ---- -layout: blog_detail -title: "PyTorch Trace Analysis for the Masses" -author: Anupam Bhatnagar, Xizhou Feng, Brian Coutinho, Yifan Liu, Sung-Han Lin, Louis Feng, and Yuzhen Huang ---- - -We are excited to announce the public release of Holistic Trace Analysis (HTA), an open source performance analysis and visualization Python library for PyTorch users. HTA takes as input [Kineto traces](https://github.com/pytorch/kineto) collected by the [PyTorch profiler](https://pytorch.org/blog/introducing-pytorch-profiler-the-new-and-improved-performance-tool/), which are complex and challenging to interpret, and up-levels the performance information contained in these traces. It was initially developed internally at Meta to understand and debug performance problems for large-scale distributed training jobs on GPUs. The multidisciplinary team has made a number of enhancements to HTA’s features and scaled them to support state-of-the-art ML workloads. - -ML researchers and systems engineers often struggle to computationally scale up their models because they are not aware of the performance bottlenecks in their workloads. The resources requested for a job (e.g. GPUs, memory) are often misaligned with the resources actually required due to lack of visibility “under the hood”. To achieve the best performance from the hardware stack, it is imperative to understand the resource utilization and bottlenecks for distributed training workloads. - -The initial HTA implementation was specifically targeted at Deep Learning Based Recommendation Models (DLRM). To make the features in HTA generic and applicable to use cases such as analyzing Vision and NLP models, we decided to refactor the HTA codebase and make the library available to the larger community. This new codebase has implemented several important ideas which lead to significant efficiency and performance improvements. 
- -In this blog, we present several features implemented in the open source version of HTA, which can be used as a Python script as well as interactively in a Jupyter notebook. HTA provides the following features: - -1. **Breakdown by Dimensions** - 1. **Temporal**: Breakdown of GPU time in terms of time spent in computation, communication, memory events, and idle time on a single node and across all ranks. - 1. **Idle Time**: Breakdown of GPU idle time into waiting for the host, waiting for another kernel or attributed to an unknown cause. - 1. **Kernel**: Find kernels with the longest duration on each rank. - 1. **Communication Computation Overlap**: Calculate the percentage of time when communication overlaps computation. -1. **Statistical Analysis** - 1. **Kernel Duration Distribution**: Distribution of average time taken by longest kernels across different ranks. - 1. **CUDA Kernel Launch**: Distributions of GPU kernels with very small duration, large duration, and excessive launch time. - 1. **Augmented Counters (Memory bandwidth, Queue length)**: Augmented trace files which provide insights into memory copy bandwidth and number of outstanding operations on each CUDA stream. -1. **Patterns** - 1. **Frequent CUDA Kernels**: Find the CUDA kernels most frequently launched by any given PyTorch or user defined operator. -1. **Trace Comparison** - 1. **Trace Diff**: A trace comparison tool to identify and visualize the differences between traces. - -HTA source code is available to users via [Github](https://github.com/facebookresearch/HolisticTraceAnalysis). Users can request new features or build their own analysis using the core libraries and data structures provided in the codebase in addition to the features mentioned above. - -## GPU Training Performance Debugging 101 - -To understand the GPU performance in distributed training jobs, we consider how the model operators interact with the GPU devices and how such interactions are reflected in certain measurable metrics. - -At a high level, we can break down the GPU operations in a model execution into three broad categories, henceforth referred to as kernel types: -1. **Computation (COMP)** - Compute kernels execute compiled routines for matrix multiplication and similar numeric calculations. They are responsible for all of the number-crunching necessary for model execution. -1. **Communication (COMM)** - Communication kernels are routines which are responsible for exchanging and synchronizing data between different GPU devices in a distributed training job. The NVIDIA Collective Communication Library (NCCL) is a widely used communication library and all its kernels have the prefix “nccl”. Example NCCL kernels include NCCL_AllGather, NCCL_ReduceScatter, NCCL_AllReduce, etc. -1. **Memory (MEM)** - Memory kernels manage the memory allocations/deallocations on the GPU devices and data movement between the memory space on the host and the GPUs. The memory kernels include Memcpy_H2D, Memcpy_D2H, Memcpy_D2D, Memset, etc. Here, H represents the Host and D represents the GPU Device. Thus, H2D, D2H, D2D stands for Host to Device, Device to Host and Device to Device respectively. - -Because a modern GPU device like the NVIDIA A100 GPU is a massively parallel device which is capable of running multiple kernels simultaneously, it is possible to overlap the computation, communication, and memory kernels to reduce the model execution time. One common technique to achieve the overlap is to utilize multiple CUDA streams. 
A CUDA stream is a sequence of operations that execute on a GPU device in the order in which they are issued by the host code. Different CUDA streams can be interleaved and even run concurrently, thus achieving the effect of kernel overlap. - -To help understand the above concepts, Figure 1 provides a timeline of the GPU kernels in a sample distributed training job on 8 GPUs for one iteration. In the figure below, each rank represents one GPU and the kernels on each GPU run on 6 CUDA streams. In the right column of the figure, you can see names of the GPU kernels used. In the middle of the figure, you see the overlap between compute and communicate kernels. This figure is created using the [plot_timeline example notebook](https://github.com/facebookresearch/HolisticTraceAnalysis/blob/main/examples/plot_timeline.ipynb) available in HTA. - -![Figure 1. An example of the execution timeline of GPU Kernels across multiple ranks](/assets/images/trace-image6.png){:width="100%"} - -*Figure 1. An example of the execution timeline of GPU Kernels across multiple ranks* -{: style="text-align: center;"} - -The performance of multiple GPU training jobs is affected by multiple factors. Among these factors, how does a model execution create and orchestrate the GPU kernels plays a critical role. HTA provides insights on how the model execution interacts with the GPU devices and highlights the opportunities for performance improvement. - -With the features we built in HTA, we aim to provide users insights into “what is happening under the hood in a distributed GPU training?” We briefly describe these features in the next few paragraphs. - -## Features in Holistic Trace Analysis - -For most users, understanding the performance of GPU training jobs is nontrivial. Thus, we built this library to simplify the task of trace analysis and provide the user useful insights by examining the model execution traces. As the first step, we developed features which are important and generic enough so that most users can benefit from this library. - -**Temporal Breakdown**: We begin by asking whether the GPU is spending time on computation, communication, memory events, or is it idle? To answer this question, the temporal breakdown feature presents a breakdown in terms of these categories. To achieve high training efficiency the code should maximize time used by computation kernels and minimize idle time and non-compute time (time used by communication or memory kernels). This is accomplished by implementing concurrent execution of computation kernels with communication or memory kernels. *Note that, during concurrent execution of computation kernels with communication/memory kernels the time spent by communication/memory kernels is accounted for under compute time.* - -![Figure 2: Temporal Breakdown across 8 GPUs](/assets/images/trace-image3.png){:width="100%"} - -*Figure 2: Temporal Breakdown across 8 GPUs* -{: style="text-align: center;"} - -**Kernel Breakdown**: It is natural to ask which kernels are taking the most amount of time. The next feature breaks down the time spent within each kernel type (COMM, COMP, MEM) and sorts them by duration. We present this information for each kernel type and for each rank as a pie chart. See figure 3 below. 
- -![Figure 3: Pie chart of top computation and communication kernels](/assets/images/trace-image1.png){:width="100%"} - -*Figure 3: Pie chart of top computation and communication kernels* -{: style="text-align: center;"} - -**Kernel Duration Distribution**: Subsequently, one can also ask - for any given kernel, what is the distribution of the time spent across the ranks? To answer this, HTA generates bar graphs for the average duration of a given kernel across all ranks. Additionally, the error bars in the bar graphs show the minimum and maximum amount of time taken by a given kernel on a given rank. Figure 4 below shows a discrepancy between average duration on rank 0 as compared to other ranks. This anomalous behavior on rank 0 guides the user on where to look for possible bugs. - -![Figure 4: Average duration of NCCL AllReduce Kernel across 8 ranks](/assets/images/trace-image4.png){:width="100%"} - -*Figure 4: Average duration of NCCL AllReduce Kernel across 8 ranks* -{: style="text-align: center;"} - -**Communication Computation Overlap**: In distributed training, a significant amount of time is spent in communication and synchronization events among multiple GPU devices. To achieve high GPU efficiency (i.e. TFLOPS/GPU) it is vital to keep the GPU doing actual computation work. In other words, a GPU should not be blocked because of waiting for data from other GPUs. One way to measure the extent to which computation is blocked by data dependencies is to calculate the computation-communication overlap. Higher GPU efficiency is observed if communication events overlap computation events. Lack of communication and computation overlap will lead to the GPU being idle, thus the efficiency would be low. Thus, the communication computation overlap feature calculates the percentage of time communication and computation overlap in a job for each rank and generates a bar graph representation. See figure below. More precisely, we measure the following ratio - -(time spent in computation while communicating) / (time spent in communication) -{: style="text-align: center;"} - - -![Figure 5: Communication computation overlap](/assets/images/trace-image5.png){:width="100%"} - -*Figure 5: Communication computation overlap* -{: style="text-align: center;"} - -**Augmented Counters (Queue length, Memory bandwidth)**: To aid in debugging, HTA calculates the memory bandwidth statistics for D2H, H2D and D2D memory copy (memcpy) and memory set (memset) events. Additionally, HTA also computes the number of outstanding CUDA operations on each CUDA stream. We refer to this as queue length. When the queue length on a stream is 1024 or larger new events cannot be scheduled on that stream and the CPU will stall until the GPU events have processed. Additionally, HTA generates a new trace file containing tracks with the memory bandwidth and queue length time series. See Figure 6 below. - -![Figure 6: Memory Bandwidth and Queue Length](/assets/images/trace-image2.png){:width="100%"} - -*Figure 6: Memory Bandwidth and Queue Length* -{: style="text-align: center;"} - -These primary features give us a peek into the system performance and help answer “what is happening in the system?”. As HTA evolves, we hope to address “why is X happening?” and also suggest possible solutions to overcome the bottlenecks. - -## Installation and Usage - -### Installation - -For installing the HTA please refer to the [README](https://github.com/facebookresearch/HolisticTraceAnalysis/blob/main/README.md). 
In brief, the user is required to clone the [repo](https://github.com/facebookresearch/HolisticTraceAnalysis) and install the necessary Python packages via pip. - -### Usage - -This version of Holistic Trace Analysis is currently in beta and we recommend using HTA in a Jupyter notebook. A [demo notebook](https://github.com/facebookresearch/HolisticTraceAnalysis/blob/main/examples/trace_analysis_demo.ipynb) is provided for your convenience. To get started, import the hta package in a Jupyter notebook, create a TraceAnalysis object and off we go in exactly two lines of code. - -```python -from hta.trace_analysis import TraceAnalysis -analyzer = TraceAnalysis(trace_dir = “/trace/folder/path”) -``` - -### Requirements - -- All trace files for a training or inference job must be stored in a unique folder. -- Trace files are in json or gzipped json format. - -## FAQ - -#### Q. How can I install HTA? - -Please see the [README](https://github.com/facebookresearch/HolisticTraceAnalysis/blob/main/README.md) in the root directory of the repository. - -#### Q. Is there any documentation on the features and API in HTA? - -The documentation and detailed API is available [here](https://hta.readthedocs.io/). - -#### Q. Can you implement feature X? - -Depending on how widely the feature is needed and the level of effort required to implement it we would consider developing the feature. Please open a [Github Issue](https://github.com/facebookresearch/HolisticTraceAnalysis/issues) and tag it with the feature-request label. - -#### Q. Can I modify the code? - -Please do and [send a PR](https://github.com/facebookresearch/HolisticTraceAnalysis/pulls) along the way, if you think it would be useful for others. - -#### Q. How can I collect traces in PyTorch? - -Please refer to this tutorial [here](https://pytorch.org/tutorials/intermediate/tensorboard_profiler_tutorial.html#use-profiler-to-record-execution-events). - -#### Q. Can HTA be used at production scale? - -Yes, please see a use case study [here](https://pytorch.org/blog/performance-debugging-of-production-pytorch-models-at-meta/). diff --git a/_posts/2023-02-02-deprecation-cuda-python-support.md b/_posts/2023-02-02-deprecation-cuda-python-support.md deleted file mode 100644 index c02c7249d8d6..000000000000 --- a/_posts/2023-02-02-deprecation-cuda-python-support.md +++ /dev/null @@ -1,62 +0,0 @@ ---- -layout: blog_detail -title: "Deprecation of CUDA 11.6 and Python 3.7 Support" ---- - -For the upcoming PyTorch 2.0 feature release (target March 2023), we will target CUDA 11.7 as the stable version and CUDA 11.8 as the experimental version of CUDA and Python >=3.8, <=3.11. - -If you are still using or depending on CUDA 11.6 or Python 3.7 builds, we strongly recommend moving to at least CUDA 11.7 and Python 3.8, as it would be the minimum versions required for PyTorch 2.0. - -**Please note that as of Feb 1, CUDA 11.6 and Python 3.7 are no longer included in the nightlies** - -Please refer to the Release Compatibility Matrix for PyTorch releases: - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-| PyTorch Version | Python | Stable CUDA | Experimental CUDA |
-| --- | --- | --- | --- |
-| 2.0 | >=3.8, <=3.11 | CUDA 11.7, CUDNN 8.5.0.96 | CUDA 11.8, CUDNN 8.7.0.84 |
-| 1.13 | >=3.7, <=3.10 | CUDA 11.6, CUDNN 8.3.2.44 | CUDA 11.7, CUDNN 8.5.0.96 |
-| 1.12 | >=3.7, <=3.10 | CUDA 11.3, CUDNN 8.3.2.44 | CUDA 11.6, CUDNN 8.3.2.44 |
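A quick way to confirm which of these versions a given environment is actually running is to inspect PyTorch's standard version attributes (a small sketch; the CUDA and cuDNN values are `None` on CPU-only builds):

```python
import sys
import torch

# Print the versions relevant to the compatibility matrix above.
print("Python :", sys.version.split()[0])
print("PyTorch:", torch.__version__)
print("CUDA   :", torch.version.cuda)              # None on CPU-only builds
print("cuDNN  :", torch.backends.cudnn.version())  # None if cuDNN is unavailable
```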
        - - -As of 2/1/2023 - -For more information on PyTorch releases, updated compatibility matrix and release policies, please see (and bookmark) [Readme](https://github.com/pytorch/pytorch/blob/master/RELEASE.md#release-compatibility-matrix). - diff --git a/_posts/2023-02-14-democratizing-ai-with-pytorch.md b/_posts/2023-02-14-democratizing-ai-with-pytorch.md deleted file mode 100644 index 3430ce5cb4eb..000000000000 --- a/_posts/2023-02-14-democratizing-ai-with-pytorch.md +++ /dev/null @@ -1,106 +0,0 @@ ---- -layout: blog_detail -title: "Democratizing AI with PyTorch Foundation and ROCm™ support for PyTorch" -author: AMD ---- - -![AMD Founding Member](/assets/images/2023-02-14-democratizing-ai-with-pytorch-1.png){:width="50%" style="display:block; margin-left:auto; margin-right:auto"} - -Last year, Meta announced that [PyTorch](https://pytorch.org/) joined the Linux Foundation as a neutral home for growing the machine learning project and community with AMD representation as a part of the founding membership and governing board. - -[PyTorch Foundation’s](https://pytorch.org/foundation) mission is to drive AI adoption by democratizing its software ecosystem through open source principles aligning with the AMD core principle of an Open software ecosystem. AMD strives to foster innovation through the support for latest generations of hardware, tools, libraries, and other components to simplify and accelerate adoption of AI across a broad range of scientific discoveries. - -
        -
        -

-AMD, along with key PyTorch codebase developers (including those at Meta AI), delivered a set of updates to the ROCm™ open software ecosystem that bring stable support for AMD Instinct™ accelerators as well as many Radeon™ GPUs. This now gives PyTorch developers the ability to build their next great AI solutions leveraging AMD GPU accelerators & ROCm. The support from the PyTorch community in identifying gaps, prioritizing key updates, providing feedback on performance optimization, and supporting our journey from “Beta” to “Stable” was immensely helpful, and we deeply appreciate the strong collaboration between the two teams at AMD and PyTorch. The move of ROCm support from “Beta” to “Stable” came in the PyTorch 1.12 release (June 2022) and added the ability to easily run PyTorch in a native environment without having to configure custom Docker images. This is a sign of confidence in the quality of support and performance of PyTorch using AMD Instinct and ROCm. The results of these collaborative efforts are evident in the performance measured on key industry benchmarks like Microsoft’s SuperBench, shown below in Graph 1. -

        -
        -
        -

        -“We are excited to see the significant impact of developers at AMD to contribute to and extend features within PyTorch to make AI models run in a more performant, efficient, and scalable way. A great example of this is the thought-leadership around unified memory approaches between the framework and future hardware systems, and we look forward to seeing that feature progress.”
        -- Soumith Chintala, PyTorch lead-maintainer and Director of Engineering, Meta AI -

        -
        -
        - - -The progressive improvements on both the AMD CDNA™ architecture as well as ROCm and PyTorch shows single GPU model throughput increase from AMD Instinct MI100 to the latest generation AMD Instinct MI200 family GPUs going from ROCm 4.2 to ROCm 5.3 and from PyTorch 1.7 to PyTorch 1.12. - -![Graph 1: ML model performance over generation using Microsoft Superbench Suite](/assets/images/2023-02-14-democratizing-ai-with-pytorch-2.png){:width="100%"} - -Graph 1: ML model performance over generation using Microsoft Superbench Suite 1, 2, 3 - - -Below are a few of the key updates for ROCm support since the PyTorch 1.12 release - - - -## Full Continuous Integration (CI) for ROCm on PyTorch - -With the ROCm support for PyTorch move from “Beta” to “Stable,” all the functions and features commits are now verified through a full Continuous Integration (CI) process. The CI process helps ensure the proper build and test process ahead of an expected Docker and PIP wheel release with stable commits forthcoming. - - -## Support for [Kineto Profiler](https://github.com/pytorch/kineto) - -The addition of Kineto profiler support to ROCm now helps developers and users understand performance bottlenecks through effective diagnosis and profiling tools. The tool also provides recommendations to improve known issues and visualization through TensorBoard UI. - -## Key PyTorch Libraries support added - -PyTorch ecosystem libraries like [TorchText](https://pytorch.org/tutorials/beginner/text_sentiment_ngrams_tutorial.html) (Text classification), [TorchRec](https://pytorch.org/torchrec/) (libraries for recommender systems - RecSys), [TorchVision](https://pytorch.org/vision/stable/index.html) (Computer Vision), [TorchAudio](https://pytorch.org/audio/stable/index.html) (audio and signal processing) are fully supported since ROCm 5.1 and upstreamed with PyTorch 1.12. - -Key libraries provided with the ROCm software stack including [MIOpen](https://github.com/ROCmSoftwarePlatform/MIOpen) (Convolution models), [RCCL](https://github.com/ROCmSoftwarePlatform/rccl) (ROCm Collective Communications) and [rocBLAS](https://github.com/ROCmSoftwarePlatform/rocBLAS) (BLAS for transformers) were further optimized to offer new potential efficiencies and higher performance. - -MIOpen innovates on several fronts, such as implementing fusion to optimize for memory bandwidth and GPU launch overheads, providing an auto-tuning infrastructure to overcome the large design space of problem configurations, and implementing different algorithms to optimize convolutions for different filter and input sizes. MIOpen is one of the first libraries to publicly support the bfloat16 data-type for convolutions, allowing efficient training at lower precision maintaining expected accuracy. - -RCCL (pronounced "Rickle") is a stand-alone library of standard collective communication routines for GPUs, implementing all-reduce, all-gather, reduce, broadcast, reduce-scatter, gather, scatter, and all-to-all. There is support for direct GPU-to-GPU send and receive operations. It has been optimized to achieve high bandwidth on platforms using PCIe®, Infinity Fabric™ (GPU to GPU) as well as networking using InfiniBand Verbs or TCP/IP sockets. RCCL supports an arbitrary number of GPUs installed in single or multiple nodes and can be used in either single- or multi-process (e.g., MPI) applications. 
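A practical consequence of the upstream support described above is that existing CUDA-style PyTorch code generally runs unchanged on ROCm builds; a small sanity-check sketch (attribute behavior as exposed by stock PyTorch ROCm wheels):

```python
import torch

# On a ROCm build of PyTorch, torch.version.hip is a version string (it is None
# on CUDA builds), and the AMD GPU is reached through the familiar "cuda" device.
print("HIP runtime  :", torch.version.hip)
print("GPU available:", torch.cuda.is_available())

if torch.cuda.is_available():
    x = torch.randn(1024, 1024, device="cuda")  # "cuda" maps to the AMD GPU under ROCm
    y = x @ x
    print("Matmul ran on:", y.device)
```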
- -Along with the above key highlights, over 50 features and functionality improvements were completed jointly between AMD and PyTorch to add stable support for ROCm. These include improvements to tools, compilers, runtime, graph optimizations through TorchScript, INT8 quant path usage, and [ONNX runtime integration](https://onnxruntime.ai/) including support for Navi 21 based Radeon™ PRO datacenter graphics card to name a few. - -## [AITemplate](https://github.com/facebookincubator/AITemplate) Inference Engine - -MetaAI recently published a blog announcing the release of its open source AITemplate ([link](https://ai.facebook.com/blog/gpu-inference-engine-nvidia-amd-open-source/)) for a unified inference system supporting AMD Instinct GPU accelerators using the AMD ROCm stack. This Python based framework can help significantly improve performance through increased utilization of AMD matrix cores for transformer blocks. This is achieved through the AMD [Composable Kernel (CK) library](https://github.com/ROCmSoftwarePlatform/composable_kernel) which provides performance critical Kernels for ML AI workloads across multiple architectures including GPUs and CPUs through HIP & C++. - -Moreover, the AITemplate also provides out-of-the-box support for widely used AI models like BERT, ResNET, Vision Transformer, Stable Diffusion etc. simplifying deployment process through these pretrained models. - - -## What’s coming with future ROCm releases? - -### Unified memory models for CPU + GPU - - - -As system architecture evolves to address the complexity of large problem sizes and data sets, memory management becomes a key performance bottle neck that needs a cohesive strategy to be addressed through innovations at both hardware and software levels. AMD is uniquely positioned to address this problem with its effective data center solutions integrating AMD EPYC™ CPU cores with its AMD Instinct GPU compute units in a truly unified datacenter APU (Accelerated Processing Unit) form factor set to be launched in 2H 2023. - -The software work to leverage the unified CPU + GPU memory has already started in collaboration with the PyTorch team, to enable the usage of a fast, low latency, synchronized memory model that enables not only AMD but also other AI accelerators to address the complex memory management problem of today. We are looking forward to this joint effort and announcement soon. - -## Acknowledgement - -The content in this blog highlights the joint work between AMD and key PyTorch contributors including Meta, working on many of the core features, as well as Microsoft enabling ONNX Runtime support. We are looking forward to working with the other founding members at the PyTorch Foundation on the next steps and improvements to democratize and grow adoption of PyTorch across the industry. - -## CAUTIONARY STATEMENT - - -This blog contains forward-looking statements concerning Advanced Micro Devices, Inc. (AMD) such as the availability, timing and expected benefits of an AMD datacenter APU form factor, which are made pursuant to the Safe Harbor provisions of the Private Securities Litigation Reform Act of 1995. Forward-looking statements are commonly identified by words such as "would," "may," "expects," "believes," "plans," "intends," "projects" and other terms with similar meaning. 
Investors are cautioned that the forward-looking statements in this blog are based on current beliefs, assumptions and expectations, speak only as of the date of this blog and involve risks and uncertainties that could cause actual results to differ materially from current expectations. Such statements are subject to certain known and unknown risks and uncertainties, many of which are difficult to predict and generally beyond AMD's control, that could cause actual results and other future events to differ materially from those expressed in, or implied or projected by, the forward-looking information and statements. Investors are urged to review in detail the risks and uncertainties in AMD’s Securities and Exchange Commission filings, including but not limited to AMD’s most recent reports on Forms 10-K and 10-Q. AMD does not assume, and hereby disclaims, any obligation to update forward-looking statements made in this blog, except as may be required by law. - - - -## Endnotes - - -1. MI100D-01 SuperBench v0.5 model training results based on AMD internal testing as of 11/09/2022 measuring the total training throughput, at half precision, using a 2P AMD EPYC™ 7763 CPU server tested with 1x AMD Instinct™ MI100 (32GB HBM2e) 300W GPU, SBIOS 2.2, Ubuntu® 20.04.5 LTS, host ROCm™ 5.2.0, guest ROCm 4.2, PyTorch 1.7.0. Server manufacturers may vary configurations, yielding different results. Performance may vary based factors including use of latest drivers and optimizations. -2. MI200D-01 SuperBench v0.6 model training results based on AMD internal testing as of 11/09/2022 measuring the total training throughput, at half precision, using a 2P AMD EPYC™ 7763 CPU server tested with 1x AMD Instinct™ MI210 (64GB HBM2e) 300W GPU, SBIOS 2.2, Ubuntu 20.04.5 LTS, host ROCm 5.3.0, guest ROCm 5.3, PyTorch 1.12. Server manufacturers may vary configurations, yielding different results. Performance may vary based factors including use of latest drivers and optimizations. -3. MI200D-02: SuperBench v0.6 model training results based on AMD internal testing as of 11/09/2022 measuring the total training throughput, at half precision, using a 2P AMD EPYC™️ 7763 CPU server tested with 1x AMD Instinct™️ MI250 (128GB HBM2e) 560W GPU, SBIOS M12, Ubuntu 20.04 LTS, host ROCm 5.3.0, guest ROCm 5.3, PyTorch 1.12. Server manufacturers may vary configurations, yielding different results. Performance may vary based factors including use of latest drivers and optimizations. - diff --git a/_posts/2023-03-15-new-library-updates-in-pytorch-2.0.md b/_posts/2023-03-15-new-library-updates-in-pytorch-2.0.md deleted file mode 100644 index d33d9af343b2..000000000000 --- a/_posts/2023-03-15-new-library-updates-in-pytorch-2.0.md +++ /dev/null @@ -1,212 +0,0 @@ ---- -layout: blog_detail -title: "New Library Updates in PyTorch 2.0" ---- - -## Summary - -We are bringing a number of improvements to the current PyTorch libraries, alongside the [PyTorch 2.0 release](/blog/pytorch-2.0-release/). These updates demonstrate our focus on developing common and extensible APIs across all domains to make it easier for our community to build ecosystem projects on PyTorch. - -Along with 2.0, we are also releasing a series of beta updates to the PyTorch domain libraries, including those that are in-tree, and separate libraries including TorchAudio, TorchVision, and TorchText. An update for TorchX is also being released as it moves to community supported mode. Please find the list of the latest stable versions and updates below. 
- -**Latest Stable Library Versions (Full List)** - - - - - - - - - - - - - - - - -
-| TorchArrow 0.1.0 | TorchRec 0.4.0 | TorchVision 0.15 |
-| --- | --- | --- |
-| TorchAudio 2.0 | TorchServe 0.7.1 | TorchX 0.4.0 |
-| TorchData 0.6.0 | TorchText 0.15.0 | PyTorch on XLA Devices 1.14 |
        - - -*To see [prior versions](https://pytorch.org/docs/stable/index.html) or (unstable) nightlies, click on versions in the top left menu above ‘Search Docs’. - - -## TorchAudio - -### [Beta] Data augmentation operators - -The release adds several data augmentation operators under torchaudio.functional and torchaudio.transforms: -* torchaudio.functional.add_noise -* torchaudio.functional.convolve -* torchaudio.functional.deemphasis -* torchaudio.functional.fftconvolve -* torchaudio.functional.preemphasis -* torchaudio.functional.speed -* torchaudio.transforms.AddNoise -* torchaudio.transforms.Convolve -* torchaudio.transforms.Deemphasis -* torchaudio.transforms.FFTConvolve -* torchaudio.transforms.Preemphasis -* torchaudio.transforms.Speed -* torchaudio.transforms.SpeedPerturbation - -The operators can be used to synthetically diversify training data to improve the generalizability of downstream models. - -For usage details, please refer to the [functional](https://pytorch.org/audio/2.0.0/functional.html) and [transform](https://pytorch.org/audio/2.0.0/transforms.html) documentation and [Audio Data Augmentation](https://pytorch.org/audio/2.0.0/tutorials/audio_data_augmentation_tutorial.html) tutorial. - - -### [Beta] WavLM and XLS-R models - -The release adds two self-supervised learning models for speech and audio. - -* [WavLM](https://ieeexplore.ieee.org/document/9814838) that is robust to noise and reverberation. -* [XLS-R](https://arxiv.org/abs/2111.09296) that is trained on cross-lingual datasets. - -Besides the model architectures, torchaudio also supports corresponding pre-trained pipelines: - -* torchaudio.pipelines.WAVLM_BASE -* torchaudio.pipelines.WAVLM_BASE_PLUS -* torchaudio.pipelines.WAVLM_LARGE -* torchaudio.pipelines.WAV2VEC_XLSR_300M -* torchaudio.pipelines.WAV2VEC_XLSR_1B -* torchaudio.pipelines.WAV2VEC_XLSR_2B - -For usage details, please refer to the [factory function](https://pytorch.org/audio/2.0.0/generated/torchaudio.models.Wav2Vec2Model.html#factory-functions) and [pre-trained pipelines](https://pytorch.org/audio/2.0.0/pipelines.html#id3) documentation. - - -## TorchRL - -The initial release of torchrl includes several features that span across the entire RL domain. TorchRL can already be used in online, offline, multi-agent, multi-task and distributed RL settings, among others. See below: - - -### [Beta] Environment wrappers and transforms - -torchrl.envs includes several wrappers around common environment libraries. This allows users to swap one library with another without effort. These wrappers build an interface between these simulators and torchrl: - -* dm_control: -* Gym -* Brax -* EnvPool -* Jumanji -* Habitat - -It also comes with many commonly used transforms and vectorized environment utilities that allow for a fast execution across simulation libraries. Please refer to the [documentation](https://pytorch.org/rl/reference/envs.html) for more detail. - - -### [Beta] Datacollectors - -Data collection in RL is made easy via the usage of single process or multiprocessed/distributed data collectors that execute the policy in the environment over a desired duration and deliver samples according to the user’s needs. These can be found in torchrl.collectors and are documented [here](https://pytorch.org/rl/reference/collectors.html). 
- - -### [Beta] Objective modules - -Several objective functions are included in torchrl.objectives, among which: - -* A generic PPOLoss class and derived ClipPPOLoss and KLPPOLoss -* SACLoss and DiscreteSACLoss -* DDPGLoss -* DQNLoss -* REDQLoss -* A2CLoss -* TD3Loss -* ReinforceLoss -* Dreamer - -Vectorized value function operators also appear in the library. Check the documentation [here](https://pytorch.org/rl/reference/objectives.html). - - -### [Beta] Models and exploration strategies - -We provide multiple models, modules and exploration strategies. Get a detailed description in [the doc](https://pytorch.org/rl/reference/modules.html). - - -### [Beta] Composable replay buffer - -A composable replay buffer class is provided that can be used to store data in multiple contexts including single and multi-agent, on and off-policy and many more.. Components include: - -* Storages (list, physical or memory-based contiguous storages) -* Samplers (Prioritized, sampler without repetition) -* Writers -* Possibility to add transforms - -Replay buffers and other data utilities are documented [here](https://pytorch.org/rl/reference/data.html). - - -### [Beta] Logging tools and trainer - -We support multiple logging tools including tensorboard, wandb and mlflow. - -We provide a generic Trainer class that allows for easy code recycling and checkpointing. - -These features are documented [here](https://pytorch.org/rl/reference/trainers.html). - - -## TensorDict - -TensorDict is a new data carrier for PyTorch. - - -### [Beta] TensorDict: specialized dictionary for PyTorch - -TensorDict allows you to execute many common operations across batches of tensors carried by a single container. TensorDict supports many shape and device or storage operations, and can readily be used in distributed settings. Check the [documentation](https://pytorch.org/tensordict/) to know more. - - -### [Beta] @tensorclass: a dataclass for PyTorch - -Like TensorDict, [tensorclass](https://pytorch.org/tensordict/reference/prototype.html) provides the opportunity to write dataclasses with built-in torch features such as shape or device operations. - - -### [Beta] tensordict.nn: specialized modules for TensorDict - -The [tensordict.nn module](https://pytorch.org/tensordict/reference/nn.html) provides specialized nn.Module subclasses that make it easy to build arbitrarily complex graphs that can be executed with TensorDict inputs. It is compatible with the latest PyTorch features such as functorch, torch.fx and torch.compile. - - -## TorchRec - - -### [Beta] KeyedJaggedTensor All-to-All Redesign and Input Dist Fusion - -We observed performance regression due to a bottleneck in sparse data distribution for models that have multiple, large KJTs to redistribute. - -To combat this we altered the comms pattern to transport the minimum data required in the initial collective to support the collective calls for the actual KJT tensor data. This data sent in the initial collective, ‘splits’ means more data is transmitted over the comms stream overall, but the CPU is blocked for significantly shorter amounts of time leading to better overall QPS. - -Furthermore, we altered the TorchRec train pipeline to group the initial collective calls for the splits together before launching the more expensive KJT tensor collective calls. This fusion minimizes the CPU blocked time as launching each subsequent input distribution is no longer dependent on the previous input distribution. 
- -With this feature, variable batch sizes are now natively supported across ranks. These features are documented [here](https://github.com/pytorch/torchrec/commit/d0d23bef8aef5a79a1061fbc842c97bb68b91463). - - -## TorchVision - - -### [Beta] Extending TorchVision’s Transforms to Object Detection, Segmentation & Video tasks - -TorchVision is extending its Transforms API! Here is what’s new: - -* You can use them not only for Image Classification but also for Object Detection, Instance & Semantic Segmentation and Video Classification. -* You can use new functional transforms for transforming Videos, Bounding Boxes and Segmentation Masks. - -Learn more about these new transforms [from our docs](https://pytorch.org/vision/stable/auto_examples/), and submit any feedback in our [dedicated issue](https://github.com/pytorch/vision/issues/6753). - - -## TorchText - -### [Beta] Adding scriptable T5 and Flan-T5 to the TorchText library with incremental decoding support! - -TorchText has added the T5 model architecture with pre-trained weights for both the [original T5 paper](https://arxiv.org/abs/1910.10683) and [Flan-T5](https://arxiv.org/abs/2210.11416). The model is fully torchscriptable and features an optimized [multiheaded attention implementation](https://pytorch.org/docs/master/generated/torch.ao.nn.quantizable.MultiheadAttention.html?highlight=multihead#torch.ao.nn.quantizable.MultiheadAttention). We include several examples of how to utilize the model including summarization, classification, and translation. - -For more details, please refer to [our docs](https://pytorch.org/text/stable/models.html). - - -## TorchX - -TorchX is moving to community supported mode. More details will be coming in at a later time. \ No newline at end of file diff --git a/_posts/2023-03-15-pytorch-2.0-release.md b/_posts/2023-03-15-pytorch-2.0-release.md deleted file mode 100644 index bd6cdafc63f1..000000000000 --- a/_posts/2023-03-15-pytorch-2.0-release.md +++ /dev/null @@ -1,528 +0,0 @@ ---- -layout: blog_detail -title: "PyTorch 2.0: Our next generation release that is faster, more Pythonic and Dynamic as ever" ---- - -We are excited to announce the release of [PyTorch® 2.0](https://github.com/pytorch/pytorch/releases/tag/v2.0.0) which we highlighted during the [PyTorch Conference](https://www.youtube.com/@PyTorch/playlists?view=50&sort=dd&shelf_id=2) on 12/2/22! PyTorch 2.0 offers the same eager-mode development and user experience, while fundamentally changing and supercharging how PyTorch operates at compiler level under the hood with faster performance and support for Dynamic Shapes and Distributed. - -This next-generation release includes a Stable version of Accelerated Transformers (formerly called Better Transformers); Beta includes torch.compile as the main API for PyTorch 2.0, the scaled_dot_product_attention function as part of torch.nn.functional, the MPS backend, functorch APIs in the torch.func module; and other Beta/Prototype improvements across various inferences, performance and training optimization features on GPUs and CPUs. For a comprehensive introduction and technical overview of torch.compile, please visit the 2.0 [Get Started page](/get-started/pytorch-2.0). - -Along with 2.0, we are also releasing a series of beta updates to the PyTorch domain libraries, including those that are in-tree, and separate libraries including TorchAudio, TorchVision, and TorchText. An update for TorchX is also being released as it moves to community supported mode. 
More details can be found in this [library blog](/blog/new-library-updates-in-pytorch-2.0/). - -This release is composed of over 4,541 commits and 428 contributors since 1.13.1. We want to sincerely thank our dedicated community for your contributions. As always, we encourage you to try these out and report any issues as we improve 2.0 and the overall 2-series this year. - -Summary: -* torch.compile is the main API for PyTorch 2.0, which wraps your model and returns a compiled model. It is a fully additive (and optional) feature and hence 2.0 is 100% backward compatible by definition. -* As an underpinning technology of torch.compile, TorchInductor with Nvidia and AMD GPUs will rely on OpenAI Triton deep learning compiler to generate performant code and hide low level hardware details. OpenAI Triton-generated kernels achieve performance that's on par with hand-written kernels and specialized cuda libraries such as cublas. -* Accelerated Transformers introduce high-performance support for training and inference using a custom kernel architecture for scaled dot product attention (SPDA). The API is integrated with torch.compile() and model developers may also use the [scaled dot product attention](#beta-scaled-dot-product-attention-20) kernels directly by calling the new scaled_dot_product_attention() operator. -* Metal Performance Shaders (MPS) backend provides GPU accelerated PyTorch training on Mac platforms with added support for Top 60 most used ops, bringing coverage to over 300 operators. -* Amazon AWS optimizes the PyTorch CPU inference on AWS Graviton3 based [C7g instances](https://aws.amazon.com/blogs/aws/new-amazon-ec2-c7g-instances-powered-by-aws-graviton3-processors/). PyTorch 2.0 improves inference performance on Graviton compared to the previous releases, including improvements for Resnet50 and Bert. -* New prototype features and technologies across TensorParallel, DTensor, 2D parallel, TorchDynamo, AOTAutograd, PrimTorch and TorchInductor. - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
| Stable | Beta | Prototype | Performance Improvements |
|---|---|---|---|
| Accelerated PT 2 Transformers | torch.compile | DTensor | CUDA support for 11.7 & 11.8 (deprecating CUDA 11.6) |
| | PyTorch MPS Backend | TensorParallel | Python 3.8 (deprecating Python 3.7) |
| | Scaled dot product attention | 2D Parallel | AWS Graviton3 |
| | functorch | Torch.compile (dynamic=True) | |
| | Dispatchable Collectives | | |
| | Torch.set_default & torch.device | | |
| | X86 quantization backend | | |
| | GNN inference and training performance | | |
        - - -*To see a full list of public 2.0, 1.13 and 1.12 feature submissions click [here](https://docs.google.com/spreadsheets/d/1H3jazwO8BBCwK8JwLNYspLiHfUrzshEtyqjL-X93I9g/edit#gid=790902532). - - -## Stable Features - - -### [Stable] Accelerated PyTorch 2 Transformers - -The PyTorch 2.0 release includes a new high-performance implementation of the PyTorch Transformer API. In releasing Accelerated PT2 Transformers, our goal is to make training and deployment of state-of-the-art Transformer models affordable across the industry. This release introduces high-performance support for training and inference using a custom kernel architecture for scaled dot product attention (SPDA), extending the inference “fastpath” architecture, previously known as "Better Transformer." - -Similar to the “fastpath” architecture, custom kernels are fully integrated into the PyTorch Transformer API – thus, using the native Transformer and MultiHeadAttention API will enable users to: - -* transparently see significant speed improvements; -* support many more use cases including models using Cross-Attention, Transformer Decoders, and for training models; and -* continue to use fastpath inference for fixed and variable sequence length Transformer Encoder and Self Attention use cases. - -To take full advantage of different hardware models and Transformer use cases, multiple SDPA custom kernels are supported (see below), with custom kernel selection logic that will pick the highest-performance kernel for a given model and hardware type. In addition to the existing Transformer API, model developers may also use the [scaled dot product attention](#beta-scaled-dot-product-attention-20) kernels directly by calling the new scaled_dot_product_attention() operator. Accelerated PyTorch 2 Transformers are integrated with torch.compile() . To use your model while benefiting from the additional acceleration of PT2-compilation (for inference or training), pre-process the model with `model = torch.compile(model)`. - -We have achieved major speedups for training transformer models and in particular large language models with Accelerated PyTorch 2 Transformers using a combination of custom kernels and torch.compile(). - -![alt_text](/assets/images/pytorch20post.png "Accelerated PyTorch 2 speed"){:width="100%"} -Figure: Using scaled dot product attention with custom kernels and torch.compile delivers significant speedups for training large language models, such as for [nanoGPT](https://github.com/karpathy/nanoGPT) shown here. - - - -## Beta Features - - -### [Beta] torch.compile - -torch.compile is the main API for PyTorch 2.0, which wraps your model and returns a compiled model. It is a fully additive (and optional) feature and hence 2.0 is 100% backward compatible by definition. - -Underpinning torch.compile are new technologies – TorchDynamo, AOTAutograd, PrimTorch and TorchInductor: -* TorchDynamo captures PyTorch programs safely using Python Frame Evaluation Hooks and is a significant innovation that was a result of 5 years of our R&D into safe graph capture. -* AOTAutograd overloads PyTorch’s autograd engine as a tracing autodiff for generating ahead-of-time backward traces. -* PrimTorch canonicalizes ~2000+ PyTorch operators down to a closed set of ~250 primitive operators that developers can target to build a complete PyTorch backend. This substantially lowers the barrier of writing a PyTorch feature or backend. 
-* TorchInductor is a deep learning compiler that generates fast code for multiple accelerators and backends. For NVIDIA and AMD GPUs, it uses OpenAI Triton as a key building block. For intel CPUs, we generate C++ code using multithreading, vectorized instructions and offloading appropriate operations to mkldnn when possible. - -With all the new technologies, torch.compile is able to work 93% of time across 165 open-source models and runs 20% faster on average at float32 precision and 36% faster on average at AMP precision. - -For more information, please refer to [https://pytorch.org/get-started/pytorch-2.0/](https://pytorch.org/get-started/pytorch-2.0/) and for TorchInductor CPU with Intel [here](https://dev-discuss.pytorch.org/t/torchinductor-update-5-cpu-backend-backend-performance-update-and-deep-dive-on-key-optimizations/1117). - - -### [Beta] PyTorch MPS Backend - -MPS backend provides GPU-accelerated PyTorch training on Mac platforms. This release brings improved correctness, stability, and operator coverage. - -MPS backend now includes support for the Top 60 most used ops, along with the most frequently requested operations by the community, bringing coverage to over 300 operators. The major focus of the release was to enable full OpInfo-based forward and gradient mode testing to address silent correctness issues. These changes have resulted in wider adoption of MPS backend by 3rd party networks such as Stable Diffusion, YoloV5, WhisperAI, along with increased coverage for Torchbench networks and Basic tutorials. We encourage developers to update to the latest macOS release to see the best performance and stability on the MPS backend. - - - -Links - - - -1. [MPS Backend](https://pytorch.org/docs/stable/notes/mps.html) -2. [Developer information](https://github.com/pytorch/pytorch/wiki/MPS-Backend) -3. [Accelerated PyTorch training on Mac](https://developer.apple.com/metal/pytorch/) -4. [Metal](https://developer.apple.com/documentation/metal?language=objc), [Metal Performance Shaders](https://developer.apple.com/documentation/metalperformanceshaders?language=objc) & [Metal Performance Shaders Graph](https://developer.apple.com/documentation/metalperformanceshadersgraph?language=objc) - - -### [Beta] Scaled dot product attention 2.0 - -We are thrilled to announce the release of PyTorch 2.0, which introduces a powerful scaled dot product attention function as part of torch.nn.functional. This function includes multiple implementations that can be seamlessly applied depending on the input and hardware in use. - -In previous versions of PyTorch, you had to rely on third-party implementations and install separate packages to take advantage of memory-optimized algorithms like [FlashAttention](https://github.com/HazyResearch/flash-attention). With PyTorch 2.0, all these implementations are readily available by default. - -These implementations include [FlashAttention](https://arxiv.org/abs/2205.14135) from HazyResearch, Memory-Efficient Attention from the [xFormers](https://github.com/facebookresearch/xformers) project, and a native C++ implementation that is ideal for non-CUDA devices or when high-precision is required. - -PyTorch 2.0 will automatically select the optimal implementation for your use case, but you can also toggle them individually for finer-grained control. Additionally, the scaled dot product attention function can be used to build common transformer architecture components. 
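As a quick illustration (not taken from the post itself), calling the operator directly looks like this; the shapes are arbitrary, and the kernel that actually runs depends on the device, dtype, and inputs:

```
import torch
import torch.nn.functional as F

# (batch, num_heads, seq_len, head_dim) layout expected by the operator.
query = torch.randn(2, 8, 128, 64, device="cuda", dtype=torch.float16)
key = torch.randn(2, 8, 128, 64, device="cuda", dtype=torch.float16)
value = torch.randn(2, 8, 128, 64, device="cuda", dtype=torch.float16)

# PyTorch dispatches to FlashAttention, memory-efficient attention, or the
# C++ math fallback depending on what the inputs and hardware support.
out = F.scaled_dot_product_attention(query, key, value, is_causal=True)
```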
Learn more with the [documentation](https://pytorch.org/docs/master/generated/torch.nn.functional.scaled_dot_product_attention.html?highlight=scaled_dot_product#torch.nn.functional.scaled_dot_product_attention) and this [tutorial](https://pytorch.org/tutorials/intermediate/scaled_dot_product_attention_tutorial.html).


### [Beta] functorch -> torch.func

Inspired by [Google JAX](https://github.com/google/jax), functorch is a library that offers composable vmap (vectorization) and autodiff transforms. It enables advanced autodiff use cases that would otherwise be tricky to express in PyTorch. Examples include:
* [model ensembling](https://pytorch.org/tutorials/intermediate/ensembling.html)
* [efficiently computing jacobians and hessians](https://pytorch.org/tutorials/intermediate/jacobians_hessians.html)
* [computing per-sample-gradients (or other per-sample quantities)](https://pytorch.org/tutorials/intermediate/per_sample_grads.html)

We’re excited to announce that, as the final step of upstreaming and integrating functorch into PyTorch, the functorch APIs are now available in the torch.func module. Our function transform APIs are identical to before, but we have changed how the interaction with NN modules works. Please see the [docs](https://pytorch.org/docs/master/func.html) and the [migration guide](https://pytorch.org/docs/master/func.migrating.html) for more details.

Furthermore, we have [added support for torch.autograd.Function](https://pytorch.org/docs/master/notes/extending.func.html): one is now able to apply function transformations (e.g. vmap, grad, jvp) over torch.autograd.Function.


### [Beta] Dispatchable Collectives

Dispatchable collectives are an improvement to the existing init_process_group() API that makes the backend an optional argument. For users, the main advantage is that they can write code that runs on both GPU and CPU machines without changing the backend specification. Dispatchability also makes it easier to support both GPU and CPU collectives, since the backend no longer needs to be specified manually (e.g. "NCCL" or "GLOO"). Existing backend specifications by users will be honored and will not require change.

Usage example:
```
import torch.distributed as dist
…
# old
dist.init_process_group(backend="nccl", ...)
dist.all_reduce(...) # with CUDA tensors works
dist.all_reduce(...) # with CPU tensors does not work

# new
dist.init_process_group(...) # backend is optional
dist.all_reduce(...) # with CUDA tensors works
dist.all_reduce(...) # with CPU tensors works
```

Learn more [here](https://pytorch.org/docs/master/distributed.html#torch.distributed.init_process_group).


### [Beta] torch.set_default_device and torch.device as context manager

torch.set_default_device allows users to change the default device that factory functions in PyTorch allocate on. For example, if you call torch.set_default_device('cuda'), a call to torch.empty(2) will allocate on CUDA (rather than on CPU). You can also use torch.device as a context manager to change the default device on a local basis. This resolves a long-standing feature request, dating back to PyTorch’s initial release, for a way to do this.

Learn more [here](https://pytorch.org/tutorials/recipes/recipes/changing_default_device.html).
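For example, a minimal sketch of the two usage patterns described above (assuming a CUDA device is available):

```
import torch

torch.set_default_device("cuda")  # factory functions now allocate on CUDA
x = torch.empty(2)
print(x.device)                   # cuda:0

# Temporarily switch the default device for a local block of code.
with torch.device("cpu"):
    y = torch.ones(3)
print(y.device)                   # cpu
```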
### [Beta] "X86" as the new default quantization backend for x86 CPU

The new X86 quantization backend, which utilizes FBGEMM and oneDNN kernel libraries, replaces FBGEMM as the default quantization backend for x86 CPU platforms. By leveraging the strengths of both libraries, it offers improved int8 inference performance compared to the original FBGEMM backend, with a 1.3X – 2X inference performance speedup measured on 40+ deep learning models. The new backend is functionally compatible with the original FBGEMM backend.

**Table: Geomean Speedup of X86 Quantization Backend vs. FBGEMM Backend**
| | 1 core/instance | 2 cores/instance | 4 cores/instance | 1 socket (32 cores)/instance |
|---|---|---|---|---|
| Intel(R) Xeon(R) Platinum 8358 CPU @ 2.60GHz | 1.76X | 1.80X | 2.04X | 1.34X |
By default, users on x86 platforms will utilize the x86 quantization backend and their PyTorch programs will remain unchanged when using the default backend. Alternatively, users have the option to specify "X86" as the quantization backend explicitly. Example code is shown below:

```
import torch
from torch.ao.quantization import get_default_qconfig_mapping
from torch.quantization.quantize_fx import prepare_fx, convert_fx

# get default configuration
qconfig_mapping = get_default_qconfig_mapping()

# or explicitly specify the backend
# qengine = 'x86'
# torch.backends.quantized.engine = qengine
# qconfig_mapping = get_default_qconfig_mapping(qengine)

# construct fp32 model
model_fp32 = ...

# prepare
prepared_model = prepare_fx(model_fp32, qconfig_mapping, example_inputs=x)

# calibrate
...

# convert
quantized_model = convert_fx(prepared_model)
```

Find more information: [https://github.com/pytorch/pytorch/issues/83888](https://github.com/pytorch/pytorch/issues/83888) and [https://www.intel.com/content/www/us/en/developer/articles/technical/accelerate-pytorch-int8-inf-with-new-x86-backend.html](https://www.intel.com/content/www/us/en/developer/articles/technical/accelerate-pytorch-int8-inf-with-new-x86-backend.html).


### [Beta] GNN inference and training optimization on CPU

PyTorch 2.0 includes several critical optimizations to improve GNN inference and training performance on CPU. Before 2.0, GNN models in PyG suffered from low efficiency on CPU because several critical kernels (scatter/gather, etc.) lacked performance tuning and GNN-related sparse matrix multiplication ops were missing. Specifically, the optimizations include:
* scatter_reduce: performance hotspot in Message Passing when the edge index is stored in Coordinate format (COO).
* gather: backward of scatter_reduce, specially tuned for the GNN compute when the index is an expanded tensor.
* torch.sparse.mm with reduce flag: performance hotspot in Message Passing when the edge index is stored in Compressed Sparse Row (CSR) format. Supported reduce flags: sum, mean, amax, amin.

On PyG benchmarks/examples and OGB benchmarks, a 1.12x - 4.07x performance speedup is measured (1.13.1 compared with 2.0) for single-node inference and training.
| Model-Dataset | Option | Speedup Ratio |
|---|---|---|
| GCN-Reddit (inference) | 512-2-64-dense | 1.22x |
| | 1024-3-128-dense | 1.25x |
| | 512-2-64-sparse | 1.31x |
| | 1024-3-128-sparse | 1.68x |
| GraphSage-ogbn-products (inference) | 512-2-64-dense | 1.22x |
| | 1024-3-128-dense | 1.15x |
| | 512-2-64-sparse | 1.20x |
| | 1024-3-128-sparse | 1.33x |
| | full-batch-sparse | 4.07x |
| GCN-PROTEINS (training) | 3-32 | 1.67x |
| GCN-REDDIT-BINARY (training) | 3-32 | 1.67x |
| GCN-Reddit (training) | 512-2-64-dense | 1.20x |
| | 1024-3-128-dense | 1.12x |
        - - -Learn more: [PyG CPU Performance Optimization](https://www.pyg.org/ns-newsarticle-accelerating-pyg-on-intel-cpus). - - -### [Beta] Accelerating inference on CPU with PyTorch by leveraging oneDNN Graph - -[oneDNN Graph API](https://spec.oneapi.io/onednn-graph/latest/introduction.html) extends [oneDNN](https://spec.oneapi.io/versions/latest/elements/oneDNN/source/index.html) with a flexible graph API to maximize the optimization opportunity for generating efficient code on AI hardware. -* It automatically identifies the graph partitions to be accelerated via fusion. -* The [fusion patterns](https://github.com/oneapi-src/oneDNN/blob/dev-graph/doc/programming_model/ops_and_patterns.md#fusion-patterns) focus on fusing compute-intensive operations such as convolution, matmul and their neighbor operations for both inference and training use cases. -* Although work is ongoing to integrate oneDNN Graph with TorchDynamo as well, its integration with the PyTorch JIT Fuser attained beta status in PyTorch 2.0 for [Float32](https://github.com/pytorch/pytorch/tree/master/torch/csrc/jit/codegen/onednn#example-with-float) & [BFloat16](https://github.com/pytorch/pytorch/tree/master/torch/csrc/jit/codegen/onednn#example-with-bfloat16) inference (on machines that support AVX512_BF16 ISA). - - -From a developer’s/researcher’s perspective, the usage is quite simple & intuitive, with the only change in code being an API invocation: -* Leverage oneDNN Graph, with [JIT-tracing](https://pytorch.org/docs/stable/generated/torch.jit.trace.html), a model is profiled with an example input. -* The context manager _with torch.jit.fuser(“fuser3”):_ can also be used instead of invoking _torch.jit.enable_onednn_fusion(True)_. -* For accelerating [BFloat16 inference](https://github.com/pytorch/pytorch/tree/master/torch/csrc/jit/codegen/onednn#example-with-bfloat16), we rely on eager-mode AMP (Automatic Mixed Precision) support in PyTorch & disable JIT mode’s AMP, as both of them are currently divergent: - -``` -# Assuming we have a model of the name 'model' - -example_input = torch.rand(1, 3, 224, 224) - -# enable oneDNN Graph -torch.jit.enable_onednn_fusion(True) -# Disable AMP for JIT -torch._C._jit_set_autocast_mode(False) -with torch.no_grad(), torch.cpu.amp.autocast(): - model = torch.jit.trace(model, (example_input)) - model = torch.jit.freeze(model) - # 2 warm-ups (2 for tracing/scripting with an example, 3 without an example) - model(example_input) - model(example_input) - - # speedup would be observed in subsequent runs. - model(example_input) -``` - - -Learn more [here](https://pytorch.org/tutorials/recipes/recipes/tuning_guide.html#use-onednn-graph-with-torchscript-for-inference). - - -## Prototype Features - -### Distributed API - -#### [Prototype] DTensor - -PyTorch [DistributedTensor](https://github.com/pytorch/pytorch/blob/master/torch/distributed/_tensor/README.md) (DTensor) is a prototyping effort with distributed tensor primitives to allow easier distributed computation authoring in the SPMD (Single Program Multiple Devices) paradigm. The primitives are simple but powerful when used to express tensor distributions with both sharded and replicated parallelism strategies. PyTorch DTensor empowered PyTorch [Tensor Parallelism](https://pytorch.org/docs/master/distributed.tensor.parallel.html) along with other advanced parallelism explorations. 
In addition, it also offers a uniform way to save/load state_dict for distributed checkpointing purposes, even when there’re complex tensor distribution strategies such as combining tensor parallelism with parameter sharding in FSDP. More details can be found in this [RFC](https://github.com/pytorch/pytorch/issues/88838) and the [DTensor examples notebook](https://colab.research.google.com/drive/12Pl5fvh0eLPUrcVO7s6yY4n2_RZo8pLR#scrollTo=stYPKb9Beq4e). - - -#### [Prototype] TensorParallel - -We now support DTensor based Tensor Parallel which users can distribute their model parameters across different GPU devices. We also support Pairwise Parallel which shards two concatenated linear layers in a col-wise and row-wise style separately so that only one collective(all-reduce/reduce-scatter) is needed in the end. - - -#### [Prototype] 2D Parallel - -We implemented the integration of the aforementioned TP with FullyShardedDataParallel(FSDP) as 2D parallel to further scale large model training. More details can be found in this [slide](https://docs.google.com/presentation/d/17g6WqrO00rP3MsxbRENsPpjrlSkwiA_QB4r93_eB5is/edit?usp=sharing). - - -#### [Prototype] torch.compile(dynamic=True) - -Experimental support for PT2 compilation with dynamic shapes is available in this release. Inference compilation with inductor for simple models is supported, but there are a lot of limitations: - -* Training available in a future release (This is partially fixed in nightlies!) -* Minifier available in a future release. -* It is easy to end up in a situation where the dimension you wanted to be dynamic gets specialized anyway. Some of these issues are fixed in nightlies, others are not. -* We do not appropriately propagate Inductor guards to the top-level, this is tracked at [#96296](https://github.com/pytorch/pytorch/issues/96296). -* Data-dependent operations like nonzero still require a graph break. -* Dynamic does not work with non-standard modes like reduce-overhead or max-autotune. -* There are many bugs in Inductor compilation. To track known bugs, check the [dynamic shapes](https://github.com/pytorch/pytorch/issues?q=is%3Aopen+is%3Aissue+label%3A%22module%3A+dynamic+shapes%22) label on the PyTorch issue tracker. - -For the latest and greatest news about dynamic shapes support on master, check out [our status reports](https://dev-discuss.pytorch.org/t/state-of-symbolic-shapes-branch/777/43). - - -## Highlights/Performance Improvements - - -### [Deprecation of Cuda 11.6 and Python 3.7 support](https://pytorch.org/blog/deprecation-cuda-python-support/) for PyTorch 2.0 - -If you are still using or depending on CUDA 11.6 or Python 3.7 builds, we strongly recommend moving to at least CUDA 11.7 and Python 3.8, as it would be the minimum versions required for PyTorch 2.0. For more detail, please refer to the [Release Compatibility Matrix for PyTorch](https://github.com/pytorch/pytorch/blob/master/RELEASE.md#release-compatibility-matrix) releases. - - -### Python 3.11 support on Anaconda Platform - -Due to lack of Python 3.11 support for packages that PyTorch depends on, including NumPy, SciPy, SymPy, Pillow and others on the Anaconda platform. We will not be releasing Conda binaries compiled with Python 3.11 for PyTorch Release 2.0. The Pip packages with Python 3.11 support will be released, hence if you intend to use PyTorch 2.0 with Python 3.11 please use our Pip packages. Please note: Conda packages with Python 3.11 support will be made available on our nightly channel. 
Also we are planning on releasing Conda Python 3.11 binaries as part of future release once Anaconda provides these key dependencies. More information and instructions on how to download the Pip packages can be found [here](https://dev-discuss.pytorch.org/t/pytorch-2-0-message-concerning-python-3-11-support-on-anaconda-platform/1087). - - -### Optimized PyTorch Inference with AWS Graviton processors - -The optimizations focused on three key areas: GEMM kernels, bfloat16 support, primitive caching and the memory allocator. For aarch64 platforms, PyTorch supports Arm Compute Library (ACL) GEMM kernels via Mkldnn(OneDNN) backend. The ACL library provides Neon/SVE GEMM kernels for fp32 and bfloat16 formats. The bfloat16 support on c7g allows efficient deployment of bfloat16 trained, AMP (Automatic Mixed Precision) trained, or even the standard fp32 trained models. The standard fp32 models leverage bfloat16 kernels via OneDNN fast math mode, without any model quantization. Next we implemented primitive caching for conv, matmul and inner product operators. More information on the updated PyTorch user guide with the upcoming 2.0 release improvements and TorchBench benchmark details can be found [here](https://github.com/aws/aws-graviton-getting-started). diff --git a/_posts/2023-03-16-accelerated-diffusers-pt-20.md b/_posts/2023-03-16-accelerated-diffusers-pt-20.md deleted file mode 100644 index 3bca71f4a9bf..000000000000 --- a/_posts/2023-03-16-accelerated-diffusers-pt-20.md +++ /dev/null @@ -1,110 +0,0 @@ ---- -layout: blog_detail -title: "Accelerated Diffusers with PyTorch 2.0" -author: Pedro Cuenca, Patrick von Platen, Suraj Patil, Sayak Paul ---- - -PyTorch 2.0 has just been released. Its flagship new feature is `torch.compile()`, a one-line code change that promises to automatically improve performance across codebases. We have previously [checked on that promise in Hugging Face Transformers and TIMM models](https://pytorch.org/blog/Accelerating-Hugging-Face-and-TIMM-models/), and delved deep into its [motivation, architecture and the road ahead](https://pytorch.org/get-started/pytorch-2.0/). - -As important as `torch.compile()` is, there’s much more to PyTorch 2.0. Notably, PyTorch 2.0 incorporates several strategies to accelerate transformer blocks, and these improvements are very relevant for diffusion models too. Techniques such as [FlashAttention](https://arxiv.org/abs/2205.14135), for example, have become very popular in the diffusion community thanks to their ability to significantly speed up Stable Diffusion and achieve larger batch sizes, and they are now part of PyTorch 2.0. - -In this post we discuss how attention layers are optimized in PyTorch 2.0 and how these optimization are applied to the popular [🧨 Diffusers library](https://github.com/huggingface/diffusers). We finish with a benchmark that shows how the use of PyTorch 2.0 and Diffusers immediately translates to significant performance improvements across different hardware. - -Update (June 2023): [a new section has been added](#compile-fixing-graph-breaks) to show dramatic performance improvements of `torch.compile()` with the latest version of PyTorch (2.0.1), after going through the process of fixing graph breaks in the diffusers codebase. A more detailed analysis of how to find and fix graph breaks will be published in a separate post. - - -## Accelerating transformer blocks - -PyTorch 2.0 includes a _scaled dot-product attention_ function as part of `torch.nn.functional`. 
This function encompasses several implementations that can be applied depending on the inputs and the hardware in use. Before PyTorch 2.0, you had to search for third-party implementations and install separate packages in order to take advantage of memory optimized algorithms, such as FlashAttention. The available implementations are: -* FlashAttention, from the official [FlashAttention project](https://github.com/HazyResearch/flash-attention). -* Memory-Efficient Attention, from the [xFormers project](https://github.com/facebookresearch/xformers). -* A native C++ implementation suitable for non-CUDA devices or when high-precision is required. - -All these methods are available by default, and PyTorch will try to select the optimal one automatically through the use of the new scaled dot-product attention (SDPA) API. You can also individually toggle them for finer-grained control, see [the documentation](https://pytorch.org/docs/master/generated/torch.nn.functional.scaled_dot_product_attention) for details. - - -## Using scaled dot-product attention in diffusers - -The incorporation of Accelerated PyTorch 2.0 Transformer attention to the Diffusers library was achieved through the use of the [`set_attn_processor` method](https://huggingface.co/docs/diffusers/v0.13.0/en/api/models#diffusers.UNet2DConditionModel.set_attn_processor), which allows for pluggable attention modules to be configured. In this case, a [new attention processor was created](https://github.com/huggingface/diffusers/blob/856dad57/src/diffusers/models/cross_attention.py#L469), which is [enabled by default when PyTorch 2.0 is available](https://github.com/huggingface/diffusers/blob/856dad57bb7a9ee13af4a08492e524b0a145a2c5/src/diffusers/models/cross_attention.py#L105). For clarity, this is how you could enable it manually (but it’s usually not necessary since diffusers will automatically take care of it): - -``` -from diffusers import StableDiffusionPipeline -from diffusers.models.cross_attention import AttnProcessor2_0 - -pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5") -pipe.to("cuda") -pipe.unet.set_attn_processor(AttnProcessor2_0()) - -prompt = "a photo of an astronaut riding a horse on mars" -image = pipe(prompt).images[0] -``` - -## Stable Diffusion Benchmark - -We ran a number of tests using accelerated dot-product attention from PyTorch 2.0 in Diffusers. We installed diffusers from pip and used nightly versions of PyTorch 2.0, since our tests were performed before the official release. We also used `torch.set_float32_matmul_precision('high')` to enable additional fast matrix multiplication algorithms. - -We compared results with the traditional attention implementation in `diffusers` (referred to as `vanilla` below) as well as with the best-performing solution in pre-2.0 PyTorch: PyTorch 1.13.1 with the xFormers package (v0.0.16) installed. - -Results were measured without compilation (i.e., no code changes at all), and also with a single call to `torch.compile()` to wrap the UNet module. We did not compile the image decoder because most of the time is spent in the 50 denoising iterations that run UNet evaluations. - - -### Results in float32 - -![Diffusers Speedup vs xFormers float32](/assets/images/3-16-accelerated-d/fig1-latest.png){:width="100%"} - -The following figures explore performance improvement vs batch size for various representative GPUs belonging to different generations. We collected data for each combination until we reached maximum memory utilization. 
Vanilla attention runs out of memory earlier than xFormers or PyTorch 2.0, which explains the missing bars for larger batch sizes. Similarly, A100 (we used the 40 GB version) is capable of running batch sizes of 64, but the other GPUs could only reach 32 in our tests. - -![Diffusers Inference Speedup vs Vanilla and xFormers Attention (A100, float32)](/assets/images/3-16-accelerated-d/fig2-latest.png){:width="100%"} - -![Diffusers Inference Speedup vs Vanilla and xFormers Attention (3090, float32)](/assets/images/3-16-accelerated-d/fig3-latest.png){:width="100%"} - -![Diffusers Inference Speedup vs Vanilla and xFormers Attention (4090, float32)](/assets/images/3-16-accelerated-d/fig4-latest.png){:width="100%"} - -![Diffusers Inference Speedup vs Vanilla and xFormers Attention (V100, float32)](/assets/images/3-16-accelerated-d/fig5-latest.png){:width="100%"} - - -We found very significant performance improvements over vanilla attention across the board, without even using `torch.compile()`. An out of the box installation of PyTorch 2.0 and diffusers yields about 50% speedup on A100 and between 35% and 50% on 4090 GPUs, depending on batch size. Performance improvements are more pronounced for modern CUDA architectures such as Ada (4090) or Ampere (A100), but they are still very significant for older architectures still heavily in use in cloud services. - -In addition to faster speeds, the accelerated transformers implementation in PyTorch 2.0 allows much larger batch sizes to be used. A single 40GB A100 GPU runs out of memory with a batch size of 10, and 24 GB high-end consumer cards such as 3090 and 4090 cannot generate 8 images at once. Using PyTorch 2.0 and diffusers we could achieve batch sizes of **48** for 3090 and 4090, and **64** for A100. This is of great significance for cloud services and applications, as they can efficiently process more images at a time. - -When compared with PyTorch 1.13.1 + xFormers, the new accelerated transformers implementation is still faster and requires no additional packages or dependencies. In this case we found moderate speedups of up to 2% on datacenter cards such as A100 or T4, but performance was great on the two last generations of consumer cards: up to 20% speed improvement on 3090 and between 10% and 45% on 4090, depending on batch size. - -When `torch.compile()` is used, we get an additional performance boost of (typically) 2% and 3% over the previous improvements. As compilation takes some time, this is better geared towards user-facing inference services or training. **Update**: improvements achieved by `torch.compile()` are much larger when graph breaks are minimized, [see the new section for details](#compile-fixing-graph-breaks). 
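For reference, a rough sketch of the compiled setup described above might look like the following; the pipeline id matches the earlier snippet, but the exact benchmark harness is not reproduced here:

```
import torch
from diffusers import StableDiffusionPipeline

# As mentioned above, enable faster float32 matmul algorithms.
torch.set_float32_matmul_precision("high")

pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")
pipe.to("cuda")

# Wrap only the UNet: the 50 denoising iterations dominate inference time,
# so compiling the image decoder brings little additional benefit.
pipe.unet = torch.compile(pipe.unet)

image = pipe("a photo of an astronaut riding a horse on mars").images[0]
```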
- - -### Results in float16 - -![Diffusers Speedup vs xFormers float16](/assets/images/3-16-accelerated-d/fig6-latest.png){:width="100%"} - -![Diffusers Inference Speedup vs Vanilla and xFormers Attention (A100, float16)](/assets/images/3-16-accelerated-d/fig7-latest.png){:width="100%"} - -![Diffusers Inference Speedup vs Vanilla and xFormers Attention (4090, float16)](/assets/images/3-16-accelerated-d/fig8-latest.png){:width="100%"} - -![Diffusers Inference Speedup vs Vanilla and xFormers Attention (3090, float16)](/assets/images/3-16-accelerated-d/fig9-latest.png){:width="100%"} - -When we consider `float16` inference, the performance improvements of the accelerated transformers implementation in PyTorch 2.0 are between 20% and 28% over standard attention, across all the GPUs we tested, except for the 4090, which belongs to the more modern Ada architecture. This GPU benefits from a dramatic performance improvement when using PyTorch 2.0 nightlies. With respect to optimized SDPA vs xFormers, results are usually on par for most GPUs, except again for the 4090. Adding `torch.compile()` to the mix boosts performance a few more percentage points across the board. - - -## Performance of `torch.compile()` after minimizing graph breaks - -In the previous sections we saw that using the accelerated transformers implementation of PyTorch 2.0 provides important performance improvements with respect to earlier versions of PyTorch (with or without xFormers). However, `torch.compile()` only contributed modest marginal improvements. With the help of the PyTorch team we discovered that the reason for those moderate improvements was that some operations in the diffusers source code were causing graph breaks, which prevented `torch.compile()` from taking full advantage of graph optimizations. - -After fixing the graph breaks (see [these](https://github.com/huggingface/diffusers/pull/3286) [PRs](https://github.com/huggingface/diffusers/pull/3313) for details), we measured the additional improvement of `torch.compile()` vs the uncompiled version of PyTorch 2, and we saw very important incremental performance gains. The following chart was obtained using a nightly version of PyTorch 2 downloaded on May 1st, 2023, and it shows improvements in the range of ~13% to 22% for most workloads. The performance gains get better for modern GPU families, achieving more than 30% for A100. There are also two outliers in the chart. First, we see a performance decrease on T4 for a batch size of 16, which imposes a huge memory pressure on that card. At the opposite end of the spectrum, we see a performance increase on A100 of more than 100% when using a batch size of only 1, which is interesting but not representative of real-world use of a gpu with such large amount of RAM – larger batch sizes capable of serving multiple customers will usually be more interesting for service deployment on A100. - -![Diffusers Speedup using torch.compile() in float16](/assets/images/3-16-accelerated-d/fig10-latest.png){:width="100%"} - -To stress it again, these performance gains are _additional_ to the ones achieved by migrating to PyTorch 2 and using the accelerated transformers scaled dot-product attention implementation. We recommend using `torch.compile()` when deploying diffusers in production. - - -## Conclusions - -PyTorch 2.0 comes with multiple features to optimize the crucial components of the foundational transformer block, and they can be further improved with the use of `torch.compile`. 
These optimizations lead to significant memory and time improvements for diffusion models, and remove the need for third-party library installations. - -To take advantage of these speed and memory improvements all you have to do is upgrade to PyTorch 2.0 and use diffusers >= 0.13.0. - -For more examples and in-detail benchmark numbers, please also have a look at the [Diffusers with PyTorch 2.0](https://huggingface.co/docs/diffusers/v0.13.0/en/optimization/torch2.0) docs. - - -## Acknowledgement - -The authors are grateful to the PyTorch team for creating such excellent software. \ No newline at end of file diff --git a/_posts/2023-03-22-pytorch-2.0-xla.md b/_posts/2023-03-22-pytorch-2.0-xla.md deleted file mode 100644 index 2d8e680c7b41..000000000000 --- a/_posts/2023-03-22-pytorch-2.0-xla.md +++ /dev/null @@ -1,224 +0,0 @@ ---- -layout: blog_detail -title: "PyTorch 2.0 & XLA—The Latest Cutting Edge Features" -author: Jack Cao, Milad Mohammadi, Alex Wertheim, Yeounoh Chung, Joe Spisak, Will Cromar, Shauheen Zahirazami ---- - -Today, we are excited to share our latest work for [PyTorch/XLA 2.0](https://github.com/pytorch/xla/releases/tag/v2.0.0). The release of [PyTorch 2.0](https://pytorch.org/get-started/pytorch-2.0/) is yet another major milestone for this storied community and we are excited to continue to be part of it. When the [PyTorch/XLA](https://github.com/pytorch/xla) project started in 2018 between Google and Meta, the focus was on bringing cutting edge Cloud TPUs to help support the PyTorch community. Along the way, others in the community such as Amazon joined the project and very quickly the community expanded. We are excited about XLA's [direction](https://opensource.googleblog.com/2023/03/openxla-is-ready-to-accelerate-and-simplify-ml-development.html) and the benefits this project continues to bring to the PyTorch community. In this blog we’d like to showcase some key features that have been in development, show code snippets, and illustrate the benefit through some benchmarks. - - -## TorchDynamo / torch.compile (Experimental) - -[TorchDynamo](https://github.com/pytorch/torchdynamo) (Dynamo) is a Python-level JIT compiler designed to make unmodified PyTorch programs faster. It provides a clean API for compiler backends to hook in; its biggest feature is to dynamically modify Python bytecode just before execution. In the PyTorch/XLA 2.0 release, an experimental backend for Dynamo is provided for both inference and training. - -Dynamo provides a [Torch FX](https://pytorch.org/docs/stable/fx.html) (FX) graph when it recognizes a model pattern and PyTorch/XLA uses a Lazy Tensor approach to compile the FX graph and return the compiled function. To get more insight regarding the technical details about PyTorch/XLA’s dynamo implementation, check out [this](https://dev-discuss.pytorch.org/t/torchdynamo-update-10-integrating-with-pytorch-xla-for-inference-and-training/935) dev-discuss post and [dynamo doc](https://github.com/pytorch/xla/blob/r2.0/docs/dynamo.md). 
- -Here is a small code example of running ResNet18 with `torch.compile`: - -``` -import torch -import torchvision -import torch_xla.core.xla_model as xm - -def eval_model(loader): - device = xm.xla_device() - xla_resnet18 = torchvision.models.resnet18().to(device) - xla_resnet18.eval() - dynamo_resnet18 = torch.compile( - xla_resnet18, backend='torchxla_trace_once') - for data, _ in loader: - output = dynamo_resnet18(data) -``` - -With `torch.compile` PyTorch/XLA only traces the ResNet18 model once during the init time and executes the compiled binary everytime `dynamo_resnet18` is invoked, instead of tracing the model every step. To illustrate the benefits of Dynamo+XLA, below is an inference speedup analysis to compare Dynamo and LazyTensor (without Dynamo) using TorchBench on a Cloud TPU v4-8 where the y-axis is the speedup multiplier. - - -![Inference Speedup - PyTorch/XLA Dynamo on TPU](/assets/images/2023-03-22-inferencespeedup.svg){:width="100%"} - - -Dynamo for training is in the development stage with its implementation being at an earlier stage than inference. Developers are welcome to test this early feature, however, in the 2.0 release, PyTorch/XLA supports the forward and backward pass graphs and not the optimizer graph; the optimizer graph is available in the nightly builds and will land in the PyTorch/XLA 2.1 release. Below is an example of what training looks like using the ResNet18 example with `torch.compile`: - -``` -import torch -import torchvision -import torch_xla.core.xla_model as xm - -def train_model(model, data, target): - loss_fn = torch.nn.CrossEntropyLoss() - pred = model(data) - loss = loss_fn(pred, target) - loss.backward() - return pred - -def train_model_main(loader): - device = xm.xla_device() - xla_resnet18 = torchvision.models.resnet18().to(device) - xla_resnet18.train() - dynamo_train_model = torch.compile( - train_model, backend='aot_torchxla_trace_once') - for data, target in loader: - output = dynamo_train_model(xla_resnet18, data, target) -``` - -Note that the backend for training is `aot_torchxla_trace_once` (API will be updated for stable release) whereas the inference backend is `torchxla_trace_once` (name subject to change). We expect to extract and execute 3 graphs per training step instead of 1 training step if you use the Lazy tensor. Below is a training speedup analysis to compare Dynamo and Lazy using the TorchBench on Cloud TPU v4-8. - - -![Training Speedup - PyTorch/XLA Dynamo on TPU](/assets/images/2023-03-22-trainingspeedup.svg){:width="100%"} - - - -## PJRT Runtime (Beta) - -PyTorch/XLA is migrating from XRT to the new PJRT runtime. PJRT is a better-maintained stack, with demonstrated performance advantages, including, on average, a 35% performance for training on TorchBench 2.0 models. It also supports a richer set of features enabling technologies like SPMD. In the PyTorch/XLA 2.0 release, PJRT is the default runtime for TPU and CPU; GPU support is in experimental state. The PJRT features included in the PyTorch/XLA 2.0 release are: - -* TPU runtime implementation in `libtpu` using the [PJRT Plugin API](https://github.com/openxla/community/blob/main/rfcs/20230123-pjrt-plugin.md#rfc-openxla-pjrt-plugin) improves performance by up to 30% -* `torch.distributed` support for TPU v2 and v3, including `pjrt://` `init_method` (Experimental) -* Single-host GPU support. Multi-host support coming soon. 
(Experimental) - -Switching to PJRT requires no change (or minimal change for GPUs) to user code (see [pjrt.md](https://github.com/pytorch/xla/blob/master/docs/pjrt.md) for more details). Runtime configuration is as simple as setting the `PJRT_DEVICE` environment variable to the local device type (i.e. `TPU`, `GPU`, `CPU`). Below are examples of using PJRT runtimes on different devices. - -``` -# TPU Device -PJRT_DEVICE=TPU python3 xla/test/test_train_mp_imagenet.py --fake_data --batch_size=256 --num_epochs=1 -``` - -``` -# TPU Pod Device -gcloud alpha compute tpus tpu-vm ssh $USER-pjrt --zone=us-central2-b --project=$PROJECT --worker=all --command="git clone --depth=1 --branch r2.0 https://github.com/pytorch/xla.git" - -gcloud alpha compute tpus tpu-vm ssh $USER-pjrt --zone=us-central2-b --project=$PROJECT --worker=all --command="PJRT_DEVICE=TPU python3 xla/test/test_train_mp_imagenet.py --fake_data --batch_size=256 --num_epochs=1" -``` - -``` -# GPU Device (Experimental) -PJRT_DEVICE=GPU GPU_NUM_DEVICES=4 python3 xla/test/test_train_mp_imagenet.py --fake_data --batch_size=128 --num_epochs=1 -``` - -Below is a performance comparison between XRT and PJRT by task on TorchBench 2.0 on v4-8 TPU. To learn more about PJRT vs. XRT please review the [documentation](https://github.com/pytorch/xla/blob/r2.0/docs/pjrt.md#tpu). - - -![TorchBench Training Time](/assets/images/2023-03-22-torchbenchtraining.svg){:width="100%"} - - - -## Parallelization - - -### GSPMD (Experimental) - -We are delighted to introduce General and Scalable Parallelization for ML Computation Graphs ([GSPMD](https://arxiv.org/abs/2105.04663)) in PyTorch as a new experimental data & model sharding solution. [GSPMD](https://arxiv.org/abs/2105.04663) provides automatic parallelization for common ML workloads, allowing developers to write PyTorch programs as if on a single large device and without custom sharded computation ops and/or collective communication ops. The XLA compiler transforms the single device program into a partitioned one with proper collectives, based on the user provided sharding hints. The API ([RFC](https://github.com/pytorch/xla/issues/3871)) will be available in the PyTorch/XLA 2.0 release as an experimental feature on a single TPU VM host. - - -#### Next Steps for GSPMD - -GSPMD is experimental in 2.0 release. To bring it to Stable status, we plan to address a number of feature gaps and known issues in the following releases, including multi-host support, DTensor integration, partial replication sharding, asynchronous data loading, and checkpointing. - - -### FSDP (Beta) - -PyTorch/XLA [introduced](https://pytorch.org/blog/scaling-pytorch-models-on-cloud-tpus-with-fsdp/) fully sharded data parallel (FSDP) experimental support in version 1.12. This feature is a parallel representation of PyTorch FSDP and there are subtle differences in how XLA and upstream CUDA kernels are set up. `auto_wrap_policy` is a new argument that enables developers to automatically specify conditions for propagating partitioning specifications to neural network submodules. `auto_wrap_policy`s may be simply passed in as an argument when wrapping a model with FSDP. Two `auto_wrap_policy` callables worth noting are: `size_based_auto_wrap_policy`, `transformer_auto_wrap_policy`. - -`size_based_auto_wrap_policy` enables users to wrap submodules with a minimum number of parameters. The example below wraps model submodules having at least 10M parameters. 
- -``` -auto_wrap_policy = partial(size_based_auto_wrap_policy, min_num_params=1e7) -``` - -`transformer_auto_wrap_policy` enables users to wrap all submodules that match a specific layer type. The example below wraps model submodules named `torch.nn.Conv2d`. To learn more, review [this ResNet example](https://github.com/pytorch/xla/blob/master/test/test_train_mp_imagenet_fsdp.py#L237-L255) by Ronghang Hu. - -``` -auto_wrap_policy = partial(transformer_auto_wrap_policy, transformer_layer_cls={torch.nn.Conv2d}) -``` - -PyTorch/XLA FSDP is now integrated in HuggingFace trainer class ([PR](https://github.com/huggingface/transformers/pull/21406)) enabling users to train much larger models on PyTorch/XLA ([official Hugging Face documentation](https://huggingface.co/docs/transformers/main/en/main_classes/trainer#pytorchxla-fully-sharded-data-parallel)). A 16B parameters GPT2 model trained on Cloud TPU v4-64 with this FSDP configuration achieved 39% hardware utilization. - - - - - - - - - - - - - - - - - - - - - - - - - - - -
| TPU Accelerator - Num Devices | v4-64 |
|---|---|
| GPT2 Parameter Count | 16B |
| Layers Wrapped with FSDP | GPT2Block |
| TFLOPs / Chip | 275 |
| PFLOPs / Step | 50 |
| Hardware Utilization | 39% |
### Differences Between FSDP & GSPMD

FSDP is a data parallelism technique that reduces device memory footprint by storing model parameters, optimizer states, and gradients all sharded. Note that the actual computation is still local to the device and requires all-gathering the sharded model parameters for both forward and backward passes, hence the name “data parallel”. FSDP is one of the newest additions to PyTorch/XLA to scale large model training.

GSPMD, on the other hand, is a general parallelization system that enables various types of parallelisms, including both data and model parallelisms. PyTorch/XLA provides a sharding annotation API and an XLAShardedTensor abstraction, so a user can annotate any tensor with sharding specs in the PyTorch program. Developers don’t need to manually implement sharded computations or inject collective communication ops to get it right. The XLA compiler does the work so that each computation can run in a distributed manner on multiple devices.

### Examples & Preliminary Results

To learn about the PyTorch/XLA parallelism sharding API, visit our [RFC](https://github.com/pytorch/xla/issues/3871) and see the [Sample Code](https://github.com/pytorch/xla/tree/r2.0/test/spmd) references. Below is a simple example to enable data and model parallelism.

```
model = SimpleLinear().to(xm.xla_device())
# Sharding annotate the linear layer weights.
xs.mark_sharding(model.fc1.weight, mesh, partition_spec)
# Training loop
model.train()
for step, (data, target) in enumerate(loader):
    optimizer.zero_grad()
    data = data.to(xm.xla_device())
    target = target.to(xm.xla_device())
    # Sharding annotate input data. We can shard any input dimension:
    # sharding the batch dimension enables data parallelism, sharding
    # the feature dimension enables spatial partitioning.
    xs.mark_sharding(data, mesh, partition_spec)
    output = model(data)
    loss = loss_fn(output, target)
    loss.backward()
    optimizer.step()
    xm.mark_step()
```

The following graph highlights the memory efficiency benefits of PyTorch/XLA FSDP and SPMD on Cloud TPU v4-8 running ResNet50.

![Batch Size Scaling with Spatial Partitioning](/assets/images/2023-03-22-batchsizescaling.svg){:width="100%"}

## Closing Thoughts…

We are excited to bring these features to the PyTorch community, and this is really just the beginning. Areas like dynamic shapes, deeper support for OpenXLA, and many others are in development, and we plan to put out more blogs to dive into the details. PyTorch/XLA is developed fully open source, and we invite you to join the community of developers by filing issues, submitting pull requests, and sending RFCs on [GitHub](https://github.com/pytorch/xla). You can try PyTorch/XLA on a variety of XLA devices including TPUs and GPUs. [Here](https://colab.sandbox.google.com/github/pytorch/xla/blob/master/contrib/colab/getting-started.ipynb) is how to get started.

Congratulations again to the PyTorch community on this milestone!
Cheers,

The PyTorch Team at Google
\ No newline at end of file
diff --git a/_posts/2023-03-28-accelerated-pytorch-2.md b/_posts/2023-03-28-accelerated-pytorch-2.md
deleted file mode 100644
index d1dfe70c21c8..000000000000
--- a/_posts/2023-03-28-accelerated-pytorch-2.md
+++ /dev/null
@@ -1,57 +0,0 @@
---
layout: blog_detail
title: "Accelerated PyTorch 2 Transformers"
author: Michael Gschwind, Driss Guessous, Christian Puhrsch
---

The PyTorch 2.0 release includes a new high-performance implementation of the PyTorch Transformer API with the goal of making training and deployment of state-of-the-art Transformer models affordable. Following the successful release of "fastpath" inference execution ("Better Transformer"), this release introduces high-performance support for training and inference using a custom kernel architecture for scaled dot product attention (SDPA).

You can take advantage of the new fused SDPA kernels either by calling the new SDPA operator directly (as described in the [SDPA tutorial](https://pytorch.org/tutorials/intermediate/scaled_dot_product_attention_tutorial.html#beta-implementing-high-performance-transformers-with-scaled-dot-product-attention-sdpa)), or transparently via integration into the pre-existing PyTorch Transformer API. All features of the PyTorch Transformer API will continue to work compatibly, with many features mapped to high-performance SDPA kernels. Other features cannot be supported at higher performance (e.g., need_weights, as described below), while expanded high-performance support for additional features may still be under active development.

Similar to the "fastpath" architecture, custom kernels are fully integrated into the PyTorch Transformer API – thus, using the native Transformer and MultiHeadAttention API will enable users to transparently see significant speed improvements. Unlike the "fastpath" architecture, the newly introduced "custom kernels" support many more use cases, including models using Cross-Attention, Transformer Decoders, and for training models, in addition to the existing fastpath inference for fixed and variable sequence length Transformer Encoder and Self Attention use cases.

To take full advantage of different hardware models and Transformer use cases, multiple SDPA custom kernels are supported, with custom kernel selection logic that picks the highest-performance kernel for a given model and hardware type. In particular, the first custom kernels included with the PyTorch 2.0 release are the [Flash Attention](https://arxiv.org/abs/2205.14135) kernel (sdpa_flash, for 16-bit floating point training and inference on Nvidia GPUs with SM80+ architecture level) and the [xFormers memory-efficient attention](https://github.com/facebookresearch/xformers) kernel (sdpa_mem_eff, for 16-bit and 32-bit floating point training and inference on a broad range of Nvidia GPUs). A general-purpose kernel sdpa_math provides an implementation when the custom kernels are not applicable.

As mentioned, custom kernels provide a wider range of support for execution scenarios. To ensure efficient execution (e.g., to use GPU tensor cores), model configurations need to meet a small number of requirements. This list of requirements will evolve over time, prospectively relaxing constraints limiting the usage of currently supported custom kernels, or providing additional kernels in the future.
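As a quick illustration of the transparent integration described above, a model built on the stock `nn.Transformer`/`nn.MultiheadAttention` modules can pick up the fused SDPA kernels without any SDPA-specific code; the sketch below (layer sizes and tensor shapes are illustrative, not taken from this post) simply constructs and runs a standard encoder stack:

```
import torch
import torch.nn as nn

# A stock encoder stack; no SDPA-specific code is needed to benefit from the fused kernels.
encoder_layer = nn.TransformerEncoderLayer(d_model=512, nhead=8, batch_first=True,
                                           device="cuda", dtype=torch.float16)
encoder = nn.TransformerEncoder(encoder_layer, num_layers=6).eval()

src = torch.randn(32, 256, 512, device="cuda", dtype=torch.float16)  # (batch, seq, d_model)
with torch.inference_mode():
    out = encoder(src)
print(out.shape)  # torch.Size([32, 256, 512])
```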
For the most up-to-date list of custom kernels and dispatch constraints, you can refer to [sdp_utils.h](https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native/transformers/cuda/sdp_utils.h). As of PyTorch 2.0, the existing fused SDPA kernels have the following constraints:

* Flash Attention only supports 16-bit floating point data types (float16 and bfloat16).
* The head dimension must be a multiple of 8 for 16-bit floating point numbers and a multiple of 4 for 32-bit floating point numbers. At present, the maximum head_dim supported by the Flash Attention custom kernel is 128.
* The CUDA architecture level must be sm5x or better for the mem_efficient kernel, and sm80 for Flash Attention.
* Flash Attention supports arbitrary dropout; in PyTorch 2.0 the mem_efficient kernel does not support dropout (i.e., dropout must be set to zero for this kernel to be selected in PyTorch 2.0).
* To support variable-sequence-length batches, all SDPA kernels support Nested Tensor inputs that combine input data and padding information using variable sequence length tensors for the forward pass. (You can find more information about Nested Tensors in the [Nested Tensor tutorial](https://pytorch.org/tutorials/prototype/nestedtensor.html).)
* You can specify both a _key_padding_mask_ and an _attn_mask_ by combining them before passing them to the SDPA operator. In particular, you can use the per-batch-element key padding mask of the nn.Transformer API to implement training for variable-sequence-length inputs in a batch.
* At present, the only attention mask supported by the fused kernel implementations is the causal mask commonly used for training. To specify the causal mask in custom kernels, it must be specified with the _is_causal_ boolean, and _attn_mask_ must be None.
* Support for Nested Tensors is still under development. Specifically, in PyTorch 2.0, only the sdpa_math kernel supports training with Nested Tensors. Also, PyTorch 2.0 does not support Nested Tensors as part of code being compiled with torch.compile().
* The SDPA operator does not support returning averaged attention weights because computing them defeats the optimizations that enable fused kernels to execute more efficiently. The argument _need_weights_ for torch.nn.MultiheadAttention's forward function defaults to True. In order to use the fused kernels, _need_weights_ needs to be set to _need_weights=False_.

We find that an attention mask is rarely used in real-world applications, except for the causal mask during training. Consequently, we reduce kernel complexity and compute cost by building in the option to use a causal mask as the attention mask, and select this new capability with the _is_causal_ parameter introduced in conjunction with the new SDPA operator.

Providing the _is_causal_ Boolean flag for the frequently used causal mask also obviates the expensive and memory-intensive allocation of a causal mask, increasing training memory efficiency by allowing more memory to be used for large batch sizes, and reducing memory bandwidth and cache contention – which are both at a premium in GPU accelerators – by not needing to load an attention mask tensor.

If none of the available custom kernels' constraints are met, then training falls back to using the default sdpa_math kernel, which implements the mathematical equations for scaled dot product attention using a sequence of PyTorch operators. This is the most general "catch-all" fallback kernel that ensures successful training for all models.
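To make the last two points concrete, here is a minimal sketch (tensor shapes, dtypes, and module sizes are illustrative, not from this post) of requesting the built-in causal mask through the SDPA operator and of disabling attention-weight output on `nn.MultiheadAttention` so that a fused kernel remains eligible:

```
import torch
import torch.nn.functional as F

# (batch, heads, seq_len, head_dim)
q = torch.randn(8, 16, 128, 64, device="cuda", dtype=torch.float16)
k = torch.randn(8, 16, 128, 64, device="cuda", dtype=torch.float16)
v = torch.randn(8, 16, 128, 64, device="cuda", dtype=torch.float16)

# Causal attention via the is_causal flag: no attn_mask tensor is allocated or loaded.
out = F.scaled_dot_product_attention(q, k, v, attn_mask=None, dropout_p=0.0, is_causal=True)

# When going through nn.MultiheadAttention, pass need_weights=False so that averaged
# attention weights are not computed and the fused kernels can be selected.
mha = torch.nn.MultiheadAttention(embed_dim=1024, num_heads=16, batch_first=True,
                                  device="cuda", dtype=torch.float16)
x = torch.randn(8, 128, 1024, device="cuda", dtype=torch.float16)
attn_out, _ = mha(x, x, x, need_weights=False)
```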
- -In addition to the existing Transformer API, model developers may also use the scaled dot product attention kernels directly by calling the new `scaled_dot_product_attention()` operator. This operator may be used to efficiently implement multi-head attention by combining it with in-projection and outprojection, as described in the [SDPA tutorial](https://pytorch.org/tutorials/intermediate/scaled_dot_product_attention_tutorial.html). - -In addition to adding custom kernels, Accelerated PyTorch 2 Transformers are integrated with PyTorch 2.0 compilation. To use your model while benefiting from the additional acceleration of PT2-compilation (for inference or training), pre-process the model with - - -``` -model = torch.compile(model) -``` - - -We have achieved major speedups for training transformer models and in particular large language models with Accelerated PyTorch 2 Transformers using a combination of custom kernels and torch.compile(). - - -![Better Transformer chart](/assets/images/pytorch_better_transformer_chart1.png){:width="100%"} -Figure: Using scaled dot product attention with custom kernels and torch.compile delivers significant speedups for training large language models, such as for [nanoGPT](https://github.com/karpathy/nanoGPT) shown here. - -Finally, because the custom kernels are much more memory efficient, try to increase the size of training batches to achieve faster training with increased batch size. - -In addition to automatic kernel selection, a context manager enables developers to override the kernel selection algorithm – this is not required for day to day operation, but enables developers to debug their code as well as enable performance engineers to override kernel selection. The SDPA tutorial provides additional information on using the SDPA context manager. - -In addition to availability as part of the nn.Transformer API, Accelerated PyTorch 2 Transformer custom kernels are also available in conjunction with the torchtext, torchvision, and fairseq domain libraries with the launch of PyTorch 2.0. \ No newline at end of file diff --git a/_posts/2023-04-03-pytorch-2.0-xla-path-forward.md b/_posts/2023-04-03-pytorch-2.0-xla-path-forward.md deleted file mode 100644 index ecf62f3cd4b6..000000000000 --- a/_posts/2023-04-03-pytorch-2.0-xla-path-forward.md +++ /dev/null @@ -1,22 +0,0 @@ ---- -layout: blog_detail -title: "PyTorch & OpenXLA: The Path Forward" -author: Milad Mohammadi, Jack Cao, Shauheen Zahirazami, Joe Spisak, and Jiewen Tan ---- - -As we celebrate the release of [OpenXLA](https://opensource.googleblog.com/2023/03/openxla-is-ready-to-accelerate-and-simplify-ml-development.html), [PyTorch 2.0](https://pytorch.org/blog/pytorch-2.0-release/), and [PyTorch/XLA 2.0](https://pytorch.org/blog/pytorch-2.0-xla/), it’s worth taking a step back and sharing where we see it all going in the short to medium term. With PyTorch adoption leading in the AI space and XLA supporting best-in-class compiler features, PyTorch/XLA is well positioned to provide a cutting edge development stack for both model training and inference. To achieve this, we see investments in three main areas: - -* **Training Large Models** - Large language models (LLM) and diffusion models have quickly risen in popularity and many cutting edge applications today are built on them. Further to this, training these models requires scale and more specifically the ability to train across thousands of accelerators. 
To achieve this we are investing in features such as AMP for mixed precision training, PjRt for increased runtime performance, SPMD / FSDP for efficient model sharding, Dynamic Shapes to enable new research approaches, faster data loading through Ray and tf.data, and a toolchain that packages all of these features together into a seamless workflow. Some of these features are already available in experimental or beta stages, and others are coming up this year, many of them heavily leveraging the underlying OpenXLA compiler stack.
* **Model Inference** - With large models continuing to grow in size and computational cost, deployment becomes the next challenge as these models continue to find their way into applications. With the introduction of Dynamo in the PyTorch 2.0 release, PyTorch/XLA delivers performance-competitive inference. We are, however, incorporating additional inference-oriented features, including model serving support, Dynamo for sharded large models, and quantization via Torch.Export and StableHLO.
* **Ecosystem integration** - We are expanding integration with [Hugging Face](https://huggingface.co/) and [PyTorch Lightning](https://lightning.ai/docs/pytorch/stable/) so users can take advantage of upcoming PyTorch/XLA cutting-edge features (e.g. FSDP support in Hugging Face) and the downstream OpenXLA features (e.g. Quantization) through familiar APIs.

Additionally, PyTorch/XLA is set to migrate to the open source [OpenXLA](https://github.com/openxla) as its default downstream compiler, allowing the PyTorch community to gain access to a leading, framework-agnostic compiler stack that enjoys industry-wide contribution and innovation. To achieve this, we will begin supporting StableHLO. As a result, OpenXLA will replace the existing TF:XLA dependency, overall streamlining the dependencies and creating leverage from the broader compiler ecosystem. PyTorch/XLA will also sunset the XRT runtime after migration. You can see the resulting high-level stack below with the TensorFlow dependency crossed out:

![the upcoming PyTorch/XLA features and integrations](/assets/images/PyTorch_XLA Future Stack.svg){:style="max-height:800px; width:100%"}

**Figure:** the upcoming PyTorch/XLA features and integrations are illustrated here

We could not be more excited about what's ahead for PyTorch/XLA and invite the community to join us. PyTorch/XLA is developed fully in open source, so please file issues, submit pull requests, and send RFCs to [GitHub](https://github.com/pytorch/xla) so that we can openly collaborate. You can also [try out](https://colab.sandbox.google.com/github/pytorch/xla/blob/master/contrib/colab/getting-started.ipynb) PyTorch/XLA for yourself on various XLA devices including TPUs and GPUs.

Cheers,
The PyTorch/XLA Team at Google
\ No newline at end of file
diff --git a/_posts/2023-04-07-celebrate-pytorch-2.0.md b/_posts/2023-04-07-celebrate-pytorch-2.0.md
deleted file mode 100644
index c03d35bd2d81..000000000000
--- a/_posts/2023-04-07-celebrate-pytorch-2.0.md
+++ /dev/null
@@ -1,115 +0,0 @@
---
layout: blog_detail
title: "Celebrate PyTorch 2.0 with New Performance Features for AI Developers"
author: Intel
---

Congratulations to the PyTorch Foundation for its release of **PyTorch 2.0**! In this blog, I discuss the four features for which Intel made significant contributions to PyTorch 2.0:

1. TorchInductor
2. GNN
3. INT8 Inference Optimization
4. oneDNN Graph API
We at Intel are delighted to be part of the PyTorch community and appreciate the collaboration with and feedback from our colleagues at [Meta](http://www.meta.com/) as we co-developed these features.

Let's get started.

## 1. TorchInductor CPU FP32 Inference Optimized

As part of the PyTorch 2.0 compilation stack, the TorchInductor CPU backend optimization brings notable performance improvements via graph compilation over the PyTorch eager mode.

The TorchInductor CPU backend is sped up by leveraging the technologies from the [Intel® Extension for PyTorch](http://github.com/intel/intel-extension-for-pytorch) for Conv/GEMM ops with post-op fusion and weight prepacking, and PyTorch ATen CPU kernels for memory-bound ops with explicit vectorization on top of OpenMP*-based thread parallelization.

With these optimizations on top of the powerful loop fusions in TorchInductor codegen, we achieved up to a **1.7x** FP32 inference performance boost across three representative deep learning benchmark suites: TorchBench, HuggingFace, and timm. Training and low-precision support are under development.

### See the Improvements

The performance improvements on various backends are tracked on this [TorchInductor CPU Performance Dashboard](http://github.com/pytorch/pytorch/issues/93531#issuecomment-1457373890).

## 2. Improve Graph Neural Network (GNN) in PyG for Inference and Training Performance on CPU

GNN is a powerful tool to analyze graph-structured data. This feature is designed to improve GNN inference and training performance on Intel® CPUs, including the new 4th Gen Intel® Xeon® Scalable processors.

PyTorch Geometric (PyG) is a very popular library built upon PyTorch to perform GNN workflows. Currently on CPU, GNN models of PyG run slowly due to the lack of GNN-related sparse matrix multiplication operations (i.e., SpMM_reduce) and the lack of several critical kernel-level optimizations (scatter/gather, etc.) tuned for GNN compute.

To address this, optimizations are provided for message passing between adjacent neural network nodes:

* **scatter_reduce:** performance hotspot in message-passing when the edge index is stored in coordinate format (COO).
* **gather:** backward computation of scatter_reduce, specially tuned for GNN compute when the index is an expanded tensor.
* **torch.sparse.mm with reduce flag:** performance hotspot in message-passing when the edge index is stored in compressed sparse row (CSR) format. Supported reduce flags: sum, mean, amax, amin.

End-to-end performance benchmark results for both inference and training on the 3rd Gen Intel® Xeon® Scalable processors 8380 platform and on the 4th Gen 8480+ platform are discussed in [Accelerating PyG on Intel CPUs](http://www.pyg.org/ns-newsarticle-accelerating-pyg-on-intel-cpus).

## 3. Optimize int8 Inference with Unified Quantization Backend for x86 CPU Platforms

The new X86 quantization backend is a combination of the [FBGEMM](http://github.com/pytorch/FBGEMM) (Facebook General Matrix-Matrix Multiplication) and [oneAPI Deep Neural Network Library (oneDNN)](http://spec.oneapi.io/versions/latest/elements/oneDNN/source/index.html) backends and replaces FBGEMM as the default quantization backend for x86 platforms. The result: better end-to-end int8 inference performance than with FBGEMM alone.

Users access the x86 quantization backend by default on x86 platforms, and the selection between the different kernels is done automatically behind the scenes.
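As a rough illustration of what the default backend selection looks like from the user's side, the sketch below pins the quantization engine to "x86" and runs eager-mode post-training static quantization; the model and calibration loop are placeholders, and this is only one of several quantization workflows:

```
import torch
import torch.ao.quantization as tq

model_fp32 = MyModel().eval()            # placeholder model with Quant/DeQuant stubs inserted
torch.backends.quantized.engine = "x86"  # unified x86 backend (FBGEMM + oneDNN kernels)
model_fp32.qconfig = tq.get_default_qconfig("x86")

prepared = tq.prepare(model_fp32)        # insert observers
for example, _ in calibration_loader:    # placeholder calibration data
    prepared(example)
model_int8 = tq.convert(prepared)        # convert observed modules to int8
```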
The rules of selection are based on prior performance testing data gathered by Intel during feature development. Thus, the x86 backend replaces FBGEMM and may offer better performance, depending on the use case.

The selection rules are:

* On platforms without VNNI (e.g., Intel® Core™ i7 processors), FBGEMM is always used.
* On platforms with VNNI (e.g., 2nd-4th Gen Intel® Xeon® Scalable processors and future platforms):
  * For linear layers, FBGEMM is always used.
  * For convolution layers, FBGEMM is used for depth-wise convolution whose layers > 100; otherwise, oneDNN is used.

Note that, as the kernels continue to evolve, the selection rules above are subject to change to achieve better performance. Performance metrics for the throughput speedup ratios of the unified x86 backend vs. pure FBGEMM are discussed in [[RFC] Unified quantization backend for x86 CPU platforms #83888](http://github.com/pytorch/pytorch/issues/83888).

## 4. Leverage oneDNN Graph API to Accelerate Inference on CPU

[oneDNN Graph API](http://spec.oneapi.io/onednn-graph/latest/introduction.html) extends [oneDNN](http://spec.oneapi.io/versions/latest/elements/oneDNN/source/index.html) with a flexible graph API to maximize the optimization opportunity for generating efficient code on Intel® AI hardware. It automatically identifies the graph partitions to be accelerated via fusion. The [fusion patterns](http://github.com/oneapi-src/oneDNN/blob/dev-graph/doc/programming_model/ops_and_patterns.md#fusion-patterns) focus on fusing compute-intensive operations such as convolution, matmul, and their neighbor operations for both inference and training use cases.

Currently, the BFloat16 and Float32 data types are supported and only inference workloads can be optimized. BF16 is only optimized on machines with Intel® Advanced Vector Extensions 512 (Intel® AVX-512) BF16 support.

Few or no modifications are needed in PyTorch to support newer oneDNN Graph fusions/optimized kernels. To use oneDNN Graph, users can:

* Either use the API _torch.jit.enable_onednn_fusion(True)_ before JIT tracing a model, OR …
* Use its context manager, viz. _with torch.jit.fuser("fuser3")_.
* For accelerating [BFloat16 inference](http://github.com/pytorch/pytorch/tree/master/torch/csrc/jit/codegen/onednn#example-with-bfloat16), we rely on eager-mode AMP (Automatic Mixed Precision) support in PyTorch and disable JIT mode's AMP.

See the [PyTorch performance tuning guide](http://pytorch.org/tutorials/recipes/recipes/tuning_guide.html#use-onednn-graph-with-torchscript-for-inference).

## Next Steps

### Get the Software

[Try out PyTorch 2.0](http://pytorch.org/get-started/locally/) and realize the performance benefits for yourself from these Intel-contributed features.

We encourage you to check out Intel's other [AI Tools](https://www.intel.com/content/www/us/en/developer/topic-technology/artificial-intelligence/tools.html) and [Framework](https://www.intel.com/content/www/us/en/developer/tools/frameworks/overview.html) optimizations and learn about the open, standards-based [oneAPI](https://www.intel.com/content/www/us/en/developer/tools/oneapi/overview.html) multiarchitecture, multivendor programming model that forms the foundation of Intel's AI software portfolio.
For more details about the 4th Gen Intel Xeon Scalable processor, visit the [AI Platform](https://www.intel.com/content/www/us/en/developer/topic-technology/artificial-intelligence/platform.html) page, where you can learn about how Intel is empowering developers to run high-performance, efficient end-to-end AI pipelines.

### PyTorch Resources

* [PyTorch Get Started](http://pytorch.org/get-started/pytorch-2.0/)
* [Dev Discussions](http://dev-discuss.pytorch.org/t/pytorch-release-2-0-execution-update/1077)
* [Documentation](http://pytorch.org/docs/2.0/)

diff --git a/_posts/2023-04-07-straggler-mitigation.md b/_posts/2023-04-07-straggler-mitigation.md
deleted file mode 100644
index a04633a9bdc7..000000000000
--- a/_posts/2023-04-07-straggler-mitigation.md
+++ /dev/null
@@ -1,265 +0,0 @@
---
layout: blog_detail
title: "Straggler Mitigation On PyTorch DDP By Hierarchical SGD"
author: Yi Wang (Cruise AI), Rohan Varma (Meta AI)
---

[PyTorch DDP](https://pytorch.org/docs/stable/notes/ddp.html) has been widely adopted across the industry for distributed training, and by default it runs synchronous SGD to synchronize gradients across model replicas at every step. The performance of this technique is critical for fast iteration during model exploration as well as for the resource and cost savings of model development. To resolve a ubiquitous performance bottleneck introduced by slow nodes in large-scale training, Cruise and Meta co-developed a solution based on the [Hierarchical SGD](https://arxiv.org/abs/2007.13819) algorithm to significantly accelerate training in the presence of these stragglers.

## The Need For Straggler Mitigation

In a DDP setup, a straggler problem can occur when one or more processes run much slower ("stragglers") than the other processes. When this happens, all the processes have to wait for the stragglers before synchronizing gradients and completing the communication, which essentially bottlenecks distributed performance to the slowest worker. As a result, even when training relatively small models, the communication cost can still be a major performance bottleneck.

### Potential Causes of Stragglers

Severe straggler issues are usually caused by workload imbalance before synchronization, and many factors can contribute to this imbalance. For instance, some data loader workers in the distributed environment can become stragglers, because some input examples can be outliers in terms of data size, the data transfer of some examples can be drastically slowed down due to unstable network I/O, or the on-the-fly data transformation costs can have a high variance.

Besides data loading, other phases before gradient synchronization can also cause stragglers, such as unbalanced workloads of embedding table lookup during the forward pass in recommendation systems.

### The Appearance of Stragglers

If we profile DDP training jobs that have stragglers, we can find that some processes may have much higher gradient synchronization costs (a.k.a., allreducing gradients) than other processes at a certain step. As a result, the distributed performance can be dominated by the communication cost even if the model size is very small. In this case, some processes run faster than the straggler(s) at a step, and hence they have to wait for the stragglers and spend a much longer time on allreduce.
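One way to collect traces like the ones discussed next is to wrap a few training steps in `torch.profiler`; the sketch below is illustrative (the schedule, output directory, and the `loader`/`train_step` names are placeholders, not from the original setup):

```
import torch
from torch.profiler import ProfilerActivity, profile, schedule, tensorboard_trace_handler

prof = profile(
    activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
    schedule=schedule(wait=1, warmup=1, active=3),             # trace 3 steps after a short warmup
    on_trace_ready=tensorboard_trace_handler("./ddp_traces"),  # writes one trace file per rank
)

with prof:
    for step, batch in enumerate(loader):  # placeholder data loader
        train_step(batch)                  # placeholder forward/backward/optimizer step
        prof.step()                        # advance the profiler schedule every iteration
```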
The screenshots below are from two trace files output by the PyTorch profiler in one such use case. Each screenshot profiles 3 steps.
* The first screenshot shows that a process has a very high allreduce cost in both the first and the third steps, because this process reaches the synchronization phase earlier than the straggler(s), and it spends more time waiting. On the other hand, the allreduce cost is relatively small in the second step; this suggests that 1) there is no straggler at this step, or 2) this process is the straggler among all the processes, so it does not need to wait for any other process.

![chart showing allreduce cost](/assets/images/straggler-mitigation/straggler-mitigation-1.png){:style="max-height:800px; width:100%"}

Both the 1st and the 3rd Steps Are Slowed Down by Stragglers

* The second screenshot shows a normal case without stragglers. In this case, all the gradient synchronizations are relatively short.

![chart showing normal case without stragglers](/assets/images/straggler-mitigation/straggler-mitigation-2.png){:style="max-height:800px; width:100%"}

Normal Case Without Stragglers

## Hierarchical SGD in PyTorch

Recently, hierarchical SGD has been proposed to optimize communication costs, mainly by reducing the total amount of data transfer in large-scale distributed training, and multiple convergence analyses have been provided ([example](https://arxiv.org/pdf/2010.12998.pdf)). As a main novelty of this post, at Cruise we could leverage hierarchical SGD to mitigate stragglers, which can also occur when training relatively small models. Our implementation has been upstreamed by Cruise to PyTorch in early 2022.

### How Does Hierarchical SGD Work?

As the name implies, hierarchical SGD organizes all the processes into groups at different levels as a hierarchy, and runs synchronization by following the rules below:

* All the groups at the same level have the same number of processes, and the processes in these groups synchronize at the same frequency concurrently, where the synchronization period is pre-defined by the user.
* The higher the level of a group, the larger the synchronization period used, as the synchronization becomes more expensive.
* When multiple overlapping groups are supposed to synchronize according to their periods, to reduce redundant synchronization and avoid data races across groups, only the highest-level group runs synchronization.

The following figure illustrates an example of 4-level hierarchical SGD among 16 processes on 8 machines, each of which has 2 GPUs:

1. **Level 1:** Each process runs mini-batch SGD locally;
2. **Level 2:** Each 4-process group across 2 machines runs synchronization every 2 steps;
3. **Level 3:** Each 8-process group across 4 machines runs synchronization every 4 steps;
4. **Level 4:** The global process group of all 16 processes over 8 machines runs synchronization every 8 steps.

In particular, when the step number is divisible by 8, only the synchronization at 4) is executed, and when the step number is divisible by 4 but not by 8, only the synchronization at 3) is executed.
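This scheduling rule is easy to express in code. The helper below is purely illustrative (it is not part of the PyTorch implementation) and returns which level of the 4-level example above synchronizes at a given step:

```
def sync_level(step, periods=(2, 4, 8)):
    """Return the highest hierarchy level due at this step, or None for a purely local step.

    Levels are numbered 2..len(periods)+1 to match the 4-level example above,
    where level 1 is the local mini-batch SGD that happens at every step.
    """
    level = None
    for i, period in enumerate(periods):
        if step % period == 0:
            level = i + 2  # only the highest level that is due actually synchronizes
    return level

# Steps 1..8 map to: None, 2, None, 3, None, 2, None, 4
print([sync_level(s) for s in range(1, 9)])
```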
- - -![An example of 4-level hierarchy SGD among 16 processes on 8 machines, each of which has 2 GPUs](/assets/images/straggler-mitigation/straggler-mitigation-3.png){:style="max-height:800px; width:100%"} - - -Intuitively, hierarchical SGD can be viewed as an extension of [local SGD](https://core.ac.uk/download/pdf/211998087.pdf), which only has a two-level hierarchy – every process runs mini-batch SGD locally and then synchronizes globally at a certain frequency. This can also help explain that, just like local SGD, hierarchical SGD synchronizes model parameters instead of gradients. Otherwise the gradient descent will be mathematically incorrect when the frequency is greater than 1. - - -### Why Can Hierarchical SGD Mitigate Stragglers? - -The key insight here is that, when there is a random straggler, it only directly slows down a relatively small group of processes instead of all the processes. Next time another random straggler is very likely to slow down a different small group, and hence a hierarchy can help smooth out the straggler effect. - -The example below assumes that there is a random straggler among totally 8 processes at every step. After 4 steps, vanilla DDP that runs synchronous SGD will be slowed down by straggler 4 times, because it runs global synchronization at every step. In contrast, hierarchical SGD runs synchronization with the groups of 4 processes after the first two steps, and then a global synchronization after another two steps. We can see that both the first two and the last two stragglers have a large overlap, and hence the performance loss can be mitigated. - - -![flow diagram](/assets/images/straggler-mitigation/straggler-mitigation-4.png){:style="max-height:800px; width:100%"} - - -Essentially, the mitigation effect of this hierarchical SGD example actually is between local SGD at a frequency of every 2 steps and every 4 steps. The main advantage of hierarchical SGD over local SGD is a better convergence efficiency of the same global synchronization frequency, because hierarchical SGD allows more low-level synchronization. Moreover, it is possible for hierarchical SGD to provide a global synchronization frequency lower than local SGD with model parity, leading to a higher training performance, especially in a large-scale distributed training. - - -### Ease of Use - -Straggler mitigation is not a novel study in distributed training. Multiple approaches have been proposed, such as [gossip SGD](https://arxiv.org/pdf/1705.09056.pdf), [data encoding](https://proceedings.neurips.cc/paper/2017/file/663772ea088360f95bac3dc7ffb841be-Paper.pdf), [gradient coding](http://proceedings.mlr.press/v70/tandon17a/tandon17a.pdf), as well as some particularly designed for parameter-server architecture, including [backup workers](https://static.googleusercontent.com/media/research.google.com/en//pubs/archive/45187.pdf) and [stale synchronous parallel](http://www.cs.cmu.edu/~seunghak/SSPTable_NIPS2013.pdf). However, to the best of our knowledge, before this effort we have not found a good open-source PyTorch implementation of straggler mitigation that can work like a plugin to our training system at Cruise. In contrast, our implementation only requires the minimal changes – no need to modify the existing code or tune any existing hyperparameters. This is a very appealing advantage for industry users. - -As the code example below shows, only a few lines need to be added to the setup of DDP model, and the training loop code can keep untouched. 
As explained previously, hierarchical SGD is an extended form of local SGD, so the enablement can be quite similar to local SGD (see PyTorch docs of [PostLocalSGDOptimizer](https://pytorch.org/docs/stable/distributed.optim.html#torch.distributed.optim.PostLocalSGDOptimizer)): - -1. Register a post-local SGD communication hook to run a warmup stage of fully synchronous SGD and defer hierarchical SGD. -2. Create a post-local SGD optimizer that wraps an existing local optimizer and a hierarchical SGD configuration. - -``` -import torch.distributed.algorithms.model_averaging.hierarchical_model_averager as hierarchicalSGD -from torch.distributed.algorithms.ddp_comm_hooks.post_localSGD_hook import ( - PostLocalSGDState, - post_localSGD_hook, -) -from torch.distributed.optim import PostLocalSGDOptimizer - -ddp_model = nn.parallel.DistributedDataParallel( - module=model, - device_ids=[rank], -) - -# Register a post-local SGD communication hook for the warmup. -subgroup, _ = torch.distributed.new_subgroups() -state = PostLocalSGDState(subgroup=subgroup, start_localSGD_iter=1_000) -ddp_model.register_comm_hook(state, post_localSGD_hook) - -# Wraps the existing (local) optimizer to run hierarchical model averaging. -optim = PostLocalSGDOptimizer( - optim=optim, - averager=hierarchicalSGD.HierarchicalModelAverager( - # The config runs a 4-level hierarchy SGD among 128 processes: - # 1) Each process runs mini-batch SGD locally; - # 2) Each 8-process group synchronize every 2 steps; - # 3) Each 32-process group synchronize every 4 steps; - # 4) All 128 processes synchronize every 8 steps. - period_group_size_dict=OrderedDict([(2, 8), (4, 32), (8, 128)]), - # Do not run hierarchical SGD until 1K steps for model parity. - warmup_steps=1_000) -) -``` - -### Algorithm Hyperparameters - -Hierarchical SGD has two major hyperparameters: _period_group_size_dict_ and _warmup_steps_. - -* **period_group_size_dict** is an ordered dictionary mapping from synchronization period to process group size, used for initializing process groups of different sizes in a hierarchy to synchronize parameters concurrently. A larger group is expected to use a larger synchronization period. -* **warmup_steps** specifies a number of steps as the warmup stage to run synchronous SGD before hierarchical SGD. Similar to [post-local SGD](https://arxiv.org/pdf/1808.07217.pdf) algorithm, a warmup stage is usually recommended to achieve a higher accuracy. The value should be the same as _start_localSGD_iter_ arg used in _PostLocalSGDState_ when post_localSGD_hook is registered. Typically the warmup stage should at least cover the beginning of training when the loss is decreased drastically. - -A subtle difference between the PyTorch implementation and the initial design proposed by relevant papers is that, after the warmup stage, by default the processes within each host still run intra-host gradient synchronization at every step. This is because that: - -1. The intra-host communication is relatively cheap, and it can usually significantly accelerate the convergence; -2. The intra-host group (of size 4 or 8 for most industry users) can usually be a good choice of the smallest group of processes that synchronize most frequently in hierarchical SGD. If the synchronization period is 1, then gradient synchronization is faster than model parameter synchronization (a.k.a., model averaging), because DDP automatically overlaps gradient synchronization and the backward pass. 
Such intra-host gradient synchronization can be disabled by unsetting the _post_local_gradient_allreduce_ arg in _PostLocalSGDState_.

## Demonstration

Now we demonstrate that hierarchical SGD can accelerate distributed training by mitigating stragglers.

### Experimental Setup

We compared the performance of hierarchical SGD against local SGD and synchronous SGD on [ResNet18](https://pytorch.org/vision/main/models/generated/torchvision.models.resnet18.html) (model size: 45MB). Since the model is so small, the training is not bottlenecked by data transfer cost during synchronization. To avoid the noise incurred by data loading from remote storage, the input data was randomly simulated in memory. We varied the number of GPUs used for training from 64 to 256. The batch size per worker is 32, and the number of training iterations is 1,000. Since we don't evaluate convergence efficiency in this set of experiments, warmup is not enabled.

We also emulated stragglers at a rate of 1% on 128 and 256 GPUs, and 2% on 64 GPUs, to make sure there is at least one straggler at every step on average. These stragglers randomly appear on different CUDA devices. Each straggler stalls for 1 second in addition to the normal per-step training time (~55ms in our setup). This can be perceived as a practical scenario where 1% or 2% of input data are outliers in terms of the data pre-processing cost (I/O and/or data transformation on the fly) during training, and such cost is 20X+ larger than the average.

The code snippet below shows how a straggler can be emulated in the training loop. We applied it to a ResNet model, and it can easily be applied to other models as well.

```
  loss = loss_fn(y_pred, y)
  # Emulate a straggler that lags for 1 second at a rate of 1%.
  if random.randint(1, 100) == 1:
      time.sleep(1)
  loss.backward()
  optimizer.step()
```

The experiments are conducted on a GCP cluster in us-central1. Each machine has 4 NVIDIA Tesla T4 GPUs with 16 GB memory per GPU, connected through a 32 Gbit/s Ethernet network. Each instance also features 96 vCPUs and 360 GB RAM:
| Parameter | Value |
|---|---|
| Architecture | ResNet18 (45MB) |
| Workers | 64, 128, 256 |
| Backend | NCCL |
| GPU | Tesla T4, 16 GB memory |
| Batch size | 32 × number of workers |
| Straggler Duration | 1 sec |
| Straggler Rate | 1% on 128 and 256 GPUs, 2% on 64 GPUs |
We used multiple configurations for both local SGD and hierarchical SGD. Local SGD runs global synchronization every 2, 4, or 8 steps.

We ran hierarchical SGD with the following configurations:

1. On 64 GPUs:
    1. Each 8-process group, each 32-process group, and the global 64-process group synchronizes every 2, 4, and 8 steps, respectively. Denoted as "_**HSGD 2-8,4-32,8-64**_".
    2. Each 32-process group and the global 64-process group synchronizes every 4 and 8 steps, respectively. Denoted as "_**HSGD 4-32,8-64**_".
2. On 128 GPUs:
    1. Each 8-process group, each 32-process group, and the global 128-process group synchronizes every 2, 4, and 8 steps, respectively. Denoted as "_**HSGD 2-8,4-32,8-128**_".
    2. Each 32-process group and the global 128-process group synchronizes every 4 and 8 steps, respectively. Denoted as "_**HSGD 4-32,8-128**_".
3. On 256 GPUs:
    1. Each 4-process group, each 16-process group, each 64-process group, and the global 256-process group synchronizes every 1, 2, 4, and 8 steps, respectively. Denoted as "_**HSGD 1-4,2-16,4-64,8-256**_".
    2. Each 8-process group, each 64-process group, and the global 256-process group synchronizes every 2, 4, and 8 steps, respectively. Denoted as "_**HSGD 2-8,4-64,8-256**_".
    3. Each 16-process group and the global 256-process group synchronizes every 4 and 8 steps, respectively. Denoted as "_**HSGD 4-16,8-256**_".

### Experimental Results

The figures below show the speedups of the different communication schemes against the baseline of synchronous SGD, with the emulated stragglers. We can make the following observations:

1. As expected, both hierarchical SGD and local SGD achieve a higher speedup with a lower synchronization frequency.
2. The speedups of the hierarchical SGD schemes are **2.08X-2.45X** on 64 GPUs, **2.57X-2.68X** on 128 GPUs, and **2.63X-3.25X** on 256 GPUs. This shows that hierarchical SGD can significantly mitigate stragglers, and that such mitigation can be more effective at a larger scale.
3. The performance of local SGD with synchronization periods of 2 steps and 8 steps can be perceived as the lower bound and upper bound, respectively, of the experimented hierarchical SGD schemes. This is because the hierarchical SGD schemes synchronize less frequently than every 2 steps globally, but their low-level synchronization within small groups adds extra overhead compared with global synchronization every 8 steps.

Overall, hierarchical SGD provides a finer-grained trade-off between communication cost and model quality than local SGD. Therefore, when local SGD at a relatively large synchronization period like 8 or 4 cannot give satisfactory convergence efficiency, hierarchical SGD has a much better chance of achieving both a good speedup and model parity.

Since only simulated data is used in the experiments, we did not demonstrate model parity here, which in practice can be achieved in two ways:
1. Tuning the hyperparameters, including both the hierarchy and the warmup steps;
2. For some cases, hierarchical SGD could lead to a slightly lower quality than the original model for the same number of training steps (i.e., a lower convergence rate), but with a speedup like 2X+ per training step, it is still possible to achieve model parity with more steps and still less total training time.
- - -![Speedups on 64 GPUs](/assets/images/straggler-mitigation/straggler-mitigation-5.png){:style="max-height:800px; width:100%"} - -![Speedups on 128 GPUs](/assets/images/straggler-mitigation/straggler-mitigation-6.png){:style="max-height:800px; width:100%"} - -![Speedups on 256 GPUs](/assets/images/straggler-mitigation/straggler-mitigation-7.png){:style="max-height:800px; width:100%"} - - -## Limitations - -Before applying hierarchical SGD to straggler mitigation, the user should be aware of a few limitations of this approach: - -1. This approach can only mitigate non-persistent stragglers, which occur to different workers at different times. However, for the case of persistent stragglers, which can be caused by hardware degradation or a network issue on a specific host, these stragglers will slow down the same low-level subgroup at every time, leading to nearly no straggler mitigation. -2. This approach can only mitigate low-frequency stragglers. E.g., if 30% workers can randomly become stragglers at every step, then most low-level synchronizations will still be slowed down by stragglers. As a result, hierarchical SGD may not show an obvious performance advantage over synchronous SGD. -3. Since hierarchical SGD applies model averaging that does not overlap with backward like gradient averaging used by vanilla DDP, its performance gain of straggler mitigation must outweigh the performance loss of no overlap between communication and backward pass. Therefore, if stragglers only slow down training by less than 10%, hierarchical SGD may not be able to bring much speedup. This limitation can be addressed by [overlapping optimizer step and backward pass](https://github.com/pytorch/pytorch/blob/release/1.13/torch/distributed/algorithms/ddp_comm_hooks/optimizer_overlap_hooks.py) in the future. -4. Since hierarchical SGD is less well-studied than local SGD, there is no guarantee that hierarchical SGD with a finer-grained synchronization granularity can converge faster than certain advanced forms of local SGD, such as [SlowMo](https://openreview.net/pdf?id=SkxJ8REYPH), which can improve convergence efficiency with slow momentum. However, to the best of our knowledge, these advanced algorithms cannot be natively supported as a PyTorch DDP plugin like hierarchical SGD yet. - - -## Acknowledgements - -We would like to thank Cruise teammates **Bo Tian**, **Sergei Vorobev**, **Eugene Selivonchyk, Tsugn-Hsien Lee**, **Dan Ring**, **Ian Ackerman**, **Lei Chen**, **Maegan Chew**, **Viet Anh To**, **Xiaohui Long**, **Zeyu Chen**, **Alexander Sidorov**, **Igor Tsvetkov**, **Xin Hu**, **Manav Kataria**, **Marina Rubtsova**, and **Mohamed Fawzy**, as well as Meta teammates **Shen Li, Yanli Zhao, Suraj Subramanian, Hamid Shojanzeri, Anjali Sridhar** and **Bernard Nguyen** for the support. 
\ No newline at end of file diff --git a/_posts/2023-04-14-accelerated-generative-diffusion-models.md b/_posts/2023-04-14-accelerated-generative-diffusion-models.md deleted file mode 100644 index 5feee1797013..000000000000 --- a/_posts/2023-04-14-accelerated-generative-diffusion-models.md +++ /dev/null @@ -1,517 +0,0 @@ ---- -layout: blog_detail -title: "Accelerated Generative Diffusion Models with PyTorch 2" -author: Grigory Sizov, Michael Gschwind, Hamid Shojanazeri, Driss Guessous, Daniel Haziza, Christian Puhrsch ---- - -**TL;DR**: PyTorch 2.0 nightly offers out-of-the-box performance improvement for Generative Diffusion models by using the new `torch.compile()` compiler and optimized implementations of Multihead Attention integrated with PyTorch 2. - -## Introduction - -A large part of the recent progress in Generative AI came from denoising diffusion models, which allow producing high quality images and videos from text prompts. This family includes Imagen, DALLE, Latent Diffusion, and others. However, all models in this family share a common drawback: generation is rather slow, due to the iterative nature of the sampling process by which the images are produced. This makes it important to optimize the code running inside the sampling loop. - -We took an open source implementation of a popular text-to-image diffusion model as a starting point and accelerated its generation using two optimizations available in PyTorch 2: compilation and fast attention implementation. Together with a few minor memory processing improvements in the code these optimizations give up to 49% inference speedup relative to the original implementation without [xFormers](https://github.com/facebookresearch/xformers), and 39% inference speedup relative to using the original code with xFormers (excluding the compilation time), depending on the GPU architecture and batch size. Importantly, the speedup comes without a need to install xFormers or any other extra dependencies. - -The table below shows the improvement in runtime between the original implementation with xFormers installed and our optimized version with PyTorch-integrated memory efficient attention (originally developed for and released in the [xFormers](https://github.com/facebookresearch/xformers) library) and PyTorch compilation. The compilation time is excluded. - -**Runtime improvement in % compared to original+xFormers** - -See the absolute runtime numbers in section “Benchmarking setup and results summary” - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
| GPU | Batch size 1 | Batch size 2 | Batch size 4 |
|---|---|---|---|
| P100 (no compilation) | -3.8 | 0.44 | 5.47 |
| T4 | 2.12 | 10.51 | 14.2 |
| A10 | -2.34 | 8.99 | 10.57 |
| V100 | 18.63 | 6.39 | 10.43 |
| A100 | 38.5 | 20.33 | 12.17 |
One can notice the following:

* The improvements are significant for powerful GPUs like A100 and V100. For those GPUs the improvement is most pronounced for batch size 1.
* For less powerful GPUs we observe smaller speedups (or in two cases slight regressions). The batch size trend is reversed here: the improvement is larger for larger batches.

In the following sections we describe the applied optimizations and provide detailed benchmarking data, comparing the generation time with various optimization features on/off.

Specifically, we benchmark 5 configurations, and the plots below compare their absolute performance for different GPUs and batch sizes. For definitions of these configurations see the section "Benchmarking setup and results summary".

![Benchmark of denoising diffusion text-to-image generation across GPU architectures, batch size 1](/assets/images/2023-04-11-accelerated-generative-diffusion-models1.png){:style="max-height:800px; width:100%"}

![Benchmark of denoising diffusion text-to-image generation across GPU architectures, batch size 2](/assets/images/2023-04-11-accelerated-generative-diffusion-models2.png){:style="max-height:800px; width:100%"}

![Benchmark of denoising diffusion text-to-image generation across GPU architectures, batch size 4](/assets/images/2023-04-11-accelerated-generative-diffusion-models3.png){:style="max-height:800px; width:100%"}

## Optimizations

Here we'll go into more detail about the optimizations introduced into the model code. These optimizations rely on features of PyTorch 2.0 which has been released recently.

### Optimized Attention

One part of the code which we optimized is the scaled dot-product attention. Attention is known to be a heavy operation: a naive implementation materializes the attention matrix, leading to time and memory complexity quadratic in sequence length. It is common for diffusion models to use attention (`CrossAttention`) as part of Transformer blocks in multiple parts of the U-Net. Since the U-Net runs at every sampling step, this becomes a critical point to optimize. Instead of a custom attention implementation, one can use `torch.nn.MultiheadAttention`, which in PyTorch 2 has an optimized attention implementation integrated into it. This optimization schematically boils down to the following pseudocode:

```
class CrossAttention(nn.Module):
    def __init__(self, ...):
        # Create matrices: Q, K, V, out_proj
        ...
    def forward(self, x, context=None, mask=None):
        # Compute out = SoftMax(Q*K/sqrt(d))V
        # Return out_proj(out)
        ...
```

gets replaced with

```
class CrossAttention(nn.Module):
    def __init__(self, ...):
        self.mha = nn.MultiheadAttention(...)
    def forward(self, x, context):
        return self.mha(x, context, context)
```

The optimized implementation of attention was available already in PyTorch 1.13 (see [here](https://pytorch.org/blog/a-better-transformer-for-fast-transformer-encoder-inference/)) and widely adopted (see e.g. the [HuggingFace transformers library example](https://medium.com/pytorch/bettertransformer-out-of-the-box-performance-for-huggingface-transformers-3fbe27d50ab2)). In particular, it integrates memory-efficient attention from the [xFormers](https://github.com/facebookresearch/xformers) library and flash attention from [https://arxiv.org/abs/2205.14135](https://arxiv.org/abs/2205.14135).
PyTorch 2.0 expands this to additional attention functions such as cross attention and custom kernels for further acceleration, making it applicable to diffusion models. - -Flash attention is available on GPUs with compute capability SM 7.5 or SM 8.x - for example, on T4, A10, and A100, which are included in our benchmark (you can check compute capability of each NVIDIA GPU [here](https://developer.nvidia.com/cuda-gpus#compute)). However, in our tests on A100 the memory efficient attention performed better than flash attention for the particular case of diffusion models, due to the small number of attention heads and small batch size. PyTorch understands this and in this case chooses memory efficient attention over flash attention when both are available (see the logic [here](https://github.com/pytorch/pytorch/blob/d8e795ecd53670682bd3b2e5ff1f378402b147d5/aten/src/ATen/native/transformers/cuda/sdp_utils.h#L33-L71)). For full control over the attention backends (memory-efficient attention, flash attention, “vanilla math”, or any future ones), power users can enable and disable them manually with the help of the context manager [torch.backends.cuda.sdp_kernel](https://pytorch.org/docs/master/backends.html#torch.backends.cuda.sdp_kernel). - - -### Compilation - -Compilation is a [new feature of PyTorch 2.0](https://pytorch.org/get-started/pytorch-2.0/#user-experience), enabling significant speedups with a very simple user experience. To invoke the default behavior, simply wrap a PyTorch module or a function into `torch.compile`: - - -``` -model = torch.compile(model) -``` - - -PyTorch compiler then turns Python code into a set of instructions which can be executed efficiently without Python overhead. The compilation happens dynamically the first time the code is executed. With the default behavior, under the hood PyTorch utilized [TorchDynamo](https://pytorch.org/docs/stable/torch.compiler) to compile the code and [TorchInductor](https://dev-discuss.pytorch.org/t/torchinductor-a-pytorch-native-compiler-with-define-by-run-ir-and-symbolic-shapes/747) to further optimize it. See [this tutorial](https://pytorch.org/tutorials/intermediate/torch_compile_tutorial.html) for more details. - -Although the one-liner above is enough for compilation, certain modifications in the code can squeeze a larger speedup. In particular, one should avoid so-called graph breaks - places in the code which PyTorch can’t compile. As opposed to previous PyTorch compilation approaches (like TorchScript), PyTorch 2 compiler doesn’t break in this case. Instead it falls back on eager execution - so the code runs, but with reduced performance. We introduced a few minor changes to the model code to get rid of graph breaks. This included eliminating functions from libraries not supported by the compiler, such as `inspect.isfunction` and `einops.rearrange`. See this [doc](https://pytorch.org/docs/stable/torch.compiler_faq.html#identifying-the-cause-of-a-graph-break) to learn more about graph breaks and how to eliminate them. - -Theoretically, one can apply `torch.compile `on the whole diffusion sampling loop. However, in practice it is enough to just compile the U-Net. The reason is that `torch.compile` doesn’t yet have a loop analyzer and would recompile the code for each iteration of the sampling loop. Moreover, compiled sampler code is likely to generate graph breaks - so one would need to adjust it if one wants to get a good performance from the compiled version. 
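For instance, compiling only the U-Net could look like the sketch below; the attribute path to the U-Net module is purely illustrative and depends on the particular codebase:

```
import torch

def compile_unet(sampler_model):
    """Compile just the denoising U-Net instead of the whole sampling loop.

    `sampler_model.model.diffusion_model` is an illustrative attribute path;
    adjust it to wherever the U-Net module lives in your implementation.
    The first U-Net call after this triggers compilation; later calls reuse
    the compiled code.
    """
    sampler_model.model.diffusion_model = torch.compile(sampler_model.model.diffusion_model)
    return sampler_model
```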
- -Note that compilation [requires GPU compute capability >= SM 7.0](https://github.com/openai/triton/blob/b5d32896b1f89fc44a82f8df3bb010934c53f4f5/README.md?plain=1#L66-L68) to run in non-eager mode. This covers all GPUs in our benchmarks - T4, V100, A10, A100 - except for P100 (see the [full list](https://developer.nvidia.com/cuda-gpus#compute)). - - -### Other optimizations - -In addition, we have improved efficiency of GPU memory operations by eliminating some common pitfalls, e.g. creating a tensor on GPU directly rather than creating it on CPU and later moving to GPU. The places where such optimizations were necessary were determined by line-profiling and looking at CPU/GPU traces and [Flame Graphs](https://github.com/brendangregg/FlameGraph). - - -## Benchmarking setup and results summary - -We have two versions of code to compare: _original_ and _optimized_. On top of this, several optimization features (xFormers, PyTorch memory efficient attention, compilation) can be turned on/off. Overall, as mentioned in the introduction, we will be benchmarking 5 configurations: - - - -* _Original code without xFormers_ -* _Original code with xFormers_ -* _Optimized code with vanilla math attention backend and no compilation_ -* _Optimized code with memory-efficient attention backend and no compilation_ -* _Optimized code with memory-efficient attention backend and compilation_ - -As the _original version_ we took the version of the code which uses PyTorch 1.12 and a custom implementation of attention. The _optimized version_ uses `nn.MultiheadAttention` in `CrossAttention` and PyTorch 2.0.0.dev20230111+cu117. It also has a few other minor optimizations in PyTorch-related code. - -The table below shows runtime of each version of the code in seconds, and the percentage improvement compared to the _original with xFormers. _The compilation time is excluded. - -**Runtimes for batch size 1. In parenthesis - relative improvement with respect to the “Original with xFormers” row** - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
| Configuration | P100 | T4 | A10 | V100 | A100 |
|---|---|---|---|---|---|
| Original without xFormers | 30.4s (-19.3%) | 29.8s (-77.3%) | 13.0s (-83.9%) | 10.9s (-33.1%) | 8.0s (-19.3%) |
| Original with xFormers | 25.5s (0.0%) | 16.8s (0.0%) | 7.1s (0.0%) | 8.2s (0.0%) | 6.7s (0.0%) |
| Optimized with vanilla math attention, no compilation | 27.3s (-7.0%) | 19.9s (-18.7%) | 13.2s (-87.2%) | 7.5s (8.7%) | 5.7s (15.1%) |
| Optimized with mem. efficient attention, no compilation | 26.5s (-3.8%) | 16.8s (0.2%) | 7.1s (-0.8%) | 6.9s (16.0%) | 5.3s (20.6%) |
| Optimized with mem. efficient attention and compilation | n/a | 16.4s (2.1%) | 7.2s (-2.3%) | 6.6s (18.6%) | 4.1s (38.5%) |
**Runtimes for batch size 2**
| Configuration | P100 | T4 | A10 | V100 | A100 |
|---|---|---|---|---|---|
| Original without xFormers | 58.0s (-21.6%) | 57.6s (-84.0%) | 24.4s (-95.2%) | 18.6s (-63.0%) | 12.0s (-50.6%) |
| Original with xFormers | 47.7s (0.0%) | 31.3s (0.0%) | 12.5s (0.0%) | 11.4s (0.0%) | 8.0s (0.0%) |
| Optimized with vanilla math attention, no compilation | 49.3s (-3.5%) | 37.9s (-21.0%) | 17.8s (-42.2%) | 12.7s (-10.7%) | 7.8s (1.8%) |
| Optimized with mem. efficient attention, no compilation | 47.5s (0.4%) | 31.2s (0.5%) | 12.2s (2.6%) | 11.5s (-0.7%) | 7.0s (12.6%) |
| Optimized with mem. efficient attention and compilation | n/a | 28.0s (10.5%) | 11.4s (9.0%) | 10.7s (6.4%) | 6.4s (20.3%) |
**Runtimes for batch size 4**
| Configuration | P100 | T4 | A10 | V100 | A100 |
|---|---|---|---|---|---|
| Original without xFormers | 117.9s (-20.0%) | 112.4s (-81.8%) | 47.2s (-101.7%) | 35.8s (-71.9%) | 22.8s (-78.9%) |
| Original with xFormers | 98.3s (0.0%) | 61.8s (0.0%) | 23.4s (0.0%) | 20.8s (0.0%) | 12.7s (0.0%) |
| Optimized with vanilla math attention, no compilation | 101.1s (-2.9%) | 73.0s (-18.0%) | 28.3s (-21.0%) | 23.3s (-11.9%) | 14.5s (-13.9%) |
| Optimized with mem. efficient attention, no compilation | 92.9s (5.5%) | 61.1s (1.2%) | 23.9s (-1.9%) | 20.8s (-0.1%) | 12.8s (-0.9%) |
| Optimized with mem. efficient attention and compilation | n/a | 53.1s (14.2%) | 20.9s (10.6%) | 18.6s (10.4%) | 11.2s (12.2%) |
To minimize fluctuations and external influence on the performance of the benchmarked code, we ran each version of the code one after another, and then repeated this sequence 10 times: A, B, C, D, E, A, B, … So the results of a typical run would look like the one in the picture below. Note that one shouldn’t rely on comparison of absolute run times between different graphs, but comparison of run times _inside_ one graph is pretty reliable, thanks to our benchmarking setup.

![Denoising diffusion model generation benchmarks](/assets/images/2023-04-11-accelerated-generative-diffusion-models4.png){:style="max-height:700px"}

Each run of the text-to-image generation script produces several batches, the number of which is regulated by the CLI parameter `--n_iter`. In the benchmarks we used `n_iter = 2`, but introduced an additional “warm-up” iteration, which doesn’t contribute to the run time. This was necessary for the runs with compilation, because compilation happens the first time the code runs, and so the first iteration is much longer than all subsequent ones. To make the comparison fair, we also introduced this additional “warm-up” iteration to all other runs.

The numbers in the table above are for 2 iterations (plus a “warm-up” one), prompt “A photo”, seed 1, PLMS sampler, and autocast turned on.

Benchmarks were done using P100, V100, A100, A10 and T4 GPUs. The T4 benchmarks were done in Google Colab Pro. The A10 benchmarks were done on g5.4xlarge AWS instances with 1 GPU.

## Conclusions and next steps

We have shown that new features of PyTorch 2 - the compiler and the optimized attention implementation - give performance improvements exceeding or comparable with what previously required installation of an external dependency (xFormers). PyTorch achieved this, in particular, by integrating memory efficient attention from xFormers into its codebase. This is a significant improvement for user experience, given that xFormers, being a state-of-the-art library, in many scenarios requires a custom installation process and long builds.

There are a few natural directions in which this work can be continued:

* The optimizations we implemented and described here are only benchmarked for text-to-image inference so far. It would be interesting to see how they affect training performance. PyTorch compilation can be directly applied to training; enabling training with PyTorch optimized attention is on the roadmap
* We intentionally minimized changes to the original model code. Further profiling and optimization can probably bring more improvements
* At the moment compilation is applied only to the U-Net model inside the sampler. Since there is a lot happening outside of U-Net (e.g. operations directly in the sampling loop), it would be beneficial to compile the whole sampler. However, this would require analysis of the compilation process to avoid recompilation at every sampling step
* Current code only applies compilation within the PLMS sampler, but it should be trivial to extend it to other samplers
* Besides text-to-image generation, diffusion models are also applied to other tasks - image-to-image and inpainting. It would be interesting to measure how their performance improves from PyTorch 2 optimizations

See if you can increase performance of open source diffusion models using the methods we described, and share the results!
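If you want to run this kind of measurement on your own models, the warm-up-plus-timing procedure described above can be sketched in a few lines. This is only an illustrative harness under stated assumptions, not the script used to produce the numbers in this post; `run_pipeline` is a hypothetical callable standing in for one full text-to-image generation run.

```
import time
import torch

def time_one_config(run_pipeline, n_iter=2, n_warmup=1):
    # Warm-up iterations absorb one-time costs such as torch.compile
    # compilation; they are excluded from the reported time.
    for _ in range(n_warmup):
        run_pipeline()
    torch.cuda.synchronize()

    start = time.perf_counter()
    for _ in range(n_iter):
        run_pipeline()
    torch.cuda.synchronize()  # wait for all queued GPU work before stopping the clock
    return time.perf_counter() - start

def benchmark(configs, n_repeats=10):
    # Interleave the configurations (A, B, C, ..., A, B, C, ...) and repeat the
    # whole sequence, so slow drifts of the machine affect all configs equally.
    results = {name: [] for name in configs}
    for _ in range(n_repeats):
        for name, run_pipeline in configs.items():
            results[name].append(time_one_config(run_pipeline))
    return results
```

As discussed above, times collected this way are best used for relative comparisons between configurations on the same machine, not as absolute numbers.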
## Resources

* PyTorch 2.0 overview, which has a lot of information on `torch.compile`: [https://pytorch.org/get-started/pytorch-2.0/](https://pytorch.org/get-started/pytorch-2.0/)
* Tutorial on `torch.compile`: [https://pytorch.org/tutorials/intermediate/torch_compile_tutorial.html](https://pytorch.org/tutorials/intermediate/torch_compile_tutorial.html)
* General compilation troubleshooting: [https://pytorch.org/docs/stable/torch.compiler_troubleshooting.html](https://pytorch.org/docs/stable/torch.compiler_troubleshooting.html)
* Details on graph breaks: [https://pytorch.org/docs/stable/torch.compiler_faq.html#identifying-the-cause-of-a-graph-break](https://pytorch.org/docs/stable/torch.compiler_faq.html#identifying-the-cause-of-a-graph-break)
* Details on guards: [https://pytorch.org/docs/stable/torch.compiler_guards_overview.html](https://pytorch.org/docs/stable/torch.compiler_guards_overview.html)
* Video deep dive on TorchDynamo: [https://www.youtube.com/watch?v=egZB5Uxki0I](https://www.youtube.com/watch?v=egZB5Uxki0I)
* Tutorial on optimized attention in PyTorch 1.12: [https://pytorch.org/tutorials/beginner/bettertransformer_tutorial.html](https://pytorch.org/tutorials/beginner/bettertransformer_tutorial.html)

## Acknowledgements

We would like to thank Geeta Chauhan, Natalia Gimelshein, Patrick Labatut, Bert Maher, Mark Saroufim, Michael Voznesensky and Francisco Massa for their valuable advice and early feedback on the text.

Special thanks to Yudong Tao for initiating the work on using PyTorch native attention in diffusion models.

diff --git a/_posts/2023-04-15-experience-power-pytorch-2.0.md b/_posts/2023-04-15-experience-power-pytorch-2.0.md
deleted file mode 100644
index d294e052756b..000000000000
--- a/_posts/2023-04-15-experience-power-pytorch-2.0.md
+++ /dev/null
@@ -1,35 +0,0 @@
---
layout: blog_detail
title: "Experience the power of PyTorch 2.0 on AMD Solutions"
author: AMD
---

PyTorch 2.0 represents a significant step forward for the PyTorch machine learning framework. The stable release of PyTorch 2.0 brings new features that unlock even higher performance, while remaining backward compatible with prior releases and retaining the Pythonic focus which has helped to make PyTorch so enthusiastically adopted by the AI/ML community. AMD has long been a strong proponent of PyTorch, and we are delighted that the PyTorch 2.0 stable release includes support for AMD Instinct™ and Radeon™ GPUs that are supported by the ROCm™ software platform.

The stable PyTorch 2.0 release introduces torch.compile as a beta feature underpinned by TorchInductor, with support for AMD Instinct and Radeon GPUs through the OpenAI Triton deep learning compiler. Through TorchInductor, developers can now generate low-level kernels using Triton that are portable and perform comparably to hand-written kernels built on native, hardware-centric kernel programming models.

OpenAI Triton is a language and compiler for blocked algorithms, which aims to provide an abstraction layer between CUDA/HIP and Torch at which developers can write efficient kernels more productively. We have written a new backend which interfaces Triton's custom MLIR dialects with our ROCm compiler stack.

Triton can automatically optimize kernels generated by machine learning compilers such as TorchInductor for multiple AI accelerators, including AMD Instinct GPU accelerators, by leveraging hardware-specific features of the AMD CDNA™ GPU architecture.
This makes it easy for developers and users to switch seamlessly from any hardware to AMD Instinct GPU accelerators and get great out-of-the-box performance.

In addition, compilers like Triton can also enable developers to use high-level programming languages, such as Python, to write machine learning code that can be efficiently compiled and executed on specialized hardware. This can help greatly improve the productivity of machine learning developers, as they can focus on the algorithmic aspects of their models and rely on the compiler to generate efficient code.

By design, PyTorch 2.0 is backward compatible with earlier PyTorch releases. This holds true for the ROCm build of PyTorch 2.0 as well. Developers using PyTorch with AMD GPUs can migrate to PyTorch 2.0 with the confidence that their existing code will continue to work without any required changes, so there is no penalty to access the improvements that come with this release. On the other hand, using PyTorch 2.0 and TorchInductor can result in significant performance improvements over the default eager-mode, as shown below.

The initial results using AMD Instinct MI250 GPUs already show strong performance improvements with minimal optimization on TorchInductor compared to the default eager-mode. We see an average performance increase of up to 1.54X on 44 out of the 45 models in the HuggingFace benchmark suite, with CamemBert, DistillGPT2 and T5Small being a few of the standout models with up to 1.5X or more performance improvement over eager-mode. We are looking forward to continued engagement with members of the PyTorch team at Meta to enable further optimization of the ROCm software stack and additional performance improvements for future PyTorch releases.

![Image 1: AMD MI250 GPU performance improvement for TorchInductor vs eager-mode using HuggingFace](/assets/images/t-vs-eager-mode.svg){:style="max-height:800px; width:100%"}

Image 1: AMD MI250 GPU performance improvement for TorchInductor vs eager-mode using HuggingFace MI200-89.

PyTorch 2.0 follows the same set of install options as before to build and install for supporting AMD GPUs. These include an installable Python package hosted at [pytorch.org](https://pytorch.org/), AMD’s public PyTorch docker image, and of course the option to build from source using the upstream PyTorch repository. As with PyTorch builds for other platforms, the specific command line to be run for a pip-based install is provided by the configurator at [https://pytorch.org/get-started/locally/](https://pytorch.org/get-started/locally/).

The GPUs supported by the ROCm software platform, which forms the basis for PyTorch support on AMD GPUs, are documented at [https://docs.amd.com/bundle/Hardware_and_Software_Reference_Guide/page/Hardware_and_Software_Support.html](https://docs.amd.com/bundle/Hardware_and_Software_Reference_Guide/page/Hardware_and_Software_Support.html)

## Conclusion

PyTorch 2.0 represents a major step in continuing to broaden support for ML developers by increasing performance while maintaining a simple, Pythonic interface. This performance uplift is made possible in large part by the new TorchInductor infrastructure, which in turn harnesses the Triton ML programming language and just-in-time compiler. AMD’s support for these technologies allows users to realize the full promise of the new PyTorch architecture. Our GPU support in PyTorch 2.0 is just one manifestation of a larger vision around AI and machine learning.
AI/ML plays an important role in multiple AMD product lines, including Instinct and Radeon GPUs, Alveo™ data center accelerators, and both Ryzen™ and EPYC processors. These hardware and software initiatives are all part of AMD’s Pervasive AI vision, and we look forward to addressing the many new challenges and opportunities of this dynamic space. - -MI200-89 – PyTorch Inductor mode HuggingFace Transformers training speedup, running the standard PyTorch 2.0 test suite, over PyTorch eager-mode comparison based on AMD internal testing on a single GCD as of 3/10/2023 using a 2P AMD EPYC™ 7763 production server with 4x AMD Instinct™ MI250 (128GB HBM2e) 560W GPUs with Infinity Fabric™ technology; host ROCm™ 5.3, guest ROCm™ 5.4.4, PyTorch 2.0.0, Triton 2.0. Server manufacturers may vary configurations, yielding different results. Performance may vary based on factors including use of latest drivers and optimizations. - -© 2023 Advanced Micro Devices, Inc. All rights reserved. AMD, the AMD Arrow logo, AMD CDNA, AMD Instinct, EPYC, Radeon, ROCm, Ryzen, and combinations thereof are trademarks of Advanced Micro Devices, Inc. Other product names used in this publication are for identification purposes only and may be trademarks of their respective owners. \ No newline at end of file diff --git a/_posts/2023-04-19-accelerating-large-language-models.md b/_posts/2023-04-19-accelerating-large-language-models.md deleted file mode 100644 index 642d97e0d0d6..000000000000 --- a/_posts/2023-04-19-accelerating-large-language-models.md +++ /dev/null @@ -1,214 +0,0 @@ ---- -layout: blog_detail -title: "Accelerating Large Language Models with Accelerated Transformers" -author: Lucas Pasqualin, Driss Guessous, Christian Puhrsch, Bertrand Maher, Michael Gschwind ---- - -**TL;DR.** We show how to use Accelerated PyTorch 2.0 Transformers and the newly introduced `torch.compile()` method to accelerate Large Language Models on the example of [nanoGPT](https://github.com/karpathy/nanoGPT), a compact open-source implementation of the GPT model from Andrej Karpathy. Using the new [scaled dot product attention operator](https://pytorch.org/docs/master/generated/torch.nn.functional.scaled_dot_product_attention.html) introduced with Accelerated PT2 Transformers, we select the flash_attention custom kernel and achieve faster training time per batch (measured with Nvidia A100 GPUs), going from a ~143ms/batch baseline to ~113 ms/batch. In addition, the enhanced implementation using the SDPA operator offers better numerical stability. Finally, further optimizations are achieved using padded inputs, which when combined with flash attention lead to ~87ms/batch. - -Recent times have seen exponential adoption of large language models (LLMs) and Generative AI in everyday life. Tightly coupled with these ever-growing models is the ever-growing training cost - in terms of both time and hardware utilization. The PyTorch team has tackled these challenges head on with [Accelerated PyTorch 2 Transformers](https://pytorch.org/blog/accelerated-pytorch-2/) (previously known as “Better Transformer”) and JIT Compilation in [PyTorch 2.0](https://pytorch.org/blog/pytorch-2.0-release/). - -In this blog post, we explore training optimizations gained by utilizing custom kernel implementations of SDPA - also known as scaled dot product attention - a critical layer in transformer models. 
The custom kernel for SDPA replaces several discrete sequential operations with one globally optimized kernel which avoids allocating a large amount of intermediate CUDA memory. This approach offers a number of advantages, including but not limited to: higher performance computation of SDPA by reducing the memory bandwidth bottleneck, reduced memory footprint to support larger batch sizes, and finally added numerical stability by prescaling input tensors. These optimizations are demonstrated on nanoGPT, an open-source implementation of GPT from Andrej Karpathy.

## Background

Scaled dot product attention is the fundamental building block of multihead attention, as introduced in [“Attention is All You Need”](https://arxiv.org/abs/1706.03762), and has a wide range of applications in LLM and Generative AI models.

![The Transformer model architecture](/assets/images/2023-04-18-accelerating-large-language-models/PyTorch_Better-Transformer_Figure-1.png){:style="max-height:800px; width:100%"}

**Figure 1:** The Transformer model architecture based on [“Attention is All You Need”](https://arxiv.org/abs/1706.03762). With the new PyTorch SDPA operator, Multi-Head Attention is efficiently implemented by a linear layer for the in-projection, the SDPA operator, and a linear layer for the out-projection.

With the new scaled_dot_product_attention operator, multihead attention can be implemented in just 3 steps: in projection with a linear layer, SDPA, and out projection with a linear layer.

```
# In Projection
# variable descriptions:
# q,k,v = Query, Key, Value tensors
# bsz = batch size
# num_heads = Number of heads for Multihead Attention
# tgt_len = Target length
# src_len = Source length
# head_dim = Head dimension
q, k, v = _in_projection(query, key, value, q_proj_weight, k_proj_weight, v_proj_weight, b_q, b_k, b_v)
q = q.view(bsz, num_heads, tgt_len, head_dim)
k = k.view(bsz, num_heads, src_len, head_dim)
v = v.view(bsz, num_heads, src_len, head_dim)

# Scaled Dot Product Attention
attn_output = scaled_dot_product_attention(q, k, v, attn_mask, dropout_p, is_causal)

# Out Projection
attn_output = attn_output.permute(2, 0, 1, 3).contiguous().view(bsz * tgt_len, embed_dim)
attn_output = linear(attn_output, out_proj_weight, out_proj_bias)
attn_output = attn_output.view(tgt_len, bsz, attn_output.size(1))
```

PyTorch 2.0 supports multiple different kernels optimized for specific use cases, with specific requirements. A kernel picker picks the best kernel for a particular combination of input parameters. If no optimized "custom kernel" for a particular combination of input parameters can be identified, the kernel picker selects a general kernel that can handle all input combinations.

While future releases may extend this set of operators, PyTorch 2.0 launches with 3 implementations for the SDPA operator:

1. A generic kernel which implements the mathematical equation of SDPA in the function `sdpa_math()`
2. An optimized kernel based on the paper “[Flash Attention](https://arxiv.org/abs/2205.14135)”, which supports evaluation of SDPA with 16 bit floating point data types on compute architecture SM80 (A100).
3. An optimized kernel based on the paper “[Self-Attention Does Not Need O(n^2) Memory](https://arxiv.org/abs/2112.0568)" and implemented in [xFormer](https://github.com/facebookresearch/xformers), which supports both 32 and 16 bit floating point data types on a wider range of architectures (SM40 and later).
This blog post refers to this kernel as the `mem_efficient` kernel. - -Note that both optimized kernels (two and three listed above), support a key padding mask and limit the supported attention mask to causal attention. Accelerated PyTorch 2.0 Transformers today only support the causal mask when it is specified using the `is_causal` boolean. When a mask is specified, the general-purpose kernel will be selected because it is too expensive to analyze the contents of a provided mask to determine if it is the causal mask. Additional explanations on the constraints for each kernel can be found in the [Accelerated PT2 Transformer blog](https://pytorch.org/blog/accelerated-pytorch-2/). - - -## Enabling Accelerated Transformers with nanoGPT - -The SDPA operator being a critical component of the GPT model, we identified the open source nanoGPT model as an excellent candidate for both demonstrating the ease of implementation and benefits of PyTorch 2.0’s Accelerated Transformers. The following demonstrates the exact process by which Accelerated Transformers was enabled on nanoGPT. - -This process largely revolves around replacing the existing SDPA implementation with the newly added F.scaled_dot_product_attention operator from [functional.py](https://github.com/pytorch/pytorch/blob/df14650f0b14b80db132b0c1797dc595fbee1054/torch/nn/functional.py#L4834). This process can be easily adapted to enable the operator in many other LLMs. Alternatively, users can instead choose to call F.multi_head_attention_forward() or utilize the nn.MultiHeadAttention module directly where applicable. The following code snippets are adapted from Karpathy’s nanoGPT repository. - - -### Step 1: Identify the existing SDPA implementation - -In the case of nanoGPT, SDPA is implemented in the model’s [CausalSelfAttention](https://github.com/karpathy/nanoGPT/blob/master/model.py#L37) class. The original implementation at time of writing is adapted below for this post. - -![The original implementation at time of writing](/assets/images/2023-04-18-accelerating-large-language-models/causal_attention_step_1.png){:style="max-height:800px; width:100%"} - - -### Step 2: Replace with Torch’s _scaled_dot_product_attention_ - -At this point we can note the following: - -* Lines 36 - 42 define the mathematical implementation of SDPA which we are replacing -* The mask applied on line 39 is no longer relevant since we are using scaled_dot_product_attention’s `is_causal` flag. -* The dropout layer used in line 41 is also now unnecessary. - -Swapping out the SDPA implementation for torch’s scaled_dot_product_attention and removing the now redundant code yields the following implementation. - -![Swapping out the SDPA implementation for torch’s scaled_dot_product_attention and removing the now redundant code yields the following implementation.](/assets/images/2023-04-18-accelerating-large-language-models/causal_attention_step_2.png){:style="max-height:800px; width:100%"} - - -Alternatively, the original mask can be passed into the `attn_mask` field however due to the mentioned kernel constraints that would limit the implementation to only support the generic `sdpa_math` kernel. - - -### Step 3 (Bonus): Faster matmuls with padding - -On top of the performance improvements from SDPA, our analysis yielded a nice ancillary win. In Andrej's words "The most dramatic optimization to nanoGPT so far (~25% speedup) is to simply increase the vocab size from 50257 to 50304 (nearest multiple of 64)." 
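The padding trick itself is just integer arithmetic: round the vocabulary size up to the next multiple of 64 before building the embedding and output layers. A minimal illustration is shown below; this is not the actual nanoGPT code, and `pad_to_multiple` is a hypothetical helper name.

```
def pad_to_multiple(vocab_size: int, multiple: int = 64) -> int:
    # Round up so the output-layer matmul dimensions are 64-element aligned.
    return ((vocab_size + multiple - 1) // multiple) * multiple

# GPT-2's vocabulary of 50257 tokens rounds up to 50304.
assert pad_to_multiple(50257) == 50304
```

The extra rows correspond to token ids that never appear in the data, so they leave the model's predictions unchanged while making the large output matmul better aligned.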
![Tweet by Andrej Karpathy](/assets/images/2023-04-18-accelerating-large-language-models/tweet.png){:style="max-height:800px; width:100%; max-width:600px"}

The vocab size determines the dimensions of matmuls in the output layer of GPT, and these are so large that they were taking a _majority_ of the time for the entire training loop! We discovered that they were achieving performance significantly below the peak throughput achievable on the A100 GPU, and guessed from [NVIDIA's matmul documentation](https://docs.nvidia.com/deeplearning/performance/dl-performance-matrix-multiplication/index.html) that 64-element alignment would yield better results. Indeed, padding these matmuls achieves nearly a 3x speedup! The underlying cause is that unaligned memory accesses significantly reduce efficiency. A deeper analysis can be found in [this Twitter thread](https://twitter.com/cHHillee/status/1630274804795445248).

With this optimization we were able to further reduce training time from ~113 ms (using flash attention) to ~87 ms per batch.

## Results

The figure below demonstrates the performance gained using PyTorch custom kernels. Here are the exact figures:

* baseline (nanoGPT implementation): ~143ms
* sdpa_math (generic): ~134ms (6.71% faster)
* `mem_efficient` kernel: ~119ms (20.16% faster)
* `flash_attention` kernel: ~113ms (26.54% faster)
* flash_attention + padded vocab: ~87ms (64.37% faster)

All code was run on an 8 x NVIDIA Corporation A100 server with 80 GB HBM [A100 SXM4 80GB], and for the purpose of this experiment dropout was set to 0.

![Using scaled dot product attention with custom kernels and torch.compile delivers significant speedups for training large language models](/assets/images/2023-04-18-accelerating-large-language-models/PyTorch_Better-Transformer_Chart-2.png){:style="max-height:800px; width:100%"}

**Figure 2:** Using scaled dot product attention with custom kernels and torch.compile delivers significant speedups for training large language models, such as for [nanoGPT](https://github.com/karpathy/nanoGPT) shown here.

## Enhancing Numerical Model Stability

In addition to being faster, PyTorch's implementation offers increased numerical stability by avoiding loss of precision in many execution scenarios. There is a great explanation [here](https://github.com/bigscience-workshop/Megatron-DeepSpeed/pull/118), but essentially the PyTorch implementation scales the Query and Key matrices _before_ multiplication, which is said to be more stable and avoid loss of precision. Because of the merged custom kernel architecture of SDPA, this scaling does not introduce additional overhead in the computation of the attention result. In comparison, an implementation built from the individual computational components would require separate pre-scaling at additional cost. For an additional explanation, see Appendix A.

### Improved Memory Consumption

Yet another large advantage of using the torch SDPA kernels is the reduced memory footprint, which allows for the utilization of larger batch sizes. The following chart compares the best validation loss after one hour of training for both flash attention and the baseline implementations of causal attention. As can be seen, the maximum batch size achieved with the baseline causal attention implementation (on 8 x NVIDIA Corporation A100 server with 80 GB HBM) was 24, significantly less than the maximum achieved with flash attention, which was 39.
![Using Flash Attention enables the usage of larger batch sizes](/assets/images/2023-04-18-accelerating-large-language-models/chart.png){:style="max-height:800px; width:100%"}

**Figure 3:** Using Flash Attention enables the usage of larger batch sizes, allowing users to achieve lower validation loss after one hour of training (smaller is better).

## Conclusion

Accelerated PyTorch 2 Transformers were designed to make the training and production deployment of state-of-the-art transformer models affordable and integrated with PyTorch 2.0 model JIT compilation. The newly introduced PyTorch SDPA operator provides improved performance for training Transformer models and is particularly valuable for the expensive Large Language Model training. In this post we demonstrate a number of optimizations on the exemplary nanoGPT model including:

* Over 26% training speedup, when compared against the baseline with constant batch size
* An additional speedup achieved with padded vocabulary, bringing the total optimization to approximately 64% compared to the baseline
* Additional numerical stability

## Appendix A: Analyzing Attention Numeric Stability

In this section we provide a more in-depth explanation of the previously mentioned enhanced numerical stability which is gained by prescaling SDPA’s input vectors. The following is a simplified version of nanoGPT’s mathematical implementation of SDPA. The important thing to note here is that the query undergoes matrix multiplication without being scaled.

```
# nanoGPT implementation of SDPA
# notice q (our query vector) is not scaled !
att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
att = att.masked_fill(self.bias[:,:,:T,:T] == 0, float('-inf'))
att = F.softmax(att, dim=-1)

# Dropout is set to 0, so we can safely ignore this line in the implementation
# att = self.attn_dropout(att)

y_nanogpt = att @ v # (B, nh, T, T) x (B, nh, T, hs) -> (B, nh, T, hs)
```

The following is the equivalent mathematical implementation in torch’s `scaled_dot_product_attention`.

```
# PyTorch implementation of SDPA
embed_size = q.size(-1)
scaling_factor = math.sqrt(math.sqrt(embed_size))
q = q / scaling_factor # notice q _is_ scaled here !

# same as above, but with scaling factor
att = q @ (k.transpose(-2, -1) / scaling_factor)
att = att.masked_fill(self.bias[:,:,:T,:T] == 0, float('-inf'))
att = F.softmax(att, dim=-1)

# Dropout is set to 0, so we can safely ignore this line in the implementation
# att = self.attn_dropout(att)

y_scale_before = att @ v
```

Mathematically both approaches should be equivalent, however our experimentation shows that in practice we receive different results from each approach.

Using the approach above, we verified `y_scale_before` matches the expected output from using the `scaled_dot_product_attention` method while `y_nanogpt` does not.

The `torch.allclose` method was used to test equivalence. Specifically, we showed that:

```
y_sdpa = torch.nn.functional._scaled_dot_product_attention(
    q,
    k,
    v,
    attn_mask=self.bias[:,:,:T,:T] != 0,
    dropout_p=0.0,
    need_attn_weights=False,
    is_causal=False,
)

torch.allclose(y_sdpa, y_nanogpt) # False, indicating fp issues
torch.allclose(y_sdpa, y_scale_before) # True, as expected
```

## Appendix B: Reproducing Experiment Results

Researchers seeking to reproduce these results should start with the following commit from Andrej’s nanoGPT repository - **b3c17c6c6a363357623f223aaa4a8b1e89d0a465**.
This commit was used as the baseline when measuring the per batch speed improvements. For results which include padded vocabulary optimizations (which yielded the most significant improvements to batch speed), use the following commit - **77e7e04c2657846ddf30c1ca2dd9f7cbb93ddeab**. From either checkout, selecting kernels for experimentation is made trivial with the use of the [torch.backends](https://pytorch.org/docs/stable/backends.html) API. - -The desired kernel can be selected via a context manager: - -``` -with torch.backends.cuda.sdp_kernel ( - enable_math = False, - enable_flash = False, - enable_mem_efficient = True -): - train(model) -``` \ No newline at end of file diff --git a/_posts/2023-04-27-introducing-hidet.md b/_posts/2023-04-27-introducing-hidet.md deleted file mode 100644 index b0e9eb526a3e..000000000000 --- a/_posts/2023-04-27-introducing-hidet.md +++ /dev/null @@ -1,140 +0,0 @@ ---- -layout: blog_detail -title: "Introducing Hidet: A Deep Learning Compiler for Efficient Model Serving" -author: Team Hidet ---- - -[Hidet](https://github.com/hidet-org/hidet) is a powerful deep learning compiler that simplifies the process of implementing high-performing deep learning operators on modern accelerators (e.g., NVIDIA GPUs). With the new feature of `torch.compile(...)` in PyTorch 2.0, integrating a novel compiler into PyTorch is easier than ever - Hidet now can be used as a `torch.compile(...)` backend to accelerate PyTorch models, making it an attractive option for PyTorch users who want to improve the inference performance of their models, especially for those who also need to implement extremely optimized custom operators. - - -## Using Hidet to Compile A PyTorch Model - -To use Hidet in PyTorch, you need to first install the `hidet` package via pip: - - -``` -pip install hidet -``` - - -Hidet is integrated with PyTorch as a `torch.compile(...)` backend following the [Custom Backends tutorial](https://pytorch.org/docs/stable/torch.compiler_custom_backends.html). You can specify `hidet` as the `backend` when you compile a model. (Note: requires PyTorch version 2.0+): - - -``` -torch.compile(..., backend='hidet') -``` - - -Hidet converts the given PyTorch model in the torch.fx.Graph format into its internal graph representation, and conducts a series of optimizations. Hidet provides a few options to configure the optimizations. For example, we can use `hidet.torch.dynamo_config.use_tensor_core(True)` to allow Hidet to generate CUDA kernels that leverage the [Tensor Cores on NVIDIA GPUs](https://www.nvidia.com/en-us/data-center/tensor-cores/), and use `hidet.torch.dynamo_config.search_space(2)` to allow Hidet to search for the best operator schedule specific for your hardware and input sizes. More configurations can be found in [Hidet’s documentation](https://docs.hidet.org/stable/gallery/tutorials/optimize-pytorch-model.html). 
- -Here's a complete example of how to use Hidet to compile and optimize a pre-trained ResNet50 model from `torchvision`: - - -``` -import hidet -import torch - -# Load a pre-trained ResNet50 model -x = torch.randn(1, 3, 224, 224, device='cuda').half() -model = torch.hub.load( - 'pytorch/vision:v0.6.0', 'resnet50', pretrained=True -).cuda().half().eval() - -# Configure hidet to use tensor core and enable tuning -hidet.torch.dynamo_config.use_tensor_core(True) -hidet.torch.dynamo_config.search_space(2) - -# Compile the model using Hidet -model_opt = torch.compile(model, backend='hidet') - -# Check correctness -torch.testing.assert_close(actual=model_opt(x), expected=model(x), rtol=1e-2, atol=1e-2) - -# Benchmark -from hidet.utils import benchmark_func -print('eager: {:2f}'.format(benchmark_func(lambda: model(x)))) -print('hidet: {:2f}'.format(benchmark_func(lambda: model_opt(x)))) -``` - - -We encourage you to try out the above script on your own NVIDIA GPU(s)! If you run this script on an `aws.g5.2xlarge` instance, you would get the result shown in the following figure. Hidet achieves the speedup because it could automatically fuse multiple operators, tune operator schedules, and use CUDA Graph to reduce framework-level overhead. More results can be found in the [ASPLOS’23 publication of Hidet](https://dl.acm.org/doi/10.1145/3575693.3575702) and our [performance tracking](https://github.com/hidet-org/hidet/issues/154) - - -![Eager vs Hidet latency](/assets/images/2023-4-27-hidet.png){:style="max-height:800px; width:100%"} - - - -## Using Hidet Script to Write Custom Operators - -Hidet Script is one approach to implement tensor operators in Python. The following example shows how to implement a naive matrix multiplication using Hidet Script and integrate it as a PyTorch operator. - - -``` -import torch -import hidet - - -def matmul(m_size, n_size, k_size): - from hidet.lang import f32, attr - from hidet.lang.cuda import threadIdx, blockIdx, blockDim - - with hidet.script_module() as script_module: - @hidet.script - def matmul( - a: f32[m_size, k_size], - b: f32[k_size, n_size], - c: f32[m_size, n_size] - ): - attr.cuda_grid_dim = ((m_size + 31) // 32, (n_size + 31) // 32) - attr.cuda_block_dim = (32, 32) - i = threadIdx.x + blockIdx.x * blockDim.x - j = threadIdx.y + blockIdx.y * blockDim.y - if i < m_size and j < n_size: - c[i, j] = 0.0 - for k in range(k_size): - c[i, j] += a[i, k] * b[k, j] - - ir_module = script_module.ir_module() - func = hidet.driver.build_ir_module(ir_module) - return func - - -class NaiveMatmul(torch.autograd.Function): - @staticmethod - def forward(ctx, a, b): - m, k = a.shape - k, n = b.shape - c = torch.empty([m, n], dtype=a.dtype, device=a.device) - func = matmul(m, n, k) - func(a, b, c) - return c - - -a = torch.randn([3, 4], device='cuda') -b = torch.randn([4, 5], device='cuda') -c = NaiveMatmul.apply(a, b) -cc = torch.matmul(a, b) -torch.testing.assert_close(c, cc) -``` - - -More optimizations can be applied, see the [example](https://docs.hidet.org/stable/gallery/developer-guides/hidet-script-dynamic-kernel.html) in our documentation to learn more. - -**Hidet Script vs. Triton**: Triton greatly simplifies the CUDA programming by introducing the tile-based programming model where the parallel execution unit is thread blocks instead of threads. However, this simplification also prevents the tensor program developers from manipulating the fine-grained computation and memory resources (e.g., warps, shared memory) in their preferred ways. 
It would be challenging to implement an optimization that requires fine-grained control of these resources using Triton if it has not been implemented by the Triton compiler itself. Hidet Script, on the other hand, simplifies tensor programming while still enabling users to implement their own optimizations with extensive flexibility. It's worth noting that the more granular control of Hidet Script also brings added complexity compared to Triton. - - -## More about Hidet - -Hidet originates from a research project led by the [EcoSystem lab](https://www.cs.toronto.edu/ecosystem/) at the University of Toronto (UofT) and AWS. The authors propose a new way, named the task-mapping programming paradigm, to construct tensor programs. It aims to simplify the tensor programming without sacrificing any optimization opportunity. Now, Hidet is an open-source project, jointly supported by [CentML](https://centml.ai/) and the EcoSystem lab, that aims to provide an efficient solution to end-to-end inference on modern accelerators (e.g., NVIDIA GPUs). - -### Additional Resources - -* GitHub Repository: [https://github.com/hidet-org/hidet](https://github.com/hidet-org/hidet) -* Hidet’s Documentation: [https://docs.hidet.org](https://docs.hidet.org) -* ASPLOS ’23 Publication: [https://dl.acm.org/doi/10.1145/3575693.3575702](https://dl.acm.org/doi/10.1145/3575693.3575702) -* ASPLOS ’23 Tutorial: [https://centml.github.io/asplos23-tutorial/](https://centml.github.io/asplos23-tutorial/) - - -## Acknowledgement - -We would like to thank Jerry Park, Mark Saroufim, Jason Liang and Helen Suk for their valuable help on preparing the blog post and feedback on the text. We also would like to thank Nikita Shulga, Jason Ansel, and Dmytro Dzhulgakov for reviewing and improving our PR https://github.com/pytorch/pytorch/pull/93873 on the 3rd-party dynamo backend registration. diff --git a/_posts/2023-05-02-accelerated-image-seg.md b/_posts/2023-05-02-accelerated-image-seg.md deleted file mode 100644 index bd585950f130..000000000000 --- a/_posts/2023-05-02-accelerated-image-seg.md +++ /dev/null @@ -1,281 +0,0 @@ ---- -layout: blog_detail -title: "Accelerated Image Segmentation using PyTorch" -author: Intel ---- - -_Using Intel® Extension for PyTorch to Boost Image Processing Performance_ - -PyTorch delivers great CPU performance, and it can be further accelerated with Intel® Extension for PyTorch. I trained an AI image segmentation model using PyTorch 1.13.1 (with ResNet34 + UNet architecture) to identify roads and speed limits from satellite images, all on the 4th Gen Intel® Xeon® Scalable processor. - -I will walk you through the steps to work with a satellite image dataset called SpaceNet5 and how I optimized the code to make deep learning workloads feasible on CPUs just by flipping a few key switches. - -**Before we get started, some housekeeping...** - -The code accompanying this article is available in the examples folder in the [Intel Extension for PyTorch repository](http://github.com/intel/intel-extension-for-pytorch/tree/master/examples/cpu/usecase_spacenet5). I borrowed heavily from the [City-Scale Road Extraction from Satellite Imagery (CRESI) repository](http://github.com/avanetten/cresi/). I adapted it for the 4th Gen Intel Xeon processors with PyTorch optimizations and [Intel Extension for PyTorch](http://github.com/intel/intel-extension-for-pytorch) optimizations. In particular, I was able to piece together a workflow using the [notebooks here](http://github.com/avanetten/cresi/tree/main/notebooks). 
- -You can find the accompanying talk I gave [on YouTube](http://www.youtube.com/watch?v=LVZWm5GFvAw). - -I also highly recommend these articles for a detailed explanation of how to get started with the SpaceNet5 data: - -* [The SpaceNet 5 Baseline — Part 1: Imagery and Label Preparation](http://medium.com/the-downlinq/the-spacenet-5-baseline-part-1-imagery-and-label-preparation-598af46d485e) -* [The SpaceNet 5 Baseline — Part 2: Training a Road Speed Segmentation Model](http://medium.com/the-downlinq/the-spacenet-5-baseline-part-2-training-a-road-speed-segmentation-model-2bc93de564d7) -* [The SpaceNet 5 Baseline — Part 3: Extracting Road Speed Vectors from Satellite Imagery](https://medium.com/the-downlinq/the-spacenet-5-baseline-part-3-extracting-road-speed-vectors-from-satellite-imagery-5d07cd5e1d21) -* [SpaceNet 5 Winning Model Release: End of the Road](http://medium.com/the-downlinq/spacenet-5-winning-model-release-end-of-the-road-fd02e00b826c) - -I referenced two Hugging Face blogs by Julien Simon; he ran his tests on the AWS instance `r7iz.metal-16xl`: - -* [Accelerating PyTorch Transformers with Intel Sapphire Rapids, part 1](http://huggingface.co/blog/intel-sapphire-rapids) -* [Accelerating PyTorch Transformers with Intel Sapphire Rapids, part 2](http://huggingface.co/blog/intel-sapphire-rapids-inference) - -The potential cost savings from using a CPU instance instead of a GPU instance on the major cloud service providers (CSP) can be significant. The latest processors are still being rolled out to the CSPs, so I’m using a 4th Gen Intel Xeon processor that is hosted on the Intel® Developer Cloud (you can sign up for the Beta here: [cloud.intel.com](http://cloud.intel.com/)). - -On AWS, you can select from the `r7iz.*` EC2 instances after you [sign up for the preview here](http://pages.awscloud.com/R7iz-Preview.html) (Figure 1). At the time of writing, the new AI-acceleration engine, Intel® Advanced Matrix Extensions (Intel® AMX), is only available on bare metal but it should soon be enabled on the virtual machines. - -![List of 4th Gen Xeon instances on AWS EC2](/assets/images/f1-4th-gen-xeon-aws-instances.png){:style="max-height:800px; max-width: 100%; display: block; margin-left: auto; margin-right: auto"} - -**Figure 1**. List of 4th Gen Xeon instances on AWS EC2 (image by author) - -On Google Cloud* Platform, you can select from the 4th Gen Xeon Scalable processors C3 VMs (Figure 2). - - -![List of 4th Gen Intel Xeon Scalable processor instances on Google Cloud Platform](/assets/images/f2-4th-gen-xeon-googlecloud-instances.png){:style="max-height:800px; max-width: 100%; display: block; margin-left: auto; margin-right: auto"} - -**Figure 2**. List of 4th Gen Intel Xeon Scalable processor instances on Google Cloud Platform (image by author) - - -## Hardware Introduction and Optimizations - -The 4th Gen Intel Xeon processors were released January 2023, and the bare-metal instance I am using has two sockets (each with 56 physical cores), 504 GB of memory, and Intel AMX acceleration. 
I installed a few key libraries in the backend to take control and monitor the sockets, memory, and cores that I am using on the CPU: - -`numactl` (with `sudo apt-get install numactl`) - -`libjemalloc-dev` (with `sudo apt-get install libjemalloc`) - -`intel-openmp` (with `conda install intel-openmp`) - -`gperftools` (with `conda install gperftools -c conda-forge`) - -Both PyTorch and Intel Extension for PyTorch have helper scripts so that one does not need to explicitly use `intel-openmp` and `numactl`, but they do need to be installed in the backend. In case you want to set them up for other work, here is what I used for OpenMP* ... - - -``` -export OMP_NUM_THREADS=36 -export KMP_AFFINITY=granularity=fine,compact,1,0 -export KMP_BLOCKTIME=1 -``` - - -… where `OMP_NUM_THREADS` is the number of threads allocated to the job, `KMP_AFFINITY` affects thread affinity settings (including packing threads close to each other, the state of pinning threads), and `KMP_BLOCKTIME` sets the time in milliseconds that an idle thread should wait before going to sleep. - -Here’s what I used for `numactl` … - - -``` -numactl -C 0-35 --membind=0 train.py -``` - - -...where `-C` specifies which cores to use and `--membind` instructs the program to only use one socket (socket 0 in this case). - - -## SpaceNet Data - -I am using a satellite image dataset from the [SpaceNet 5 Challenge](http://spacenet.ai/sn5-challenge/). Different cities can be downloaded for free from an AWS S3 bucket: - - -``` -aws s3 ls s3://spacenet-dataset/spacenet/SN5_roads/tarballs/ --human-readable -``` - -``` -2019-09-03 20:59:32 5.8 GiB SN5_roads_test_public_AOI_7_Moscow.tar.gz -2019-09-24 08:43:02 3.2 GiB SN5_roads_test_public_AOI_8_Mumbai.tar.gz -2019-09-24 08:43:47 4.9 GiB SN5_roads_test_public_AOI_9_San_Juan.tar.gz -2019-09-14 13:13:26 35.0 GiB SN5_roads_train_AOI_7_Moscow.tar.gz -2019-09-14 13:13:34 18.5 GiB SN5_roads_train_AOI_8_Mumbai.tar.gz -``` - - -You can use the following commands to download and unpack a file: - - -``` -aws s3 cp s3://spacenet-dataset/spacenet/SN5_roads/tarballs/SN5_roads_train_AOI_7_Moscow.tar.gz . -tar -xvzf ~/spacenet5data/moscow/SN5_roads_train_AOI_7_Moscow.tar.gz -``` - - - -### Dataset Preparation - -I used the Moscow satellite image dataset, which consists of 1,352 images of 1,300 by 1,300 pixels with corresponding street labels in separate text files. The dataset contains both 8-band multispectral images and 3-band RGB images. Figure 3 shows four sample RGB satellite images and their corresponding generated masks. I used the [speed_masks.py](http://github.com/avanetten/cresi/blob/main/cresi/data_prep/speed_masks.py) script from the CRESI repository to generate the segmentation masks. - -![Satellite image 3-channel RGB chips from Moscow (top row) and corresponding pixel segmentation masks with varying speed limits](/assets/images/f3-moscow-satellite-image-dataset.png){:style="max-height:800px; max-width: 100%; display: block; margin-left: auto; margin-right: auto"} - -**Figure 3**. Satellite image 3-channel RGB chips from Moscow (top row) and corresponding pixel segmentation masks with varying speed limits (bottom row) (image by author) - -There is a JSON configuration file that must be updated for all remaining components: training and validation split, training, and inference. [An example configuration can be found here](http://github.com/avanetten/cresi/blob/main/cresi/configs/sn5_baseline_aws.json). 
I perform an 80:20 training/validation split, making sure to point to the correct folder of satellite images and corresponding masks for training. The configuration parameters are explained in more detail in the [notebook under examples in GitHub for Intel Extension for PyTorch here](http://github.com/intel/intel-extension-for-pytorch/tree/master/examples/cpu/usecase_spacenet5).

### Training a ResNet34 + UNet Model

I made some changes to the `cresi` code described below in order to run on a CPU and optimize the training. To run natively on a CPU, replace `self.model = nn.DataParallel(model).cuda()` with `self.model = nn.DataParallel(model)` in the [train.py](https://github.com/avanetten/cresi/blob/main/cresi/net/pytorch_utils/train.py) script. In the [01_train.py](https://github.com/avanetten/cresi/blob/main/cresi/01_train.py) script, remove `torch.randn(10).cuda()`.

To optimize training, add `import intel_extension_for_pytorch as ipex` to the import statements in the [train.py](https://github.com/avanetten/cresi/blob/main/cresi/net/pytorch_utils/train.py) script. The model and optimizer are defined as follows:

```
self.model = nn.DataParallel(model)
self.optimizer = optimizer(self.model.parameters(), lr=config.lr)
```

Just after that, add the `ipex.optimize` line to use BF16 precision, instead of FP32:

```
self.model, self.optimizer = ipex.optimize(self.model,
    optimizer=self.optimizer, dtype=torch.bfloat16)
```

Add a line to do mixed-precision training just before running a forward pass and calculating the loss function:

```
with torch.cpu.amp.autocast():
    if verbose:
        print("input.shape, target.shape:", input.shape, target.shape)
    output = self.model(input)
    meter = self.calculate_loss_single_channel(output, target, meter, training, iter_size)
```

Now that we have optimized our training code, we can move onto training our model.

Like the [winner of the SpaceNet 5 competition](https://medium.com/the-downlinq/spacenet-5-winning-model-release-end-of-the-road-fd02e00b826c), I trained a ResNet34 encoder + UNet decoder model. It is pretrained from ImageNet weights, and the backbone is left completely unfrozen during training. The training can be run with the [01_train.py](https://github.com/avanetten/cresi/blob/main/cresi/01_train.py) script, but in order to control the use of hardware I used a helper script. There are actually two helper scripts: one that comes with stock PyTorch and one that comes with Intel Extension for PyTorch. They both accomplish the same thing, but the first one from stock is `torch.backends.xeon.run_cpu`, and the second one from Intel Extension for PyTorch is `ipexrun`.

Here is what I ran in the command-line:

```
python -m torch.backends.xeon.run_cpu --ninstances 1 \
  --ncores_per_instance 32 \
  --log_path /home/devcloud/spacenet5data/moscow/v10_xeon4_devcloud22.04/logs/run_cpu_logs \
  /home/devcloud/cresi/cresi/01_train.py \
  /home/devcloud/cresi/cresi/configs/ben/v10_xeon4_baseline_ben.json --fold=0
```

```
ipexrun --ninstances 1 \
--ncore_per_instance 32 \
/home/devcloud/cresi/cresi/01_train.py \
/home/devcloud/cresi/cresi/configs/ben/v10_xeon4_baseline_ben.json --fold=0
```
Upon running, I get a printout of what environment variables get set in the backend to understand how PyTorch is using the hardware: - - -``` -INFO - Use TCMalloc memory allocator -INFO - OMP_NUM_THREADS=32 -INFO - Using Intel OpenMP -INFO - KMP_AFFINITY=granularity=fine,compact,1,0 -INFO - KMP_BLOCKTIME=1 -INFO - LD_PRELOAD=/home/devcloud/.conda/envs/py39/lib/libiomp5.so:/home/devcloud/.conda/envs/py39/lib/libtcmalloc.so -INFO - numactl -C 0-31 -m 0 /home/devcloud/.conda/envs/py39/bin/python -u 01_train.py configs/ben/v10_xeon4_baseline_ben.json --fold=0 -``` - - -During training, I make sure that my total loss function is decreasing (i.e., the model is converging on a solution). - - -### Inference - -After training a model, we can start to make predictions from satellite images alone. In the eval.py inference script, add import intel_extension_for_pytorch as ipex to the import statements. After loading the PyTorch model, use Intel Extension for PyTorch to optimize the model for BF16 inference: - - -``` -model = torch.load(os.path.join(path_model_weights, - 'fold{}_best.pth'.format(fold)), - map_location = lambda storage, - loc: storage) -model.eval() -model = ipex.optimize(model, dtype = torch.bfloat16) -``` - - -Just prior to running prediction, add two lines for mixed precision: - - -``` -with torch.no_grad(): - with torch.cpu.amp.autocast(): - for data in pbar: - samples = torch.autograd.Variable(data['image'], volatile=True) - predicted = predict(model, samples, flips=self.flips) -``` - - -To run inference, we can use the [02_eval.py](https://github.com/avanetten/cresi/blob/main/cresi/02_eval.py) script. Now that we have a trained model, we can make predictions on satellite images (Figure 4). We can see that it does seem to map the roads closely to the image! - - -![Moscow satellite image and accompanying prediction of roads](/assets/images/f4-moscow-satellite-image-complete.png){:style="max-height:800px; max-width: 100%; display: block; margin-left: auto; margin-right: auto"} - -**Figure 4**. Moscow satellite image and accompanying prediction of roads (image by author) - -I realize that the model I’ve trained is overfit to the Moscow image data and probably won’t generalize well to other cities. However, the [winning solution to this challenge](http://medium.com/the-downlinq/spacenet-5-winning-model-release-end-of-the-road-fd02e00b826c) used data from six cities (Las Vegas, Paris, Shanghai, Khartoum, Moscow, Mumbai) and performs well on new cities. In the future, one thing that would be worth testing is training on all six cities and running inference on another city to reproduce their results. - - -## Note on Post-Processing - -There are further post-processing steps that can be performed to add the mask as graph features to maps. 
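To see the whole inference-side pattern in one place, here is a small self-contained sketch on a toy CPU model. The network and input shape are placeholders, not the actual cresi ResNet34 + UNet; in the real workflow the model comes from the fold checkpoint loaded above.

```
import torch
import intel_extension_for_pytorch as ipex

# Placeholder network standing in for the trained segmentation model.
model = torch.nn.Sequential(
    torch.nn.Conv2d(3, 8, kernel_size=3, padding=1),
    torch.nn.ReLU(),
    torch.nn.Conv2d(8, 1, kernel_size=1),
)
model.eval()

# Optimize the model for BF16 inference on the CPU.
model = ipex.optimize(model, dtype=torch.bfloat16)

# Run prediction under autocast so the heavy ops execute in BF16.
x = torch.randn(1, 3, 512, 512)
with torch.no_grad():
    with torch.cpu.amp.autocast():
        prediction = model(x)

print(prediction.shape, prediction.dtype)
```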
You can read more about the post-processing steps here: - -[The SpaceNet 5 Baseline — Part 3: Extracting Road Speed Vectors from Satellite Imagery](http://medium.com/the-downlinq/the-spacenet-5-baseline-part-3-extracting-road-speed-vectors-from-satellite-imagery-5d07cd5e1d21) - -[Post-processing scripts](https://github.com/avanetten/cresi/tree/main/cresi) - - -## Conclusions - -In summary, we: - -* Created 1,352 image training masks (with speed limits) to correspond to our training satellite image data (from .geojson text file labels) -* Defined our configuration file for training and inference -* Split up our data into training and validation sets -* Optimized our code for CPU training, including using Intel Extension for PyTorch and BF16 -* Trained a performant ResNet34 + UNet model on a 4th Gen Intel Xeon CPU -* Ran initial inference to see the prediction of a speed limit mask - -You can find [detailed benchmarks here for the 4th Gen Intel Xeon CPU here](http://edc.intel.com/content/www/us/en/products/performance/benchmarks/4th-generation-intel-xeon-scalable-processors/). - - -## Next Steps - -Extend the optimizations on an Intel CPU by using the Intel Extension for PyTorch: - -`pip install intel-extension-for-pytorch` - -`git clone https://github.com/intel/intel-extension-for-pytorch` - - -[Get in touch with me on LinkedIn](http://linkedin.com/in/bconsolvo) if you have any more questions! - -More information about the Intel Extension for PyTorch [can be found here](https://www.intel.com/content/www/us/en/developer/tools/oneapi/optimization-for-pytorch.html). - - -### Get the Software - -I encourage you to check out Intel’s other **[AI Tools](https://www.intel.com/content/www/us/en/developer/topic-technology/artificial-intelligence/tools.html)** and **[Framework](https://www.intel.com/content/www/us/en/developer/tools/frameworks/overview.html)** optimizations and learn about the open, standards-based **[oneAPI](https://www.intel.com/content/www/us/en/developer/tools/oneapi/overview.html)** multiarchitecture, multivendor programming model that forms the foundation of Intel’s AI software portfolio. - -For more details about 4th Gen Intel Xeon Scalable processor, visit **[AI Platform](https://www.intel.com/content/www/us/en/developer/topic-technology/artificial-intelligence/platform.html)** where you can learn about how Intel is empowering developers to run high-performance, efficient end-to-end AI pipelines. - - -### PyTorch Resources - -* [PyTorch Get Started](http://pytorch.org/get-started/pytorch-2.0/) -* [Dev Discussions](https://dev-discuss.pytorch.org/t/pytorch-release-2-0-execution-update/1077) -* [Documentation](http://pytorch.org/docs/2.0/) \ No newline at end of file diff --git a/_posts/2023-05-03-announcing-docathon.md b/_posts/2023-05-03-announcing-docathon.md deleted file mode 100644 index e44bb3c38612..000000000000 --- a/_posts/2023-05-03-announcing-docathon.md +++ /dev/null @@ -1,44 +0,0 @@ ---- -layout: blog_detail -title: "Announcing PyTorch Docathon 2023" ---- - -![PyTorch Docathon](/assets/images/docathon-cover.jpg){:style="max-height:800px; width:100%"} - - -We are excited to announce the first ever PyTorch Docathon! The Docathon is a hackathon-style event focused on improving the documentation by enlisting the help of the community. 
Documentation is a crucial aspect of any technology and by improving the documentation, we can make it easier for users to get started with PyTorch, help them understand how to use its features effectively, and ultimately accelerate research to production in the field of machine learning. - - -## WHY PARTICIPATE - - -### Low Barrier to Entry - -Many open-source projects require extensive knowledge of the codebase and prior contributions to the project to participate in any sort of hackathon events. The Docathon, on the other hand, is designed for newcomers. We do expect familiarity with Python, basic knowledge of PyTorch, and ML. But don't fret, there are some tasks that are related to website issues that won't require even that. - - -### Tangible Results - -One of the best things about the Docathon is that you can see the results of your efforts in real time. Improving documentation can have a huge impact on a project's usability and accessibility and you'll be able to see those improvements firsthand. Plus having tangible results can be a great motivator to keep contributing. - - -### Collaborative Environment - -The Docathon is a collaborative event which means you'll have the opportunity to work with other contributors and PyTorch maintainers on improving the documentation. This can be a great way to learn from others, share ideas, and build connections. - - -### Learning Opportunities - -Finally, even if you are not an expert in PyTorch, the Docathon can be a great learning experience. You'll have the opportunity to explore the PyTorch modules and test some of the tutorials on your machine as well as in the CI. - - -## EVENT DETAILS - -* **May 31**: Kick-off -* **May 31 - June 11**: Submissions and Feedback -* **June 12 - June 13**: Final Reviews -* **June 15**: Winner Announcements - -Details for the Docathon to be announced at the kick-off stream on May 31. - -Please register to join this year’s event: [**RSVP**](https://community.linuxfoundation.org/e/mmbqqb/) \ No newline at end of file diff --git a/_posts/2023-05-12-language-identification.md b/_posts/2023-05-12-language-identification.md deleted file mode 100644 index 02dbeb662a10..000000000000 --- a/_posts/2023-05-12-language-identification.md +++ /dev/null @@ -1,226 +0,0 @@ ---- -layout: blog_detail -title: "Language Identification: Building an End-to-End AI Solution using PyTorch" -author: Intel ---- - -Language Identification is the process of identifying the primary language from multiple audio input samples. In natural language processing (NLP), language identification is an important problem and a challenging issue. There are many language-related tasks such as entering text on your phone, finding news articles you enjoy, or discovering answers to questions that you may have. All these tasks are powered by NLP models. To decide which model to invoke at a particular point in time, we must perform language identification. - -This article presents an in-depth solution and code sample for language identification using [Intel® Extension for PyTorch](http://intel.github.io/intel-extension-for-pytorch/), which is a version of the popular PyTorch AI framework optimized for use on Intel® processors, and [Intel® Neural Compressor](https://www.intel.com/content/www/us/en/developer/tools/oneapi/neural-compressor.html), which is a tool to accelerate AI inference without sacrificing accuracy. 
- -The [code sample](http://github.com/oneapi-src/oneAPI-samples/tree/master/AI-and-Analytics/End-to-end-Workloads/LanguageIdentification) demonstrates how to train a model to perform language identification using the Hugging Face SpeechBrain* toolkit and optimize it using the [Intel® AI Analytics Toolkit (AI Kit)](https://www.intel.com/content/www/us/en/developer/tools/oneapi/ai-analytics-toolkit-download.html). The user can modify the code sample and identify up to 93 languages using the Common Voice dataset. - - -## Proposed Methodology for Language Identification - -In the proposed solution, the user will use an Intel AI Analytics Toolkit container environment to train a model and perform inference leveraging Intel-optimized libraries for PyTorch. There is also an option to quantize the trained model with Intel Neural Compressor to speed up inference. - - -### Dataset - -The [Common Voice](http://commonvoice.mozilla.org/en/datasets) dataset is used and for this code sample, specifically, Common Voice Corpus 11.0 for Japanese and Swedish. This dataset is used to train an [Emphasized Channel Attention, Propagation and Aggregation Time Delay Neural Network (ECAPA-TDNN)](http://arxiv.org/abs/2005.07143), which is implemented using the [Hugging Face SpeechBrain](http://huggingface.co/SpeechBrain) library. Time Delay Neural Networks (TDNNs), aka one-dimensional Convolutional Neural Networks (1D CNNs), are multilayer artificial neural network architectures to classify patterns with shift-invariance and model context at each layer of the network. ECAPA-TDNN is a new TDNN-based speaker-embedding extractor for speaker verification; it is built upon the original x-vector architecture and puts more emphasis on channel attention, propagation, and aggregation. - - -### Implementation - -After downloading the Common Voice dataset, the data is preprocessed by converting the MP3 files into WAV format to avoid information loss and separated into training, validation, and testing sets. - -A [pretrained VoxLingua107 ](http://huggingface.co/speechbrain/lang-id-voxlingua107-ecapa)model is retrained with the Common Voice dataset using the Hugging Face SpeechBrain library to focus on the languages of interest. [VoxLingua107](http://bark.phon.ioc.ee/voxlingua107/) is a speech dataset used for training spoken language recognition models that work well with real-world and varying speech data. This dataset contains data for 107 languages. By default, Japanese and Swedish are used, and more languages can be included. This model is then used for inference on the testing dataset or a user-specified dataset. Also, there is an option to utilize SpeechBrain's Voice Activity Detection (VAD) where only the speech segments from the audio files are extracted and combined before samples are randomly selected as input into the model. This [link](http://huggingface.co/speechbrain/vad-crdnn-libriparty) provides all the necessary tools to perform VAD. To improve performance, the user may quantize the trained model to integer-8 (INT8) using Intel Neural Compressor to decrease latency. - - -#### Training - -The copies of training scripts are added to the current working directory, including `create_wds_shards.py` - for creating the [WebDataset](http://github.com/webdataset/webdataset) shards, `train.py` - to perform the actual training procedure, and `train_ecapa.yaml` - to configure the training options. The script to create WebDataset shards and YAML file are patched to work with the two languages chosen for this code sample. 
- -In the data preprocessing phase, `prepareAllCommonVoice.py` script is executed to randomly select a specified number of samples to convert the input from MP3 to WAV format. Here, 80% of these samples will be used for training, 10% for validation, and 10% for testing. At least 2000 samples are recommended as the number of input samples and is the default value. - -In the next step, WebDataset shards are created from the training and validation datasets. This stores the audio files as tar files which allows writing purely sequential I/O pipelines for large-scale deep learning in order to achieve high I/O rates from local storage—about 3x-10x faster compared to random access. - -The YAML file will be modified by the user. This includes setting the value for the largest number for the WebDataset shards, output neurons to the number of languages of interest, number of epochs to train over the entire dataset, and the batch size. The batch size should be decreased if the CPU or GPU runs out of memory while running the training script. - -In this code sample, the training script will be executed with CPU. While running the script, “cpu” will be passed as an input parameter. The configurations defined in `train_ecapa.yaml` are also passed as parameters. - -The command to run the script to train the model is: - - -``` -python train.py train_ecapa.yaml --device "cpu" -``` - - -In the future, the training script train.py will be designed to work for Intel® GPUs such as the Intel® Data Center GPU Flex Series, Intel® Data Center GPU Max Series, and Intel® Arc™ A-Series with updates from Intel Extension for PyTorch. - -[Run the training script](http://github.com/oneapi-src/oneAPI-samples/tree/master/AI-and-Analytics/End-to-end-Workloads/LanguageIdentification#train-the-model-with-languages) to learn how to train the models and execute the training script. The 4th Generation Intel® Xeon® Scalable Processor is recommended for this [transfer learning](https://www.intel.com/content/www/us/en/developer/topic-technology/artificial-intelligence/training/transfer-learning.html) application because of its performance improvements through its Intel® Advanced Matrix Extensions (Intel® AMX) instruction set. - -After training, checkpoint files are available. These files are used to load the model for inference. - - -#### Inference - - -![Inference Pipeline](/assets/images/f1-inference-pipeline-language-identification.png){:style="max-height:800px; width:100%"} - - -The crucial step before running inference is to patch the SpeechBrain library’s pretrained `interfaces.py` file so that PyTorch TorchScript* can be run to improve the runtime. TorchScript requires the output of the model to be only tensors. - -Users can choose to run inference using the testing set from Common Voice or their own custom data in WAV format. The following are the options the inference scripts (`inference_custom.py and inference_commonVoice.py`) can be run with: - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
| Input Option | Description |
|---|---|
| `-p` | Specify the data path. |
| `-d` | Specify the duration of the wave sample. The default value is 3. |
| `-s` | Specify the size of sample waves. The default value is 100. |
| `--vad` | (`inference_custom.py` only) Enable the VAD model to detect active speech. The VAD option will identify speech segments in the audio file and construct a new .wav file containing only the speech segments. This improves the quality of speech data used as input into the language identification model. |
| `--ipex` | Run inference with optimizations from Intel Extension for PyTorch. This option will apply optimizations to the pretrained model. Using this option should result in performance improvements related to latency. |
| `--ground_truth_compare` | (`inference_custom.py` only) Enable comparison of prediction labels to ground truth values. |
| `--verbose` | Print additional debug information, like latency. |
        - - -The path to the data must be specified. By default, 100 audio samples of 3-seconds will be randomly selected from the original audio file and used as input to the language identification model. - -A small Convolutional Recurrent Deep Neural Network (CRDNN) pretrained on the [LibriParty](http://drive.google.com/file/d/1--cAS5ePojMwNY5fewioXAv9YlYAWzIJ/view) dataset is used to process audio samples and output the segments where speech activity is detected. This can be used in inference with the `--vad` option. - -From the figure below, the timestamps where speech will be detected is delivered from the CRDNN model, and these are used to construct a new, shorter audio file with only speech. Sampling from this new audio file will give a better prediction of the primary language spoken. - - -![Audio wave file visualization](/assets/images/f2-timestamps-delivered-from-crdnn-model.png){:style="max-height:800px; width:100%"} - - -[Run the inference script](http://github.com/oneapi-src/oneAPI-samples/tree/master/AI-and-Analytics/End-to-end-Workloads/LanguageIdentification#run-inference) yourself. An example command of running inference: - - -``` -python inference_custom.py -p data_custom -d 3 -s 50 --vad -``` - - -This will run inference on data you provide located inside the _data_custom_ folder. This command performs inference on 50 randomly selected 3-second audio samples with voice activity detection. - -If you want to run the code sample for other languages, download Common Voice Corpus 11.0 datasets for other languages. - - -## Optimizations with Intel Extension for PyTorch and Intel Neural Compressor - - -### PyTorch - -The Intel extension expands PyTorch with up-to-date features and optimizations for an extra performance boost on Intel hardware. Check out [how to install Intel Extension for PyTorch](http://github.com/intel/intel-extension-for-pytorch#installation). The extension can be loaded as a Python module or linked as a C++ library. Python users can enable it dynamically by importing `intel_extension_for_pytorch`. - - - -* The [CPU tutorial](http://intel.github.io/intel-extension-for-pytorch/cpu/latest/) gives detailed information about Intel Extension for PyTorch for Intel CPUs. Source code is available at the [master branch](https://github.com/intel/intel-extension-for-pytorch/tree/master). -* The [GPU tutorial](http://intel.github.io/intel-extension-for-pytorch/xpu/latest/) gives detailed information about Intel Extension for PyTorch for Intel GPUs. Source code is available at the [xpu-master branch](http://github.com/intel/intel-extension-for-pytorch/tree/xpu-master). - -To optimize the model for inference using Intel Extension for PyTorch, the `--ipex`option can be passed in. The model is optimized using the plug-in. TorchScript speeds up inference because PyTorch is run in graph mode. The command to run with this optimization is: - - -``` -python inference_custom.py -p data_custom -d 3 -s 50 --vad --ipex --verbose -``` - - -Note: The `--verbose` option is required to view the latency measurements. - -Auto-mixed precision such as bfloat16 (BF16) support will be added in a future release of the code sample. - - -### Intel Neural Compressor - -This is an open-source Python library that runs on CPUs or GPUs, which: - - - -* Performs model quantization to reduce the model size and increase the speed of deep learning inference for deployment. 
-* Automates popular methods such as quantization, compression, pruning, and knowledge distillation across multiple deep-learning frameworks. -* Is part of the AI Kit - -The model can be quantized from float32 (FP32) precision to integer-8 (INT8) by running the `quantize_model.py` script while passing in the path to the model and a validation dataset. The following code can be used to load this INT8 model for inference: - - -``` -from neural_compressor.utils.pytorch import load -model_int8 = load("./lang_id_commonvoice_model_INT8", self.language_id) -signal = self.language_id.load_audio(data_path) -prediction = self.model_int8(signal) -``` - - -Note that the original model is required when loading the quantized model. The command to quantize the trained model from FP32 to INT8 by using `quantize_model.py` is: - - -``` -python quantize_model.py -p ./lang_id_commonvoice_model -datapath $COMMON_VOICE_PATH/commonVoiceData/commonVoice/dev -``` - - - -## What’s Next? - -Try out the above code sample by upgrading the hardware to a 4th Generation Intel Xeon Scalable Processor with Intel AMX and identify up to 93 different languages from Common Voice datasets. - -We encourage you to learn more about and incorporate Intel’s other [AI/ML Framework optimizations](https://www.intel.com/content/www/us/en/developer/tools/frameworks/overview.html) and [end-to-end portfolio of tools](https://www.intel.com/content/www/us/en/developer/topic-technology/artificial-intelligence/tools.html) into your AI workflow. Also, visit [AI & ML page](https://www.intel.com/content/www/us/en/developer/topic-technology/artificial-intelligence/overview.html) covering Intel’s AI software development resources for preparing, building, deploying, and scaling your AI solutions. - -For more details about the new 4th Gen Intel Xeon Scalable processors, visit [Intel's AI Solution Platform portal](https://www.intel.com/content/www/us/en/developer/topic-technology/artificial-intelligence/platform.html) where you can learn how Intel is empowering developers to run end-to-end AI pipelines on these powerful CPUs. 
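To make the `--ipex` path described earlier more concrete, below is a minimal, hedged sketch (not taken from the code sample) of applying Intel Extension for PyTorch and TorchScript to an eval-mode model before inference; the toy model and random waveform are illustrative stand-ins for the sample's SpeechBrain classifier and real audio input.

```
# Hedged sketch: IPEX + TorchScript applied to a toy eval-mode model.
# The model and input below are placeholders, not the code sample's classes.
import torch
import intel_extension_for_pytorch as ipex

model = torch.nn.Sequential(
    torch.nn.Conv1d(1, 16, kernel_size=5),
    torch.nn.ReLU(),
    torch.nn.AdaptiveAvgPool1d(1),
    torch.nn.Flatten(),
    torch.nn.Linear(16, 2),   # e.g. two candidate languages
).eval()

example = torch.randn(1, 1, 48000)   # stand-in for a 3-second, 16 kHz waveform

# Apply Intel Extension for PyTorch optimizations, then run in graph mode via
# TorchScript, which is conceptually what the --ipex option does in the sample.
model = ipex.optimize(model)
with torch.no_grad():
    traced = torch.jit.trace(model, example)
    traced = torch.jit.freeze(traced)
    scores = traced(example)
print(scores.shape)   # torch.Size([1, 2])
```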
- - -### Useful resources - -* [Intel AI Developer Tools and resources](https://www.intel.com/content/www/us/en/developer/topic-technology/artificial-intelligence/overview.html) -* [oneAPI unified programming model](https://www.intel.com/content/www/us/en/developer/tools/oneapi/overview.html) -* [Official documentation - Intel® Optimization for TensorFlow*](https://www.intel.com/content/www/us/en/developer/tools/oneapi/optimization-for-tensorflow.html) -* [Official documentation - Intel® Neural Compressor](https://www.intel.com/content/www/us/en/developer/tools/oneapi/neural-compressor.html) -* [Accelerate AI Workloads with Intel® AMX](https://www.intel.com/content/www/us/en/products/docs/accelerator-engines/advanced-matrix-extensions/ai-solution-brief.html) - - -### Explore more AI code samples - -* [Optimize PyTorch Models using Intel® Extension for PyTorch (IPEX) Quantization](http://github.com/oneapi-src/oneAPI-samples/tree/master/AI-and-Analytics/Features-and-Functionality/IntelPytorch_Quantization) -* [PyTorch Training Optimizations with Advanced Matrix Extensions Bfloat16](http://github.com/oneapi-src/oneAPI-samples/tree/master/AI-and-Analytics/Features-and-Functionality/IntelPyTorch_TrainingOptimizations_AMX_BF16) -* [Intel® Neural Compressor TensorFlow* Getting Started](http://github.com/oneapi-src/oneAPI-samples/tree/master/AI-and-Analytics/Getting-Started-Samples/INC-Sample-for-Tensorflow) - - -See all code samples \ No newline at end of file diff --git a/_posts/2023-05-16-pytorch-conference-2023.md b/_posts/2023-05-16-pytorch-conference-2023.md deleted file mode 100644 index 251270fbaf04..000000000000 --- a/_posts/2023-05-16-pytorch-conference-2023.md +++ /dev/null @@ -1,46 +0,0 @@ ---- -layout: blog_detail -title: "PyTorch Conference 2023: Join us in San Francisco October 16-17" ---- - -![PyTorch Conference 2023](/assets/images/pytorch-conf-2023.png){:style="max-height:800px; width:100%"} - - -We’re thrilled to announce the upcoming [PyTorch Conference 2023](https://events.linuxfoundation.org/pytorch-conference/)! On October 16-17, the conference will showcase PyTorch 2.1, the next-generation release of the popular machine learning framework. As part of the Linux Foundation, the PyTorch Foundation Conference continues the tradition of bringing together leading researchers, developers, and academic communities to advance the education and development of end-to-end machine learning. - -The conference agenda features an engaging lineup of events, including an opening reception, engaging community and partner discussions, informative panels, poster sessions, enlightening use cases and community stories, as well as discussions on the latest trends in machine learning and deep learning development and deployment. - -## Call for Proposals - -We are now accepting speaker proposals for the conference until **July 21**. The program committee will carefully review all submissions, and selected speakers will be notified by **August 8**. We strongly encourage both experienced and first-time speakers to submit their proposals. This conference provides an excellent opportunity to connect with the PyTorch community, share your ideas, and showcase your work. - -When preparing your proposal, please consider the following guidelines: - -* What are you hoping to get from your presentation? -* What do you expect the audience to gain from your presentation? -* How will your presentation help better the open source ecosystem? 
- -To help you shape your proposal, here are some suggested topics for the conference: - -* Deployments on AWS, Azure -* Use cases and real-world applications -* Foundational models -* AI practices -* Production considerations -* PyTorch 2.X features and updates -* Training techniques and best practices -* Inference methodologies -* Hardware advancements and optimizations -* Edge computing applications -* Scalability solutions -* Latest research breakthroughs -* Optimization strategies -* Extending PyTorch through customizations and plugins - -We kindly request that you refrain from submitting sales or marketing pitches and avoid discussing unlicensed or closed-source technologies. Such talks tend to detract from the integrity of our events and are not well-received by conference attendees. - -## Register Today - -Registration is now open! Get your ticket today and secure your spot: [https://events.linuxfoundation.org/pytorch-conference/register/](https://events.linuxfoundation.org/pytorch-conference/register/) - -Thank you for your interest, and we look forward to a successful PyTorch Conference 2023! \ No newline at end of file diff --git a/_posts/2023-05-22-out-of-the-box-acceleration.md b/_posts/2023-05-22-out-of-the-box-acceleration.md deleted file mode 100644 index 7d61266a2f5e..000000000000 --- a/_posts/2023-05-22-out-of-the-box-acceleration.md +++ /dev/null @@ -1,181 +0,0 @@ ---- -layout: blog_detail -title: "Out of the box acceleration and memory savings of 🤗 decoder models with PyTorch 2.0" -author: Felix Marty, Younes Belkada, Hamid Shojanazeri, Driss Guessous ---- - -As part of PyTorch 2.0 release, an accelerated implementation of the attention mechanism as part of the “Better Transformer” project (and known in PyTorch as Accelerated Transformers) has been added natively into PyTorch as [`torch.nn.functional.scaled_dot_product_attention`](https://pytorch.org/docs/master/generated/torch.nn.functional.scaled_dot_product_attention.html?highlight=scaled_dot_product_attention#torch.nn.functional.scaled_dot_product_attention). This implementation leverages fused kernels from [FlashAttention](https://arxiv.org/abs/2205.14135) and [Memory-efficient attention](https://arxiv.org/abs/2112.05682), and supports both training and inference. - -We also release a notebook showcasing an example of this integration [here](https://colab.research.google.com/drive/1_zuAiiBFoFWpexxeWsTS694tCSlMYydo?usp=sharing) - -After seeing [20-30% speedups at inference for diffusion models](https://pytorch.org/blog/accelerated-diffusers-pt-20/), we went ahead and implemented an integration with 🤗 Transformers models through the [🤗 Optimum library](https://huggingface.co/docs/optimum/main/en/bettertransformer/overview). Similar to [the previous integration for encoder models](https://medium.com/pytorch/bettertransformer-out-of-the-box-performance-for-huggingface-transformers-3fbe27d50ab2), the integration replaces modules from Transformers with efficient implementations that use `torch.nn.functional.scaled_dot_product_attention`. 
The usage is as follows:

```
import torch
from optimum.bettertransformer import BetterTransformer
from transformers import AutoModelForCausalLM

with torch.device("cuda"):
    model = AutoModelForCausalLM.from_pretrained("gpt2-large", torch_dtype=torch.float16)

model = BetterTransformer.transform(model)

# do your inference or training here

# if training and want to save the model
model = BetterTransformer.reverse(model)
model.save_pretrained("fine_tuned_model")
model.push_to_hub("fine_tuned_model")
```

Summarizing our findings below about `torch.nn.functional.scaled_dot_product_attention`:
* It is most useful to fit larger models, sequence lengths, or batch sizes to train on given hardware.
* Memory footprint savings on GPU during training range from 20% to 110%+.
* Speedups during training range from 10% to 70%.
* Speedups during inference range from 5% to 20%.
* Standalone, for small head dimensions, `scaled_dot_product_attention` speedups go up to 3x, and memory savings go as high as 40x (depending on the sequence length).

You may be surprised by the wide range of memory savings and speedups. In this blog post, we discuss our benchmarks, where this feature shines, and upcoming improvements in future PyTorch releases.

_In the next release of transformers, you will just need to install the proper version of optimum and run:_

```
model = model.to_bettertransformer()
```

_to convert your model using the BetterTransformer API. You can already try this feature out by installing transformers from source._


## Benchmark and usage with 🤗 Transformers

`torch.nn.functional.scaled_dot_product_attention` is usable with any architecture that uses standard attention, and namely replaces the boilerplate code:

```
# native scaled_dot_product_attention is equivalent to the following:
import math
import torch

def eager_sdpa(query, key, value, attn_mask=None, dropout_p=0.0, is_causal=False, scale=None):
    L, S = query.size(-2), key.size(-2)
    scale_factor = 1 / math.sqrt(query.size(-1)) if scale is None else scale
    if is_causal:
        attn_mask = torch.ones(L, S, dtype=torch.bool, device=query.device).tril(diagonal=0)
    if attn_mask is not None and attn_mask.dtype == torch.bool:
        attn_mask = torch.zeros_like(attn_mask, dtype=query.dtype).masked_fill(~attn_mask, float("-inf"))
    attn_weight = query @ key.transpose(-2, -1) * scale_factor
    if attn_mask is not None:
        attn_weight = attn_weight + attn_mask
    attn_weight = torch.softmax(attn_weight, dim=-1)
    attn_weight = torch.dropout(attn_weight, dropout_p, True)  # train=True
    return attn_weight @ value
```

In the 🤗 Optimum integration with Transformers models, the following architectures are supported for now: gpt2, gpt-neo, gpt-neox, gptj, t5, bart, codegen, pegasus, opt, LLaMA, blenderbot, m2m100. You can expect this list to be extended in the near future!

To validate the benefits from the native scaled dot-product attention, we ran inference and training benchmarks, whose results are presented below.

![Inference benchmark on a single A10G GPU, AWS g5.4xlarge instance](/assets/images/out-of-the-box/Fig1.jpg){:style="max-height:800px; width:100%"}
Inference benchmark on a single A10G GPU, AWS g5.4xlarge instance

        - -![Training benchmark on a single A10G GPU, AWS g5.4xlarge instance](/assets/images/out-of-the-box/Fig2.jpg){:style="max-height:800px; width:100%"} -Training benchmark on a single A10G GPU, AWS g5.4xlarge instance - -

        - -![Training benchmark on a single A100-SXM4-80GB, Nvidia DGX](/assets/images/out-of-the-box/Fig3.jpg){:style="max-height:800px; width:100%"} -Training benchmark on a single A100-SXM4-80GB, Nvidia DGX - -

        - - -Out of this benchmark, the most interesting finding is that native SDPA allows for the usage of longer sequence lengths and batch sizes without running into out of memory issues. Moreover, up to 20% speedups can be seen during inference, and even larger during training. - -As seen on the training benchmarks, it appears that smaller head dimension brings higher speedups and memory savings, which we will discuss in the following section. - -The implementation supports multi-GPU settings as well, thanks to 🤗 Accelerate library by passing `device_map=”auto”` to the `from_pretrained` method. Here are some results for training on two A100-SXM4-80GB. - - -![Training benchmark on two A100-SXM4-80GB, Nvidia DGX, using 🤗 Accelerate library for distributed training](/assets/images/out-of-the-box/Fig4.jpg){:style="max-height:800px; width:100%"} -Training benchmark on two A100-SXM4-80GB, Nvidia DGX, using 🤗 Accelerate library for distributed training - -
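As a concrete illustration of the multi-GPU path, here is a minimal, hedged sketch (assuming 🤗 Accelerate is installed and several GPUs are visible) of loading a model with `device_map="auto"` before applying the BetterTransformer transform:

```
import torch
from optimum.bettertransformer import BetterTransformer
from transformers import AutoModelForCausalLM

# Shard the checkpoint across the visible GPUs via Accelerate's device_map,
# then swap in the SDPA-based attention implementations as before.
model = AutoModelForCausalLM.from_pretrained(
    "gpt2-large", torch_dtype=torch.float16, device_map="auto")
model = BetterTransformer.transform(model)
```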

        - -Note that some kernels support only the sm_80 compute capability (which is the one from A100 GPUs), which limits usability on a wide range of hardware, notably if the head dimension is not a power of two. For example, as of PyTorch 2.0.0 during training, opt-2.7b (headim=80) and gpt-neox-20b (headdim=96) can not dispatch to a kernel using flash attention, unless run on an A100 GPU. Better kernels may be developed in the future: https://github.com/pytorch/pytorch/issues/98140#issuecomment-1518101895 - -## Flash Attention, Memory-efficient attention & math differences - -The native `scaled_dot_product_attention` relies on three possible backend implementations: flash attention, memory-efficient attention, and the so-called math implementation which provides a hardware-neutral fallback for all PyTorch platforms. - -When fused kernels are available for a given problem size, flash-attention or memory-efficient attention will be used, effectively allowing for a lower memory footprint, as in the memory-efficient attention case O(N) memory allocations are done on the GPU global memory instead of the classic O(N^2) for the traditional eager attention implementation. With flash attention, a reduced number of memory accesses (read and writes) is expected, hence both giving speedups and memory savings. - -The “math” implementation is simply an [implementation using the PyTorch’s C++ API](https://github.com/pytorch/pytorch/blob/c263bd43e8e8502d4726643bc6fd046f0130ac0e/aten/src/ATen/native/transformers/attention.cpp#L812-L868). Interesting to note in this implementation is that the query and key tensors are scaled individually for numerical stability, thus launching two aten::div operations instead of possibly only one in an eager implementation that does not contain this optimization for numerical stability. - -### Head dimension influence on speedups, memory savings - -Benchmarking `torch.nn.functional.scaled_dot_product_attention`, we notice a decrease in the speedup / memory gains as the head dimension increases. This is an issue for some architectures like EleutherAI/gpt-neo-2.7B, that has a relatively large head dimension of 128, or EleutherAI/gpt-j-6B (and derived models as PygmalionAI/pygmalion-6b) that has a head dimension of 256 (that actually currently do not dispatch on fused kernels as the head dimension is too large). - -This trend can be seen in the figures below, where `torch.nn.scaled_dot_production` is benchmarked standalone versus the above eager implementation. Moreover, we use the [`torch.backends.cuda.sdp_kernel`](https://pytorch.org/docs/master/backends.html#torch.backends.cuda.sdp_kernel) context manager to force the usage of respectively math, flash attention, and memory-efficient attention implementation. - -![Using memory-efficient attention SDP kernel (forward-only), A100](/assets/images/out-of-the-box/Fig5.jpg){:style="max-height:800px; width:100%"} -Using memory-efficient attention SDP kernel (forward-only), A100 - -

        - -![Using math (without dropout), A100](/assets/images/out-of-the-box/Fig6.jpg){:style="max-height:800px; width:100%"} -Using math (without dropout), A100 - -

        - -![Using flash attention SDP kernel (without dropout), A100](/assets/images/out-of-the-box/Fig7.jpg){:style="max-height:800px; width:100%"} -Using flash attention SDP kernel (without dropout), A100 - -

        - -![Using memory-efficient attention SDP kernel (without dropout), A100](/assets/images/out-of-the-box/Fig8.jpg){:style="max-height:800px; width:100%"} -Using memory-efficient attention SDP kernel (without dropout), A100 - -
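For reference, here is a minimal sketch of how a specific SDPA backend can be forced with the `torch.backends.cuda.sdp_kernel` context manager, as done for the standalone benchmarks above; the shapes and dtype are illustrative, and a CUDA GPU is required.

```
import torch
import torch.nn.functional as F

# Illustrative shapes: batch=8, heads=16, seq_len=512, head_dim=64.
q, k, v = (torch.rand(8, 16, 512, 64, dtype=torch.float16, device="cuda")
           for _ in range(3))

# Allow only the flash attention backend; it dispatches only when the
# problem size (dtype, head dimension, hardware) is supported.
with torch.backends.cuda.sdp_kernel(enable_flash=True, enable_math=False,
                                    enable_mem_efficient=False):
    out = F.scaled_dot_product_attention(q, k, v, is_causal=True)
```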

        - -We see that for the same problem size, be it for inference-only or training, the speedup decreases with higher head dimension, e.g. from 3.4x for headdim=8 to 1.01x for headdim=128 using flash attention kernel. - -The reduced memory saving is expected with larger head dimensions. Recall the standard attention computation: - -![Math equation](/assets/images/out-of-the-box/Fig9.jpg){:style="max-height:800px; width:100%"} - - -Due to the intermediate computations, the global memory footprint is 2 * N * N + N * d in this standard step by step computation. Memory-efficient attention proposes to iteratively update the softmax renormalization constant and moving its computation at the very end, allowing for only a constant output memory allocation N * d. - -Thus, the memory saving ratio is 2 * N / d + 1, which decreases with larger head dimension. - -In flash attention, the tradeoff is between the head dimension d and the shared memory size M of a GPU streaming multiprocessor, with a total number of memory accesses of O(N² * d²/M). Thus, the memory accesses scale quadratically in the head dimension, contrary to the standard attention that scales linearly. The reason is that in flash attention, for larger head dimension d, the key and value K, V need to be split into more blocks to fit into shared memory, and in turn each block needs to load the full query Q and output O. - -Thus, the highest speedups for flash attention are in a regime where the ratio d² / M is small enough. - -## Current limitations as of PyTorch 2.0.0 - -### Absence of a scale argument - -As of PyTorch 2.0.0, `torch.nn.functional.scaled_dot_product_attention` has no scale argument and uses the default square root of the hidden size sqrt(d_k). - -![Math equation](/assets/images/out-of-the-box/Fig10.jpg){:style="max-height:800px; width:100%; max-width: 400px"} - - -However, some architectures as OPT or T5 do not use a scaling in the attention, which as of Pytorch 2.0.0 forces it to artificially rescale before the `scaled_dot_product_attention` call. This introduces an unnecessary overhead, as an additional multiplication is necessary, on top of unneeded divisions in the attention. - -A fix for this issue has been merged [in PyTorch repository](https://github.com/pytorch/pytorch/pull/95259). - -### Support of flash attention / memory-efficient attention with custom mask - -As of PyTorch 2.0.0, when passing a custom attention mask, flash attention and memory-efficient attention can not be used. In this case, `scaled_dot_product_attention` automatically dispatches to the C++ implementation. - -However, as we have seen, some architectures require a custom attention mask, as T5 that uses positional bias. Moreover, in the case of a batch size larger than one where some inputs may be padded, a custom attention mask also needs to be passed. For this latter case, an alternative would be to use [NestedTensor](https://pytorch.org/docs/stable/nested.html), which SDPA supports. - -This limited support for custom masks thus limits the benefits from SDPA in these specific cases, although we can hope for an extended support [in the future](https://github.com/pytorch/pytorch/issues/96099#issuecomment-1458609375). - -Note that xformers, from which PyTorch’s SDPA partially takes inspiration, currently supports arbitrary attention masks: https://github.com/facebookresearch/xformers/blob/658ebab39545f180a6075385b3897921623d6c3b/xformers/ops/fmha/cutlass.py#L147-L156 . 
HazyResearch implementation of flash attention also supports an equivalent implementation of padding, as a cumulative sequence length array is used along with packed query/key/values - similar in essence to NestedTensor. - -## In conclusion - -Using `torch.nn.functional.scaled_dot_product_attention` is a free-lunch optimization, both making your code more readable, uses less memory, and is in most common cases faster. - -Although the implementation in PyTorch 2.0.0 has still minor limitations, inference and training already massively benefit from SDPA in most cases. We encourage you to use this native implementation be it to train or deploy your PyTorch models, and for 🤗 Transformers models as a one-line transformation! - -In the future, we would like to adapt the API to enable users to use SDPA in encoder-based models as well. - -We thank Benjamin Lefaudeux, Daniel Haziza and Francisco Massa for their advice on the head dimension influence, as well as Michael Gschwind, Christian Puhrsch and Driss Guessous for their feedback on the blog post! - -## Benchmark reproduction - -The benchmark presented in this post was done using torch==2.0.0, transformers==4.27.4, accelerate==0.18.0 and optimum==1.8.0. - -The benchmarks can be easily reproduced using the scripts for [inference](https://github.com/huggingface/optimum/blob/main/tests/benchmark/benchmark_bettertransformer.py), [training](https://github.com/huggingface/optimum/blob/main/tests/benchmark/benchmark_bettertransformer_training_minimal.py) for 🤗 Transformers models, and [standalone SDPA](https://github.com/fxmarty/efficient-attention-benchmark). diff --git a/_posts/2023-06-07-join-pytorch.md b/_posts/2023-06-07-join-pytorch.md deleted file mode 100644 index c0de0ba7b5f3..000000000000 --- a/_posts/2023-06-07-join-pytorch.md +++ /dev/null @@ -1,74 +0,0 @@ ---- -layout: blog_detail -title: "Join the PyTorch Foundation: Membership Now Open" ---- - -In September 2022, we welcomed PyTorch to the Linux Foundation from Meta, which formed the PyTorch Foundation with founding members AMD, Amazon Web Services (AWS), Google, Meta, Microsoft, and NVIDIA. - -Since then, [we've seen significant growth](https://www.linuxfoundation.org/blog/pytorch-foundation-the-first-six-months), including a **39% increase in commits** across all repositories, **27% increase of unique contributors**, and a **12% increase community contributions** – all in the last 90 days! We’re grateful to our founding members for their support to move the foundation forward. - -Today, we’re announcing that **membership is now open to join the PyTorch Foundation**. - -As a member of the PyTorch Foundation, you'll have access to resources that allow you to be stewards of stable, secure, and long-lasting codebases. You can collaborate on training and certification programs, local and regional events, open source developer tooling, academic research, and guides to help new users and contributors have a productive experience. - -The PyTorch Foundation’s goal is to help end users navigate the PyTorch ecosystem, recruit talent, and adopt PyTorch and support open source AI technologies successfully. - -## Why join as a member - -Being a part of the PyTorch Foundation grants opportunities to help build the future of end-to-end machine learning frameworks alongside your industry peers. - -**Membership benefits include:** - -* Gain technical traction and insight for your organization's products by immersing your teams with other industry leaders. 
-* Influence technical priorities, approaches, and code. -* Support the PyTorch project community by helping fund programs and services that the project and its community rely on. -* Engage with the PyTorch project ecosystem, network with fellow members, and contribute to building and maintaining an engaging and strong PyTorch ecosystem. -* Provide thought leadership and participate in unique, wide-reaching networking and marketing programs expanding industry awareness as PyTorch amplifies member progress. -* Retain, attract, and increase engineering skills and employees and build your innovation partner network, supply chain, and customer pipeline. -* As an active member of the PyTorch community, you can deepen your engagement and leadership in local and industry developer networks and conferences. - -## How to join - -Commercial organizations are invited to apply for General membership, while non-profits and academic institutions are encouraged to apply for Associate membership. - -### Premier Members - -Organizations are welcome to submit an application to be considered as a Premier member. Premier members are the highest tier. They will appoint one voting representative in any subcommittees or activities of the PTF Governing Board, and receive prominent placement in displays of membership including website, landscape and marketing materials, exclusive live webinars with PyTorch online programs and everything included within a “general” membership. The annual fee is $150,000 + an LF Silver Membership. - -### General Members - -General members will participate in all marketing, community and thought leadership opportunities, as well as discounts on event sponsorships and training courses. General members also have the opportunity to be considered for a PTF board position. The annual fee is dependent on the size of your organization. More details can be found [here](http://pytorch.org/join). - -### Associate Members - -Associate members are free to join and will receive support and participation opportunities with the PyTorch Foundation team. More information can be found [here](http://pytorch.org/join). - -## Hear from our founding members - -### AMD - -“AMD strongly believes in and supports an open software ecosystem. We are very proud to be a founding member of the PyTorch Foundation, helping to develop an open and collaborative community for AI and ML. AI and ML have the opportunity to impact everything we do, and the work done through the PyTorch Foundation is critical in developing an open framework that is vendor neutral and helps democratize AI for all.” - -### AWS - -“AWS is a firm believer in the PyTorch Foundation mission to develop AI and deep learning tools through open collaboration. Our customers use PyTorch every day to build, train, and deploy machine learning models on AWS. Through our involvement, AWS is supporting innovation and helping to make open source tooling more accessible to our customers and the broader community.” - -### Google - -“The AI revolution is upon us and it’s being built on PyTorch. With new applications like ChatGPT and Stable Diffusion built on PyTorch, the wave of generative AI continues to be felt across every facet of society. 
We at Google are excited to be a founding member of the PyTorch Foundation and we’re excited for the opportunity to work closely with other leaders in AI to help grow this amazing and innovative community.” - -### Meta - -“Meta has a long history of putting open science at the core of our work in AI and PyTorch is no exception. PyTorch was built from the ground up with an open source, community-first philosophy. We transitioned PyTorch to the PyTorch Foundation because we believe this approach enables the fastest progress in building and deploying new systems that will address real-world needs and answer fundamental questions about the nature of intelligence. With the PyTorch Foundation, the entire AI community is positioned to push the field forward in countless exciting new ways.” - -### Microsoft - -“Microsoft believes strongly in PyTorch and it's been an honor to be a founding member of the PyTorch Foundation. Internally, we use PyTorch extensively, and an outgrowth of that is the Azure Container for PyTorch, which provides deep optimization for PyTorch development, including ONNX Runtime, DeepSpeed, and Nebula to greatly reduce training cost and accelerate training times on Azure Machine Learning. As part of our ongoing commitment to open source machine learning platforms, we look forward to partnering with industry leaders to continue contributing to the advancement of PyTorch.” - -### NVIDIA - -"As a leading Python-based AI framework, [PyTorch](https://www.nvidia.com/en-us/glossary/data-science/pytorch/) has been fundamental to the development of LLMs and GenAI. NVIDIA’s goal is to deepen our collaboration with the open-source AI community as part of the PyTorch Foundation, and help build the next wave of advanced, energy efficient, and cost-effective applications with [accelerated computing](https://www.nvidia.com/en-us/data-center/solutions/accelerated-computing/).” - -## Join today - -We are excited to see the PyTorch Foundation continue to grow alongside the community through neutral governance and support. We hope you’ll [join us as a member](/join)! \ No newline at end of file diff --git a/_posts/2023-06-16-docathon-h1-2023-wrap-up.md b/_posts/2023-06-16-docathon-h1-2023-wrap-up.md deleted file mode 100644 index 9fe1055ce95d..000000000000 --- a/_posts/2023-06-16-docathon-h1-2023-wrap-up.md +++ /dev/null @@ -1,21 +0,0 @@ ---- -layout: blog_detail -title: "🎉 PyTorch Docathon H1 2023 Wrap-up 🎉" ---- - -Thank you to all who participated in our first ever PyTorch Docathon, the results have been nothing short of amazing! We want to extend our sincerest gratitude to all the participants who made this event a resounding success. Your passion, talent, and hard work have left an indelible mark on the PyTorch documentation. - -The virtual Docathon ran from May 31 through June 15 with more than 230 registrants and more than 110 participants joining the Docathon Slack channel, the energy and enthusiasm were palpable. Entrants were judged on the difficulty of submissions that resulted in over 40 merged pull requests and the publication of four new tutorials and addition of one new example. - -We want to give a special shout-out to our top contributors, who went above and beyond during this event. Your dedication and expertise have been invaluable in enhancing the PyTorch documentation and empowering developers worldwide. See the full list of contributors [here](https://github.com/pytorch/tutorials/blob/main/docathon-leaderboard.md). 
- -Meet the top contributors: - -- First place: [JoseLuisC99](https://github.com/JoseLuisC99), [QasimKhan5x](https://github.com/QasimKhan5x), [bjhargrave](https://github.com/bjhargrave) -- Second place: [Aidyn-A](https://github.com/Aidyn-A), [CaoE](https://github.com/CaoE), [HemanthSai7](https://github.com/HemanthSai7), [leslie-fang-intel](https://github.com/leslie-fang-intel), [Valentine233](https://github.com/Valentine233) -- Third place: [TheMemoryDealer](https://github.com/TheMemoryDealer), [arunppsg](https://github.com/arunppsg), [noqqaqq](https://github.com/noqqaqq), [zabboud](https://github.com/zabboud), [kiersten-stokes](https://github.com/kiersten-stokes) -- Honorable mentions: [frasertajima](https://github.com/frasertajima), [nairbv](https://github.com/nairbv), [mikebrow](https://github.com/mikebrow), [NeoKish](https://github.com/NeoKish), [fabiogomez11c](https://github.com/fabiogomez11c) - -As we bring this Docathon to a close, we encourage each and every one of you to stay inspired and keep contributing to PyTorch [documentation](https://github.com/pytorch/tutorials#contributing) and [code](https://github.com/pytorch/pytorch/blob/main/CONTRIBUTING.md), and pushing the boundaries of what's possible with PyTorch. Your collective efforts are shaping the landscape of deep learning and fostering innovation in the AI community. - -Team PyTorch diff --git a/_posts/2023-06-22-optimized-pytorch-w-graviton.md b/_posts/2023-06-22-optimized-pytorch-w-graviton.md deleted file mode 100644 index aa58e8d717b0..000000000000 --- a/_posts/2023-06-22-optimized-pytorch-w-graviton.md +++ /dev/null @@ -1,194 +0,0 @@ ---- -layout: blog_detail -title: "Optimized PyTorch 2.0 Inference with AWS Graviton processors" -author: Sunita Nadampalli from AWS & Ankith Gunapal from Meta ---- - -New generations of CPUs offer significant performance improvement in machine learning (ML) inference due to specialized built-in instructions. Combined with their flexibility, high speed of development, and low operating cost, these general-purpose processors offer an alternative ML inference solution to other existing hardware solutions. - -AWS, Arm, Meta, and others helped optimize the performance of PyTorch 2.0 inference for Arm-based processors. As a result, we are delighted to announce that Arm-based AWS Graviton instance inference performance for PyTorch 2.0 is up to 3.5 times the speed for ResNet-50 compared to the previous PyTorch release, and up to 1.4 times the speed for BERT, making Graviton-based instances the fastest compute optimized instances on AWS for these models (see the following graph). - -![Relative speed improvement achieved by upgrading PyTorch to 2.0](/assets/images/optimized/im1.png){:style="max-height:800px; width:100%"} - -**Image 1**: Relative speed improvement achieved by upgrading from PyTorch version 1.13 to 2.0 (higher is better). The performance is measured on c7g.4xlarge instances. - -As shown in the next graph, we measured up to 50% cost savings for PyTorch inference with Graviton3-based c7g instances across Torch Hub ResNet-50 and multiple Hugging Face models compared to comparable x86-based compute optimized Amazon EC2 instances. For that graph, we first measured the cost per million inference for the five instance types. Then, we normalized the cost per million inference results to a c5.4xlarge instance, which is the baseline measure of “1” on the Y-axis of the chart. 
- -![Relative cost of PyTorch inference running on different AWS instances](/assets/images/optimized/im2.png){:style="max-height:800px; width:100%"} - -**Image 2**: Relative cost of PyTorch inference running on different AWS instances (lower is better).
Source: AWS ML Blog on [Graviton PyTorch 2.0 inference performance](https://aws.amazon.com/blogs/machine-learning/optimized-pytorch-2-0-inference-with-aws-graviton-processors/).
        - - - -Similar to the preceding inference cost comparison graph, the following graph shows the model p90 latency for the same five instance types. We normalized the latency results to the c5.4xlarge instance, which is the baseline measure of “1” on the Y-axis of the chart. The c7g.4xlarge (AWS Graviton3) model inference latency is up to 50% better than the latencies measured on c5.4xlarge, c6i.4xlarge, and c6a.4xlarge. \ - -![Relative latency (p90) of PyTorch inference running on different AWS instances](/assets/images/optimized/im3.png){:style="max-height:800px; width:100%"} - -**Image 3**: Relative latency (p90) of PyTorch inference running on different AWS instances (lower is better).
Source: AWS ML Blog on [Graviton PyTorch 2.0 inference performance](https://aws.amazon.com/blogs/machine-learning/optimized-pytorch-2-0-inference-with-aws-graviton-processors/).
        - - -## Optimization details - -PyTorch supports Compute Library for the Arm® Architecture (ACL) GEMM kernels via the oneDNN backend (previously called “MKL-DNN”) for AArch64 platforms. The optimizations are primarily for PyTorch ATen CPU BLAS, ACL kernels for fp32 and bfloat16, and oneDNN primitive caching. There are no frontend API changes, so no changes are required at the application level to get these optimizations working on Graviton3-based instances. - - -### PyTorch level optimizations - -We extended the ATen CPU BLAS interface to accelerate more operators and tensor configurations via oneDNN backend for aarch64 platform. The following diagram highlights (in orange) the optimized components that improved the PyTorch inference performance on aarch64 platform. - -![PyTorch software stack highlighting (in orange) the components optimized for inference performance improvement on AArch64 platform](/assets/images/optimized/im4.png){:style="max-height:800px; width:100%"} - -**Image 4**: PyTorch software stack highlighting (in orange) the components optimized for inference performance improvement on AArch64 platform - - -### ACL kernels and BFloat16 FPmath mode - -The ACL library provides Neon and SVE optimized GEMM kernels for both fp32 and bfloat16 formats: These kernels improve the SIMD hardware utilization and reduce the end to end inference latencies. The bfloat16 support in Graviton3 allows efficient deployment of models trained using bfloat16, fp32 and Automatic Mixed Precision (AMP). The standard fp32 models use bfloat16 kernels via oneDNN FPmath mode without model quantization. They provide up to two times faster performance compared to existing fp32 model inference without bfloat16 FPmath support. For more details on ACL GEMM kernel support, refer to [Arm Compute Library github](https://github.com/ARM-software/ComputeLibrary). - - -### Primitive Caching - -The following call sequence diagram shows how ACL operators are integrated into oneDNN backend. As shown in the diagram, ACL objects are handled as oneDNN resources instead of the primitive objects. This is because the ACL objects are stateful and mutable. Since the ACL objects are handled as resource objects, they are not cacheable with the default primitive caching feature supported in oneDNN. We implemented primitive caching at ideep operator level for “convolution”, “matmul” and “inner product” operators to avoid redundant GEMM kernel initialization and tensor allocation overhead. - -![Call sequence diagram showing how the Compute Library for the Arm® Architecture (ACL) GEMM kernels are integrated into oneDNN backend](/assets/images/optimized/im5.png){:style="max-height:800px; width:100%"} - -**Image 5**: Call sequence diagram showing how the Compute Library for the Arm® Architecture (ACL) GEMM kernels are integrated into oneDNN backend - - -## How to take advantage of the optimizations - -Install the PyTorch 2.0 wheel from the official repo and set environment variables to enable the additional optimizations. 
- -``` -# Install Python -sudo apt-get update -sudo apt-get install -y python3 python3-pip - -# Upgrade pip3 to the latest version -python3 -m pip install --upgrade pip - -# Install PyTorch and extensions -python3 -m pip install torch -python3 -m pip install torchvision torchaudio torchtext - -# Turn on Graviton3 optimization -export DNNL_DEFAULT_FPMATH_MODE=BF16 -export LRU_CACHE_CAPACITY=1024 -``` - - -## Running an inference - - -You can use PyTorch [torchbench](https://github.com/pytorch/benchmark) to measure the CPU inference performance improvements, or to compare different instance types. - - -``` -# Pre-requisite: -# pip install PyTorch2.0 wheels and set the above mentioned environment variables - -# Clone PyTorch benchmark repo -git clone https://github.com/pytorch/benchmark.git - -# Setup ResNet-50 benchmark -cd benchmark -python3 install.py resnet50 - -# Install the dependent wheels -python3 -m pip install numba - -# Run ResNet-50 inference in jit mode. On successful completion of the inference runs, -# the script prints the inference latency and accuracy results -python3 run.py resnet50 -d cpu -m jit -t eval --use_cosine_similarity -``` - - - -## Performance Analysis - -Now, we will analyze the inference performance of ResNet-50 on Graviton3-based c7g instance using PyTorch profiler. We run the code below with PyTorch 1.13 and PyTorch 2.0 and run the inference for a few iterations as a warmup before measuring the performance. - -``` -# Turn on Graviton3 optimization -export DNNL_DEFAULT_FPMATH_MODE=BF16 -export LRU_CACHE_CAPACITY=1024 -``` - -``` -import torch -from torchvision import models -sample_input = [torch.rand(1, 3, 224, 224)] -eager_model = models.resnet50(weights=models.ResNet50_Weights.DEFAULT) -model = torch.jit.script(eager_model, example_inputs=[sample_input, ]) - -model = model.eval() -model = torch.jit.optimize_for_inference(model) - -with torch.no_grad(): - # warmup runs - for i in range(10): - model(*sample_input) - prof = torch.profiler.profile( - on_trace_ready=torch.profiler.tensorboard_trace_handler('./logs'), record_shapes=True, with_stack=True) - # profile after warmup - prof.start() - model(*sample_input) - prof.stop() -``` - -We use tensorboard to view results of the profiler and analyze model performance. - -Install PyTorch Profiler Tensorboard plugin as follows - -``` -pip install torch_tb_profiler -``` - -Launch the tensorboard using - -``` -tensorboard --logdir=./logs -``` - -Launch the following in the browser to view the profiler output. The profiler supports ‘Overview’, ‘Operator’, ‘Trace’ and ‘Module’ views to get insight into the inference execution. - -``` -http://localhost:6006/#pytorch_profiler -``` - -The following diagram is the profiler ‘Trace’ view which shows the call stack along with the execution time of each function. In the profiler, we selected the forward() function to get the overall inference time. As shown in the diagram, the inference time for the ResNet-50 model on Graviton3-based c7g instance is around 3 times faster in PyTorch 2.0 compared to PyTorch 1.13. - -![Profiler Trace view: Forward pass wall duration on PyTorch 1.13 and PyTorch 2.0](/assets/images/optimized/im6.png){:style="max-height:800px; width:100%"} - -**Image 6**: Profiler Trace view: Forward pass wall duration on PyTorch 1.13 and PyTorch 2.0 - -The next diagram is the ‘Operator’ view which shows the list of PyTorch operators and their execution time. 
Similar to the preceding Trace view, the Operator view shows that the operator host duration for the ResNet-50 model on Graviton3-based c7g instance is around 3 times faster in PyTorch 2.0 compared to PyTorch 1.13. - -![Profiler Operator view: Forward operator Host duration on PyTorch 1.13 and PyTorch 2.0](/assets/images/optimized/im7.png){:style="max-height:800px; width:100%"} - -**Image 7**: Profiler Operator view: Forward operator Host duration on PyTorch 1.13 and PyTorch 2.0 - - -## Benchmarking Hugging Face models - -You can use the[ Amazon SageMaker Inference Recommender](https://docs.aws.amazon.com/sagemaker/latest/dg/inference-recommender.html) utility to automate performance benchmarking across different instances. With Inference Recommender, you can find the real-time inference endpoint that delivers the best performance at the lowest cost for a given ML model. We collected the preceding data using the Inference Recommender notebooks by deploying the models on production endpoints. For more details on Inference Recommender, refer to the[ amazon-sagemaker-examples](https://github.com/aws/amazon-sagemaker-examples/blob/main/sagemaker-inference-recommender/huggingface-inference-recommender/huggingface-inference-recommender.ipynb) GitHub repo. We benchmarked the following models for this post:[ ResNet50 image classification](https://pytorch.org/hub/pytorch_vision_resnet/),[ DistilBERT sentiment analysis](https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english),[ RoBERTa fill mask](https://huggingface.co/roberta-base), and[ RoBERTa sentiment analysis](https://huggingface.co/cardiffnlp/twitter-roberta-base-sentiment). - - -## Conclusion - - -For PyTorch 2.0, the Graviton3-based C7g instance is the most cost-effective compute optimized Amazon EC2 instance for inference. These instances are available on[ SageMaker](https://aws.amazon.com/about-aws/whats-new/2022/10/amazon-sagemaker-adds-new-graviton-based-instances-model-deployment/) and[ Amazon EC2](https://aws.amazon.com/ec2/instance-types/c7g/). The[ AWS Graviton Technical Guide](https://github.com/aws/aws-graviton-getting-started) provides the list of optimized libraries and best practices that will help you achieve cost benefit with Graviton instances across different workloads. - -If you find use cases where similar performance gains are not observed on Graviton, please open an issue on the [aws-graviton-getting-started](https://github.com/aws/aws-graviton-getting-started) github to let us know about it. We will continue to add more performance improvements to make AWS Graviton-based instances the most cost-effective and efficient general purpose processor for inference using PyTorch. - - -## Acknowledgments - -We would like to thank Ali Saidi (Sr. Principal Engineer) and Csaba Csoma (Sr. Manager, Software Development) from AWS, Ashok Bhat (Sr. Product Manager), Nathan Sircombe (Sr. Engineering Manager) and Milos Puzovic (Principal Software Engineer) from Arm for their support during the Graviton PyTorch inference optimization work. We would also like to thank Geeta Chauhan (Engineering Leader, Applied AI) from Meta for her guidance on this blog. - - -## About the authors - -**Sunita Nadampalli** is a ML Engineer and Software Development Manager at AWS. - -**Ankith Gunapal** is an AI Partner Engineer at Meta(PyTorch). 
diff --git a/_posts/2023-06-28-path-achieve-low-inference-latency.md b/_posts/2023-06-28-path-achieve-low-inference-latency.md deleted file mode 100644 index 26fed2617e70..000000000000 --- a/_posts/2023-06-28-path-achieve-low-inference-latency.md +++ /dev/null @@ -1,357 +0,0 @@ ---- -layout: blog_detail -title: "The Path to Achieve Ultra-Low Inference Latency With LLaMA 65B on PyTorch/XLA" -author: Milad Mohammadi, Jiewen Tan, Liyang Lu, Siyuan Liu, Yeounoh Chung, Wonjoo Lee, Manfei Bai, Steven Krawczyk, Shauheen Zahirazami, Alex Wertheim, Meghan Cowan, Jack Cao, Joe Spisak ---- - -## Background & State of the Art - -In the natural language processing (NLP) space, language models are designed to generate a token (e.g. word) using a sequence of past input tokens. Large Language Models (LLMs) are the latest deep learning innovation in this space built to generate text in a human-like fashion. These models generally use [transformers](https://arxiv.org/pdf/1706.03762.pdf) to improve their attention over a large sequence of input tokens. - -[LLaMA](https://ai.facebook.com/blog/large-language-model-llama-meta-ai/), open sourced by [Meta AI](https://ai.facebook.com/), is a powerful foundation LLM trained on over 1T tokens. LLaMA is competitive with many best-in-class models such as [GPT-3](https://openai.com/blog/gpt-3-apps), [Chinchilla](https://arxiv.org/pdf/2203.15556.pdf), [PaLM](https://arxiv.org/pdf/2204.02311.pdf). [LLaMA (13B) outperforms GPT-3 (175B)](https://arxiv.org/pdf/2302.13971.pdf) highlighting its ability to extract more compute from each model parameter. - -In this blog post, we use LLaMA as an example model to demonstrate the capabilities of PyTorch/XLA for LLM inference. We discuss how the computation techniques and optimizations discussed here improve inference latency by 6.4x on 65B parameter LLaMA models powered by Google Cloud TPU v4 (v4-16). - - -## Model Overview - -We demonstrate the performance capabilities of PyTorch/XLA on [LLaMA](https://github.com/facebookresearch/llama), the latest LLM from Meta. We showcase performance optimizations on a series of common LLaMA configurations. Notice the 175B parameter model configuration is absent in the public domain. For the 175B parameter model mentioned below, we apply [OPT 175B model configuration](https://github.com/huggingface/transformers/blob/v4.27.2/src/transformers/models/opt/modeling_opt.py#L804) to the LLaMA code base. Unless stated otherwise, in all configurations, we use `max_seq_len=256` and `dtype=bfloat16` for weights and activations. - - -#### Table 1: Model Configurations Explored in this article - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
**LLaMA Model Hyper Parameters**

| # Parameters | Dimensions | N Heads | N Layers | Max Seq Len |
|--------------|------------|---------|----------|-------------|
| 7B           | 4,096      | 32      | 32       | 256         |
| 33B          | 6,656      | 52      | 60       | 256         |
| 65B          | 8,192      | 64      | 80       | 256         |
| 175B         | 12,288     | 96      | 96       | 256         |
## Performance Challenges of LLMs - -LLMs have a few properties that make them challenging for compiler optimizations. (a) LLMs use autoregressive decoding to generate the next token based on the previous ones; this means prompt tensors and caches have a dynamic shape. (b) LLMs must work with variable input prompt lengths without triggering recompilation due to input tensor shape changes; input tensors must be properly bucketized and padded to avoid recompilation. (c) LLMs often require more memory than a single TPU (or GPU) device can support. A model-sharding scheme is required to fit the model across a distributed compute architecture. For instance, a LLaMA model with 65B parameters can fit on a v4-16 Cloud TPU, which is comparable to 8 A100 GPUs. (d) Running LLMs in production can be expensive; one way to improve performance per total cost of ownership (Perf/TCO) is via quantization; quantization can potentially reduce hardware requirements. - - -## Inference Tech Stack in PyTorch/XLA - -Our goal is to offer the AI community a high-performance inference stack. PyTorch/XLA integrates with [TorchDynamo](https://pytorch.org/docs/stable/torch.compiler), [PjRt](https://pytorch.org/blog/pytorch-2.0-xla/#pjrt-runtime-beta), [OpenXLA](https://pytorch.org/blog/pytorch-2.0-xla-path-forward/), and various model parallelism schemes. TorchDynamo eliminates tracing overhead at runtime; PjRt enables efficient host-device communication; PyTorch/XLA traceable collectives enable model and data parallelism on LLaMA via [TorchDynamo](https://pytorch.org/docs/stable/torch.compiler). To try our results, please use our custom [torch](https://storage.googleapis.com/tpu-pytorch/wheels/tpuvm/torch-nightly+20230422-cp38-cp38-linux_x86_64.whl), [torch-xla](https://storage.googleapis.com/tpu-pytorch/wheels/tpuvm/torch_xla-nightly+20230422-cp38-cp38-linux_x86_64.whl) wheels to reproduce our [LLaMA inference solution](https://github.com/pytorch-tpu/llama/tree/blog). PyTorch/XLA 2.1 will support the features discussed in this post by default. - - -## Parallel Computing - - -### [FairScale](https://github.com/facebookresearch/fairscale) Sharding - -LLaMA uses the FairScale model sharding API ([fairscale.nn.model_parallel.layers](https://github.com/facebookresearch/llama/blob/main/llama/model.py#L13-L17)). We built an equivalent representation of this API using PyTorch/XLA communication collective (CC) ops such as `all-reduce` to communicate program state (e.g. activations) between accelerators. TorchDynamo does not fully support capturing CC ops currently (a.k.a. [traceable collectives](https://github.com/pytorch/pytorch/issues/93173)). Without this support, a TorchDynamo FX graph would be cut at every device communication, meaning at every model layer. Graph cuts lead to performance loss as the underlying XLA compiler loses full graph optimization opportunities. To resolve this, we offer PyTorch/XLA traceable collectives by integrating the dispatcher collectives into our existing CC APIs. The difference is we don’t need to insert `c10d.wait()` ops after collectives, given the lazy execution nature of PyTorch/XLA. With support for traceable collectives, PyTorch/XLA allows singular FX graph generation in TorchDynamo. - - -## Autoregressive Decoding on PyTorch/XLA - -LLMs need autoregressive decoding to feed the previous word as a prompt to predict the next token. Autoregressive decoding leads to unbounded dynamic shape problems, which in turn causes recompilation of every prompt.
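To make the recompilation problem concrete, the sketch below (our own illustration, not code from the LLaMA repository) contrasts a naive decode loop, where the sequence tensor grows and therefore changes shape at every step, with a fixed-shape loop that writes into a pre-allocated buffer. Under a tracing compiler such as PyTorch/XLA, every new shape in the naive version corresponds to a new graph and a fresh compilation.

```python
import torch

vocab_size, max_seq_len = 32000, 16
prompt = torch.randint(vocab_size, (1, 4))

# Naive decoding: torch.cat grows the sequence, so its shape changes every
# step ((1, 5), (1, 6), ...). Each new shape is a new graph for the compiler.
seq = prompt
for _ in range(max_seq_len - prompt.shape[1]):
    next_token = torch.randint(vocab_size, (1, 1))  # stand-in for model output
    seq = torch.cat([seq, next_token], dim=1)

# Fixed-shape decoding: pre-allocate the output buffer once and write tokens
# into it by index, so every step sees the same (1, max_seq_len) shape.
buf = torch.zeros(1, max_seq_len, dtype=torch.long)
buf[:, : prompt.shape[1]] = prompt
for pos in range(prompt.shape[1], max_seq_len):
    next_token = torch.randint(vocab_size, (1, 1))
    buf.index_copy_(1, torch.tensor([pos]), next_token)  # same shape every step
```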
We optimized the LLaMA autoregressive decoder to operate with fixed shapes, updating the KV-cache, output sequences, and attention masks in place during every token generation step. With a combination of padding, masking, and index ops, we avoided excessive graph recompilation, thereby achieving efficient autoregressive decoding. - - -### KV-Cache Optimization - -LLaMA implements autoregressive decoding with KV-cache. For every generated token, the KV-cache stores the attention key/value activations of each Transformer layer. Thus, upon decoding a new token, the key/values of prior tokens no longer need recomputation. - -In LLaMA, the KV-cache tensor slices are updated in-place; this leads to recompilation events every time a token is generated. To address this issue, we use index tensors and `tensor.index_copy()` ops to replace the in-place slice updates. Attention masks and output sequences also benefit from the same optimization. - - -## Input Prompt Optimization - -Variable-length input prompts are common in LLM applications. This property causes input tensor shape dynamism and in turn recompilation events. When processing a prompt to fill the KV-cache, we either (a) process the input prompt token-by-token, or (b) process the whole prompt in one iteration. The pros and cons of each method are: - -1. Pre-compile 1 graph and process a prompt token-by-token - * Practical: 1 graph is compiled during warm-up - * Slow: *O(L)* to process an input prompt of length *L* - a disadvantage for long prompts -2. Pre-compile all graphs with input lengths ranging from 1 to max_seq_len (e.g. 2,048) - * Impractical: pre-compile and cache *max_seq_len* graphs during warm-up time - * Fast: 1 graph execution to process the full prompt - -We introduce prompt length bucketization, an optimization to strike a balance between the two alternatives. We define a set of ascending bucket sizes, *(b0,b1,b2,...,bB-1)*, and then pre-compile program graphs with input sizes according to these bucket values, *(G0,G1,G2,...,GB-1)*; *B* is the number of buckets. For a given input prompt, we round up the prompt length to the closest bucket value *bn*, pad the sequence, and use *Gn* to process the prompt in one iteration. The computation on the padding tokens is discarded. For prompts larger than the largest bucket size, we process them section-by-section. - -The optimal bucket sizes should be determined by prompt length distribution in a target application. Here, we adopt bucket lengths: 128, 256, 384, 512. Any input prompt with up to 2,047 tokens requires up to 4 graph executions. For example, a 1,500-token input prompt with a generation length of 256 requires 260 graph executions - 4 to process the input, and 256 to generate the output. - - -## Quantization - -Quantization reduces the number of bits necessary to represent a value; it reduces the bandwidth to communicate data across multiple accelerator nodes (via collectives) and lowers the hardware requirements to serve a specific model size. - -Normally, with `BF16` weights, a 175B parameter model would consume about 351GB of memory, and therefore require a v4-32 instance to accommodate the model. By quantizing the weights to `INT8`, we reduced the model size by roughly 50%, allowing it to run on a smaller v4-16 instance. Because LLaMA shards model activations, quantization offers negligible communication gain. - -In our experiments, we quantized the linear layer.
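As a rough illustration of what weight-only quantization of a linear layer can look like, here is a simplified sketch of the general technique (not the code used in these experiments): the weight matrix is stored in `INT8` with per-output-channel scales and dequantized on the fly before the matmul.

```python
import torch

def quantize_weight_int8(w: torch.Tensor):
    # Per-output-channel symmetric quantization: one scale per row of w.
    scale = w.abs().amax(dim=1, keepdim=True) / 127.0
    w_int8 = torch.clamp(torch.round(w / scale), -128, 127).to(torch.int8)
    return w_int8, scale

def int8_linear(x: torch.Tensor, w_int8: torch.Tensor, scale: torch.Tensor, bias=None):
    # Weights stay in INT8 in memory; dequantize just before the matmul.
    w = w_int8.to(x.dtype) * scale
    return torch.nn.functional.linear(x, w, bias)

w = torch.randn(4096, 4096, dtype=torch.bfloat16)
x = torch.randn(8, 4096, dtype=torch.bfloat16)
w_int8, scale = quantize_weight_int8(w)   # 1 byte per weight instead of 2
out = int8_linear(x, w_int8, scale)       # approximates x @ w.T
```

Storing the weights in `INT8` roughly halves their memory footprint relative to `BF16`, which is the effect described above that allows the 175B configuration to fit on a smaller TPU slice.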
Since LLaMA model checkpoints are unavailable publicly, and our goal is to evaluate performance, the quantized model is initialized with random weights. Recent literature such as [AWQ](https://arxiv.org/pdf/2306.00978.pdf) and [Integer or Floating Point?](https://arxiv.org/pdf/2305.12356.pdf) offers insights into performance properties of LLaMA under various low-bit quantization schemes. - - -### Effect of Batch Size on Quantization Performance - - -[TPU v4](https://arxiv.org/pdf/2304.01433.pdf) is programmed to run `matmul` on the Matrix Multiply Unit (MXU) when the model batch size (BS) > 1. For BS = 1, `matmul` runs on the Vector Processor Unit (VPU). Since MXU is more efficient than VPU, `INT8` quantization gains performance at BS>1. See the [Performance Analysis](#heading=h.4xqv3t16rl42) section for details. - - -## Op Support - -Occasionally, new models introduce new mathematical operations that require PyTorch/XLA to extend its supported op set for compilation. For LLaMA, we supported [multinomial](https://github.com/pytorch/xla/issues/4839). - - -## Methodology - -LLaMA works on PyTorch/XLA out of the box on LazyTensorCore. We use this configuration as a baseline for our follow-up analysis. All experiments assume 256-long input prompts. In the absence of a publicly available model checkpoint, we used random tensor initialization for this inference stack optimization effort. A model checkpoint is not expected to change the latency results discussed here. - - -### Model Sizing - -Assuming `N` is the number of parameters, `dimensions` is the hidden size, `n_layers` is the number of layers, `n_heads` is the number of attention heads, the equation below can be used to approximate the model size. See the [Model Overview](#heading=h.tehlvi942ssk) section for details. - - -``` -N = (dimensions)^2 * n_layers * 12 -``` - - -`n_heads` doesn’t affect `N`, but the following equation holds for the open-sourced model configs. - - -``` -dim = 128 * n_heads -``` - -#### Cache Sizing - -Both model parameters and the cache layers in the Attention block contribute to memory consumption. Since the default LLaMA model uses `BF16` weights, the memory consumption calculation in this section is based on `BF16` weights. - -The size of the cache layer is calculated by `cache_size = max_batch_size * max_seq_len * dimensions`. `max_batch_size = 1` and `max_seq_len = 256` are used as an example configuration in the following calculations. There are 2 cache layers in each Attention block. So, the total LLaMA cache size (in bytes) is `total_cache_size = n_layers * 2 * cache_size * (2 bytes)`. - - -#### TPU v4 Hardware Sizing - -Each TPU v4 chip has 32 GB of available High-Bandwidth Memory (HBM). Table 2 has the details on memory consumption and the number of required TPU chips to hold a LLaMA model. - -#### Table 2: LLaMA TPU v4 HBM requirements (i.e. TPU v4 chip requirements)
| # Parameters | Parameters (MB) | Cache (MB) | Total (GB) | Min # of TPU v4 Chips |
|--------------|-----------------|------------|------------|-----------------------|
| 7B           | 14,000          | 134        | 14.128     | 1                     |
| 33B          | 66,000          | 408        | 66.41      | 3                     |
| 65B          | 130,000         | 671        | 130.67     | 5                     |
| 175B         | 350,000         | 1,208      | 351.21     | 11                    |
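As a quick sanity check on the sizing formulas above, the following back-of-the-envelope script (our own, not from the original analysis) plugs the 65B configuration into the `N`, `cache_size`, and `total_cache_size` equations; the parameter memory lands slightly below the rounded 130,000 MB figure in Table 2 because the formula only approximates the true parameter count.

```python
import math

# LLaMA 65B configuration from Table 1.
dimensions, n_layers = 8192, 80
max_batch_size, max_seq_len = 1, 256
bytes_per_value = 2  # BF16

# N = (dimensions)^2 * n_layers * 12  ->  ~64.4B parameters
num_params = dimensions**2 * n_layers * 12
param_bytes = num_params * bytes_per_value  # ~128.8 GB of BF16 weights

# cache_size = max_batch_size * max_seq_len * dimensions, with 2 cache layers
# per Attention block: total_cache_size = n_layers * 2 * cache_size * 2 bytes
cache_size = max_batch_size * max_seq_len * dimensions
total_cache_bytes = n_layers * 2 * cache_size * bytes_per_value  # ~671 MB

total_gb = (param_bytes + total_cache_bytes) / 1e9
chips = math.ceil(total_gb / 32)  # 32 GB HBM per TPU v4 chip
print(f"params={num_params/1e9:.1f}B, total={total_gb:.1f} GB, min chips={chips}")
# -> params=64.4B, total~129.5 GB, min chips=5 (matching the 65B row above)
```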
        - - -### Metrics - -Below are useful metrics to measure inference speed. Assuming `T` is the total time, `B` is the batch size, `L` is the decoded sequence length. - - -#### Latency Definition - -Latency is the time it takes to get the decoded result at target length `L`, regardless of the batch size `B`. Latency represents how long the user should wait to get the response from the generation model. - - -``` -Latency = T (s) -``` - - - -#### Per-token latency - -One step of autoregressive decoding generates a token for each sample in the batch. Per-token latency is the average time for that one step. - - -``` -Per-token latency = T / L (s/token) -``` - - - -#### Throughput - -Throughput measures how many tokens are generated per unit time. While it’s not a useful metric for evaluating online serving it is useful to measure the speed of batch processing. - - -``` -Throughput = B * L / T (tokens/s) -``` - - -To minimize confusion and misinterpretation, it’s better to avoid metrics like `T / (B * L)`, which mixes latency and throughput. - - -## Results - -Figure 1 shows latency / token results for LLaMA 7B to 175B models. In each case, the model is run on a range of TPU v4 configurations. For instance, LLaMA 7B shows 4.7ms/token and 3.8ms/token on v4-8 and v4-16 respectively. For more comparison, visit the HuggingFace [LLM performance leaderboard](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard). - -In the absence of the features discussed in this blog post, the LLaMA 65B running on v4-32 delivers 120ms/token instead of 14.5ms/token obtained here, leading to **8.3x** speedup. As discussed earlier, developers are encouraged to try our custom [torch](https://storage.googleapis.com/tpu-pytorch/wheels/tpuvm/torch-nightly+20230422-cp38-cp38-linux_x86_64.whl), [torch-xla](https://storage.googleapis.com/tpu-pytorch/wheels/tpuvm/torch_xla-nightly+20230422-cp38-cp38-linux_x86_64.whl) wheels that unlock the repro of [LLaMA inference](https://github.com/pytorch-tpu/llama/tree/blog) results shared here. - - -![Figure 1: LLaMA Inference Performance on TPU v4 hardware](/assets/images/low-latency/im1.svg){:style="max-height:800px; width:100%"} - -**Figure 1**: LLaMA Inference Performance on TPU v4 hardware - -PyTorch/XLA:GPU performance is better than PyTorch:GPU eager and similar to PyTorch Inductor. PyTorch/XLA:TPU performance is superior to PyTorch/XLA:GPU. In the near future, XLA:GPU will deliver optimizations that bring parity with XLA:TPU. The single A100 configuration only fits LLaMA 7B, and the 8-A100 doesn’t fit LLaMA 175B. - - - -![Figure 2: LLaMA Inference Performance on GPU A100 hardware](/assets/images/low-latency/im2.svg){:style="max-height:800px; width:100%"} - -**Figure 2**: LLaMA Inference Performance on GPU A100 hardware - - -As the batch size increases, we observe a sublinear increase in per-token latency highlighting the tradeoff between hardware utilization and latency. - -![Figure 3: LLaMA Inference Performance across different batch sizes](/assets/images/low-latency/im3.svg){:style="max-height:800px; width:100%"} - -**Figure 3**: LLaMA Inference Performance across different batch sizes - - -Our studies suggest the impact of maximum sequence input length (`max_seq_len`) on inference latency is relatively minimal. We attribute this to the sequential and iterative nature of token generation. The small difference in performance can be due to KV cache access latency changes as the storage size increases. 
- -![Figure 4: LLaMA Inference Performance across different prompt lengths](/assets/images/low-latency/im4.svg){:style="max-height:800px; width:100%"} - -**Figure 4**: LLaMA Inference Performance across different prompt lengths - -LLMs are often memory-bound applications; thus, by quantizing model parameters we enable loading and executing a larger tensor on MXUs per unit time (i.e. HBM ⇒ CMEM and CMEM ⇒ MXU data movement). Figure 5 shows `INT8` weight-only quantization offers a 1.6x-1.9x speedup, allowing a larger model to run on a given hardware configuration. - -When BS=1, INT8 tensors are dispatched to the VPU, which is smaller than the MXU (see the [TPU v4 paper](https://arxiv.org/pdf/2304.01433.pdf)); otherwise, the MXU is used. As a result, when BS=1, quantization memory bandwidth gains are offset by lack of MXU utilization. When BS>1, however, memory gains deliver superior latency on the quantized model. For example, in the case of the 175B parameter LLaMA, v4-16 with quantization and v4-32 without quantization deliver similar performance. Note we do not provide `FP8` comparisons because PyTorch is yet to offer this data type. - -![Figure 5: LLaMA Inference Performance vs. weight-only quantization. The missing blue bars suggest the model size doesn’t fit in the specified TPU hardware.](/assets/images/low-latency/im5.svg){:style="max-height:800px; width:100%"} - -**Figure 5**: LLaMA Inference Performance vs. weight-only quantization. The missing blue bars suggest the model size doesn’t fit in the specified TPU hardware. - - -Figure 6 demonstrates the steady performance advantage of PyTorch/XLA as the input prompt length grows from 10 tokens to 1,500 tokens. This strong scaling capability suggests minimal PyTorch/XLA recompilation events enabling a wide range of real-world applications. In this experiment, the maximum length is 2,048 and maximum generation length is 256. - - -![Figure 6: LLaMA Inference Performance vs. Input Prompt Length](/assets/images/low-latency/im6.svg){:style="max-height:800px; width:100%"} - -**Figure 6**: LLaMA Inference Performance vs. Input Prompt Length - - - -## Final Thoughts - -We are ecstatic about what’s ahead for PyTorch/XLA and invite the community to join us. PyTorch/XLA is developed fully in open source. So, please file issues, submit pull requests, and send RFCs to [GitHub](https://github.com/pytorch/xla) so that we can openly collaborate. You can also [try out](https://colab.sandbox.google.com/github/pytorch/xla/blob/master/contrib/colab/getting-started.ipynb) PyTorch/XLA for yourself on various XLA devices including TPUs and GPUs. - -Cheers, -The PyTorch/XLA Team at Google -#PoweredByPyTorch diff --git a/_posts/2023-06-29-optimizing-libtorch.md b/_posts/2023-06-29-optimizing-libtorch.md deleted file mode 100644 index c856f2d28363..000000000000 --- a/_posts/2023-06-29-optimizing-libtorch.md +++ /dev/null @@ -1,95 +0,0 @@ ---- -layout: blog_detail -title: "Optimizing LibTorch-based inference engine memory usage and thread-pooling" -author: Himalay Mohanlal Joriwal, Pierre-Yves Aquilanti, Vivek Govindan, Hamid Shojanazeri, Ankith Gunapal, Tristan Rice ---- - -## Outline - -In this blog post we show how to optimize a LibTorch-based inference engine to maximize throughput by reducing memory usage and optimizing the thread-pooling strategy. We apply these optimizations to Pattern Recognition engines for audio data, for example, music and speech recognition or acoustic fingerprinting.
The optimizations discussed in this blog post allow for a 50% reduction in memory usage and a 37.5% reduction in end-to-end inference latency. These optimizations are also applicable to computer vision and natural language processing. - - -## Audio Recognition Inferencing - -Audio Recognition (AR) engines can be used to recognize and identify sound patterns. Examples include identifying the type and species of a bird from audio recordings, distinguishing music from the singer's voice, or detecting an abnormal sound indicating a breach in a building. To identify sounds of interest, AR engines process audio through 4 stages: - -1. **File Validation**: The AR engine validates the input audio file. -2. **Feature Extraction**: Features are extracted from each segment within the audio file. -3. **Inference**: LibTorch performs inference using CPUs or accelerators. In our case, Intel processors on an Amazon Elastic Compute Cloud (EC2) instance. -4. **Post-processing**: A post-processing model decodes the results and calculates scores that are used to convert inference output into tags or transcripts. - -Of these 4 steps, inference is the most computationally intensive and can take up to 50% of the pipeline processing time depending on the model complexity. This means that any optimization at this stage has a significant impact on the overall pipeline.  - - -## Optimizing the Audio Recognition engine with concurrency...is not so simple - -Our objective for this processing pipeline is to convert audio segments into tags or transcripts. The input data is an audio file composed of several short sound segments (S1 to S6 in Figure 1). The output data corresponds to tags or transcripts ordered by timestamps. - - - -![Figure 1: Example audio file with segment boundaries](/assets/images/optimizing-libtorch/im1.jpg){:style="max-height:800px; width:100%"} - -**Figure 1**: Example audio file with segment boundaries - - -Each segment can be processed independently and in an out-of-order fashion. This offers the opportunity to process segments concurrently and in parallel to optimize the overall inference throughput as well as maximize the usage of the resources. - -Parallelization on an instance can be achieved through multi-threading (pThreads, std::threads, OpenMP) or multi-processing. The advantage of multi-threading over multi-processing is the ability to use shared memory. It enables developers to minimize data duplication by sharing data across threads; in our case, the AR models (_Figure 2_). Furthermore, a reduction in memory allows us to run more pipelines in parallel by increasing the number of engine threads in order to utilize all vCPUs on our Amazon EC2 instance ([c5.4xlarge](https://aws.amazon.com/ec2/instance-types/c5/) in our case, it offers 16 vCPUs). In theory, we expect to see higher hardware utilization and higher throughput for our AR engine as a result. - - -![Figure 2: Multi-threaded AR Engine](/assets/images/optimizing-libtorch/im2.jpg){:style="max-height:800px; width:100%"} - -**Figure 2**: Multi-threaded AR Engine - -But we found these assumptions to be wrong. Increasing the number of application threads led to an increase in the end-to-end latency for each audio segment and to a decrease in engine throughput. For example, increasing the concurrency from 1 to 5 threads led to a 4x increase in latency, which proportionally decreased throughput.
In fact, metrics showed that within the pipeline, the latency of the inference stage alone was 3x higher than its single-thread baseline.  - -Using a profiler, we found that the CPU [Spin Time](https://www.intel.com/content/www/us/en/develop/documentation/vtune-help/top/reference/cpu-metrics-reference.html#cpu-metrics-reference_SPIN-AND-OVERHEAD-TIME) increased, potentially due to CPU oversubscription, which impacts system and application performance. Given our control over the application's multi-thread implementation, we chose to dive deeper into the stack and identify potential conflicts with LibTorch’s default settings. - - -### Diving deeper on LibTorch’s multi-threading and its impact on concurrency - -LibTorch’s parallel implementations on CPU for inference are based on [global thread pools](https://pytorch.org/docs/stable/notes/cpu_threading_torchscript_inference.html#cpu-threading-and-torchscript-inference). Examples of implementations are inter-op and intra-op parallelism, which can be chosen depending on the model’s properties. In both cases, it is possible to set [the number of threads](https://pytorch.org/docs/stable/notes/cpu_threading_torchscript_inference.html#tuning-the-number-of-threads) in each thread pool to optimize the latency and throughput.  - -To test if LibTorch’s default parallel implementation settings had an adverse effect on our inference latency, we ran an experiment on a 16 vCPU machine with a 35-minute audio file, keeping the LibTorch inter-op threads constant at 1 (because our models didn’t utilize the inter-op thread pool). We collected the data shown in Figures 3 and 4.  - - -![Figure 3: CPU Utilization for different number of engine threads](/assets/images/optimizing-libtorch/im3.jpg){:style="max-height:800px; width:100%"} - -**Figure 3**: CPU Utilization for different number of engine threads - -![Figure 4: Processing times for different number of engine threads](/assets/images/optimizing-libtorch/im4.jpg){:style="max-height:800px; width:100%; margin-top: 4rem;"} - -**Figure 4**: Processing times for different number of engine threads - -Execution time in Figure 4 is the end-to-end processing time for processing all the segments of the given audio file. We have 4 different configurations of LibTorch intra-op threads, which are 1, 4, 8, and 16, and we change the number of engine threads from 1 to 16 for each intra-op LibTorch configuration. As we see in Figure 3, CPU utilization increases with an increase in the number of engine threads for all LibTorch intra-op thread configurations. But as we see in Figure 4, an increase in CPU utilization doesn't translate into lower execution time. We found out that in all but one case, as the number of engine threads shot up, so did execution time. The one exception was the case where the intra-op thread pool size was 1. - - -### Resolving the global thread pool issue - -Using too many threads with a global thread pool led to performance degradation and caused an oversubscription problem. Without disabling [LibTorch global thread pools](https://pytorch.org/docs/stable/notes/cpu_threading_torchscript_inference.html), it was difficult to match the performance of the multi-process engine. - -Disabling the LibTorch global thread pool is as simple as setting the intra-op/inter-op parallelism threads to 1, as shown here: - -``` -at::set_num_threads(1);         // Disables the intraop thread pool. -at::set_num_interop_threads(1); // Disables the interop thread pool.
-``` - -As shown in Figure 4, the lowest processing time was measured when the LibTorch global thread pool was disabled. - -This solution improved AR engine throughput in several cases. However, when evaluating long datasets (audio files longer than 2 hours in load test), we found that the memory footprint of the engine gradually started to increase. - - -### Optimizing memory usage - -We ran a load-test on the system with two hours long audio files and found out that the observed memory increase was the result of memory fragmentation within a multi-threaded LibTorch inference. We resolved this using[ jemalloc](https://github.com/jemalloc/jemalloc), which is a general purpose malloc(3) implementation that emphasizes fragmentation avoidance and scalable concurrency support. [Using jemalloc](https://pytorch.org/tutorials/recipes/recipes/tuning_guide.html#switch-memory-allocator), our peak memory usage decreased by an average of 34% and average memory usage decreased by 53%. - - -![Figure 5: Memory usage over time using the same input file with and without jemalloc](/assets/images/optimizing-libtorch/im5.jpg){:style="max-height:800px; width:100%"} - -**Figure 5**: Memory usage over time using the same input file with and without jemalloc - - -## Summary - -To optimize the performance of multi-threaded LibTorch-based inference engines, we recommend verifying that there is no oversubscription problem in LibTorch. In our case, all threads in the multi-threaded engine were sharing the LibTorch global thread pool, which caused an oversubscription problem. This was remedied by disabling the global thread pool: we disabled the interop and intraop global thread pool by setting threads to 1. To optimize the memory of a multi-threaded engine, we recommend using Jemalloc as a memory allocator tool rather than the default malloc function. \ No newline at end of file diff --git a/_posts/2023-07-10-how-to-accelerate.md b/_posts/2023-07-10-how-to-accelerate.md deleted file mode 100644 index 5284e75ba166..000000000000 --- a/_posts/2023-07-10-how-to-accelerate.md +++ /dev/null @@ -1,218 +0,0 @@ ---- -layout: blog_detail -title: "How to Accelerate PyTorch Geometric on Intel® CPUs" -author: Intel ---- - -## Overview - -The Intel PyTorch team has been collaborating with the PyTorch Geometric (PyG) community to provide CPU performance optimizations for Graph Neural Network (GNN) and PyG workloads. In the PyTorch 2.0 release, several critical optimizations were introduced to improve GNN training and inference performance on CPU. Developers and researchers can now take advantage of [Intel’s AI/ML Framework optimizations](https://www.intel.com/content/www/us/en/developer/tools/frameworks/overview.html) for significantly faster model training and inference, which unlocks the ability for GNN workflows directly using PyG. - -In this blog, we will perform a deep dive on how to optimize PyG performance for both training and inference while using the PyTorch 2.0 flagship torch.compile feature to speed up PyG models. - - -## Message Passing Paradigm - -Message passing refers to the process of nodes exchanging information with their respective neighbors by sending messages to one another. In PyG, the process of message passing can be generalized into three steps: - -1. **Gather**: Collect edge-level information of adjacent nodes and edges. -2. **Apply**: Update the collected information with user-defined functions (UDFs). -3. 
**Scatter**: Aggregate to node-level information, e.g., via a particular reduce function such as sum, mean, or max. - - -![Figure 1: The message passing paradigm](/assets/images/how-to-accelerate/f1-pyg-message-passing-paradigm.png){:style="max-width:620px; width:100%; display: block; margin-left: auto; margin-right: auto"} - -**Figure 1**: The message passing paradigm (Source: [Matthias Fey](http://github.com/rusty1s)) - - -Message passing performance is highly related to the storage format of the adjacency matrix of the graph, which records how pairs of nodes are connected. Two methods for the storage format are: - - - -* **Adjacency matrix in COO (Coordinate Format):** The graph data is physically stored in a two-dimensional tensor shape of **[2, num_edges]**, which maps each connection of source and destination nodes. The performance hotspot is scatter-reduce. -* **Adjacency matrix in CSR (Compressed Sparse Row):** Similar format to COO, but compressed on the row indices. This format allows for more efficient row access and faster sparse matrix-matrix multiplication (SpMM). The performance hotspot is sparse matrix related reduction ops. - - -## Scatter-Reduce - -The pattern of scatter-reduce is parallel in nature, which updates values of a **self** tensor using values from a **src** tensor at the entries specified by **index**. Ideally, parallelizing on the outer dimension would be most performant. However, direct parallelization leads to write conflicts, as different threads might try to update the same entry simultaneously. - - -![Figure 2: Scatter-reduce and its optimization scheme](/assets/images/how-to-accelerate/f2-scatter-reduce-scheme.png){:style="max-width:620px; width:100%; display: block; margin-left: auto; margin-right: auto"} - -**Figure 2**: Scatter-reduce and its optimization scheme (Source: Mingfei Ma) - - -To optimize this kernel, we use sorting followed by a reduction: - - - -* **Sorting:** Sort the **index** tensor in ascending order with parallel radix sort, such that indices pointing to the same entry in the **self** tensor are managed in the same thread. -* **Reduction:** Paralleled on the outer dimension of **self**, and do vectorized reduction for each indexed **src** entry. - -For its backward path during the training process (i.e., gather), sorting is not needed because its memory access pattern will not lead to any write conflicts. - - -## SpMM-Reduce - -Sparse matrix-matrix reduction is a fundamental operator in GNNs, where **A** is sparse adjacency matrix in CSR format and **B** is a dense feature matrix where the reduction type could be _sum_, _mean_ or _max_. - - - -![Figure 3: SpMM optimization scheme](/assets/images/how-to-accelerate/f3-spmm-optimization-scheme.png){:style="max-width:620px; width:100%; display: block; margin-left: auto; margin-right: auto"} - - -**Figure 3**: SpMM optimization scheme (Source: Mingfei Ma) - -The biggest challenge when optimizing this kernel is how to balance thread payload when parallelizing along rows of the sparse matrix **A**. Each row in **A** corresponds to a node, and its number of connections may vary vastly from one to another; this results in thread payload imbalance. One technique to address such issues is to do payload scanning before thread partition. Aside from that, other techniques are also introduced to further exploit CPU performance such as vectorization and unrolling and blocking. - -These optimizations are done via **torch.sparse.mm** using the reduce flags of _amax_, _amin_, _mean_, _sum_. 
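As a small, self-contained usage sketch of the two aggregation patterns discussed above (not the optimized C++ kernels themselves), the snippet below aggregates neighbor features once with `scatter_reduce_` on a COO edge index and once with `torch.sparse.mm` on a CSR adjacency matrix; the `reduce` argument of `torch.sparse.mm` may require a recent (2.0+) PyTorch CPU build.

```python
import torch

num_nodes, feat_dim = 5, 8
# COO edge index: edge k goes from src[k] to dst[k].
src = torch.tensor([0, 1, 1, 2, 3, 4])
dst = torch.tensor([1, 0, 2, 1, 4, 3])
x = torch.randn(num_nodes, feat_dim)

# Scatter-reduce: sum the features of all source nodes into their destination node.
out_scatter = torch.zeros(num_nodes, feat_dim)
out_scatter.scatter_reduce_(0, dst.unsqueeze(-1).expand(-1, feat_dim),
                            x[src], reduce="sum", include_self=False)

# SpMM-reduce: the same aggregation expressed as A @ X with a CSR adjacency matrix.
adj = torch.sparse_coo_tensor(torch.stack([dst, src]),
                              torch.ones(src.numel()),
                              (num_nodes, num_nodes)).to_sparse_csr()
out_spmm = torch.sparse.mm(adj, x, reduce="sum")

print(torch.allclose(out_scatter, out_spmm))  # True, up to float rounding
```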
- - -## Performance Gains: Up to 4.1x Speedup - -We collected benchmark performance for both inference and training in [pytorch_geometric/benchmark](http://github.com/pyg-team/pytorch_geometric/tree/master/benchmark) and in the [Open Graph Benchmark (OGB)](http://github.com/snap-stanford/ogb) to demonstrate the performance improvement from the above-mentioned methods on Intel® Xeon® Platinum 8380 Processor. - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
| Model – Dataset | Option | Speedup ratio |
|---|---|---|
| GCN-Reddit (inference) | 512-2-64-dense | 1.22x |
| | 1024-3-128-dense | 1.25x |
| | 512-2-64-sparse | 1.31x |
| | 1024-3-128-sparse | 1.68x |
| GraphSage-ogbn-products (inference) | 1024-3-128-dense | 1.15x |
| | 512-2-64-sparse | 1.20x |
| | 1024-3-128-sparse | 1.33x |
| | full-batch-sparse | 4.07x |
| GCN-PROTEINS (training) | 3-32 | 1.67x |
| GCN-REDDIT-BINARY (training) | 3-32 | 1.67x |
| GCN-Reddit (training) | 512-2-64-dense | 1.20x |
| | 1024-3-128-dense | 1.12x |
        - -**Table 1**: Performance Speedup on PyG Benchmark1 - -From the benchmark results, we can see that our optimizations in PyTorch and PyG achieved **1.1x-4.1x speed-up** for inference and training. - - -## torch.compile for PyG - -The PyTorch2.0 flagship feature torch.compile is fully compatible with PyG 2.3 release, bringing additional speed-up in PyG model inference/training over imperative mode, thanks to TorchInductor C++/OpenMP backend for CPUs. In particular, **a 3.0x – 5.4x performance speed-up** is measured on [basic GNN models](http://github.com/pyg-team/pytorch_geometric/blob/master/test/nn/models/test_basic_gnn.py) with Intel Xeon Platinum 8380 Processor on model training2. - - - -![Figure 4: Performance Speedup with Torch Compile](/assets/images/how-to-accelerate/f4-torch-compile-performance-speedup.png){:style="max-width:620px; width:100%; display: block; margin-left: auto; margin-right: auto"} - -**Figure 4**: Performance Speedup with Torch Compile - -Torch.compile can fuse the multiple stages of message passing into a single kernel, which provides significant speedup due to the saved memory bandwidth. Refer to this [pytorch geometric tutorial](http://pytorch-geometric.readthedocs.io/en/latest/tutorial/compile.html) for additional support. - -**Please note** that torch.compile within PyG is in beta mode and under active development. Currently, some features do not yet work together seamlessly such as torch.compile(model, dynamic=True), but fixes are on the way from Intel. - - -## Conclusion & Future Work - -In this blog, we introduced the GNN performance optimizations included in PyTorch 2.0 on CPU. We are closely collaborating with the PyG community for future optimization work, which will focus on in-depth optimizations from torch.compile, sparse optimization, and distributed training. - - -### Acknowledgement - -The results presented in this blog is a joint effort of Intel PyTorch team and Kumo. Special thanks to [Matthias Fey](http://github.com/rusty1s) (Kumo), [Pearu Peterson](http://github.com/pearu) (Quansight) and [Christian Puhrsch](http://www.linkedin.com/in/christianpuhrsch/) (Meta) who spent precious time and gave substantial assistance! Together, we made one more step forward on the path of improving the PyTorch CPU ecosystem. - - -### References - -* [Accelerating PyG on Intel CPUs](http://www.pyg.org/ns-newsarticle-accelerating-pyg-on-intel-cpus) -* [PyG 2.3.0](http://github.com/pyg-team/pytorch_geometric/releases/tag/2.3.0): PyTorch 2.0 support, native sparse tensor support, explainability and accelerations - -### Footnotes - -#### Product and Performance Information - -1Platinum 8380: 1-node, 2x Intel Xeon Platinum 8380 processor with 256GB (16 slots/ 16GB/3200) total DDR4 memory, uCode 0xd000389, HT on, Turbo on, Ubuntu 20.04.5 LTS, 5.4.0-146-generic, INTEL SSDPE2KE016T8 1.5T; GCN + Reddit FP32 inference, GCN+Reddit FP32 training, GraphSAGE + ogbn-products FP32 inference, GCN-PROTAIN, GCN-REDDIT-BINARY FP32 training; Software: PyTorch 2.1.0.dev20230302+cpu, pytorch_geometric 2.3.0, torch-scatter 2.1.0, torch-sparse 0.6.16, test by Intel on 3/02/2023. - -2Platinum 8380: 1-node, 2x Intel Xeon Platinum 8380 processor with 256GB (16 slots/ 16GB/3200) total DDR4 memory, uCode 0xd000389, HT on, Turbo on, Ubuntu 20.04.5 LTS, 5.4.0-146-generic, INTEL SSDPE2KE016T8 1.5T; GCN, GraphSAGE, GIN and EdgeCNN, FP32; Software: PyTorch 2.1.0.dev20230411+cpu, pytorch_geometric 2.4.0, torch-scatter 2.1.1+pt20cpu, torch-sparse 0.6.17+pt20cpu, test by Intel on 4/11/2023. 
- -3Performance varies by use, configuration and other factors. Learn more at www.Intel.com/PerformanceIndex. diff --git a/_posts/2023-07-25-announcing-cpp.md b/_posts/2023-07-25-announcing-cpp.md deleted file mode 100644 index dd1969f98909..000000000000 --- a/_posts/2023-07-25-announcing-cpp.md +++ /dev/null @@ -1,138 +0,0 @@ ---- -layout: blog_detail -title: "Announcing CPP-based S3 IO DataPipes" -author: John He, Khaled ElGalaind, Roshani Nagmote, Daiming Yang ---- - -Training large deep learning models requires large datasets. [Amazon Simple Storage Service](https://aws.amazon.com/s3/) (Amazon S3) is a scalable cloud object store service used for storing large training datasets. Machine learning (ML) practitioners need an efficient data pipe that can download data from Amazon S3, transform the data, and feed the data to GPUs for training models with high throughput and low latency. - -In this post, we introduce the new S3 IO DataPipes for PyTorch, [`S3FileLister`](hhttps://github.com/pytorch/data/blob/main/torchdata/datapipes/iter/load/s3io.py#L19) and [`S3FileLoader`](https://github.com/pytorch/data/blob/main/torchdata/datapipes/iter/load/s3io.py#L106). For memory efficiency and fast runs, the new DataPipes use the C++ extension to access Amazon S3. Benchmarking shows that `S3FileLoader` is 59.8% faster than [`FSSpecFileOpener`](https://github.com/pytorch/data/blob/main/torchdata/datapipes/iter/load/fsspec.py#L125) for downloading a natural language processing (NLP) dataset from Amazon S3. You can build [IterDataPipe](https://pytorch.org/data/beta/torchdata.datapipes.iter.html) training pipelines with the new DataPipes. We also demonstrate that the new DataPipe can reduce overall Bert and ResNet50 training time by 7%. The new DataPipes have been upstreamed to the open-source [`TorchData 0.4.0`](https://github.com/pytorch/data/releases/tag/v0.4.0) with [PyTorch 1.12.0](https://github.com/pytorch/pytorch/releases/tag/v1.12.0). - - -## Overview - -Amazon S3 is a scalable cloud storage service with no limit on data volume. Loading data from Amazon S3 and feeding the data to high-performance GPUs such as NVIDIA A100 can be challenging. It requires an efficient data pipeline that can meet the data processing speed of GPUs. To help with this, we released a new high performance tool for PyTorch: S3 IO DataPipes. DataPipes are subclassed from `torchdata.datapipes.iter.IterDataPipe`, so they can interact with the `IterableDataPipe` interface. Developers can quickly build their DataPipe DAGs to access, transform, and manipulate data with shuffle, sharding, and batch features. - -The new DataPipes are designed to be file format agnostic and Amazon S3 data is downloaded as binary large objects (BLOBs). It can be used as a composable building block to assemble a DataPipe graph that can load tabular, NLP, and computer vision (CV) data into your training pipelines. - -Under the hood, the new S3 IO DataPipes employ a C++ S3 handler with the AWS C++ SDK. In general, a C++ implementation is more memory efficient and has better CPU core usage (no Global Interpreter Lock) in threading compared to Python. The new C++ S3 IO DataPipes are recommended for high throughput, low latency data loading in training large deep learning models. - -The new S3 IO DataPipes provide two first-class citizen APIs: -* **S3FileLister** – Iterable that lists S3 file URLs within the given S3 prefixes. The functional name for this API is `list_files_by_s3`. 
-* **S3FileLoader** – Iterable that loads S3 files from the given S3 prefixes. The functional name for this API is `load_files_by_s3`. - - -## Usage - -In this section, we provide instructions for using the new S3 IO DataPipes. We also provide a code snippet for `load_files_by_s3()`. - -### Build from source -The new S3 IO DataPipes use the C++ extension. It is built into the `torchdata` package by default. However, if the new DataPipes are not available within the environment, for example Windows on Conda, you need to build from the source. For more information, refer to [Iterable Datapipes](https://github.com/pytorch/data/tree/main/torchdata/datapipes/iter/load#s3-io-datapipe-documentation). - -### Configuration -Amazon S3 supports global buckets. However, a bucket is created within a Region. You can pass a Region to the DataPipes by using `__init__()`. Alternatively, you can either `export AWS_REGION=us-west-2` into your shell or set an environment variable with `os.environ['AWS_REGION'] = 'us-east-1'` in your code. - -To read objects in a bucket that aren’t publicly accessible, you must provide AWS credentials through one of the following methods: - -* [Install and configure](https://docs.aws.amazon.com/cli/latest/userguide/cli-chap-install.html) the [AWS Command Line Interface](https://aws.amazon.com/cli/) (AWS CLI) with `AWS configure` -* Set credentials in the AWS credentials profile file on the local system, located at `~/.aws/credentials` on Linux, macOS, or Unix -* Set the `AWS_ACCESS_KEY_ID` and `AWS_SECRET_ACCESS_KEY` environment variables -* If you’re using this library on an [Amazon Elastic Compute Cloud](https://aws.amazon.com/ec2) (Amazon EC2) instance, specify an [AWS Identity and Access Management](https://aws.amazon.com/iam) (IAM) role and then give the EC2 instance access to that role - - -### Example code -The following code snippet provides a typical usage of `load_files_by_s3()`: - - -``` -from torch.utils.data import DataLoader
from torchdata.datapipes.iter import IterableWrapper

s3_shard_urls = IterableWrapper(["s3://bucket/prefix/",]).list_files_by_s3()
s3_shards = s3_shard_urls.load_files_by_s3()
# text data
training_data = s3_shards.readlines(return_path=False)
data_loader = DataLoader(
    training_data,
    batch_size=batch_size,
    num_workers=num_workers,
)
# training loop
for epoch in range(epochs):
    # training step
    for batch_data in data_loader:
        # forward pass, backward pass, model update
        ...
 -``` - - -## Benchmark - -In this section, we demonstrate how the new DataPipe can reduce overall Bert and ResNet50 training time. - -### Isolated DataLoader performance evaluation against FSSpec - -`FSSpecFileOpener` is another PyTorch S3 DataPipe. It uses `botocore` and `aiohttp/asyncio` to access S3 data. The following is the performance test setup and result (quoted from [Performance Comparison between native AWSSDK and FSSpec (boto3) based DataPipes](https://github.com/pytorch/data/issues/500)). - -The S3 data in the test is a sharded text dataset. Each shard has about 100,000 lines and each line is around 1.6 KB, making each shard about 156 MB. The measurements in this benchmark are averaged over 1,000 batches. No shuffling, sampling, or transforms were performed. - -The following chart reports the throughput comparison for various batch sizes for `num_workers=0`, the data loader runs in the main process. `S3FileLoader` has higher queries per second (QPS). It is 90% higher than `fsspec` at batch size 512. - - -![Batch Sizes 1](/assets/images/2023-7-25-announcing-ccp-based-s3-io-datapipes-1.png){:style="max-width:620px; width:100%; display: block; margin-left: auto; margin-right: auto"} - -The following chart reports the results for `num_workers=4`, the data loaders runs in the main process. `S3FileLoader` is 59.8% higher than `fsspec` at batch size 512. - - -![Batch Sizes 2](/assets/images/2023-7-25-announcing-ccp-based-s3-io-datapipes-5.png){:style="max-width:620px; width:100%; display: block; margin-left: auto; margin-right: auto"} - -### Training ResNet50 Model against Boto3 -For the following chart, we trained a ResNet50 model on a cluster of 4 p3.16xlarge instances with a total 32 GPUs. The training dataset is ImageNet with 1.2 million images organized into 1,000-image shards. The training batch size is 64. The training time is measured in seconds. For eight epochs, `S3FileLoader` is 7.5% faster than Boto3. - - -![Boto3](/assets/images/2023-7-25-announcing-ccp-based-s3-io-datapipes-2.png){:style="max-width:620px; width:100%; display: block; margin-left: auto; margin-right: auto"} - -### Training a Bert model against Boto3 -For the following cart, we trained a Bert model on a cluster of 4 p3.16xlarge instances with a total 32 GPUs. The training corpus has 1474 files. Each file has around 150,000 samples. To run a shorter epoch, we use 0.05% (approximately 75 samples) per file. The batch size is 2,048. The training time is measured in seconds. For one epoch, `S3FileLoader` is 7% faster than Boto3. - - -![Boto3 2](/assets/images/2023-7-25-announcing-ccp-based-s3-io-datapipes-3.png){:style="max-width:620px; width:100%; display: block; margin-left: auto; margin-right: auto"} - -### Comparison against the original PyTorch S3 plugin -The new PyTorch S3 DataPipes perform substantially better than the original [PyTorch S3 plugin](https://github.com/aws/amazon-s3-plugin-for-pytorch). We have tuned the internal buffer size for `S3FileLoader`. The loading time is measured in seconds. - -For the 10 sharded charades files (approximately 1.5 GiB each), `S3FileLoader` was 3.5 times faster in our experiments. - -### Best practices -Training large deep learning models may require a massive compute cluster with tens or even hundreds of nodes. Each node in the cluster may generate a large number of data loading requests that hit a specific S3 shard. To avoid throttle, we recommend sharding training data across S3 buckets and S3 folders. 
- - -![Best Practices](/assets/images/2023-7-25-announcing-ccp-based-s3-io-datapipes-4.png){:style="max-width:620px; width:100%; display: block; margin-left: auto; margin-right: auto"} - -To achieve good performance, it helps to have file sizes that are big enough to parallelize across a given file, but not so big that we hit the limits of throughput on that object on Amazon S3 depending on the training job. The optimal size can be between 50–200 MB. - - -## Conclusion and next steps - -In this post, we introduced you to the new PyTorch IO DataPipes. The new DataPipes use `aws-sdk-cpp` and show better performance against Boto3-based data loaders. - -For next steps, we plan to improve on usability, performance, and functionality by focusing on the following features: - -* **S3 authorization with IAM roles** – Currently, the S3 DataPipes support explicit access credentials, instance profiles, and S3 bucket policies. However, there are use cases where IAM roles are preferred. -* **Double buffering** – We plan to offer double buffering to support multi-worker downloading. -* **Local caching** – We plan on making model training able to traverse the training dataset for multiple passes. Local caching after the first epoch can cut out time of flight delays from Amazon S3, which can substantially accelerate data retrieval time for subsequent epochs. -* **Customizable configuration** – We plan to expose more parameters such as internal buffer size, multi-part chunk size, and executor count and allow users to further tune data loading efficiency. -* **Amazon S3 upload** – We plan to expand the S3 DataPipes to support upload for checkpointing. -* **Merge with fsspec** – `fsspec` is used in other systems such as `torch.save()`. We can integrate the new S3 DataPipes with `fsspec` so they can have more use cases. - - - - -### Acknowledgement - -We would like to thank Vijay Rajakumar and Kiuk Chung from Amazon for providing their guidance for S3 Common RunTime and PyTorch DataLoader. We also want to thank Erjia Guan, Kevin Tse, Vitaly Fedyunin , Mark Saroufim, Hamid Shojanazeri, Matthias Reso, and Geeta Chauhan from Meta AI/ML, and Joe Evans from AWS for reviewing the blog and the GitHub PRs. - - - -### References - -* [Announcing the Amazon S3 plugin for PyTorch](https://aws.amazon.com/blogs/machine-learning/announcing-the-amazon-s3-plugin-for-pytorch/) -* [Performance Comparison between native AWSSDK and FSSpec (boto3) based DataPipes](https://github.com/pytorch/data/issues/500) diff --git a/_posts/2023-07-27-ibm-joins-pytorch.md b/_posts/2023-07-27-ibm-joins-pytorch.md deleted file mode 100644 index 9b9f1a9ac758..000000000000 --- a/_posts/2023-07-27-ibm-joins-pytorch.md +++ /dev/null @@ -1,27 +0,0 @@ ---- -layout: blog_detail -title: "IBM Joins the PyTorch Foundation as a Premier Member" -author: Team PyTorch ---- - -The PyTorch Foundation, part of The Linux Foundation, is pleased to announce that IBM has joined as a premier member. - -![IBM Logo](/assets/images/pytorch-ibm-logo.png){:style="max-width:250px; width:100%; display: block; margin-left: auto; margin-right: auto"} - -The foundation serves as a neutral space for the deep learning community to collaborate on the open source PyTorch framework and ecosystem. With its extensive industry expertise and leadership in open source and AI, IBM is committed to actively contributing to the PyTorch community. - -IBM offers a comprehensive portfolio of enterprise AI solutions and recently released watsonx, its next-generation data and AI platform. 
IBM’s watsonx platform leverages PyTorch to offer an enterprise-grade software stack for end-to-end training and fine-tuning of AI foundation models. - -"By joining the PyTorch Foundation, we aim to contribute our expertise and resources to further advance PyTorch’s capabilities and make AI more accessible in hybrid cloud environments with flexible hardware options,” said Priya Nagpurkar, Vice President, Hybrid Cloud Platform and Developer Productivity, IBM Research. “We intend for our collaboration with PyTorch to bring the power of foundation models and generative AI to enterprises using the watsonx platform to drive business transformation.” - -IBM and PyTorch have already collaborated on two projects. The first enables foundation models with billions of parameters to train efficiently on standard cloud networking infrastructure, such as Ethernet networking. Together, IBM and PyTorch have also worked on ways to make checkpointing for AI training considerably more cost-effective, by fixing the distributed checkpointing within PyTorch to support certain types of object storage. - -“We’re happy to welcome IBM as a premier member. IBM's expertise and dedication to advancing the field of artificial intelligence align perfectly with the mission of the PyTorch community,” said PyTorch Foundation Executive Director Ibrahim Haddad. “Their commitment to open collaboration and innovation will strengthen our collective efforts to empower developers and researchers worldwide.” - -As a premier member, IBM is granted one seat to the PyTorch Foundation Governing Board. The Board sets policy through our bylaws, mission and vision statements, describing the overarching scope of foundation initiatives, technical vision, and direction. - -![Raghu Ganti Headshot](/assets/images/pytorch-ibm-headshot.png){:style="max-width:250px; width:100%; display: block; margin-left: auto; margin-right: auto"} - -We’re happy to welcome Raghu Ganti, Principal Research Scientist at IBM Research, to our board. Raghu co-leads IBM Research’s foundation model training and validation platform, built on Red Hat OpenShift. His team primarily contributes to the PyTorch training components, with the mission of democratizing training and validation of foundation models. - -To learn more about how you can be a part of the PyTorch Foundation, visit our [website](https://pytorch.org/join). diff --git a/_posts/2023-07-31-performant-distributed-checkpointing.md b/_posts/2023-07-31-performant-distributed-checkpointing.md deleted file mode 100644 index 0b3d47f572e9..000000000000 --- a/_posts/2023-07-31-performant-distributed-checkpointing.md +++ /dev/null @@ -1,62 +0,0 @@ ---- -layout: blog_detail -title: "Performant Distributed checkpointing in Production with IBM" -author: "Meta: Iris Zhang, Less Wright, Rodrigo Kumpera, Chien-Chin Huang, -IBM: Davis Wertheimer, Supriyo Chakraboty, Sophia Wen, Raghu Ganti, Mudhakar Srivatsa, Seethrami Seelam" ---- - -![Params saved per minute](/assets/images/2023-07-31-performant-distributed-checkpointing-1.png){:style="width:100%;"} - - -Last year, IBM Research began collaborating with us to onboard Fully Sharded Data Parallelism (FSDP) for their large foundation models. They became interested as FSDP is a PyTorch native offering for scaling their distributed training efforts on IBM Cloud. 
- -We are pleased to share that, in collaboration with IBM, we have achieved substantial checkpointing speedups for large models (72x vs the original PyTorch 1.13 save speed), proven model and optimizer checkpoint scaling to 30B parameters, and enabled cloud first training using FSDP + Distributed Checkpoint on S3 backends. - -## What is a Distributed Checkpoint? - -Distributed checkpointing is the PyTorch native solution for saving and loading PyTorch models and optimizer states from multiple ranks, as well as supporting dynamically changing world sizes between reloads. - - -![Checkpoint time vs model params](/assets/images/2023-07-31-performant-distributed-checkpointing-2.png){:style="width:100%;"} - - -PyTorch Distributed Checkpoint (DCP) APIs were introduced in PyTorch 1.13, and are included as an official prototype feature in PyTorch 2.0. - -Distributed checkpoint is different from torch.save() and torch.load() in a few significant ways: - -1. DCP produces multiples files per checkpoint, with at least one file per rank, -2. DCP operates in place, meaning that the model should allocate its data first and the Distributed Checkpoint will then use the storage. - -A major improvement from 1.13 to 2.0 includes adding sharded_state_dict support for checkpointing FSDP models. This allows checkpointing for larger sized models, as well as adding support for load-time resharding. Load time resharding enables saving in one cluster topology, and loading into another. This feature was highly requested as it allows training jobs to be run on one cluster, saved, and then continued on a different cluster with different world size. - -Another major change is that we decouple the storage layer from the checkpoint planning layer and separate implementation from the interface for both layers. With this change, users can now specify how their state_dict should be chunked or transformed during the checkpoint planning phase. Additionally, the customizable storage layer can easily accommodate different backends. - -More information on the Distributed Checkpoint package can be found [here](https://pytorch.org/docs/stable/distributed.checkpoint.html). - -## Performant Distributed checkpointing in Production with IBM - -IBM at Think 2023 announced its [watsonx.ai](https://www.ibm.com/products/watsonx-ai) platform for development and deployment of foundation models for the enterprise. Built on Hybrid Cloud, the platform enables use cases across multiple modalities such as NLP, timeseries, weather, chemistry, tabular data, and cybersecurity, with model sizes from 100s of millions to 10s of billions of parameters. Model architectures range from vision transformers, to multi-modal RoBERTa-style feature extractors, to large-scale generative language models similar to T5, GPT and Llama. - -As of today, IBM has now enabled checkpointing for T5-style architectures up to 11B parameters, and decoder architectures (GPT style) up to 30B. - -IBM helped us identify that this limits the scaling power of DCP from both memory and performance standpoints. With their suggestion, we enhanced our FileSystemWriter to produce single checkpoint per rank to reduce read write overhead. - -With this option as the new default, DCP now creates a single file per rank during checkpoint saving, which would then be sliced when reading parameters at load time. 
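For readers who want to see roughly what this looks like in code, here is a minimal sketch (paths and setup are illustrative, and the exact module layout has shifted across releases, so treat this as an outline rather than the production code) of saving and loading an FSDP model's sharded state dict with the distributed checkpoint APIs and a `FileSystemWriter`, typically launched under `torchrun`:

```python
import torch.distributed.checkpoint as dist_cp
from torch.distributed.fsdp import FullyShardedDataParallel as FSDP, StateDictType

def save_sharded_checkpoint(model: FSDP, ckpt_dir: str) -> None:
    # Each rank contributes its own shard; FileSystemWriter emits one file per rank.
    with FSDP.state_dict_type(model, StateDictType.SHARDED_STATE_DICT):
        state_dict = {"model": model.state_dict()}
    dist_cp.save_state_dict(
        state_dict=state_dict,
        storage_writer=dist_cp.FileSystemWriter(ckpt_dir),
    )

def load_sharded_checkpoint(model: FSDP, ckpt_dir: str) -> None:
    # Loading is in place: allocate the (possibly resharded) model first,
    # then let DCP fill its storage from the checkpoint files.
    with FSDP.state_dict_type(model, StateDictType.SHARDED_STATE_DICT):
        state_dict = {"model": model.state_dict()}
        dist_cp.load_state_dict(
            state_dict=state_dict,
            storage_reader=dist_cp.FileSystemReader(ckpt_dir),
        )
        model.load_state_dict(state_dict["model"])
```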
- -By combining sharded_state_dict support with single filer per rank writer, distributed checkpoint was able to accelerate checkpoint saving time over 72x vs the original PyTorch 1.13 save speed, and enable rapid checkpointing for models sizes over 15B which would previously simply time out. - -_"Looking back, it’s really astounding the speedups we’ve seen, handling training for many of these models. We went from taking almost half an hour to write a single 11B checkpoint in PyTorch 1.13, to being able to handle a 30B parameter model, with optimizer and dataloader state - so that’s over eight times the raw data - in just over 3 minutes. That’s done wonders for both the stability and efficiency of our jobs, as we scale up training to hundreds of gpus." – **Davis Wertheimer, IBM Research**_ - -IBM’s adoption has also helped us validate and improve our solutions in a real world, large-scale training environment. As an example, IBM discovered that DCP was working well for them on a single node with multiple GPUs, but erred out when used on multiple nodes. - -Upon investigating the issue, we realized that we were assuming writing to a NFS-like shared file system, which assumes strong read-after-write consistencies. Object stores with file system APIs such as S3FS provide eventual consistency semantics, thus causing the distributed checkpoint in such a setting to fail. Working together with IBM, we identified this issue and fixed it by making [one line code change](https://research.ibm.com/blog/ibm-pytorch-ai-training) and enabled object storage backend for DCP! Such storage approaches are typically an order of magnitude cheaper than shared file systems thus enabling finer grained checkpointing. - -## Looking for Collaboration - -If you are interested in trying Distributed Checkpoint, feel free to reach out to us! - -If you run into any issue when trying it, you can open an [issue](https://github.com/pytorch/pytorch/labels/module%3A%20distributed_checkpoint) at our Github repo. - -## Acknowledgements - -This project would not have been possible without the assistance from many collaborators. We would like to thank Yanli Zhao, Andrew Gu, Rohan Varma for their support of FSDP. Thanks to Pritam Damania, Junjie Zhao, and Wanchao Liang for their support of ShardedTensor. \ No newline at end of file diff --git a/_posts/2023-08-01-amd-journey.md b/_posts/2023-08-01-amd-journey.md deleted file mode 100644 index 90364b98e533..000000000000 --- a/_posts/2023-08-01-amd-journey.md +++ /dev/null @@ -1,32 +0,0 @@ ---- -layout: blog_detail -title: "AMD's Journey to Openness and Performance" ---- - -AMD has gained progress in building a robust software stack that supports an open ecosystem of models, libraries, frameworks, and tools. With proven platforms gaining momentum, there is significance of a leadership software stack and an optimized ecosystem for achieving application performance. PyTorch is a key part of AMD’s AI journey, and AMD's Victor Peng, AMD President and Soumith Chintala, founder of PyTorch discussed the latest progress at the DC & AI Keynote on June 12. - -## Building a Powerful SW Stack with ROCm - -Victor introduced ROCm, AMD's SW stack for Instinct Data Center GPUs. It offers a comprehensive set of open-source libraries, runtime, compilers, and tools for developing, running, and fine-tuning AI models. 
The fifth generation ROCm incorporates optimizations for AI and high-performance computing workloads, including tailored kernels for low-latency memory systems, support for new data types, and integration with OpenAI Triton. With tools for porting AI software to AMD Instinct platforms, ROCm ensures quality and robustness, tested extensively and compliant with PyTorch and TensorFlow frameworks. - -## Collaboration with PyTorch - -To shed light on the partnership between AMD and PyTorch, Victor invited [Soumith Chintala](https://www.linkedin.com/in/soumith/), the founder of PyTorch, to discuss the advancements and integration between the two. PyTorch, the industry's most famous AI framework, boasts a vibrant developer community and is known for its continuous innovation and incorporation of cutting-edge research. - -To highlight the AMD and PyTorch partnership, Victor hosted a discussion with Soumith Chintala, the founder of PyTorch. PyTorch, renowned for its innovation and community, is the industry's leading AI framework. The latest version, PyTorch 2.0, integrates with hardware-agnostic software compilers like OpenAI Triton, enabling efficient training and deployment of AI models. With optimized techniques, PyTorch 2.0 enhances productivity and offers remarkable speed improvements. The collaboration between AMD and the PyTorch Foundation ensures seamless utilization of AMD GPUs, expanding AI accelerator accessibility worldwide and paving the way for future optimizations and broader hardware support. - -## Empowering the Developer Community - -The partnership between AMD and PyTorch benefits the developer community by democratizing access to AI accelerators. Support for AMD GPUs in PyTorch allows developers to train and deploy models across various platforms, including CPUs like EPYC and Ryzen, GPUs like Instinct and Radeon, and embedded devices like Versal SoCs. By ensuring immediate compatibility of new models on AMD platforms, the collaboration streamlines the development process and empowers developers to leverage the full potential of AMD's hardware. This increased accessibility and flexibility enable developers worldwide to push the boundaries of AI innovation. - -## Hugging Face and AI Model Innovation - -Victor praised Hugging Face as the leading force behind open-source AI model innovation, empowering generative AI with transformative transformers. AMD's optimized software enables a high-performing development stack, supporting groundbreaking AI advancements for customers and developers through scalable real-world deployments. - -## Conclusion - -At the DC & AI Keynote, AMD demonstrated its dedication to openness, performance, and collaboration. The ROCm SW stack, PyTorch integration, and support for Hugging Face exemplify AMD's commitment to empowering developers and researchers to achieve AI breakthroughs. By offering accessible, high-performing solutions, AMD fuels the future of AI as a leading GPU platform integrated with PyTorch. 
- -To listen to the full keynote visit the [AMD Youtube](https://www.youtube.com/watch?v=l3pe_qx95E0) channel - -To listen to Soumith Chintala’s section of the [keynote](https://www.youtube.com/watch?v=RgQEG2G1iaY) diff --git a/_posts/2023-08-03-hugging-face-joins.md b/_posts/2023-08-03-hugging-face-joins.md deleted file mode 100644 index d943d6451af1..000000000000 --- a/_posts/2023-08-03-hugging-face-joins.md +++ /dev/null @@ -1,38 +0,0 @@ ---- -layout: blog_detail -title: "Hugging Face Joins the PyTorch Foundation as a Premier Member" ---- - -![Smiling hugging face](/assets/images/huggingface-joins-1.jpg){:style="max-width:250px;float:right;"} - -The PyTorch Foundation, a neutral home for the deep learning community to collaborate on the open source PyTorch framework and ecosystem, is announcing today that Hugging Face has joined as a premier member. - - -Hugging Face has been a long time supporter and contributor to the PyTorch Ecosystem by providing powerful models and resources that accelerate research, development, and adoption of AI technologies, particularly in the field of natural language processing. - -“Our mission has always been to democratize AI and make it accessible to everyone. We’re truly aligned with PyTorch’s objective of reducing the barrier of entry to practitioners. By joining the PyTorch Foundation, we can further amplify that impact and support this very important framework of the ecosystem that is PyTorch,” said Lysandre Debut, Head of Open Source at Hugging Face. “We believe the two ecosystems have significant overlap, and collaborating with the foundation will allow us to bridge the gap to provide the best software, the best tools to the machine learning community at large.” - -Hugging Face's Model Hub and open source libraries promote collaboration and knowledge sharing within the AI open source community, making Hugging Face a great match to the growing PyTorch Foundation. They continue to drive industry adoption and collaboration by creating user-friendly tools and resources and providing accessible and well-documented libraries. - -“Hugging Face's commitment to open source development and their exceptional contributions to the PyTorch ecosystem have truly impressed us. With their help, we will drive innovation, foster collaboration, and empower the global AI community to create transformative solutions for the AI community,” said PyTorch Foundation Executive Director Ibrahim Haddad. “We welcome Hugging Face to the PyTorch Foundation and look forward to the achievements that lie ahead.” - -As a premier member, Hugging Face is granted one seat to the PyTorch Foundation Governing Board. The Board sets policy through our bylaws, mission and vision statements, describing the overarching scope of foundation initiatives, technical vision, and direction. - -![Lysandre Debut](/assets/images/huggingface-joins-2.jpg){:style="max-width:250px;float:right;"} - -We’re happy to welcome Lysandre Debut, Head of Open Source at Hugging Face to our board. Lysandre has been at Hugging Face since the company’s pivot to open-source, and was the first engineer to focus entirely on the open-source mission. Now leading the open-source part of the organization, Lysandre remains technically involved by being a core maintainer of the Transformers library. - - -To learn more about how you can be a part of the PyTorch Foundation, visit our [website](https://pytorch.org/join). 
- -## About Hugging Face - -Hugging Face is a community and company dedicated to lowering the barrier of entry to Machine Learning and Deep Learning. Strong advocates for open-source and open-science, their model Hub hosts more than 250,000 public models and 50,000 public datasets that are very simple to use. Transformers, Diffusers, PEFT, Accelerate, and Datasets are some of the open-source tools made available by Hugging Face. - -## About PyTorch Foundation - -The PyTorch Foundation is a neutral home for the deep learning community to collaborate on the open source PyTorch framework and ecosystem. The PyTorch Foundation is supported by its members and leading contributors to the PyTorch open source project. The Foundation leverages resources provided by members and contributors to enable community discussions and collaboration. - -## About The Linux Foundation - -The Linux Foundation is the world’s leading home for collaboration on open source software, hardware, standards, and data. Linux Foundation projects are critical to the world’s infrastructure including Linux, Kubernetes, Node.js, ONAP, PyTorch, RISC-V, SPDX, OpenChain, and more. The Linux Foundation focuses on leveraging best practices and addressing the needs of contributors, users, and solution providers to create sustainable models for open collaboration. For more information, please visit us at linuxfoundation.org. The Linux Foundation has registered trademarks and uses trademarks. For a list of trademarks of The Linux Foundation, please see its trademark usage page: www.linuxfoundation.org/trademark-usage. Linux is a registered trademark of Linus Torvalds. \ No newline at end of file diff --git a/_posts/2023-08-07-int8-quantization.md b/_posts/2023-08-07-int8-quantization.md deleted file mode 100644 index 585e2ffc2396..000000000000 --- a/_posts/2023-08-07-int8-quantization.md +++ /dev/null @@ -1,94 +0,0 @@ ---- -layout: blog_detail -title: "INT8 Quantization for x86 CPU in PyTorch" -author: Intel ---- - -## Overview - -INT8 quantization is a powerful technique for speeding up deep learning inference on x86 CPU platforms. By reducing the precision of the model's weights and activations from 32-bit floating-point (FP32) to 8-bit integer (INT8), INT8 quantization can significantly improve the inference speed and reduce memory requirements without sacrificing accuracy. - -In this blog, we will discuss the recent progress on INT8 quantization for x86 CPU in PyTorch, focusing on the new x86 quantization backend. We will also briefly look at the new quantization path with PyTorch 2.0 Export (PT2E) and TorchInductor. - - -## X86 Quantization Backend - -The current recommended way of quantization in PyTorch is [FX](http://pytorch.org/tutorials/prototype/fx_graph_mode_quant_guide.html?highlight=fx). Before PyTorch 2.0, the default quantization backend (a.k.a. QEngine) on x86 CPUs was FBGEMM, which leveraged the FBGEMM performance library to achieve the performance speedup. In the PyTorch 2.0 release, a new quantization backend called X86 was introduced to replace FBGEMM. The x86 quantization backend offers improved INT8 inference performance when compared to the original FBGEMM backend by leveraging the strengths of both FBGEMM and the [Intel® oneAPI Deep Neural Network Library (oneDNN)](https://www.intel.com/content/www/us/en/developer/tools/oneapi/onednn.html) kernel libraries. 
- - -## Performance Benefit from X86 Backend - -To measure the performance benefits of the new X86 backend, we ran INT8 inference on 69 popular deep learning models (shown in **Figures 1-3** below) using [4th Gen Intel® Xeon® Scalable processors](https://www.intel.com/content/www/us/en/developer/topic-technology/artificial-intelligence/platform.html). The results showed a 2.97X geomean performance speedup compared to FP32 inference performance, while the speedup was 1.43X with the FBGEMM backend. The charts below show the per-model performance speedup comparing the x86 backend and the FBGEMM backend. - -![Figure 1: Models with less than 2x performance boost with x86 backend1](/assets/images/int8/pytorch_quant_x86_1.jpg){:style="width:100%;"} - -**Figure 1**: Models with less than 2x performance boost with x86 backend1 - - - -![Figure 2: Models with 2x-4x performance boost with x86 backend1](/assets/images/int8/pytorch_quant_x86_2.jpg){:style="width:100%; margin-top: 4em;"} - -**Figure 2**: Models with 2x-4x performance boost with x86 backend1 - - - -![Figure 3: Models with larger than 4x performance boost with x86 backend1](/assets/images/int8/pytorch_quant_x86_3.jpg){:style="width:100%; margin-top: 4em;"} - -**Figure 3**: Models with larger than 4x performance boost with x86 backend1 - - -## Usage of x86 Backend - -By default in 2.0, users on x86 platforms will use the x86 quantization backend and their PyTorch programs will remain unchanged when using the default backend. Alternatively, users can specify x86 as the quantization backend explicitly. \ -Below is an example code snippet of PyTorch static post-training quantization with x86 quantization backend. - - -``` -import torch -from torch.ao.quantization import get_default_qconfig_mapping -from torch.quantization.quantize_fx import prepare_fx, convert_fx - -qconfig_mapping = get_default_qconfig_mapping() -# Or explicity specify the qengine -# qengine = 'x86' -# torch.backends.quantized.engine = qengine -# qconfig_mapping = get_default_qconfig_mapping(qengine) - -model_fp32 = MyModel().eval() -x = torch.randn((1, 3, 224, 224), dtype=torch.float) -x = x.to(memory_format=torch.channels_last) - -# Insert observers according to qconfig and backend config -prepared_model = prepare_fx(model_fp32, qconfig_mapping, example_inputs=x) - -# Calibration code not shown - -# Convert to quantized model -quantized_model = convert_fx(prepared_model) -``` - - - -## Technical Details of x86 Backend - -We devised heuristic dispatching rules according to the performance numbers from the models we benchmarked to decide whether to invoke oneDNN or FBGEMM performance library to execute the convolution or matrix multiplication operations. The rules are a combination of operation kinds, shapes, CPU architecture information, etc. Detailed logic is available [here](http://github.com/pytorch/pytorch/blob/93ff71ec37e3c946603600a46edef70b42f81213/aten/src/ATen/native/quantized/cpu/OnednnUtils.h#L396). For more design and technical discussion, please refer to the [Request for Comments](http://github.com/pytorch/pytorch/issues/83888). - - -## Next Steps With a New Quantization Path PyTorch 2.0 Export - -Although still far from finalized, a new quantization path, PyTorch 2.0 Export (PT2E), is in early design and PoC stage. The new approach is slated to replace the FX quantization path in the future. It is built upon the capabilities of TorchDynamo Export, a feature introduced in the PyTorch 2.0 release for FX graph capturing. 
This graph is then quantized and lowered to different backends. TorchInductor, the new DL compiler of PyTorch, has shown promising results in terms of FP32 inference speedup on x86 CPU. We are working actively to enable it as one of the quantization backends of PT2E. We believe the new path will lead to further improvements in INT8 inference performance due to more flexibility of fusion at different levels. - - -## Conclusion - -The x86 backend introduced in PyTorch 2.0 release has demonstrated a remarkable improvement in INT8 inference speed on x86 CPU platforms. It offers a 1.43X speedup compared to the original FBGEMM backend while maintaining backward compatibility. This enhancement can benefit end users with minimal or no modifications to their programs. Furthermore, a new quantization path, PT2E, is currently in development and is expected to provide even more possibilities in the future. - - -## Acknowledgement - -Special thanks to Nikita Shulga, Vasiliy Kuznetsov, Supriya Rao, and Jongsoo Park. Together, we made one more step forward on the path of improving the PyTorch CPU ecosystem. - - -## Configuration - -1 AWS EC2 r7iz.metal-16xl instance (Intel(R) Xeon(R) Gold 6455B, 32-core/64-thread, Turbo Boost On, Hyper-Threading On, Memory: 8x64GB, Storage: 192GB); OS: Ubuntu 22.04.1 LTS; Kernel: 5.15.0-1028-aws; Batch Size: 1; Core per Instance: 4; PyTorch 2.0 RC3; TorchVision 0.15.0+cpu, test by Intel on 3/77/2023. May not reflect all publicly available security updates. diff --git a/_posts/2023-08-10-intel-joins-pytorch.md b/_posts/2023-08-10-intel-joins-pytorch.md deleted file mode 100644 index 8c8e3e60caf0..000000000000 --- a/_posts/2023-08-10-intel-joins-pytorch.md +++ /dev/null @@ -1,40 +0,0 @@ ---- -layout: blog_detail -title: "Intel Joins the PyTorch Foundation as a Premier Member" ---- - -![Intel logo](/assets/images/intel-new-logo.svg){:style="max-width:250px;float:right;margin: 20px;"} - -The PyTorch Foundation, a neutral home for the deep learning community to collaborate on the open source PyTorch framework and ecosystem, is announcing today that Intel has joined as a premier member. - -“The PyTorch Foundation is thrilled to welcome Intel as a premier member, marking a significant milestone in our mission to empower the global AI community. Intel's extensive expertise and commitment to advancing cutting-edge technologies align perfectly with our vision of fostering open-source innovation,” said PyTorch Foundation Executive Director Ibrahim Haddad. “Together, we will accelerate the development and democratization of PyTorch, and use the collaboration to shape a vibrant future of AI for all.” - -Intel has developed and released several PyTorch-based tools and libraries to enable developers to accelerate their AI workflows, and is actively working on optimizing PyTorch to leverage Intel hardware capabilities. - -“At Intel, we believe in the power of collaboration and open-source innovation to propel the ecosystem towards an AI Everywhere future. Joining the Governing Board of the PyTorch Foundation is a testament to Intel’s commitment to advancing and democratizing AI,” said Wei Li, Vice President and General Manager of Artificial Intelligence and Analytics (AIA) at Intel. 
“By harnessing the collective expertise and resources within the deep learning community, we aim to accelerate the development of PyTorch and continue to drive breakthroughs in AI research and applications.” - -Intel fosters industry collaboration, co-engineering, and open source contributions to accelerate software innovation and develop new technologies that bring benefits to the open source community. By working together with other member companies and under the guidance of the PyTorch Foundation, Intel remains committed to actively contributing to and advocating for the community. - -As a premier member, Intel is granted one seat to the PyTorch Foundation Governing Board. The Board sets policy through our bylaws, mission and vision statements, describing the overarching scope of foundation initiatives, technical vision, and direction. - -![Wei Li](/assets/images/wei-li.jpg){:style="max-width:250px;float:right;margin: 20px;"} - -We’re happy to welcome Wei Li, Vice President and General Manager of Artificial Intelligence and Analytics (AIA) at Intel, to our board. Dr. Wei Li is Vice President and General Manager of Artificial Intelligence and Analytics (AIA) at Intel, where he leads a world-wide team of engineering “magicians” who make AI Everywhere a reality by supercharging machine performance and developer productivity. Wei and his team have been instrumental in Intel's recent multi-billion-dollar AI revenue growth by delivering 10-100X software acceleration, across deep learning, statistical machine learning and big data analytics, to complement Intel’s AI-optimized hardware portfolio. - -To learn more about how you can be a part of the PyTorch Foundation, visit our [website](https://pytorch.org/join). - -Read more about Intel’s commitment to the PyTorch Community [here](https://www.intel.com/content/www/us/en/developer/articles/technical/ai-everywhere-intel-joins-pytorch-foundation.html#gs.4984sj). - -## About Intel - -Intel (Nasdaq: INTC) is an industry leader, creating world-changing technology that enables global progress and enriches lives. Inspired by Moore’s Law, we continuously work to advance the design and manufacturing of semiconductors to help address our customers’ greatest challenges. By embedding intelligence in the cloud, network, edge and every kind of computing device, we unleash the potential of data to transform business and society for the better. To learn more about Intel’s innovations, go to[ newsroom.intel.com](https://newsroom.intel.com/) and [intel.com](https://intel.com/). - -© Intel Corporation. Intel, the Intel logo and other Intel marks are trademarks of Intel Corporation or its subsidiaries. Other names and brands may be claimed as the property of others. - -## About PyTorch Foundation - -The PyTorch Foundation is a neutral home for the deep learning community to collaborate on the open source PyTorch framework and ecosystem. The PyTorch Foundation is supported by its members and leading contributors to the PyTorch open source project. The Foundation leverages resources provided by members and contributors to enable community discussions and collaboration. - -## About The Linux Foundation - -The Linux Foundation is the world’s leading home for collaboration on open source software, hardware, standards, and data. Linux Foundation projects are critical to the world’s infrastructure including Linux, Kubernetes, Node.js, ONAP, PyTorch, RISC-V, SPDX, OpenChain, and more. 
The Linux Foundation focuses on leveraging best practices and addressing the needs of contributors, users, and solution providers to create sustainable models for open collaboration. For more information, please visit us at linuxfoundation.org. The Linux Foundation has registered trademarks and uses trademarks. For a list of trademarks of The Linux Foundation, please see its [trademark usage page](https://www.linuxfoundation.org/legal/trademark-usage). Linux is a registered trademark of Linus Torvalds. \ No newline at end of file diff --git a/_posts/2023-08-24-large-scale-training-hugging-face.md b/_posts/2023-08-24-large-scale-training-hugging-face.md deleted file mode 100644 index 1aef238a3fdd..000000000000 --- a/_posts/2023-08-24-large-scale-training-hugging-face.md +++ /dev/null @@ -1,264 +0,0 @@ ---- -layout: blog_detail -title: "Large Scale Training of Hugging Face Transformers on TPUs With PyTorch/XLA FSDP" -author: Alex Wertheim, Milad Mohammadi, Jack Cao, Alex Spiridonov, Joe Spisak, Lysandre Debut, Sylvain Gugger, Sourab Mangrulkar ---- - - -AI is transforming many industries through advanced capabilities such as understanding and generating language, answering questions, and delivering accurate recommendations. These capabilities are fueled by ever-increasing size and complexity of AI models, which require vast amounts of computing power to train. - -To meet the growing demands of AI training at scale, last year we introduced [Fully Sharded Data Parallel (FSDP)](https://pytorch.org/blog/scaling-pytorch-models-on-cloud-tpus-with-fsdp/) in PyTorch/XLA. FSDP is a model parallelism architecture that unlocks the ability to easily and efficiently scale AI models into hundreds of billions of parameters. With [PyTorch/XLA FSDP](https://github.com/pytorch/xla/blob/master/docs/fsdp.md), during distributed training, each device can store a specific model shard, and all-gather the full model weights when it is time to perform the forward pass. Nested FSDP further optimizes performance by only using a given layer’s full parameters during its forward pass. - -We are excited to announce that PyTorch/XLA FSDP has [landed](https://github.com/huggingface/transformers/releases/tag/v4.27.0) in [Hugging Face Transformers](https://github.com/huggingface/transformers). Now, Hugging Face users can train PyTorch models with up to 20 times more parameters using the same amount of computing power as before. - -We built PyTorch/XLA FSDP support directly into the Hugging Face Trainer class, so that any model using Trainer can leverage FSDP. And with the [addition of automatic wrapping to PyTorch/XLA FSDP](https://pytorch.org/blog/pytorch-2.0-xla/#fsdp-beta), nested FSDP wrapping is both flexible and simple to apply. These new features make it easy to train a wide range of Hugging Face models at large scales. In this guide, we demonstrate training GPT-2 models with up to 128B parameters on Google Cloud TPUs. PyTorch/XLA FSDP training on TPUs is highly efficient, achieving up to 45.1% model FLOPS utilization (MFU) for GPT-2: - -![Figure 1: Model FLOPS utilization for Hugging Face GPT-2 on Google Cloud TPU v4](/assets/images/hugging_face_transformers.svg){:style="width:100%; margin-top: 4em;"} - -**Figure 1**: Model FLOPS utilization for Hugging Face GPT-2 on Google Cloud TPU v4 - -## Configuring PyTorch/XLA FSDP in the Hugging Face Trainer ## - -First, follow your preferred method to create your TPU(s) and install PyTorch and PyTorch/XLA. You need versions >= 2.0 for PyTorch and PyTorch/XLA. 
- -``` - pip3 install https://storage.googleapis.com/tpu-pytorch/wheels/tpuvm/torch-2.0-cp38-cp38-linux_x86_64.whl --user - - pip3 install https://storage.googleapis.com/tpu-pytorch/wheels/tpuvm/torch_xla-2.0-cp38-cp38-linux_x86_64.whl -``` - -Next, clone and install the Hugging Face Transformers repo. Install all necessary dependencies (e.g., datasets, evaluate, scikit-learn, accelerate). - -``` - cd $HOME - git clone https://github.com/huggingface/transformers.git - cd transformers - git checkout v4.31-release - pip3 install -e . - pip3 install datasets evaluate scikit-learn - pip3 install accelerate==0.21.0 -``` - -In `$HOME/transformers`, create any model-specific configuration files you might need. Here is an example of a configuration file for a GPT-2 model with 2B parameters, which we later refer to as `gpt2_config.json`: - -``` -{ - "activation_function": "gelu_new", - "architectures": [ - "GPT2LMHeadModel" - ], - "attn_pdrop": 0.1, - "bos_token_id": 50256, "embd_pdrop": 0.1, "eos_token_id": 50256, "initializer_range": 0.02, "layer_norm_epsilon": 1e-05, "model_type": "gpt2", - "n_embd": 3072, - "n_head": 24, - "n_layer": 18, - "n_positions": 1024, - "resid_pdrop": 0.1, - "summary_activation": null, - "summary_first_dropout": 0.1, - "summary_proj_to_labels": true, - "summary_type": "cls_index", - "summary_use_proj": true, - "task_specific_params": { - "text-generation": { - "do_sample": true, - "max_length": 50 - } - }, - "vocab_size": 50257 -} -``` - -With PyTorch/XLA FSDP, it is possible to train model sizes much bigger than this on large accelerator slices. We have trained GPT-2 models as large as 128B parameters with these techniques; for expert tips on how to replicate this scale, see the "Training Very Large Models" section below. - -In `$HOME/transformers`, create your FSDP configuration file, a JSON file containing all of the configurable aspects of your XLA FSDP wrapping stored as a dictionary. Following the [official Hugging Face Transformers XLA FSDP documentation](https://huggingface.co/docs/transformers/main_classes/trainer#pytorchxla-fully-sharded-data-parallel), the following arguments are available to set: -- `xla (bool, *optional*, defaults to False)`: This is a boolean which determines whether or not you use XLA FSDP. Make sure to set this to `true`. -- `xla_fsdp_settings (dict, *optional*)`: This is a dictionary which stores all of the XLA FSDP wrapping parameters you want to set; note that you do not have to specify settings for parameters where you are using the default value. For a complete list of settings, see [here](https://github.com/pytorch/xla/blob/master/torch_xla/distributed/fsdp/xla_fully_sharded_data_parallel.py). - -For `compute_dtype` and `buffer_dtype`, enter these as strings which contain the corresponding torch data type, e.g. `bfloat16`. - -- `fsdp_min_num_params (int, *optional*, defaults to 0)`: An integer which sets the minimum number of parameters for size-based auto wrapping. Every module with at least as many parameters as `fsdp_min_num_params` will be XLA FSDP wrapped. -- `fsdp_transformer_layer_cls_to_wrap (List[str], *optional*)`: A list of (case-sensitive) transformer layer class names to wrap. Note that this is mutually exclusive with `fsdp_min_num_params`. Example: `["GPT2Block", "GPT2MLP"]`. -- `xla_fsdp_grad_ckpt (bool, *optional*, defaults to False)`: This is a boolean which determines whether to use gradient checkpointing over each nested XLA FSDP wrapped layer. 
This setting can only be used when the `xla` flag is set to true, and an auto wrapping policy is specified through `fsdp_min_num_params` or `fsdp_transformer_layer_cls_to_wrap`. - -**Note:** For transformer-based models, use `fsdp_transformer_layer_cls_to_wrap` instead of `fsdp_min_num_params` when performing automatic nested FSDP wrapping. Layers which share weights should not belong to separate FSDP wrapped units, and the input and output embedding layers in transformer-based models share weights. - -For this GPT-2 example, here is what the corresponding `fsdp_config.json` file looks like: - -``` - { - "fsdp_transformer_layer_cls_to_wrap": [ - "GPT2Block" - ], - "xla": true, - "xla_fsdp_settings": { - "compute_dtype": "bfloat16", - "shard_param_on_dim_0": true, - "pin_layout_in_collective_ops": true - }, - "xla_fsdp_grad_ckpt": true - } -``` - -Now, it’s time to train your model! First, ensure that you have your PyTorch/XLA runtime set up appropriately by setting| - -``` - export PJRT_DEVICE=TPU -``` - - -When running training, the key flags to pass are: - -a) `--fsdp "full_shard"` -b) `--fsdp_config fsdp_config.json` - -where you should replace `fsdp_config.json` with whatever you named your FSDP configuration file. Here is a sample command to train our example 2B GPT-2 model, where training is started by `xla_spawn.py`, a [launcher script for](https://github.com/huggingface/transformers/blob/main/examples/pytorch/xla_spawn.py) distributed TPU training. - -``` - python3 -u examples/pytorch/xla_spawn.py --num_cores 4 examples/pytorch/language-modeling/run_clm.py \ - --num_train_epochs 1 \ - --dataset_name wikitext \ - --dataset_config_name wikitext-2-raw-v1 \ --per_device_train_batch_size 32 \ --per_device_eval_batch_size 32 \ - --do_train \ - --do_eval \ - --output_dir /tmp/test-clm \ - --overwrite_output_dir \ - --config_name gpt2_config.json \ - --cache_dir /tmp \ - --tokenizer_name gpt2 \ - --block_size 1024 \ - --optim adafactor \ - --adafactor true \ - --save_strategy no \ - --logging_strategy no \ - --fsdp "full_shard" \ - --fsdp_config fsdp_config.json -``` - -## Measuring Model FLOPS Utilization (MFU) for GPT-2 ## - -Model FLOPS are the floating point operations required to perform a single forward and backward pass. Model FLOPS are hardware- and implementation- independent, and only depend on the underlying model. In each step, the number of FLOPS is computed via the following formulas: - -``` -tokens_per_batch = global_batch_size \* seq_len - -FLOPS_per_step = 6 \* tokens_per_batch \* num_params -``` - -where `seq_len` is the sequence length and `num_params` is the number of parameters in the model. We note that this estimation assumes that the input dimensionality is much larger than the input sequence length (`d_model >> seq_len`). If this assumption is violated the self-attention FLOPs start to be significant enough and this expression will underestimate the true MFU. - -Based on the step time and the hardware details (numbers of chips and the peak FLOPS per chip), we can compute Model FLOPS Utilization (MFU), which measures how effectively our implementation is using the underlying hardware. Achieving 100% MFU means that the hardware is being used perfectly by that model. We calculate MFU using the following formula: - -``` -model_FLOPS_utilization = FLOPS_per_step / step_time(s) / chip_count / FLOPS_per_chip -``` - -When training a GPT-2 model with 2B parameters with the XLA FSDP configuration file above on a Cloud TPU v4-8, we measure a step time of 4.191s. 
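To make the arithmetic concrete, here is a small worked example added for illustration (not from the original post); the chip count and per-chip peak FLOPS are assumptions (a v4-8 slice has 4 chips, and we take roughly 275 peak bf16 TFLOPS per TPU v4 chip):

```
# Illustrative MFU calculation for the 2B-parameter GPT-2 run above.
tokens_per_batch = 131_072      # global batch size * sequence length (from Table 1 below)
num_params = 2.1e9              # ~2B-parameter GPT-2 configuration
flops_per_step = 6 * tokens_per_batch * num_params   # ~1.65 PFLOPS per step

step_time = 4.191               # seconds, as measured above
chip_count = 4                  # assumed: a v4-8 slice has 4 chips
flops_per_chip = 275e12         # assumed peak bf16 FLOPS per TPU v4 chip

mfu = flops_per_step / step_time / chip_count / flops_per_chip
print(f"{mfu:.1%}")             # ~35.8%, close to the 35.7% reported below (which uses the exact parameter count)
```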
Using the above formula, we calculate 35.7% MFU on a v4-8. For further details on calculating MFU, refer to the [PaLM paper](https://arxiv.org/pdf/2204.02311.pdf). - -The table below presents MFU for GPT-2 models with sizes between 2B and 128B, with a sequence length of 1024. - - - -|**TPU NumCores**|**v4-8**|**v4-64**|**v4-128**|**v4-128**|**v4-256**|**v4-512**| -| - | - | - | - | - | - | - | -|**# of Tokens / Batch**|131,072|524,288|524,288|524,288|1,048,576|1,048,576| -|**# of Parameters**|2B|16B|20B|32B|64B|128B| -|**Step Time (ms)**|4,191|14,592|7,824|12,970|25,653|30,460| -|**PFLOPS / Step**|1\.65|50|62|101|404|809| -|**MFU**|35\.7%|38\.8%|45\.1%|44\.4%|44\.7%|37\.7%| - -**Table 1: GPT-2 model FLOPS utilization calculation details** - -Among these configurations, MFU peaks at 45.1% for the 20B parameter model on v4-128. This result compares favorably to, for example, 41.5% MFU for [a 22B Megatron-like model](https://arxiv.org/pdf/2205.05198.pdf). - -There are two actionable insights from these experiments: - -First, simply increasing the number of chips without increasing the batch size generally means lower FLOPS utilization, because more time is spent on sharing the model shards. FSDP uses all-reduce communication collectives which are not asynchronous, which means that chip-to-chip communication cannot be overlapped with computation. As the number of chips increases, the number of model shards that must be communicated increases, and so we should expect the portion of the step time spent on communication to increase with the number of chips. - -Second, increasing the batch size generally means better FLOPS utilization. As the number of chips increases, the memory footprint of the model decreases, which often frees up high bandwidth memory (HBM) to scale up the global batch size. With a larger global batch size, the number of tokens processed in each step increases, and thus, so does the FLOPS per step. As long as the step time does not increase proportionally, we expect a larger global batch size to improve MFU. - -Therefore, to maximize the MFU, we recommend training with the largest global batch size possible that can fit in the HBM of the TPU slice, using FSDP to reduce memory required for the model parameters. - -## Training Very Large Models (tested to 128B parameters) ## - -When using PyTorch/XLA, tensors must be initialized on the CPU before being moved to the XLA device. This means one may encounter host-side out-of-memory errors if the model is sufficiently large, even though the model can fit in the device HBM after sharding. To avoid this, we must defer each submodule’s initialization until it is FSDP wrapped, which ensures that submodules are sharded as soon as their values are populated, avoiding host-side limitations. - -Below, we explain how to modify a local copy of the Hugging Face transformers repository to train a GPT-2 model with up to 128B parameters using this technique. - -First, using the commands below, install torchdistX, which is a library containing experimental PyTorch Distributed features. This is the engine behind deferred initialization, and allows you to create tensors that don’t require immediate storage and can be materialized later. You also need to install a specific PyTorch/XLA 2.0 version that takes advantage of this package; note that you must uninstall PyTorch and PyTorch/XLA first, if you installed them earlier. 
- -``` -pip3 install torch==2.0 --index-url https://download.pytorch.org/whl/test/cpu --user -pip3 install torch_xla[torchdistx] -f https://storage.googleapis.com/tpu-pytorch/wheels/tpuvm/experimental/torch_xla-2.0-cp38-cp38-linux_x86_64.whl -``` - -Next, apply the following changes to your local copy of Hugging Face Transformers: - -In `src/transformers/trainer.py`, add the following function in `_wrap_model` on the line immediately prior to PyTorch/XLA FSDP wrapping: - - -``` -from torchdistx import deferred_init - -def _init_with_torchdistX(module): - def check_fn(k): - return not isinstance(k, FSDP) - deferred_init.materialize_module(module, check_fn=check_fn) -``` - -The function `materialize_module` will initialize the model tensors if `check_fn` returns `True`. In this case, `check_fn` checks whether the module has been FSDP wrapped. - -Within `_wrap_model`, modify your FSDP wrapping to accept the additional argument `param_init_fn=_init_with_torchdistX`: - -``` -self.model = model = FSDP( - model, - auto_wrap_policy=auto_wrap_policy, - auto_wrapper_callable=auto_wrapper_callable, - param_init_fn=_init_with_torchdistX, - **fsdp_kwargs, - ) -``` - -In `examples/pytorch/language-modeling/run_clm.py`, add the following import statement at the beginning of the file: - - -``` -from torchdistx import deferred_init -``` - -Edit the model initialization so that the model is wrapped with `deferred_init.deferred_init` by replacing the line - - -``` -model = AutoModelForCausalLM.from_config(config) -``` - -with - -``` -model = deferred_init.deferred_init(AutoModelForCausalLM.from_config, config) -``` - -Note that this assumes you are supplying your own model configuration file. Otherwise, you should modify your model initialization statement accordingly. - -You should also comment out these two lines which immediately follow the line above: - -``` -n_params = sum({p.data_ptr(): p.numel() for p in model.parameters()}.values()) -logger.info(f"Training new model from scratch - Total size={n_params/2**20:.2f}M params") -``` - -They will cause an error if left unmodified, since the model tensors do not actually have storage when these lines are executed. - -With these changes, you can now run GPT-2 models with as many as 128B parameters, provided the accelerator size is suitably large. - -## Next Steps & Acknowledgements ## - -To learn more, the docs can be found [here](https://huggingface.co/docs/transformers/main_classes/trainer#pytorchxla-fully-sharded-data-parallel). We’d love to [hear from you](https://github.com/pytorch/xla#providing-feedback) if you run into any issues with FSDP in PyTorch/XLA, or just want to tell us about how you are using it. - -We are ecstatic about what’s ahead for PyTorch/XLA and invite the community to join us. PyTorch/XLA is developed fully in open source. So, please file issues, submit pull requests, and send RFCs to [GitHub](https://github.com/pytorch/xla) so that we can openly collaborate. - -We’d like to thank Ronghang Hu and Ross Girshick at Meta AI and Lysandre Debut, Sourab Mangrulkar, Sylvain Gugger and Arthur Zucker for all the support and collaboration. We’d also like to thank Jiewen Tan, Liyang Lu, Will Cromar, Vaibhav Singh, and Chandra Devarakonda for their assistance in preparing this post. - -Cheers! 
- -The PyTorch/XLA Team at Google diff --git a/_posts/2023-08-31-pytorch-xla-spmd.md b/_posts/2023-08-31-pytorch-xla-spmd.md deleted file mode 100644 index 715a1dc6ff4b..000000000000 --- a/_posts/2023-08-31-pytorch-xla-spmd.md +++ /dev/null @@ -1,200 +0,0 @@ ---- -layout: blog_detail -title: "PyTorch/XLA SPMD: Scale Up Model Training and Serving with Automatic Parallelization" -author: Yeounoh Chung, Jon Bolin, Milad Mohammadi, Jiewen Tan, Jack Cao, Joe Spisak, Alex Spiridonov, Shauheen Zahirazami, Steven Krawczyk, Wonjoo Lee Mohit Khatwani, Wanchao Liang, Vaibhav Singh ---- - - -Today, we are delighted to announce PyTorch/XLA SPMD: the integration of [GSPMD](https://arxiv.org/pdf/2105.04663.pdf) into PyTorch with an easy to use API. PyTorch developers seeking superior performance and scale can train and serve the largest neural networks while maximizing utilization of AI accelerators, such as Google Cloud TPUs. - - -## Introduction - -[GSPMD](https://arxiv.org/abs/2105.04663) is an automatic parallelization system for ML workloads. The XLA compiler transforms the single device program into a partitioned one with proper collectives, based on the user provided sharding hints. This allows developers to write PyTorch programs as if they are on a single large device without any custom sharded computation and/or collective communication ops to scale models. - -PyTorch/XLA SPMD allows PyTorch users to parallelize their ML workloads with GSPMD with less effort and with better performance. Some of the key highlights are: - - -* Better developer experience. Everything happens with a few [sharding annotations](#simple-example-with-sharding-annotation) from the user, and PyTorch/XLA SPMD achieves comparable performance to the most efficient PyTorch sharding implementation (see the Examples and Results section below). PyTorch/XLA SPMD separates the task of programming an ML model from the challenge of parallelization. Its automated approach to model sharding frees up the user from implementing the sharded version of ops with proper collectives in place. -* A single API that enables a large variety of parallelism algorithms (including data parallelism, fully sharded data parallelism, spatial partitioning tensor and pipeline parallelism, as well as combinations of these algorithms) for different ML workloads and model architectures. -* Industry-leading performance in large model training. PyTorch/XLA SPMD brings the powerful XLA GSPMD to PyTorch, enabling users to harness the full power of Google Cloud TPUs. -* Enabling PyTorch and JAX developers take advantage of the same underlying XLA API to scale models. - - -## Key Concepts - -The key concepts behind the sharding annotation API are: 1) Mesh, 2) Partition Spec, and 3) `mark_sharding` API to express sharding intent using Mesh and Partition Spec. A more detailed design overview is available as a user guide [here](https://github.com/pytorch/xla/blob/master/docs/spmd.md). - - -### Mesh - -For a given cluster of devices, a physical mesh is a representation of the interconnect topology. - -We derive a logical mesh based on this topology to create sub-groups of devices which can be used for partitioning different axes of tensors in a model. We apply sharding annotations to map the program across the logical mesh; this automatically inserts communication collectives in the program graph to support functional correctness (see the figure below). 
- - -![SPMD on PyTorch/XLA](/assets/images/pytorch-xla-spmd/fig1.png){:style="width:100%;"} - - -We abstract logical mesh with [Mesh API](https://github.com/pytorch/xla/blob/028df4da388468fa9a41b1f98ea08bfce13b4c63/torch_xla/experimental/xla_sharding.py#L16). The axes of the logical Mesh can be named. Here is an example: - -``` -import numpy as np -import torch_xla.runtime as xr -import torch_xla.experimental.xla_sharding as xs -from torch_xla.experimental.xla_sharding import Mesh - -# Enable XLA SPMD execution mode. -xr.use_spmd() - -# Assuming you are running on a TPU host that has 8 devices attached -num_devices = xr.global_runtime_device_count() -# mesh shape will be (4,2) in this example -mesh_shape = (num_devices // 2, 2) -device_ids = np.array(range(num_devices)) -# axis_names 'x' nad 'y' are optional -mesh = Mesh(device_ids, mesh_shape, ('x', 'y')) - -mesh.get_logical_mesh() ->> array([[0, 1], - [2, 3], - [4, 5], - [6, 7]]) -mesh.shape() ->> OrderedDict([('x', 4), ('y', 2)]) -``` - -### Partition Spec - -partition_spec has the same rank as the input tensor. Each dimension describes how the corresponding input tensor dimension is sharded across the device mesh (logically defined by mesh_shape). `partition_spec` is a tuple of `device_mesh` dimension `index`, None, or a tuple of mesh dimension indices. The `index` can be an `int` or `str` if the corresponding mesh dimension is named. This specifies how each input rank is sharded (`index` to `mesh_shape`) or replicated (`None`). - -``` -# Provide optional mesh axis names and use them in the partition spec -mesh = Mesh(device_ids, (4, 2), ('data', 'model')) -partition_spec = ('model', 'data') -xs.mark_sharding(input_tensor, mesh, partition_spec) -``` - -We support all three types of sharding described in the original [GSPMD](https://arxiv.org/abs/2105.04663) paper. For instance, one can specify partial replication like this: - -``` -# Provide optional mesh axis names and use them in the partition spec -mesh = Mesh(device_ids, (2, 2, 2), ('x', 'y', 'z')) - -# evenly shard across x and z and replicate among y -partition_spec = ('x', 'z') # equivalent to ('x', None, 'z') -xs.mark_sharding(input_tensor, mesh, partition_spec) -``` - -### Simple Example With Sharding Annotation - -Users can annotate native PyTorch tensors using the `mark_sharding` API ([src](https://github.com/pytorch/xla/blob/9a5fdf3920c18275cf7dba785193636f1b39ced9/torch_xla/experimental/xla_sharding.py#L388)). This takes `torch.Tensor` as input and returns a [XLAShardedTensor](https://github.com/pytorch/xla/blob/03991d44a0a0297ced3ba9fc10ba451a4b6c94ab/torch_xla/experimental/xla_sharded_tensor.py#L55-L62) as output. - -``` -def mark_sharding(t: Union[torch.Tensor, XLAShardedTensor], mesh: Mesh, partition_spec: Tuple[Union[int, None]]) -> XLAShardedTensor -``` - -Invoking `mark_sharding` API takes a user defined logical [mesh](#mesh) and [partition_spec](#partition-spec) and generates a sharding annotation for the XLA compiler. The sharding specification is attached to the `XLATensor`, as well as the original input tensor. 
Here is a simple usage example from the [RFC](https://github.com/pytorch/xla/issues/3871), to illustrate how the sharding annotation API works: - -``` -import numpy as np -import torch -import torch_xla.core.xla_model as xm -import torch_xla.runtime as xr -import torch_xla.experimental.xla_sharding as xs -from torch_xla.experimental.xla_sharded_tensor import XLAShardedTensor -from torch_xla.experimental.xla_sharding import Mesh - -# Enable XLA SPMD execution mode. -xr.use_spmd() - -# Device mesh, this and partition spec as well as the input tensor shape define the individual shard shape. -num_devices = xr.global_runtime_device_count() -mesh_shape = (2, num_devices // 2) # 2x4 on v3-8, 2x2 on v4-8 -device_ids = np.array(range(num_devices)) -mesh = Mesh(device_ids, mesh_shape, ('x', 'y')) - -t = torch.randn(8, 4).to(xm.xla_device()) - -# Mesh partitioning, each device holds 1/8-th of the input -partition_spec = (0, 1) -m1_sharded = xs.mark_sharding(t, mesh, partition_spec) -assert isinstance(m1_sharded, XLAShardedTensor) -# Note that the sharding annotation is also updated in place on t -``` - -We can annotate different tensors in the PyTorch program to enable different parallelism techniques, as described in the comment below: - -``` -# Sharding annotate the linear layer weights. SimpleLinear() is a nn.Module. -model = SimpleLinear().to(xm.xla_device()) -xs.mark_sharding(model.fc1.weight, mesh, partition_spec) - -# Training loop -model.train() -for step, (data, target) in enumerate(loader): - # Assumes `loader` returns data, target on XLA device - optimizer.zero_grad() - # Sharding annotate input data, we can shard any input - # dimensions. Sharding the batch dimension enables - # data parallelism, sharding the feature dimension enables - # spatial partitioning. - xs.mark_sharding(data, mesh, partition_spec) - output = model(data) - loss = loss_fn(output, target) - loss.backward() - optimizer.step() - xm.mark_step() -``` - -More complete unit test cases and integration test examples are available in the PyTorch/XLA [repo](https://github.com/pytorch/xla/tree/r2.0/test/spmd). - - -## Results - - -### Performance - -We measured the performance of PyTorch/XLA SPMD using a GPT-2 model ([src](https://github.com/pytorch-tpu/transformers/tree/yeounoh_gpt2_spmd)) and compared it with [user-mode FSDP](https://pytorch.org/blog/scaling-pytorch-models-on-cloud-tpus-with-fsdp/). - -Here, SPMD applies the same sharding scheme as the FSDP plot (i.e. 1D sharding). Users are expected to achieve better MFU results by exploring more advanced SPMD sharding schemes. - -![SPMD vs. FSDP](/assets/images/pytorch-xla-spmd/fig2.png){:style="width:100%; max-width: 600px; display: block; margin-left: auto; margin-right: auto"} - - -We use Model FLOPS Utilization (MFU) as a metric for comparison. MFU is “the ratio of the observed throughput relative to the theoretical maximum throughput of a system operating at peak FLOPs” ([PaLM paper](https://arxiv.org/pdf/2204.02311.pdf)). - - -``` -flops_per_step = 6 * global_batch_size * seq_len * num_params -model_flops_utilization = flops_per_step / step_time(s) / chip_count / flops_per_chip -``` - - -This estimation assumes that the input dimensionality is much larger than the input sequence length (d_model >> seq_len). If this assumption is violated, the self-attention FLOPs start to be significant enough and this expression will underestimate the true MFU. 
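As a pointer toward the "more advanced SPMD sharding schemes" mentioned above, here is a short, illustrative sketch of 2D sharding that reuses the mesh, model, and data names from the example earlier in this post (they are placeholders, not a benchmarked configuration):

```
# Illustrative only: shard a 2D weight across both mesh axes ('x', 'y'),
# combining data-parallel-style and model-parallel-style partitioning.
two_d_spec = ('x', 'y')   # row dimension over 'x', column dimension over 'y'
xs.mark_sharding(model.fc1.weight, mesh, two_d_spec)

# Shard the input batch dimension over 'x' only and replicate over 'y'
# (None means "replicated along that tensor dimension").
xs.mark_sharding(data, mesh, ('x', None))
```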
- - -### Scalability - -One of the core benefits of SPMD is the flexible partitioning which can be used to save accelerator memory (HBM) usage and improve scalability. For scalability analysis, we present two studies: 1) we examine the peak HBM usage across 4 model sizes using Hugging Face transformers (GPT-2) as the base implementation; 2) we examine the peak HBM usage with [spatial partitioning](https://cloud.google.com/blog/products/ai-machine-learning/train-ml-models-on-large-images-and-3d-volumes-with-spatial-partitioning-on-cloud-tpus). - - - -![Peak HBM Utilization](/assets/images/pytorch-xla-spmd/fig3.png){:style="width:100%; max-width: 600px; display: block; margin-left: auto; margin-right: auto"} - - -The above figure illustrates that the peak memory footprint of the unsharded 2B-parameter model stands at 26GB (red dashed line). Sharding model weights (model parallelism) reduces the peak memory footprint, and thus, enables larger model training with a given TPU pod slice. In these experiments, we achieved up to 39.75% MFU on a 4B-parameter model on Google Cloud TPU v4-16. - -We also ran an input batch scalability test using [spatial partitioning](https://cloud.google.com/blog/products/ai-machine-learning/train-ml-models-on-large-images-and-3d-volumes-with-spatial-partitioning-on-cloud-tpus) and a simple ResNet50 example ([src](https://github.com/pytorch/xla/blob/master/test/spmd/test_train_spmd_imagenet.py)) on Cloud TPU v4-8. The input batch is commonly sharded across the batch dimension for data parallelism (DDP, FSDP), but PyTorch/XLA SPMD enables input sharding across input feature dimensions for spatial sharding. As shown in the figure below, one can push the per-device batch size to 512 with spatial partitioning, which is not possible with other data parallelism techniques. - - - -![Batch size scaling with spatial partitioning](/assets/images/pytorch-xla-spmd/fig4.png){:style="width:100%; max-width: 741px; display: block; margin-left: auto; margin-right: auto"} - - -## The Road Forward for PyTorch/XLA SPMD - -We are ecstatic about what’s ahead for PyTorch/XLA and invite the community to join us. SPMD is still experimental, and we continuously add new features to it. In future releases, we plan to address async dataloading, partially replicated sharding, and other improvements. We’d love to [hear from you](https://github.com/pytorch/xla#providing-feedback), answer your questions about PyTorch/XLA SPMD, and learn how you use SPMD. - -Cheers! - -The PyTorch/XLA Team at Google \ No newline at end of file diff --git a/_posts/2023-09-05-automated-trace-collection.md deleted file mode 100644 index 27e511672d40..000000000000 --- a/_posts/2023-09-05-automated-trace-collection.md +++ /dev/null @@ -1,117 +0,0 @@ ---- -layout: blog_detail -title: "Automated trace collection and analysis" -author: Anupam Bhatnagar, Brian Coutinho ---- - - -In this blog, we share how we enabled the collection and analysis of PyTorch Profiler traces for training workloads **without any user side code instrumentation**. We leveraged Dynolog - an open source daemon for CPU and GPU telemetry to collect PyTorch Profiler traces, and analyzed the collected traces using Holistic Trace Analysis - an open source library for analyzing PyTorch Profiler traces. This toolchain has allowed engineers at Meta to accelerate their performance optimization workflows. The keystone to our solution was implementing pre and post hooks for the base Optimizer class in PyTorch.
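As a rough illustration of that hook mechanism (not the actual Kineto/Dynolog integration), PyTorch 2.0 exposes step hooks on the base `Optimizer` class that can be used to mark iteration boundaries; the counter below is a hypothetical stand-in for the profiler-side bookkeeping:

```
import torch

# Hypothetical iteration counter; the real integration lives inside the profiler/Kineto.
state = {"iteration": 0}

def pre_step_hook(optimizer, args, kwargs):
    # Called right before every optimizer.step(): an iteration is about to complete.
    pass

def post_step_hook(optimizer, args, kwargs):
    # Called right after optimizer.step(): count completed iterations.
    state["iteration"] += 1

model = torch.nn.Linear(8, 8)
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
optimizer.register_step_pre_hook(pre_step_hook)
optimizer.register_step_post_hook(post_step_hook)

loss = model(torch.randn(4, 8)).sum()
loss.backward()
optimizer.step()            # both hooks fire around this call
print(state["iteration"])   # 1
```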
We demo PyTorch trace collection using Dynolog in a short video. - - -## Problem - -Software developers at Meta run a large number of distributed training runs daily. In order to ensure that GPUs are being used effectively it is necessary to measure and analyze GPU performance for all jobs. Moreover, developers need the capability to introspect models and understand how CPUs and GPUs interact to debug performance issues. Developers build initial prototypes using a handful of GPUs and the production versions scale out to hundreds or thousands of GPUs, serving numerous business use cases such as generative AI, recommendation systems, ad ranking etc. - -Given the scale at Meta, it is necessary to have toolchains for performance measurement and monitoring which have low overhead and operate seamlessly with each other, to maintain high developer efficiency. - -In this blog, we describe how we use the PyTorch Profiler, Dynolog (a telemetry daemon) and Holistic Trace Analysis (a performance debugging library) to collect traces without any user side code instrumentation and analyze them to identify jobs with low GPU utilization. - - -## Solution - -The diagram below shares an overview of how the toolchain works together. - - - -1. User launches a PyTorch application. -2. A training service or user triggers a profiling session using the Dynolog CLI which sends a request over the network to the Dynolog daemon. -3. Dynolog daemon relays the profiling configuration to the PyTorch application, setting it temporarily in a profiling mode. -4. PyTorch Profiler collects a trace and stores it to the database (e.g., network file system or S3 bucket). -5. The collected traces are then analyzed using Holistic Trace Analysis (HTA). - -![Figure 1: Dynolog, PyTorch Profiler and HTA toolchain workflow](/assets/images/dyno_hta.png){:style="width:100%; max-width: 662px; display: block; margin-left: auto; margin-right: auto"} - -
        -Figure 1: Dynolog, PyTorch Profiler and HTA toolchain workflow -
        - -Let’s dig a bit deeper in each of the components. - - -### Dynolog - -[Dynolog](https://developers.facebook.com/blog/post/2022/11/16/dynolog-open-source-system-observability/) is a lightweight monitoring daemon for heterogeneous CPU-GPU systems. It supports continuous monitoring of [performance metrics](https://github.com/facebookincubator/dynolog/blob/main/docs/Metrics.md) from the CPU (utilization, network bandwidth, instructions/second) and GPU (SM Occupancy, DRAM bandwidth, GPU power draw). Additionally, dynolog exports APIs to collect deep-dive profiling data that can be accessed via the dyno CLI. - -One of the chief integrations Dynolog offers is interfacing with the [PyTorch Profiler](https://pytorch.org/tutorials/recipes/recipes/profiler_recipe.html). This enables [on-demand remote tracing](https://pytorch.org/blog/performance-debugging-of-production-pytorch-models-at-meta/) using a single command to trace thousands of servers. This can be accomplished by using the `dyno gputrace` command. - - -### PyTorch Profiler - -GPU kernels execute asynchronously, and GPU-side support is needed to create the trace. NVIDIA provides this visibility via the CUPTI library. Kineto is the subsystem within Profiler that interfaces with CUPTI. The [PyTorch Profiler](https://pytorch.org/blog/introducing-pytorch-profiler-the-new-and-improved-performance-tool/) leverages the [Kineto library](https://github.com/pytorch/kineto) to collect GPU traces. To enable automated profiling of training workloads at scale **without any user side code instrumentation** we made a few fundamental changes to PyTorch. These changes enable trace collection without any user intervention. - - - -* Registration:** **First, we modified PyTorch to register with the Dynolog daemon on start up. This feature is switched on by setting the environment variable KINETO_USE_DAEMON=True. With this environment variable set to True, the PyTorch Profiler periodically polls Dynolog to check for on-demand tracing requests. -* Iteration hooks: Then, we [implemented pre and post hooks for the base Optimizer class](https://github.com/pytorch/pytorch/pull/89176). This allowed us to annotate start/end of training iterations. The profiler is then aware of the iteration count and can safely capture a fixed number of iterations in the trace. - - -### Holistic Trace Analysis (HTA) - -ML researchers and engineers often struggle to computationally scale up their models as they are unaware of the performance bottlenecks in their workloads. Large distributed training jobs could generate thousands of traces, containing way too much data for a human to inspect. This is where [Holistic Trace Analysis](https://pytorch.org/blog/trace-analysis-for-masses/) comes in. HTA is an open source library for performance analysis - it takes as input PyTorch Profiler traces and up-levels the performance information contained in them. Its goal is to help researchers and engineers achieve the best performance from the hardware stack. To aid performance debugging HTA provides the following features (partial list): - - - -* [Temporal Breakdown](https://hta.readthedocs.io/en/latest/source/features/temporal_breakdown.html): Breakdown of GPU time in terms of time spent in computation, communication, memory events, and idle time on a single node and across all ranks. 
-* [Idle Time Breakdown](https://hta.readthedocs.io/en/latest/source/features/idle_time_breakdown.html): Breakdown of GPU idle time into waiting for the host, waiting for another kernel or attributed to an unknown cause. -* [Kernel Breakdown](https://hta.readthedocs.io/en/latest/source/features/kernel_breakdown.html): Find kernels with the longest duration on each rank. -* [Kernel Duration Distribution](https://hta.readthedocs.io/en/latest/source/features/kernel_breakdown.html#kernel-duration-distribution): Distribution of average time taken by longest kernels across different ranks. -* [Communication Computation Overlap](https://hta.readthedocs.io/en/latest/source/features/comm_comp_overlap.html): Calculate the percentage of time when communication overlaps computation. - -We invite you to check out these [Jupyter notebooks](https://github.com/facebookresearch/HolisticTraceAnalysis/tree/main/examples) to see what HTA can do for you. If you are a first time user we recommend starting with the [trace_analysis_demo](https://github.com/facebookresearch/HolisticTraceAnalysis/blob/main/examples/trace_analysis_demo.ipynb) notebook. - -To summarize, Dynolog allows us to collect PyTorch Profiler traces on-the-fly in a scalable manner. Furthermore, by leveraging HTA we can automate performance analysis and identify bottlenecks. At Meta, we use the Dynolog, PyTorch Profiler and HTA toolchain to accelerate our performance optimization workflows. - - -## Demo - -We share a screencast showcasing trace collection without any user side code instrumentation for a toy PyTorch program. The demo runs in a docker container and the trace collection is triggered using Dynolog. HTA can be used to subsequently analyze the collected trace. - - - -## FAQs - -_Q. What else can `dyno gputrace` do for me?_ - -The `dyno gputrace` command supports several custom PyTorch Profiler options: - - - -* capturing python stacks -* memory profiling -* record input shapes - -Please run `dyno gputrace --help` for all the options. - -_Q. Does Dynolog collect hardware performance metrics?_ - -Dynolog can also be used for always-on monitoring: - - - -* It incorporates out-of-box [GPU performance monitoring](https://github.com/facebookincubator/dynolog/tree/main#gpu-monitoring) for NVIDIA GPUs using [DCGM](https://docs.nvidia.com/datacenter/dcgm/latest/user-guide/index.html#). -* Dynolog provides basic Linux kernel [performance metrics](https://github.com/facebookincubator/dynolog/blob/main/docs/Metrics.md) including CPU, network and IO resource usage. -* Dynolog manages hardware performance counters for micro-architecture specific events related to CPU Cache, TLBs etc on Intel and AMD CPUs. - -_Q: How can I build the Docker image used in the demo?_ - -The dockerfile is available [here](https://github.com/facebookincubator/dynolog/blob/main/dynolog_hta.dockerfile). Use the command below to build the Docker image. - - -``` -docker build -f /path/to/dynolog_repo/dynolog_hta.dockerfile -t . -``` - - -_Q. How can I run the docker image?_ - -You can refer to this [cheat sheet](https://gist.github.com/anupambhatnagar/07ebff374bc45e4b63eb42893cca7e87) to run the Docker image. 
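To round out the toolchain description, here is a hedged sketch of loading a collected trace into HTA; the class and method names follow the HTA documentation linked above, and the trace directory is a placeholder:

```
# A minimal HTA session (illustrative; the trace path is hypothetical).
from hta.trace_analysis import TraceAnalysis

# Point HTA at a directory containing the PyTorch Profiler traces for one job.
analyzer = TraceAnalysis(trace_dir="/path/to/collected/traces")

# Two of the analyses described in the feature list above.
temporal_df = analyzer.get_temporal_breakdown()   # compute / communication / memory / idle time
idle_df = analyzer.get_idle_time_breakdown()      # why the GPU was idle, per rank
```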
diff --git a/_posts/2023-09-06-graphcore-joins-pytorch.md b/_posts/2023-09-06-graphcore-joins-pytorch.md deleted file mode 100644 index ce982178faa8..000000000000 --- a/_posts/2023-09-06-graphcore-joins-pytorch.md +++ /dev/null @@ -1,30 +0,0 @@ ---- -layout: blog_detail -title: "Graphcore Joins the PyTorch Foundation as a General Member" ---- - -![Graphcore logo](/assets/images/graphcore-logo.jpg){:style="max-width:350px;float:right;margin: 20px;"} - -The PyTorch Foundation, a neutral home for the deep learning community to collaborate on the open source PyTorch framework and ecosystem, is announcing today that Graphcore has joined as a general member. - -Graphcore is a UK-based company that specializes in designing and manufacturing AI accelerators, hardware and software specifically tailored for artificial intelligence and machine learning workloads. - -“We’re thrilled that PyTorch is the leading framework for development on the Graphcore platform,” said Executive Director of the PyTorch Foundation Ibrahim Haddad. “Graphcore has played an important role in the hardware and open source space, and we look forward to their continued contributions to PyTorch.” - -Graphcore has contributed to the PyTorch ecosystem by developing integrations to run on their IPU hardware. These integrations enable researchers and practitioners to use their preferred frameworks while taking advantage of Graphcore's specialized hardware. - -“At Graphcore we’re truly aligned with PyTorch’s objective of reducing the barrier of entry to AI practitioners. By supporting a native PyTorch software environment for IPUs we are giving developers access to new underlying hardware, designed from the ground up for AI, to help unlock new AI techniques to improve efficiency or performance and to drive breakthroughs in AI research and applications, with the same user-friendly PyTorch framework they know and expect. We look forward to contributing to and growing the global AI community as an active member of the PyTorch Foundation and are proud to be the first general member.” Anthony Barbier, Software Frameworks Lead at Graphcore. - -To learn more about how you can be a part of the PyTorch Foundation, visit our [website](https://pytorch.org/join). - -## About Graphcore - -Graphcore compute systems are accelerating the AI revolution. Powered by the groundbreaking Intelligence Processing Unit (IPU), Graphcore delivers leading-edge AI performance with unprecedented efficiency. IPUs are used around the world by organisations building their intelligent compute capabilities, including AI-centric startups, large multinational corporations and both public and private research institutions. Graphcore is backed by some of the world’s leading investors and has attracted more than $700m of funding. The company is based in Bristol, UK, with offices across Europe, Asia and North America. - -## About PyTorch Foundation - -The PyTorch Foundation is a neutral home for the deep learning community to collaborate on the open source PyTorch framework and ecosystem. The PyTorch Foundation is supported by its members and leading contributors to the PyTorch open source project. The Foundation leverages resources provided by members and contributors to enable community discussions and collaboration. - -## About The Linux Foundation - -The Linux Foundation is the world’s leading home for collaboration on open source software, hardware, standards, and data. 
Linux Foundation projects are critical to the world’s infrastructure including Linux, Kubernetes, Node.js, ONAP, PyTorch, RISC-V, SPDX, OpenChain, and more. The Linux Foundation focuses on leveraging best practices and addressing the needs of contributors, users, and solution providers to create sustainable models for open collaboration. For more information, please visit us at linuxfoundation.org. The Linux Foundation has registered trademarks and uses trademarks. For a list of trademarks of The Linux Foundation, please see its [trademark usage page](https://www.linuxfoundation.org/trademark-usage). Linux is a registered trademark of Linus Torvalds. \ No newline at end of file diff --git a/_posts/2023-09-12-one-year-pytorch.md b/_posts/2023-09-12-one-year-pytorch.md deleted file mode 100644 index 222713ad415a..000000000000 --- a/_posts/2023-09-12-one-year-pytorch.md +++ /dev/null @@ -1,28 +0,0 @@ ---- -layout: blog_detail -title: "One Year of PyTorch Foundation" ---- - -It’s been one year since we announced the formation of the PyTorch Foundation! 🎉 - -In its inaugural year, the PyTorch Foundation made a significant impact by launching PyTorch 2.0, growing contributors and adding new member companies. We’re grateful to our founding members for their support to move the foundation forward. - -**A few milestones in the past year include:** - -💻 Over 600,000 repositories on GitHub -✅ 60% of AI implementations choosing PyTorch -📈 More than 20% year over year growth in new repositories -🤝 Over 12,000 commits since last year - -**And a look at what the foundation has been up to this past year:** - -![PyTorch project timeline](/assets/images/pytorch-timeline.svg){:style="width:100%; max-width: 662px; display: block; margin-left: auto; margin-right: auto"} - -We look forward to growing our community for the years to come through supporting our contributors, democratizing the AI field, and creating new innovations. - -We invite you to join us at this year's [PyTorch Conference](https://events.linuxfoundation.org/pytorch-conference/) on October 16-17 in San Francisco. Conference registration is filling up quickly, so take advantage of your chance to be part of this exciting event. - -Join us to stay informed about the latest announcements and have the opportunity to connect with both the founding members and new additions to the PyTorch community. 
- -With thanks and gratitude, -The PyTorch Foundation Team \ No newline at end of file diff --git a/_posts/2023-09-13-accelerated-cpu-inference.md b/_posts/2023-09-13-accelerated-cpu-inference.md deleted file mode 100644 index 4f94cb12a93f..000000000000 --- a/_posts/2023-09-13-accelerated-cpu-inference.md +++ /dev/null @@ -1,489 +0,0 @@ ---- -layout: blog_detail -title: "Accelerated CPU Inference with PyTorch Inductor using torch.compile" -author: Intel ---- - -## Story at a Glance - -* _Although the PyTorch* Inductor C++/OpenMP* backend has enabled users to take advantage of modern CPU architectures and parallel processing, it has lacked optimizations, resulting in the backend performing worse than eager mode in terms of end-to-end performance._ -* _Intel optimized the Inductor backend using a hybrid strategy that classified operations into two categories: Conv/GEMM and non-Conv/GEMM element-wise and reduction ops._ -* _For popular deep learning models, this hybrid strategy demonstrates promising performance improvements compared to eager mode and improves the C++/OpenMP backend’s efficiency and reliability for PyTorch models._ - ---- - - -## Inductor Backend Challenges - - -The PyTorch Inductor C++/OpenMP backend enables users to take advantage of modern CPU architectures and parallel processing to accelerate computations. - - -However, during the early stages of its development, the backend lacked some optimizations, which prevented it from fully utilizing the CPU computation capabilities. As a result, for most models the C++/OpenMP backend performed worse than eager mode in terms of end-to-end performance, with 45% of TorchBench, 100% of Hugging Face, and 75% of TIMM models performing worse than eager mode. - - -In this post, we highlight Intel’s optimizations to the Inductor CPU backend, including the technologies and results. - - -We optimized the backend by using a hybrid strategy that classified operations into two categories: Conv/GEMM and non-Conv/GEMM element-wise and reduction ops. Post-op fusion and weight prepacking using the oneDNN performance library were utilized to optimize the former, while explicit vectorization in C++ codegen was used to optimize the latter. - - -This hybrid strategy demonstrated promising performance improvements compared to eager mode, particularly on the popular deep learning models in the Inductor Hugging Face, TorchBench, and TIMM benchmark suites. Overall, Intel’s optimizations improve the C++/OpenMP backend's efficiency and reliability for PyTorch models. - -![Figure 1. Performance Speedup Ratio Trend](/assets/images/accelerated-cpu-inference/f1-pytorch-inference-speedup-ratio-trend-multi.png.rendition.intel.web.1648.927.png){:style="width:100%;"} - -**Figure 1**: Performance Speedup Ratio Trend - - - -### Performance Status of Intel Hybrid Optimizations - - -With the hybrid optimizations, the C++/OpenMP backend shows promising performance improvements compared to eager mode. We measured the performance of the three Inductor benchmark suites—TorchBench, Hugging Face, and TIMM—and the results are as follows. (_Note: we publish our performance data twice per week on [GitHub](http://github.com/pytorch/pytorch/issues/93531)._) - - -Overall, these optimizations help to ensure that the C++/OpenMP backend provides efficient and reliable support for PyTorch models.
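Before looking at the numbers, here is a rough sketch of how a user reaches this backend from Python. The model and shapes below are illustrative assumptions rather than one of the benchmark configurations; `torch.compile` uses the Inductor backend by default, so running the compiled module on a CPU-only machine exercises the C++/OpenMP code path discussed in this post.

```
import torch
import torch.nn as nn

# Toy Conv + ReLU module: the convolution-plus-elementwise pattern targeted by
# the oneDNN post-op fusion described later in this post. Illustrative only.
model = nn.Sequential(
    nn.Conv2d(3, 64, kernel_size=3, padding=1),
    nn.ReLU(),
).eval()

x = torch.randn(1, 3, 224, 224)

compiled_model = torch.compile(model)  # Inductor is the default backend

with torch.no_grad():
    out = compiled_model(x)
```

The weight-prepacking optimization discussed below is gated by the `config.cpp.weight_prepack` flag in the Inductor configuration (at the time of writing, reachable from Python as `torch._inductor.config.cpp.weight_prepack`).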
- - -### Passrate - -``` -+----------+------------+-------------+-------------+ -| Compiler | torchbench | huggingface | timm_models | -+----------+------------+-------------+-------------+ -| inductor | 93%, 56/60 | 96%, 44/46 | 100%, 61/61 | -+----------+------------+-------------+-------------+ -``` - -### Geometric mean speedup (Single-Socket Multi-threads) - -``` -+----------+------------+-------------+-------------+ -| Compiler | torchbench | huggingface | timm_models | -+----------+------------+-------------+-------------+ -| inductor | 1.39x | 1.20x | 1.73x | -+----------+------------+-------------+-------------+ -``` - -### Individual Model Performance - - -![Figure 2. TorchBench FP32 Performance (Single-Socket Multi-threads)](/assets/images/accelerated-cpu-inference/f2-torchbench-fp32-performance-multithread.png.rendition.intel.web.1648.927.png){:style="width:100%;"} - -**Figure 2**: TorchBench FP32 Performance (Single-Socket Multi-threads) - - - - - -![Figure 3. Hugging Face FP32 Performance (Single-Socket Multi-thread)](/assets/images/accelerated-cpu-inference/f3-huggingface-fp32-performance-multithread.png.rendition.intel.web.1648.927.png){:style="width:100%;margin-top: 3em;"} - -**Figure 3**: Hugging Face FP32 Performance (Single-Socket Multi-thread) - - - -![Figure 4. TIMM FP32 Performance (Single-Socket Multi-threads)](/assets/images/accelerated-cpu-inference/f4-timm-fp32-performance-multithread.png.rendition.intel.web.1648.927.png){:style="width:100%;margin-top: 3em;"} - -**Figure 4**: TIMM FP32 Performance (Single-Socket Multi-threads) - - -### Geometric mean speedup (Single-core Single-thread) - - -``` -+----------+------------+-------------+-------------+ -| Compiler | torchbench | huggingface | timm_models | -+----------+------------+-------------+-------------+ -| inductor | 1.29x | 1.15x | 1.37x | -+----------+------------+-------------+-------------+ -``` - - -![Figure 5. TorchBench FP32 Performance (Single-Socket Single-thread)](/assets/images/accelerated-cpu-inference/f5-torchbench-fp32-performance-single-thread.png.rendition.intel.web.1648.927.png){:style="width:100%;"} - -**Figure 5**: TorchBench FP32 Performance (Single-Socket Single-thread) - - - -![Figure 6. Hugging Face FP32 Performance (Single-Socket Single Thread)](/assets/images/accelerated-cpu-inference/f6-huggingface-fp32-performance-single-thread.png.rendition.intel.web.1648.927.png){:style="width:100%;margin-top: 3em;"} - -**Figure 6**: Hugging Face FP32 Performance (Single-Socket Single Thread) - - - -![Figure 7. TIMM FP32 Performance (Single-Socket Single-thread)](/assets/images/accelerated-cpu-inference/f7-timm-fp32-performance-single-thread.png.rendition.intel.web.1648.927.png){:style="width:100%;margin-top: 3em;"} - -**Figure 7**: TIMM FP32 Performance (Single-Socket Single-thread) - - -## Technical Deep Dive - -Now, let's take a closer look at the two primary optimizations used in the Inductor C++/OpenMP backend: - -1. weight prepacking and post-operation fusion via the oneDNN library -2. explicit vectorization in Inductor C++ codegen - -### Weight Prepacking & Post-op Fusion via oneDNN - -oneDNN, short for Intel® oneAPI Deep Neural Network Library, provides a range of post-op fusions (i.e., fusing convolution and matmul with their consecutive operations) that can benefit popular models. The [Intel® Extension for PyTorch](https://github.com/intel/intel-extension-for-pytorch) has implemented most of these fusions and has achieved significant performance improvements.
As a result, we have upstreamed all of these fusions that have been applied in Intel’s PyTorch extension to Inductor, enabling a wider range of models to benefit from these optimizations. We have defined these fusions as operators under the mkldnn namespace. This allows the Python module to invoke these mkldnn operations directly. - -Currently, the defined fused operations are as follows. You can find these defined fused operations at [RegisterMkldnnOpContextClass.cpp](https://github.com/pytorch/pytorch/blob/fe05266fda4f908130dea7cbac37e9264c0429a2/aten/src/ATen/native/mkldnn/RegisterMkldnnOpContextClass.cpp#L35-#L48). - -* `_linear_pointwise`: Fuses Linear and its post-unary element-wise operations -* `_linear_pointwise.binary`: Fuses Linear and its post-binary element-wise operations -* `_convolution_pointwise`: Fuses Convolution and its post-unary element-wise operations -* `_convolution_pointwise.binary`: Fuses Convolution and its post-binary element-wise operations - -The detailed fusion patterns are defined in the [mkldnn.py](https://github.com/pytorch/pytorch/blob/fe05266fda4f908130dea7cbac37e9264c0429a2/torch/_inductor/mkldnn.py#L774-#L818) file: `convolution/linear + sigmoid/hardsigmoid/tanh/hardtanh/hardswish/leaky_relu/gelu/relu/relu6/silu` and `convolution/linear + add/add_/iadd/sub/sub_` - -On the Inductor side, we apply these fusions on the FX graph that has been lowered. We have defined [mkldnn_fuse_fx](https://github.com/pytorch/pytorch/blob/fe05266fda4f908130dea7cbac37e9264c0429a2/torch/_inductor/mkldnn.py#L491) as the entry point to apply all the fusions. The code snippet for this is as follows: - - -``` -def mkldnn_fuse_fx(gm: torch.fx.GraphModule, example_inputs): - ... - gm = fuse_unary(gm) - gm = fuse_binary(gm) - ... - if config.cpp.weight_prepack: - gm = pack_module(gm) - return gm -``` - -In the `mkldnn_fuse_fx` function, we apply fusion on the FX graph that hasn’t been lowered yet. To fuse convolution/linear and their consecutive elementwise operations, we invoke `fuse_unary` and `fuse_binary` as follows: - -``` - gm = fuse_unary(gm) - gm = fuse_binary(gm) -``` - -In addition to the post-op fusion, we apply weight prepacking to improve the Conv/GEMM performance further: - -``` - gm = pack_module(gm) -``` - - -Weight prepacking involves rearranging the weight tensor in a blocked layout, which: - -* can improve vectorization and cache reuse compared to plain formats like NCHW or NHWC; -* can help avoid weight reordering at runtime, which reduces overhead and improves performance; and -* increases memory usage as the tradeoff. - -For these reasons, we provide the `config.cpp.weight_prepack` flag in Inductor to give users more control over this optimization, allowing them to enable it based on their specific needs. - - -### Explicit Vectorization in Inductor C++ Codegen - -Vectorization is a key optimization technique that can significantly improve the performance of numerical computations. By utilizing SIMD (Single Instruction, Multiple Data) instructions, vectorization enables multiple computations to be performed simultaneously on a single processor core, which can lead to significant performance improvements.
- -In the Inductor C++/OpenMP backend, we use [Intel® AVX2](https://github.com/pytorch/pytorch/blob/fe05266fda4f908130dea7cbac37e9264c0429a2/torch/_inductor/codecache.py#L372) and [Intel® AVX-512](https://github.com/pytorch/pytorch/blob/fe05266fda4f908130dea7cbac37e9264c0429a2/torch/_inductor/codecache.py#L359) ISA (Instruction Set Architecture) options for vectorization, leveraging the aten vectorization library to facilitate the implementation. Aten vectorization supports multiple platforms, including x86 and Arm, as well as multiple data types. It can be extended to support other ISAs easily by adding more [VecISA](https://github.com/pytorch/pytorch/blob/fe05266fda4f908130dea7cbac37e9264c0429a2/torch/_inductor/codecache.py#L275) sub-classes. This allows Inductor to easily support other platforms and data types in the future. - -Due to differences in platforms, the C++/OpenMP backend of Inductor starts by detecting the CPU features to determine the vectorization bit width at the beginning of code generation. By default, if the machine supports both AVX-512 and AVX2, the backend will choose 512-bit vectorization. - -If the hardware supports vectorization, the C++/OpenMP backend first detects whether the loop body can be vectorized or not. There are primarily three scenarios in which we are not able to generate vectorized kernels: - -1. The loop body lacks vector intrinsics support, e.g., `rand` and `atomic_add`. -2. The loop body lacks efficient vector intrinsics support, e.g., non-contiguous `load/store`. -3. Data types for which vectorization is not yet supported but is a work in progress, e.g., integer, double, half, and bfloat16. - -To address this issue, the C++/OpenMP backend uses [CppVecKernelChecker](https://github.com/pytorch/pytorch/blob/fe05266fda4f908130dea7cbac37e9264c0429a2/torch/_inductor/codegen/cpp.py#L1396) to detect whether all operations in a particular loop body can be vectorized or not. In general, we classify operations into two categories based on whether they depend on context. - -For most elementwise operations such as `add`, `sub`, `relu`, vectorization is straightforward, and their execution does not depend on context. - -However, for certain other operations, the semantics are more complex, and whether they can be vectorized depends on their context, which must be determined through static analysis. - -For example, let's consider the `where` operation, which takes in a mask, `true_value`, and `false_value`, where the mask value is loaded from a `uint8` tensor. The fx graph could be as follows: - - -``` -graph(): - %ops : [#users=9] = placeholder[target=ops] - %get_index : [#users=1] = call_module[target=get_index](args = (index0,), kwargs = {}) - %load : [#users=1] = call_method[target=load](args = (%ops, arg1_1, %get_index), kwargs = {}) - %to_dtype : [#users=1] = call_method[target=to_dtype](args = (%ops, %load, torch.bool), kwargs = {}) - ... - %where : [#users=1] = call_method[target=where](args = (%ops, %to_dtype, %to_dtype_2, %to_dtype_3), kwargs = {}) -``` - -`uint8` is a general-purpose data type: it can be used for arbitrary computation and is not limited to serving as a Boolean mask. Hence, we need to analyze its context statically. In particular, the [CppVecKernelChecker](https://github.com/pytorch/pytorch/blob/fe05266fda4f908130dea7cbac37e9264c0429a2/torch/_inductor/codegen/cpp.py#L1396) will check whether a `uint8` tensor is only used by `to_dtype` and `to_dtype` is only used by `where`. If yes, it could be vectorized. Otherwise, it will fall back to the scalar version.
The generated code could be as follows: - -Scalar Version - -``` -auto tmp0 = in_ptr0[i1 + (17*i0)]; -auto tmp3 = in_ptr1[i1 + (17*i0)]; -auto tmp1 = static_cast<bool>(tmp0); -auto tmp2 = static_cast<float>(-33.0); -auto tmp4 = tmp1 ? tmp2 : tmp3; -tmp5 = std::max(tmp5, tmp4); -``` - -Vectorization Version - -``` -float g_tmp_buffer_in_ptr0[16] = {0}; -// Convert the flag to float for vectorization. -flag_to_float(in_ptr0 + (16*i1) + (17*i0), g_tmp_buffer_in_ptr0, 16); -auto tmp0 = at::vec::Vectorized<float>::loadu(g_tmp_buffer_in_ptr0); -auto tmp3 = at::vec::Vectorized<float>::loadu(in_ptr1 + (16*i1) + (17*i0)); -auto tmp1 = (tmp0); -auto tmp2 = at::vec::Vectorized<float>(static_cast<float>(-33.0)); -auto tmp4 = decltype(tmp2)::blendv(tmp3, tmp2, tmp1); -``` - -In addition to context analysis, the C++/OpenMP backend also incorporates several other vectorization-related optimizations. These include: - -* Tiled kernel implementation for supporting transpose load - [cpp.py](http://github.com/pytorch/pytorch/blob/fe05266fda4f908130dea7cbac37e9264c0429a2/torch/_inductor/codegen/cpp.py#L1211) -* Data type demotion based on value range - [cpp.py](http://github.com/pytorch/pytorch/blob/fe05266fda4f908130dea7cbac37e9264c0429a2/torch/_inductor/codegen/cpp.py#L1647-#L1672) -* Replacement of the [sleef](http://github.com/shibatch/sleef/tree/e0a003ee838b75d11763aa9c3ef17bf71a725bff) implementation with the oneDNN/oneMKL implementation for optimizing aten vectorization - [#94577](http://github.com/pytorch/pytorch/pull/94577), [#92289](http://github.com/pytorch/pytorch/pull/92289), [#91613](http://github.com/pytorch/pytorch/pull/91613) - -In summary, we examined vectorization optimization in the Inductor C++ backend for FP32 training and inference across 150 benchmark models, with 90% of inference kernels and 71% of training kernels being vectorized. - -In terms of inference, a total of 28,185 CPP kernels were generated, with 25,579 (90%) of them being vectorized, while the remaining 10% were scalar. As for training, 103,084 kernels were generated, with 73,909 (71%) being vectorized and 29% not vectorized. - -The results indicate that **the vectorization of inference kernels is quite impressive** (there is still some work to be done on training kernels, since we have just started working on training). The remaining non-vectorized kernels are analyzed in different categories, highlighting the next steps to improve vectorization coverage: index-related operations, int64 support, vertical reduction, vectorization with fallback, and more. - -In addition, we also optimized the C++/OpenMP backend with other optimizations like buffer reuse and CppWrapper. - -#### Future Work - -Next, we will continue optimizing the C++/OpenMP backend and extend it to support more data types. This includes: - -1. Improve vectorization coverage -2. Support and optimize low-precision kernels, including BF16, FP16, and quantization -3. Training optimization -4. Loop tiling -5. Autotune -6. Further fusion optimization of Conv/GEMM kernels -7. Explore alternative codegen paths: clang/llvm/triton - -## Summary - -The Inductor C++/OpenMP backend is a flexible and efficient backend for the CPU. This blog describes the optimizations used in the C++/OpenMP backend of Inductor for inference and training across three benchmark suites – TorchBench, Hugging Face and TIMM. The primary optimizations include weight prepacking and post-operation fusion via the oneDNN library, as well as explicit vectorization in Inductor C++ codegen using AVX2 and AVX-512 instructions.
- -The results show that 90% of inference kernels and 71% of training kernels are vectorized, indicating impressive vectorization for inference and room for improvement in training. In addition, we also applied other optimizations like buffer reuse and CppWrapper, and we will continue to focus on the future work mentioned above to further improve performance. - -### Acknowledgements - - -The results presented in this blog post are the culmination of a collaborative effort between the Intel PyTorch team and Meta. We would like to express our sincere gratitude to [@jansel](http://dev-discuss.pytorch.org/u/jansel), [@desertfire](http://dev-discuss.pytorch.org/u/desertfire), and [@Chillee](http://dev-discuss.pytorch.org/u/chillee) for their invaluable contributions and unwavering support throughout the development process. Their expertise and dedication have been instrumental in achieving the optimizations and performance improvements discussed here. - - -### Configuration Details - -#### Hardware Details
| Item | Value |
|---|---|
| Manufacturer | Amazon EC2 |
| Product Name | c6i.16xlarge |
| CPU Model | Intel(R) Xeon(R) Platinum 8375C CPU @ 2.90GHz |
| Installed Memory | 128GB (1x128GB DDR4 3200 MT/s [Unknown]) |
| OS | Ubuntu 22.04.2 LTS |
| Kernel | 5.19.0-1022-aws |
| Microcode | 0xd000389 |
| GCC | gcc (Ubuntu 11.3.0-1ubuntu1~22.04) 11.3.0 |
| GLIBC | ldd (Ubuntu GLIBC 2.35-0ubuntu3.1) 2.35 |
| Binutils | GNU ld (GNU Binutils for Ubuntu) 2.38 |
| Python | Python 3.10.6 |
| OpenSSL | OpenSSL 3.0.2 15 Mar 2022 (Library: OpenSSL 3.0.2 15 Mar 2022) |
#### Software Details

| SW | Nightly commit | Main commit |
|---|---|---|
| Pytorch | a977a12 | 0b1b063 |
| Torchbench | / | a0848e19 |
| torchaudio | 0a652f5 | d5b2996 |
| torchtext | c4ad5dd | 79100a6 |
| torchvision | f2009ab | b78d98b |
| torchdata | 5cb3e6d | f2bfd3d |
| dynamo_benchmarks | fea73cb | / |
        - - - -#### Configuration - -* Intel OpenMP -* Jemalloc - oversize_threshold:1,background_thread:true,metadata_thp:auto,dirty_decay_ms:-1,muzzy_decay_ms:-1 -* **Single-Socket Multi-threads:** #of Instances: 1; Cores/Instance: 32 -* **Single-Core Single-thread:** #of Instances: 1; Cores/Instance: 1 diff --git a/_posts/2023-09-25-inside-the-matrix.md b/_posts/2023-09-25-inside-the-matrix.md deleted file mode 100644 index 71ef8e6dbb6e..000000000000 --- a/_posts/2023-09-25-inside-the-matrix.md +++ /dev/null @@ -1,508 +0,0 @@ ---- -layout: blog_detail -title: "Inside the Matrix: Visualizing Matrix Multiplication, Attention and Beyond" -author: Basil Hosmer ---- - -_Use 3D to visualize matrix multiplication expressions, attention heads with real weights, and more._ - -Matrix multiplications (matmuls) are the building blocks of today’s ML models. This note presents [mm](https://bhosmer.github.io/mm/ref.html), a visualization tool for matmuls and compositions of matmuls. - -Matrix multiplication is inherently a three-dimensional operation. Because mm uses all three spatial dimensions, it can convey meaning more clearly and intuitively than the usual squares-on-paper idioms, especially (though not only) for visual/spatial thinkers. - -We also have room to _compose_ matmuls in geometrically consistent ways - so we can visualize big, compound structures like attention heads and MLP layers using the same rules as simple expressions. And more advanced features, like animating different matmul algorithms, partitioning for parallelism, and loading external data to explore the behavior of actual models, all build naturally on this foundation. - -mm is fully interactive, runs [in the browser](https://bhosmer.github.io/mm/) and keeps its complete state in the URL, so links are shareable sessions (the screenshots and videos in this note all have links that open the corresponding visualization in the tool). This [reference guide](https://bhosmer.github.io/mm/ref.html) describes all of the available functionality. - -We'll first introduce the visualization approach, build intuition by visualizing some simple matmuls and expressions, then dive into some more extended examples: - - - -1. **Pitch** - why is this way of visualizing better? -2. **Warmup - animations** - watching the canonical matmul decompositions in action -3. **Warmup - expressions** - a quick tour of some fundamental expression building blocks -4. **Inside an attention head** - an in-depth look at the structure, values and computation behavior of a couple of attention heads from GPT2 via [NanoGPT](https://github.com/karpathy/nanoGPT) -5. **Parallelizing attention** - visualizing attention head parallelization with examples from the recent [Blockwise Parallel Transformer](https://arxiv.org/pdf/2305.19370.pdf) paper -6. **Sizes in an attention layer** - what do the MHA and FFA halves of an attention layer look like together, when we visualize a whole layer as a single structure? How does the picture change during autoregressive decoding? -7. **LoRA** - a visual explanation of this elaboration of the attention head architecture -8. **Wrapup** - next steps and call for feedback - - -## 1 Pitch - -[mm](https://bhosmer.github.io/mm/ref.html)'s visualization approach is based on the premise that _matrix multiplication is fundamentally a three-dimensional operation_. 
- -In other words this: - -![matrix multiplication is fundamentally a three-dimensional operation](/assets/images/inside-the-matrix/matmul3.jpg){:style="width:100%; max-width: 478px; display: block; margin-left: auto; margin-right: auto"} - -is a sheet of paper trying to be this ([open in mm](https://bhosmer.github.io/mm/index.html?params=%7B%22expr%22%3A%22L%20%40%20R%22%2C%22name%22%3A%22L%20%40%20R%22%2C%22epilog%22%3A%22none%22%2C%22left%22%3A%7B%22name%22%3A%22L%22%2C%22matmul%22%3Afalse%2C%22h%22%3A32%2C%22w%22%3A24%2C%22init%22%3A%22expr%22%2C%22url%22%3A%22%22%2C%22min%22%3A-1%2C%22max%22%3A1%2C%22dropout%22%3A0%2C%22expr%22%3A%22-2%20*%20(Math.trunc(i%20%2F%208)%20%25%202)%20%2B%201%22%2C%22folder%22%3A%22open%22%7D%2C%22right%22%3A%7B%22name%22%3A%22R%22%2C%22matmul%22%3Afalse%2C%22h%22%3A24%2C%22w%22%3A32%2C%22init%22%3A%22expr%22%2C%22url%22%3A%22%22%2C%22min%22%3A-1%2C%22max%22%3A1%2C%22dropout%22%3A0%2C%22expr%22%3A%22-2%20*%20(Math.trunc(j%20%2F%208)%20%25%202)%20%2B%201%22%2C%22folder%22%3A%22open%22%7D%2C%22anim%22%3A%7B%22fuse%22%3A%22none%22%2C%22speed%22%3A20%2C%22hide%20inputs%22%3Afalse%2C%22alg%22%3A%22none%22%2C%22spin%22%3A0%2C%22folder%22%3A%22open%22%7D%2C%22block%22%3A%7B%22i%20blocks%22%3A1%2C%22j%20blocks%22%3A1%2C%22k%20blocks%22%3A1%7D%2C%22layout%22%3A%7B%22scheme%22%3A%22blocks%22%2C%22gap%22%3A3%2C%22scatter%22%3A0%2C%22molecule%22%3A1%2C%22blast%22%3A0%2C%22polarity%22%3A%22negative%22%2C%22left%20placement%22%3A%22left%22%2C%22right%20placement%22%3A%22top%22%2C%22result%20placement%22%3A%22front%22%2C%22folder%22%3A%22closed%22%7D%2C%22deco%22%3A%7B%22legends%22%3A6%2C%22shape%22%3Atrue%2C%22spotlight%22%3A2%2C%22row%20guides%22%3A1%2C%22flow%20guides%22%3A0.5%2C%22lens%20size%22%3A0.5%2C%22magnification%22%3A10%2C%22interior%20spotlight%22%3Afalse%2C%22axes%22%3Afalse%2C%22folder%22%3A%22closed%22%7D%2C%22viz%22%3A%7B%22sensitivity%22%3A%22semilocal%22%2C%22min%20size%22%3A0.196%2C%22min%20light%22%3A0.4%2C%22max%20light%22%3A0.6%2C%22elem%20scale%22%3A0.8227%2C%22zero%20hue%22%3A0.77%2C%22hue%20gap%22%3A0.74%2C%22hue%20spread%22%3A0.04%2C%22folder%22%3A%22open%22%7D%2C%22diag%22%3A%7B%22url%22%3A%22%22%7D%2C%22cam%22%3A%7B%22x%22%3A-48.763575165818956%2C%22y%22%3A43.72517618222101%2C%22z%22%3A33.70077275818966%2C%22target%22%3A%7B%22x%22%3A0%2C%22y%22%3A0%2C%22z%22%3A0%7D%7D%2C%22folder%22%3A%22closed%22%2C%22compress%22%3Afalse%7D)): - - - -![wrap the matmul around a cube](/assets/images/inside-the-matrix/initial.jpg){:style="width:100%"} - - -When we wrap the matmul around a cube this way, the correct relationships between argument shapes, result shape and shared dimensions all fall into place. - -Now the computation makes _geometric sense_: each location `i, j` in the result matrix anchors a vector running along the depth dimension `k` in the cube's interior, where the horizontal plane extending from row `i` in `L` and a vertical plane extending from column `j` in `R` intersect. Along this vector, pairs of `(i, k)` `(k, j)` elements from the left and right arguments meet and are multiplied, and the resulting products are summed along `k` and the result is deposited in location `i, j` of the result. 
- -(Jumping ahead momentarily, [here's an animation](https://bhosmer.github.io/mm/index.html?params=%7B%22expr%22%3A%22L%20%40%20R%22%2C%22name%22%3A%22L%20%40%20R%22%2C%22epilog%22%3A%22none%22%2C%22left%22%3A%7B%22name%22%3A%22L%22%2C%22matmul%22%3Afalse%2C%22h%22%3A32%2C%22w%22%3A24%2C%22init%22%3A%22expr%22%2C%22url%22%3A%22%22%2C%22min%22%3A-1%2C%22max%22%3A1%2C%22dropout%22%3A0%2C%22expr%22%3A%22-2%20*%20(Math.trunc(i%20%2F%208)%20%25%202)%20%2B%201%22%2C%22folder%22%3A%22open%22%7D%2C%22right%22%3A%7B%22name%22%3A%22R%22%2C%22matmul%22%3Afalse%2C%22h%22%3A24%2C%22w%22%3A32%2C%22init%22%3A%22expr%22%2C%22url%22%3A%22%22%2C%22min%22%3A-1%2C%22max%22%3A1%2C%22dropout%22%3A0%2C%22expr%22%3A%22-2%20*%20(Math.trunc(j%20%2F%208)%20%25%202)%20%2B%201%22%2C%22folder%22%3A%22open%22%7D%2C%22anim%22%3A%7B%22fuse%22%3A%22none%22%2C%22speed%22%3A48%2C%22hide%20inputs%22%3Afalse%2C%22alg%22%3A%22dotprod%20(row%20major)%22%2C%22spin%22%3A0%2C%22folder%22%3A%22open%22%7D%2C%22block%22%3A%7B%22i%20blocks%22%3A1%2C%22j%20blocks%22%3A1%2C%22k%20blocks%22%3A1%7D%2C%22layout%22%3A%7B%22scheme%22%3A%22blocks%22%2C%22gap%22%3A5%2C%22scatter%22%3A0%2C%22molecule%22%3A1%2C%22blast%22%3A0%2C%22polarity%22%3A%22negative%22%2C%22left%20placement%22%3A%22left%22%2C%22right%20placement%22%3A%22top%22%2C%22result%20placement%22%3A%22front%22%2C%22folder%22%3A%22open%22%7D%2C%22deco%22%3A%7B%22legends%22%3A6%2C%22shape%22%3Atrue%2C%22spotlight%22%3A2%2C%22row%20guides%22%3A1%2C%22flow%20guides%22%3A0.5%2C%22lens%20size%22%3A0.5%2C%22magnification%22%3A10%2C%22interior%20spotlight%22%3Afalse%2C%22axes%22%3Afalse%2C%22folder%22%3A%22closed%22%7D%2C%22viz%22%3A%7B%22sensitivity%22%3A%22semilocal%22%2C%22min%20size%22%3A0.196%2C%22min%20light%22%3A0.4%2C%22max%20light%22%3A0.6%2C%22elem%20scale%22%3A1%2C%22zero%20hue%22%3A0.77%2C%22hue%20gap%22%3A0.74%2C%22hue%20spread%22%3A0.04%2C%22folder%22%3A%22open%22%7D%2C%22diag%22%3A%7B%22url%22%3A%22%22%7D%2C%22cam%22%3A%7B%22x%22%3A-54.145594172414235%2C%22y%22%3A48.55110882721702%2C%22z%22%3A37.42031544768185%2C%22target%22%3A%7B%22x%22%3A0%2C%22y%22%3A0%2C%22z%22%3A0%7D%7D%2C%22folder%22%3A%22closed%22%2C%22compress%22%3Afalse%7D).) - -This is the _intuitive_ meaning of matrix multiplication: - - - -1. **project** two orthogonal matrices into the interior of a cube -2. **multiply** the pair of values at each intersection, forming a grid of products -3. **sum** along the third orthogonal dimension to produce a result matrix. - -For orientation, the tool displays an arrow in the cube's interior that points towards the result matrix, with a blue vane coming from the left argument and a **r**ed vane coming from the **r**ight argument. The tool also displays white guidelines to indicate the row axis of each matrix, though they're faint in this screenshot. - -The layout constraints are straightforward: - - - -* left argument and result must be adjoined along their shared **height** (i) dimension -* right argument and result must be adjoined along their shared **width** (j) dimension -* left and right arguments must be adjoined along their shared (left width/right height) dimension, which becomes the matmul’s **depth** (k) dimension - -This geometry gives us a solid foundation for visualizing all the standard matmul decompositions, and an intuitive basis for exploring nontrivially complex _compositions_ of matmuls, as we'll see below. 
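Before moving on to the animations, here is the same three-step picture in code. This is a small PyTorch sketch (the tensor names and sizes are ours, chosen to match the screenshots, and are not part of mm): the `einsum` builds the full i x j x k cube of pairwise products that mm renders in the cube's interior, and summing along `k` collapses that cube onto the result face.

```
import torch

L = torch.randn(32, 24)   # left argument,  shape (i, k)
R = torch.randn(24, 32)   # right argument, shape (k, j)

# Steps 1-2: "project" the two matrices into the cube and multiply at each
# intersection: P[i, j, k] = L[i, k] * R[k, j]  (no summation yet).
P = torch.einsum('ik,kj->ijk', L, R)

# Step 3: summing along the depth dimension k recovers the ordinary matmul.
assert torch.allclose(P.sum(dim=-1), L @ R, atol=1e-5)
```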
- - -## 2 Warmup - animations - -Before diving into some more complex examples, we'll run through a few intuition builders to get a feel for how things look and feel in this style of visualization. - - -### 2a Dot product - -First, the canonical algorithm - computing each result element by taking the dot product of the corresponding left row and right column. What we see in the animation is the sweep of multiplied value vectors through the cube’s interior, each delivering a summed result at the corresponding position. - -Here, `L` has blocks of rows filled with 1 (blue) or -1 (red); `R` has column blocks filled similarly. `k` is 24 here, so the result matrix (`L @ R`) has blue values of 24 and red values of -24 ([open in mm](https://bhosmer.github.io/mm/index.html?params=%7B%22expr%22%3A%22L%20%40%20R%22%2C%22name%22%3A%22L%20%40%20R%22%2C%22epilog%22%3A%22none%22%2C%22left%22%3A%7B%22name%22%3A%22L%22%2C%22matmul%22%3Afalse%2C%22h%22%3A32%2C%22w%22%3A24%2C%22init%22%3A%22expr%22%2C%22url%22%3A%22%22%2C%22min%22%3A-1%2C%22max%22%3A1%2C%22dropout%22%3A0%2C%22expr%22%3A%22-2%20*%20(Math.trunc(i%20%2F%208)%20%25%202)%20%2B%201%22%2C%22folder%22%3A%22open%22%7D%2C%22right%22%3A%7B%22name%22%3A%22R%22%2C%22matmul%22%3Afalse%2C%22h%22%3A24%2C%22w%22%3A32%2C%22init%22%3A%22expr%22%2C%22url%22%3A%22%22%2C%22min%22%3A-1%2C%22max%22%3A1%2C%22dropout%22%3A0%2C%22expr%22%3A%22-2%20*%20(Math.trunc(j%20%2F%208)%20%25%202)%20%2B%201%22%2C%22folder%22%3A%22open%22%7D%2C%22anim%22%3A%7B%22fuse%22%3A%22none%22%2C%22speed%22%3A48%2C%22hide%20inputs%22%3Afalse%2C%22alg%22%3A%22dotprod%20(row%20major)%22%2C%22spin%22%3A0%2C%22folder%22%3A%22open%22%7D%2C%22block%22%3A%7B%22i%20blocks%22%3A1%2C%22j%20blocks%22%3A1%2C%22k%20blocks%22%3A1%7D%2C%22layout%22%3A%7B%22scheme%22%3A%22blocks%22%2C%22gap%22%3A5%2C%22scatter%22%3A0%2C%22molecule%22%3A1%2C%22blast%22%3A0%2C%22polarity%22%3A%22negative%22%2C%22left%20placement%22%3A%22left%22%2C%22right%20placement%22%3A%22top%22%2C%22result%20placement%22%3A%22front%22%2C%22folder%22%3A%22open%22%7D%2C%22deco%22%3A%7B%22legends%22%3A6%2C%22shape%22%3Atrue%2C%22spotlight%22%3A2%2C%22row%20guides%22%3A1%2C%22flow%20guides%22%3A0.5%2C%22lens%20size%22%3A0.5%2C%22magnification%22%3A10%2C%22interior%20spotlight%22%3Afalse%2C%22axes%22%3Afalse%2C%22folder%22%3A%22closed%22%7D%2C%22viz%22%3A%7B%22sensitivity%22%3A%22semilocal%22%2C%22min%20size%22%3A0.196%2C%22min%20light%22%3A0.4%2C%22max%20light%22%3A0.6%2C%22elem%20scale%22%3A1%2C%22zero%20hue%22%3A0.77%2C%22hue%20gap%22%3A0.74%2C%22hue%20spread%22%3A0.04%2C%22folder%22%3A%22open%22%7D%2C%22diag%22%3A%7B%22url%22%3A%22%22%7D%2C%22cam%22%3A%7B%22x%22%3A-54.145594172414235%2C%22y%22%3A48.55110882721702%2C%22z%22%3A37.42031544768185%2C%22target%22%3A%7B%22x%22%3A0%2C%22y%22%3A0%2C%22z%22%3A0%7D%7D%2C%22folder%22%3A%22closed%22%2C%22compress%22%3Afalse%7D) - long click or control-click to inspect values): - -

        - -

        - -### 2b Matrix-vector products - -A matmul decomposed into matrix-vector products looks like a vertical plane (a product of the left argument with each column of the right argument) painting columns onto the result as it sweeps horizontally through the cube's interior ([open in mm](https://bhosmer.github.io/mm/index.html?params=%7B%22expr%22%3A%22L%20%40%20R%22%2C%22name%22%3A%22L%20%40%20R%22%2C%22epilog%22%3A%22none%22%2C%22left%22%3A%7B%22name%22%3A%22L%22%2C%22matmul%22%3Afalse%2C%22h%22%3A32%2C%22w%22%3A24%2C%22init%22%3A%22expr%22%2C%22url%22%3A%22%22%2C%22min%22%3A-1%2C%22max%22%3A1%2C%22dropout%22%3A0%2C%22expr%22%3A%22-2%20*%20(Math.trunc(i%20%2F%208)%20%25%202)%20%2B%201%22%2C%22folder%22%3A%22closed%22%7D%2C%22right%22%3A%7B%22name%22%3A%22R%22%2C%22matmul%22%3Afalse%2C%22h%22%3A24%2C%22w%22%3A32%2C%22init%22%3A%22expr%22%2C%22url%22%3A%22%22%2C%22min%22%3A-1%2C%22max%22%3A1%2C%22dropout%22%3A0%2C%22expr%22%3A%22-2%20*%20(Math.trunc(j%20%2F%208)%20%25%202)%20%2B%201%22%2C%22folder%22%3A%22closed%22%7D%2C%22anim%22%3A%7B%22fuse%22%3A%22none%22%2C%22speed%22%3A12%2C%22hide%20inputs%22%3Afalse%2C%22alg%22%3A%22mvprod%22%2C%22spin%22%3A0%2C%22folder%22%3A%22open%22%7D%2C%22block%22%3A%7B%22i%20blocks%22%3A1%2C%22j%20blocks%22%3A1%2C%22k%20blocks%22%3A1%7D%2C%22layout%22%3A%7B%22scheme%22%3A%22blocks%22%2C%22gap%22%3A5%2C%22scatter%22%3A0%2C%22molecule%22%3A1%2C%22blast%22%3A0%2C%22polarity%22%3A%22negative%22%2C%22left%20placement%22%3A%22left%22%2C%22right%20placement%22%3A%22top%22%2C%22result%20placement%22%3A%22front%22%2C%22folder%22%3A%22open%22%7D%2C%22deco%22%3A%7B%22legends%22%3A6%2C%22shape%22%3Atrue%2C%22spotlight%22%3A2%2C%22row%20guides%22%3A1%2C%22flow%20guides%22%3A0.5%2C%22lens%20size%22%3A0.5%2C%22magnification%22%3A10%2C%22interior%20spotlight%22%3Afalse%2C%22axes%22%3Afalse%2C%22folder%22%3A%22closed%22%7D%2C%22viz%22%3A%7B%22sensitivity%22%3A%22semilocal%22%2C%22min%20size%22%3A0.196%2C%22min%20light%22%3A0.4%2C%22max%20light%22%3A0.6%2C%22elem%20scale%22%3A1%2C%22zero%20hue%22%3A0.77%2C%22hue%20gap%22%3A0.74%2C%22hue%20spread%22%3A0.04%2C%22folder%22%3A%22open%22%7D%2C%22diag%22%3A%7B%22url%22%3A%22%22%7D%2C%22cam%22%3A%7B%22x%22%3A-54.145594172414235%2C%22y%22%3A48.55110882721702%2C%22z%22%3A37.42031544768185%2C%22target%22%3A%7B%22x%22%3A0%2C%22y%22%3A0%2C%22z%22%3A0%7D%7D%2C%22folder%22%3A%22closed%22%2C%22compress%22%3Afalse%7D)): - -

        - -

        - -Observing the intermediate values of a decomposition can be quite interesting, even in simple examples. - -For instance, note the prominent vertical patterns in the intermediate matrix-vector products when we use randomly initialized arguments- reflecting the fact that each intermediate is a column-scaled replica of the left argument ([open in mm](https://bhosmer.github.io/mm/index.html?params=%7B%22expr%22%3A%22L%20%40%20R%22%2C%22name%22%3A%22L%20%40%20R%22%2C%22epilog%22%3A%22none%22%2C%22left%22%3A%7B%22name%22%3A%22L%22%2C%22matmul%22%3Afalse%2C%22h%22%3A32%2C%22w%22%3A24%2C%22init%22%3A%22gaussian%22%2C%22url%22%3A%22%22%2C%22min%22%3A-1%2C%22max%22%3A1%2C%22dropout%22%3A0%2C%22expr%22%3A%22(-Math.trunc(i%20%2F%208)%20%25%202)%20%2B%20.5%22%2C%22folder%22%3A%22open%22%7D%2C%22right%22%3A%7B%22name%22%3A%22R%22%2C%22matmul%22%3Afalse%2C%22h%22%3A24%2C%22w%22%3A32%2C%22init%22%3A%22gaussian%22%2C%22url%22%3A%22%22%2C%22min%22%3A-1%2C%22max%22%3A1%2C%22dropout%22%3A0%2C%22expr%22%3A%22(-Math.trunc(j%20%2F%208)%20%25%202)%20%2B%20.5%22%2C%22folder%22%3A%22open%22%7D%2C%22anim%22%3A%7B%22fuse%22%3A%22none%22%2C%22speed%22%3A6%2C%22hide%20inputs%22%3Afalse%2C%22alg%22%3A%22mvprod%22%2C%22spin%22%3A0%2C%22folder%22%3A%22open%22%7D%2C%22block%22%3A%7B%22i%20blocks%22%3A1%2C%22j%20blocks%22%3A1%2C%22k%20blocks%22%3A1%7D%2C%22layout%22%3A%7B%22scheme%22%3A%22blocks%22%2C%22gap%22%3A5%2C%22scatter%22%3A0%2C%22molecule%22%3A1%2C%22blast%22%3A0%2C%22polarity%22%3A%22negative%22%2C%22left%20placement%22%3A%22left%22%2C%22right%20placement%22%3A%22top%22%2C%22result%20placement%22%3A%22front%22%2C%22folder%22%3A%22open%22%7D%2C%22deco%22%3A%7B%22legends%22%3A6%2C%22shape%22%3Atrue%2C%22spotlight%22%3A2%2C%22row%20guides%22%3A1%2C%22flow%20guides%22%3A0%2C%22lens%20size%22%3A0.5%2C%22magnification%22%3A10%2C%22interior%20spotlight%22%3Afalse%2C%22axes%22%3Afalse%2C%22folder%22%3A%22open%22%7D%2C%22viz%22%3A%7B%22sensitivity%22%3A%22local%22%2C%22min%20size%22%3A0.196%2C%22min%20light%22%3A0.4%2C%22max%20light%22%3A0.6%2C%22elem%20scale%22%3A1%2C%22zero%20hue%22%3A0.77%2C%22hue%20gap%22%3A0.74%2C%22hue%20spread%22%3A0.04%2C%22folder%22%3A%22open%22%7D%2C%22diag%22%3A%7B%22url%22%3A%22%22%7D%2C%22cam%22%3A%7B%22x%22%3A-54.14559417241423%2C%22y%22%3A48.55110882721702%2C%22z%22%3A37.42031544768186%2C%22target%22%3A%7B%22x%22%3A0%2C%22y%22%3A0%2C%22z%22%3A0%7D%7D%2C%22folder%22%3A%22closed%22%2C%22compress%22%3Afalse%7D)): - -

        - -

        - -### 2c Vector-matrix products - -A matmul decomposed into vector-matrix products looks like a horizontal plane painting rows onto the result as it descends through the cube's interior ([open in mm](https://bhosmer.github.io/mm/index.html?params=%7B%22expr%22%3A%22L%20%40%20R%22%2C%22name%22%3A%22L%20%40%20R%22%2C%22epilog%22%3A%22none%22%2C%22left%22%3A%7B%22name%22%3A%22L%22%2C%22matmul%22%3Afalse%2C%22h%22%3A32%2C%22w%22%3A24%2C%22init%22%3A%22expr%22%2C%22url%22%3A%22%22%2C%22min%22%3A-1%2C%22max%22%3A1%2C%22dropout%22%3A0%2C%22expr%22%3A%22-2%20*%20(Math.trunc(i%20%2F%208)%20%25%202)%20%2B%201%22%2C%22folder%22%3A%22closed%22%7D%2C%22right%22%3A%7B%22name%22%3A%22R%22%2C%22matmul%22%3Afalse%2C%22h%22%3A24%2C%22w%22%3A32%2C%22init%22%3A%22expr%22%2C%22url%22%3A%22%22%2C%22min%22%3A-1%2C%22max%22%3A1%2C%22dropout%22%3A0%2C%22expr%22%3A%22-2%20*%20(Math.trunc(j%20%2F%208)%20%25%202)%20%2B%201%22%2C%22folder%22%3A%22closed%22%7D%2C%22anim%22%3A%7B%22fuse%22%3A%22none%22%2C%22speed%22%3A12%2C%22hide%20inputs%22%3Afalse%2C%22alg%22%3A%22vmprod%22%2C%22spin%22%3A0%2C%22folder%22%3A%22open%22%7D%2C%22block%22%3A%7B%22i%20blocks%22%3A1%2C%22j%20blocks%22%3A1%2C%22k%20blocks%22%3A1%7D%2C%22layout%22%3A%7B%22scheme%22%3A%22blocks%22%2C%22gap%22%3A5%2C%22scatter%22%3A0%2C%22molecule%22%3A1%2C%22blast%22%3A0%2C%22polarity%22%3A%22negative%22%2C%22left%20placement%22%3A%22left%22%2C%22right%20placement%22%3A%22top%22%2C%22result%20placement%22%3A%22front%22%2C%22folder%22%3A%22open%22%7D%2C%22deco%22%3A%7B%22legends%22%3A6%2C%22shape%22%3Atrue%2C%22spotlight%22%3A2%2C%22row%20guides%22%3A1%2C%22flow%20guides%22%3A0.5%2C%22lens%20size%22%3A0.5%2C%22magnification%22%3A10%2C%22interior%20spotlight%22%3Afalse%2C%22axes%22%3Afalse%2C%22folder%22%3A%22closed%22%7D%2C%22viz%22%3A%7B%22sensitivity%22%3A%22semilocal%22%2C%22min%20size%22%3A0.196%2C%22min%20light%22%3A0.4%2C%22max%20light%22%3A0.6%2C%22elem%20scale%22%3A1%2C%22zero%20hue%22%3A0.77%2C%22hue%20gap%22%3A0.74%2C%22hue%20spread%22%3A0.04%2C%22folder%22%3A%22open%22%7D%2C%22diag%22%3A%7B%22url%22%3A%22%22%7D%2C%22cam%22%3A%7B%22x%22%3A-54.145594172414235%2C%22y%22%3A48.55110882721702%2C%22z%22%3A37.42031544768185%2C%22target%22%3A%7B%22x%22%3A0%2C%22y%22%3A0%2C%22z%22%3A0%7D%7D%2C%22folder%22%3A%22closed%22%2C%22compress%22%3Afalse%7D)): - -

        - -

        - - -Switching to randomly initialized arguments, we see patterns analogous to those we saw with matrix-vector products - only this time the patterns are horizontal, corresponding to the fact that each intermediate vector-matrix product is a row-scaled replica of the right argument. - -When thinking about how matmuls express the rank and structure of their arguments, it's useful to envision both of these patterns happening simultaneously in the computation ([open in mm](https://bhosmer.github.io/mm/index.html?params=%7B%22expr%22%3A%22L%20%40%20R%22%2C%22name%22%3A%22L%20%40%20R%22%2C%22epilog%22%3A%22none%22%2C%22left%22%3A%7B%22name%22%3A%22L%22%2C%22matmul%22%3Afalse%2C%22h%22%3A32%2C%22w%22%3A24%2C%22init%22%3A%22gaussian%22%2C%22url%22%3A%22%22%2C%22min%22%3A-1%2C%22max%22%3A1%2C%22dropout%22%3A0%2C%22expr%22%3A%22(-Math.trunc(i%20%2F%208)%20%25%202)%20%2B%20.5%22%2C%22folder%22%3A%22open%22%7D%2C%22right%22%3A%7B%22name%22%3A%22R%22%2C%22matmul%22%3Afalse%2C%22h%22%3A24%2C%22w%22%3A32%2C%22init%22%3A%22gaussian%22%2C%22url%22%3A%22%22%2C%22min%22%3A-1%2C%22max%22%3A1%2C%22dropout%22%3A0%2C%22expr%22%3A%22(-Math.trunc(j%20%2F%208)%20%25%202)%20%2B%20.5%22%2C%22folder%22%3A%22open%22%7D%2C%22anim%22%3A%7B%22fuse%22%3A%22none%22%2C%22speed%22%3A6%2C%22hide%20inputs%22%3Afalse%2C%22alg%22%3A%22vmprod%22%2C%22spin%22%3A0%2C%22folder%22%3A%22open%22%7D%2C%22block%22%3A%7B%22i%20blocks%22%3A1%2C%22j%20blocks%22%3A1%2C%22k%20blocks%22%3A1%7D%2C%22layout%22%3A%7B%22scheme%22%3A%22blocks%22%2C%22gap%22%3A5%2C%22scatter%22%3A0%2C%22molecule%22%3A1%2C%22blast%22%3A0%2C%22polarity%22%3A%22negative%22%2C%22left%20placement%22%3A%22left%22%2C%22right%20placement%22%3A%22top%22%2C%22result%20placement%22%3A%22front%22%2C%22folder%22%3A%22open%22%7D%2C%22deco%22%3A%7B%22legends%22%3A6%2C%22shape%22%3Atrue%2C%22spotlight%22%3A2%2C%22row%20guides%22%3A1%2C%22flow%20guides%22%3A0%2C%22lens%20size%22%3A0.5%2C%22magnification%22%3A10%2C%22interior%20spotlight%22%3Afalse%2C%22axes%22%3Afalse%2C%22folder%22%3A%22open%22%7D%2C%22viz%22%3A%7B%22sensitivity%22%3A%22local%22%2C%22min%20size%22%3A0.196%2C%22min%20light%22%3A0.4%2C%22max%20light%22%3A0.6%2C%22elem%20scale%22%3A1%2C%22zero%20hue%22%3A0.77%2C%22hue%20gap%22%3A0.74%2C%22hue%20spread%22%3A0.04%2C%22folder%22%3A%22open%22%7D%2C%22diag%22%3A%7B%22url%22%3A%22%22%7D%2C%22cam%22%3A%7B%22x%22%3A-54.14559417241423%2C%22y%22%3A48.55110882721702%2C%22z%22%3A37.42031544768186%2C%22target%22%3A%7B%22x%22%3A0%2C%22y%22%3A0%2C%22z%22%3A0%7D%7D%2C%22folder%22%3A%22closed%22%2C%22compress%22%3Afalse%7D)): - -

        - -

        - -Here's one more intuition builder using vector-matrix products, showing how the identity matrix functions exactly like a mirror set at a 45deg angle to both its counterargument and the result ([open in mm](https://bhosmer.github.io/mm/index.html?params=%7B%22expr%22%3A%22L%20%40%20R%22%2C%22name%22%3A%22L%20%40%20R%22%2C%22epilog%22%3A%22none%22%2C%22left%22%3A%7B%22name%22%3A%22L%22%2C%22matmul%22%3Afalse%2C%22h%22%3A24%2C%22w%22%3A24%2C%22init%22%3A%22eye%22%2C%22url%22%3A%22%22%2C%22min%22%3A-1%2C%22max%22%3A1%2C%22dropout%22%3A0%2C%22expr%22%3A%22(-Math.trunc(i%20%2F%208)%20%25%202)%20%2B%20.5%22%2C%22folder%22%3A%22open%22%7D%2C%22right%22%3A%7B%22name%22%3A%22R%22%2C%22matmul%22%3Afalse%2C%22h%22%3A24%2C%22w%22%3A32%2C%22init%22%3A%22row%20major%22%2C%22url%22%3A%22%22%2C%22min%22%3A-1%2C%22max%22%3A1%2C%22dropout%22%3A0%2C%22expr%22%3A%22(-Math.trunc(j%20%2F%208)%20%25%202)%20%2B%20.5%22%2C%22folder%22%3A%22open%22%7D%2C%22anim%22%3A%7B%22fuse%22%3A%22none%22%2C%22speed%22%3A12%2C%22hide%20inputs%22%3Afalse%2C%22alg%22%3A%22vmprod%22%2C%22spin%22%3A0%2C%22folder%22%3A%22open%22%7D%2C%22block%22%3A%7B%22i%20blocks%22%3A1%2C%22j%20blocks%22%3A1%2C%22k%20blocks%22%3A1%7D%2C%22layout%22%3A%7B%22scheme%22%3A%22blocks%22%2C%22gap%22%3A5%2C%22scatter%22%3A0%2C%22molecule%22%3A1%2C%22blast%22%3A0%2C%22polarity%22%3A%22negative%22%2C%22left%20placement%22%3A%22left%22%2C%22right%20placement%22%3A%22top%22%2C%22result%20placement%22%3A%22front%22%2C%22folder%22%3A%22open%22%7D%2C%22deco%22%3A%7B%22legends%22%3A6%2C%22shape%22%3Atrue%2C%22spotlight%22%3A2%2C%22row%20guides%22%3A1%2C%22flow%20guides%22%3A0%2C%22lens%20size%22%3A0.5%2C%22magnification%22%3A10%2C%22interior%20spotlight%22%3Afalse%2C%22axes%22%3Afalse%2C%22folder%22%3A%22open%22%7D%2C%22viz%22%3A%7B%22sensitivity%22%3A%22local%22%2C%22min%20size%22%3A0.196%2C%22min%20light%22%3A0.4%2C%22max%20light%22%3A0.6%2C%22elem%20scale%22%3A1%2C%22zero%20hue%22%3A0.77%2C%22hue%20gap%22%3A0.74%2C%22hue%20spread%22%3A0.04%2C%22folder%22%3A%22open%22%7D%2C%22diag%22%3A%7B%22url%22%3A%22%22%7D%2C%22cam%22%3A%7B%22x%22%3A-50.560896320538845%2C%22y%22%3A45.336792719337595%2C%22z%22%3A34.94291121097398%7D%2C%22folder%22%3A%22closed%22%2C%22compress%22%3Afalse%7D)): - -

        - -

        - -### 2d Summed outer products - -The third planar decomposition is along the `k` axis, computing the matmul result by a pointwise summation of vector outer products. Here we see the plane of outer products sweeping the cube "from back to front", accumulating into the result ([open in mm](https://bhosmer.github.io/mm/index.html?params=%7B%22expr%22%3A%22L%20%40%20R%22%2C%22name%22%3A%22L%20%40%20R%22%2C%22epilog%22%3A%22none%22%2C%22left%22%3A%7B%22name%22%3A%22L%22%2C%22matmul%22%3Afalse%2C%22h%22%3A32%2C%22w%22%3A24%2C%22init%22%3A%22expr%22%2C%22url%22%3A%22%22%2C%22min%22%3A-1%2C%22max%22%3A1%2C%22dropout%22%3A0%2C%22expr%22%3A%22-2%20*%20(Math.trunc(i%20%2F%208)%20%25%202)%20%2B%201%22%2C%22folder%22%3A%22closed%22%7D%2C%22right%22%3A%7B%22name%22%3A%22R%22%2C%22matmul%22%3Afalse%2C%22h%22%3A24%2C%22w%22%3A32%2C%22init%22%3A%22expr%22%2C%22url%22%3A%22%22%2C%22min%22%3A-1%2C%22max%22%3A1%2C%22dropout%22%3A0%2C%22expr%22%3A%22-2%20*%20(Math.trunc(j%20%2F%208)%20%25%202)%20%2B%201%22%2C%22folder%22%3A%22closed%22%7D%2C%22anim%22%3A%7B%22fuse%22%3A%22none%22%2C%22speed%22%3A12%2C%22hide%20inputs%22%3Afalse%2C%22alg%22%3A%22vvprod%22%2C%22spin%22%3A0%2C%22folder%22%3A%22open%22%7D%2C%22block%22%3A%7B%22i%20blocks%22%3A1%2C%22j%20blocks%22%3A1%2C%22k%20blocks%22%3A1%7D%2C%22layout%22%3A%7B%22scheme%22%3A%22blocks%22%2C%22gap%22%3A5%2C%22scatter%22%3A0%2C%22molecule%22%3A1%2C%22blast%22%3A0%2C%22polarity%22%3A%22negative%22%2C%22left%20placement%22%3A%22left%22%2C%22right%20placement%22%3A%22top%22%2C%22result%20placement%22%3A%22front%22%2C%22folder%22%3A%22open%22%7D%2C%22deco%22%3A%7B%22legends%22%3A6%2C%22shape%22%3Atrue%2C%22spotlight%22%3A2%2C%22row%20guides%22%3A1%2C%22flow%20guides%22%3A0.5%2C%22lens%20size%22%3A0.5%2C%22magnification%22%3A10%2C%22interior%20spotlight%22%3Afalse%2C%22axes%22%3Afalse%2C%22folder%22%3A%22closed%22%7D%2C%22viz%22%3A%7B%22sensitivity%22%3A%22semilocal%22%2C%22min%20size%22%3A0.196%2C%22min%20light%22%3A0.4%2C%22max%20light%22%3A0.6%2C%22elem%20scale%22%3A1%2C%22zero%20hue%22%3A0.77%2C%22hue%20gap%22%3A0.74%2C%22hue%20spread%22%3A0.04%2C%22folder%22%3A%22open%22%7D%2C%22diag%22%3A%7B%22url%22%3A%22%22%7D%2C%22cam%22%3A%7B%22x%22%3A-54.145594172414235%2C%22y%22%3A48.55110882721702%2C%22z%22%3A37.42031544768185%2C%22target%22%3A%7B%22x%22%3A0%2C%22y%22%3A0%2C%22z%22%3A0%7D%7D%2C%22folder%22%3A%22closed%22%2C%22compress%22%3Afalse%7D)): - -

        - -

        - -Using randomly initialized matrices with this decomposition, we can see not just values but _rank_ accumulate in the result, as each rank-1 outer product is added to it. - -Among other things this builds intuition for why "low-rank factorization" - i.e. approximating a matrix by constructing a matmul whose arguments are small in the depth dimension - works best when the matrix being approximated is low rank. [LoRA](https://arxiv.org/pdf/2106.09685.pdf) in a later section ([open in mm](https://bhosmer.github.io/mm/index.html?params=%7B%22expr%22%3A%22L%20%40%20R%22%2C%22name%22%3A%22L%20%40%20R%22%2C%22epilog%22%3A%22none%22%2C%22left%22%3A%7B%22name%22%3A%22L%22%2C%22matmul%22%3Afalse%2C%22h%22%3A32%2C%22w%22%3A24%2C%22init%22%3A%22gaussian%22%2C%22url%22%3A%22%22%2C%22min%22%3A-1%2C%22max%22%3A1%2C%22dropout%22%3A0%2C%22expr%22%3A%22(-Math.trunc(i%20%2F%208)%20%25%202)%20%2B%20.5%22%2C%22folder%22%3A%22closed%22%7D%2C%22right%22%3A%7B%22name%22%3A%22R%22%2C%22matmul%22%3Afalse%2C%22h%22%3A24%2C%22w%22%3A32%2C%22init%22%3A%22gaussian%22%2C%22url%22%3A%22%22%2C%22min%22%3A-1%2C%22max%22%3A1%2C%22dropout%22%3A0%2C%22expr%22%3A%22(-Math.trunc(j%20%2F%208)%20%25%202)%20%2B%20.5%22%2C%22folder%22%3A%22closed%22%7D%2C%22anim%22%3A%7B%22fuse%22%3A%22none%22%2C%22speed%22%3A6%2C%22hide%20inputs%22%3Afalse%2C%22alg%22%3A%22vvprod%22%2C%22spin%22%3A0%2C%22folder%22%3A%22open%22%7D%2C%22block%22%3A%7B%22i%20blocks%22%3A1%2C%22j%20blocks%22%3A1%2C%22k%20blocks%22%3A1%7D%2C%22layout%22%3A%7B%22scheme%22%3A%22blocks%22%2C%22gap%22%3A5%2C%22scatter%22%3A0%2C%22molecule%22%3A1%2C%22blast%22%3A0%2C%22polarity%22%3A%22negative%22%2C%22left%20placement%22%3A%22left%22%2C%22right%20placement%22%3A%22top%22%2C%22result%20placement%22%3A%22front%22%2C%22folder%22%3A%22open%22%7D%2C%22deco%22%3A%7B%22legends%22%3A6%2C%22shape%22%3Atrue%2C%22spotlight%22%3A2%2C%22row%20guides%22%3A1%2C%22flow%20guides%22%3A0%2C%22lens%20size%22%3A0.5%2C%22magnification%22%3A10%2C%22interior%20spotlight%22%3Afalse%2C%22axes%22%3Afalse%2C%22folder%22%3A%22open%22%7D%2C%22viz%22%3A%7B%22sensitivity%22%3A%22local%22%2C%22min%20size%22%3A0.196%2C%22min%20light%22%3A0.4%2C%22max%20light%22%3A0.6%2C%22elem%20scale%22%3A1%2C%22zero%20hue%22%3A0.77%2C%22hue%20gap%22%3A0.74%2C%22hue%20spread%22%3A0.04%2C%22folder%22%3A%22open%22%7D%2C%22diag%22%3A%7B%22url%22%3A%22%22%7D%2C%22cam%22%3A%7B%22x%22%3A-54.14559417241423%2C%22y%22%3A48.55110882721702%2C%22z%22%3A37.42031544768186%2C%22target%22%3A%7B%22x%22%3A0%2C%22y%22%3A0%2C%22z%22%3A0%7D%7D%2C%22folder%22%3A%22closed%22%2C%22compress%22%3Afalse%7D)): - -

        - -

        - -## 3 Warmup - expressions - -How can we extend this visualization approach to _compositions_ of matmuls? Our examples so far have all visualized a single matmul `L @ R` of some matrices `L` and `R` - what about when `L` and/or `R` are themselves matmuls, and so on transitively? - -It turns out we can extend the approach nicely to compound expressions. The key rules are simple: the subexpression (child) matmul is another cube, subject to the same layout constraints as the parent, and the result face of the child is _simultaneously_ the corresponding argument face of the parent, like a covalently shared electron. - -Within these constraints, we're free to arrange the faces of a child matmul however we like. Here we use the tool's default scheme, which generates alternating convex and concave cubes - this layout works well in practice to maximize use of space and minimize occlusion. (Layouts are completely customizable, however - see the [reference](https://bhosmer.github.io/mm/ref.html) for details.) - -In this section we'll visualize some of the key building blocks we find in ML models, to gain fluency in the visual idiom and to see what intuitions even simple examples can give us. - - -### 3a Left-associative expressions - -We'll look at two expressions of the form `(A @ B) @ C`, each with its own distinctive shape and character. (Note: mm adheres to the convention that matrix multiplication is left-associative and writes this simply as `A @ B @ C`.) - -First we'll give `A @ B @ C` the characteristic FFN shape, in which the "hidden dimension" is wider than the "input" or "output" dimensions. (Concretely in the context of this example, this means that the width of `B` is greater than the widths of `A` or `C`.) - -As in the single matmul examples, the floating arrows point towards the result matrix, blue vane coming from the left argument and red vane from right argument ([open in 
mm](https://bhosmer.github.io/mm/index.html?0=A%20%40%20B%20%40%20C&1=A%20%40%20B%20%40%20C&2=none&12=closed&64=true&3.1=A%20%40%20B&3.4=true&3.5=32&3.6=32&3.7=row%20major&3.8=&3.9=-1&3.10=1&3.11=0&3.12=open&3.2=none&13.1=A&13.4=false&13.5=64&13.6=32&13.7=expr&13.8=&13.0=-2%20*%20(Math.trunc(i%20%2F%208)%20%25%202)%20%2B%201&13.9=-1&13.10=1&13.11=0&13.12=open&14.1=B&14.4=false&14.5=32&14.6=96&14.7=row%20major&14.8=&14.0=-2%20*%20(Math.trunc(j%20%2F%208)%20%25%202)%20%2B%201&14.9=-1&14.10=1&14.11=0&14.12=open&15.16=inherit&17.18=positive&17.19=left&17.20=bottom&17.21=back&22.23=1&24.1=C&24.4=false&24.5=96&24.6=32&24.7=col%20major&24.8=&24.9=-1&24.10=1&24.11=0&24.0=-2%20*%20(Math.trunc(i%20%2F%208)%20%25%202)%20%2B%201&24.12=open&25.26=none&25.27=12&25.28=false&25.16=none&25.29=0&25.12=closed&30.31=1&30.32=1&30.23=1&33.34=blocks&33.35=5&33.36=0&33.37=1&33.38=0&33.18=negative&33.19=left&33.20=top&33.21=front&33.12=closed&39.40=6&39.41=true&39.42=2&39.43=1&39.44=0.5&39.45=0.5&39.46=10&39.47=false&39.48=false&39.12=open&49.50=semilocal&49.51=0.2&49.52=0.4&49.53=0.6&49.54=1.25&49.55=0.77&49.56=0.74&49.57=0.04&49.12=open&58.8=&59.60=-102.42301073851515&59.61=96.27580041479706&59.62=112.34410815468306&63.60=-4.617417891034972&63.61=-3.695553245058398&63.62=-1.8863985145585351&expr=0&name=1&epilog=2&left=3&matmul=4&h=5&w=6&init=7&url=8&min=9&max=10&dropout=11&folder=12&left.left=13&left.right=14&left.anim=15&alg=16&left.layout=17&polarity=18&left%20placement=19&right%20placement=20&result%20placement=21&left.block=22&k%20blocks=23&right=24&anim=25&fuse=26&speed=27&hide%20inputs=28&spin=29&block=30&i%20blocks=31&j%20blocks=32&layout=33&scheme=34&gap=35&scatter=36&molecule=37&blast=38&deco=39&legends=40&shape=41&spotlight=42&row%20guides=43&flow%20guides=44&lens%20size=45&magnification=46&interior%20spotlight=47&axes=48&viz=49&sensitivity=50&min%20size=51&min%20light=52&max%20light=53&elem%20scale=54&zero%20hue=55&hue%20gap=56&hue%20spread=57&diag=58&cam=59&x=60&y=61&z=62&cam.target=63&compress=64)): - - -![As in the single matmul examples, the floating arrows point towards the result matrix, blue vane coming from the left argument and red vane from right argument](/assets/images/inside-the-matrix/la2still.jpg){:style="width:100%"} - - - -Next we'll visualize `A @ B @ C` with the width of `B` _narrower_ than that of `A` or `C`, giving it a bottleneck or "autoencoder" shape ([open in 
mm](https://bhosmer.github.io/mm/index.html?0=A%20%40%20B%20%40%20C&1=A%20%40%20B%20%40%20C&2=none&12=closed&64=true&3.1=A%20%40%20B&3.4=true&3.5=32&3.6=32&3.7=row%20major&3.8=&3.9=-1&3.10=1&3.11=0&3.12=open&3.2=none&13.1=A&13.4=false&13.5=64&13.6=96&13.7=expr&13.8=&13.0=-2%20*%20(Math.trunc(i%20%2F%208)%20%25%202)%20%2B%201&13.9=-1&13.10=1&13.11=0&13.12=open&14.1=B&14.4=false&14.5=96&14.6=32&14.7=row%20major&14.8=&14.0=-2%20*%20(Math.trunc(j%20%2F%208)%20%25%202)%20%2B%201&14.9=-1&14.10=1&14.11=0&14.12=open&15.16=inherit&17.18=positive&17.19=left&17.20=bottom&17.21=back&22.23=1&24.1=C&24.4=false&24.5=32&24.6=96&24.7=col%20major&24.8=&24.9=-1&24.10=1&24.11=0&24.0=-2%20*%20(Math.trunc(i%20%2F%208)%20%25%202)%20%2B%201&24.12=open&25.26=none&25.27=12&25.28=false&25.16=none&25.29=0&25.12=closed&30.31=1&30.32=1&30.23=1&33.34=blocks&33.35=5&33.36=0&33.37=1&33.38=0&33.18=negative&33.19=left&33.20=top&33.21=front&33.12=closed&39.40=6&39.41=true&39.42=2&39.43=1&39.44=0.5&39.45=0.5&39.46=10&39.47=false&39.48=false&39.12=open&49.50=semilocal&49.51=0.2&49.52=0.4&49.53=0.6&49.54=1.25&49.55=0.77&49.56=0.74&49.57=0.04&49.12=open&58.8=&59.60=-125.71162036288077&59.61=101.84279252909485&59.62=122.50425255743914&63.60=-14.817097084822203&63.61=-9.723209466639396&63.62=-5.4699873376955646&expr=0&name=1&epilog=2&left=3&matmul=4&h=5&w=6&init=7&url=8&min=9&max=10&dropout=11&folder=12&left.left=13&left.right=14&left.anim=15&alg=16&left.layout=17&polarity=18&left%20placement=19&right%20placement=20&result%20placement=21&left.block=22&k%20blocks=23&right=24&anim=25&fuse=26&speed=27&hide%20inputs=28&spin=29&block=30&i%20blocks=31&j%20blocks=32&layout=33&scheme=34&gap=35&scatter=36&molecule=37&blast=38&deco=39&legends=40&shape=41&spotlight=42&row%20guides=43&flow%20guides=44&lens%20size=45&magnification=46&interior%20spotlight=47&axes=48&viz=49&sensitivity=50&min%20size=51&min%20light=52&max%20light=53&elem%20scale=54&zero%20hue=55&hue%20gap=56&hue%20spread=57&diag=58&cam=59&x=60&y=61&z=62&cam.target=63&compress=64)): - - -![visualize A @ B @ C with the width of B narrower than that of A or C](/assets/images/inside-the-matrix/lacontract.jpg){:style="width:100%"} - - - -This pattern of alternating convex and concave blocks extends to chains of arbitrary length: for example this multilayer bottleneck ([open in 
mm](https://bhosmer.github.io/mm/index.html?0=A%20%40%20B%20%40%20C%20%40%20D%20%40%20E&1=A%20%40%20B%20%40%20C%20%40%20D%20%40%20E&2=none&23=closed&63=true&3.2=none&4.5=inherit&6.7=1&8.9=positive&8.10=left&8.11=bottom&8.12=back&13.0=A%20%40%20B%20%40%20C%20%40%20D%20%40%20E&13.1=A%20%40%20B%20%40%20C&13.2=none&14.1=A%20%40%20B&14.15=true&14.16=32&14.17=32&14.18=row%20major&14.19=&14.20=-1&14.21=1&14.22=0&14.23=open&14.2=none&24.1=A&24.15=false&24.16=64&24.17=96&24.18=expr&24.19=&24.0=-2%20*%20(Math.trunc(i%20%2F%208)%20%25%202)%20%2B%201&24.20=-1&24.21=1&24.22=0&24.23=open&25.1=B&25.15=false&25.16=96&25.17=64&25.18=row%20major&25.19=&25.0=-2%20*%20(Math.trunc(j%20%2F%208)%20%25%202)%20%2B%201&25.20=-1&25.21=1&25.22=0&25.23=open&26.5=inherit&27.9=positive&27.10=left&27.11=bottom&27.12=back&28.7=1&29.1=C&29.15=false&29.16=64&29.17=32&29.18=col%20major&29.19=&29.20=-1&29.21=1&29.22=0&29.0=-2%20*%20(Math.trunc(i%20%2F%208)%20%25%202)%20%2B%201&29.23=open&30.31=none&30.32=12&30.33=false&30.5=none&30.34=0&30.23=closed&35.36=1&35.7=1&37.9=negative&37.10=left&37.11=top&37.12=front&38.39=6&38.40=true&38.41=2&38.42=1&38.43=0.5&38.44=0.5&38.45=10&38.46=false&38.47=false&38.23=open&48.49=semilocal&48.50=0.2&48.51=0.4&48.52=0.6&48.53=1.25&48.54=0.77&48.55=0.74&48.56=0.04&48.23=open&57.19=&58.59=-125.71162036288077&58.60=101.84279252909485&58.61=122.50425255743914&62.59=-14.817097084822203&62.60=-9.723209466639396&62.61=-5.4699873376955646&13.23=open&13.63=true&13.15=true&64.1=D&64.15=false&64.16=32&64.17=64&64.18=col%20major&64.19=&64.20=-1&64.21=1&64.22=0&64.0=&64.23=open&3.1=A%20%40%20B%20%40%20C%20%40%20D&3.15=true&65.1=E&65.15=false&65.16=64&65.17=96&65.18=col%20major&65.19=&65.20=-1&65.21=1&65.22=0&65.0=&66.31=none&66.32=12&66.33=false&66.5=none&66.34=0&66.23=closed&67.36=1&67.68=1&67.7=1&69.70=blocks&69.71=5&69.72=0&69.73=1&69.74=0&69.9=negative&69.10=left&69.11=top&69.12=front&69.23=closed&75.39=5.28&75.40=true&75.41=2&75.42=1&75.43=0.5&75.44=0.5&75.45=10&75.46=false&75.47=false&75.23=open&76.49=semilocal&76.50=0.2&76.51=0.4&76.52=0.6&76.53=1.25&76.54=0.77&76.55=0.74&76.56=0.04&76.23=open&77.19=&78.59=-163.23429720087873&78.60=132.20892347209139&78.61=159.04014894666057&79.59=-14.817097084822203&79.60=-9.723209466639396&79.61=-5.4699873376955646&expr=0&name=1&epilog=2&left=3&left.anim=4&alg=5&left.block=6&k%20blocks=7&left.layout=8&polarity=9&left%20placement=10&right%20placement=11&result%20placement=12&left.left=13&left.left.left=14&matmul=15&h=16&w=17&init=18&url=19&min=20&max=21&dropout=22&folder=23&left.left.left.left=24&left.left.left.right=25&left.left.left.anim=26&left.left.left.layout=27&left.left.left.block=28&left.left.right=29&left.left.anim=30&fuse=31&speed=32&hide%20inputs=33&spin=34&left.left.block=35&i%20blocks=36&left.left.layout=37&left.left.deco=38&legends=39&shape=40&spotlight=41&row%20guides=42&flow%20guides=43&lens%20size=44&magnification=45&interior%20spotlight=46&axes=47&left.left.viz=48&sensitivity=49&min%20size=50&min%20light=51&max%20light=52&elem%20scale=53&zero%20hue=54&hue%20gap=55&hue%20spread=56&left.left.diag=57&left.left.cam=58&x=59&y=60&z=61&left.left.cam.target=62&compress=63&left.right=64&right=65&anim=66&block=67&j%20blocks=68&layout=69&scheme=70&gap=71&scatter=72&molecule=73&blast=74&deco=75&viz=76&diag=77&cam=78&cam.target=79)): - - -![pattern of alternating convex and concave blocks extends to chains of arbitrary length](/assets/images/inside-the-matrix/nlayerbottleneck.jpg){:style="width:100%"} - - - - -### 3b Right associative expressions - -Next we'll 
visualize a right-associative expression `A @ (B @ C)`. - -In the same way left-associative expressions extend horizontally - sprouting from the left argument of the root expression, so to speak - right-associative chains extend vertically, sprouting from the root's right argument. - -One sometimes sees an MLP formulated right-associatively, i.e. with columnar input on the right and weight layers running right to left. Using the matrices from the 2-layer FFN example pictured above - suitably transposed - here's what that looks like, with `C` now playing the role of the input, `B` the first layer and `A` the second layer ([open in mm](https://bhosmer.github.io/mm/index.html?0=A%20%40%20(B%20%40%20C)&1=A%20%40%20(B%20%40%20C)&2=none&12=closed&64=true&3.1=A&3.4=false&3.5=32&3.6=96&3.7=row%20major&3.8=&3.9=-1&3.10=1&3.11=0&3.0=-2%20*%20(Math.trunc(j%20%2F%208)%20%25%202)%20%2B%201&3.12=open&13.1=B%20%40%20C&13.4=true&13.5=32&13.6=32&13.7=col%20major&13.8=&13.9=-1&13.10=1&13.11=0&13.2=none&14.15=inherit&16.17=1&18.19=positive&18.20=right&18.21=top&18.22=back&23.1=B&23.4=false&23.5=96&23.6=32&23.7=col%20major&23.8=&23.0=-2%20*%20(Math.trunc(j%20%2F%208)%20%25%202)%20%2B%201&23.9=-1&23.10=1&23.11=0&23.12=open&24.1=C&24.4=false&24.5=32&24.6=64&24.7=expr&24.8=&24.0=-2%20*%20(Math.trunc(j%20%2F%208)%20%25%202)%20%2B%201&24.9=-1&24.10=1&24.11=0&24.12=open&13.12=open&25.26=none&25.27=12&25.28=false&25.15=none&25.29=0&25.12=closed&30.31=1&30.32=1&30.17=1&33.34=blocks&33.35=5&33.36=0&33.37=1&33.38=0&33.19=negative&33.20=left&33.21=top&33.22=front&33.12=closed&39.40=6&39.41=true&39.42=2&39.43=1&39.44=0.5&39.45=0.5&39.46=10&39.47=false&39.48=false&39.12=closed&49.50=semilocal&49.51=0.2&49.52=0.4&49.53=0.6&49.54=1.25&49.55=0.77&49.56=0.74&49.57=0.04&49.12=closed&58.8=&58.12=open&59.60=-105.78213185291946&59.61=96.67420268229331&59.62=113.6419504179439&63.60=-4.617417891034972&63.61=-3.695553245058398&63.62=-1.8863985145585351&expr=0&name=1&epilog=2&left=3&matmul=4&h=5&w=6&init=7&url=8&min=9&max=10&dropout=11&folder=12&right=13&right.anim=14&alg=15&right.block=16&k%20blocks=17&right.layout=18&polarity=19&left%20placement=20&right%20placement=21&result%20placement=22&right.left=23&right.right=24&anim=25&fuse=26&speed=27&hide%20inputs=28&spin=29&block=30&i%20blocks=31&j%20blocks=32&layout=33&scheme=34&gap=35&scatter=36&molecule=37&blast=38&deco=39&legends=40&shape=41&spotlight=42&row%20guides=43&flow%20guides=44&lens%20size=45&magnification=46&interior%20spotlight=47&axes=48&viz=49&sensitivity=50&min%20size=51&min%20light=52&max%20light=53&elem%20scale=54&zero%20hue=55&hue%20gap=56&hue%20spread=57&diag=58&cam=59&x=60&y=61&z=62&cam.target=63&compress=64)): - - -![an MLP formulated right-associatively](/assets/images/inside-the-matrix/raffn.jpg){:style="width:100%"} - - - -Aside: in addition to the color of the arrow vanes (blue for left, red for right), a second visual cue for distinguishing left and right arguments is their _orientation_: the rows of the left argument are coplanar with those of the result - they stack along the same axis (`i`). Both cues tell us for example that `B` is the left argument to `(B @ C)` above. - - -### 3c Binary expressions - -For a visualization tool to be useful beyond simple didactic examples, visualizations need to remain legible as expressions get more complicated. A key structural component in real-world use cases is binary expressions - matmuls with subexpressions on both the left and right. 
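As a quick numeric aside (a hedged sketch with made-up square shapes, not the matrices used in the visualizations above), the three expression shapes discussed in this section all compute the same product - matmul is associative, so left-associative chains, right-associative chains and binary splits differ only in how the work is grouped:

```
import torch

# Sketch only: the groupings from 3a, 3b and 3c all yield the same matrix,
# up to floating-point rounding - only the evaluation order differs.
A, B, C, D = (torch.randn(64, 64) for _ in range(4))

left   = ((A @ B) @ C) @ D        # left-associative chain (3a)
right  = A @ (B @ (C @ D))        # right-associative chain (3b)
binary = (A @ B) @ (C @ D)        # subexpressions on both sides (3c)

print(torch.allclose(left, right, rtol=1e-3))
print(torch.allclose(left, binary, rtol=1e-3))
```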
- -Here we'll visualize the simplest such expression shape, `(A @ B) @ (C @ D)` ([open in mm](https://bhosmer.github.io/mm/index.html?0=A%20%40%20B%20%40%20(C%20%40%20D)&1=A%20%40%20B%20%40%20(C%20%40%20D)&2=none&12=closed&69=true&3.1=A%20%40%20B&3.4=true&3.5=32&3.6=32&3.7=row%20major&3.8=&3.9=-1&3.10=1&3.11=0&3.12=open&3.2=none&13.1=A&13.4=false&13.5=64&13.6=64&13.7=expr&13.8=&13.0=-2%20*%20(Math.trunc(i%20%2F%208)%20%25%202)%20%2B%201&13.9=-1&13.10=1&13.11=0&13.12=closed&14.1=B&14.4=false&14.5=64&14.6=64&14.7=row%20major&14.8=&14.0=-2%20*%20(Math.trunc(j%20%2F%208)%20%25%202)%20%2B%201&14.9=-1&14.10=1&14.11=0&14.12=closed&15.16=inherit&17.18=positive&17.19=left&17.20=bottom&17.21=back&22.23=1&24.1=C%20%40%20D&24.4=true&24.5=32&24.6=32&24.7=col%20major&24.8=&24.9=-1&24.10=1&24.11=0&24.2=none&25.16=inherit&26.23=1&27.18=positive&27.19=right&27.20=top&27.21=back&28.1=C&28.4=false&28.5=64&28.6=64&28.7=col%20major&28.8=&28.9=-1&28.10=1&28.11=0&28.0=-2%20*%20(Math.trunc(i%20%2F%208)%20%25%202)%20%2B%201&28.12=open&29.1=D&29.4=false&29.5=64&29.6=64&29.7=expr&29.8=&29.9=-1&29.10=1&29.11=0&29.0=-2%20*%20(Math.trunc(j%20%2F%208)%20%25%202)%20%2B%201&29.12=open&30.31=none&30.32=12&30.33=false&30.16=none&30.34=0&30.12=closed&35.36=1&35.37=1&35.23=1&38.39=blocks&38.40=5&38.41=0&38.42=1&38.43=0&38.18=negative&38.19=left&38.20=top&38.21=front&38.12=closed&44.45=6&44.46=true&44.47=2&44.48=1&44.49=0.5&44.50=0.5&44.51=10&44.52=false&44.53=false&44.12=closed&54.55=semilocal&54.56=0.4&54.57=0.4&54.58=0.6&54.59=1.5&54.60=0.77&54.61=0.74&54.62=0.04&54.12=open&63.8=&64.65=-149.45958189074523&64.66=140.76437147298853&64.67=162.13832534246401&68.65=-4.044017278625395&68.66=-2.123834827920271&68.67=-2.551083969824457&expr=0&name=1&epilog=2&left=3&matmul=4&h=5&w=6&init=7&url=8&min=9&max=10&dropout=11&folder=12&left.left=13&left.right=14&left.anim=15&alg=16&left.layout=17&polarity=18&left%20placement=19&right%20placement=20&result%20placement=21&left.block=22&k%20blocks=23&right=24&right.anim=25&right.block=26&right.layout=27&right.left=28&right.right=29&anim=30&fuse=31&speed=32&hide%20inputs=33&spin=34&block=35&i%20blocks=36&j%20blocks=37&layout=38&scheme=39&gap=40&scatter=41&molecule=42&blast=43&deco=44&legends=45&shape=46&spotlight=47&row%20guides=48&flow%20guides=49&lens%20size=50&magnification=51&interior%20spotlight=52&axes=53&viz=54&sensitivity=55&min%20size=56&min%20light=57&max%20light=58&elem%20scale=59&zero%20hue=60&hue%20gap=61&hue%20spread=62&diag=63&cam=64&x=65&y=66&z=67&cam.target=68&compress=69)): - -![binary expressions - matmuls with subexpressions on both the left and right](/assets/images/inside-the-matrix/binary4.jpg){:style="width:100%"} - - - - -### 3d Quick aside: partitioning and parallelism - -A full presentation of this topic is out of scope for this note, though we'll see it in action later in the context of attention heads. But as a warmup, two quick examples should give a sense of how this style of visualization makes reasoning about parallelizing compound expressions very intuitive, via the simple geometry of partitioning. - -In the first example we'll apply the canonical "data parallel" partitioning to the left-associative multilayer bottleneck example above. 
We partition along `i`, segmenting the initial left argument ("batch") and all intermediate results ("activations"), but none of the subsequent arguments ("weights") - the geometry making it obvious which participants in the expression are segmented and which remain whole ([open in mm](https://bhosmer.github.io/mm/index.html?0=A%20%40%20B%20%40%20C%20%40%20D%20%40%20E&1=A%20%40%20B%20%40%20C%20%40%20D%20%40%20E&2=none&23=closed&63=true&3.1=A%20%40%20B%20%40%20C%20%40%20D&3.4=true&3.5=32&3.6=32&3.7=row%20major&3.8=&3.9=-1&3.10=1&3.11=0&3.2=none&12.13=inherit&14.15=1&16.17=positive&16.18=left&16.19=bottom&16.20=back&21.0=A%20%40%20B%20%40%20C%20%40%20D%20%40%20E&21.1=A%20%40%20B%20%40%20C&21.2=none&22.1=A%20%40%20B&22.4=true&22.5=32&22.6=32&22.7=row%20major&22.8=&22.9=-1&22.10=1&22.11=0&22.23=open&22.2=none&24.1=A&24.4=false&24.5=64&24.6=96&24.7=expr&24.8=&24.0=-2%20*%20(Math.trunc(i%20%2F%208)%20%25%202)%20%2B%201&24.9=-1&24.10=1&24.11=0&24.23=open&25.1=B&25.4=false&25.5=96&25.6=64&25.7=row%20major&25.8=&25.0=-2%20*%20(Math.trunc(j%20%2F%208)%20%25%202)%20%2B%201&25.9=-1&25.10=1&25.11=0&25.23=open&26.13=inherit&27.17=positive&27.18=left&27.19=bottom&27.20=back&28.15=1&29.1=C&29.4=false&29.5=64&29.6=32&29.7=col%20major&29.8=&29.9=-1&29.10=1&29.11=0&29.0=-2%20*%20(Math.trunc(i%20%2F%208)%20%25%202)%20%2B%201&29.23=open&30.31=none&30.32=12&30.33=false&30.13=none&30.34=0&30.23=closed&35.36=1&35.15=1&37.17=negative&37.18=left&37.19=top&37.20=front&38.39=6&38.40=true&38.41=2&38.42=1&38.43=0.5&38.44=0.5&38.45=10&38.46=false&38.47=false&38.23=open&48.49=semilocal&48.50=0.2&48.51=0.4&48.52=0.6&48.53=1.25&48.54=0.77&48.55=0.74&48.56=0.04&48.23=open&57.8=&58.59=-125.71162036288077&58.60=101.84279252909485&58.61=122.50425255743914&62.59=-14.817097084822203&62.60=-9.723209466639396&62.61=-5.4699873376955646&21.23=open&21.63=true&21.4=true&64.1=D&64.4=false&64.5=32&64.6=64&64.7=col%20major&64.8=&64.9=-1&64.10=1&64.11=0&64.0=&64.23=open&65.1=E&65.4=false&65.5=64&65.6=96&65.7=col%20major&65.8=&65.9=-1&65.10=1&65.11=0&65.0=&66.31=none&66.32=12&66.33=false&66.13=none&66.34=0&66.23=closed&67.36=8&67.68=1&67.15=1&67.23=open&69.70=blocks&69.71=5&69.72=0&69.73=1&69.74=0&69.17=negative&69.18=left&69.19=top&69.20=front&69.23=closed&75.39=5.28&75.40=true&75.41=2&75.42=1&75.43=0.5&75.44=0.5&75.45=10&75.46=false&75.47=false&75.23=closed&76.49=semilocal&76.50=0.3&76.51=0.4&76.52=0.6&76.53=1.5&76.54=0.77&76.55=0.74&76.56=0.04&76.23=closed&77.8=&78.59=-174.76129648411032&78.60=141.54502619212317&78.61=170.2709730709386&79.59=-14.817097084822203&79.60=-9.723209466639396&79.61=-5.4699873376955646&expr=0&name=1&epilog=2&left=3&matmul=4&h=5&w=6&init=7&url=8&min=9&max=10&dropout=11&left.anim=12&alg=13&left.block=14&k%20blocks=15&left.layout=16&polarity=17&left%20placement=18&right%20placement=19&result%20placement=20&left.left=21&left.left.left=22&folder=23&left.left.left.left=24&left.left.left.right=25&left.left.left.anim=26&left.left.left.layout=27&left.left.left.block=28&left.left.right=29&left.left.anim=30&fuse=31&speed=32&hide%20inputs=33&spin=34&left.left.block=35&i%20blocks=36&left.left.layout=37&left.left.deco=38&legends=39&shape=40&spotlight=41&row%20guides=42&flow%20guides=43&lens%20size=44&magnification=45&interior%20spotlight=46&axes=47&left.left.viz=48&sensitivity=49&min%20size=50&min%20light=51&max%20light=52&elem%20scale=53&zero%20hue=54&hue%20gap=55&hue%20spread=56&left.left.diag=57&left.left.cam=58&x=59&y=60&z=61&left.left.cam.target=62&compress=63&left.right=64&right=65&anim=66&block=67&j%20blocks=68&layout=
69&scheme=70&gap=71&scatter=72&molecule=73&blast=74&deco=75&viz=76&diag=77&cam=78&cam.target=79)): - - -![the canonical "data parallel" partitioning to the left-associative multilayer bottleneck example](/assets/images/inside-the-matrix/bottleneck_part.jpg){:style="width:100%"} - - - -The second example would (for me, anyway) be much harder to build intuition about without clear geometry to support it: it shows how a binary expression can be parallelized by partitioning the left subexpression along its `j` axis, the right subexpression along its `i` axis, and the parent expression along its `k` axis ([open in mm](https://bhosmer.github.io/mm/index.html?0=A%20%40%20B%20%40%20(C%20%40%20D)&1=A%20%40%20B%20%40%20(C%20%40%20D)&2=none&12=closed&69=true&3.1=A%20%40%20B&3.4=true&3.5=32&3.6=32&3.7=row%20major&3.8=&3.9=-1&3.10=1&3.11=0&3.12=open&3.2=none&13.1=A&13.4=false&13.5=64&13.6=64&13.7=expr&13.8=&13.0=-2%20*%20(Math.trunc(i%20%2F%208)%20%25%202)%20%2B%201&13.9=-1&13.10=1&13.11=0&13.12=closed&14.1=B&14.4=false&14.5=64&14.6=64&14.7=row%20major&14.8=&14.0=-2%20*%20(Math.trunc(j%20%2F%208)%20%25%202)%20%2B%201&14.9=-1&14.10=1&14.11=0&14.12=closed&15.16=inherit&17.18=positive&17.19=left&17.20=bottom&17.21=back&22.23=1&24.1=C%20%40%20D&24.4=true&24.5=32&24.6=32&24.7=col%20major&24.8=&24.9=-1&24.10=1&24.11=0&24.2=none&25.16=inherit&26.23=1&27.18=positive&27.19=right&27.20=top&27.21=back&28.1=C&28.4=false&28.5=64&28.6=64&28.7=col%20major&28.8=&28.9=-1&28.10=1&28.11=0&28.0=-2%20*%20(Math.trunc(i%20%2F%208)%20%25%202)%20%2B%201&28.12=open&29.1=D&29.4=false&29.5=64&29.6=64&29.7=expr&29.8=&29.9=-1&29.10=1&29.11=0&29.0=-2%20*%20(Math.trunc(j%20%2F%208)%20%25%202)%20%2B%201&29.12=open&30.31=none&30.32=12&30.33=false&30.16=none&30.34=0&30.12=closed&35.36=1&35.37=1&35.23=8&35.12=open&38.39=blocks&38.40=5&38.41=0&38.42=1&38.43=0&38.18=negative&38.19=left&38.20=top&38.21=front&38.12=closed&44.45=6&44.46=true&44.47=2&44.48=1&44.49=0.5&44.50=0.5&44.51=10&44.52=false&44.53=false&44.12=closed&54.55=semilocal&54.56=0.4&54.57=0.4&54.58=0.6&54.59=1.5&54.60=0.77&54.61=0.74&54.62=0.04&54.12=open&63.8=&64.65=-163.0431410622342&64.66=153.55767080483412&64.67=176.87418575632128&68.65=-4.044017278625395&68.66=-2.123834827920271&68.67=-2.551083969824457&expr=0&name=1&epilog=2&left=3&matmul=4&h=5&w=6&init=7&url=8&min=9&max=10&dropout=11&folder=12&left.left=13&left.right=14&left.anim=15&alg=16&left.layout=17&polarity=18&left%20placement=19&right%20placement=20&result%20placement=21&left.block=22&k%20blocks=23&right=24&right.anim=25&right.block=26&right.layout=27&right.left=28&right.right=29&anim=30&fuse=31&speed=32&hide%20inputs=33&spin=34&block=35&i%20blocks=36&j%20blocks=37&layout=38&scheme=39&gap=40&scatter=41&molecule=42&blast=43&deco=44&legends=45&shape=46&spotlight=47&row%20guides=48&flow%20guides=49&lens%20size=50&magnification=51&interior%20spotlight=52&axes=53&viz=54&sensitivity=55&min%20size=56&min%20light=57&max%20light=58&elem%20scale=59&zero%20hue=60&hue%20gap=61&hue%20spread=62&diag=63&cam=64&x=65&y=66&z=67&cam.target=68&compress=69)): - - -![a binary expression can be parallelized by partitioning the left subexpression along its j axis, the right subexpression along its i axis, and the parent expression along its k axis](/assets/images/inside-the-matrix/binary_part.jpg){:style="width:100%"} - - - -## 4 Inside an Attention Head - -Let's look at a GPT2 attention head - specifically layer 5, head 4 of the "gpt2" (small) configuration (layers=12, heads=12, embed=768) from 
[NanoGPT](https://github.com/karpathy/nanoGPT), using OpenAI weights via HuggingFace. Input activations are taken from a forward pass on an OpenWebText training sample of 256 tokens. - -There's nothing particularly unusual about this particular head; I chose it mainly because it computes a fairly common attention pattern and lives in the middle of the model, where activations have become structured and show some interesting texture. (Aside: in a subsequent note I'll present an attention head explorer that lets you visualize all layers and heads of this model, along with some travel notes.) - -[Open in mm](https://bhosmer.github.io/mm/index.html?0=out%20%3D%20(attn%20%3D%20(Q%20%3D%20input%20%40%20wQ)%20%40%20(K_t%20%3D%20wK_t%20%40%20input_t))%20%40%20(V%20%3D%20input%20%40%20wV)%20%40%20wO&1=out&2=none&49=closed&84=true&3.1=attn%20%40%20V&3.4=true&3.5=32&3.6=32&3.7=row%20major&3.8=&3.9=-1&3.10=1&3.11=0&3.2=none&12.13=vmprod&14.15=1&16.17=positive&16.18=left&16.19=bottom&16.20=back&21.1=attn&21.4=true&21.2=softmax(tril(x%2Fsqrt(k)))&22.1=Q&22.4=true&22.2=none&23.1=input&23.4=false&23.5=256&23.6=768&23.7=url&23.9=-1&23.10=1&23.11=0&23.8=https%3A%2F%2Fraw.githubusercontent.com%2Fbhosmer%2Ftestdata%2Fmain%2Fweights%2Fgpt2%2Flayer5_input0_256_768.csv&23.0=&24.1=wQ&24.4=false&24.5=768&24.6=64&24.7=url&24.9=-1&24.10=1&24.11=0&24.8=https%3A%2F%2Fraw.githubusercontent.com%2Fbhosmer%2Ftestdata%2Fmain%2Fweights%2Fgpt2%2Flayer5_wq4_768_64.csv&24.0=&25.13=vmprod&26.15=1&27.17=positive&27.18=left&27.19=bottom&27.20=back&28.2=none&29.13=none&30.15=1&31.17=positive&31.18=right&31.19=top&31.20=back&32.1=wK_t&32.4=false&32.5=64&32.6=768&32.7=url&32.8=https%3A%2F%2Fraw.githubusercontent.com%2Fbhosmer%2Ftestdata%2Fmain%2Fweights%2Fgpt2%2Flayer5_wk_t4_64_768.csv&32.9=-1&32.10=1&32.11=0&32.0=&33.1=input_t&33.4=false&33.5=768&33.6=256&33.7=url&33.8=https%3A%2F%2Fraw.githubusercontent.com%2Fbhosmer%2Ftestdata%2Fmain%2Fweights%2Fgpt2%2Flayer5_input_t0_768_256.csv&33.9=-1&33.10=1&33.11=0&33.0=&28.1=K_t&28.4=true&34.13=vmprod&35.15=1&36.17=negative&36.18=left&36.19=top&36.20=front&37.1=V&37.4=true&37.2=none&38.1=input&38.4=false&38.5=256&38.6=768&38.7=url&38.9=-1&38.10=1&38.11=0&38.8=https%3A%2F%2Fraw.githubusercontent.com%2Fbhosmer%2Ftestdata%2Fmain%2Fweights%2Fgpt2%2Flayer5_input0_256_768.csv&38.0=&39.1=wV&39.4=false&39.5=768&39.6=64&39.7=url&39.9=-1&39.10=1&39.11=0&39.8=https%3A%2F%2Fraw.githubusercontent.com%2Fbhosmer%2Ftestdata%2Fmain%2Fweights%2Fgpt2%2Flayer5_wv4_768_64.csv&39.0=&40.13=none&41.15=1&42.17=negative&42.18=right&42.19=top&42.20=back&43.1=wO&43.4=false&43.5=64&43.6=768&43.7=url&43.8=https%3A%2F%2Fraw.githubusercontent.com%2Fbhosmer%2Ftestdata%2Fmain%2Fweights%2Fgpt2%2Flayer5_wo4_64_768.csv&43.9=-1&43.10=1&43.11=0&43.0=&44.45=sync&44.46=16&44.47=false&44.13=none&44.48=0&44.49=open&50.51=1&50.52=1&50.15=1&53.54=blocks&53.55=10&53.56=0&53.57=1&53.58=0&53.17=negative&53.18=left&53.19=top&53.20=front&53.49=open&59.60=10&59.61=true&59.62=4&59.63=0.394&59.64=0.655&59.65=0.5&59.66=12&59.67=false&59.68=false&59.49=open&69.70=local&69.71=0.2&69.72=0.2&69.73=0.9&69.74=2&69.75=0.75&69.76=0.75&69.77=0.03&69.49=closed&78.8=&78.49=closed&79.80=-1149.3128801149742&79.81=1143.004532598807&79.82=1754.3660479535383&83.80=-6.708919569777563&83.81=75.05036284609801&83.82=-216.66743330111652&expr=0&name=1&epilog=2&left=3&matmul=4&h=5&w=6&init=7&url=8&min=9&max=10&dropout=11&left.anim=12&alg=13&left.block=14&k%20blocks=15&left.layout=16&polarity=17&left%20placement=18&right%20placement=19&result%20placement=20&left.left=
21&left.left.left=22&left.left.left.left=23&left.left.left.right=24&left.left.left.anim=25&left.left.left.block=26&left.left.left.layout=27&left.left.right=28&left.left.right.anim=29&left.left.right.block=30&left.left.right.layout=31&left.left.right.left=32&left.left.right.right=33&left.left.anim=34&left.left.block=35&left.left.layout=36&left.right=37&left.right.left=38&left.right.right=39&left.right.anim=40&left.right.block=41&left.right.layout=42&right=43&anim=44&fuse=45&speed=46&hide%20inputs=47&spin=48&folder=49&block=50&i%20blocks=51&j%20blocks=52&layout=53&scheme=54&gap=55&scatter=56&molecule=57&blast=58&deco=59&legends=60&shape=61&spotlight=62&row%20guides=63&flow%20guides=64&lens%20size=65&magnification=66&interior%20spotlight=67&axes=68&viz=69&sensitivity=70&min%20size=71&min%20light=72&max%20light=73&elem%20scale=74&zero%20hue=75&hue%20gap=76&hue%20spread=77&diag=78&cam=79&x=80&y=81&z=82&cam.target=83&compress=84) (may take a few seconds to fetch model weights) - -![There's nothing particularly unusual about this particular head](/assets/images/inside-the-matrix/mha1.jpg){:style="width:100%"} - - - - -### 4a Structure - -The entire attention head is visualized as a single compound expression, starting with input and ending with projected output. (Note: to keep things self-contained we do per-head output projection as described in [Megatron-LM](https://arxiv.org/pdf/1909.08053.pdf).) - -The computation contains six matmuls: - -``` -Q = input @ wQ // 1 -K_t = wK_t @ input_t // 2 -V = input @ wV // 3 -attn = sdpa(Q @ K_t) // 4 -head_out = attn @ V // 5 -out = head_out @ wO // 6 -``` - -A thumbnail description of what we're looking at: - - - -* the blades of the windmill are matmuls 1, 2, 3 and 6: the former group are the in-projections from input to Q, K and V; the latter is the out-projection from attn @ V back to the embedding dimension. -* at the hub is the double matmul that first calculates attention scores (convex cube in back), then uses them to produce output tokens from the values vector (concave cube in front). Causality means that the attention scores form a lower triangle. 
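If it helps to see the same structure as code, here's a minimal PyTorch sketch of those six matmuls, with random tensors standing in for the GPT-2 weights and activations (the real example loads the layer 5 / head 4 weights from the URLs in the link above); `sdpa` is written out here as the scale / causal-mask / softmax epilog the tool applies:

```
import math
import torch

# Sketch only: the six matmuls above, with random stand-ins for the
# GPT-2 weights and activations (seq 256, embed 768, head dim 64).
seq, emb, hd = 256, 768, 64
input    = torch.randn(seq, emb)          # layer-5 input activations (stand-in)
input_t  = input.T
wQ, wV   = torch.randn(emb, hd), torch.randn(emb, hd)
wK_t     = torch.randn(hd, emb)
wO       = torch.randn(hd, emb)

def sdpa(scores, head_dim):
    # the softmax(tril(x / sqrt(k))) epilog, implemented via a -inf causal mask
    scores = scores / math.sqrt(head_dim)
    causal = torch.tril(torch.ones_like(scores, dtype=torch.bool))
    return torch.softmax(scores.masked_fill(~causal, float("-inf")), dim=-1)

Q        = input @ wQ                     # 1
K_t      = wK_t @ input_t                 # 2
V        = input @ wV                     # 3
attn     = sdpa(Q @ K_t, hd)              # 4  lower-triangular, rows sum to 1
head_out = attn @ V                       # 5
out      = head_out @ wO                  # 6  per-head out-projection to embed dim
```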
- -But I'd encourage [exploring this example in the tool itself](https://bhosmer.github.io/mm/index.html?0=out%20%3D%20(attn%20%3D%20(Q%20%3D%20input%20%40%20wQ)%20%40%20(K_t%20%3D%20wK_t%20%40%20input_t))%20%40%20(V%20%3D%20input%20%40%20wV)%20%40%20wO&1=out&2=none&49=closed&84=true&3.1=attn%20%40%20V&3.4=true&3.5=32&3.6=32&3.7=row%20major&3.8=&3.9=-1&3.10=1&3.11=0&3.2=none&12.13=vmprod&14.15=1&16.17=positive&16.18=left&16.19=bottom&16.20=back&21.1=attn&21.4=true&21.2=softmax(tril(x%2Fsqrt(k)))&22.1=Q&22.4=true&22.2=none&23.1=input&23.4=false&23.5=256&23.6=768&23.7=url&23.9=-1&23.10=1&23.11=0&23.8=https%3A%2F%2Fraw.githubusercontent.com%2Fbhosmer%2Ftestdata%2Fmain%2Fweights%2Fgpt2%2Flayer5_input0_256_768.csv&23.0=&24.1=wQ&24.4=false&24.5=768&24.6=64&24.7=url&24.9=-1&24.10=1&24.11=0&24.8=https%3A%2F%2Fraw.githubusercontent.com%2Fbhosmer%2Ftestdata%2Fmain%2Fweights%2Fgpt2%2Flayer5_wq4_768_64.csv&24.0=&25.13=vmprod&26.15=1&27.17=positive&27.18=left&27.19=bottom&27.20=back&28.2=none&29.13=none&30.15=1&31.17=positive&31.18=right&31.19=top&31.20=back&32.1=wK_t&32.4=false&32.5=64&32.6=768&32.7=url&32.8=https%3A%2F%2Fraw.githubusercontent.com%2Fbhosmer%2Ftestdata%2Fmain%2Fweights%2Fgpt2%2Flayer5_wk_t4_64_768.csv&32.9=-1&32.10=1&32.11=0&32.0=&33.1=input_t&33.4=false&33.5=768&33.6=256&33.7=url&33.8=https%3A%2F%2Fraw.githubusercontent.com%2Fbhosmer%2Ftestdata%2Fmain%2Fweights%2Fgpt2%2Flayer5_input_t0_768_256.csv&33.9=-1&33.10=1&33.11=0&33.0=&28.1=K_t&28.4=true&34.13=vmprod&35.15=1&36.17=negative&36.18=left&36.19=top&36.20=front&37.1=V&37.4=true&37.2=none&38.1=input&38.4=false&38.5=256&38.6=768&38.7=url&38.9=-1&38.10=1&38.11=0&38.8=https%3A%2F%2Fraw.githubusercontent.com%2Fbhosmer%2Ftestdata%2Fmain%2Fweights%2Fgpt2%2Flayer5_input0_256_768.csv&38.0=&39.1=wV&39.4=false&39.5=768&39.6=64&39.7=url&39.9=-1&39.10=1&39.11=0&39.8=https%3A%2F%2Fraw.githubusercontent.com%2Fbhosmer%2Ftestdata%2Fmain%2Fweights%2Fgpt2%2Flayer5_wv4_768_64.csv&39.0=&40.13=none&41.15=1&42.17=negative&42.18=right&42.19=top&42.20=back&43.1=wO&43.4=false&43.5=64&43.6=768&43.7=url&43.8=https%3A%2F%2Fraw.githubusercontent.com%2Fbhosmer%2Ftestdata%2Fmain%2Fweights%2Fgpt2%2Flayer5_wo4_64_768.csv&43.9=-1&43.10=1&43.11=0&43.0=&44.45=sync&44.46=16&44.47=false&44.13=none&44.48=0&44.49=open&50.51=1&50.52=1&50.15=1&53.54=blocks&53.55=24&53.56=0&53.57=1&53.58=0&53.17=negative&53.18=left&53.19=top&53.20=front&59.60=10&59.61=true&59.62=4&59.63=0.394&59.64=0&59.65=0.5&59.66=12&59.67=false&59.68=false&59.49=closed&69.70=local&69.71=0&69.72=0.2&69.73=0.9&69.74=2&69.75=0.75&69.76=0.75&69.77=0.03&69.49=open&78.8=&78.49=open&79.80=-1212.5184472916683&79.81=1205.8631771144878&79.82=1850.8460431010271&83.80=-6.708919569777563&83.81=75.05036284609801&83.82=-216.66743330111652&expr=0&name=1&epilog=2&left=3&matmul=4&h=5&w=6&init=7&url=8&min=9&max=10&dropout=11&left.anim=12&alg=13&left.block=14&k%20blocks=15&left.layout=16&polarity=17&left%20placement=18&right%20placement=19&result%20placement=20&left.left=21&left.left.left=22&left.left.left.left=23&left.left.left.right=24&left.left.left.anim=25&left.left.left.block=26&left.left.left.layout=27&left.left.right=28&left.left.right.anim=29&left.left.right.block=30&left.left.right.layout=31&left.left.right.left=32&left.left.right.right=33&left.left.anim=34&left.left.block=35&left.left.layout=36&left.right=37&left.right.left=38&left.right.right=39&left.right.anim=40&left.right.block=41&left.right.layout=42&right=43&anim=44&fuse=45&speed=46&hide%20inputs=47&spin=48&folder=49&block=50&i%20blocks=51&j%20blocks=52&layout=
53&scheme=54&gap=55&scatter=56&molecule=57&blast=58&deco=59&legends=60&shape=61&spotlight=62&row%20guides=63&flow%20guides=64&lens%20size=65&magnification=66&interior%20spotlight=67&axes=68&viz=69&sensitivity=70&min%20size=71&min%20light=72&max%20light=73&elem%20scale=74&zero%20hue=75&hue%20gap=76&hue%20spread=77&diag=78&cam=79&x=80&y=81&z=82&cam.target=83&compress=84), rather than relying on the screenshot or the video below to convey just how much signal can be absorbed from it - both about its structure and the actual values flowing through the computation. - - -### 4b Computation and Values - -Here's an animation of the attention head computation. Specifically, we're watching - -``` -sdpa(input @ wQ @ K_t) @ V @ wO -``` - -(i.e., matmuls 1, 4 , 5 and 6 above, with `K_t` and `V` precomputed) being computed as a fused chain of vector-matrix products: each item in the sequence goes all the way from input through attention to output in one step. More on this animation choice in the later section on parallelization, but first let's look at what the values being computed tell us. - -[Open in mm](https://bhosmer.github.io/mm/index.html?0=out%20%3D%20(attn%20%3D%20(Q%20%3D%20input_0_5%20%40%20wQ_5_4)%20%40%20(K_t%20%3D%20wK_t_5_4%20%40%20input_t_0_5))%20%40%20(V%20%3D%20input_0_5%20%40%20wV_5_4)%20%40%20wO_5_4&1=out&2=none&49=closed&84=true&3.1=attn%20%40%20V&3.4=true&3.5=32&3.6=32&3.7=row%20major&3.8=&3.9=-1&3.10=1&3.11=0&3.2=none&12.13=vmprod&14.15=1&16.17=positive&16.18=left&16.19=bottom&16.20=back&21.1=attn&21.4=true&21.2=softmax(tril(x%2Fsqrt(k)))&22.1=Q&22.4=true&22.2=none&23.1=input_0_5&23.4=false&23.5=256&23.6=768&23.7=url&23.9=-1&23.10=1&23.11=0&23.8=https%3A%2F%2Fraw.githubusercontent.com%2Fbhosmer%2Ftestdata%2Fmain%2Fweights%2Fgpt2%2Flayer5_input0_256_768.csv&23.0=&24.1=wQ_5_4&24.4=false&24.5=768&24.6=64&24.7=url&24.9=-1&24.10=1&24.11=0&24.8=https%3A%2F%2Fraw.githubusercontent.com%2Fbhosmer%2Ftestdata%2Fmain%2Fweights%2Fgpt2%2Flayer5_wq4_768_64.csv&24.0=&25.13=vmprod&26.15=1&27.17=positive&27.18=left&27.19=bottom&27.20=back&28.2=none&29.13=none&30.15=1&31.17=positive&31.18=right&31.19=top&31.20=back&32.1=wK_t_5_4&32.4=false&32.5=64&32.6=768&32.7=url&32.8=https%3A%2F%2Fraw.githubusercontent.com%2Fbhosmer%2Ftestdata%2Fmain%2Fweights%2Fgpt2%2Flayer5_wk_t4_64_768.csv&32.9=-1&32.10=1&32.11=0&32.0=&33.1=input_t_0_5&33.4=false&33.5=768&33.6=256&33.7=url&33.8=https%3A%2F%2Fraw.githubusercontent.com%2Fbhosmer%2Ftestdata%2Fmain%2Fweights%2Fgpt2%2Flayer5_input_t0_768_256.csv&33.9=-1&33.10=1&33.11=0&33.0=&28.1=K_t&28.4=true&34.13=vmprod&35.15=1&36.17=negative&36.18=left&36.19=top&36.20=front&37.1=V&37.4=true&37.2=none&38.1=input_0_5&38.4=false&38.5=256&38.6=768&38.7=url&38.9=-1&38.10=1&38.11=0&38.8=https%3A%2F%2Fraw.githubusercontent.com%2Fbhosmer%2Ftestdata%2Fmain%2Fweights%2Fgpt2%2Flayer5_input0_256_768.csv&38.0=&39.1=wV_5_4&39.4=false&39.5=768&39.6=64&39.7=url&39.9=-1&39.10=1&39.11=0&39.8=https%3A%2F%2Fraw.githubusercontent.com%2Fbhosmer%2Ftestdata%2Fmain%2Fweights%2Fgpt2%2Flayer5_wv4_768_64.csv&39.0=&40.13=none&41.15=1&42.17=negative&42.18=right&42.19=top&42.20=back&43.1=wO_5_4&43.4=false&43.5=64&43.6=768&43.7=url&43.8=https%3A%2F%2Fraw.githubusercontent.com%2Fbhosmer%2Ftestdata%2Fmain%2Fweights%2Fgpt2%2Flayer5_wo4_64_768.csv&43.9=-1&43.10=1&43.11=0&43.0=&44.45=sync&44.46=16&44.47=false&44.13=vmprod&44.48=0&44.49=open&50.51=1&50.52=1&50.15=1&53.54=blocks&53.55=24&53.56=0&53.57=1&53.58=0&53.17=negative&53.18=left&53.19=top&53.20=front&59.60=8.38&59.61=true&59.62=4&59.63=0.394&59.64=0&59.65=0.
5&59.66=12&59.67=false&59.68=false&59.49=open&69.70=local&69.71=0.05&69.72=0.2&69.73=0.9&69.74=2&69.75=0.75&69.76=0.75&69.77=0.03&78.8=&79.80=-382.8684269325278&79.81=293.7591554956184&79.82=395.95878922315694&83.80=-14.023727291338966&83.81=-38.22974037070054&83.82=-84.10726407282482&expr=0&name=1&epilog=2&left=3&matmul=4&h=5&w=6&init=7&url=8&min=9&max=10&dropout=11&left.anim=12&alg=13&left.block=14&k%20blocks=15&left.layout=16&polarity=17&left%20placement=18&right%20placement=19&result%20placement=20&left.left=21&left.left.left=22&left.left.left.left=23&left.left.left.right=24&left.left.left.anim=25&left.left.left.block=26&left.left.left.layout=27&left.left.right=28&left.left.right.anim=29&left.left.right.block=30&left.left.right.layout=31&left.left.right.left=32&left.left.right.right=33&left.left.anim=34&left.left.block=35&left.left.layout=36&left.right=37&left.right.left=38&left.right.right=39&left.right.anim=40&left.right.block=41&left.right.layout=42&right=43&anim=44&fuse=45&speed=46&hide%20inputs=47&spin=48&folder=49&block=50&i%20blocks=51&j%20blocks=52&layout=53&scheme=54&gap=55&scatter=56&molecule=57&blast=58&deco=59&legends=60&shape=61&spotlight=62&row%20guides=63&flow%20guides=64&lens%20size=65&magnification=66&interior%20spotlight=67&axes=68&viz=69&sensitivity=70&min%20size=71&min%20light=72&max%20light=73&elem%20scale=74&zero%20hue=75&hue%20gap=76&hue%20spread=77&diag=78&cam=79&x=80&y=81&z=82&cam.target=83&compress=84) - -
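Here's a self-contained sketch of that fused computation (random stand-ins again, so only the shapes and the dataflow are meaningful, not the values): each sequence position travels from input through attention to output as a short chain of vector-matrix products, mirroring the animation:

```
import math
import torch

# Sketch only: the fused, per-row version of sdpa(input @ wQ @ K_t) @ V @ wO,
# with K_t and V precomputed - each position i goes from input to output as
# a chain of vector-matrix products (random stand-ins, as in the 4a sketch).
seq, emb, hd = 256, 768, 64
input = torch.randn(seq, emb)
wQ, wV, wK_t, wO = (torch.randn(emb, hd), torch.randn(emb, hd),
                    torch.randn(hd, emb), torch.randn(hd, emb))
K_t, V = wK_t @ input.T, input @ wV                      # matmuls 2 and 3, up front

rows = []
for i in range(seq):
    q_i    = input[i] @ wQ                               # matmul 1, one row of Q
    s_i    = (q_i @ K_t[:, : i + 1]) / math.sqrt(hd)     # matmul 4 scores, causal slice
    a_i    = torch.softmax(s_i, dim=-1)                  # one row of attn
    head_i = a_i @ V[: i + 1]                            # matmul 5: weighted average of V rows
    rows.append(head_i @ wO)                             # matmul 6
out = torch.stack(rows)                                  # (seq, emb) head output
```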

        - -

        - -There's a lot of interesting stuff going on here. - - - -* Before we even get to the attention calculation, it's quite striking how low-rank `Q` and `K_t` are. [Zooming in on the Q @ K_t vector-matrix product animation](https://bhosmer.github.io/mm/index.html?0=out%20%3D%20(attn%20%3D%20(Q%20%3D%20input_0_5%20%40%20wQ_5_4)%20%40%20(K_t%20%3D%20wK_t_5_4%20%40%20input_t_0_5))%20%40%20(V%20%3D%20input_0_5%20%40%20wV_5_4)%20%40%20wO_5_4&1=out&2=none&24=closed&84=true&3.1=attn%20%40%20V&3.4=true&3.5=32&3.6=32&3.7=row%20major&3.8=&3.9=-1&3.10=1&3.11=0&3.2=none&12.13=vmprod&14.15=1&16.17=positive&16.18=left&16.19=bottom&16.20=back&21.1=attn&21.4=true&21.2=softmax(tril(x%2Fsqrt(k)))&22.1=Q&22.4=true&22.2=none&23.1=input_0_5&23.4=false&23.5=256&23.6=768&23.7=url&23.9=-1&23.10=1&23.11=0&23.8=https%3A%2F%2Fraw.githubusercontent.com%2Fbhosmer%2Ftestdata%2Fmain%2Fweights%2Fgpt2%2Flayer5_input0_256_768.csv&23.0=&23.24=closed&25.1=wQ_5_4&25.4=false&25.5=768&25.6=64&25.7=url&25.9=-1&25.10=1&25.11=0&25.8=https%3A%2F%2Fraw.githubusercontent.com%2Fbhosmer%2Ftestdata%2Fmain%2Fweights%2Fgpt2%2Flayer5_wq4_768_64.csv&25.0=&26.13=none&26.24=open&27.15=1&28.17=positive&28.18=left&28.19=bottom&28.20=back&22.24=open&29.2=none&30.13=none&31.15=1&32.17=positive&32.18=right&32.19=top&32.20=back&33.1=wK_t_5_4&33.4=false&33.5=64&33.6=768&33.7=url&33.8=https%3A%2F%2Fraw.githubusercontent.com%2Fbhosmer%2Ftestdata%2Fmain%2Fweights%2Fgpt2%2Flayer5_wk_t4_64_768.csv&33.9=-1&33.10=1&33.11=0&33.0=&34.1=input_t_0_5&34.4=false&34.5=768&34.6=256&34.7=url&34.8=https%3A%2F%2Fraw.githubusercontent.com%2Fbhosmer%2Ftestdata%2Fmain%2Fweights%2Fgpt2%2Flayer5_input_t0_768_256.csv&34.9=-1&34.10=1&34.11=0&34.0=&29.1=K_t&29.4=true&35.13=vmprod&36.15=1&37.17=negative&37.18=left&37.19=top&37.20=front&21.24=closed&38.1=V&38.4=true&38.2=none&39.1=input_0_5&39.4=false&39.5=256&39.6=768&39.7=url&39.9=-1&39.10=1&39.11=0&39.8=https%3A%2F%2Fraw.githubusercontent.com%2Fbhosmer%2Ftestdata%2Fmain%2Fweights%2Fgpt2%2Flayer5_input0_256_768.csv&39.0=&40.1=wV_5_4&40.4=false&40.5=768&40.6=64&40.7=url&40.9=-1&40.10=1&40.11=0&40.8=https%3A%2F%2Fraw.githubusercontent.com%2Fbhosmer%2Ftestdata%2Fmain%2Fweights%2Fgpt2%2Flayer5_wv4_768_64.csv&40.0=&41.13=none&42.15=1&43.17=negative&43.18=right&43.19=top&43.20=back&3.24=closed&44.1=wO_5_4&44.4=false&44.5=64&44.6=768&44.7=url&44.8=https%3A%2F%2Fraw.githubusercontent.com%2Fbhosmer%2Ftestdata%2Fmain%2Fweights%2Fgpt2%2Flayer5_wo4_64_768.csv&44.9=-1&44.10=1&44.11=0&44.0=&45.46=sync&45.47=4&45.48=false&45.13=vmprod&45.49=0&45.24=open&50.51=1&50.52=1&50.15=1&53.54=blocks&53.55=24&53.56=0&53.57=1&53.58=0&53.17=negative&53.18=left&53.19=top&53.20=front&59.60=8.38&59.61=true&59.62=4&59.63=0.394&59.64=0&59.65=0.5&59.66=12&59.67=false&59.68=false&59.24=open&69.70=local&69.71=0.05&69.72=0.2&69.73=0.9&69.74=2&69.75=0.75&69.76=0.75&69.77=0.03&78.8=&79.80=-0.30816774330149777&79.81=333.6054152134701&79.82=155.72856559616935&83.80=-0.11764216999897817&83.81=-38.43510027180947&83.82=-78.52287109278605&expr=0&name=1&epilog=2&left=3&matmul=4&h=5&w=6&init=7&url=8&min=9&max=10&dropout=11&left.anim=12&alg=13&left.block=14&k%20blocks=15&left.layout=16&polarity=17&left%20placement=18&right%20placement=19&result%20placement=20&left.left=21&left.left.left=22&left.left.left.left=23&folder=24&left.left.left.right=25&left.left.left.anim=26&left.left.left.block=27&left.left.left.layout=28&left.left.right=29&left.left.right.anim=30&left.left.right.block=31&left.left.right.layout=32&left.left.right.left=33&left.left.right.right=34&left.left
.anim=35&left.left.block=36&left.left.layout=37&left.right=38&left.right.left=39&left.right.right=40&left.right.anim=41&left.right.block=42&left.right.layout=43&right=44&anim=45&fuse=46&speed=47&hide%20inputs=48&spin=49&block=50&i%20blocks=51&j%20blocks=52&layout=53&scheme=54&gap=55&scatter=56&molecule=57&blast=58&deco=59&legends=60&shape=61&spotlight=62&row%20guides=63&flow%20guides=64&lens%20size=65&magnification=66&interior%20spotlight=67&axes=68&viz=69&sensitivity=70&min%20size=71&min%20light=72&max%20light=73&elem%20scale=74&zero%20hue=75&hue%20gap=76&hue%20spread=77&diag=78&cam=79&x=80&y=81&z=82&cam.target=83&compress=84), the situation is even more vivid: a significant number of channels (embedding positions) in _both_ `Q` and `K` look more or less constant across the sequence, implying that the useful attention signal is potentially driven by a only smallish subset of the embedding. Understanding and exploiting this phenomenon is one of the threads we're pulling on as part of the SysML ATOM transformer efficiency project. -* Perhaps most familiar is the strong-but-not-perfect diagonal that emerges in the attention matrix. This is a common pattern, showing up in many of the attention heads of this model (and those of many transformers). It produces _localized_ attention: the value tokens in the small neighborhood immediately preceding an output token's position largely determine that output token's content pattern. -* However, the size of this neighborhood and the influence of individual tokens within it vary nontrivially - this can be seen both in the off-diagonal frost in the attention grid, and in the [fluctuating patterns of the attn[i] @ V vector-matrix product plane](https://bhosmer.github.io/mm/index.html?0=out%20%3D%20(attn%20%3D%20(Q%20%3D%20input_0_5%20%40%20wQ_5_4)%20%40%20(K_t%20%3D%20wK_t_5_4%20%40%20input_t_0_5))%20%40%20(V%20%3D%20input_0_5%20%40%20wV_5_4)%20%40%20wO_5_4&1=out&2=none&26=closed&84=true&3.1=attn%20%40%20V&3.4=true&3.5=32&3.6=32&3.7=row%20major&3.8=&3.9=-1&3.10=1&3.11=0&3.2=none&12.13=vmprod&14.15=1&16.17=positive&16.18=left&16.19=bottom&16.20=back&21.1=attn&21.4=true&21.2=softmax(tril(x%2Fsqrt(k)))&22.1=Q&22.4=true&22.2=none&23.1=input_0_5&23.4=false&23.5=256&23.6=768&23.7=url&23.9=-1&23.10=1&23.11=0&23.8=https%3A%2F%2Fraw.githubusercontent.com%2Fbhosmer%2Ftestdata%2Fmain%2Fweights%2Fgpt2%2Flayer5_input0_256_768.csv&23.0=&24.1=wQ_5_4&24.4=false&24.5=768&24.6=64&24.7=url&24.9=-1&24.10=1&24.11=0&24.8=https%3A%2F%2Fraw.githubusercontent.com%2Fbhosmer%2Ftestdata%2Fmain%2Fweights%2Fgpt2%2Flayer5_wq4_768_64.csv&24.0=&25.13=none&25.26=open&27.15=1&28.17=positive&28.18=left&28.19=bottom&28.20=back&22.26=closed&29.2=none&30.13=none&31.15=1&32.17=positive&32.18=right&32.19=top&32.20=back&33.1=wK_t_5_4&33.4=false&33.5=64&33.6=768&33.7=url&33.8=https%3A%2F%2Fraw.githubusercontent.com%2Fbhosmer%2Ftestdata%2Fmain%2Fweights%2Fgpt2%2Flayer5_wk_t4_64_768.csv&33.9=-1&33.10=1&33.11=0&33.0=&34.1=input_t_0_5&34.4=false&34.5=768&34.6=256&34.7=url&34.8=https%3A%2F%2Fraw.githubusercontent.com%2Fbhosmer%2Ftestdata%2Fmain%2Fweights%2Fgpt2%2Flayer5_input_t0_768_256.csv&34.9=-1&34.10=1&34.11=0&34.0=&29.1=K_t&29.4=true&35.13=none&35.26=open&36.15=1&37.17=negative&37.18=left&37.19=top&37.20=front&21.26=open&38.1=V&38.4=true&38.2=none&39.1=input_0_5&39.4=false&39.5=256&39.6=768&39.7=url&39.9=-1&39.10=1&39.11=0&39.8=https%3A%2F%2Fraw.githubusercontent.com%2Fbhosmer%2Ftestdata%2Fmain%2Fweights%2Fgpt2%2Flayer5_input0_256_768.csv&39.0=&40.1=wV_5_4&40.4=false&40.5=768&40.6=64&40.7=url&40.9
=-1&40.10=1&40.11=0&40.8=https%3A%2F%2Fraw.githubusercontent.com%2Fbhosmer%2Ftestdata%2Fmain%2Fweights%2Fgpt2%2Flayer5_wv4_768_64.csv&40.0=&41.13=none&42.15=1&43.17=negative&43.18=right&43.19=top&43.20=back&3.26=open&44.1=wO_5_4&44.4=false&44.5=64&44.6=768&44.7=url&44.8=https%3A%2F%2Fraw.githubusercontent.com%2Fbhosmer%2Ftestdata%2Fmain%2Fweights%2Fgpt2%2Flayer5_wo4_64_768.csv&44.9=-1&44.10=1&44.11=0&44.0=&45.46=sync&45.47=16&45.48=false&45.13=vmprod&45.49=0&45.26=open&50.51=1&50.52=1&50.15=1&53.54=blocks&53.55=24&53.56=0&53.57=1&53.58=0&53.17=negative&53.18=left&53.19=top&53.20=front&59.60=8.38&59.61=true&59.62=4&59.63=0.394&59.64=0&59.65=0.5&59.66=12&59.67=false&59.68=false&59.26=open&69.70=local&69.71=0.05&69.72=0.2&69.73=0.9&69.74=2&69.75=0.75&69.76=0.75&69.77=0.03&78.8=&79.80=-12.838747258760423&79.81=224.62765397316576&79.82=274.71626756027933&83.80=-13.049253781233714&83.81=-55.16215322834755&83.82=-70.26525235295296&expr=0&name=1&epilog=2&left=3&matmul=4&h=5&w=6&init=7&url=8&min=9&max=10&dropout=11&left.anim=12&alg=13&left.block=14&k%20blocks=15&left.layout=16&polarity=17&left%20placement=18&right%20placement=19&result%20placement=20&left.left=21&left.left.left=22&left.left.left.left=23&left.left.left.right=24&left.left.left.anim=25&folder=26&left.left.left.block=27&left.left.left.layout=28&left.left.right=29&left.left.right.anim=30&left.left.right.block=31&left.left.right.layout=32&left.left.right.left=33&left.left.right.right=34&left.left.anim=35&left.left.block=36&left.left.layout=37&left.right=38&left.right.left=39&left.right.right=40&left.right.anim=41&left.right.block=42&left.right.layout=43&right=44&anim=45&fuse=46&speed=47&hide%20inputs=48&spin=49&block=50&i%20blocks=51&j%20blocks=52&layout=53&scheme=54&gap=55&scatter=56&molecule=57&blast=58&deco=59&legends=60&shape=61&spotlight=62&row%20guides=63&flow%20guides=64&lens%20size=65&magnification=66&interior%20spotlight=67&axes=68&viz=69&sensitivity=70&min%20size=71&min%20light=72&max%20light=73&elem%20scale=74&zero%20hue=75&hue%20gap=76&hue%20spread=77&diag=78&cam=79&x=80&y=81&z=82&cam.target=83&compress=84) as it descends the attention matrix on its way through the sequence. -* But note that the local neighborhood isn't the only thing that's attracting attention: the leftmost column of the attention grid, corresponding to the first token of the sequence, is entirely filled with nonzero (but fluctuating) values, meaning every output token will be influenced to some degree by the first value token. 
-* Moreover there's an [inexact but discernible oscillation in attention score dominance](https://bhosmer.github.io/mm/index.html?0=out%20%3D%20(attn%20%3D%20(Q%20%3D%20input_0_5%20%40%20wQ_5_4)%20%40%20(K_t%20%3D%20wK_t_5_4%20%40%20input_t_0_5))%20%40%20(V%20%3D%20input_0_5%20%40%20wV_5_4)%20%40%20wO_5_4&1=out&2=none&49=closed&84=true&3.1=attn%20%40%20V&3.4=true&3.5=32&3.6=32&3.7=row%20major&3.8=&3.9=-1&3.10=1&3.11=0&3.2=none&12.13=vmprod&14.15=1&16.17=positive&16.18=left&16.19=bottom&16.20=back&21.1=attn&21.4=true&21.2=softmax(tril(x%2Fsqrt(k)))&22.1=Q&22.4=true&22.2=none&23.1=input_0_5&23.4=false&23.5=256&23.6=768&23.7=url&23.9=-1&23.10=1&23.11=0&23.8=https%3A%2F%2Fraw.githubusercontent.com%2Fbhosmer%2Ftestdata%2Fmain%2Fweights%2Fgpt2%2Flayer5_input0_256_768.csv&23.0=&24.1=wQ_5_4&24.4=false&24.5=768&24.6=64&24.7=url&24.9=-1&24.10=1&24.11=0&24.8=https%3A%2F%2Fraw.githubusercontent.com%2Fbhosmer%2Ftestdata%2Fmain%2Fweights%2Fgpt2%2Flayer5_wq4_768_64.csv&24.0=&25.13=vmprod&26.15=1&27.17=positive&27.18=left&27.19=bottom&27.20=back&28.2=none&29.13=none&30.15=1&31.17=positive&31.18=right&31.19=top&31.20=back&32.1=wK_t_5_4&32.4=false&32.5=64&32.6=768&32.7=url&32.8=https%3A%2F%2Fraw.githubusercontent.com%2Fbhosmer%2Ftestdata%2Fmain%2Fweights%2Fgpt2%2Flayer5_wk_t4_64_768.csv&32.9=-1&32.10=1&32.11=0&32.0=&33.1=input_t_0_5&33.4=false&33.5=768&33.6=256&33.7=url&33.8=https%3A%2F%2Fraw.githubusercontent.com%2Fbhosmer%2Ftestdata%2Fmain%2Fweights%2Fgpt2%2Flayer5_input_t0_768_256.csv&33.9=-1&33.10=1&33.11=0&33.0=&28.1=K_t&28.4=true&34.13=vmprod&35.15=1&36.17=negative&36.18=left&36.19=top&36.20=front&37.1=V&37.4=true&37.2=none&38.1=input_0_5&38.4=false&38.5=256&38.6=768&38.7=url&38.9=-1&38.10=1&38.11=0&38.8=https%3A%2F%2Fraw.githubusercontent.com%2Fbhosmer%2Ftestdata%2Fmain%2Fweights%2Fgpt2%2Flayer5_input0_256_768.csv&38.0=&39.1=wV_5_4&39.4=false&39.5=768&39.6=64&39.7=url&39.9=-1&39.10=1&39.11=0&39.8=https%3A%2F%2Fraw.githubusercontent.com%2Fbhosmer%2Ftestdata%2Fmain%2Fweights%2Fgpt2%2Flayer5_wv4_768_64.csv&39.0=&40.13=none&41.15=1&42.17=negative&42.18=right&42.19=top&42.20=back&43.1=wO_5_4&43.4=false&43.5=64&43.6=768&43.7=url&43.8=https%3A%2F%2Fraw.githubusercontent.com%2Fbhosmer%2Ftestdata%2Fmain%2Fweights%2Fgpt2%2Flayer5_wo4_64_768.csv&43.9=-1&43.10=1&43.11=0&43.0=&44.45=sync&44.46=16&44.47=false&44.13=none&44.48=0&44.49=open&50.51=1&50.52=1&50.15=1&53.54=blocks&53.55=24&53.56=0&53.57=1&53.58=0&53.17=negative&53.18=left&53.19=top&53.20=front&59.60=8.38&59.61=true&59.62=4&59.63=0.394&59.64=0&59.65=0.5&59.66=12&59.67=false&59.68=false&59.49=open&69.70=local&69.71=0.05&69.72=0.2&69.73=0.9&69.74=2&69.75=0.75&69.76=0.75&69.77=0.03&78.8=&79.80=-328.8286059935543&79.81=-64.64788859858083&79.82=156.66189435044396&83.80=-6.5479856531724625&83.81=-27.630477427688977&83.82=-64.70186279804427&expr=0&name=1&epilog=2&left=3&matmul=4&h=5&w=6&init=7&url=8&min=9&max=10&dropout=11&left.anim=12&alg=13&left.block=14&k%20blocks=15&left.layout=16&polarity=17&left%20placement=18&right%20placement=19&result%20placement=20&left.left=21&left.left.left=22&left.left.left.left=23&left.left.left.right=24&left.left.left.anim=25&left.left.left.block=26&left.left.left.layout=27&left.left.right=28&left.left.right.anim=29&left.left.right.block=30&left.left.right.layout=31&left.left.right.left=32&left.left.right.right=33&left.left.anim=34&left.left.block=35&left.left.layout=36&left.right=37&left.right.left=38&left.right.right=39&left.right.anim=40&left.right.block=41&left.right.layout=42&right=43&anim=44&fuse=45&speed=46&hide%20inputs=47
&spin=48&folder=49&block=50&i%20blocks=51&j%20blocks=52&layout=53&scheme=54&gap=55&scatter=56&molecule=57&blast=58&deco=59&legends=60&shape=61&spotlight=62&row%20guides=63&flow%20guides=64&lens%20size=65&magnification=66&interior%20spotlight=67&axes=68&viz=69&sensitivity=70&min%20size=71&min%20light=72&max%20light=73&elem%20scale=74&zero%20hue=75&hue%20gap=76&hue%20spread=77&diag=78&cam=79&x=80&y=81&z=82&cam.target=83&compress=84) between the current token neighborhood and the initial token. The period of the oscillation varies, but broadly speaking starts short and then lengthens as one travels down the sequence (evocatively correlated with the quantity of candidate attention tokens for each row, given causality). -* To get a feel for how (`attn @ V)` is formed, it's important not to focus on attention in isolation - `V` is an equal player. Each output item is a weighted average of the entire `V` vector: at the limit when attention is a perfect diagonal, `attn @ V` is simply an exact copy of `V`. Here we see [something more textured](https://bhosmer.github.io/mm/index.html?0=out+%3D+%28attn+%3D+%28Q+%3D+input_0_5+%40+wQ_5_4%29+%40+%28K_t+%3D+wK_t_5_4+%40+input_t_0_5%29%29+%40+%28V+%3D+input_0_5+%40+wV_5_4%29+%40+wO_5_4&1=out&2=none&51=closed&84=true&3.1=attn+%40+V&3.4=true&3.5=32&3.6=32&3.7=row+major&3.8=&3.9=-1&3.10=1&3.11=0&3.2=none&12.13=vmprod&14.15=1&14.16=1&14.17=1&18.19=positive&18.20=left&18.21=bottom&18.22=back&23.1=attn&23.4=true&23.2=softmax%28tril%28x%2Fsqrt%28k%29%29%29&24.1=Q&24.4=true&24.2=none&25.1=input_0_5&25.4=false&25.5=256&25.6=768&25.7=url&25.9=-1&25.10=1&25.11=0&25.8=https%3A%2F%2Fraw.githubusercontent.com%2Fbhosmer%2Ftestdata%2Fmain%2Fweights%2Fgpt2%2Flayer5_input0_256_768.csv&25.0=&26.1=wQ_5_4&26.4=false&26.5=768&26.6=64&26.7=url&26.9=-1&26.10=1&26.11=0&26.8=https%3A%2F%2Fraw.githubusercontent.com%2Fbhosmer%2Ftestdata%2Fmain%2Fweights%2Fgpt2%2Flayer5_wq4_768_64.csv&26.0=&27.13=vmprod&28.15=1&28.16=1&28.17=1&29.19=positive&29.20=left&29.21=bottom&29.22=back&30.2=none&31.13=none&32.15=1&32.16=1&32.17=1&33.19=positive&33.20=right&33.21=top&33.22=back&34.1=wK_t_5_4&34.4=false&34.5=64&34.6=768&34.7=url&34.8=https%3A%2F%2Fraw.githubusercontent.com%2Fbhosmer%2Ftestdata%2Fmain%2Fweights%2Fgpt2%2Flayer5_wk_t4_64_768.csv&34.9=-1&34.10=1&34.11=0&34.0=&35.1=input_t_0_5&35.4=false&35.5=768&35.6=256&35.7=url&35.8=https%3A%2F%2Fraw.githubusercontent.com%2Fbhosmer%2Ftestdata%2Fmain%2Fweights%2Fgpt2%2Flayer5_input_t0_768_256.csv&35.9=-1&35.10=1&35.11=0&35.0=&30.1=K_t&30.4=true&36.13=vmprod&37.15=1&37.16=1&37.17=1&38.19=negative&38.20=left&38.21=top&38.22=front&39.1=V&39.4=true&39.2=none&40.1=input_0_5&40.4=false&40.5=256&40.6=768&40.7=url&40.9=-1&40.10=1&40.11=0&40.8=https%3A%2F%2Fraw.githubusercontent.com%2Fbhosmer%2Ftestdata%2Fmain%2Fweights%2Fgpt2%2Flayer5_input0_256_768.csv&40.0=&41.1=wV_5_4&41.4=false&41.5=768&41.6=64&41.7=url&41.9=-1&41.10=1&41.11=0&41.8=https%3A%2F%2Fraw.githubusercontent.com%2Fbhosmer%2Ftestdata%2Fmain%2Fweights%2Fgpt2%2Flayer5_wv4_768_64.csv&41.0=&42.13=none&43.15=1&43.16=1&43.17=1&44.19=negative&44.20=right&44.21=top&44.22=back&45.1=wO_5_4&45.4=false&45.5=64&45.6=768&45.7=url&45.8=https%3A%2F%2Fraw.githubusercontent.com%2Fbhosmer%2Ftestdata%2Fmain%2Fweights%2Fgpt2%2Flayer5_wo4_64_768.csv&45.9=-1&45.10=1&45.11=0&45.0=&46.47=sync&46.48=16&46.49=false&46.13=none&46.50=0&46.51=open&52.16=1&52.15=1&52.17=1&53.54=blocks&53.55=24&53.56=0&53.57=1&53.58=0&53.19=negative&53.20=left&53.21=top&53.22=front&59.60=8.38&59.61=true&59.62=4&59.63=0.394&59.64=0&59.65=0.5&59
.66=20&59.67=false&59.68=false&59.51=open&69.70=local&69.71=0.05&69.72=0.2&69.73=0.9&69.74=2&69.75=0.75&69.76=0.75&69.77=0.03&78.8=&79.80=-164.2339403949366&79.81=18.940074323234473&79.82=173.55325640245638&83.80=117.76774477612946&83.81=2.623526996843087&83.82=53.25986191913323&expr=0&name=1&epilog=2&left=3&matmul=4&h=5&w=6&init=7&url=8&min=9&max=10&dropout=11&left.anim=12&alg=13&left.block=14&k+blocks=15&i+blocks=16&j+blocks=17&left.layout=18&polarity=19&left+placement=20&right+placement=21&result+placement=22&left.left=23&left.left.left=24&left.left.left.left=25&left.left.left.right=26&left.left.left.anim=27&left.left.left.block=28&left.left.left.layout=29&left.left.right=30&left.left.right.anim=31&left.left.right.block=32&left.left.right.layout=33&left.left.right.left=34&left.left.right.right=35&left.left.anim=36&left.left.block=37&left.left.layout=38&left.right=39&left.right.left=40&left.right.right=41&left.right.anim=42&left.right.block=43&left.right.layout=44&right=45&anim=46&fuse=47&speed=48&hide+inputs=49&spin=50&folder=51&block=52&layout=53&scheme=54&gap=55&scatter=56&molecule=57&blast=58&deco=59&legends=60&shape=61&spotlight=62&row+guides=63&flow+guides=64&lens+size=65&magnification=66&interior+spotlight=67&axes=68&viz=69&sensitivity=70&min+size=71&min+light=72&max+light=73&elem+scale=74&zero+hue=75&hue+gap=76&hue+spread=77&diag=78&cam=79&x=80&y=81&z=82&cam.target=83&compress=84): visible banding where particular tokens have scored high over a contiguous subsequence of attention rows, superimposed on a matrix visibly similar to to `V` but with some vertical smearing due to the fat diagonal. (Aside: per the [mm reference guide](https://bhosmer.github.io/mm/ref.html), long-clicking or control-clicking will reveal the actual numeric values of visualized elements.) -* Bear in mind that since we're in a middle layer (5), the input to this attention head is an intermediate representation, not the original tokenized text. 
So the [patterns seen in the input](https://bhosmer.github.io/mm/index.html?0=out+%3D+%28attn+%3D+%28Q+%3D+input+%40+wQ%29+%40+%28K_t+%3D+wK_t+%40+input_t%29%29+%40+%28V+%3D+input+%40+wV%29+%40+wO&1=out&2=none&26=closed&84=true&3.1=attn+%40+V&3.4=true&3.5=32&3.6=32&3.7=row+major&3.8=&3.9=-1&3.10=1&3.11=0&3.2=none&12.13=vmprod&14.15=1&14.16=1&14.17=1&18.19=positive&18.20=left&18.21=bottom&18.22=back&23.1=attn&23.4=true&23.2=softmax%28tril%28x%2Fsqrt%28k%29%29%29&24.1=Q&24.4=true&24.2=none&25.1=input&25.4=false&25.5=256&25.6=768&25.7=url&25.9=-1&25.10=1&25.11=0&25.8=https%3A%2F%2Fraw.githubusercontent.com%2Fbhosmer%2Ftestdata%2Fmain%2Fweights%2Fgpt2%2Flayer5_input0_256_768.csv&25.0=&25.26=open&27.1=wQ&27.4=false&27.5=768&27.6=64&27.7=url&27.9=-1&27.10=1&27.11=0&27.8=https%3A%2F%2Fraw.githubusercontent.com%2Fbhosmer%2Ftestdata%2Fmain%2Fweights%2Fgpt2%2Flayer5_wq4_768_64.csv&27.0=&28.13=vmprod&29.15=1&29.16=1&29.17=1&30.19=positive&30.20=left&30.21=bottom&30.22=back&24.26=open&31.2=none&32.13=none&33.15=1&33.16=1&33.17=1&34.19=positive&34.20=right&34.21=top&34.22=back&35.1=wK_t&35.4=false&35.5=64&35.6=768&35.7=url&35.8=https%3A%2F%2Fraw.githubusercontent.com%2Fbhosmer%2Ftestdata%2Fmain%2Fweights%2Fgpt2%2Flayer5_wk_t4_64_768.csv&35.9=-1&35.10=1&35.11=0&35.0=&36.1=input_t&36.4=false&36.5=768&36.6=256&36.7=url&36.8=https%3A%2F%2Fraw.githubusercontent.com%2Fbhosmer%2Ftestdata%2Fmain%2Fweights%2Fgpt2%2Flayer5_input_t0_768_256.csv&36.9=-1&36.10=1&36.11=0&36.0=&31.1=K_t&31.4=true&37.13=vmprod&38.15=1&38.16=1&38.17=1&39.19=negative&39.20=left&39.21=top&39.22=front&23.26=open&40.1=V&40.4=true&40.2=none&41.1=input&41.4=false&41.5=256&41.6=768&41.7=url&41.9=-1&41.10=1&41.11=0&41.8=https%3A%2F%2Fraw.githubusercontent.com%2Fbhosmer%2Ftestdata%2Fmain%2Fweights%2Fgpt2%2Flayer5_input0_256_768.csv&41.0=&42.1=wV&42.4=false&42.5=768&42.6=64&42.7=url&42.9=-1&42.10=1&42.11=0&42.8=https%3A%2F%2Fraw.githubusercontent.com%2Fbhosmer%2Ftestdata%2Fmain%2Fweights%2Fgpt2%2Flayer5_wv4_768_64.csv&42.0=&43.13=none&44.15=1&44.16=1&44.17=1&45.19=negative&45.20=right&45.21=top&45.22=back&3.26=open&46.1=wO&46.4=false&46.5=64&46.6=768&46.7=url&46.8=https%3A%2F%2Fraw.githubusercontent.com%2Fbhosmer%2Ftestdata%2Fmain%2Fweights%2Fgpt2%2Flayer5_wo4_64_768.csv&46.9=-1&46.10=1&46.11=0&46.0=&47.48=sync&47.49=16&47.50=false&47.13=none&47.51=0&47.26=open&52.16=1&52.15=1&52.17=1&53.54=blocks&53.55=24&53.56=0&53.57=1&53.58=0&53.19=negative&53.20=left&53.21=top&53.22=front&59.60=10&59.61=true&59.62=5&59.63=0.394&59.64=0&59.65=0.25&59.66=4.632&59.67=false&59.68=false&59.26=open&69.70=local&69.71=0.2&69.72=0.4&69.73=0.9&69.74=2&69.75=0.75&69.76=0.75&69.77=0.03&69.26=open&78.8=&78.26=open&79.80=-1126.8641673236093&79.81=-4.707283693510895&79.82=168.0669807860928&83.80=-692.9006907132649&83.81=4.068470706235418&83.82=-171.27561837707958&expr=0&name=1&epilog=2&left=3&matmul=4&h=5&w=6&init=7&url=8&min=9&max=10&dropout=11&left.anim=12&alg=13&left.block=14&k+blocks=15&i+blocks=16&j+blocks=17&left.layout=18&polarity=19&left+placement=20&right+placement=21&result+placement=22&left.left=23&left.left.left=24&left.left.left.left=25&folder=26&left.left.left.right=27&left.left.left.anim=28&left.left.left.block=29&left.left.left.layout=30&left.left.right=31&left.left.right.anim=32&left.left.right.block=33&left.left.right.layout=34&left.left.right.left=35&left.left.right.right=36&left.left.anim=37&left.left.block=38&left.left.layout=39&left.right=40&left.right.left=41&left.right.right=42&left.right.anim=43&left.right.block=44&left.right.layout=45&right=46&ani
m=47&fuse=48&speed=49&hide+inputs=50&spin=51&block=52&layout=53&scheme=54&gap=55&scatter=56&molecule=57&blast=58&deco=59&legends=60&shape=61&spotlight=62&row+guides=63&flow+guides=64&lens+size=65&magnification=66&interior+spotlight=67&axes=68&viz=69&sensitivity=70&min+size=71&min+light=72&max+light=73&elem+scale=74&zero+hue=75&hue+gap=76&hue+spread=77&diag=78&cam=79&x=80&y=81&z=82&cam.target=83&compress=84) are themselves thought-provoking - in particular, the strong vertical threads are particular embedding positions whose values are uniformly high magnitude across long stretches of the sequence - sometimes almost the entire thing. -* Interestingly, though, the [first vector in the input sequence is distinctive](https://bhosmer.github.io/mm/index.html?0=out%20%3D%20(attn%20%3D%20(Q%20%3D%20input%20%40%20wQ)%20%40%20(K_t%20%3D%20wK_t%20%40%20input_t))%20%40%20(V%20%3D%20input%20%40%20wV)%20%40%20wO&1=out&2=none&24=closed&84=true&3.1=attn%20%40%20V&3.4=true&3.5=32&3.6=32&3.7=row%20major&3.8=&3.9=-1&3.10=1&3.11=0&3.2=none&12.13=vmprod&14.15=1&16.17=positive&16.18=left&16.19=bottom&16.20=back&21.1=attn&21.4=true&21.2=softmax(tril(x%2Fsqrt(k)))&22.1=Q&22.4=true&22.2=none&23.1=input&23.4=false&23.5=256&23.6=768&23.7=url&23.9=-1&23.10=1&23.11=0&23.8=https%3A%2F%2Fraw.githubusercontent.com%2Fbhosmer%2Ftestdata%2Fmain%2Fweights%2Fgpt2%2Flayer5_input0_256_768.csv&23.0=&23.24=open&25.1=wQ&25.4=false&25.5=768&25.6=64&25.7=url&25.9=-1&25.10=1&25.11=0&25.8=https%3A%2F%2Fraw.githubusercontent.com%2Fbhosmer%2Ftestdata%2Fmain%2Fweights%2Fgpt2%2Flayer5_wq4_768_64.csv&25.0=&26.13=vmprod&27.15=1&28.17=positive&28.18=left&28.19=bottom&28.20=back&22.24=open&29.2=none&30.13=none&31.15=1&32.17=positive&32.18=right&32.19=top&32.20=back&33.1=wK_t&33.4=false&33.5=64&33.6=768&33.7=url&33.8=https%3A%2F%2Fraw.githubusercontent.com%2Fbhosmer%2Ftestdata%2Fmain%2Fweights%2Fgpt2%2Flayer5_wk_t4_64_768.csv&33.9=-1&33.10=1&33.11=0&33.0=&34.1=input_t&34.4=false&34.5=768&34.6=256&34.7=url&34.8=https%3A%2F%2Fraw.githubusercontent.com%2Fbhosmer%2Ftestdata%2Fmain%2Fweights%2Fgpt2%2Flayer5_input_t0_768_256.csv&34.9=-1&34.10=1&34.11=0&34.0=&29.1=K_t&29.4=true&35.13=vmprod&36.15=1&37.17=negative&37.18=left&37.19=top&37.20=front&21.24=open&38.1=V&38.4=true&38.2=none&39.1=input&39.4=false&39.5=256&39.6=768&39.7=url&39.9=-1&39.10=1&39.11=0&39.8=https%3A%2F%2Fraw.githubusercontent.com%2Fbhosmer%2Ftestdata%2Fmain%2Fweights%2Fgpt2%2Flayer5_input0_256_768.csv&39.0=&40.1=wV&40.4=false&40.5=768&40.6=64&40.7=url&40.9=-1&40.10=1&40.11=0&40.8=https%3A%2F%2Fraw.githubusercontent.com%2Fbhosmer%2Ftestdata%2Fmain%2Fweights%2Fgpt2%2Flayer5_wv4_768_64.csv&40.0=&41.13=none&42.15=1&43.17=negative&43.18=right&43.19=top&43.20=back&3.24=open&44.1=wO&44.4=false&44.5=64&44.6=768&44.7=url&44.8=https%3A%2F%2Fraw.githubusercontent.com%2Fbhosmer%2Ftestdata%2Fmain%2Fweights%2Fgpt2%2Flayer5_wo4_64_768.csv&44.9=-1&44.10=1&44.11=0&44.0=&45.46=sync&45.47=16&45.48=false&45.13=none&45.49=0&45.24=open&50.51=1&50.52=1&50.15=1&53.54=blocks&53.55=24&53.56=0&53.57=1&53.58=0&53.17=negative&53.18=left&53.19=top&53.20=front&59.60=10&59.61=true&59.62=5&59.63=0.394&59.64=0&59.65=0&59.66=4.632&59.67=false&59.68=false&59.24=open&69.70=local&69.71=0.2&69.72=0.4&69.73=0.9&69.74=2&69.75=0.75&69.76=0.75&69.77=0.03&69.24=open&78.8=&78.24=open&79.80=-905.2149526505231&79.81=126.10717525695773&79.82=-90.1644865901155&83.80=-739.2766627330938&83.81=125.47333863007341&83.82=-229.39828071999955&expr=0&name=1&epilog=2&left=3&matmul=4&h=5&w=6&init=7&url=8&min=9&max=10&dropout=11&left.anim=12&
alg=13&left.block=14&k%20blocks=15&left.layout=16&polarity=17&left%20placement=18&right%20placement=19&result%20placement=20&left.left=21&left.left.left=22&left.left.left.left=23&folder=24&left.left.left.right=25&left.left.left.anim=26&left.left.left.block=27&left.left.left.layout=28&left.left.right=29&left.left.right.anim=30&left.left.right.block=31&left.left.right.layout=32&left.left.right.left=33&left.left.right.right=34&left.left.anim=35&left.left.block=36&left.left.layout=37&left.right=38&left.right.left=39&left.right.right=40&left.right.anim=41&left.right.block=42&left.right.layout=43&right=44&anim=45&fuse=46&speed=47&hide%20inputs=48&spin=49&block=50&i%20blocks=51&j%20blocks=52&layout=53&scheme=54&gap=55&scatter=56&molecule=57&blast=58&deco=59&legends=60&shape=61&spotlight=62&row%20guides=63&flow%20guides=64&lens%20size=65&magnification=66&interior%20spotlight=67&axes=68&viz=69&sensitivity=70&min%20size=71&min%20light=72&max%20light=73&elem%20scale=74&zero%20hue=75&hue%20gap=76&hue%20spread=77&diag=78&cam=79&x=80&y=81&z=82&cam.target=83&compress=84), not only breaking the pattern of these high-magnitude columns but carrying atypical values at almost every position (aside: not visualized here, but this pattern is repeated over multiple sample inputs). - -Note: apropos of the last two bullet points, it's worth reiterating that we're visualizing computation over a _single sample input_. In practice I've found that each head has a characteristic pattern it will express consistently (though not identically) over a decent collection of samples (and the upcoming attention head browser will provide a collection of samples to play with), but when looking at any visualization that includes activations, it's important to bear in mind that a full distribution of inputs may influence the ideas and intuitions it provokes in subtle ways. 
- -Finally, one more pitch to [explore the animation directly](https://bhosmer.github.io/mm/index.html?0=out%20%3D%20(attn%20%3D%20(Q%20%3D%20input_0_5%20%40%20wQ_5_4)%20%40%20(K_t%20%3D%20wK_t_5_4%20%40%20input_t_0_5))%20%40%20(V%20%3D%20input_0_5%20%40%20wV_5_4)%20%40%20wO_5_4&1=out&2=none&49=closed&84=true&3.1=attn%20%40%20V&3.4=true&3.5=32&3.6=32&3.7=row%20major&3.8=&3.9=-1&3.10=1&3.11=0&3.2=none&12.13=vmprod&14.15=1&16.17=positive&16.18=left&16.19=bottom&16.20=back&21.1=attn&21.4=true&21.2=softmax(tril(x%2Fsqrt(k)))&22.1=Q&22.4=true&22.2=none&23.1=input_0_5&23.4=false&23.5=256&23.6=768&23.7=url&23.9=-1&23.10=1&23.11=0&23.8=https%3A%2F%2Fraw.githubusercontent.com%2Fbhosmer%2Ftestdata%2Fmain%2Fweights%2Fgpt2%2Flayer5_input0_256_768.csv&23.0=&24.1=wQ_5_4&24.4=false&24.5=768&24.6=64&24.7=url&24.9=-1&24.10=1&24.11=0&24.8=https%3A%2F%2Fraw.githubusercontent.com%2Fbhosmer%2Ftestdata%2Fmain%2Fweights%2Fgpt2%2Flayer5_wq4_768_64.csv&24.0=&25.13=vmprod&26.15=1&27.17=positive&27.18=left&27.19=bottom&27.20=back&28.2=none&29.13=none&30.15=1&31.17=positive&31.18=right&31.19=top&31.20=back&32.1=wK_t_5_4&32.4=false&32.5=64&32.6=768&32.7=url&32.8=https%3A%2F%2Fraw.githubusercontent.com%2Fbhosmer%2Ftestdata%2Fmain%2Fweights%2Fgpt2%2Flayer5_wk_t4_64_768.csv&32.9=-1&32.10=1&32.11=0&32.0=&33.1=input_t_0_5&33.4=false&33.5=768&33.6=256&33.7=url&33.8=https%3A%2F%2Fraw.githubusercontent.com%2Fbhosmer%2Ftestdata%2Fmain%2Fweights%2Fgpt2%2Flayer5_input_t0_768_256.csv&33.9=-1&33.10=1&33.11=0&33.0=&28.1=K_t&28.4=true&34.13=vmprod&35.15=1&36.17=negative&36.18=left&36.19=top&36.20=front&37.1=V&37.4=true&37.2=none&38.1=input_0_5&38.4=false&38.5=256&38.6=768&38.7=url&38.9=-1&38.10=1&38.11=0&38.8=https%3A%2F%2Fraw.githubusercontent.com%2Fbhosmer%2Ftestdata%2Fmain%2Fweights%2Fgpt2%2Flayer5_input0_256_768.csv&38.0=&39.1=wV_5_4&39.4=false&39.5=768&39.6=64&39.7=url&39.9=-1&39.10=1&39.11=0&39.8=https%3A%2F%2Fraw.githubusercontent.com%2Fbhosmer%2Ftestdata%2Fmain%2Fweights%2Fgpt2%2Flayer5_wv4_768_64.csv&39.0=&40.13=none&41.15=1&42.17=negative&42.18=right&42.19=top&42.20=back&43.1=wO_5_4&43.4=false&43.5=64&43.6=768&43.7=url&43.8=https%3A%2F%2Fraw.githubusercontent.com%2Fbhosmer%2Ftestdata%2Fmain%2Fweights%2Fgpt2%2Flayer5_wo4_64_768.csv&43.9=-1&43.10=1&43.11=0&43.0=&44.45=sync&44.46=16&44.47=false&44.13=vmprod&44.48=0&44.49=open&50.51=1&50.52=1&50.15=1&53.54=blocks&53.55=24&53.56=0&53.57=1&53.58=0&53.17=negative&53.18=left&53.19=top&53.20=front&59.60=8.38&59.61=true&59.62=4&59.63=0.394&59.64=0&59.65=0.5&59.66=12&59.67=false&59.68=false&59.49=open&69.70=local&69.71=0.05&69.72=0.2&69.73=0.9&69.74=2&69.75=0.75&69.76=0.75&69.77=0.03&78.8=&79.80=-382.8684269325278&79.81=293.7591554956184&79.82=395.95878922315694&83.80=-14.023727291338966&83.81=-38.22974037070054&83.82=-84.10726407282482&expr=0&name=1&epilog=2&left=3&matmul=4&h=5&w=6&init=7&url=8&min=9&max=10&dropout=11&left.anim=12&alg=13&left.block=14&k%20blocks=15&left.layout=16&polarity=17&left%20placement=18&right%20placement=19&result%20placement=20&left.left=21&left.left.left=22&left.left.left.left=23&left.left.left.right=24&left.left.left.anim=25&left.left.left.block=26&left.left.left.layout=27&left.left.right=28&left.left.right.anim=29&left.left.right.block=30&left.left.right.layout=31&left.left.right.left=32&left.left.right.right=33&left.left.anim=34&left.left.block=35&left.left.layout=36&left.right=37&left.right.left=38&left.right.right=39&left.right.anim=40&left.right.block=41&left.right.layout=42&right=43&anim=44&fuse=45&speed=46&hide%20inputs=47&spin=48&folder=49&block=50
&i%20blocks=51&j%20blocks=52&layout=53&scheme=54&gap=55&scatter=56&molecule=57&blast=58&deco=59&legends=60&shape=61&spotlight=62&row%20guides=63&flow%20guides=64&lens%20size=65&magnification=66&interior%20spotlight=67&axes=68&viz=69&sensitivity=70&min%20size=71&min%20light=72&max%20light=73&elem%20scale=74&zero%20hue=75&hue%20gap=76&hue%20spread=77&diag=78&cam=79&x=80&y=81&z=82&cam.target=83&compress=84)! - - -### 4c Heads are different in interesting ways - -Before we move on, here's one more demonstration of the usefulness of simply poking around a model to see how it works in detail. - -This is another attention head from GPT2. It behaves quite differently from layer 5, head 4 above - as one might expect, given that it's in a very different part of the model. This head is in the very first layer: layer 0, head 2 ([open in mm](https://bhosmer.github.io/mm/index.html?0=out%20%3D%20(attn%20%3D%20(Q%20%3D%20input_0_0%20%40%20wQ_0_2)%20%40%20(K_t%20%3D%20wK_t_0_2%20%40%20input_t_0_0))%20%40%20(V%20%3D%20input_0_0%20%40%20wV_0_2)%20%40%20wO_0_2&1=out&2=none&49=closed&84=true&3.1=attn%20%40%20V&3.4=true&3.5=32&3.6=32&3.7=row%20major&3.8=&3.9=-1&3.10=1&3.11=0&3.2=none&12.13=vmprod&14.15=1&16.17=positive&16.18=left&16.19=bottom&16.20=back&21.1=attn&21.4=true&21.2=softmax(tril(x%2Fsqrt(k)))&22.1=Q&22.4=true&22.2=none&23.1=input_0_0&23.4=false&23.5=256&23.6=768&23.7=url&23.9=-1&23.10=1&23.11=0&23.8=https%3A%2F%2Fraw.githubusercontent.com%2Fbhosmer%2Ftestdata%2Fmain%2Fweights%2Fgpt2%2Flayer0_input0_256_768.csv&23.0=&24.1=wQ_0_2&24.4=false&24.5=768&24.6=64&24.7=url&24.9=-1&24.10=1&24.11=0&24.8=https%3A%2F%2Fraw.githubusercontent.com%2Fbhosmer%2Ftestdata%2Fmain%2Fweights%2Fgpt2%2Flayer0_wq2_768_64.csv&24.0=&25.13=none&26.15=1&27.17=positive&27.18=left&27.19=bottom&27.20=back&28.2=none&29.13=none&30.15=1&31.17=positive&31.18=right&31.19=top&31.20=back&32.1=wK_t_0_2&32.4=false&32.5=64&32.6=768&32.7=url&32.8=https%3A%2F%2Fraw.githubusercontent.com%2Fbhosmer%2Ftestdata%2Fmain%2Fweights%2Fgpt2%2Flayer0_wk_t2_64_768.csv&32.9=-1&32.10=1&32.11=0&32.0=&33.1=input_t_0_0&33.4=false&33.5=768&33.6=256&33.7=url&33.8=https%3A%2F%2Fraw.githubusercontent.com%2Fbhosmer%2Ftestdata%2Fmain%2Fweights%2Fgpt2%2Flayer0_input_t0_768_256.csv&33.9=-1&33.10=1&33.11=0&33.0=&28.1=K_t&28.4=true&34.13=none&35.15=1&36.17=negative&36.18=left&36.19=top&36.20=front&37.1=V&37.4=true&37.2=none&38.1=input_0_0&38.4=false&38.5=256&38.6=768&38.7=url&38.9=-1&38.10=1&38.11=0&38.8=https%3A%2F%2Fraw.githubusercontent.com%2Fbhosmer%2Ftestdata%2Fmain%2Fweights%2Fgpt2%2Flayer0_input0_256_768.csv&38.0=&39.1=wV_0_2&39.4=false&39.5=768&39.6=64&39.7=url&39.9=-1&39.10=1&39.11=0&39.8=https%3A%2F%2Fraw.githubusercontent.com%2Fbhosmer%2Ftestdata%2Fmain%2Fweights%2Fgpt2%2Flayer0_wv2_768_64.csv&39.0=&40.13=none&41.15=1&42.17=negative&42.18=right&42.19=top&42.20=back&43.1=wO_0_2&43.4=false&43.5=64&43.6=768&43.7=url&43.8=https%3A%2F%2Fraw.githubusercontent.com%2Fbhosmer%2Ftestdata%2Fmain%2Fweights%2Fgpt2%2Flayer0_wo2_64_768.csv&43.9=-1&43.10=1&43.11=0&43.0=&44.45=sync&44.46=16&44.47=false&44.13=none&44.48=0&44.49=open&50.51=1&50.52=1&50.15=1&53.54=blocks&53.55=24&53.56=0&53.57=1&53.58=0&53.17=negative&53.18=left&53.19=top&53.20=front&59.60=6&59.61=true&59.62=4&59.63=0.73&59.64=0&59.65=0.5&59.66=12&59.67=false&59.68=false&59.49=open&69.70=local&69.71=0.2&69.72=0.2&69.73=0.9&69.74=2&69.75=0.75&69.76=0.75&69.77=0.03&69.49=open&78.8=&78.49=open&79.80=-217.09372134188362&79.81=412.82010718887307&79.82=523.3596617096426&83.80=127.59196458710655&83.81=35.32022663933653
&83.82=87.43354119148215&expr=0&name=1&epilog=2&left=3&matmul=4&h=5&w=6&init=7&url=8&min=9&max=10&dropout=11&left.anim=12&alg=13&left.block=14&k%20blocks=15&left.layout=16&polarity=17&left%20placement=18&right%20placement=19&result%20placement=20&left.left=21&left.left.left=22&left.left.left.left=23&left.left.left.right=24&left.left.left.anim=25&left.left.left.block=26&left.left.left.layout=27&left.left.right=28&left.left.right.anim=29&left.left.right.block=30&left.left.right.layout=31&left.left.right.left=32&left.left.right.right=33&left.left.anim=34&left.left.block=35&left.left.layout=36&left.right=37&left.right.left=38&left.right.right=39&left.right.anim=40&left.right.block=41&left.right.layout=42&right=43&anim=44&fuse=45&speed=46&hide%20inputs=47&spin=48&folder=49&block=50&i%20blocks=51&j%20blocks=52&layout=53&scheme=54&gap=55&scatter=56&molecule=57&blast=58&deco=59&legends=60&shape=61&spotlight=62&row%20guides=63&flow%20guides=64&lens%20size=65&magnification=66&interior%20spotlight=67&axes=68&viz=69&sensitivity=70&min%20size=71&min%20light=72&max%20light=73&elem%20scale=74&zero%20hue=75&hue%20gap=76&hue%20spread=77&diag=78&cam=79&x=80&y=81&z=82&cam.target=83&compress=84), may take a few seconds to load model weights): - - -![This is another attention head from GPT2](/assets/images/inside-the-matrix/gpt2_0_2c.jpg){:style="width:100%"} - - - -Things to note: - - - -* This head spreads attention very evenly. This has the effect of delivering a relatively _unweighted_ average of `V` (or rather, the appropriate causal prefix of `V`) to each row in `attn @ V`, as can be seen in [this animation](https://bhosmer.github.io/mm/index.html?0=out%20%3D%20(attn%20%3D%20(Q%20%3D%20input_0_0%20%40%20wQ_0_2)%20%40%20(K_t%20%3D%20wK_t_0_2%20%40%20input_t_0_0))%20%40%20(V%20%3D%20input_0_0%20%40%20wV_0_2)%20%40%20wO_0_2&1=out&2=none&49=closed&84=true&3.1=attn%20%40%20V&3.4=true&3.5=32&3.6=32&3.7=row%20major&3.8=&3.9=-1&3.10=1&3.11=0&3.2=none&12.13=vmprod&14.15=1&16.17=positive&16.18=left&16.19=bottom&16.20=back&21.1=attn&21.4=true&21.2=softmax(tril(x%2Fsqrt(k)))&22.1=Q&22.4=true&22.2=none&23.1=input_0_0&23.4=false&23.5=256&23.6=768&23.7=url&23.9=-1&23.10=1&23.11=0&23.8=https%3A%2F%2Fraw.githubusercontent.com%2Fbhosmer%2Ftestdata%2Fmain%2Fweights%2Fgpt2%2Flayer0_input0_256_768.csv&23.0=&24.1=wQ_0_2&24.4=false&24.5=768&24.6=64&24.7=url&24.9=-1&24.10=1&24.11=0&24.8=https%3A%2F%2Fraw.githubusercontent.com%2Fbhosmer%2Ftestdata%2Fmain%2Fweights%2Fgpt2%2Flayer0_wq2_768_64.csv&24.0=&25.13=none&26.15=1&27.17=positive&27.18=left&27.19=bottom&27.20=back&28.2=none&29.13=none&30.15=1&31.17=positive&31.18=right&31.19=top&31.20=back&32.1=wK_t_0_2&32.4=false&32.5=64&32.6=768&32.7=url&32.8=https%3A%2F%2Fraw.githubusercontent.com%2Fbhosmer%2Ftestdata%2Fmain%2Fweights%2Fgpt2%2Flayer0_wk_t2_64_768.csv&32.9=-1&32.10=1&32.11=0&32.0=&33.1=input_t_0_0&33.4=false&33.5=768&33.6=256&33.7=url&33.8=https%3A%2F%2Fraw.githubusercontent.com%2Fbhosmer%2Ftestdata%2Fmain%2Fweights%2Fgpt2%2Flayer0_input_t0_768_256.csv&33.9=-1&33.10=1&33.11=0&33.0=&28.1=K_t&28.4=true&34.13=none&35.15=1&36.17=negative&36.18=left&36.19=top&36.20=front&37.1=V&37.4=true&37.2=none&38.1=input_0_0&38.4=false&38.5=256&38.6=768&38.7=url&38.9=-1&38.10=1&38.11=0&38.8=https%3A%2F%2Fraw.githubusercontent.com%2Fbhosmer%2Ftestdata%2Fmain%2Fweights%2Fgpt2%2Flayer0_input0_256_768.csv&38.0=&39.1=wV_0_2&39.4=false&39.5=768&39.6=64&39.7=url&39.9=-1&39.10=1&39.11=0&39.8=https%3A%2F%2Fraw.githubusercontent.com%2Fbhosmer%2Ftestdata%2Fmain%2Fweights%2Fgpt2%2Flayer0_wv2_768_64.csv&
39.0=&40.13=none&41.15=1&42.17=negative&42.18=right&42.19=top&42.20=back&43.1=wO_0_2&43.4=false&43.5=64&43.6=768&43.7=url&43.8=https%3A%2F%2Fraw.githubusercontent.com%2Fbhosmer%2Ftestdata%2Fmain%2Fweights%2Fgpt2%2Flayer0_wo2_64_768.csv&43.9=-1&43.10=1&43.11=0&43.0=&44.45=sync&44.46=16&44.47=false&44.13=vmprod&44.48=0&44.49=open&50.51=1&50.52=1&50.15=1&53.54=blocks&53.55=24&53.56=0&53.57=1&53.58=0&53.17=negative&53.18=left&53.19=top&53.20=front&59.60=6&59.61=true&59.62=4&59.63=0.73&59.64=0&59.65=0.5&59.66=12&59.67=false&59.68=false&59.49=closed&69.70=local&69.71=0.4&69.72=0.4&69.73=0.9&69.74=2&69.75=0.75&69.76=0.75&69.77=0.03&69.49=open&78.8=&78.49=closed&79.80=11.34872888812131&79.81=324.07536950158396&79.82=239.8893041928473&83.80=11.804686909150822&83.81=25.33948904441141&83.82=46.896270190786204&expr=0&name=1&epilog=2&left=3&matmul=4&h=5&w=6&init=7&url=8&min=9&max=10&dropout=11&left.anim=12&alg=13&left.block=14&k%20blocks=15&left.layout=16&polarity=17&left%20placement=18&right%20placement=19&result%20placement=20&left.left=21&left.left.left=22&left.left.left.left=23&left.left.left.right=24&left.left.left.anim=25&left.left.left.block=26&left.left.left.layout=27&left.left.right=28&left.left.right.anim=29&left.left.right.block=30&left.left.right.layout=31&left.left.right.left=32&left.left.right.right=33&left.left.anim=34&left.left.block=35&left.left.layout=36&left.right=37&left.right.left=38&left.right.right=39&left.right.anim=40&left.right.block=41&left.right.layout=42&right=43&anim=44&fuse=45&speed=46&hide%20inputs=47&spin=48&folder=49&block=50&i%20blocks=51&j%20blocks=52&layout=53&scheme=54&gap=55&scatter=56&molecule=57&blast=58&deco=59&legends=60&shape=61&spotlight=62&row%20guides=63&flow%20guides=64&lens%20size=65&magnification=66&interior%20spotlight=67&axes=68&viz=69&sensitivity=70&min%20size=71&min%20light=72&max%20light=73&elem%20scale=74&zero%20hue=75&hue%20gap=76&hue%20spread=77&diag=78&cam=79&x=80&y=81&z=82&cam.target=83&compress=84): as we move down the attention score triangle, the `attn[i] @ V` vector-matrix product is small fluctuations away from being simply a downscaled, progressively revealed copy of `V`. 
-* `attn @ V` has [striking vertical uniformity](https://bhosmer.github.io/mm/index.html?0=out%20%3D%20(attn%20%3D%20(Q%20%3D%20input_0_0%20%40%20wQ_0_2)%20%40%20(K_t%20%3D%20wK_t_0_2%20%40%20input_t_0_0))%20%40%20(V%20%3D%20input_0_0%20%40%20wV_0_2)%20%40%20wO_0_2&1=out&2=none&49=closed&84=true&3.1=attn%20%40%20V&3.4=true&3.5=32&3.6=32&3.7=row%20major&3.8=&3.9=-1&3.10=1&3.11=0&3.2=none&12.13=vmprod&14.15=1&16.17=positive&16.18=left&16.19=bottom&16.20=back&21.1=attn&21.4=true&21.2=softmax(tril(x%2Fsqrt(k)))&22.1=Q&22.4=true&22.2=none&23.1=input_0_0&23.4=false&23.5=256&23.6=768&23.7=url&23.9=-1&23.10=1&23.11=0&23.8=https%3A%2F%2Fraw.githubusercontent.com%2Fbhosmer%2Ftestdata%2Fmain%2Fweights%2Fgpt2%2Flayer0_input0_256_768.csv&23.0=&24.1=wQ_0_2&24.4=false&24.5=768&24.6=64&24.7=url&24.9=-1&24.10=1&24.11=0&24.8=https%3A%2F%2Fraw.githubusercontent.com%2Fbhosmer%2Ftestdata%2Fmain%2Fweights%2Fgpt2%2Flayer0_wq2_768_64.csv&24.0=&25.13=none&26.15=1&27.17=positive&27.18=left&27.19=bottom&27.20=back&28.2=none&29.13=none&30.15=1&31.17=positive&31.18=right&31.19=top&31.20=back&32.1=wK_t_0_2&32.4=false&32.5=64&32.6=768&32.7=url&32.8=https%3A%2F%2Fraw.githubusercontent.com%2Fbhosmer%2Ftestdata%2Fmain%2Fweights%2Fgpt2%2Flayer0_wk_t2_64_768.csv&32.9=-1&32.10=1&32.11=0&32.0=&33.1=input_t_0_0&33.4=false&33.5=768&33.6=256&33.7=url&33.8=https%3A%2F%2Fraw.githubusercontent.com%2Fbhosmer%2Ftestdata%2Fmain%2Fweights%2Fgpt2%2Flayer0_input_t0_768_256.csv&33.9=-1&33.10=1&33.11=0&33.0=&28.1=K_t&28.4=true&34.13=none&35.15=1&36.17=negative&36.18=left&36.19=top&36.20=front&37.1=V&37.4=true&37.2=none&38.1=input_0_0&38.4=false&38.5=256&38.6=768&38.7=url&38.9=-1&38.10=1&38.11=0&38.8=https%3A%2F%2Fraw.githubusercontent.com%2Fbhosmer%2Ftestdata%2Fmain%2Fweights%2Fgpt2%2Flayer0_input0_256_768.csv&38.0=&39.1=wV_0_2&39.4=false&39.5=768&39.6=64&39.7=url&39.9=-1&39.10=1&39.11=0&39.8=https%3A%2F%2Fraw.githubusercontent.com%2Fbhosmer%2Ftestdata%2Fmain%2Fweights%2Fgpt2%2Flayer0_wv2_768_64.csv&39.0=&40.13=none&41.15=1&42.17=negative&42.18=right&42.19=top&42.20=back&43.1=wO_0_2&43.4=false&43.5=64&43.6=768&43.7=url&43.8=https%3A%2F%2Fraw.githubusercontent.com%2Fbhosmer%2Ftestdata%2Fmain%2Fweights%2Fgpt2%2Flayer0_wo2_64_768.csv&43.9=-1&43.10=1&43.11=0&43.0=&44.45=sync&44.46=16&44.47=false&44.13=none&44.48=0&44.49=open&50.51=1&50.52=1&50.15=1&53.54=blocks&53.55=24&53.56=0&53.57=1&53.58=0&53.17=negative&53.18=left&53.19=top&53.20=front&59.60=6&59.61=true&59.62=4&59.63=0.73&59.64=0&59.65=0.5&59.66=12&59.67=false&59.68=false&59.49=open&69.70=local&69.71=0.2&69.72=0.2&69.73=0.9&69.74=2&69.75=0.75&69.76=0.75&69.77=0.03&69.49=open&78.8=&78.49=open&79.80=-152.24249732960024&79.81=115.78244265148294&79.82=89.29496035154&83.80=20.231661185991296&83.81=61.75722293832386&83.82=52.45120329048098&expr=0&name=1&epilog=2&left=3&matmul=4&h=5&w=6&init=7&url=8&min=9&max=10&dropout=11&left.anim=12&alg=13&left.block=14&k%20blocks=15&left.layout=16&polarity=17&left%20placement=18&right%20placement=19&result%20placement=20&left.left=21&left.left.left=22&left.left.left.left=23&left.left.left.right=24&left.left.left.anim=25&left.left.left.block=26&left.left.left.layout=27&left.left.right=28&left.left.right.anim=29&left.left.right.block=30&left.left.right.layout=31&left.left.right.left=32&left.left.right.right=33&left.left.anim=34&left.left.block=35&left.left.layout=36&left.right=37&left.right.left=38&left.right.right=39&left.right.anim=40&left.right.block=41&left.right.layout=42&right=43&anim=44&fuse=45&speed=46&hide%20inputs=47&spin=48&folder=49&block=50&i%20blo
cks=51&j%20blocks=52&layout=53&scheme=54&gap=55&scatter=56&molecule=57&blast=58&deco=59&legends=60&shape=61&spotlight=62&row%20guides=63&flow%20guides=64&lens%20size=65&magnification=66&interior%20spotlight=67&axes=68&viz=69&sensitivity=70&min%20size=71&min%20light=72&max%20light=73&elem%20scale=74&zero%20hue=75&hue%20gap=76&hue%20spread=77&diag=78&cam=79&x=80&y=81&z=82&cam.target=83&compress=84) - in large columnar regions of the embedding, the same value patterns persist over _the entire sequence_. One can think of these as properties shared by every token. -* Aside: on the one hand one might expect _some_ uniformity in `attn @ V` given the effect of very evenly spread attention. But each row has been constructed from only a causal subsequence of `V` rather than the whole thing - why is that not causing more variation, like a progressive morphing as one moves down the sequence? [By visual inspection V isn't uniform along its length](https://bhosmer.github.io/mm/index.html?0=out%20%3D%20(attn%20%3D%20(Q%20%3D%20input_0_0%20%40%20wQ_0_2)%20%40%20(K_t%20%3D%20wK_t_0_2%20%40%20input_t_0_0))%20%40%20(V%20%3D%20input_0_0%20%40%20wV_0_2)%20%40%20wO_0_2&1=out&2=none&49=closed&84=true&3.1=attn%20%40%20V&3.4=true&3.5=32&3.6=32&3.7=row%20major&3.8=&3.9=-1&3.10=1&3.11=0&3.2=none&12.13=vmprod&14.15=1&16.17=positive&16.18=left&16.19=bottom&16.20=back&21.1=attn&21.4=true&21.2=softmax(tril(x%2Fsqrt(k)))&22.1=Q&22.4=true&22.2=none&23.1=input_0_0&23.4=false&23.5=256&23.6=768&23.7=url&23.9=-1&23.10=1&23.11=0&23.8=https%3A%2F%2Fraw.githubusercontent.com%2Fbhosmer%2Ftestdata%2Fmain%2Fweights%2Fgpt2%2Flayer0_input0_256_768.csv&23.0=&24.1=wQ_0_2&24.4=false&24.5=768&24.6=64&24.7=url&24.9=-1&24.10=1&24.11=0&24.8=https%3A%2F%2Fraw.githubusercontent.com%2Fbhosmer%2Ftestdata%2Fmain%2Fweights%2Fgpt2%2Flayer0_wq2_768_64.csv&24.0=&25.13=none&26.15=1&27.17=positive&27.18=left&27.19=bottom&27.20=back&28.2=none&29.13=none&30.15=1&31.17=positive&31.18=right&31.19=top&31.20=back&32.1=wK_t_0_2&32.4=false&32.5=64&32.6=768&32.7=url&32.8=https%3A%2F%2Fraw.githubusercontent.com%2Fbhosmer%2Ftestdata%2Fmain%2Fweights%2Fgpt2%2Flayer0_wk_t2_64_768.csv&32.9=-1&32.10=1&32.11=0&32.0=&33.1=input_t_0_0&33.4=false&33.5=768&33.6=256&33.7=url&33.8=https%3A%2F%2Fraw.githubusercontent.com%2Fbhosmer%2Ftestdata%2Fmain%2Fweights%2Fgpt2%2Flayer0_input_t0_768_256.csv&33.9=-1&33.10=1&33.11=0&33.0=&28.1=K_t&28.4=true&34.13=none&35.15=1&36.17=negative&36.18=left&36.19=top&36.20=front&37.1=V&37.4=true&37.2=none&38.1=input_0_0&38.4=false&38.5=256&38.6=768&38.7=url&38.9=-1&38.10=1&38.11=0&38.8=https%3A%2F%2Fraw.githubusercontent.com%2Fbhosmer%2Ftestdata%2Fmain%2Fweights%2Fgpt2%2Flayer0_input0_256_768.csv&38.0=&39.1=wV_0_2&39.4=false&39.5=768&39.6=64&39.7=url&39.9=-1&39.10=1&39.11=0&39.8=https%3A%2F%2Fraw.githubusercontent.com%2Fbhosmer%2Ftestdata%2Fmain%2Fweights%2Fgpt2%2Flayer0_wv2_768_64.csv&39.0=&40.13=none&41.15=1&42.17=negative&42.18=right&42.19=top&42.20=back&43.1=wO_0_2&43.4=false&43.5=64&43.6=768&43.7=url&43.8=https%3A%2F%2Fraw.githubusercontent.com%2Fbhosmer%2Ftestdata%2Fmain%2Fweights%2Fgpt2%2Flayer0_wo2_64_768.csv&43.9=-1&43.10=1&43.11=0&43.0=&44.45=sync&44.46=16&44.47=false&44.13=none&44.48=0&44.49=open&50.51=1&50.52=1&50.15=1&53.54=blocks&53.55=24&53.56=0&53.57=1&53.58=0&53.17=negative&53.18=left&53.19=top&53.20=front&59.60=6&59.61=true&59.62=4&59.63=0.73&59.64=0&59.65=0.5&59.66=12&59.67=false&59.68=false&59.49=open&69.70=local&69.71=0.2&69.72=0.2&69.73=0.9&69.74=2&69.75=0.75&69.76=0.75&69.77=0.03&69.49=open&78.8=&78.49=open&79.80=8.2961787
45251016&79.81=-533.8678069620822&79.82=35.64126972299759&83.80=8.29674894856322&83.81=36.25749961529174&83.82=35.64126624185369&expr=0&name=1&epilog=2&left=3&matmul=4&h=5&w=6&init=7&url=8&min=9&max=10&dropout=11&left.anim=12&alg=13&left.block=14&k%20blocks=15&left.layout=16&polarity=17&left%20placement=18&right%20placement=19&result%20placement=20&left.left=21&left.left.left=22&left.left.left.left=23&left.left.left.right=24&left.left.left.anim=25&left.left.left.block=26&left.left.left.layout=27&left.left.right=28&left.left.right.anim=29&left.left.right.block=30&left.left.right.layout=31&left.left.right.left=32&left.left.right.right=33&left.left.anim=34&left.left.block=35&left.left.layout=36&left.right=37&left.right.left=38&left.right.right=39&left.right.anim=40&left.right.block=41&left.right.layout=42&right=43&anim=44&fuse=45&speed=46&hide%20inputs=47&spin=48&folder=49&block=50&i%20blocks=51&j%20blocks=52&layout=53&scheme=54&gap=55&scatter=56&molecule=57&blast=58&deco=59&legends=60&shape=61&spotlight=62&row%20guides=63&flow%20guides=64&lens%20size=65&magnification=66&interior%20spotlight=67&axes=68&viz=69&sensitivity=70&min%20size=71&min%20light=72&max%20light=73&elem%20scale=74&zero%20hue=75&hue%20gap=76&hue%20spread=77&diag=78&cam=79&x=80&y=81&z=82&cam.target=83&compress=84), so the answer must lie in some more subtle property of its distribution of values. -* Finally, this head's output is [even more vertically uniform after out-projection](https://bhosmer.github.io/mm/index.html?0=out%20%3D%20(attn%20%3D%20(Q%20%3D%20input_0_0%20%40%20wQ_0_2)%20%40%20(K_t%20%3D%20wK_t_0_2%20%40%20input_t_0_0))%20%40%20(V%20%3D%20input_0_0%20%40%20wV_0_2)%20%40%20wO_0_2&1=out&2=none&49=closed&84=true&3.1=attn%20%40%20V&3.4=true&3.5=32&3.6=32&3.7=row%20major&3.8=&3.9=-1&3.10=1&3.11=0&3.2=none&12.13=vmprod&14.15=1&16.17=positive&16.18=left&16.19=bottom&16.20=back&21.1=attn&21.4=true&21.2=softmax(tril(x%2Fsqrt(k)))&22.1=Q&22.4=true&22.2=none&23.1=input_0_0&23.4=false&23.5=256&23.6=768&23.7=url&23.9=-1&23.10=1&23.11=0&23.8=https%3A%2F%2Fraw.githubusercontent.com%2Fbhosmer%2Ftestdata%2Fmain%2Fweights%2Fgpt2%2Flayer0_input0_256_768.csv&23.0=&24.1=wQ_0_2&24.4=false&24.5=768&24.6=64&24.7=url&24.9=-1&24.10=1&24.11=0&24.8=https%3A%2F%2Fraw.githubusercontent.com%2Fbhosmer%2Ftestdata%2Fmain%2Fweights%2Fgpt2%2Flayer0_wq2_768_64.csv&24.0=&25.13=none&26.15=1&27.17=positive&27.18=left&27.19=bottom&27.20=back&28.2=none&29.13=none&30.15=1&31.17=positive&31.18=right&31.19=top&31.20=back&32.1=wK_t_0_2&32.4=false&32.5=64&32.6=768&32.7=url&32.8=https%3A%2F%2Fraw.githubusercontent.com%2Fbhosmer%2Ftestdata%2Fmain%2Fweights%2Fgpt2%2Flayer0_wk_t2_64_768.csv&32.9=-1&32.10=1&32.11=0&32.0=&33.1=input_t_0_0&33.4=false&33.5=768&33.6=256&33.7=url&33.8=https%3A%2F%2Fraw.githubusercontent.com%2Fbhosmer%2Ftestdata%2Fmain%2Fweights%2Fgpt2%2Flayer0_input_t0_768_256.csv&33.9=-1&33.10=1&33.11=0&33.0=&28.1=K_t&28.4=true&34.13=none&35.15=1&36.17=negative&36.18=left&36.19=top&36.20=front&37.1=V&37.4=true&37.2=none&38.1=input_0_0&38.4=false&38.5=256&38.6=768&38.7=url&38.9=-1&38.10=1&38.11=0&38.8=https%3A%2F%2Fraw.githubusercontent.com%2Fbhosmer%2Ftestdata%2Fmain%2Fweights%2Fgpt2%2Flayer0_input0_256_768.csv&38.0=&39.1=wV_0_2&39.4=false&39.5=768&39.6=64&39.7=url&39.9=-1&39.10=1&39.11=0&39.8=https%3A%2F%2Fraw.githubusercontent.com%2Fbhosmer%2Ftestdata%2Fmain%2Fweights%2Fgpt2%2Flayer0_wv2_768_64.csv&39.0=&40.13=none&41.15=1&42.17=negative&42.18=right&42.19=top&42.20=back&43.1=wO_0_2&43.4=false&43.5=64&43.6=768&43.7=url&43.8=https%3A%2F%2Fraw.githubu
sercontent.com%2Fbhosmer%2Ftestdata%2Fmain%2Fweights%2Fgpt2%2Flayer0_wo2_64_768.csv&43.9=-1&43.10=1&43.11=0&43.0=&44.45=sync&44.46=16&44.47=false&44.13=none&44.48=0&44.49=open&50.51=1&50.52=1&50.15=1&53.54=blocks&53.55=24&53.56=0&53.57=1&53.58=0&53.17=negative&53.18=left&53.19=top&53.20=front&59.60=6&59.61=true&59.62=4&59.63=0.73&59.64=0&59.65=0.5&59.66=12&59.67=false&59.68=false&59.49=open&69.70=local&69.71=0.2&69.72=0.2&69.73=0.9&69.74=2&69.75=0.75&69.76=0.75&69.77=0.03&69.49=open&78.8=&78.49=open&79.80=41.37507219272118&79.81=4.367136718959145&79.82=430.5595129727994&83.80=607.5332301692057&83.81=-2.548000389888877&83.82=-122.74351758382484&expr=0&name=1&epilog=2&left=3&matmul=4&h=5&w=6&init=7&url=8&min=9&max=10&dropout=11&left.anim=12&alg=13&left.block=14&k%20blocks=15&left.layout=16&polarity=17&left%20placement=18&right%20placement=19&result%20placement=20&left.left=21&left.left.left=22&left.left.left.left=23&left.left.left.right=24&left.left.left.anim=25&left.left.left.block=26&left.left.left.layout=27&left.left.right=28&left.left.right.anim=29&left.left.right.block=30&left.left.right.layout=31&left.left.right.left=32&left.left.right.right=33&left.left.anim=34&left.left.block=35&left.left.layout=36&left.right=37&left.right.left=38&left.right.right=39&left.right.anim=40&left.right.block=41&left.right.layout=42&right=43&anim=44&fuse=45&speed=46&hide%20inputs=47&spin=48&folder=49&block=50&i%20blocks=51&j%20blocks=52&layout=53&scheme=54&gap=55&scatter=56&molecule=57&blast=58&deco=59&legends=60&shape=61&spotlight=62&row%20guides=63&flow%20guides=64&lens%20size=65&magnification=66&interior%20spotlight=67&axes=68&viz=69&sensitivity=70&min%20size=71&min%20light=72&max%20light=73&elem%20scale=74&zero%20hue=75&hue%20gap=76&hue%20spread=77&diag=78&cam=79&x=80&y=81&z=82&cam.target=83&compress=84) -* the strong impression being that the bulk of the information being delivered by this attention head consists of properties which are shared by every token in the sequence. 
The composition of its [output projection weights](https://bhosmer.github.io/mm/index.html?0=out%20%3D%20(attn%20%3D%20(Q%20%3D%20input_0_0%20%40%20wQ_0_2)%20%40%20(K_t%20%3D%20wK_t_0_2%20%40%20input_t_0_0))%20%40%20(V%20%3D%20input_0_0%20%40%20wV_0_2)%20%40%20wO_0_2&1=out&2=none&49=closed&84=true&3.1=attn%20%40%20V&3.4=true&3.5=32&3.6=32&3.7=row%20major&3.8=&3.9=-1&3.10=1&3.11=0&3.2=none&12.13=vmprod&14.15=1&16.17=positive&16.18=left&16.19=bottom&16.20=back&21.1=attn&21.4=true&21.2=softmax(tril(x%2Fsqrt(k)))&22.1=Q&22.4=true&22.2=none&23.1=input_0_0&23.4=false&23.5=256&23.6=768&23.7=url&23.9=-1&23.10=1&23.11=0&23.8=https%3A%2F%2Fraw.githubusercontent.com%2Fbhosmer%2Ftestdata%2Fmain%2Fweights%2Fgpt2%2Flayer0_input0_256_768.csv&23.0=&24.1=wQ_0_2&24.4=false&24.5=768&24.6=64&24.7=url&24.9=-1&24.10=1&24.11=0&24.8=https%3A%2F%2Fraw.githubusercontent.com%2Fbhosmer%2Ftestdata%2Fmain%2Fweights%2Fgpt2%2Flayer0_wq2_768_64.csv&24.0=&25.13=none&26.15=1&27.17=positive&27.18=left&27.19=bottom&27.20=back&28.2=none&29.13=none&30.15=1&31.17=positive&31.18=right&31.19=top&31.20=back&32.1=wK_t_0_2&32.4=false&32.5=64&32.6=768&32.7=url&32.8=https%3A%2F%2Fraw.githubusercontent.com%2Fbhosmer%2Ftestdata%2Fmain%2Fweights%2Fgpt2%2Flayer0_wk_t2_64_768.csv&32.9=-1&32.10=1&32.11=0&32.0=&33.1=input_t_0_0&33.4=false&33.5=768&33.6=256&33.7=url&33.8=https%3A%2F%2Fraw.githubusercontent.com%2Fbhosmer%2Ftestdata%2Fmain%2Fweights%2Fgpt2%2Flayer0_input_t0_768_256.csv&33.9=-1&33.10=1&33.11=0&33.0=&28.1=K_t&28.4=true&34.13=none&35.15=1&36.17=negative&36.18=left&36.19=top&36.20=front&37.1=V&37.4=true&37.2=none&38.1=input_0_0&38.4=false&38.5=256&38.6=768&38.7=url&38.9=-1&38.10=1&38.11=0&38.8=https%3A%2F%2Fraw.githubusercontent.com%2Fbhosmer%2Ftestdata%2Fmain%2Fweights%2Fgpt2%2Flayer0_input0_256_768.csv&38.0=&39.1=wV_0_2&39.4=false&39.5=768&39.6=64&39.7=url&39.9=-1&39.10=1&39.11=0&39.8=https%3A%2F%2Fraw.githubusercontent.com%2Fbhosmer%2Ftestdata%2Fmain%2Fweights%2Fgpt2%2Flayer0_wv2_768_64.csv&39.0=&40.13=none&41.15=1&42.17=negative&42.18=right&42.19=top&42.20=back&43.1=wO_0_2&43.4=false&43.5=64&43.6=768&43.7=url&43.8=https%3A%2F%2Fraw.githubusercontent.com%2Fbhosmer%2Ftestdata%2Fmain%2Fweights%2Fgpt2%2Flayer0_wo2_64_768.csv&43.9=-1&43.10=1&43.11=0&43.0=&44.45=sync&44.46=16&44.47=false&44.13=none&44.48=0&44.49=open&50.51=1&50.52=1&50.15=1&53.54=blocks&53.55=24&53.56=0&53.57=1&53.58=0&53.17=negative&53.18=left&53.19=top&53.20=front&59.60=6&59.61=true&59.62=4&59.63=0.73&59.64=0&59.65=0.5&59.66=12&59.67=false&59.68=false&59.49=open&69.70=local&69.71=0.2&69.72=0.2&69.73=0.9&69.74=2&69.75=0.75&69.76=0.75&69.77=0.03&69.49=open&78.8=&78.49=open&79.80=136.19712021298585&79.81=351.47497630691043&79.82=254.9066405965837&83.80=554.2569175811409&83.81=-39.63963114876448&83.82=-109.72659308933949&expr=0&name=1&epilog=2&left=3&matmul=4&h=5&w=6&init=7&url=8&min=9&max=10&dropout=11&left.anim=12&alg=13&left.block=14&k%20blocks=15&left.layout=16&polarity=17&left%20placement=18&right%20placement=19&result%20placement=20&left.left=21&left.left.left=22&left.left.left.left=23&left.left.left.right=24&left.left.left.anim=25&left.left.left.block=26&left.left.left.layout=27&left.left.right=28&left.left.right.anim=29&left.left.right.block=30&left.left.right.layout=31&left.left.right.left=32&left.left.right.right=33&left.left.anim=34&left.left.block=35&left.left.layout=36&left.right=37&left.right.left=38&left.right.right=39&left.right.anim=40&left.right.block=41&left.right.layout=42&right=43&anim=44&fuse=45&speed=46&hide%20inputs=47&spin=48&folder=49&block=50&i
%20blocks=51&j%20blocks=52&layout=53&scheme=54&gap=55&scatter=56&molecule=57&blast=58&deco=59&legends=60&shape=61&spotlight=62&row%20guides=63&flow%20guides=64&lens%20size=65&magnification=66&interior%20spotlight=67&axes=68&viz=69&sensitivity=70&min%20size=71&min%20light=72&max%20light=73&elem%20scale=74&zero%20hue=75&hue%20gap=76&hue%20spread=77&diag=78&cam=79&x=80&y=81&z=82&cam.target=83&compress=84) reinforces this intuition. - -Overall, it's hard to resist the idea that the extremely regular, highly structured information this attention head produces might be obtained by computational means that are a bit... less lavish. Of course this isn't an unexplored area, but the specificity and richness of signal of the visualized computation has been useful in generating new ideas, and reasoning about existing ones. - - -### 4d Revisiting the pitch: invariants for free - -Stepping back, it's worth reiterating that the reason we can visualize nontrivially compound operations like attention heads and have them remain intuitive is that important algebraic properties - like how argument shapes are constrained, or which parallelization axes intersect which operations - _don't require additional thinking_: they arise directly from the geometry of the visualized object, rather than being additional rules to keep in mind. - -For example, in these attention head visualizations it's immediately obvious that - - - -* `Q` and `attn @ V` are the same length, `K` and `V` are the same length, and the lengths of these pairs are independent of each other -* `Q` and `K` are the same width, `V` and `attn @ V` are the same width, and the widths of these pairs are independent of each other. - -These properties are true by construction, as a simple consequence of which parts of the compound structure the constituents inhabit and how they are oriented. 
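To make the same point outside the tool, here's a minimal sketch in PyTorch. The sizes are hypothetical, chosen only so that the two lengths and the two widths are visibly independent of each other, and the causal mask is omitted since only the shapes matter here:

```python
import torch

n_q, n_kv = 256, 512   # length of Q / attn @ V vs. length of K / V -- independent
d_qk, d_v = 64, 96     # width shared by Q and K vs. width shared by V and attn @ V -- independent

Q   = torch.randn(n_q, d_qk)
K_t = torch.randn(d_qk, n_kv)          # K transposed, as in the expressions above
V   = torch.randn(n_kv, d_v)

attn = torch.softmax(Q @ K_t / d_qk ** 0.5, dim=-1)   # shape (n_q, n_kv)
out  = attn @ V                                       # shape (n_q, d_v)

assert attn.shape == (n_q, n_kv)
assert out.shape == (n_q, d_v)   # same length as Q, same width as V
```

Nothing here needs to be asserted in the visualization itself - the same constraints are simply what make the compound solid fit together.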
- -This "properties for free" benefit can be especially useful when exploring variations on a canonical structure - an obvious example being the one-row-high attention matrix in autoregressive token-at-a-time decoding ([open in mm](https://bhosmer.github.io/mm/index.html?0=out%20%3D%20(attn%20%3D%20(Q%20%3D%20input_0_5%20%40%20wQ_5_4)%20%40%20(K_t%20%3D%20wK_t_5_4%20%40%20input_t_0_5))%20%40%20(V%20%3D%20input_0_5%20%40%20wV_5_4)%20%40%20wO_5_4&1=out&2=none&24=closed&84=true&3.1=attn%20%40%20V&3.4=true&3.5=32&3.6=32&3.7=row%20major&3.8=&3.9=-1&3.10=1&3.11=0&3.2=none&12.13=vmprod&14.15=1&16.17=positive&16.18=left&16.19=bottom&16.20=back&21.1=attn&21.4=true&21.2=softmax(x%2Fsqrt(k))&22.1=Q&22.4=true&22.2=none&23.1=input_0_5&23.4=false&23.5=1&23.6=768&23.7=url&23.9=-1&23.10=1&23.11=0&23.8=https%3A%2F%2Fraw.githubusercontent.com%2Fbhosmer%2Ftestdata%2Fmain%2Fweights%2Fgpt2%2Flayer5_input0_256_768.csv&23.0=&23.24=open&25.1=wQ_5_4&25.4=false&25.5=768&25.6=64&25.7=url&25.9=-1&25.10=1&25.11=0&25.8=https%3A%2F%2Fraw.githubusercontent.com%2Fbhosmer%2Ftestdata%2Fmain%2Fweights%2Fgpt2%2Flayer5_wq4_768_64.csv&25.0=&26.13=vmprod&27.15=1&28.17=positive&28.18=left&28.19=bottom&28.20=back&22.24=open&29.2=none&30.13=none&31.15=1&32.17=positive&32.18=right&32.19=top&32.20=back&33.1=wK_t_5_4&33.4=false&33.5=64&33.6=768&33.7=url&33.8=https%3A%2F%2Fraw.githubusercontent.com%2Fbhosmer%2Ftestdata%2Fmain%2Fweights%2Fgpt2%2Flayer5_wk_t4_64_768.csv&33.9=-1&33.10=1&33.11=0&33.0=&34.1=input_t_0_5&34.4=false&34.5=768&34.6=256&34.7=url&34.8=https%3A%2F%2Fraw.githubusercontent.com%2Fbhosmer%2Ftestdata%2Fmain%2Fweights%2Fgpt2%2Flayer5_input_t0_768_256.csv&34.9=-1&34.10=1&34.11=0&34.0=&29.1=K_t&29.4=true&35.13=vmprod&36.15=1&37.17=negative&37.18=left&37.19=top&37.20=front&21.24=open&38.1=V&38.4=true&38.2=none&39.1=input_0_5&39.4=false&39.5=256&39.6=768&39.7=url&39.9=-1&39.10=1&39.11=0&39.8=https%3A%2F%2Fraw.githubusercontent.com%2Fbhosmer%2Ftestdata%2Fmain%2Fweights%2Fgpt2%2Flayer5_input0_256_768.csv&39.0=&40.1=wV_5_4&40.4=false&40.5=768&40.6=64&40.7=url&40.9=-1&40.10=1&40.11=0&40.8=https%3A%2F%2Fraw.githubusercontent.com%2Fbhosmer%2Ftestdata%2Fmain%2Fweights%2Fgpt2%2Flayer5_wv4_768_64.csv&40.0=&41.13=none&42.15=1&43.17=negative&43.18=right&43.19=top&43.20=back&3.24=closed&44.1=wO_5_4&44.4=false&44.5=64&44.6=768&44.7=url&44.8=https%3A%2F%2Fraw.githubusercontent.com%2Fbhosmer%2Ftestdata%2Fmain%2Fweights%2Fgpt2%2Flayer5_wo4_64_768.csv&44.9=-1&44.10=1&44.11=0&44.0=&45.46=sync&45.47=16&45.48=false&45.13=none&45.49=0&45.24=open&50.51=1&50.52=1&50.15=1&53.54=blocks&53.55=24&53.56=0&53.57=1&53.58=0&53.17=negative&53.18=left&53.19=top&53.20=front&59.60=10&59.61=true&59.62=4&59.63=0.394&59.64=0&59.65=0.5&59.66=12&59.67=false&59.68=false&59.24=open&69.70=local&69.71=0.05&69.72=0.2&69.73=0.9&69.74=2&69.75=0.75&69.76=0.75&69.77=0.03&78.8=&79.80=-289.3020871171715&79.81=176.55051931108687&79.82=202.12550566094345&83.80=6.09420901744693&83.81=-60.94451681776672&83.82=-55.94371166611936&expr=0&name=1&epilog=2&left=3&matmul=4&h=5&w=6&init=7&url=8&min=9&max=10&dropout=11&left.anim=12&alg=13&left.block=14&k%20blocks=15&left.layout=16&polarity=17&left%20placement=18&right%20placement=19&result%20placement=20&left.left=21&left.left.left=22&left.left.left.left=23&folder=24&left.left.left.right=25&left.left.left.anim=26&left.left.left.block=27&left.left.left.layout=28&left.left.right=29&left.left.right.anim=30&left.left.right.block=31&left.left.right.layout=32&left.left.right.left=33&left.left.right.right=34&left.left.anim=35&left.left.block=36&
left.left.layout=37&left.right=38&left.right.left=39&left.right.right=40&left.right.anim=41&left.right.block=42&left.right.layout=43&right=44&anim=45&fuse=46&speed=47&hide%20inputs=48&spin=49&block=50&i%20blocks=51&j%20blocks=52&layout=53&scheme=54&gap=55&scatter=56&molecule=57&blast=58&deco=59&legends=60&shape=61&spotlight=62&row%20guides=63&flow%20guides=64&lens%20size=65&magnification=66&interior%20spotlight=67&axes=68&viz=69&sensitivity=70&min%20size=71&min%20light=72&max%20light=73&elem%20scale=74&zero%20hue=75&hue%20gap=76&hue%20spread=77&diag=78&cam=79&x=80&y=81&z=82&cam.target=83&compress=84)): - -![the one-row-high attention matrix in autoregressive token-at-a-time decoding](/assets/images/inside-the-matrix/gpt2_decode2.jpg){:style="width:100%"} - - - - -## 5 Parallelizing attention - -In the animation of layer 5, head 4 above, we visualize 4 of the 6 matmuls in the attention head - -as a fused chain of vector-matrix products, confirming the geometric intuition that the entire left-associative chain from input to output is _laminar_ along the shared `i` axis, and can be parallelized. - - -### 5a Example: partitioning along i - -To parallelize the computation in practice, we would partition the input into blocks along the `i` axis. We can visualize this partition in the tool by specifying that a given axis be partitioned into a particular number of blocks - in these examples we'll use 8, but there's nothing special about that number. - -Among other things, this visualization makes clear that `wQ` (for in-projection), `K_t` and `V` (for attention) and `wO` (for out-projection) are needed in their entirety by each parallel computation, since they're adjacent to the partitioned matrices along those matrices' unpartitioned dimensions ([open in 
mm](https://bhosmer.github.io/mm/index.html?0=out%20%3D%20(attn%20%3D%20(Q%20%3D%20input_0_5%20%40%20wQ_5_4)%20%40%20(K_t%20%3D%20wK_t_5_4%20%40%20input_t_0_5))%20%40%20(V%20%3D%20input_0_5%20%40%20wV_5_4)%20%40%20wO_5_4&1=out&2=none&49=closed&84=true&3.1=attn%20%40%20V&3.4=true&3.5=32&3.6=32&3.7=row%20major&3.8=&3.9=-1&3.10=1&3.11=0&3.2=none&12.13=vmprod&14.15=1&16.17=positive&16.18=left&16.19=bottom&16.20=back&21.1=attn&21.4=true&21.2=softmax(tril(x%2Fsqrt(k)))&22.1=Q&22.4=true&22.2=none&23.1=input_0_5&23.4=false&23.5=256&23.6=768&23.7=url&23.9=-1&23.10=1&23.11=0&23.8=https%3A%2F%2Fraw.githubusercontent.com%2Fbhosmer%2Ftestdata%2Fmain%2Fweights%2Fgpt2%2Flayer5_input0_256_768.csv&23.0=&24.1=wQ_5_4&24.4=false&24.5=768&24.6=64&24.7=url&24.9=-1&24.10=1&24.11=0&24.8=https%3A%2F%2Fraw.githubusercontent.com%2Fbhosmer%2Ftestdata%2Fmain%2Fweights%2Fgpt2%2Flayer5_wq4_768_64.csv&24.0=&25.13=vmprod&26.15=1&27.17=positive&27.18=left&27.19=bottom&27.20=back&28.2=none&29.13=none&30.15=1&31.17=positive&31.18=right&31.19=top&31.20=back&32.1=wK_t_5_4&32.4=false&32.5=64&32.6=768&32.7=url&32.8=https%3A%2F%2Fraw.githubusercontent.com%2Fbhosmer%2Ftestdata%2Fmain%2Fweights%2Fgpt2%2Flayer5_wk_t4_64_768.csv&32.9=-1&32.10=1&32.11=0&32.0=&33.1=input_t_0_5&33.4=false&33.5=768&33.6=256&33.7=url&33.8=https%3A%2F%2Fraw.githubusercontent.com%2Fbhosmer%2Ftestdata%2Fmain%2Fweights%2Fgpt2%2Flayer5_input_t0_768_256.csv&33.9=-1&33.10=1&33.11=0&33.0=&28.1=K_t&28.4=true&34.13=vmprod&35.15=1&36.17=negative&36.18=left&36.19=top&36.20=front&37.1=V&37.4=true&37.2=none&38.1=input_0_5&38.4=false&38.5=256&38.6=768&38.7=url&38.9=-1&38.10=1&38.11=0&38.8=https%3A%2F%2Fraw.githubusercontent.com%2Fbhosmer%2Ftestdata%2Fmain%2Fweights%2Fgpt2%2Flayer5_input0_256_768.csv&38.0=&39.1=wV_5_4&39.4=false&39.5=768&39.6=64&39.7=url&39.9=-1&39.10=1&39.11=0&39.8=https%3A%2F%2Fraw.githubusercontent.com%2Fbhosmer%2Ftestdata%2Fmain%2Fweights%2Fgpt2%2Flayer5_wv4_768_64.csv&39.0=&40.13=none&41.15=1&42.17=negative&42.18=right&42.19=top&42.20=back&43.1=wO_5_4&43.4=false&43.5=64&43.6=768&43.7=url&43.8=https%3A%2F%2Fraw.githubusercontent.com%2Fbhosmer%2Ftestdata%2Fmain%2Fweights%2Fgpt2%2Flayer5_wo4_64_768.csv&43.9=-1&43.10=1&43.11=0&43.0=&44.45=sync&44.46=16&44.47=false&44.13=none&44.48=0&44.49=closed&50.51=8&50.52=1&50.15=1&50.49=open&53.54=blocks&53.55=10&53.56=0&53.57=1&53.58=0&53.17=negative&53.18=left&53.19=top&53.20=front&53.49=closed&59.60=10&59.61=true&59.62=4&59.63=0.507&59.64=0&59.65=0.5&59.66=12&59.67=false&59.68=false&59.49=open&69.70=local&69.71=0.2&69.72=0.3&69.73=0.9&69.74=2&69.75=0.75&69.76=0.75&69.77=0.03&69.49=closed&78.8=&78.49=open&79.80=-452.09425433307837&79.81=-10.01467989007457&79.82=392.9851223674549&83.80=-27.91725760321879&83.81=-18.858991089590095&83.82=-140.6826497984033&expr=0&name=1&epilog=2&left=3&matmul=4&h=5&w=6&init=7&url=8&min=9&max=10&dropout=11&left.anim=12&alg=13&left.block=14&k%20blocks=15&left.layout=16&polarity=17&left%20placement=18&right%20placement=19&result%20placement=20&left.left=21&left.left.left=22&left.left.left.left=23&left.left.left.right=24&left.left.left.anim=25&left.left.left.block=26&left.left.left.layout=27&left.left.right=28&left.left.right.anim=29&left.left.right.block=30&left.left.right.layout=31&left.left.right.left=32&left.left.right.right=33&left.left.anim=34&left.left.block=35&left.left.layout=36&left.right=37&left.right.left=38&left.right.right=39&left.right.anim=40&left.right.block=41&left.right.layout=42&right=43&anim=44&fuse=45&speed=46&hide%20inputs=47&spin=48&folder=49&block=50&i%20blocks=5
1&j%20blocks=52&layout=53&scheme=54&gap=55&scatter=56&molecule=57&blast=58&deco=59&legends=60&shape=61&spotlight=62&row%20guides=63&flow%20guides=64&lens%20size=65&magnification=66&interior%20spotlight=67&axes=68&viz=69&sensitivity=70&min%20size=71&min%20light=72&max%20light=73&elem%20scale=74&zero%20hue=75&hue%20gap=76&hue%20spread=77&diag=78&cam=79&x=80&y=81&z=82&cam.target=83&compress=84)): - -![wQ (for in-projection), K_t and V (for attention) and wO (for out-projection) are needed in their entirety by each parallel computation](/assets/images/inside-the-matrix/gpt2_parti.jpg){:style="width:100%"} - - - -### 5b Example: double partitioning - -As an example of partitioning along _multiple_ axes, we can visualize some recent work which innovates in this space ([Block Parallel Transformer](https://arxiv.org/pdf/2305.19370.pdf), building on work done in e.g. [Flash Attention](https://arxiv.org/pdf/2205.14135.pdf) and its antecedents). - -First, BPT partitions along `i` as described above - and actually extends this horizontal partitioning of the sequence into chunks all the way through the second (FFN) half of the attention layer as well. (We'll visualize this in a later section.) - -To fully attack the context length problem, a second partitioning is then added to MHA - that of the attention calculation itself (i.e., a partition along the `j` axis of `Q @ K_t`). The two partitions together divide attention into a grid of blocks ([open in mm](https://bhosmer.github.io/mm/index.html?0=out%20%3D%20(attn%20%3D%20(Q%20%3D%20input_0_5%20%40%20wQ_5_4)%20%40%20(K_t%20%3D%20wK_t_5_4%20%40%20input_t_0_5))%20%40%20(V%20%3D%20input_0_5%20%40%20wV_5_4)%20%40%20wO_5_4&1=out&2=none&16=closed&84=true&3.1=attn%20%40%20V&3.4=true&3.5=32&3.6=32&3.7=row%20major&3.8=&3.9=-1&3.10=1&3.11=0&3.2=none&12.13=vmprod&14.15=8&14.16=open&17.18=positive&17.19=left&17.20=bottom&17.21=back&22.1=attn&22.4=true&22.2=softmax(tril(x%2Fsqrt(k)))&23.1=Q&23.4=true&23.2=none&24.1=input_0_5&24.4=false&24.5=256&24.6=768&24.7=url&24.9=-1&24.10=1&24.11=0&24.8=https%3A%2F%2Fraw.githubusercontent.com%2Fbhosmer%2Ftestdata%2Fmain%2Fweights%2Fgpt2%2Flayer5_input0_256_768.csv&24.0=&25.1=wQ_5_4&25.4=false&25.5=768&25.6=64&25.7=url&25.9=-1&25.10=1&25.11=0&25.8=https%3A%2F%2Fraw.githubusercontent.com%2Fbhosmer%2Ftestdata%2Fmain%2Fweights%2Fgpt2%2Flayer5_wq4_768_64.csv&25.0=&26.13=vmprod&27.15=1&28.18=positive&28.19=left&28.20=bottom&28.21=back&29.2=none&30.13=none&31.15=1&32.18=positive&32.19=right&32.20=top&32.21=back&33.1=wK_t_5_4&33.4=false&33.5=64&33.6=768&33.7=url&33.8=https%3A%2F%2Fraw.githubusercontent.com%2Fbhosmer%2Ftestdata%2Fmain%2Fweights%2Fgpt2%2Flayer5_wk_t4_64_768.csv&33.9=-1&33.10=1&33.11=0&33.0=&34.1=input_t_0_5&34.4=false&34.5=768&34.6=256&34.7=url&34.8=https%3A%2F%2Fraw.githubusercontent.com%2Fbhosmer%2Ftestdata%2Fmain%2Fweights%2Fgpt2%2Flayer5_input_t0_768_256.csv&34.9=-1&34.10=1&34.11=0&34.0=&29.1=K_t&29.4=true&35.13=vmprod&36.15=1&37.18=negative&37.19=left&37.20=top&37.21=front&38.1=V&38.4=true&38.2=none&39.1=input_0_5&39.4=false&39.5=256&39.6=768&39.7=url&39.9=-1&39.10=1&39.11=0&39.8=https%3A%2F%2Fraw.githubusercontent.com%2Fbhosmer%2Ftestdata%2Fmain%2Fweights%2Fgpt2%2Flayer5_input0_256_768.csv&39.0=&40.1=wV_5_4&40.4=false&40.5=768&40.6=64&40.7=url&40.9=-1&40.10=1&40.11=0&40.8=https%3A%2F%2Fraw.githubusercontent.com%2Fbhosmer%2Ftestdata%2Fmain%2Fweights%2Fgpt2%2Flayer5_wv4_768_64.csv&40.0=&41.13=none&42.15=1&43.18=negative&43.19=right&43.20=top&43.21=back&3.16=open&44.1=wO_5_4&44.4=false&44.5=64&44.6=768&44.7=url&44.8=ht
tps%3A%2F%2Fraw.githubusercontent.com%2Fbhosmer%2Ftestdata%2Fmain%2Fweights%2Fgpt2%2Flayer5_wo4_64_768.csv&44.9=-1&44.10=1&44.11=0&44.0=&45.46=sync&45.47=16&45.48=false&45.13=none&45.49=0&45.16=closed&50.51=8&50.52=1&50.15=1&50.16=closed&53.54=blocks&53.55=10&53.56=0&53.57=1&53.58=0&53.18=negative&53.19=left&53.20=top&53.21=front&53.16=closed&59.60=10&59.61=true&59.62=4&59.63=0.507&59.64=0&59.65=0.5&59.66=12&59.67=false&59.68=false&59.16=open&69.70=local&69.71=0.2&69.72=0.3&69.73=0.9&69.74=2&69.75=0.75&69.76=0.75&69.77=0.03&69.16=closed&78.8=&78.16=open&79.80=-459.733038437248&79.81=-10.183892342609507&79.82=399.6251724834292&83.80=-27.91725760321879&83.81=-18.858991089590095&83.82=-140.6826497984033&expr=0&name=1&epilog=2&left=3&matmul=4&h=5&w=6&init=7&url=8&min=9&max=10&dropout=11&left.anim=12&alg=13&left.block=14&k%20blocks=15&folder=16&left.layout=17&polarity=18&left%20placement=19&right%20placement=20&result%20placement=21&left.left=22&left.left.left=23&left.left.left.left=24&left.left.left.right=25&left.left.left.anim=26&left.left.left.block=27&left.left.left.layout=28&left.left.right=29&left.left.right.anim=30&left.left.right.block=31&left.left.right.layout=32&left.left.right.left=33&left.left.right.right=34&left.left.anim=35&left.left.block=36&left.left.layout=37&left.right=38&left.right.left=39&left.right.right=40&left.right.anim=41&left.right.block=42&left.right.layout=43&right=44&anim=45&fuse=46&speed=47&hide%20inputs=48&spin=49&block=50&i%20blocks=51&j%20blocks=52&layout=53&scheme=54&gap=55&scatter=56&molecule=57&blast=58&deco=59&legends=60&shape=61&spotlight=62&row%20guides=63&flow%20guides=64&lens%20size=65&magnification=66&interior%20spotlight=67&axes=68&viz=69&sensitivity=70&min%20size=71&min%20light=72&max%20light=73&elem%20scale=74&zero%20hue=75&hue%20gap=76&hue%20spread=77&diag=78&cam=79&x=80&y=81&z=82&cam.target=83&compress=84)): - -![The two partitions together divide attention into a grid of blocks](/assets/images/inside-the-matrix/gpt2_ik.jpg){:style="width:100%"} - - - -This visualization makes clear - - - -* the effectiveness of this double partitioning as an attack on the context length problem, since we've now visibly partitioned every occurrence of sequence length in the attention calculation -* the "reach" of this second partitioning: it's clear from the geometry that the in-projection computations of `K` and `V` can be partitioned along with the core double matmul - -Note one subtlety: the visual implication here is that we can also parallelize the subsequent matmul `attn @ V` along `k` and sum the partial results [split-k style](https://github.com/NVIDIA/cutlass/blob/main/media/docs/efficient_gemm.md#parallelized-reductions), thus parallelizing the entire double matmul. But the row-wise softmax in `sdpa()` adds the requirement that each row have all its segments normalized before the corresponding row of `attn @ V` can be computed, adding an extra row-wise step between the attention calculation and the final matmul. - - -## 6 Sizes in an Attention Layer - -The first (MHA) half of an attention layer is famously computationally demanding because of its quadratic complexity, but the second (FFN) half is demanding in its own right due to the width of its hidden dimension, typically 4 times that of the model's embedding dimension. Visualizing the biomass of a full attention layer can be useful in building intuition about how the two halves of the layer compare to each other. 
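For a rough sense of how the two halves compare, here's a back-of-the-envelope FLOP count at the full-size dimensions referenced below (embedding 768, FFN hidden 3072, sequence length 1024, with the 64-wide heads used above, hence 12 of them). It's only a sketch: it counts the matmuls at 2·m·k·n FLOPs each and ignores softmax, layernorm, biases and the causal mask:

```python
n_embd, d_ffn, seq_len, d_head = 768, 3072, 1024, 64
n_head = n_embd // d_head   # 12

def matmul_flops(m, k, n):
    return 2 * m * k * n    # one multiply-accumulate per output element per k step

proj = 4 * matmul_flops(seq_len, n_embd, n_embd)             # wQ, wK, wV, wO projections
attn = n_head * 2 * matmul_flops(seq_len, d_head, seq_len)   # Q @ K_t and attn @ V, per head
ffn  = matmul_flops(seq_len, n_embd, d_ffn) + matmul_flops(seq_len, d_ffn, n_embd)

# ~8.1 GFLOPs for the MHA half vs ~9.7 GFLOPs for the FFN half at these settings
print(f"MHA half: {(proj + attn) / 1e9:.1f} GFLOPs "
      f"({proj / 1e9:.1f} projections + {attn / 1e9:.1f} attention)")
print(f"FFN half: {ffn / 1e9:.1f} GFLOPs")
```

At this sequence length the two halves land within roughly 20% of each other, with the FFN slightly ahead; by this count the quadratic attention term only overtakes the FFN half once the sequence length exceeds the FFN width (3072 here).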
- - -### 6a Visualizing the full layer - -Below is a full attention layer with the first half (MHA) in the background and the second (FFN) in the foreground. As usual, arrows point in the direction of computation. - -Notes: - - - -* This visualization doesn't depict individual attention heads, but instead shows the unsliced Q/K/V weights and projections surrounding a central double matmul. Of course this isn't a faithful visualization of the full MHA operation - but the goal here is to give a clearer sense of the relative matrix _sizes_ in the two halves of the layer, rather than the relative amounts of computation each half performs. (Also, randomized values are used rather than real weights.) -* The dimensions used here are downsized to keep the browser (relatively) happy, but the proportions are preserved (from [NanoGPT's small config](https://github.com/karpathy/nanoGPT/blob/master/model.py#L217)): model embedding dimension = 192 (from 768), FFN embedding dimension = 768 (from 3072), sequence length = 256 (from 1024), although sequence length is not fundamental to the model. (Visually, changes in sequence length would appear as changes in the width of the input blades, and consequently in the size of the attention hub and the height of the downstream vertical planes.) - -[Open in mm](https://bhosmer.github.io/mm/index.html?0=layer_out%20%3D%20(attn_out%20%3D%20(attn%20%3D%20(Q%20%3D%20input%20%40%20wQ)%20%40%20(K_t%20%3D%20wK_t%20%40%20input_t))%20%40%20(V%20%3D%20input%20%40%20wV)%20%40%20wO)%20%40%20FFN_1%20%40%20FFN_2&1=layer_out&2=layernorm&16=closed&94=true&3.1=attn_out%20%40%20FFN_1&3.4=true&3.5=32&3.6=32&3.7=row%20major&3.8=&3.9=-1&3.10=1&3.11=0&3.2=gelu&12.13=inherit&14.15=1&14.16=open&17.18=positive&17.19=left&17.20=bottom&17.21=back&22.2=layernorm&23.13=inherit&24.15=1&24.16=open&25.18=negative&25.19=left&25.20=top&25.21=front&26.1=attn%20%40%20V&26.4=true&26.5=32&26.6=32&26.7=row%20major&26.8=&26.9=-1&26.10=1&26.11=0&26.2=none&27.13=vmprod&28.15=1&28.16=open&29.18=positive&29.19=left&29.20=bottom&29.21=back&30.1=attn&30.4=true&30.2=softmax(tril(x%2Fsqrt(k)))&31.1=Q&31.4=true&31.2=none&32.1=input&32.4=false&32.5=256&32.6=192&32.7=gaussian&32.9=-1&32.10=1&32.11=0&32.8=https%3A%2F%2Fraw.githubusercontent.com%2Fbhosmer%2Ftestdata%2Fmain%2Fweights%2Fgpt2%2Flayer5_input0_256_768.csv&32.0=&32.16=open&33.1=wQ&33.4=false&33.5=192&33.6=192&33.7=gaussian&33.9=-1&33.10=1&33.11=0&33.8=https%3A%2F%2Fraw.githubusercontent.com%2Fbhosmer%2Ftestdata%2Fmain%2Fweights%2Fgpt2%2Flayer5_wq4_768_64.csv&33.0=&33.16=open&34.13=vmprod&35.15=1&36.18=positive&36.19=left&36.20=bottom&36.21=back&31.16=closed&37.2=none&38.13=none&39.15=1&40.18=positive&40.19=right&40.20=top&40.21=back&41.1=wK_t&41.4=false&41.5=192&41.6=192&41.7=gaussian&41.8=https%3A%2F%2Fraw.githubusercontent.com%2Fbhosmer%2Ftestdata%2Fmain%2Fweights%2Fgpt2%2Flayer5_wk_t4_64_768.csv&41.9=-1&41.10=1&41.11=0&41.0=&41.16=open&42.1=input_t&42.4=false&42.5=192&42.6=256&42.7=gaussian&42.8=https%3A%2F%2Fraw.githubusercontent.com%2Fbhosmer%2Ftestdata%2Fmain%2Fweights%2Fgpt2%2Flayer5_input_t0_768_256.csv&42.9=-1&42.10=1&42.11=0&42.0=&42.16=open&37.1=K_t&37.4=true&37.16=closed&43.13=vmprod&44.15=1&44.16=open&45.18=negative&45.19=left&45.20=top&45.21=front&30.16=open&46.1=V&46.4=true&46.2=none&47.1=input&47.4=false&47.5=256&47.6=192&47.7=gaussian&47.9=-1&47.10=1&47.11=0&47.8=https%3A%2F%2Fraw.githubusercontent.com%2Fbhosmer%2Ftestdata%2Fmain%2Fweights%2Fgpt2%2Flayer5_input0_256_768.csv&47.0=&47.16=open&48.1=wV&48.4=false&48.5=192&48.6=192&48.7=gaussi
an&48.9=-1&48.10=1&48.11=0&48.8=https%3A%2F%2Fraw.githubusercontent.com%2Fbhosmer%2Ftestdata%2Fmain%2Fweights%2Fgpt2%2Flayer5_wv4_768_64.csv&48.0=&48.16=open&49.13=none&50.15=1&51.18=negative&51.19=right&51.20=top&51.21=back&46.16=open&26.16=open&52.1=wO&52.4=false&52.5=192&52.6=192&52.7=gaussian&52.8=https%3A%2F%2Fraw.githubusercontent.com%2Fbhosmer%2Ftestdata%2Fmain%2Fweights%2Fgpt2%2Flayer5_wo4_64_768.csv&52.9=-1&52.10=0.996&52.11=0&52.0=&52.16=closed&22.1=attn_out&22.4=true&22.16=open&53.1=FFN_1&53.4=false&53.5=192&53.6=768&53.7=gaussian&53.8=&53.9=-1&53.10=1&53.11=0&53.0=&53.16=closed&3.16=open&54.1=FFN_2&54.4=false&54.5=768&54.6=192&54.7=gaussian&54.8=&54.9=-1&54.10=1&54.11=0&54.0=&54.16=closed&55.56=sync&55.57=16&55.58=false&55.13=none&55.59=0&55.16=closed&60.61=1&60.62=1&60.15=1&60.16=closed&63.64=blocks&63.65=8&63.66=0&63.67=1&63.68=0&63.18=negative&63.19=left&63.20=top&63.21=front&63.16=closed&69.70=10&69.71=true&69.72=4&69.73=0.507&69.74=0.524&69.75=0.5&69.76=12&69.77=false&69.78=false&69.16=closed&79.80=local&79.81=0.1&79.82=0.2&79.83=0.9&79.84=2&79.85=0.75&79.86=0.75&79.87=0.03&79.16=closed&88.8=&88.16=open&89.90=-738.1526976199144&89.91=919.9001193338946&89.92=957.7418906526483&93.90=0&93.91=0&93.92=0&expr=0&name=1&epilog=2&left=3&matmul=4&h=5&w=6&init=7&url=8&min=9&max=10&dropout=11&left.anim=12&alg=13&left.block=14&k%20blocks=15&folder=16&left.layout=17&polarity=18&left%20placement=19&right%20placement=20&result%20placement=21&left.left=22&left.left.anim=23&left.left.block=24&left.left.layout=25&left.left.left=26&left.left.left.anim=27&left.left.left.block=28&left.left.left.layout=29&left.left.left.left=30&left.left.left.left.left=31&left.left.left.left.left.left=32&left.left.left.left.left.right=33&left.left.left.left.left.anim=34&left.left.left.left.left.block=35&left.left.left.left.left.layout=36&left.left.left.left.right=37&left.left.left.left.right.anim=38&left.left.left.left.right.block=39&left.left.left.left.right.layout=40&left.left.left.left.right.left=41&left.left.left.left.right.right=42&left.left.left.left.anim=43&left.left.left.left.block=44&left.left.left.left.layout=45&left.left.left.right=46&left.left.left.right.left=47&left.left.left.right.right=48&left.left.left.right.anim=49&left.left.left.right.block=50&left.left.left.right.layout=51&left.left.right=52&left.right=53&right=54&anim=55&fuse=56&speed=57&hide%20inputs=58&spin=59&block=60&i%20blocks=61&j%20blocks=62&layout=63&scheme=64&gap=65&scatter=66&molecule=67&blast=68&deco=69&legends=70&shape=71&spotlight=72&row%20guides=73&flow%20guides=74&lens%20size=75&magnification=76&interior%20spotlight=77&axes=78&viz=79&sensitivity=80&min%20size=81&min%20light=82&max%20light=83&elem%20scale=84&zero%20hue=85&hue%20gap=86&hue%20spread=87&diag=88&cam=89&x=90&y=91&z=92&cam.target=93&compress=94): - -![a full attention layer with the first half (MHA) in the background and the second (FFN) in the foreground](/assets/images/inside-the-matrix/attnlayer2.jpg){:style="width:100%"} - - - - -### 6b Visualizing the BPT partitioned layer - -Revisiting [Blockwise Parallel Transformer](https://arxiv.org/pdf/2305.19370.pdf) briefly, here we visualize BPT's parallelization scheme in the context of an entire attention layer (with individual heads elided per above). 
In particular, note how the partitioning along `i` (of sequence blocks) extends through both MHA and FFN halves ([open in mm](https://bhosmer.github.io/mm/index.html?0=layer_out%20%3D%20(attn_out%20%3D%20(attn%20%3D%20(Q%20%3D%20input%20%40%20wQ)%20%40%20(K_t%20%3D%20wK_t%20%40%20input_t))%20%40%20(V%20%3D%20input%20%40%20wV)%20%40%20wO)%20%40%20FFN_1%20%40%20FFN_2&1=layer_out&2=layernorm&16=closed&94=true&3.1=attn_out%20%40%20FFN_1&3.4=true&3.5=32&3.6=32&3.7=row%20major&3.8=&3.9=-1&3.10=1&3.11=0&3.2=gelu&12.13=inherit&14.15=1&14.16=closed&17.18=positive&17.19=left&17.20=bottom&17.21=back&22.2=layernorm&23.13=inherit&24.15=1&24.16=closed&25.18=negative&25.19=left&25.20=top&25.21=front&26.1=attn%20%40%20V&26.4=true&26.5=32&26.6=32&26.7=row%20major&26.8=&26.9=-1&26.10=1&26.11=0&26.2=none&27.13=vmprod&28.15=8&28.16=open&29.18=positive&29.19=left&29.20=bottom&29.21=back&30.1=attn&30.4=true&30.2=softmax(tril(x%2Fsqrt(k)))&31.1=Q&31.4=true&31.2=none&32.1=input&32.4=false&32.5=256&32.6=192&32.7=gaussian&32.9=-1&32.10=1&32.11=0&32.8=https%3A%2F%2Fraw.githubusercontent.com%2Fbhosmer%2Ftestdata%2Fmain%2Fweights%2Fgpt2%2Flayer5_input0_256_768.csv&32.0=&32.16=open&33.1=wQ&33.4=false&33.5=192&33.6=192&33.7=gaussian&33.9=-1&33.10=1&33.11=0&33.8=https%3A%2F%2Fraw.githubusercontent.com%2Fbhosmer%2Ftestdata%2Fmain%2Fweights%2Fgpt2%2Flayer5_wq4_768_64.csv&33.0=&33.16=open&34.13=vmprod&35.15=1&36.18=positive&36.19=left&36.20=bottom&36.21=back&31.16=closed&37.2=none&38.13=none&39.15=1&40.18=positive&40.19=right&40.20=top&40.21=back&41.1=wK_t&41.4=false&41.5=192&41.6=192&41.7=gaussian&41.8=https%3A%2F%2Fraw.githubusercontent.com%2Fbhosmer%2Ftestdata%2Fmain%2Fweights%2Fgpt2%2Flayer5_wk_t4_64_768.csv&41.9=-1&41.10=1&41.11=0&41.0=&41.16=open&42.1=input_t&42.4=false&42.5=192&42.6=256&42.7=gaussian&42.8=https%3A%2F%2Fraw.githubusercontent.com%2Fbhosmer%2Ftestdata%2Fmain%2Fweights%2Fgpt2%2Flayer5_input_t0_768_256.csv&42.9=-1&42.10=1&42.11=0&42.0=&42.16=open&37.1=K_t&37.4=true&37.16=closed&43.13=vmprod&44.15=1&44.16=open&45.18=negative&45.19=left&45.20=top&45.21=front&30.16=closed&46.1=V&46.4=true&46.2=none&47.1=input&47.4=false&47.5=256&47.6=192&47.7=gaussian&47.9=-1&47.10=1&47.11=0&47.8=https%3A%2F%2Fraw.githubusercontent.com%2Fbhosmer%2Ftestdata%2Fmain%2Fweights%2Fgpt2%2Flayer5_input0_256_768.csv&47.0=&47.16=open&48.1=wV&48.4=false&48.5=192&48.6=192&48.7=gaussian&48.9=-1&48.10=1&48.11=0&48.8=https%3A%2F%2Fraw.githubusercontent.com%2Fbhosmer%2Ftestdata%2Fmain%2Fweights%2Fgpt2%2Flayer5_wv4_768_64.csv&48.0=&48.16=open&49.13=none&50.15=1&51.18=negative&51.19=right&51.20=top&51.21=back&46.16=closed&26.16=open&52.1=wO&52.4=false&52.5=192&52.6=192&52.7=gaussian&52.8=https%3A%2F%2Fraw.githubusercontent.com%2Fbhosmer%2Ftestdata%2Fmain%2Fweights%2Fgpt2%2Flayer5_wo4_64_768.csv&52.9=-1&52.10=0.996&52.11=0&52.0=&52.16=closed&22.1=attn_out&22.4=true&22.16=open&53.1=FFN_1&53.4=false&53.5=192&53.6=768&53.7=gaussian&53.8=&53.9=-1&53.10=1&53.11=0&53.0=&53.16=closed&3.16=open&54.1=FFN_2&54.4=false&54.5=768&54.6=192&54.7=gaussian&54.8=&54.9=-1&54.10=1&54.11=0&54.0=&54.16=closed&55.56=sync&55.57=16&55.58=false&55.13=none&55.59=0&55.16=closed&60.61=8&60.62=1&60.15=1&60.16=open&63.64=blocks&63.65=8&63.66=0&63.67=1&63.68=0&63.18=negative&63.19=left&63.20=top&63.21=front&63.16=closed&69.70=10&69.71=true&69.72=4&69.73=0.507&69.74=0.524&69.75=0.5&69.76=12&69.77=false&69.78=false&69.16=closed&79.80=local&79.81=0.1&79.82=0.2&79.83=0.9&79.84=2&79.85=0.75&79.86=0.75&79.87=0.03&79.16=closed&88.8=&88.16=open&89.90=-766.4372214429399&89.91=955.1488
380935747&89.92=994.4406298292719&93.90=0&93.91=0&93.92=0&expr=0&name=1&epilog=2&left=3&matmul=4&h=5&w=6&init=7&url=8&min=9&max=10&dropout=11&left.anim=12&alg=13&left.block=14&k%20blocks=15&folder=16&left.layout=17&polarity=18&left%20placement=19&right%20placement=20&result%20placement=21&left.left=22&left.left.anim=23&left.left.block=24&left.left.layout=25&left.left.left=26&left.left.left.anim=27&left.left.left.block=28&left.left.left.layout=29&left.left.left.left=30&left.left.left.left.left=31&left.left.left.left.left.left=32&left.left.left.left.left.right=33&left.left.left.left.left.anim=34&left.left.left.left.left.block=35&left.left.left.left.left.layout=36&left.left.left.left.right=37&left.left.left.left.right.anim=38&left.left.left.left.right.block=39&left.left.left.left.right.layout=40&left.left.left.left.right.left=41&left.left.left.left.right.right=42&left.left.left.left.anim=43&left.left.left.left.block=44&left.left.left.left.layout=45&left.left.left.right=46&left.left.left.right.left=47&left.left.left.right.right=48&left.left.left.right.anim=49&left.left.left.right.block=50&left.left.left.right.layout=51&left.left.right=52&left.right=53&right=54&anim=55&fuse=56&speed=57&hide%20inputs=58&spin=59&block=60&i%20blocks=61&j%20blocks=62&layout=63&scheme=64&gap=65&scatter=66&molecule=67&blast=68&deco=69&legends=70&shape=71&spotlight=72&row%20guides=73&flow%20guides=74&lens%20size=75&magnification=76&interior%20spotlight=77&axes=78&viz=79&sensitivity=80&min%20size=81&min%20light=82&max%20light=83&elem%20scale=84&zero%20hue=85&hue%20gap=86&hue%20spread=87&diag=88&cam=89&x=90&y=91&z=92&cam.target=93&compress=94)): - - -![visualize BPT's parallelization scheme in the context of an entire attention layer](/assets/images/inside-the-matrix/bptlayer.jpg){:style="width:100%"} - - - - -### 6c Partitioning the FFN - -The visualization suggests an additional partitioning, orthogonal to the ones described above - in the FFN half of the attention layer, splitting the double matmul `(attn_out @ FFN_1) @ FFN_2`, first along `j` for `attn_out @ FFN_1`, then along `k` in the subsequent matmul with `FFN_2`. This partition slices both layers of `FFN` weights, reducing the capacity requirements of each participant in the computation at the cost of a final summation of the partial results. 
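To make the arithmetic of this FFN partition concrete, here is a minimal PyTorch sketch (my own illustration, not code from the tool) using the downsized dimensions above and random values. Each participant holds one column block of `FFN_1` and the matching row block of `FFN_2`, computes a full-sized partial result, and the partial results are summed at the end; the elementwise activation between the two matmuls is omitted, but since it acts independently on each column of the intermediate it would not change the decomposition.

```
import torch

d_model, d_ffn, seq_len, shards = 192, 768, 256, 8   # downsized dims, as above

attn_out = torch.randn(seq_len, d_model)
FFN_1 = torch.randn(d_model, d_ffn)
FFN_2 = torch.randn(d_ffn, d_model)

# Unpartitioned reference: (attn_out @ FFN_1) @ FFN_2
reference = (attn_out @ FFN_1) @ FFN_2

# Partitioned: split FFN_1 along j (columns) and FFN_2 along k (rows).
# Each shard produces a full seq_len x d_model partial result.
partials = [
    (attn_out @ f1) @ f2
    for f1, f2 in zip(FFN_1.chunk(shards, dim=1), FFN_2.chunk(shards, dim=0))
]
partitioned = torch.stack(partials).sum(dim=0)   # final summation of the partials

print(torch.allclose(partitioned, reference, rtol=1e-3, atol=1e-2))   # True, up to float error
```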
- -Here's what this partition looks like applied to an otherwise unpartitioned attention layer ([open in mm](https://bhosmer.github.io/mm/index.html?0=layer_out+%3D+%28attn_out+%3D+%28attn+%3D+%28Q+%3D+input+%40+wQ%29+%40+%28K_t+%3D+wK_t+%40+input_t%29%29+%40+%28V+%3D+input+%40+wV%29+%40+wO%29+%40+FFN_1+%40+FFN_2&1=layer_out&2=layernorm&16=closed&94=true&3.1=attn_out+%40+FFN_1&3.4=true&3.5=32&3.6=32&3.7=row+major&3.8=&3.9=-1&3.10=1&3.11=0&3.2=gelu&12.13=inherit&14.15=1&14.16=open&14.17=1&14.18=8&19.20=positive&19.21=left&19.22=bottom&19.23=back&24.2=layernorm&25.13=inherit&26.15=1&26.16=closed&26.17=1&26.18=1&27.20=negative&27.21=left&27.22=top&27.23=front&28.1=attn+%40+V&28.4=true&28.5=32&28.6=32&28.7=row+major&28.8=&28.9=-1&28.10=1&28.11=0&28.2=none&29.13=vmprod&30.15=1&30.16=open&30.17=1&30.18=1&31.20=positive&31.21=left&31.22=bottom&31.23=back&32.1=attn&32.4=true&32.2=softmax%28tril%28x%2Fsqrt%28k%29%29%29&33.1=Q&33.4=true&33.2=none&34.1=input&34.4=false&34.5=256&34.6=192&34.7=gaussian&34.9=-1&34.10=1&34.11=0&34.8=https%3A%2F%2Fraw.githubusercontent.com%2Fbhosmer%2Ftestdata%2Fmain%2Fweights%2Fgpt2%2Flayer5_input0_256_768.csv&34.0=&34.16=open&35.1=wQ&35.4=false&35.5=192&35.6=192&35.7=gaussian&35.9=-1&35.10=1&35.11=0&35.8=https%3A%2F%2Fraw.githubusercontent.com%2Fbhosmer%2Ftestdata%2Fmain%2Fweights%2Fgpt2%2Flayer5_wq4_768_64.csv&35.0=&35.16=open&36.13=vmprod&37.15=1&37.17=1&37.18=1&38.20=positive&38.21=left&38.22=bottom&38.23=back&33.16=closed&39.2=none&40.13=none&41.15=1&41.17=1&41.18=1&42.20=positive&42.21=right&42.22=top&42.23=back&43.1=wK_t&43.4=false&43.5=192&43.6=192&43.7=gaussian&43.8=https%3A%2F%2Fraw.githubusercontent.com%2Fbhosmer%2Ftestdata%2Fmain%2Fweights%2Fgpt2%2Flayer5_wk_t4_64_768.csv&43.9=-1&43.10=1&43.11=0&43.0=&43.16=open&44.1=input_t&44.4=false&44.5=192&44.6=256&44.7=gaussian&44.8=https%3A%2F%2Fraw.githubusercontent.com%2Fbhosmer%2Ftestdata%2Fmain%2Fweights%2Fgpt2%2Flayer5_input_t0_768_256.csv&44.9=-1&44.10=1&44.11=0&44.0=&44.16=open&39.1=K_t&39.4=true&39.16=closed&45.13=vmprod&46.15=1&46.16=open&46.17=1&46.18=1&47.20=negative&47.21=left&47.22=top&47.23=front&32.16=closed&48.1=V&48.4=true&48.2=none&49.1=input&49.4=false&49.5=256&49.6=192&49.7=gaussian&49.9=-1&49.10=1&49.11=0&49.8=https%3A%2F%2Fraw.githubusercontent.com%2Fbhosmer%2Ftestdata%2Fmain%2Fweights%2Fgpt2%2Flayer5_input0_256_768.csv&49.0=&49.16=open&50.1=wV&50.4=false&50.5=192&50.6=192&50.7=gaussian&50.9=-1&50.10=1&50.11=0&50.8=https%3A%2F%2Fraw.githubusercontent.com%2Fbhosmer%2Ftestdata%2Fmain%2Fweights%2Fgpt2%2Flayer5_wv4_768_64.csv&50.0=&50.16=open&51.13=none&52.15=1&52.17=1&52.18=1&53.20=negative&53.21=right&53.22=top&53.23=back&48.16=closed&28.16=open&54.1=wO&54.4=false&54.5=192&54.6=192&54.7=gaussian&54.8=https%3A%2F%2Fraw.githubusercontent.com%2Fbhosmer%2Ftestdata%2Fmain%2Fweights%2Fgpt2%2Flayer5_wo4_64_768.csv&54.9=-1&54.10=0.996&54.11=0&54.0=&54.16=closed&24.1=attn_out&24.4=true&24.16=closed&55.1=FFN_1&55.4=false&55.5=192&55.6=768&55.7=gaussian&55.8=&55.9=-1&55.10=1&55.11=0&55.0=&55.16=closed&3.16=open&56.1=FFN_2&56.4=false&56.5=768&56.6=192&56.7=gaussian&56.8=&56.9=-1&56.10=1&56.11=0&56.0=&56.16=closed&57.58=sync&57.59=16&57.60=false&57.13=none&57.61=0&57.16=closed&62.17=1&62.15=8&62.18=1&62.16=closed&63.64=blocks&63.65=8&63.66=0&63.67=1&63.68=0&63.20=negative&63.21=left&63.22=top&63.23=front&63.16=closed&69.70=10&69.71=true&69.72=4&69.73=0.507&69.74=0.524&69.75=0.5&69.76=12&69.77=false&69.78=false&69.16=closed&79.80=local&79.81=0.1&79.82=0.2&79.83=0.9&79.84=2&79.85=0.75&79.86=0.75&79.87=0.03&79.16=cl
osed&88.8=&88.16=open&89.90=-725.0392607527422&89.91=909.2543497392985&89.92=1420.9035091451585&93.90=84.4062135237143&93.91=2.295441349889614&93.92=60.16668289640925&expr=0&name=1&epilog=2&left=3&matmul=4&h=5&w=6&init=7&url=8&min=9&max=10&dropout=11&left.anim=12&alg=13&left.block=14&k+blocks=15&folder=16&i+blocks=17&j+blocks=18&left.layout=19&polarity=20&left+placement=21&right+placement=22&result+placement=23&left.left=24&left.left.anim=25&left.left.block=26&left.left.layout=27&left.left.left=28&left.left.left.anim=29&left.left.left.block=30&left.left.left.layout=31&left.left.left.left=32&left.left.left.left.left=33&left.left.left.left.left.left=34&left.left.left.left.left.right=35&left.left.left.left.left.anim=36&left.left.left.left.left.block=37&left.left.left.left.left.layout=38&left.left.left.left.right=39&left.left.left.left.right.anim=40&left.left.left.left.right.block=41&left.left.left.left.right.layout=42&left.left.left.left.right.left=43&left.left.left.left.right.right=44&left.left.left.left.anim=45&left.left.left.left.block=46&left.left.left.left.layout=47&left.left.left.right=48&left.left.left.right.left=49&left.left.left.right.right=50&left.left.left.right.anim=51&left.left.left.right.block=52&left.left.left.right.layout=53&left.left.right=54&left.right=55&right=56&anim=57&fuse=58&speed=59&hide+inputs=60&spin=61&block=62&layout=63&scheme=64&gap=65&scatter=66&molecule=67&blast=68&deco=69&legends=70&shape=71&spotlight=72&row+guides=73&flow+guides=74&lens+size=75&magnification=76&interior+spotlight=77&axes=78&viz=79&sensitivity=80&min+size=81&min+light=82&max+light=83&elem+scale=84&zero+hue=85&hue+gap=86&hue+spread=87&diag=88&cam=89&x=90&y=91&z=92&cam.target=93&compress=94)): - -![what this partition looks like applied to an otherwise unpartitioned attention layer](/assets/images/inside-the-matrix/attnlayer_ffnsplitk.jpg){:style="width:100%"} - - - -And here it is applied to a layer partitioned a la BPT ([open in 
mm](https://bhosmer.github.io/mm/index.html?0=layer_out+%3D+%28attn_out+%3D+%28attn+%3D+%28Q+%3D+input+%40+wQ%29+%40+%28K_t+%3D+wK_t+%40+input_t%29%29+%40+%28V+%3D+input+%40+wV%29+%40+wO%29+%40+FFN_1+%40+FFN_2&1=layer_out&2=layernorm&16=closed&94=true&3.1=attn_out+%40+FFN_1&3.4=true&3.5=32&3.6=32&3.7=row+major&3.8=&3.9=-1&3.10=1&3.11=0&3.2=gelu&12.13=inherit&14.15=1&14.16=open&14.17=8&14.18=8&19.20=positive&19.21=left&19.22=bottom&19.23=back&24.2=layernorm&25.13=inherit&26.15=1&26.16=closed&26.17=8&26.18=1&27.20=negative&27.21=left&27.22=top&27.23=front&28.1=attn+%40+V&28.4=true&28.5=32&28.6=32&28.7=row+major&28.8=&28.9=-1&28.10=1&28.11=0&28.2=none&29.13=vmprod&30.15=8&30.16=open&30.17=8&30.18=1&31.20=positive&31.21=left&31.22=bottom&31.23=back&32.1=attn&32.4=true&32.2=softmax%28tril%28x%2Fsqrt%28k%29%29%29&33.1=Q&33.4=true&33.2=none&34.1=input&34.4=false&34.5=256&34.6=192&34.7=gaussian&34.9=-1&34.10=1&34.11=0&34.8=https%3A%2F%2Fraw.githubusercontent.com%2Fbhosmer%2Ftestdata%2Fmain%2Fweights%2Fgpt2%2Flayer5_input0_256_768.csv&34.0=&34.16=open&35.1=wQ&35.4=false&35.5=192&35.6=192&35.7=gaussian&35.9=-1&35.10=1&35.11=0&35.8=https%3A%2F%2Fraw.githubusercontent.com%2Fbhosmer%2Ftestdata%2Fmain%2Fweights%2Fgpt2%2Flayer5_wq4_768_64.csv&35.0=&35.16=open&36.13=vmprod&37.15=1&37.17=8&37.18=1&38.20=positive&38.21=left&38.22=bottom&38.23=back&33.16=closed&39.2=none&40.13=none&41.15=1&41.17=1&41.18=1&42.20=positive&42.21=right&42.22=top&42.23=back&43.1=wK_t&43.4=false&43.5=192&43.6=192&43.7=gaussian&43.8=https%3A%2F%2Fraw.githubusercontent.com%2Fbhosmer%2Ftestdata%2Fmain%2Fweights%2Fgpt2%2Flayer5_wk_t4_64_768.csv&43.9=-1&43.10=1&43.11=0&43.0=&43.16=open&44.1=input_t&44.4=false&44.5=192&44.6=256&44.7=gaussian&44.8=https%3A%2F%2Fraw.githubusercontent.com%2Fbhosmer%2Ftestdata%2Fmain%2Fweights%2Fgpt2%2Flayer5_input_t0_768_256.csv&44.9=-1&44.10=1&44.11=0&44.0=&44.16=open&39.1=K_t&39.4=true&39.16=closed&45.13=vmprod&46.15=1&46.16=open&46.17=8&46.18=1&47.20=negative&47.21=left&47.22=top&47.23=front&32.16=closed&48.1=V&48.4=true&48.2=none&49.1=input&49.4=false&49.5=256&49.6=192&49.7=gaussian&49.9=-1&49.10=1&49.11=0&49.8=https%3A%2F%2Fraw.githubusercontent.com%2Fbhosmer%2Ftestdata%2Fmain%2Fweights%2Fgpt2%2Flayer5_input0_256_768.csv&49.0=&49.16=open&50.1=wV&50.4=false&50.5=192&50.6=192&50.7=gaussian&50.9=-1&50.10=1&50.11=0&50.8=https%3A%2F%2Fraw.githubusercontent.com%2Fbhosmer%2Ftestdata%2Fmain%2Fweights%2Fgpt2%2Flayer5_wv4_768_64.csv&50.0=&50.16=open&51.13=none&52.15=1&52.17=1&52.18=1&53.20=negative&53.21=right&53.22=top&53.23=back&48.16=closed&28.16=open&54.1=wO&54.4=false&54.5=192&54.6=192&54.7=gaussian&54.8=https%3A%2F%2Fraw.githubusercontent.com%2Fbhosmer%2Ftestdata%2Fmain%2Fweights%2Fgpt2%2Flayer5_wo4_64_768.csv&54.9=-1&54.10=0.996&54.11=0&54.0=&54.16=closed&24.1=attn_out&24.4=true&24.16=closed&55.1=FFN_1&55.4=false&55.5=192&55.6=768&55.7=gaussian&55.8=&55.9=-1&55.10=1&55.11=0&55.0=&55.16=closed&3.16=open&56.1=FFN_2&56.4=false&56.5=768&56.6=192&56.7=gaussian&56.8=&56.9=-1&56.10=1&56.11=0&56.0=&56.16=closed&57.58=sync&57.59=16&57.60=false&57.13=none&57.61=0&57.16=closed&62.17=8&62.15=8&62.18=1&62.16=open&63.64=blocks&63.65=8&63.66=0&63.67=1&63.68=0&63.20=negative&63.21=left&63.22=top&63.23=front&63.16=closed&69.70=10&69.71=true&69.72=4&69.73=0.507&69.74=0.524&69.75=0.5&69.76=12&69.77=false&69.78=false&69.16=closed&79.80=local&79.81=0.1&79.82=0.2&79.83=0.9&79.84=2&79.85=0.75&79.86=0.75&79.87=0.03&79.16=closed&88.8=&88.16=open&89.90=-908.5990219796431&89.91=1012.984380609292&89.92=1378.7815259698948&93.90=57.97
8218494193676&93.91=-30.847130586978256&93.92=41.66771129059017&expr=0&name=1&epilog=2&left=3&matmul=4&h=5&w=6&init=7&url=8&min=9&max=10&dropout=11&left.anim=12&alg=13&left.block=14&k+blocks=15&folder=16&i+blocks=17&j+blocks=18&left.layout=19&polarity=20&left+placement=21&right+placement=22&result+placement=23&left.left=24&left.left.anim=25&left.left.block=26&left.left.layout=27&left.left.left=28&left.left.left.anim=29&left.left.left.block=30&left.left.left.layout=31&left.left.left.left=32&left.left.left.left.left=33&left.left.left.left.left.left=34&left.left.left.left.left.right=35&left.left.left.left.left.anim=36&left.left.left.left.left.block=37&left.left.left.left.left.layout=38&left.left.left.left.right=39&left.left.left.left.right.anim=40&left.left.left.left.right.block=41&left.left.left.left.right.layout=42&left.left.left.left.right.left=43&left.left.left.left.right.right=44&left.left.left.left.anim=45&left.left.left.left.block=46&left.left.left.left.layout=47&left.left.left.right=48&left.left.left.right.left=49&left.left.left.right.right=50&left.left.left.right.anim=51&left.left.left.right.block=52&left.left.left.right.layout=53&left.left.right=54&left.right=55&right=56&anim=57&fuse=58&speed=59&hide+inputs=60&spin=61&block=62&layout=63&scheme=64&gap=65&scatter=66&molecule=67&blast=68&deco=69&legends=70&shape=71&spotlight=72&row+guides=73&flow+guides=74&lens+size=75&magnification=76&interior+spotlight=77&axes=78&viz=79&sensitivity=80&min+size=81&min+light=82&max+light=83&elem+scale=84&zero+hue=85&hue+gap=86&hue+spread=87&diag=88&cam=89&x=90&y=91&z=92&cam.target=93&compress=94)): - -![applied to a layer partitioned a la BPT](/assets/images/inside-the-matrix/bptlayer_ffnsplitk.jpg){:style="width:100%"} - - - -### 6d Visualizing token-at-a-time decoding - -During autoregressive token-at-a-time decoding, the query vector consists of a single token. It's instructive to have a mental picture of what an attention layer looks like in that situation - a single embedding row working its way through an enormous tiled plane of weights. 
- -Aside from the emphasizing the sheer immensity of weights compared to activations, this view is also evocative of the notion that `K_t` and `V` function like dynamically generated layers in a 6-layer MLP, although the mux/demux computations of MHA itself (papered over here, per above) make the correspondence inexact ([open in mm](https://bhosmer.github.io/mm/index.html?0=layer_out%20%3D%20(attn_out%20%3D%20(attn%20%3D%20(Q%20%3D%20input%20%40%20wQ)%20%40%20K_t)%20%40%20V%20%40%20wO)%20%40%20FFN_1%20%40%20FFN_2&1=layer_out&2=layernorm&16=closed&84=true&3.1=attn_out%20%40%20FFN_1&3.4=true&3.5=32&3.6=32&3.7=row%20major&3.8=&3.9=-1&3.10=1&3.11=0&3.2=gelu&12.13=inherit&14.15=1&14.16=open&17.18=positive&17.19=left&17.20=bottom&17.21=back&22.2=layernorm&23.13=inherit&24.15=1&24.16=open&25.18=negative&25.19=left&25.20=top&25.21=front&26.1=attn%20%40%20V&26.4=true&26.5=32&26.6=32&26.7=row%20major&26.8=&26.9=-1&26.10=1&26.11=0&26.2=none&27.13=vmprod&28.15=1&28.16=open&29.18=positive&29.19=left&29.20=bottom&29.21=back&30.1=attn&30.4=true&30.2=softmax(tril(x%2Fsqrt(k)))&31.1=Q&31.4=true&31.2=none&32.1=input&32.4=false&32.5=1&32.6=192&32.7=gaussian&32.9=-1&32.10=1&32.11=0&32.8=https%3A%2F%2Fraw.githubusercontent.com%2Fbhosmer%2Ftestdata%2Fmain%2Fweights%2Fgpt2%2Flayer5_input0_256_768.csv&32.0=&32.16=open&33.1=wQ&33.4=false&33.5=192&33.6=192&33.7=gaussian&33.9=-1&33.10=1&33.11=0&33.8=https%3A%2F%2Fraw.githubusercontent.com%2Fbhosmer%2Ftestdata%2Fmain%2Fweights%2Fgpt2%2Flayer5_wq4_768_64.csv&33.0=&33.16=open&34.13=vmprod&35.15=1&36.18=positive&36.19=left&36.20=bottom&36.21=back&31.16=open&37.1=K_t&37.4=false&37.5=192&37.6=256&37.7=gaussian&37.8=https%3A%2F%2Fraw.githubusercontent.com%2Fbhosmer%2Ftestdata%2Fmain%2Fweights%2Fgpt2%2Flayer5_input_t0_768_256.csv&37.9=-1&37.10=1&37.11=0&37.0=&37.16=open&38.13=vmprod&39.15=1&39.16=open&40.18=negative&40.19=left&40.20=top&40.21=front&30.16=open&41.1=V&41.4=false&41.16=open&41.5=256&41.6=192&41.7=gaussian&41.8=https%3A%2F%2Fraw.githubusercontent.com%2Fbhosmer%2Ftestdata%2Fmain%2Fweights%2Fgpt2%2Flayer5_wv4_768_64.csv&41.0=&41.9=-1&41.10=1&41.11=0&26.16=open&42.1=wO&42.4=false&42.5=192&42.6=192&42.7=gaussian&42.8=https%3A%2F%2Fraw.githubusercontent.com%2Fbhosmer%2Ftestdata%2Fmain%2Fweights%2Fgpt2%2Flayer5_wo4_64_768.csv&42.9=-1&42.10=0.996&42.11=0&42.0=&42.16=closed&22.1=attn_out&22.4=true&22.16=open&43.1=FFN_1&43.4=false&43.5=192&43.6=768&43.7=gaussian&43.8=&43.9=-1&43.10=1&43.11=0&43.0=&43.16=closed&3.16=open&44.1=FFN_2&44.4=false&44.5=768&44.6=192&44.7=gaussian&44.8=&44.9=-1&44.10=1&44.11=0&44.0=&44.16=closed&45.46=sync&45.47=16&45.48=false&45.13=none&45.49=0&45.16=closed&50.51=1&50.52=1&50.15=1&50.16=closed&53.54=blocks&53.55=8&53.56=0&53.57=1&53.58=0&53.18=negative&53.19=left&53.20=top&53.21=front&53.16=closed&59.60=10&59.61=true&59.62=4&59.63=0.507&59.64=0.524&59.65=0.5&59.66=12&59.67=false&59.68=false&59.16=closed&69.70=local&69.71=0.1&69.72=0.2&69.73=0.9&69.74=2&69.75=0.75&69.76=0.75&69.77=0.03&69.16=closed&78.8=&78.16=open&79.80=-621.3352100945762&79.81=774.3199147754915&79.82=806.172978523008&83.80=0&83.81=0&83.82=0&expr=0&name=1&epilog=2&left=3&matmul=4&h=5&w=6&init=7&url=8&min=9&max=10&dropout=11&left.anim=12&alg=13&left.block=14&k%20blocks=15&folder=16&left.layout=17&polarity=18&left%20placement=19&right%20placement=20&result%20placement=21&left.left=22&left.left.anim=23&left.left.block=24&left.left.layout=25&left.left.left=26&left.left.left.anim=27&left.left.left.block=28&left.left.left.layout=29&left.left.left.left=30&left.left.left.left.left=31&l
eft.left.left.left.left.left=32&left.left.left.left.left.right=33&left.left.left.left.left.anim=34&left.left.left.left.left.block=35&left.left.left.left.left.layout=36&left.left.left.left.right=37&left.left.left.left.anim=38&left.left.left.left.block=39&left.left.left.left.layout=40&left.left.left.right=41&left.left.right=42&left.right=43&right=44&anim=45&fuse=46&speed=47&hide%20inputs=48&spin=49&block=50&i%20blocks=51&j%20blocks=52&layout=53&scheme=54&gap=55&scatter=56&molecule=57&blast=58&deco=59&legends=60&shape=61&spotlight=62&row%20guides=63&flow%20guides=64&lens%20size=65&magnification=66&interior%20spotlight=67&axes=68&viz=69&sensitivity=70&min%20size=71&min%20light=72&max%20light=73&elem%20scale=74&zero%20hue=75&hue%20gap=76&hue%20spread=77&diag=78&cam=79&x=80&y=81&z=82&cam.target=83&compress=84)): - -![the mux/demux computations of MHA itself](/assets/images/inside-the-matrix/decoding.jpg){:style="width:100%"} - - - - -## 7 LoRA - -The recent LoRA paper ([LoRA: Low-Rank Adaptation of Large Language Models](https://arxiv.org/pdf/2106.09685.pdf)) describes an efficient finetuning technique based on the idea that weight deltas introduced during finetuning are low-rank. Per the paper, this "allows us to train some dense layers in a neural network indirectly by optimizing rank decomposition matrices of the dense layers’ change during adaptation [...], while keeping the pre-trained weights frozen." - - -### 7a The basic idea - -In a nutshell, the key move is to train the _factors_ of a weight matrix rather than the matrix itself: replace an `I x J` weights tensor with a matmul of an `I x K` tensor and a `K x J` tensor, holding `K` to some small number. - -If `K` is small enough the size win can be huge, but the tradeoff is that lowering it lowers the rank of what the product can express. As a quick illustration of both the size savings and the structuring effect on the result, here's a matmul of random `128 x 4` left and `4 x 128` right arguments - a.k.a. a rank-4 factorization of a `128 x 128` matrix. 
Notice the vertical and horizontal patterning in `L @ R` ([open in mm](https://bhosmer.github.io/mm/?0=L%20%40%20R&1=L%20%40%20R&2=none&12=closed&59=true&3.1=L&3.4=false&3.5=128&3.6=4&3.7=gaussian&3.8=&3.9=-1&3.10=1&3.11=0&3.0=&3.12=open&13.1=R&13.4=false&13.5=4&13.6=128&13.7=gaussian&13.8=&13.9=-1&13.10=1&13.11=0&13.0=&14.15=none&14.16=1&14.17=false&14.18=none&14.19=0&14.20=1&14.21=1&14.22=1&23.20=1&23.22=1&23.21=1&24.25=blocks&24.26=11.214&24.27=0&24.28=1&24.29=0&24.30=negative&24.31=left&24.32=top&24.33=front&24.12=open&34.35=10&34.36=true&34.37=2&34.38=1&34.39=0&34.40=0.5&34.41=10&34.42=false&34.43=false&34.12=open&44.45=local&44.46=0.3&44.47=0.5&44.48=0.7&44.49=1.25&44.50=0.77&44.51=0.74&44.52=0.04&44.12=open&53.8=&54.55=-147.50937470998977&54.56=141.1312665550063&54.57=104.24779022699425&58.55=-7.97607936427614&58.56=3.391570600844088&58.57=-11.685048965074932&expr=0&name=1&epilog=2&left=3&matmul=4&h=5&w=6&init=7&url=8&min=9&max=10&dropout=11&folder=12&right=13&anim=14&fuse=15&speed=16&hide%20inputs=17&alg=18&spin=19&i%20blocks=20&k%20blocks=21&j%20blocks=22&block=23&layout=24&scheme=25&gap=26&scatter=27&molecule=28&blast=29&polarity=30&left%20placement=31&right%20placement=32&result%20placement=33&deco=34&legends=35&shape=36&spotlight=37&row%20guides=38&flow%20guides=39&lens%20size=40&magnification=41&interior%20spotlight=42&axes=43&viz=44&sensitivity=45&min%20size=46&min%20light=47&max%20light=48&elem%20scale=49&zero%20hue=50&hue%20gap=51&hue%20spread=52&diag=53&cam=54&x=55&y=56&z=57&cam.target=58&compress=59)): - -![a matmul of random 128 x 4 left and 4 x 128 right arguments](/assets/images/inside-the-matrix/lora_single.jpg){:style="width:100%"} - - - - -### 7b Applying LoRA to an attention head - -The way LoRA applies this factoring move to the fine tuning process is to - - - -* create a low-rank factorization for each weight tensor to be fine-tuned and train the factors, keeping the original weights frozen -* after fine tuning, multiply each pair of low-rank factors to get a matrix in the shape of the original weights tensor, and add it to the original pretrained weights tensor - -The following visualization shows an attention head with the weight tensors `wQ`, `wK_t`, `wV`, `wO` replaced by low rank factorizations `wQ_A @ wQ_B`, etc. 
Visually, the factor matrices show up as low fences along the edges of the windmill blades ([open in mm](https://bhosmer.github.io/mm/index.html?0=out%20%3D%20(attn%20%3D%20(Q%20%3D%20input%20%40%20(wQ%20%3D%20wQ_A%20%40%20wQ_B))%20%40%20(K_t%20%3D%20(wK_t%20%3D%20wK_t_A%20%40%20wK_t_B)%20%40%20input_t))%20%40%20(V%20%3D%20input%20%40%20(wV%20%3D%20wV_A%20%40%20wV_B))%20%40%20(wO%20%3D%20wO_A%20%40%20wO_B)&1=out&2=none&15=closed&104=true&3.1=attn%20%40%20V&3.4=true&3.5=32&3.6=32&3.7=rows&3.8=&3.9=-1&3.10=1&3.11=0&3.2=none&12.13=inherit&12.14=1&12.15=open&16.17=positive&16.18=left&16.19=bottom&16.20=back&21.1=attn&21.4=true&21.2=softmax(tril(x%2Fsqrt(k)))&22.1=Q&22.4=true&22.2=none&23.1=input&23.4=false&23.5=64&23.6=96&23.7=gaussian&23.9=-1&23.10=1&23.11=0&23.8=https%3A%2F%2Fraw.githubusercontent.com%2Fbhosmer%2Ftestdata%2Fmain%2Fweights%2Fgpt2%2Flayer0_input0_256_768.csv&23.0=&23.15=closed&24.1=wQ&24.4=true&24.2=none&25.1=wQ_A&25.4=false&25.5=96&25.6=8&25.7=gaussian&25.8=https%3A%2F%2Fraw.githubusercontent.com%2Fbhosmer%2Ftestdata%2Fmain%2Fweights%2Fgpt2%2Flayer0_wq0_768_64.csv&25.9=-1&25.10=1&25.11=0&25.0=&25.15=closed&26.1=wQ_B&26.4=false&26.5=8&26.6=32&26.7=gaussian&26.8=https%3A%2F%2Fraw.githubusercontent.com%2Fbhosmer%2Ftestdata%2Fmain%2Fweights%2Fgpt2%2Flayer0_wq0_768_64.csv&26.9=-1&26.10=1&26.11=0&26.0=&26.15=open&27.13=inherit&27.14=1&27.15=closed&28.17=negative&28.18=right&28.19=top&28.20=back&29.30=1&24.15=closed&31.13=inherit&31.14=1&31.15=closed&32.17=positive&32.18=left&32.19=bottom&32.20=back&33.30=1&22.15=closed&34.2=none&35.13=inherit&35.14=1&35.15=closed&36.17=positive&36.18=right&36.19=top&36.20=back&37.1=wK_t&37.4=true&37.2=none&38.1=wK_t_A&38.4=false&38.5=32&38.6=8&38.7=gaussian&38.8=https%3A%2F%2Fraw.githubusercontent.com%2Fbhosmer%2Ftestdata%2Fmain%2Fweights%2Fgpt2%2Flayer0_wk_t0_64_768.csv&38.9=-1&38.10=1&38.11=0&38.0=&38.15=open&39.1=wK_t_B&39.4=false&39.5=8&39.6=96&39.7=gaussian&39.8=https%3A%2F%2Fraw.githubusercontent.com%2Fbhosmer%2Ftestdata%2Fmain%2Fweights%2Fgpt2%2Flayer0_wk_t0_64_768.csv&39.9=-1&39.10=1&39.11=0&39.0=&39.15=closed&40.13=inherit&40.14=1&40.15=open&41.17=negative&41.18=left&41.19=bottom&41.20=back&42.30=1&37.15=closed&43.1=input_t&43.4=false&43.5=96&43.6=64&43.7=gaussian&43.8=https%3A%2F%2Fraw.githubusercontent.com%2Fbhosmer%2Ftestdata%2Fmain%2Fweights%2Fgpt2%2Flayer0_input_t0_768_256.csv&43.9=-1&43.10=1&43.11=0&43.0=&43.15=closed&34.1=K_t&34.4=true&44.30=1&34.15=closed&45.13=inherit&45.14=1&45.15=closed&46.17=negative&46.18=left&46.19=top&46.20=front&47.30=1&21.15=closed&48.1=V&48.4=true&48.2=none&49.1=input&49.4=false&49.5=64&49.6=96&49.7=gaussian&49.9=-1&49.10=1&49.11=0&49.8=https%3A%2F%2Fraw.githubusercontent.com%2Fbhosmer%2Ftestdata%2Fmain%2Fweights%2Fgpt2%2Flayer0_input0_256_768.csv&49.0=&49.15=closed&50.1=wV&50.4=true&50.2=none&51.1=wV_A&51.4=false&51.5=96&51.6=8&51.7=gaussian&51.8=https%3A%2F%2Fraw.githubusercontent.com%2Fbhosmer%2Ftestdata%2Fmain%2Fweights%2Fgpt2%2Flayer0_wv0_768_64.csv&51.9=-1&51.10=1&51.11=0&51.0=&51.15=open&52.1=wV_B&52.4=false&52.5=8&52.6=32&52.7=gaussian&52.8=https%3A%2F%2Fraw.githubusercontent.com%2Fbhosmer%2Ftestdata%2Fmain%2Fweights%2Fgpt2%2Flayer0_wv0_768_64.csv&52.9=-1&52.10=1&52.11=0&52.0=&52.15=open&53.13=inherit&53.14=1&53.15=open&54.17=positive&54.18=left&54.19=bottom&54.20=back&55.30=1&50.15=closed&56.13=inherit&56.14=1&56.15=closed&57.17=negative&57.18=right&57.19=top&57.20=back&58.30=1&48.15=closed&3.15=closed&59.30=1&60.1=wO&60.4=true&60.5=32&60.6=32&60.7=cols&60.8=&60.9=-1&60.10=1&60.11=0&60.2=none&61.1
=wO_A&61.4=false&61.5=32&61.6=8&61.7=gaussian&61.8=https%3A%2F%2Fraw.githubusercontent.com%2Fbhosmer%2Ftestdata%2Fmain%2Fweights%2Fgpt2%2Flayer0_wo0_64_768.csv&61.9=-1&61.10=1&61.11=0&61.0=&61.15=open&62.1=wO_B&62.4=false&62.5=8&62.6=96&62.7=gaussian&62.8=https%3A%2F%2Fraw.githubusercontent.com%2Fbhosmer%2Ftestdata%2Fmain%2Fweights%2Fgpt2%2Flayer0_wo0_64_768.csv&62.9=-1&62.10=1&62.11=0&62.0=&62.15=closed&63.13=inherit&63.14=1&63.15=closed&64.17=positive&64.18=right&64.19=top&64.20=back&60.15=closed&65.30=1&66.67=none&66.68=100&66.69=false&66.13=none&66.70=-3&66.71=1&66.30=1&66.14=1&66.15=open&72.71=1&72.14=1&72.30=1&73.74=blocks&73.75=2&73.76=8&73.77=2&73.78=0&73.17=negative&73.18=left&73.19=top&73.20=front&73.15=closed&79.80=6.5&79.81=false&79.82=1&79.83=1&79.84=0.757&79.85=0.5&79.86=10&79.87=false&79.88=false&79.15=open&89.90=local&89.91=0.2&89.92=0.3&89.93=1&89.94=1.25&89.95=0.75&89.96=0.75&89.97=0.03&89.15=closed&98.8=&98.15=closed&99.100=-172.19348886030096&99.101=179.87607098671913&99.102=291.20723943546824&103.100=-6.083689158200286&103.101=-2.2203698054118064&103.102=-20.406063431589725&expr=0&name=1&epilog=2&left=3&matmul=4&h=5&w=6&init=7&url=8&min=9&max=10&dropout=11&left.anim=12&alg=13&j%20blocks=14&folder=15&left.layout=16&polarity=17&left%20placement=18&right%20placement=19&result%20placement=20&left.left=21&left.left.left=22&left.left.left.left=23&left.left.left.right=24&left.left.left.right.left=25&left.left.left.right.right=26&left.left.left.right.anim=27&left.left.left.right.layout=28&left.left.left.right.block=29&k%20blocks=30&left.left.left.anim=31&left.left.left.layout=32&left.left.left.block=33&left.left.right=34&left.left.right.anim=35&left.left.right.layout=36&left.left.right.left=37&left.left.right.left.left=38&left.left.right.left.right=39&left.left.right.left.anim=40&left.left.right.left.layout=41&left.left.right.left.block=42&left.left.right.right=43&left.left.right.block=44&left.left.anim=45&left.left.layout=46&left.left.block=47&left.right=48&left.right.left=49&left.right.right=50&left.right.right.left=51&left.right.right.right=52&left.right.right.anim=53&left.right.right.layout=54&left.right.right.block=55&left.right.anim=56&left.right.layout=57&left.right.block=58&left.block=59&right=60&right.left=61&right.right=62&right.anim=63&right.layout=64&right.block=65&anim=66&fuse=67&speed=68&hide%20inputs=69&spin=70&i%20blocks=71&block=72&layout=73&scheme=74&gap=75&scatter=76&molecule=77&blast=78&deco=79&legends=80&shape=81&spotlight=82&row%20guides=83&flow%20guides=84&lens%20size=85&magnification=86&interior%20spotlight=87&axes=88&viz=89&sensitivity=90&min%20size=91&min%20light=92&max%20light=93&elem%20scale=94&zero%20hue=95&hue%20gap=96&hue%20spread=97&diag=98&cam=99&x=100&y=101&z=102&cam.target=103&compress=104) - spacebar stops the spin): - -
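To connect the two bullets above to code, here is a minimal sketch of the LoRA recipe for a single weight matrix (`wQ`), using the downsized dimensions from this note and a rank of 8; the scaling factor the paper applies to the delta is omitted. The pretrained weight stays frozen, only the small factors are trained, and after fine-tuning their product is folded back into a matrix of the original shape.

```
import torch

d, r = 192, 8                                  # downsized embedding dim, low rank

wQ = torch.randn(d, d)                         # pretrained weight: frozen
wQ_A = torch.randn(d, r, requires_grad=True)   # trainable factors; one of them starts
wQ_B = torch.zeros(r, d, requires_grad=True)   # at zero, so the delta begins at zero

x = torch.randn(1, d)

# During fine-tuning: frozen weight plus the low-rank delta.
q = x @ wQ + (x @ wQ_A) @ wQ_B

# After fine-tuning: fold the delta back into a single d x d matrix,
# so inference looks exactly like the original model.
with torch.no_grad():
    wQ_merged = wQ + wQ_A @ wQ_B

print(wQ_A.numel() + wQ_B.numel(), "trainable params vs", wQ.numel(), "frozen")   # 3072 vs 36864
```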
        - -## 8 Wrapup - - -### 8a Call for feedback - -I've found this way of visualizing matmul expressions extremely helpful for building intuition and reasoning about not just matrix multiplication itself, but also many aspects of ML models and their computation, from efficiency to interpretability. - -if you try it out and have suggestions or comments, I definitely want to hear, either in the comments here or [in the repo](https://github.com/bhosmer/mm). - - -### 8b Next steps - - -* There's a [GPT2 attention head explorer](https://bhosmer.github.io/mm/examples/attngpt2/index.html) built on top of the tool which I'm currently using to inventory and classify the attention head traits found in that model. (This was the tool I used to find and explore the attention heads in this note.) Once complete I plan to post a note with the inventory. -* As mentioned up top, embedding these visualizations in Python notebooks is [dead simple](https://colab.research.google.com/drive/1wZIoU20eRWKtRNCW7e5Iugm3MhfaE1f7?usp=sharing). But session URLs can get... unwieldy, so it will be useful to have Python-side utilities for constructing them from configuration objects, similar to the simple JavaScript helpers used in the [reference guide](https://bhosmer.github.io/mm/ref.html). -* If you've got a use case you think might benefit from visualizations like this but it's not obvious how to use the tool to do it, get in touch! I'm not necessarily looking to expand its core visualization capabilities that much further (right tool for the job, etc.), but e.g. the API for driving it programmatically is pretty basic, there's plenty that can be done there. \ No newline at end of file diff --git a/_posts/2023-10-02-announcing-docathon-h2-2023.md b/_posts/2023-10-02-announcing-docathon-h2-2023.md deleted file mode 100644 index 6ea07b32f54f..000000000000 --- a/_posts/2023-10-02-announcing-docathon-h2-2023.md +++ /dev/null @@ -1,33 +0,0 @@ ---- -layout: blog_detail -title: "Announcing PyTorch Docathon H2 2023" ---- - -We are excited to announce that we will be holding a Docathon for PyTorch on November 1, 2023! This event is an opportunity for our community to come together and improve the quality of our documentation. - -During the Docathon, we will focus on updating and improving existing content, as well as adding new tutorials and docstrings. We encourage all members of the community to participate and contribute their expertise to make our documentation even better. This is a great opportunity to learn and collaborate together. - -Check out our previous docathon success story [here](https://pytorch.org/blog/docathon-h1-2023-wrap-up/). - - -## Why Participate - -One of the best things about the Docathon is that you can make a tangible, positive impact on the quality of documentation in real time. This collaborative event brings together diverse team members from various companies, backgrounds, and roles, united to work towards a common goal. This event not only fosters team building and knowledge sharing but also presents an opportunity for individuals to acquire new skills, such as writing, editing, and utilizing documentation tools. Participating in a docathon can be particularly beneficial for team members who may lack experience in these areas. - -And of course all participants will be recognized for their contributions. Top participants will receive special awards. 
- - -## Event Details - - - -* Nov 1: Kick-off -* Nov 1- Nov 12: Submissions and Feedback -* Nov 13 - Nov 15: Final Reviews -* Nov 15: Winner Announcements - -Details for the Docathon to be announced at the kick-off call on November 1. - -To participate in the Docathon and receive updates about the event, register here: [RSVP](https://community.linuxfoundation.org/events/details/lfhq-pytorch-foundation-presents-fall-pytorch-docathon-nov-1st-rsvp/) - -We are excited to see the improvements that will come out of this Docathon, and we look forward to your participation! diff --git a/_posts/2023-10-03-interactive-chat-gen-model.md b/_posts/2023-10-03-interactive-chat-gen-model.md deleted file mode 100644 index df7501fe5922..000000000000 --- a/_posts/2023-10-03-interactive-chat-gen-model.md +++ /dev/null @@ -1,174 +0,0 @@ ---- -layout: blog_detail -title: "How to Build an Interactive Chat-Generation Model using DialoGPT and PyTorch" -author: Intel ---- - -The focus on interactive chat-generation (or conversational response-generation) models has greatly increased in the past several months. Conversational response-generation models such as ChatGPT and Google Bard have taken the AI world by storm. The purpose of interactive chat generation is to answer various questions posed by humans, and these AI based models use natural language processing (NLP) to generate conversations almost indistinguishable from those generated by humans. - - -This article showcases a [code sample](http://github.com/oneapi-src/oneAPI-samples/blob/master/AI-and-Analytics/Features-and-Functionality/IntelPytorch_Interactive_Chat_Quantization/IntelPytorch_Interactive_Chat_Quantization.ipynb) on how to create interactive chats based on a pre-trained DialoGPT model from Hugging Face with the addition of the [Intel® Extension for PyTorch](https://www.intel.com/content/www/us/en/developer/tools/oneapi/optimization-for-pytorch.html) to perform dynamic quantization on the model. - - -## Get Started - - -### Why DialoGPT? - - -DialoGPT (**Dialo**gue **G**enerative **P**re-trained **T**ransformer) is a large-scale, pre-trained dialogue-response-generation model trained on 147M conversation-like exchanges pulled out from Reddit comment chains and discussion threads. [DialoGPT](http://github.com/microsoft/DialoGPT) was proposed by Microsoft in 2019. The main goal was to create open-domain chatbots capable of producing natural responses to a variety of conversational topics. The conversational response-generation systems that leverage DialoGPT generate more applicable, resourceful, diverse, and context-specific replies. - - -### DialoGPT Architecture - - -DialoGPT architecture is based on the GPT-2 model. It is formulated as an autoregressive language model and uses a multi-layer transformer as the model architecture. GPT-2 was proposed by OpenAI. GPT-2 models are trained on general text data whereas DialoGPT is trained on Reddit discussion threads. - - -Let’s look at the GPT-2 architecture. There are two types of blocks in general transformer architecture: - -* Encoder - contains self-attention layer and feed-forward neural network -* Decoder - similar to encoder, but the self-attention layer is masked - -The self-attention layer allows a position to peak at tokens to the right of the current word (the successive words in text), whereas masked self-attention layer prevents that from happening. 
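As a minimal illustration of the difference (a generic sketch, not DialoGPT's actual implementation), a causal mask simply sets the scores for positions to the right of the current token to -inf before the softmax, so each row of the attention matrix distributes probability only over itself and earlier tokens:

```
import torch

T = 5
scores = torch.randn(T, T)                                # raw attention scores for 5 tokens
causal = torch.tril(torch.ones(T, T, dtype=torch.bool))   # lower-triangular (causal) mask
masked = scores.masked_fill(~causal, float("-inf"))       # block the "future" positions
attn = torch.softmax(masked, dim=-1)

print(attn)   # upper triangle is 0: no position attends to tokens to its right
```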
- - -![self-attention layer vs masked self-attention layer](/assets/images/f1-self-attention-vs-masked.png){:style="width:100%; max-width: 845px; display: block; margin-left: auto; margin-right: auto"} - - -GPT-2 is built using transformer decoder blocks. This means that the following layers are used in the architecture: - -1. Embedding Layer – responsible for converting input text into embeddings (each word is converted to a fixed-length vector representation) -2. Transformer Decoder – includes multiple decoder blocks with masked self-attention and feed forward neural network layers -3. Output Layer – responsible for converting embeddings obtained from the decoder into words - -GPT-2 architecture (and DialoGPT architecture) is shown below. - - -![GPT-2 architecture](/assets/images/f2-dialogpt-article.png){:style="width:100%; max-width: 651px; display: block; margin-left: auto; margin-right: auto"} - - - -As the model is based on transformers architecture, it has the issue of repetition and copying the inputs. To avoid repetition, we can use Top-K sampling and Top-p sampling. - -* Top-K sampling - filters the K most likely next words and redistributes the probability mass among only those K next words. -* Top-p sampling - rather than selecting only the most likely K words, selects the smallest possible set of words whose cumulative probability exceeds the probability p. - -The probability mass is then redistributed among the words in the set. As a result, the size of the set of words can be dynamically increased and decreased based on the probability distribution of the next word. - - -### Quantization using Intel® Extension for PyTorch - - -**What is Quantization?** - - -Quantization is a systematic reduction of the precision of all or several layers within the model. This means a higher-precision type, such as the single-precision floating-point (FP32) mostly used in deep learning, is converted into a lower-precision type such as FP16 (16 bits) or INT8 (8 bits). - - -This helps in achieving, - -* lower memory bandwidth -* lower storage -* higher performance with minimum-to-zero accuracy loss - -Quantization is especially important with large models such as those based on the Transformer architecture like BERT or GPT. - - -There are two types of quantization: - -* Static – Static quantization quantizes the weights and activations of the model. This quantization is used when both memory bandwidth and compute savings are important. -* Dynamic – In dynamic quantization, the weights are quantized ahead of time, but the activations are dynamically quantized during inference. - -**Intel Extension for PyTorch:** The Intel Extension extends PyTorch with up-to-date features and optimizations for an extra performance boost on Intel® hardware. Learn how to [install it standalone](http://github.com/intel/intel-extension-for-pytorch#installation) or get it a part of the [Intel® AI Analytics Toolkit](https://www.intel.com/content/www/us/en/developer/tools/oneapi/ai-analytics-toolkit.html). - - -The extension can be loaded as a Python* module or linked as a C++ library. Python users can enable it dynamically by importing intel_extension_for_pytorch. - -* This [CPU tutorial](http://intel.github.io/intel-extension-for-pytorch/cpu/latest/) gives detailed information about Intel Extension for PyTorch for Intel CPUs. Source code is available at the [master branch](http://github.com/intel/intel-extension-for-pytorch/tree/master). 
-* This [GPU tutorial](http://intel.github.io/intel-extension-for-pytorch/xpu/latest/) gives detailed information about Intel Extension for PyTorch for Intel GPUs. Source code is available at the [xpu-master branch](http://github.com/intel/intel-extension-for-pytorch/tree/xpu-master). - -**How to perform dynamic quantization using Intel Extension for PyTorch?** - - -Here are the steps to quantize an existing FP32 model to an INT8 model using dynamic quantization: - -1. Prepare the quantization configuration - We can use the default dynamic quantization configuration, **ipex.quantization.default_dynamic_qconfig**. -2. Prepare the FP32 model by using the **ipex.quantization.prepare** method (provide the input parameters such as the FP32 model to quantize, the prepared configuration, example inputs, and whether the quantization should be done in place). -3. Convert the model from FP32 to INT8 - Use the **ipex.quantization.convert** method for conversion. The input model will be the model prepared in step 2. - -We also encourage you to check out the [Intel® Neural Compressor](https://www.intel.com/content/www/us/en/developer/tools/oneapi/neural-compressor.html) tool that automates popular model-compression technologies such as quantization, pruning, and knowledge distillation across multiple [deep learning frameworks](https://www.intel.com/content/www/us/en/developer/tools/frameworks/overview.html). - - -## Code Sample - - -The following steps are implemented in the [code sample](http://github.com/oneapi-src/oneAPI-samples/blob/master/AI-and-Analytics/Features-and-Functionality/IntelPytorch_Interactive_Chat_Quantization/IntelPytorch_Interactive_Chat_Quantization.ipynb): - - -1. **Load model and tokenizer:** [Transformers library](http://huggingface.co/docs/transformers/index) (check out [Intel® Extension for Transformers](http://github.com/intel/intel-extension-for-transformers)) and [Auto Classes available in the Hugging Face Main Classes](http://huggingface.co/docs/transformers/model_doc/auto) are used in this step. These allow us to automatically find the relevant model by the given name. They also make it easy to change the model without major changes to the code on the developer's side, as shown below: -``` -tokenizer = AutoTokenizer.from_pretrained(model) -model = AutoModelForCausalLM.from_pretrained(model) -``` -The model parameter is specified as an input for the tokenizer, and for model initialization it is just the path to the pre-trained DialoGPT model. In this sample, we are using ‘microsoft/DialoGPT-large’. If you have limited resources, you can use ‘microsoft/DialoGPT-medium’ or ‘microsoft/DialoGPT-small’ models and receive comparable results. -2. **Perform dynamic quantization of the model:** - 1. Create the configuration using the default dynamic quantization configuration from the Intel Extension for PyTorch library. - 2. Prepare the model. - 3. Convert the model from FP32 to INT8. \ -The steps are explained in detail in the above section. -3. 
**Response generation:** The first step in response generation is to encode the input sentence as shown in the code below: -``` -new_input_ids = tokenizer.encode(input(">> You:") + tokenizer.eos_token, return_tensors='pt') -``` -In this sample, we want our model to save history, so we are adding input sentences in the form of tokens to the chat history: -``` -bot_input_ids = torch.cat([chat_history_ids, new_input_ids], dim=-1) if chat_round > 0 else new_input_ids -``` -The text generation can be done by the model.generate function, where we can specify all important parameters like saved chat history, length of the response in tokens, and usage of both Top-K and Top-p sampling. -``` -chat_history_ids = model.generate(bot_input_ids, do_sample=True, max_length=2000, top_k=50, top_p=0.95, pad_token_id=tokenizer.eos_token_id) -``` -The last step is to decode and print the response: -4. **Preparation for interactive conversation:** After response generation, the last step is to add interaction. This can be done by using a simple for loop. Based on the initialized tokenizer, model, and empty chat history, responses are generated for a number of rounds: -``` -for chat_round in range(n): -chat_history_ids = generate_response( -tokenizer, -model, -chat_round, -chat_history_ids -) -``` -An example of interactive chat generation will look like the one shown in the picture below. - - -![An example of interactive chat generation](/assets/images/f3-dialogpt-interaction.png){:style="width:100%; max-width: 981px; display: block; margin-left: auto; margin-right: auto"} - - -## What’s Next? - - -Get started with interactive chat-generation models using Intel Extension for PyTorch and DialoGPT. Download and try the [Intel AI Analytics Toolkit](https://www.intel.com/content/www/us/en/developer/tools/oneapi/ai-analytics-toolkit.html) and [Intel Extension for PyTorch](https://www.intel.com/content/www/us/en/developer/tools/oneapi/optimization-for-pytorch.html) for yourself to build various end-to-end AI applications. - - -We encourage you to also check out and incorporate Intel’s other [AI/ML Framework optimizations](https://www.intel.com/content/www/us/en/developer/tools/frameworks/overview.html) and [end-to-end portfolio of tools](https://www.intel.com/content/www/us/en/developer/topic-technology/artificial-intelligence/tools.html) into your AI workflow and learn about the unified, open, standards-based [oneAPI](https://www.intel.com/content/www/us/en/developer/tools/oneapi/overview.html) programming model that forms the foundation of Intel’s [AI Software Portfolio](https://www.intel.com/content/www/us/en/developer/topic-technology/artificial-intelligence/overview.html) to help you prepare, build, deploy, and scale your AI solutions. - - -For more details about the new 4th Gen Intel® Xeon® Scalable processors, visit [Intel's AI Solution Platform portal](https://www.intel.com/content/www/us/en/developer/topic-technology/artificial-intelligence/platform.html) where you can learn how Intel is empowering developers to run end-to-end AI pipelines on these powerful CPUs. 
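Before the resource links, here is a rough sketch of the three dynamic-quantization steps described earlier, pulled together in one place. Treat it as illustrative rather than canonical: the exact argument names of `prepare` and `convert` can vary between Intel Extension for PyTorch versions, and the complete, runnable version lives in the linked code sample.

```
import intel_extension_for_pytorch as ipex
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "microsoft/DialoGPT-large"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)
model.eval()

# 1. Prepare the quantization configuration (default dynamic qconfig).
qconfig = ipex.quantization.default_dynamic_qconfig

# 2. Prepare the FP32 model (example inputs let the extension inspect the model).
example_inputs = tokenizer.encode("Hello!" + tokenizer.eos_token, return_tensors="pt")
prepared_model = ipex.quantization.prepare(
    model, qconfig, example_inputs=example_inputs, inplace=False
)

# 3. Convert the prepared model from FP32 to INT8.
quantized_model = ipex.quantization.convert(prepared_model)
```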
- - -### Useful resources - -* [Intel AI Developer Tools and resources](https://www.intel.com/content/www/us/en/developer/topic-technology/artificial-intelligence/overview.html) -* [oneAPI unified programming model](https://www.intel.com/content/www/us/en/developer/tools/oneapi/overview.html) -* [Official documentation - PyTorch Optimizations from Intel](https://www.intel.com/content/www/us/en/developer/tools/oneapi/optimization-for-pytorch.html) -* [Intel® Extension for PyTorch - Documentation](http://intel.github.io/intel-extension-for-pytorch/) - -### Explore more AI code samples - -* [Language Identification: Building an End-to-End AI Solution using PyTorch](http://github.com/oneapi-src/oneAPI-samples/tree/master/AI-and-Analytics/End-to-end-Workloads/LanguageIdentification) -* [Optimize PyTorch Models using Intel® Extension for PyTorch (IPEX) Quantization](http://github.com/oneapi-src/oneAPI-samples/tree/master/AI-and-Analytics/Features-and-Functionality/IntelPytorch_Quantization) -* [PyTorch Training Optimizations with Advanced Matrix Extensions Bfloat16](http://github.com/oneapi-src/oneAPI-samples/tree/master/AI-and-Analytics/Features-and-Functionality/IntelPyTorch_TrainingOptimizations_AMX_BF16) - -[See all code samples](https://www.intel.com/content/www/us/en/developer/tools/oneapi/code-samples.html) \ No newline at end of file diff --git a/_posts/2023-10-04-high-performance-llama.md b/_posts/2023-10-04-high-performance-llama.md deleted file mode 100644 index 06bd5eba85b5..000000000000 --- a/_posts/2023-10-04-high-performance-llama.md +++ /dev/null @@ -1,314 +0,0 @@ ---- -layout: blog_detail -title: "High performance Llama 2 deployments with AWS Inferentia2 using TorchServe" -author: Mike Zhang, Li Ning, Sergey Ivanov, Naman Nandan, Hamid Shojanazeri, Geeta Chauhan, Abhi Shivaditya, Michael Nguyen, Pinak Panigrahi ---- - -Recently, [Llama 2](https://ai.meta.com/llama/) was released and has attracted a lot of interest from the machine learning community. [Amazon EC2 Inf2 instances](https://aws.amazon.com/ec2/instance-types/inf2/), powered by [AWS Inferentia2](https://aws.amazon.com/machine-learning/inferentia/), now support training and inference of Llama 2 models. In this post, we show low-latency and cost-effective inference of Llama-2 models on Amazon EC2 Inf2 instances using the latest [AWS Neuron SDK](https://aws.amazon.com/machine-learning/neuron/) release.  We first introduce how to create, compile and deploy the Llama-2 model and explain the optimization techniques introduced by AWS Neuron SDK to achieve high performance at low cost. We then present our benchmarking results. Lastly, we show how the Llama-2 model can be deployed through Amazon SageMaker using TorchServe on an Inf2 instance.  - -![Llama 2 is an auto-regressive language model that uses an optimized transformer architecture](/assets/images/high-performance-llama/software_stack_inf2.jpg){:style="width:100%; max-width: 420px; display: block; margin-left: auto; margin-right: auto"} - - -## What is Llama 2 - -Llama 2 is an auto-regressive language model that uses an optimized transformer architecture. Llama 2 is intended for commercial and research use in English. It comes in multiple sizes—7 billion, 13 billion, and 70 billion parameters—as well as pre-trained and fine-tuned variations. According to Meta, the tuned versions use supervised fine-tuning (SFT) and reinforcement learning with human feedback (RLHF) to align to human preferences for helpfulness and safety. 
Llama 2 was pre-trained on 2 trillion tokens of data from publicly available sources. The tuned models are intended for assistant-like chat, whereas pre-trained models can be adapted for a variety of natural language generation tasks. Regardless of which version of the model a developer uses, the [responsible use guide from Meta ](https://ai.meta.com/llama/responsible-use-guide/)can assist in guiding additional fine-tuning that may be necessary to customize and optimize the models with appropriate safety mitigations. - - -## Amazon EC2 Inf2 instances Overview - -Amazon EC2 Inf2 instances, featuring Inferentia2, provide 3x higher compute, 4x more accelerator memory, resulting in up to 4x higher throughput, and up to 10x lower latency, compared to the first generation Inf1 instances. - -Large language model (LLM) inference is a memory bound workload, performance scales up with more accelerator memory bandwidth. Inf2 instances are the only inference optimized instances in Amazon EC2 to provide high speed accelerator interconnect (NeuronLink) enabling high performance large LLM model deployments with cost effective distributed inference. You can now efficiently and cost-effectively deploy billion-scale LLMs across multiple accelerators on Inf2 instances. - -Inferentia2 supports FP32, TF32, BF16, FP16, UINT8, and the new configurable FP8 (cFP8) data type. AWS Neuron can take high-precision FP32 and FP16 models and autocast them to lower-precision data types while optimizing accuracy and performance. Autocasting reduces time to market by removing the need for lower-precision retraining and enabling higher-performance inference with smaller data types. - -To make it flexible and extendable to deploy constantly evolving deep learning models, Inf2 instances have hardware optimizations and software support for dynamic input shapes as well as custom operators written in C++ through the standard PyTorch custom operator programming interfaces. - - -## Transformers Neuron (transformers-neuronx) - -[Transformers Neuron](https://github.com/aws-neuron/transformers-neuronx) is a software package that enables PyTorch users to deploy performance optimized LLM inference. It has an optimized version of transformer models implemented with XLA high level operators (HLO), which enables sharding tensors across multiple NeuronCores, a.k.a. tensor parallelism, and performance optimizations such as parallel context encoding and KV caching for Neuron hardware. The Llama 2 source code in XLA HLOs can be found [here](https://github.com/aws-neuron/transformers-neuronx/blob/main/src/transformers_neuronx/llama/model.py). - -Llama 2 is supported in Transformers Neuron through the [LlamaForSampling](https://github.com/aws-neuron/transformers-neuronx/blob/33fa412447a4028edb252fd06aae9ed93086a450/src/transformers_neuronx/llama/model.py#L29) class. Transformers Neuron provides a seamless user experience with Hugging Face models to provide optimized inference on Inf2 instances. More details can be found from the [Transforms Neuron Developer Guide](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/libraries/transformers-neuronx/transformers-neuronx-developer-guide.html#transformers-neuronx-developer-guide). In the following section, we will explain how to deploy the Llama-2 13B model using Transformers Neuron. And, this example also applies to other Llama-based models. 
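As a rough illustration of the tensor-parallel idea (a conceptual sketch only; Transformers Neuron implements this with XLA HLOs across NeuronCores, not with PyTorch chunks), a weight matrix can be split column-wise so that each core computes its own slice of the output, and the slices are then gathered:

```
import torch

d_model, d_ffn, tp_degree = 4096, 11008, 8   # illustrative Llama-2-7B-like sizes

x = torch.randn(1, d_model)                  # activations, replicated on every core
W = torch.randn(d_model, d_ffn)              # full weight, kept here only as a reference

# Each core holds d_ffn / tp_degree columns of W and computes its own output slice.
shards = W.chunk(tp_degree, dim=1)
partial_outputs = [x @ shard for shard in shards]

# Concatenating the slices (an all-gather across cores in a real deployment)
# reproduces the unsharded result.
y = torch.cat(partial_outputs, dim=1)
print(torch.allclose(y, x @ W, rtol=1e-4, atol=1e-4))   # True
```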
- - -## Llama 2 model inference with Transformers Neuron - - -### Create model, compile and deploy - -We have three simple steps here to create, compile and deploy the model on Inf2 instances. - -1. Create a CPU model; use this [script](https://github.com/pytorch/serve/blob/d0ae857abfe6d36813c88e531316149a5a354a93/examples/large_models/inferentia2/llama2/Readme.md?plain=1#L71) or the following code snippet to serialize and save checkpoints in a local directory. - -``` -from transformers import AutoModelForCausalLM -from transformers_neuronx.module import save_pretrained_split -model_cpu = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-13b-hf", low_cpu_mem_usage=True) -model_dir = "./llama-2-13b-split" -save_pretrained_split(model_cpu, model_dir) -``` - -{:start="2"} -2. Load and compile the model from the local directory where you saved the serialized checkpoints, using the following. -To load the Llama 2 model, we use `LlamaForSampling` from Transformers Neuron. Note that the environment variable `NEURON_RT_NUM_CORES` specifies the number of NeuronCores to be used at runtime and it should match the tensor parallelism (TP) degree specified for the model. Also, `NEURON_CC_FLAGS` enables compiler optimization on decoder-only LLM models. - -``` -import os -from transformers_neuronx.llama.model import LlamaForSampling -os.environ['NEURON_RT_NUM_CORES'] = '24' -os.environ['NEURON_CC_FLAGS'] = '--model-type=transformer' -model = LlamaForSampling.from_pretrained( - model_dir, - batch_size=1, - tp_degree=24, - amp='bf16', - n_positions=16, - context_length_estimate=[8] - ) -``` - -

        Now let's compile the model and load the model weights into device memory with a one-liner API.

        -``` -model.to_neuron() -``` - -{:start="3"} -3. Finally, let's run inference on the compiled model. Note that both the input and the output of the `sample` function are sequences of tokens. - -``` -import torch -inputs = torch.tensor([[1, 16644, 31844, 312, 31876, 31836, 260, 3067, 2228, 31844]]) -seq_len = 16 -outputs = model.sample(inputs, seq_len, top_k=1) -``` - - - -### Inference optimizations in Transformers Neuron - - -**Tensor parallelism** - -![Latency with different TP degrees](/assets/images/high-performance-llama/latency_vs_tp.jpg){:style="width:100%"} - - -Transformers Neuron implements parallel tensor operations across multiple NeuronCores. We denote the number of cores used for inference as the TP degree. A larger TP degree provides more aggregate memory bandwidth, leading to lower latency, as LLM token generation is a memory-IO bound workload. As the TP degree increases, inference latency decreases significantly: our results show a ~4x overall speedup when the TP degree goes from 2 to 24. For the Llama-2 7B model, latency decreases from 30.1 ms/token with 2 cores to 7.9 ms/token with 24 cores; similarly for the Llama-2 13B model, it goes down from 57.3 ms/token to 11.1 ms/token. - -**Parallel context encoding** - -In the transformer architecture, tokens are produced in a sequential procedure called autoregressive sampling, while input prompt tokens can be processed in parallel with parallel context encoding. This can significantly reduce the latency of input prompt context encoding before token generation through autoregressive sampling. By default, the parameter `context_length_estimate` is set to a list of power-of-2 numbers that aims to cover a wide variety of context lengths. Depending on the use case, it can be set to custom numbers. This can be done when creating the Llama 2 model using `LlamaForSampling.from_pretrained`. We characterize the impact of input token length on end-to-end (E2E) latency. As shown in the figure, latency for text generation with the Llama-2 7B model only slightly increases with bigger input prompts, thanks to parallel context encoding. - -![E2E latency](/assets/images/high-performance-llama/latency_vs_input_token_length.jpg){:style="width:100%"} - -**KV caching** - -The self-attention block performs the self-attention operation with key-value (KV) vectors. The KV vectors are calculated from the token embeddings and the KV weights, and are thus associated with specific tokens. In naive implementations, the KV vectors for the entire sequence are recalculated for every generated token, which hurts performance. Therefore, the Transformers Neuron library reuses previously calculated KV vectors, a technique known as KV caching, to avoid unnecessary computation and reduce latency in the autoregressive sampling phase. - - -### Benchmarking results - -We benchmarked the latency and cost for both the Llama-2 7B and 13B models under different conditions, i.e., different numbers of output tokens and instance types. Unless specified otherwise, we use the ‘bf16’ data type and a batch size of 1, as this is a common configuration for real-time applications like chatbots and code assistants. - -**Latency** - -The following graph shows the per-token latency on the inf2.48xlarge instance with TP degree 24. Here, the latency per output token is calculated as the end-to-end latency divided by the number of output tokens. Our experiments show that Llama-2 7B end-to-end generation of 256 tokens is 2x faster than on other comparable inference-optimized EC2 instances. 
- -![Latency on inf2](/assets/images/high-performance-llama/latency_vs_output_token_length.png){:style="width:100%"} - -**Throughput** - -We now show the number of tokens generated per second for the Llama-2 7B and 13B models that can be delivered by the inf2.48xlarge instance. With TP degree 24, fully utilizing all 24 NeuronCores, we can achieve 130 tokens/sec and 90 tokens/sec for the Llama-2 7B and 13B models, respectively. - -![E2E throughput](/assets/images/high-performance-llama/throughput_vs_output_token_length.jpg){:style="width:100%"} - - -**Cost** - -For latency-first applications, we show the cost of hosting Llama-2 models on the inf2.48xlarge instance: **$**0.011 per 1000 tokens and **$**0.016 per 1000 tokens for the 7B and 13B models, respectively, which achieves a 3x cost saving over other comparable inference-optimized EC2 instances. Note that we report the cost based on the [3-year reserved instance price](https://aws.amazon.com/ec2/instance-types/inf2/), which is what customers use for large production deployments. - - -![Cost on inf2](/assets/images/high-performance-llama/cost_vs_output_token_length_7b_13b.jpg){:style="width:100%"} - - -We also compare the cost of hosting the Llama-2 7B model on inf2.xlarge and inf2.48xlarge instances. We can see that inf2.xlarge is more than 4x cheaper than inf2.48xlarge, but at the expense of longer latency due to the smaller TP degree. For example, it takes 7.9 ms per output token to generate 256 output tokens from 256 input tokens on inf2.48xlarge, but 30.1 ms per output token on inf2.xlarge. - -![Cost on Llama](/assets/images/high-performance-llama/cost_vs_output_token_length_xl_48xl.jpg){:style="width:100%"} - - - -## Serving Llama2 with TorchServe on EC2 Inf2 instance - -Now, we move on to model deployment. In this section, we show you how to deploy the [Llama-2 13B model](https://huggingface.co/meta-llama/Llama-2-13b-hf) through SageMaker using TorchServe, which is the recommended model server for PyTorch, preinstalled in the AWS PyTorch Deep Learning Containers (DLC). - -This section describes the preparation work needed for using TorchServe, particularly how to configure `model_config.yaml` and `inf2_handler.py`, as well as how to generate model artifacts and pre-compile the model for use in later model deployment. Preparing the model artifacts ahead of time avoids model compilation during model deployment and thus reduces the model loading time. - - -### Model configuration [model-config.yaml](https://github.com/pytorch/serve/blob/master/examples/large_models/inferentia2/llama2/model-config.yaml) - -The parameters defined in the `handler` and `micro_batching` sections are used in the custom handler [inf2_handler.py](https://github.com/pytorch/serve/blob/master/examples/large_models/inferentia2/llama2/inf2_handler.py). More details about model_config.yaml are [here](https://github.com/pytorch/serve/blob/2bf505bae3046b0f7d0900727ec36e611bb5dca3/docs/configuration.md?plain=1#L267). TorchServe micro-batching is a mechanism to pre-process and post-process a batch of inference requests in parallel. It is able to achieve higher throughput by better utilizing the available accelerator when the backend is steadily fed with incoming data; see [here](https://github.com/pytorch/serve/tree/master/examples/micro_batching) for more details. For model inference on Inf2, `micro_batch_size`, `amp`, `tp_degree` and `max_length` specify the batch size, data type, tensor parallelism degree and max sequence length, respectively. 
- - -``` -# TorchServe Frontend Parameters -minWorkers: 1 -maxWorkers: 1 -maxBatchDelay: 100 -responseTimeout: 10800 -batchSize: 16 - -# TorchServe Backend Custom Handler Parameters -handler: - model_checkpoint_dir: "llama-2-13b-split" - amp: "bf16" - tp_degree: 12 - max_length: 100 - -micro_batching: - # Used by batch_size in function LlamaForSampling.from_pretrained - micro_batch_size: 1 - parallelism: - preprocess: 2 - inference: 1 - postprocess: 2 -``` - - - -### Custom handler [inf2_handler.py](https://github.com/pytorch/serve/blob/master/examples/large_models/inferentia2/llama2/inf2_handler.py) - -A custom handler in TorchServe is a simple Python script that lets you define the model initialization, preprocessing, inference and post-processing logic as functions. Here, we create our Inf2 custom handler. - -1. The [initialize](https://github.com/pytorch/serve/blob/d0ae857abfe6d36813c88e531316149a5a354a93/examples/large_models/inferentia2/llama2/inf2_handler.py#L33) function is used to load the model. Here, the Neuron SDK compiles the model on the first run and, with caching enabled via `NEURONX_CACHE`, saves the precompiled model artifacts in the directory specified by `NEURONX_DUMP_TO`. Subsequent runs check whether pre-compiled model artifacts already exist and, if so, skip model compilation. -Once the model is loaded, we initiate warm-up inference requests so that the compiled version is cached. When the [Neuron persistent cache](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/general/arch/neuron-features/neuron-caching.html) is utilized, it can significantly reduce the model loading latency, ensuring that subsequent inference runs swiftly. - -``` -os.environ["NEURONX_CACHE"] = "on" -os.environ["NEURONX_DUMP_TO"] = f"{model_dir}/neuron_cache" -``` -
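Below is a minimal, hedged sketch of this load-and-warm-up flow outside of TorchServe, assuming the same `LlamaForSampling` API used earlier in this post; the checkpoint directory, cache path and dummy token ids are placeholders rather than the handler's actual values.

```
# Illustrative sketch of the load + warm-up flow, not the handler's exact code.
# The checkpoint directory, cache path and dummy token ids are placeholders.
import os

import torch
from transformers_neuronx.llama.model import LlamaForSampling

# Persist compiled artifacts so that later loads can skip compilation.
os.environ["NEURONX_CACHE"] = "on"
os.environ["NEURONX_DUMP_TO"] = "./neuron_cache"

# Load the split checkpoint and compile it for the NeuronCores (bf16, tp_degree=12).
model = LlamaForSampling.from_pretrained(
    "llama-2-13b-split", batch_size=1, tp_degree=12, amp="bf16"
)
model.to_neuron()  # compiles on the first run, reuses cached artifacts afterwards

# Warm-up request: one short dummy generation so the compiled graph is exercised once.
dummy_inputs = torch.tensor([[1, 306, 4658, 278, 6593, 310, 2834, 338]])
_ = model.sample(dummy_inputs, 32, top_k=1)
```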

        TorchServe `TextIteratorStreamerBatch` extends Hugging Face transformers `BaseStreamer` to support response streaming when `batchSize` is larger than 1. 

        - -``` -self.output_streamer = TextIteratorStreamerBatch( - self.tokenizer, - batch_size=self.handle.micro_batch_size, - skip_special_tokens=True, -) -``` - -{:start="2"} -2. The [inference](https://github.com/pytorch/serve/blob/d0ae857abfe6d36813c88e531316149a5a354a93/examples/large_models/inferentia2/llama2/inf2_handler.py#L124) function calls `send_intermediate_predict_response` to send the streaming response. - -``` -for new_text in self.output_streamer: - logger.debug("send response stream") - send_intermediate_predict_response( - new_text[: len(micro_batch_req_id_map)], - micro_batch_req_id_map, - "Intermediate Prediction success", - 200, - self.context, - ) -``` - - -### Package model artifacts - -Package all the model artifacts into a folder `llama-2-13b-neuronx-b1` using the `torch-model-archiver`.  - - -``` -torch-model-archiver --model-name llama-2-13b-neuronx-b1 --version 1.0 --handler inf2_handler.py -r requirements.txt --config-file model-config.yaml --archive-format no-archive -``` - - -### Serve the model - - -``` -export TS_INSTALL_PY_DEP_PER_MODEL="true" -torchserve --ncs --start --model-store model_store --models llama-2-13b-neuronx-b1 -``` - - -Once the log shows "**WORKER_MODEL_LOADED**", the pre-compiled model should be saved in the folder `llama-2-13b-neuronx-b1/neuron_cache`, which is tightly coupled with the Neuron SDK version. Then, upload the folder `llama-2-13b-neuronx-b1` to your S3 bucket for later use in the model deployment. The Llama-2 13B model artifacts used in this blog, which are associated with Neuron SDK 2.13.2, can be found [here](https://torchserve.s3.amazonaws.com/mar_files/sm-neuronx/llama-2-13b-neuronx-b1/) in the TorchServe model zoo. - - -## Deploy Llama-2 13B model on SageMaker Inf2 instance using TorchServe  - -In this section, we deploy the Llama-2 13B model using a [PyTorch Neuronx container](https://github.com/aws/deep-learning-containers/blob/master/available_images.md#neuron-containers) on a SageMaker endpoint with an ml.inf2.24xlarge hosting instance, which has 6 Inferentia2 accelerators, corresponding to the `tp_degree: 12` handler setting in our model configuration `model_config.yaml`. Given that we have packaged all the model artifacts into a folder using [torch-model-archiver](https://github.com/pytorch/serve/blob/master/model-archiver/README.md) and uploaded them to an S3 bucket, we will now use the SageMaker Python SDK to create a SageMaker model and deploy it to a SageMaker real-time endpoint using the deploy [uncompressed model method](https://docs.aws.amazon.com/sagemaker/latest/dg/large-model-inference-uncompressed.html). Speed is the key benefit of deploying in this manner with SageMaker: you get a fully functional, production-ready, secure RESTful endpoint without any effort spent on infrastructure. There are 3 steps to deploying the model and running inference on SageMaker. The notebook example can be found [here](https://github.com/aws/amazon-sagemaker-examples-community/blob/main/torchserve/inf2/llama2/llama-2-13b.ipynb). - -1. 
Create a SageMaker model - -``` -from datetime import datetime - -instance_type = "ml.inf2.24xlarge" -endpoint_name = sagemaker.utils.name_from_base("ts-inf2-llama2-13b-b1") - -model = Model( - name="torchserve-inf2-llama2-13b" + datetime.now().strftime("%Y-%m-%d-%H-%M-%S"), - # Enable SageMaker uncompressed model artifacts - model_data={ - "S3DataSource": { - "S3Uri": s3_uri, - "S3DataType": "S3Prefix", - "CompressionType": "None", - } - }, - image_uri=container, - role=role, - sagemaker_session=sess, - env={"TS_INSTALL_PY_DEP_PER_MODEL": "true"}, -) -``` - -{:start="2"} -2. Deploy a SageMaker model - -``` -model.deploy( - initial_instance_count=1, - instance_type=instance_type, - endpoint_name=endpoint_name, - volume_size=512, # increase the size to store large model - model_data_download_timeout=3600, # increase the timeout to download large model - container_startup_health_check_timeout=600, # increase the timeout to load large model -) -``` - -{:start="3"} -3. Run streaming response inference on SageMaker -When the endpoint is in service, you can use the `invoke_endpoint_with_response_stream` API call to invoke the model. This feature enables the return of each generated token to the user, enhancing the user experience. It's especially beneficial when generating an entire sequence is time-consuming. - -``` -import json - -body = "Today the weather is really nice and I am planning on".encode('utf-8') -resp = smr.invoke_endpoint_with_response_stream(EndpointName=endpoint_name, Body=body, ContentType="application/json") -event_stream = resp['Body'] -parser = Parser() -for event in event_stream: - parser.write(event['PayloadPart']['Bytes']) - for line in parser.scan_lines(): - print(line.decode("utf-8"), end=' ') -``` - - - -### Sample inference: - -Input - -"Today the weather is really nice and I am planning on" - -Output - -"Today the weather is really nice and I am planning on going to the beach. I am going to take my camera and take some pictures of the beach. I am going to take pictures of the sand, the water, and the people. I am also going to take pictures of the sunset. I am really excited to go to the beach and take pictures. - -The beach is a great place to take pictures. The sand, the water, and the people are all great subjects for pictures. The sunset is also a great subject for pictures." - - -## Conclusion - -In this post, we showcased how to run Llama 2 model inference using Transformers Neuron and deploy Llama 2 model serving using TorchServe through Amazon SageMaker on an EC2 Inf2 instance. We demonstrated the benefits of using Inferentia2—low latency and low cost—enabled by optimizations in AWS Neuron SDK including tensor parallelism, parallel context encoding and KV caching, particularly for LLM inference. To stay up to date, please follow [AWS Neuron’s latest release](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/release-notes/index.html) for new features. - -Get started today with Llama 2 examples on [EC2](https://github.com/aws-neuron/aws-neuron-samples/blob/master/torch-neuronx/transformers-neuronx/inference/meta-llama-2-13b-sampling.ipynb) and through [SageMaker](https://github.com/aws/amazon-sagemaker-examples-community/blob/main/torchserve/inf2/llama2/llama-2-13b.ipynb) and stay tuned for how to optimize Llama 70B on Inf2! 
\ No newline at end of file diff --git a/_posts/2023-10-04-new-library-updates.md b/_posts/2023-10-04-new-library-updates.md deleted file mode 100644 index 33791dc36995..000000000000 --- a/_posts/2023-10-04-new-library-updates.md +++ /dev/null @@ -1,162 +0,0 @@ ---- -layout: blog_detail -title: "New Library Updates in PyTorch 2.1" -author: Team PyTorch ---- - -## **Summary** - -We are bringing a number of improvements to the current PyTorch libraries, alongside the PyTorch 2.1 release. These updates demonstrate our focus on developing common and extensible APIs across all domains to make it easier for our community to build ecosystem projects on PyTorch.  - -Along with 2.1, we are also releasing a series of beta updates to the PyTorch domain libraries including TorchAudio and TorchVision. Please find the list of the latest stable versions and updates below. - -| Latest Stable Library Versions |([Full List](https://pytorch.org/docs/stable/index.html))* | | -|--------------------------------------------|------------------|-----------------------------| -| TorchArrow 0.1.0 | TorchRec 0.5.0 | TorchVision 0.16 | -| TorchAudio 2.1 | TorchServe 0.8.2 | TorchX 0.5.0 | -| TorchData 0.7.0 | TorchText 0.16.0 | PyTorch on XLA Devices 1.14 | - -\*To see [prior versions](https://pytorch.org/docs/stable/index.html) or (unstable) nightlies, click on versions in the top left menu above ‘Search Docs’. - -## **TorchAudio** - -TorchAudio v2.1 introduces the following new features and backward-incompatible changes: - -**\[Beta] A new API to apply filter, effects and codec** - -\`torchaudio.io.AudioEffector\` can apply filters, effects and encodings to waveforms in online/offline fashion. You can use it as a form of augmentation. - -Please refer to for the usage and examples. - -**\[Beta] Tools for Forced alignment** - -New functions and a pre-trained model for forced alignment were added. \`torchaudio.functional.forced\_align\` computes alignment from an emission and \`torchaudio.pipelines.MMS\_FA\` provides access to the model trained for multilingual forced alignment in [MMS: Scaling Speech Technology to 1000+ languages](https://ai.meta.com/blog/multilingual-model-speech-recognition/) project. - -Please refer to for the usage of \`forced\_align\` function, and for how one can use \`MMS\_FA\` to align transcript in multiple languages. - -**\[Beta] TorchAudio-Squim : Models for reference-free speech assessment** - -Model architectures and pre-trained models from the paper [TorchAudio-Sequim: Reference-less Speech Quality and Intelligibility measures in TorchAudio](https://arxiv.org/abs/2304.01448) were added. - -You can use the pre-trained models \`torchaudio.pipelines.SQUIM\_SUBJECTIVE\` and \`torchaudio.pipelines.SQUIM\_OBJECTIVE\`. They can estimate the various speech quality and intelligibility metrics (e.g. STOI, wideband PESQ, Si-SDR, and MOS). This is helpful when evaluating the quality of speech generation models, such as Text-to-Speech (TTS). - -Please refer to for the details. - -**\[Beta] CUDA-based CTC decoder** - -\`torchaudio.models.decoder.CUCTCDecoder\` performs CTC beam search in CUDA devices. The beam search is fast. It eliminates the need to move data from CUDA device to CPU when performing automatic speech recognition. With PyTorch's CUDA support, it is now possible to perform the entire speech recognition pipeline in CUDA. - -Please refer to for the detail. - -**\[Prototype] Utilities for AI music generation** - -We are working to add utilities that are relevant to music AI. 
Since the last release, the following APIs were added to the prototype. - -Please refer to respective documentation for the usage. -- [torchaudio.prototype.chroma\_filterbank](https://pytorch.org/audio/main/generated/torchaudio.prototype.functional.chroma_filterbank.html) -- [torchaudio.prototype.transforms.ChromaScale](https://pytorch.org/audio/main/generated/torchaudio.prototype.transforms.ChromaScale.html) -- [torchaudio.prototype.transforms.ChromaSpectrogram](https://pytorch.org/audio/main/generated/torchaudio.prototype.transforms.ChromaSpectrogram.html) -- [torchaudio.prototype.pipelines.VGGISH](https://pytorch.org/audio/main/generated/torchaudio.prototype.pipelines.VGGISH.html) - -**New recipes for training models** - -Recipes for Audio-visual ASR, multi-channel DNN beamforming and TCPGen context-biasing were added. - -Please refer to the recipes -- -- -- - -**Update to FFmpeg support** - -The version of supported FFmpeg libraries was updated. TorchAudio v2.1 works with FFmpeg 6, 5 and 4.4. The support for 4.3, 4.2 and 4.1 are dropped. - -Please refer to for the detail of the new FFmpeg integration mechanism. - -**Update to libsox integration** - -TorchAudio now depends on libsox installed separately from torchaudio. Sox I/O backend no longer supports file-like objects. (This is supported by FFmpeg backend and soundfile.) - -Please refer to for the details. - -## TorchRL - -Our RLHF components make it easy to build an RLHF training loop with limited RL knowledge. TensorDict enables an easy interaction between datasets (eg, HF datasets) and RL models. The new algorithms we provide deliver a wide range of solutions for offline RL training, which is more data efficient. - -Through RoboHive and IsaacGym, TorchRL now provides a built-in interface with hardware (robots), tying training at scale with policy deployment on device. Thanks to SMAC, VMAS, and PettingZoo and related MARL-oriented losses, TorchRL is now fully capable of training complex policies in multi-agent settings. - -**New algorithms** -- \[BETA] We integrate some RLHF components and examples: we provide building blocks for data formatting in RL frameworks, reward model design, specific transforms that enable efficient learning (eg. KL correction) and training scripts -- \[Stable] New algorithms include Decision transformers, CQL, multi-agent losses such as MAPPO and QMixer.**New features**- \[Stable] New transforms such as Visual Cortex 1 (VC1), a foundational model for RL.  -- We widened the panel of library covered by TorchRL:  - - \[Beta] IsaacGym, a powerful GPU-based simulator that allows interaction and rendering of thousands of vectorized environments by NVIDIA. - - \[Stable] PettingZoo, a multi-agent library by the Farama Foundation. - - \[Stable] SMAC-v2, the new Starcraft Multi-agent simulator - - \[Stable] RoboHive, a collection of environments/tasks simulated with the MuJoCo physics engine. - -**Performance improvements** - -We provide faster data collection through refactoring and integration of SB3 and Gym asynchronous environments execution. We also made our value functions faster to execute. - -## TorchRec - -**\[Prototype] Zero Collision / Managed Collision Embedding Bags** - -A common constraint in Recommender Systems is the sparse id input range is larger than the number of embeddings the model can learn for a given parameter size.   To resolve this issue, the conventional solution is to hash sparse ids into the same size range as the embedding table.  
This will ultimately lead to hash collisions, with multiple sparse ids sharing the same embedding space.   We have developed a performant alternative algorithm that attempts to address this problem by tracking the _N_ most common sparse ids and ensuring that they have a unique embedding representation. The module is defined [here](https://github.com/pytorch/torchrec/blob/b992eebd80e8ccfc3b96a7fd39cb072c17e8907d/torchrec/modules/mc_embedding_modules.py#L26) and an example can be found [here](https://github.com/pytorch/torchrec/blob/b992eebd80e8ccfc3b96a7fd39cb072c17e8907d/torchrec/modules/mc_embedding_modules.py#L26). - -**\[Prototype] UVM Caching - Prefetch Training Pipeline** - -For tables where on-device memory is insufficient to hold the entire embedding table, it is common to leverage a caching architecture where part of the embedding table is cached on device and the full embedding table is on host memory (typically DDR SDRAM).   However, in practice, caching misses are common, and hurt performance due to relatively high latency of going to host memory.   Building on TorchRec’s existing data pipelining, we developed a new [_Prefetch Training Pipeline_](https://pytorch.org/torchrec/torchrec.distributed.html#torchrec.distributed.train_pipeline.PrefetchPipelinedForward) to avoid these cache misses by prefetching the relevant embeddings for upcoming batch from host memory, effectively eliminating cache misses in the forward path. - -## TorchVision  -### **Transforms and augmentations** - -**Major speedups** - -The new transforms in `torchvision.transforms.v2` are now[ 10%-40% faster](https://github.com/pytorch/vision/issues/7497#issuecomment-1557478635) than before! This is mostly achieved thanks to 2X-4X improvements made to `v2.Resize()`, which now supports native `uint8` tensors for Bilinear and Bicubic mode. Output results are also now closer to PIL's! Check out our[ performance recommendations](https://pytorch.org/vision/stable/transforms.html#performance-considerations) to learn more. - -Additionally, `torchvision` now ships with `libjpeg-turbo` instead of `libjpeg`, which should significantly speed-up the jpeg decoding utilities ([`read_image`](https://pytorch.org/vision/stable/generated/torchvision.io.read_image.html#torchvision.io.read_image),[ `decode_jpeg`](https://pytorch.org/vision/stable/generated/torchvision.io.read_image.html#torchvision.io.decode_jpeg)), and avoid compatibility issues with PIL. - -**CutMix and MixUp** - -Long-awaited support for the `CutMix` and `MixUp` augmentations is now here! Check[ our tutorial](https://pytorch.org/vision/stable/auto_examples/transforms/plot_cutmix_mixup.html#sphx-glr-auto-examples-transforms-plot-cutmix-mixup-py) to learn how to use them. - -**Towards stable V2 transforms** - -In the[ previous release 0.15](https://github.com/pytorch/vision/releases/tag/v0.15.1) we BETA-released a new set of transforms in `torchvision.transforms.v2` with native support for tasks like segmentation, detection, or videos. We have now stabilized the design decisions of these transforms and made further improvements in terms of speedups, usability, new transforms support, etc. - -We're keeping the `torchvision.transforms.v2` and `torchvision.tv_tensors` namespaces as BETA until 0.17 out of precaution, but we do not expect disruptive API changes in the future. 
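As a quick, hedged illustration of the new v2 transforms and the CutMix/MixUp support mentioned above (the tensor shapes and `NUM_CLASSES` below are arbitrary placeholders, not values from the release notes):

```
# Minimal sketch of the torchvision.transforms.v2 API; shapes and NUM_CLASSES are placeholders.
import torch
from torchvision.transforms import v2

NUM_CLASSES = 10

# uint8-friendly v2 pipeline; Resize/RandomResizedCrop now have fast native uint8 paths.
transforms = v2.Compose([
    v2.RandomResizedCrop(size=(224, 224), antialias=True),
    v2.RandomHorizontalFlip(p=0.5),
    v2.ToDtype(torch.float32, scale=True),
])

# Batch-level CutMix / MixUp, typically applied after the DataLoader collates a batch.
cutmix_or_mixup = v2.RandomChoice([
    v2.CutMix(num_classes=NUM_CLASSES),
    v2.MixUp(num_classes=NUM_CLASSES),
])

images = torch.randint(0, 256, (8, 3, 256, 256), dtype=torch.uint8)
labels = torch.randint(0, NUM_CLASSES, (8,))
images, labels = cutmix_or_mixup(transforms(images), labels)
```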
- -Whether you’re new to Torchvision transforms, or you’re already experienced with them, we encourage you to start with[ Getting started with transforms v2](https://pytorch.org/vision/stable/auto_examples/transforms/plot_transforms_getting_started.html#sphx-glr-auto-examples-transforms-plot-transforms-getting-started-py) in order to learn more about what can be done with the new v2 transforms. - -Browse our[ main docs](https://pytorch.org/vision/stable/transforms.html#) for general information and performance tips. The available transforms and functionals are listed in the[ API reference](https://pytorch.org/vision/stable/transforms.html#v2-api-ref). Additional information and tutorials can also be found in our[ example gallery](https://pytorch.org/vision/stable/auto_examples/index.html#gallery), e.g.[ Transforms v2: End-to-end object detection/segmentation example](https://pytorch.org/vision/stable/auto_examples/transforms/plot_transforms_e2e.html#sphx-glr-auto-examples-transforms-plot-transforms-e2e-py) or[ How to write your own v2 transforms](https://pytorch.org/vision/stable/auto_examples/transforms/plot_custom_transforms.html#sphx-glr-auto-examples-transforms-plot-custom-transforms-py). - -### \[BETA] MPS support - -The `nms` and roi-align kernels (`roi_align`, `roi_pool`, `ps_roi_align`, `ps_roi_pool`) now support MPS. Thanks to[ Li-Huai (Allan) Lin](https://github.com/qqaatw) for this contribution! - -## TorchX - -**Schedulers** -- \[Prototype] Kubernetes MCAD Scheduler: Integration for easily scheduling jobs on Multi-Cluster-Application-Dispatcher (MCAD) - -- AWS Batch  - - - Add privileged option to enable running containers on EFA enabled instances with elevated networking permissions - -### **TorchX Tracker** -- \[Prototype] MLFlow backend for TorchX Tracker: in addition to _fsspec_ based tracker, TorchX can use MLFlow instance to track metadata/experiments  - -**Components** -- _dist.spmd_ component to support Single-Process-Multiple-Data style applications - -**Workspace** -- Add ability to access image and workspace path from Dockerfile while building docker workspace - -Release includes number of other bugfixes. - -To learn more about Torchx visit - -## TorchText and TorchData - -As of September 2023 we have paused active development of TorchText and TorchData as we re-evaluate how we want to serve the needs of the community in this space. diff --git a/_posts/2023-10-04-pytorch-2-1.md b/_posts/2023-10-04-pytorch-2-1.md deleted file mode 100644 index 99878ea3c84c..000000000000 --- a/_posts/2023-10-04-pytorch-2-1.md +++ /dev/null @@ -1,143 +0,0 @@ ---- -layout: blog_detail -title: "PyTorch 2.1: automatic dynamic shape compilation, distributed checkpointing" -author: Team PyTorch ---- - -We are excited to announce the release of PyTorch® 2.1 ([release note](https://github.com/pytorch/pytorch/releases/tag/v2.1.0))! PyTorch 2.1 offers automatic dynamic shape support in _torch.compile_, _torch.distributed.checkpoint_ for saving/loading distributed training jobs on multiple ranks in parallel, and _torch.compile_ support for the NumPy API. - -In addition, this release offers numerous performance improvements (e.g. CPU inductor improvements, AVX512 support, scaled-dot-product-attention support) as well as a prototype release of _torch.export_, a sound full-graph capture mechanism, and _torch.export_-based quantization. - -Along with 2.1, we are also releasing a series of updates to the PyTorch domain libraries. More details can be found in the library updates blog.  
- -This release is composed of 6,682 commits and 784 contributors since 2.0. We want to sincerely thank our dedicated community for your contributions. As always, we encourage you to try these out and report any issues as we improve 2.1.  More information about how to get started with the PyTorch 2-series can be found at our [Getting Started](https://pytorch.org/get-started/pytorch-2.0/) page. - -Summary:  -- _torch.compile_ now includes automatic support for detecting and minimizing recompilations due to tensor shape changes using _automatic dynamic shapes._ -- _torch.distributed.checkpoint_ enables saving and loading models from multiple ranks in parallel, as well as resharding due to changes in cluster topology. -- _torch.compile_ can now compile NumPy operations via translating them into PyTorch-equivalent operations. -- _torch.compile_ now includes improved support for Python 3.11. -- New CPU performance features include inductor improvements (e.g. bfloat16 support and dynamic shapes), AVX512 kernel support, and scaled-dot-product-attention kernels. -- _torch.export_, a sound full-graph capture mechanism, is introduced as a prototype feature, as well as _torch.export_-based quantization. -- _torch.sparse_ now includes prototype support for semi-structured (2:4) sparsity on NVIDIA® GPUs. - - - | **Stable** | **Beta** | **Prototype** | **Performance Improvements** | -|------------|-----------------------------------------------|---------------------------------|-----------------------------------------------------------| -| | Automatic Dynamic Shapes | _torch.export()_ | AVX512 kernel support | -| | _torch.distributed.checkpoint_ | Torch.export-based Quantization | CPU optimizations for scaled-dot-product-attention (SDPA) | -| | _torch.compile_ + NumPy | semi-structured (2:4) sparsity | CPU optimizations for bfloat16 | -| | _torch.compile_ + Python 3.11 | _cpp_wrapper_ for torchinductor | | -| | _torch.compile + autograd.Function_ | | | -| | third-party device integration: _PrivateUse1_ | | | - -\*To see a full list of public 2.1, 2.0, and 1.13 feature submissions click [here](https://docs.google.com/spreadsheets/d/1TzGkWuUMF1yTe88adz1dt2mzbIsZLd3PBasy588VWgk/edit?usp=sharing). - -## **Beta Features** - - **(Beta) Automatic Dynamic Shapes** - - Dynamic shapes is functionality built into _torch.compile_ that can minimize recompilations by tracking and generating code based on the symbolic shape of a tensor rather than the static shape (e.g. _\[B, 128, 4]_ rather than _\[64, 128, 4]_). This allows _torch.compile_ to generate a single kernel that can work for many sizes, at only a modest cost to efficiency. Dynamic shapes has been greatly stabilized in PyTorch 2.1, and is now automatically enabled if _torch.compile_ notices recompilation due to varying input shapes. You can disable automatic dynamic by passing _dynamic=False_ to torch.compile, or by setting _torch.\_dynamo.config.automatic\_dynamic\_shapes = False_. - - In PyTorch 2.1, we have shown good performance with dynamic shapes enabled on a variety of model types, including large language models, on both CUDA and CPU. - - For more information on dynamic shapes, see [this documentation](https://pytorch.org/docs/2.1/torch.compiler_dynamic_shapes.html). - - **\[Beta] _torch.distributed.checkpoint_** - - _torch.distributed.checkpoint_ enables saving and loading models from multiple ranks in parallel. 
In addition, checkpointing automatically handles fully-qualified-name (FQN) mappings across models and optimizers, enabling load-time resharding across differing cluster topologies. - - For more information, see _torch.distributed.checkpoint_ [documentation](https://pytorch.org/docs/2.1/distributed.checkpoint.html) and [tutorial](https://pytorch.org/tutorials/recipes/distributed_checkpoint_recipe.html). - - **\[Beta] _torch.compile_ + _NumPy_** - - _torch.compile_ now understands how to compile NumPy operations via translating them into PyTorch-equivalent operations.  Because this integration operates in a device-agnostic manner, you can now GPU-accelerate NumPy programs – or even mixed NumPy/PyTorch programs – just by using _torch.compile_. - - Please see [this section](https://pytorch.org/docs/2.1/torch.compiler_faq.html#does-numpy-work-with-torch-compile) in the _torch.compile_ FAQ for more information about _torch.compile + NumPy interaction_, and follow the [PyTorch Blog](https://pytorch.org/blog/) for a forthcoming blog about this feature. - - **\[Beta] _torch.compile_ + Python 3.11** - - _torch.compile_ previously only supported Python versions 3.8-3.10. Users can now optimize models with _torch.compile_ in Python 3.11. - - **\[Beta] _torch.compile_ + _autograd.Function_** - - _torch.compile_ can now trace and optimize the backward function of user-defined [autograd Functions](https://pytorch.org/docs/stable/autograd.html#function), which unlocks training optimizations for models that make heavier use of extensions mechanisms. - - **\[Beta] Improved third-party device support: _PrivateUse1_** - - Third-party device types can now be registered to PyTorch using the privateuse1 dispatch key.  This allows device extensions to register new kernels to PyTorch and to associate them with the new key, allowing user code to work equivalently to built-in device types.  For example, to register _“my\_hardware\_device_”, one can do the following: - -``` -torch.rename_privateuse1_backend("my_hardware_device") -torch.utils.generate_methods_for_privateuse1_backend() -x = torch.randn((2, 3), device='my_hardware_device') -y = x + x # run add kernel on 'my_hardware_device' -``` - -To validate this feature, the OSS team from _Ascend NPU_ has successfully integrated [**torch\_npu**](https://github.com/Ascend/pytorch) into pytorch as a plug-in through the _PrivateUse1_ functionality. - -For more information, please see the PrivateUse1 tutorial [here](https://pytorch.org/tutorials/advanced/privateuseone.html). - -## **Prototype Features** - -**\[Prototype] _torch.export()_** - -_torch.export()_ provides a sound tracing mechanism to capture a full graph from a PyTorch program based on new technologies provided by PT2.0. - -Users can extract a clean representation (Export IR) of a PyTorch program in the form of a dataflow graph, consisting of mostly straight-line calls to PyTorch operators. Export IR can then be transformed, serialized, saved to file, transferred, loaded back for execution in an environment with or without Python. - -For more information, please see the tutorial [here](https://pytorch.org/tutorials/intermediate/torch_export_tutorial.html). - -**\[Prototype] _torch.export_-based Quantization** - -_torch.ao.quantization_ now supports quantization on PyTorch 2 _torch.export_-based flows.  This includes support for built-in _XNNPACK_ and _X64Inductor_ _Quantizer_, as well as the ability to specify one’s own _Quantizer_. 
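A rough sketch of this export-based quantization flow is shown below; the module paths follow the PyTorch 2.1 prototype tutorials referenced below and may change in later releases, and the model and example inputs are placeholders.

```
# Hedged sketch of torch.export-based post-training quantization (PyTorch 2.1 prototype APIs;
# module paths follow the linked tutorials and may change in later releases).
import torch
from torch._export import capture_pre_autograd_graph
from torch.ao.quantization.quantize_pt2e import prepare_pt2e, convert_pt2e
from torch.ao.quantization.quantizer.xnnpack_quantizer import (
    XNNPACKQuantizer,
    get_symmetric_quantization_config,
)

model = torch.nn.Sequential(torch.nn.Linear(16, 16), torch.nn.ReLU()).eval()
example_inputs = (torch.randn(1, 16),)

# 1. capture the program, 2. annotate it with a Quantizer, 3. calibrate, 4. convert
exported = capture_pre_autograd_graph(model, example_inputs)
quantizer = XNNPACKQuantizer().set_global(get_symmetric_quantization_config())
prepared = prepare_pt2e(exported, quantizer)
prepared(*example_inputs)            # calibration pass with representative data
quantized = convert_pt2e(prepared)
```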
- -For an explanation on post-training static quantization with torch.export, see [this tutorial](https://pytorch.org/tutorials/prototype/pt2e_quant_ptq.html); for quantization-aware training for static quantization with torch.export, see [this tutorial](https://pytorch.org/tutorials/prototype/pt2e_quant_qat.html). - -For an explanation on how to write one’s own Quantizer, see [this tutorial](https://pytorch.org/tutorials/prototype/pt2e_quantizer.html). - -**\[Prototype] semi-structured (2:4) sparsity for NVIDIA® GPUs** - -_torch.sparse_ now supports creating and accelerating compute over semi-structured sparse (2:4) tensors.  For more information on the format, see [this](https://developer.nvidia.com/blog/accelerating-matrix-multiplication-with-block-sparse-format-and-nvidia-tensor-cores/) blog from NVIDIA. A minimal example introducing semi-structured sparsity is as follows: - -``` -import torch -from torch import nn -from torch.sparse import to_sparse_semi_structured - -x = torch.rand(64, 64).half().cuda() -mask = torch.tensor([0, 0, 1, 1]).tile((64, 16)).cuda().bool() -linear = nn.Linear(64, 64).half().cuda() - -linear.weight = nn.Parameter(to_sparse_semi_structured(linear.weight.masked_fill(~mask, 0))) -linear(x) -``` - -To learn more, please see the [documentation](https://pytorch.org/docs/2.1/sparse.html#sparse-semi-structured-tensors) and accompanying [tutorial](https://pytorch.org/tutorials/prototype/semi_structured_sparse.html). - -**\[Prototype] _cpp\_wrapper_ for _torchinductor_** - -_cpp\_wrapper_ can reduce the Python overhead for invoking kernels in torchinductor by generating the kernel wrapper code in C++. This feature is still in the prototype phase; it does not support all programs that successfully compile in PT2 today. Please file issues if you discover limitations for your use case to help us prioritize. - -The API to turn this feature on is: -``` -import torch -import torch._inductor.config as config -config.cpp_wrapper = True -``` - -For more information, please see the [tutorial](https://pytorch.org/tutorials/prototype/inductor_cpp_wrapper_tutorial.html). - -## **Performance Improvements** - -**AVX512 kernel support** - -In PyTorch 2.0, AVX2 kernels would be used even if the CPU supported AVX512 instructions.  Now, PyTorch defaults to using AVX512 CPU kernels if the CPU supports those instructions, equivalent to setting _ATEN\_CPU\_CAPABILITY=avx512_ in previous releases.  The previous behavior can be enabled by setting _ATEN\_CPU\_CAPABILITY=avx2._ - -**CPU optimizations for scaled-dot-product-attention (SDPA)** - -Previous versions of PyTorch provided optimized CUDA implementations for transformer primitives via _torch.nn.functional.scaled\_dot\_product\_attention_.  PyTorch 2.1 includes optimized FlashAttention-based CPU routines. - -See the documentation [here](https://pytorch.org/docs/2.1/generated/torch.nn.functional.scaled_dot_product_attention.html). - -**CPU optimizations for bfloat16** - -PyTorch 2.1 includes CPU optimizations for bfloat16, including improved vectorization support and _torchinductor_ codegen. 
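As a small, hedged illustration of the CPU-side improvements above, the snippet below runs scaled-dot-product-attention on bfloat16 CPU tensors; the shapes are arbitrary placeholders rather than benchmark settings.

```
# Illustrative only: runs scaled-dot-product-attention on bfloat16 CPU tensors.
# Shapes are arbitrary placeholders, not benchmark settings.
import torch
import torch.nn.functional as F

batch, heads, seq_len, head_dim = 1, 8, 256, 64
q = torch.randn(batch, heads, seq_len, head_dim, dtype=torch.bfloat16)
k = torch.randn(batch, heads, seq_len, head_dim, dtype=torch.bfloat16)
v = torch.randn(batch, heads, seq_len, head_dim, dtype=torch.bfloat16)

out = F.scaled_dot_product_attention(q, k, v)  # uses PyTorch's CPU SDPA routine
print(out.shape, out.dtype)  # torch.Size([1, 8, 256, 64]) torch.bfloat16
```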
diff --git a/_posts/2023-10-10-real-time-speech-rec.md b/_posts/2023-10-10-real-time-speech-rec.md deleted file mode 100644 index 30f9463074a9..000000000000 --- a/_posts/2023-10-10-real-time-speech-rec.md +++ /dev/null @@ -1,257 +0,0 @@ ---- -layout: blog_detail -title: "Real-time Audio-visual Speech Recognition" -author: Team PyTorch ---- - -Audio-Visual Speech Recognition (AV-ASR, or AVSR) is the task of transcribing text from audio and visual streams, which has recently attracted a lot of research attention due to its robustness to noise. The vast majority of work to date has focused on developing AV-ASR models for non-streaming recognition; studies on streaming AV-ASR are very limited. - -We have developed a compact real-time speech recognition system based on TorchAudio, a library for audio and signal processing with [PyTorch](http://pytorch.org). It can run locally on a laptop with high accuracy without accessing the cloud. Today, we are releasing [the real-time AV-ASR recipe](https://github.com/pytorch/audio/tree/main/examples/avsr) under a permissive open license (BSD-2-Clause license), enabling a broad set of applications and fostering further research on audio-visual models for speech recognition. - -This work is part of our approach to [AV-ASR research](https://arxiv.org/abs/2303.14307). A promising aspect of this approach is its ability to automatically annotate large-scale audio-visual datasets, which enables the training of more accurate and robust speech recognition systems. Furthermore, this technology has the potential to run on smart devices since it achieves the latency and memory efficiency that such devices require for inference. - -In the future, speech recognition systems are expected to power applications in numerous domains. One of the primary applications of AV-ASR is to enhance the performance of ASR in noisy environments. Since visual streams are not affected by acoustic noise, integrating them into an audio-visual speech recognition model can compensate for the performance drop of ASR models. Our AV-ASR system has the potential to serve multiple purposes beyond speech recognition, such as text summarization, translation and even text-to-speech conversion. Moreover, the exclusive use of VSR can be useful in certain scenarios, e.g. where speaking is not allowed, in meetings, and where privacy in public conversations is desired. - - -# AV-ASR - - - -![Fig. 1 The pipeline for audio-visual speech recognition system](/assets/images/real-time-speech-rec/pipeline.jpg){:style="width:100%;"} - -

        Fig. 1: The pipeline for audio-visual speech recognition system

        - - -Our real-time AV-ASR system is presented in Fig. 1. It consists of three components: a data collection module, a pre-processing module and an end-to-end model. The data collection module comprises hardware devices, such as a microphone and camera. Its role is to collect information from the real world. Once the information is collected, the pre-processing module locates and crops out the face. Next, we feed the raw audio stream and the pre-processed video stream into our end-to-end model for inference. - - -## Data collection - -We use `torchaudio.io.StreamReader` to capture audio/video from streaming device input, e.g., the microphone and camera on a laptop (a minimal capture sketch is shown after Table 1). Once the raw video and audio streams are collected, the pre-processing module locates and crops faces. It should be noted that data is immediately deleted during the streaming process. - - -## Pre-processing - -Before feeding the raw stream into our model, each video sequence has to undergo a specific pre-processing procedure. This involves three critical steps. The first step is to perform face detection. Following that, each individual frame is aligned to a reference frame, commonly known as the mean face, in order to normalize rotation and size differences across frames. The final step in the pre-processing module is to crop the face region from the aligned face image. We would like to clearly note that our model is fed with raw audio waveforms and pixels of the face, without any further preprocessing like face parsing or landmark detection. An example of the pre-processing procedure is illustrated in Table 1. - - - - - - - - - - - - - - - -
        -Original image - - - -Detected image - - -Transformed image - - -Cropped image - -
        - 0. Original - -1. Detection - -2. Alignment - -3. Crop -
        - -

        Table 1: Preprocessing pipeline.
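As referenced in the Data collection section above, a minimal capture sketch with `torchaudio.io.StreamReader` might look like the following; the source, stream format and chunk sizes are OS- and hardware-dependent placeholders rather than the recipe's exact settings.

```
# Hedged sketch of chunked audio/video capture with torchaudio.io.StreamReader.
# "input.mp4" is a placeholder source; an OS-dependent device string can be used
# instead for live microphone/webcam capture. Chunk sizes are illustrative only.
from torchaudio.io import StreamReader

streamer = StreamReader(src="input.mp4")
streamer.add_basic_audio_stream(frames_per_chunk=16000, sample_rate=16000)
streamer.add_basic_video_stream(frames_per_chunk=25, frame_rate=25, width=340, height=340)

for audio_chunk, video_chunk in streamer.stream():
    # audio_chunk: (frames, channels) waveform; video_chunk: (frames, channels, height, width)
    pass  # hand the chunks to the pre-processing module and then to the model
```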

        - - -## Model - - - -![Fig. 2 The architecture for the audio-visual speech recognition system.](/assets/images/real-time-speech-rec/model.jpg){:style="width:100%;"} - -

        Fig. 2: The architecture for the audio-visual speech recognition system

        - - - - -We consider two configurations: Small with 12 Emformer blocks and Large with 28, with 34.9M and 383.3M parameters, respectively. Each AV-ASR model is composed of front-end encoders, a fusion module, an Emformer encoder, and a transducer model. To be specific, we use convolutional frontends to extract features from raw audio waveforms and facial images. The features are concatenated to form 1024-d features, which are then passed through a two-layer multi-layer perceptron and an Emformer transducer model. The entire network is trained using the RNN-T loss. The architecture of the proposed AV-ASR model is illustrated in Fig. 2. - - -## Analysis - -**Datasets.** We follow [Auto-AVSR: Audio-Visual Speech Recognition with Automatic Labels](https://arxiv.org/abs/2303.14307) and use publicly available audio-visual datasets including [LRS3](https://www.robots.ox.ac.uk/~vgg/data/lip_reading/), [VoxCeleb2](https://www.robots.ox.ac.uk/~vgg/data/voxceleb/vox2.html) and [AVSpeech](https://looking-to-listen.github.io/avspeech/) for training. We do not use mouth ROIs, facial landmarks or attributes during either the training or testing stage. - -**Comparisons with the state-of-the-art.** Non-streaming evaluation results on LRS3 are presented in Table 2. Our audio-visual model with an algorithmic latency of 800 ms (160 ms + 1280 ms × 0.5) yields a WER of 1.3%, which is on par with those achieved by state-of-the-art offline models such as AV-HuBERT, RAVEn, and Auto-AVSR. - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
        Method - Total Hours - WER (%) -
        ViT3D-CM - 90, 000 - 1.6 -
        AV-HuBERT - 1, 759 - 1.4 -
        RAVEn - 1, 759 - 1.4 -
        AutoAVSR - 3, 448 - 0.9 -
        Ours - 3, 068 - 1.3 -
        - -

        Table 2: Non-streaming evaluation results for audio-visual models on the LRS3 dataset.

        - -**Noisy experiments.** During training, 16 different noise types are randomly injected into the audio waveforms: 13 types from the [Demand](https://zenodo.org/record/1227121) database ('DLIVING', 'DKITCHEN', 'OMEETING', 'OOFFICE', 'PCAFETER', 'PRESTO', 'PSTATION', 'STRAFFIC', 'SPSQUARE', 'SCAFE', 'TMETRO', 'TBUS' and 'TCAR'), two more types of noise from the [speech commands](https://arxiv.org/abs/1804.03209) database (white and pink), and one more type of noise from the [NOISEX-92](https://www.sciencedirect.com/science/article/abs/pii/0167639393900953) database (babble noise). SNR levels are selected with a uniform distribution from [clean, 7.5dB, 2.5dB, -2.5dB, -7.5dB]; a small sketch of this kind of SNR-controlled noise injection follows Table 3. Results of ASR and AV-ASR models, when tested with babble noise, are shown in Table 3. With increasing noise level, the performance advantage of our audio-visual model over our audio-only model grows, indicating that incorporating visual data improves noise robustness. - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
        Type - - 10dB - 5dB - 0dB - -5dB - -10dB -
        A - 1.6 - 1.8 - 3.2 - 10.9 - 27.9 - 55.5 -
        A+V - 1.6 - 1.7 - 2.1 - 6.2 - 11.7 - 27.6 -
        - -

        Table 3: Streaming evaluation WER (%) results at various signal-to-noise ratios for our audio-only (A) and audio-visual (A+V) models on the LRS3 dataset under 0.80-second latency constraints.
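As mentioned above, noise is injected at controlled signal-to-noise ratios during training; a small, hedged sketch of this kind of SNR-controlled mixing with `torchaudio.functional.add_noise` is shown below (the waveforms and SNR value are placeholders).

```
# Hedged sketch of SNR-controlled noise injection; waveforms and SNR value are placeholders.
import torch
import torchaudio.functional as F

speech = torch.randn(1, 16000)   # 1 second of 16 kHz speech (placeholder)
babble = torch.randn(1, 16000)   # babble noise of the same length (placeholder)

snr_db = torch.tensor([-5.0])    # target signal-to-noise ratio in dB
noisy_speech = F.add_noise(speech, babble, snr_db)
```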

        - - -**Real-time factor**. The real-time factor (RTF) is an important measure of a system's ability to process real-time tasks efficiently. An RTF value of less than 1 indicates that the system meets real-time requirements (a toy RTF calculation is sketched after Table 4). We measure RTF using a laptop with an Intel® Core™ i7-12700 CPU running at 2.70 GHz and an NVIDIA GeForce RTX 3070 Ti GPU. To the best of our knowledge, this is the first AV-ASR model that reports RTFs on the LRS3 benchmark. The Small model achieves a WER of 2.6% and an RTF of 0.87 on CPU (Table 4), demonstrating its potential for real-time on-device inference applications. - - - - - - - - - - - - - - - - - - - - - - - - -
        Model - Device - Streaming WER [%] - RTF -
        Large - GPU - 1.6 - 0.35 -
        Small - GPU - 2.6 - 0.33 -
        Small - CPU - 2.6 - 0.87 -
        - -

        Table 4: Impact of AV-ASR model size and device on WER and RTF. Note that the RTF calculation includes the pre-processing step wherein the Ultra-Lightweight Face Detection Slim 320 model is used to generate face bounding boxes.
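As referenced above, the real-time factor is simply processing time divided by the duration of the audio processed; a toy illustration with made-up numbers follows.

```
# Toy illustration of the real-time factor: RTF = processing_time / audio_duration,
# so RTF < 1 means the system runs faster than real time. Numbers here are made up.
import time

def real_time_factor(process_fn, audio_seconds: float) -> float:
    start = time.perf_counter()
    process_fn()                     # e.g. run the AV-ASR pipeline on one utterance
    elapsed = time.perf_counter() - start
    return elapsed / audio_seconds

# A 10-second clip processed in 8.7 seconds gives RTF = 8.7 / 10 = 0.87.
```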

        - - -Learn more about the system from the published works below: - - - -* Shi, Yangyang, Yongqiang Wang, Chunyang Wu, Ching-Feng Yeh, Julian Chan, Frank Zhang, Duc Le, and Mike Seltzer. "Emformer: Efficient memory transformer based acoustic model for low latency streaming speech recognition." In ICASSP 2021-2021 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 6783-6787. IEEE, 2021. -* Ma, Pingchuan, Alexandros Haliassos, Adriana Fernandez-Lopez, Honglie Chen, Stavros Petridis, and Maja Pantic. "Auto-AVSR: Audio-Visual Speech Recognition with Automatic Labels." In ICASSP 2023-2023 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 1-5. IEEE, 2023. diff --git a/_posts/2023-10-11-ml-model-server-resource-saving.md b/_posts/2023-10-11-ml-model-server-resource-saving.md deleted file mode 100644 index 2572f84947eb..000000000000 --- a/_posts/2023-10-11-ml-model-server-resource-saving.md +++ /dev/null @@ -1,282 +0,0 @@ ---- -layout: blog_detail -title: "ML Model Server Resource Saving - Transition From High-Cost GPUs to Intel CPUs and oneAPI powered Software with performance" -author: Sangjune Park(Naver GplaceAI MLOps), Jooyoung Lee(Naver GplaceAI MLE), Junho Min(Naver GplaceAI MLE) ---- - -Reviewers: [Yunsang Ju](https://www.linkedin.com/in/yunsang-ju/)(Naver GplaceAI Leader), Min Jean Cho(Intel), Jing Xu(Intel), Mark Saroufim(Meta) - -## Intro - -Here, We will be sharing our experience in moving AI workloads from our GPU servers to our Intel CPU servers without any performance or quality degradation, and **saving annual costs of approximately 340 thousand U.S. Dollar** (refer to the **Conclusion**) in the process. - -We aim to provide value to our consumers by serving various AI models that enhance the Online to Offline (O2O) experience. With the ongoing growth in the demand for new models and the limited nature of high-cost resource GPUs, we needed to transition relatively lightweight AI models from GPU servers to Intel CPU servers for reducing resource consumption. In the same setting, however, the CPU server had issues where performance of rps, inference time, etc. was reduced by tens of times. We applied various engineering techniques and lightweighted the model to solve this problem, and we were able to successfully transition to the Intel CPU servers with the same performance or better performance as the GPU servers with just a three-fold scale out. - -For a more detailed introduction about our team, please refer to the [Introduction to NAVER Place AI Development Team](https://medium.com/naver-place-dev/introduction-to-naver-place-ai-development-team-a8b0630e3b23). - -I'll mention it again in the middle, but I've received a lot of help from [Grokking Pytorch Intel CPU Performance From First Principles](https://pytorch.org/tutorials/intermediate/torchserve_with_ipex.html#grokking-pytorch-intel-cpu-performance-from-first-principles) written by Intel and PyTorch in the overall work. - -## Problem Definition - -### 1: Service Architecture - -![Simplified service architecture](/assets/images/ml-model-server-resource-saving/fg1.jpg){:style="width:100%"} - - -**Simplified service architecture (Image Source: NAVER GplaceAI)** - -To facilitate understanding, a brief introduction to our service architecture will be provided. CPU intensive tasks such as preprocessing input to tensor format (then forwarded to the model) and post processing inference results to human readable output (e.g. 
natural language and image formats) are performed on the App Server (FastAPI). The Model Server (TorchServe) exclusively handles inference operations. For stable operation of the service, the following actions need to be performed with sufficient throughput and low latency. - -The specific processing sequence is as follows: - - - -* The client submits a request to the app server via the Traefik gateway. -* The app server pre-processes the input by performing actions such as resizing and transforming, and converting it into a Torch tensor, before then requesting the model server. -* The model server performs inference and returns the feature to the app server. -* The app server converts the feature into a format understandable by humans through post-processing and returns it to the client. - -### 2: Throughput and Latency Measurement - - - -![Comparison of Image Scoring Models](/assets/images/ml-model-server-resource-saving/fg1-1.jpg){:style="width:100%"} - -**Comparison of Image Scoring Models** - -With all other conditions remaining the same, we deployed on three times as many CPU server pods, yet, notably, the RPS (requests per second) and response time deteriorated by more than tenfold. While it was not surprising that CPU inference performance is inferior to GPUs, the challenging situation was evident. Given the goal of maintaining performance within limited resources, achieving an approximate **10 to 20 times performance improvement** was necessary, barring any additional scaling. - -### 3: Challenges From a Throughput Perspective - - -``` -Type Name # reqs # fails | Avg Min Max Med | req/s failures/s ---------|----------------------------------------------------------------------------|-------|-------------|-------|-------|-------|-------|--------|----------- -POST /predictions/image-scoring 37 0(0.00%) | 9031 4043 28985 8200 | 1.00 0.00 ---------|----------------------------------------------------------------------------|-------|-------------|-------|-------|-------|-------|--------|----------- - Aggregated 37 0(0.00%) | 9031 4043 28985 8200 | 1.00 0.00 -``` - - -One of the first steps TorchServe framework users might take in order to improve throughput is to increase the number of workers in TorchServe. This approach is effective on GPU servers because of parallel workload processing, aside from the linear increase in memory usage as workers scale. However, we were experiencing worse performance when increasing the number of workers. Identifying the cause of performance degradation on CPU servers required further investigation. - -### 4: Challenges From a Latency Perspective - -Our primary concern was latency. Throughput improvement is normally achievable when a system’s implementation is faithful to scale-out principles, except for perhaps very rare worst-case scenarios. However, in the case of the Image Scoring model example, even performing a single inference took more than 1 second, and as the request volume increased, latency increased to as much as 4 seconds. It was a situation where the timeout criteria to satisfy the client could not be met even with a single inference. - -## Proposed Solutions - -Improvements were needed from both an ML and an engineering perspective. It was essential to fundamentally reduce the inference time on the CPU and to identify the causes of performance degradation when applying configurations that generally enhance performance, in order to find the optimal configuration values. 
To accomplish this, collaboration was established with MLE professionals to concurrently execute tasks encompassing ‘model lightweighting without compromising performance’, and ‘Identify optimal configurations for achieving peak performance’. Using the aforementioned approaches we were able to effectively transition workload handling to our CPU servers. - -### 1: Resolving Low RPS from an Engineering Perspective - -First, the reason for performance degradation even after increasing the worker number was the front-end bound caused by logical threads in GEMM operations. Generally, when increasing the number of workers, the expected improvement effect is the increase in parallelism. Conversely, if performance decreases, one can infer the corresponding trade-off effect. - - - -![CPU + GPU](/assets/images/ml-model-server-resource-saving/fg2.jpg){:style="width:100%; max-width: 420px; display: block; margin-left: auto; margin-right: auto"} - -**Image Source: [Nvidia](https://blogs.nvidia.com/blog/2018/06/11/what-is-a-virtual-gpu/)** - -As many are aware, the reason model inference performance on CPUs is inferior to GPUs lies in the difference in hardware design, particularly in terms of multi-threading capabilities. Diving deeper, model inference is fundamentally a repetition of **GEMM (General Matrix Multiply)** operations, and these GEMM operations are executed independently in **“fused-multiply-add” (FMA)** or **“dot-product” (DP)** execution units. If the GEMM operation becomes a bottleneck on the CPU, increasing parallelism might actually result in decreased performance. While researching the problem we found relevant information within the [PyTorch documentation](https://pytorch-geometric.readthedocs.io/en/latest/advanced/cpu_affinity.html#binding-processes-to-physical-cores). - -**_While two logical threads run GEMM at the same time, they will be sharing the same core resources causing front-end bound_** - -This information highlighted that logical threads could cause a bottleneck in CPU GEMM operations, which helped us intuitively understand why performance decreased when increasing the worker num. This is because the default value of the torch thread corresponds to the physical core value of the CPU. - - -``` -root@test-pod:/# lscpu - … -Thread(s) per core: 2 -Core(s) per socket: 12 - … -root@test-pod:/# python ->>> import torch ->>> print(torch.get_num_threads()) -24 -``` - - -When the worker_num increases, the total thread count increases by the product of the physical core * worker number. Consequently, logical threads are utilized. In order to improve performance, the total number of threads per worker was adjusted to align with the physical core count. Below, it can be observed that the metric RPS **increased approximately threefold** to 6.3(from the previous value of 2.1) when the worker_num was increased to 4 and the total thread count was aligned with the number of physical cores. 
- - -``` -Type Name # reqs # fails | Avg Min Max Med | req/s failures/s ---------|----------------------------------------------------------------------------|-------|-------------|-------|-------|-------|-------|--------|----------- -POST /predictions/image-scoring 265 0(0.00%) | 3154 1885 4008 3200 | 6.30 0.00 ---------|----------------------------------------------------------------------------|-------|-------------|-------|-------|-------|-------|--------|----------- - Aggregated 265 0(0.00%) | 3154 1885 4008 3200 | 6.30 0.00 -``` - - -**Cautionary Note 1**: Our team is Using Kubernetes to maintain our deployments. So we are adjusting the which required us to adjust according to the CPU resource limit of the pod, rather than the physical core count of the node that can be checked using the lscpu command. (Setting the torch thread of each worker to 8/4 = 2, or 24/4 = 6 resulted in performance degradation.) - -**Cautionary Note 2**: Since torch thread settings for each worker [can only be configured as integers](https://pytorch.org/docs/stable/generated/torch.set_num_threads.html), it's advisable to set the CPU limit divisible by the worker_num in order to adequately utilize CPU usage. - - - -![example](/assets/images/ml-model-server-resource-saving/fg3.jpg){:style="width:100%"} - -**ex) core=8, In the case of worker_num=3: int(8/worker_num) = 2, 2*worker_num/8 = 75%** - -![example](/assets/images/ml-model-server-resource-saving/fg4.jpg){:style="width:100%; margin-top: 30px"} - -**ex) core=8, In the case of worker_num=4: int(8/worker_num) = 2, 2*worker_num/8 = 100%** - -We also analyzed the model containers to see why we got a mere threefold improvement in performance despite a four times increase in the number of workers. Various resources were monitored, and among them, the core utilization rate was identified as the underlying cause. - - - -![threads](/assets/images/ml-model-server-resource-saving/fg5.jpg){:style="width:100%"} - -Even when the total thread count was adjusted to match the CPU(2nd Generation, Intel(R) Xeon(R) Silver 4214) limit(8 core), there were instances where computations were executed from logical thread to logical core. Due to the presence of 24 physical cores, the cores numbered 25 to 48 are classified as logical cores. The possibility of confining thread execution solely within physical cores seemed to offer the potential for further performance enhancement. The reference to this solution could be found within the source document mentioned in the PyTorch-geometric article that warned about CPU GEMM bottlenecks. - - - -* Reference Documentation: [Grokking Pytorch Intel CPU Performance From First Principles](https://pytorch.org/tutorials/intermediate/torchserve_with_ipex.html#grokking-pytorch-intel-cpu-performance-from-first-principles) - -As per the instructions in the document, Intel provides Intel® Extension for PyTorch where we can simply pin cores to specific sockets. 
The application method is also very simple: add the following settings to the **torchserve config.properties** file (we used intel_extension_for_pytorch==1.13.0).

```
ipex_enable=true
cpu_launcher_enable=true
```

![two-socket configuration](/assets/images/ml-model-server-resource-saving/fg6.jpg){:style="width:100%"}

**Image Source: [PyTorch](https://pytorch.org/tutorials/intermediate/torchserve_with_ipex.html#grokking-pytorch-intel-cpu-performance-from-first-principles)**

Beyond removing logical threads through socket pinning, there is the additional effect of eliminating the overhead of cross-socket cache access over UPI. Because the CPU comprises more than one socket, when threads scheduled on socket 1 are rescheduled on socket 2, they end up accessing data that is still resident in socket 1's cache across the Intel Ultra Path Interconnect (UPI). Such remote cache access is more than twice as slow as local cache access, creating additional bottlenecks. With threads pinned to sockets by the oneAPI-powered Intel® Extension for PyTorch, we observed an RPS increase of up to **four times compared to when the bottleneck existed**.

```
Type Name # reqs # fails | Avg Min Max Med | req/s failures/s
--------|----------------------------------------------------------------------------|-------|-------------|-------|-------|-------|-------|--------|-----------
POST /predictions/image-scoring 131 0(0.00%) | 3456 1412 6813 3100 | 7.90 0.00
--------|----------------------------------------------------------------------------|-------|-------------|-------|-------|-------|-------|--------|-----------
 Aggregated 131 0(0.00%) | 3456 1412 6813 3100 | 7.90 0.00
```

**Cautionary Note 1**: Intel® Extension for PyTorch specializes in optimizing neural network (referred to as "nn" hereafter) inference, so the performance improvement for stages outside the nn might be minimal. Indeed, for the image scoring system highlighted as an example, where SVR (support vector regression) is applied post-inference, the performance enhancement was confined to a 4-fold increase. However, for a purely nn inference model such as the food recognition model, **a performance boost of 7-fold (2.5 rps -> 17.5 rps)** was observed.

```
Type Name # reqs # fails | Avg Min Max Med | req/s failures/s
--------|----------------------------------------------------------------------------|-------|-------------|-------|-------|-------|-------|--------|-----------
POST /predictions/food-classification 446 0(0.00%) | 1113 249 1804 1200 | 17.50 0.00
--------|----------------------------------------------------------------------------|-------|-------------|-------|-------|-------|-------|--------|-----------
 Aggregated 446 0(0.00%) | 1113 249 1804 1200 | 17.50 0.00
```

**Cautionary Note 2**: Applying Intel® Extension for PyTorch requires **torchserve version 0.6.1 or higher**. Since our team was using version 0.6.0, socket pinning did not work correctly at first. We have since updated the guide document to specify the required version.
Within [WorkerLifeCycle.java](https://github.com/pytorch/serve/blob/4236a86dc0a018198ecd3fe261e835b416df739e/frontend/server/src/main/java/org/pytorch/serve/wlm/WorkerLifeCycle.java), multi-worker pinning is not supported in 0.6.0 and below (--ninstance is hardcoded to 1):

```
// 0.6.0 version

public ArrayList<String> launcherArgsToList() {
    ArrayList<String> arrlist = new ArrayList<>();
    arrlist.add("-m");
    arrlist.add("intel_extension_for_pytorch.cpu.launch");
    arrlist.add("--ninstance");
    arrlist.add("1");
    if (launcherArgs != null && launcherArgs.length() > 1) {
        String[] argarray = launcherArgs.split(" ");
        for (int i = 0; i < argarray.length; i++) {
            arrlist.add(argarray[i]);
        }
    }
    return arrlist;
}

// master version

if (this.numWorker > 1) {
    argl.add("--ninstances");
    argl.add(String.valueOf(this.numWorker));
    argl.add("--instance_idx");
    argl.add(String.valueOf(this.currNumRunningWorkers));
}
```

### 2: Addressing Slow Latency Through Model Lightweighting

We also streamlined our model using **Knowledge Distillation** (commonly abbreviated as KD) to further reduce latency. As is widely known, KD is a technique where knowledge from a larger network (teacher network) is transferred to a smaller, lightweight network (student network) that is less resource intensive and can be more readily deployed. For more detailed information, please refer to the paper where this concept was initially introduced, titled Distilling the Knowledge in a Neural Network.

![neural networks](/assets/images/ml-model-server-resource-saving/fg7.jpg){:style="width:100%"}

There is a variety of KD techniques available, and because we were primarily focused on **accuracy loss minimization**, we adopted the approach from the paper [Knowledge Distillation from A Stronger Teacher](https://arxiv.org/pdf/2205.10536.pdf), published in 2022. The concept is straightforward: unlike conventional distillation, which uses only the model's output probability values, the chosen approach has the student network learn the correlations between classes in the teacher network. When put into actual application, we observed an effective reduction in the model's weight while maintaining high accuracy. The following are the outcomes of our experiments with this knowledge distillation technique on several candidate student models, where selections were made based on the level of accuracy maintained.

![table of services](/assets/images/ml-model-server-resource-saving/fg8.jpg){:style="width:100%"}

For the image scoring system, additional measures were taken to reduce the input size. Since the prior pipeline already used the CPU-based ML technique SVR (Support Vector Regression) in a 2-stage setup (CNN + SVR), simply streamlining it into a 1-stage model did not yield significant speed advantages in CPU inference. For the streamlining to pay off, the input size of the student model during inference needed further reduction, so experiments were conducted with the size reduced from 384*384 to 224*224.

To simplify further, the 2-stage (CNN + SVR) approach was unified into a 1-stage model based on a larger ConvNeXt, and then KD onto the lightweight EfficientNet was applied to resolve the accuracy trade-off. During the experiments, we encountered a problem where changing Img_resize to 224 caused the MAE to degrade from 0.4007 to 0.4296.
Due to the reduction in input size, the various preprocessing techniques applied to the original training images (such as Affine, RandomRotate90, Blur, OneOf [GridDistortion, OpticalDistortion, ElasticTransform], VerticalFlip) had a counterproductive effect. After tuning these preprocessing steps for the smaller input, effective training of the student was achieved, and the **MAE value improved by 25% compared to the previous one (.518 to .3876)**.

## Validation

### 1: Final Performance Measurement

The following shows the final performance improvements using CPU servers for the three models mentioned throughout this article.

```
# Food photo classifier (pod 3): 2.5rps -> 84 rps

Type Name # reqs # fails | Avg Min Max Med | req/s failures/s
--------|----------------------------------------------------------------------------|-------|-------------|-------|-------|-------|-------|--------|-----------
POST /predictions/food-classification 2341 0(0.00%) | 208 130 508 200 | 84.50 0.00
--------|----------------------------------------------------------------------------|-------|-------------|-------|-------|-------|-------|--------|-----------
 Aggregated 2341 0(0.00%) | 208 130 508 200 | 84.50 0.00

# Image scoring (pod 3): 2.1rps -> 62rps

Type Name # reqs # fails | Avg Min Max Med | req/s failures/s
--------|----------------------------------------------------------------------------|-------|-------------|-------|-------|-------|-------|--------|-----------
POST /predictions/image-scoring 1298 0(0.00%) | 323 99 607 370 | 61.90 0.00
--------|----------------------------------------------------------------------------|-------|-------------|-------|-------|-------|-------|--------|-----------
 Aggregated 1298 0(0.00%) | 323 99 607 370 | 61.90 0.00

# Receipt classifier (pod 3): 20rps -> 111.8rps

Type Name # reqs # fails | Avg Min Max Med | req/s failures/s
--------|----------------------------------------------------------------------------|-------|-------------|-------|-------|-------|-------|--------|-----------
POST /predictions/receipt-classification 4024 0(0.00%) | 266 133 2211 200 | 111.8 0.00
--------|----------------------------------------------------------------------------|-------|-------------|-------|-------|-------|-------|--------|-----------
 Aggregated 4020 0(0.00%) | 266 133 2211 200 | 111.8 0.00
```

### 2: Traffic Mirroring

As previously mentioned, our team's service architecture employs "traefik" as a gateway in front of the app server, as briefly introduced at the beginning of the article. For final validation, the mirroring feature of this traefik gateway was used to mirror production traffic to staging for a month of validation before switching production over; the CPU-based setup is now in operation.

Details regarding mirroring are beyond the scope of this article and hence omitted. For those interested, kindly refer to the documentation at [https://doc.traefik.io/traefik/routing/services/#mirroring-service](https://doc.traefik.io/traefik/routing/services/#mirroring-service).

## In Conclusion

This concludes the discussion of transitioning from a GPU model server to a CPU server while maintaining service quality. Through this effort, our team **was able to save 15 GPUs each in South Korea and Japan**, resulting in **annual cost savings of approximately 340 thousand U.S. dollars**.
Although we directly purchase and use GPUs within NAVER, we calculated a rough cost reduction [based on AWS EC2 instances](https://aws.amazon.com/ko/ec2/instance-types/g4/) that stably support T4 GPUs. - -![instance sizes](/assets/images/ml-model-server-resource-saving/fg9.jpg){:style="width:100%"} - - -**Calculation: 1.306 (1-year reserved instance effective hourly cost) * 24 (hours) * 365 (days) * 15 (number of GPUs) * 2 (KR + JP)** - -These secured GPUs will be harnessed to further advance and enhance our team's AI services, delivering exceptional service experiences. We sincerely appreciate your encouragement and anticipation.:) - -## Explore More - -- [https://www.intel.com/content/www/us/en/developer/ecosystem/pytorch-foundation.html](https://www.intel.com/content/www/us/en/developer/ecosystem/pytorch-foundation.html) -- [https://pytorch-geometric.readthedocs.io/en/latest/advanced/CPU_affinity.html#binding-processes-to-physical-cores](https://pytorch-geometric.readthedocs.io/en/latest/advanced/cpu_affinity.html#binding-processes-to-physical-cores) -- [https://arxiv.org/pdf/2205.10536.pdf](https://arxiv.org/pdf/2205.10536.pdf) \ No newline at end of file diff --git a/_posts/2023-10-13-flash-decoding.md b/_posts/2023-10-13-flash-decoding.md deleted file mode 100644 index 6f2a5e6a450f..000000000000 --- a/_posts/2023-10-13-flash-decoding.md +++ /dev/null @@ -1,108 +0,0 @@ ---- -layout: blog_detail -title: "Flash-Decoding for long-context inference" -author: Tri Dao, Daniel Haziza, Francisco Massa, Grigory Sizov ---- - -## Motivation - -Large language models (LLM) such as ChatGPT or Llama have received unprecedented attention lately. However, they remain massively expensive to run. Even though generating a single response can cost about $0.01 (a few seconds of an 8xA100 instance on AWS), the costs quickly add up when scaling to billions of users, who could have multiple daily interactions with such LLMs. Some use cases are more expensive, like code auto-completion, because it runs whenever a new character is typed. As LLM applications multiply, even small efficiency gains to the generation time can have a massive impact. - -LLM inference (or “decoding”) is an iterative process: tokens are generated one at a time. Generating full sentences of N tokens requires N forward passes through the model. Fortunately, it is possible to cache previously calculated tokens: this means that a single generation step does not depend on the context length, except for a single operation, the attention. This operation does not scale well with context length. - -There are a number of important emerging use cases of LLMs that utilize a long context. With a longer context, LLMs can reason about longer documents, either to summarize or answer questions about them, they can keep track of longer conversations, or even process entire codebases before writing code. As an example, most LLMs had a context length of up to 2k in 2022 (GPT-3), but we now have open-source LLMs scaling up to 32k ([Llama-2-32k](https://together.ai/blog/llama-2-7b-32k)), or even 100k more recently ([CodeLlama](https://about.fb.com/news/2023/08/code-llama-ai-for-coding/)). In this setting, attention takes a significant fraction of time during inference. - -When scaling on the batch size dimension, the attention can also become a bottleneck even with relatively small contexts. This is because the amount of memory to read scales with the batch dimension, whereas it only depends on the model size for the rest of the model. 
We present a technique, Flash-Decoding, that significantly speeds up attention during inference, bringing up to 8x faster generation for very long sequences. The main idea is to load the keys and values in parallel as fast as possible, then separately rescale and combine the results to maintain the right attention outputs.

## Multi-head attention for decoding

During decoding, every new token that is generated needs to attend to all previous tokens, to compute:

softmax(queries @ keys.transpose) @ values

This operation has been optimized with FlashAttention (v1 and v2 recently) in the training case, where the bottleneck is the memory bandwidth to read and write the intermediate results (e.g. Q @ K^T). However, these optimizations don't apply directly to the inference case, because the bottlenecks are different. For training, FlashAttention parallelizes across the batch size and query length dimensions. During inference, the query length is typically 1: this means that if the batch size is smaller than the number of streaming multiprocessors (SMs) on the GPU (108 for an A100), the operation will only use a small part of the GPU! This is especially the case when using long contexts, because it requires smaller batch sizes to fit in GPU memory. With a batch size of 1, FlashAttention will use less than 1% of the GPU!

![FlashAttention](/assets/images/Inference_regular_attn.gif){:style="width:100%; display: block; margin-left: auto; margin-right: auto"}

_FlashAttention parallelizes across blocks of queries and batch size only, and does not manage to occupy the entire GPU during decoding_

The attention can also be done using matrix multiplication primitives - without using FlashAttention. In this case, the operation occupies the GPU entirely, but launches many kernels that write and read intermediate results, which is not optimal.

## A faster attention for decoding: Flash-Decoding

Our new approach Flash-Decoding is based on FlashAttention, and adds a new parallelization dimension: the keys/values sequence length. It combines the benefits of the 2 approaches from above. Like FlashAttention, it stores very little extra data to global memory, however it fully utilizes the GPU even when the batch size is small, as long as the context length is large enough.

![Flash-Decoding](/assets/images/inference_splitkv.gif){:style="width:100%; display: block; margin-left: auto; margin-right: auto"}

_Flash-Decoding also parallelizes across keys and values, at the cost of a small final reduction step_

Flash-Decoding works in 3 steps:

1. First, we split the keys/values into smaller chunks.
2. We compute the attention of the query with each of these splits in parallel using FlashAttention. We also write 1 extra scalar per row and per split: the log-sum-exp of the attention values.
3. Finally, we compute the actual output by reducing over all the splits, using the log-sum-exp to scale the contribution of each split.

All of this is possible because the attention/softmax can be calculated iteratively. In Flash-Decoding, it is used at 2 levels: within splits (like FlashAttention), and across splits to perform the final reduction.

In practice, step (1) does not involve any GPU operation, as the key/value chunks are views of the full key/value tensors. We then have 2 separate kernels to perform respectively (2) and (3). The toy sketch below illustrates why this split-and-rescale reduction recovers the exact attention output.
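The following is only an illustrative sketch of the math, not the actual fused CUDA kernels; the shapes, function names, and number of splits are assumptions chosen for readability:

```
import torch

def attend_chunk(q, k_chunk, v_chunk):
    # Attention of one query against a single KV chunk, plus the chunk's log-sum-exp.
    s = k_chunk @ q                           # scores, shape (chunk_len,)
    lse = torch.logsumexp(s, dim=0)           # the extra scalar written out per split
    out = torch.softmax(s, dim=0) @ v_chunk   # chunk-local output, shape (head_dim,)
    return out, lse

def flash_decoding_sketch(q, k, v, n_splits=4):
    outs, lses = zip(*(attend_chunk(q, kc, vc)
                       for kc, vc in zip(k.chunk(n_splits), v.chunk(n_splits))))
    w = torch.softmax(torch.stack(lses), dim=0)   # rescale each split's contribution
    return sum(wi * oi for wi, oi in zip(w, outs))

# Sanity check against the direct computation.
d, n = 128, 4096
q, k, v = torch.randn(d), torch.randn(n, d), torch.randn(n, d)
reference = torch.softmax(k @ q, dim=0) @ v
assert torch.allclose(flash_decoding_sketch(q, k, v), reference, atol=1e-4)
```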
## Benchmarks on CodeLlama 34B

To validate this approach, we benchmark the decoding throughput of CodeLLaMa-34b. This model has the same architecture as Llama 2, and more generally results should generalize across many LLMs. We measure the decoding speed in tok/s at various sequence lengths, from 512 to 64k, and compare multiple ways of calculating the attention:

- PyTorch: Running the attention using pure PyTorch primitives (without using FlashAttention)
- FlashAttention v2
- FasterTransformer: Uses the FasterTransformer attention kernel
- Flash-Decoding
- And an upper bound calculated as the time it takes to read from memory the entire model along with the KV-cache

Flash-Decoding unlocks up to 8x speedups in decoding speed for very large sequences, and scales much better than alternative approaches.

![CodeLlama](/assets/images/decoding_codellama34b.png){:style="width:100%; display: block; margin-left: auto; margin-right: auto"}

_All approaches perform similarly for small prompts, but scale poorly as the sequence length increases from 512 to 64k, except Flash-Decoding. In this regime (batch size 1) with Flash-Decoding, scaling the sequence length has little impact on generation speed_

## Component-level micro-benchmarks

We also micro-benchmark the scaled multi-head attention for various sequence lengths and batch sizes on A100 with inputs in f16. We set the batch size to 1, and use 16 query heads of dimension 128, for 2 key/value heads (grouped-query attention), which matches the dimensions used in CodeLLaMa-34b when running on 4 GPUs.

| Setting \ Algorithm | PyTorch Eager (us) | Flash-Attention v2.0.9 (us) | Flash-Decoding (us) |
| ------------------- | ------------- | ---------------------- | -------------- |
| B=256, seqlen=256 | 3058.6 | 390.5 | 63.4 |
| B=128, seqlen=512 | 3151.4 | 366.3 | 67.7 |
| B=64, seqlen=1024 | 3160.4 | 364.8 | 77.7 |
| B=32, seqlen=2048 | 3158.3 | 352 | 58.5 |
| B=16, seqlen=4096 | 3157 | 401.7 | 57 |
| B=8, seqlen=8192 | 3173.1 | 529.2 | 56.4 |
| B=4, seqlen=16384 | 3223 | 582.7 | 58.2 |
| B=2, seqlen=32768 | 3224.1 | 1156.1 | 60.3 |
| B=1, seqlen=65536 | 1335.6 | 2300.6 | 64.4 |
| B=1, seqlen=131072 | 2664 | 4592.2 | 106.6 |

_Micro-benchmark of the multi-head attention, run-time in us. Flash-Decoding achieves almost constant run-time as the sequence length scales to up to 64k._

The up to 8x speedup end-to-end measured earlier is made possible because the attention itself is up to 50x faster than FlashAttention. Up until sequence length 32k, the attention time is roughly constant, because Flash-Decoding manages to fully utilize the GPU.

## Using Flash-Decoding

Flash-Decoding is available:

- In the [FlashAttention](https://github.com/Dao-AILab/flash-attention/tree/main) package, starting at version 2.2
- Through [xFormers](https://github.com/facebookresearch/xformers) starting at version 0.0.22 through \`xformers.ops.memory\_efficient\_attention\`. The dispatcher will automatically use either the Flash-Decoding or FlashAttention approaches depending on the problem size. When these approaches are not supported, it can dispatch to an efficient triton kernel that implements the Flash-Decoding algorithm.

A full example of decoding with LLaMa v2 / CodeLLaMa is available in the FlashAttention repo [here](https://github.com/Dao-AILab/flash-attention/tree/main/examples/inference) and in the xFormers [repo](https://github.com/facebookresearch/xformers).
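For orientation, a rough sketch of the xFormers path is shown below. It is hedged: the tensor layout (`[batch, seq_len, heads, head_dim]`), the half-precision dtype, and the sizes are assumptions for illustration, and the dispatcher, not the caller, decides whether Flash-Decoding is actually used; check the xFormers documentation for the authoritative API.

```
import torch
import xformers.ops as xops

B, H, D, KV_LEN = 4, 16, 128, 65536
# Decoding: a single new query token attends to a long KV cache.
q = torch.randn(B, 1, H, D, device="cuda", dtype=torch.float16)
k = torch.randn(B, KV_LEN, H, D, device="cuda", dtype=torch.float16)
v = torch.randn_like(k)

out = xops.memory_efficient_attention(q, k, v)   # shape (B, 1, H, D)
```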
We also provide a [minimal example](https://github.com/facebookresearch/xformers/tree/main/examples/llama_inference) of an efficient decoding code for LLaMa v1/v2 models, meant to be fast, easy to read, educational and hackable. - - -### Acknowledgements - -Thanks to Erich Elsen, Ashish Vaswani, and Michaël Benesty for suggesting this idea of splitting the KVcache loading. We want to thank Jeremy Reizenstein, Patrick Labatut and Andrew Tulloch for the valuable discussions, and Quentin Carbonneaux for contributing the efficient decoding example to xFormers. We also want to thank Geeta Chauhan and Gregory Chanan for helping with the writing and more broadly contributing to getting this published on the PyTorch blog. diff --git a/_posts/2023-10-17-compiling-numpy-code.md b/_posts/2023-10-17-compiling-numpy-code.md deleted file mode 100644 index 7987cac5895c..000000000000 --- a/_posts/2023-10-17-compiling-numpy-code.md +++ /dev/null @@ -1,302 +0,0 @@ ---- -layout: blog_detail -title: "Compiling NumPy code into C++ or CUDA via torch.compile" -author: Evgeni Burovski, Ralf Gommers and Mario Lezcano ---- - -Quansight engineers have implemented support for tracing through NumPy code via -`torch.compile` in PyTorch 2.1. This feature leverages PyTorch’s compiler to -generate efficient fused vectorized code without having to modify your original -NumPy code. Even more, it also allows for executing NumPy code on CUDA -just by running it through `torch.compile` under `torch.device("cuda")`! - -In this post, we go over how to use this feature and give a few tips and tricks -to make the most out of it. - - -## Compiling NumPy code into Parallel C++ - -We take as our running example one step in a K-Means algorithm. -This piece of code is borrowed from this [NumPy book](https://realpython.com/numpy-array-programming/#clustering-algorithms) - - -``` -import numpy as np - -def kmeans(X, means): - return np.argmin(np.linalg.norm(X - means[:, None], axis=2), axis=0) -``` - - -We create a synthetic dataset with 20M random 2-D points. We can see that, -given that the means are chosen appropriately, the function returns the correct -cluster for all of them - - -``` -npts = 10_000_000 -X = np.repeat([[5, 5], [10, 10]], [npts, npts], axis=0) -X = X + np.random.randn(*X.shape) # 2 distinct "blobs" -means = np.array([[5, 5], [10, 10]]) -np_pred = kmeans(X, means) -``` - - -Benchmarking this function gives us a baseline of **1.26s** on an AMD 3970X CPU. - -Compiling this function is now as easy as wrapping it with `torch.compile` and -executing it with the example inputs - - -``` -import torch - -compiled_fn = torch.compile(kmeans) -compiled_pred = compiled_fn(X, means) -assert np.allclose(np_pred, compiled_pred) -``` - - -The compiled function yields a 9x speed-up when running it on 1 core. Even -better, as opposed to NumPy, our generated code does take advantage of all the -cores in a processor. As such, when we run it on 32 cores, we get a **57x -speed-up**. Note that PyTorch always uses all the available cores unless -explicitly restricted, so this is the default behavior you get when using -`torch.compile`. - -We may inspect the generated C++ code by running the script with the -environment variable `TORCH_LOGS=output_code`. 
When doing so, we can see that -`torch.compile` was able to compile the broadcasting and the two reductions -into just one for-loop, and parallelize it using OpenMP - - -``` -extern "C" void kernel(const double* in_ptr0, const long* in_ptr1, long* out_ptr0) { - #pragma omp parallel num_threads(32) - #pragma omp for - for(long i0=0L; i0<20000000L; i0+=1L) { - auto tmp0 = in_ptr0[2L*i0]; - auto tmp1 = in_ptr1[0L]; - auto tmp5 = in_ptr0[1L + (2L*i0)]; - auto tmp6 = in_ptr1[1L]; - // Rest of the kernel omitted for brevity -``` - - - -## Compiling NumPy code into CUDA - -Compiling our code so that it runs on CUDA is as simple as setting the -default device to be CUDA - - -``` -with torch.device("cuda"): - cuda_pred = compiled_fn(X, means) -assert np.allclose(np_pred, cuda_pred) -``` - - -By inspecting the generated code via `TORCH_LOGS=output_code`, we see that, -rather than generating CUDA code directly, `torch.compile` generates rather -readable [triton](https://triton-lang.org/main/index.html) code - - -``` -def triton_(in_ptr0, in_ptr1, out_ptr0, XBLOCK : tl.constexpr): - xnumel = 20000000 - xoffset = tl.program_id(0) * XBLOCK - xindex = xoffset + tl.arange(0, XBLOCK)[:] - xmask = xindex < xnumel - x0 = xindex - tmp0 = tl.load(in_ptr0 + (2*x0), xmask) - tmp1 = tl.load(in_ptr1 + (0)) - // Rest of the kernel omitted for brevity -``` - - -Running this small snippet on an RTX 2060 gives an **8x speed-up** over the -original NumPy code. This is something, but it is not particularly impressive, -given the speed-ups we have seen on CPU. Let’s have a look into how to squeeze -the most out of our GPU via a couple minor changes. - -`float64` vs `float32`. Many GPUs, in particular consumer-grade ones, are -rather sluggish when running operations on `float64`. For this reason, changing -the data generation to `float32`, the original NumPy code just gets a bit -faster, about a 9%, but our CUDA code gets 40% faster, yielding a 11x -speed-up over the plain NumPy code. - -`torch.compile`, by default, respects the NumPy semantics, and as such, it uses -`np.float64` as its default dtype for all its creation ops. As discussed, this -can hinder performance, so it is possible to change this default by setting - - -``` -from torch._dynamo import config -config.numpy_default_float = "float32" -``` - - -**CPU <> CUDA copies**. An 11x speed-up is good, but it is not even close to -the CPU numbers. This is caused by a small transformation that `torch.compile -`does behind the scenes. The code above takes NumPy arrays and returns NumPy -arrays. All of these arrays are on CPU, but the computations are performed on -the GPU. This means that every time the function is called, `torch.compile` has -to copy all these arrays from CPU to the GPU, and then copy the result back to -CPU to preserve the original semantics. There is no native solution to this -issue in NumPy, as NumPy does not have the notion of a `device`. That being -said, we can work around it by creating a wrapper to this function so that it -accepts PyTorch tensors and returns PyTorch tensors. - - -``` -@torch.compile -def tensor_fn(X, means): - X, means = X.numpy(), means.numpy() - ret = kmeans(X, means) - return torch.from_numpy(ret) - -def cuda_fn(X, means): - with torch.device("cuda"): - return tensor_fn(X, means) -``` - - -This function now takes tensors in CUDA memory and returns tensors in CUDA -memory, but the function itself is written in NumPy! 
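As a hypothetical usage sketch (assuming a CUDA-capable GPU and the `X`, `means`, and `np_pred` arrays from the earlier snippets), the wrapper can be exercised like this:

```
X_t = torch.from_numpy(X).cuda()
means_t = torch.from_numpy(means).cuda()

cuda_pred = cuda_fn(X_t, means_t)                 # stays on the GPU end to end
assert np.allclose(np_pred, cuda_pred.cpu().numpy())
```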
`torch.compile` uses the `numpy()` and the `from_numpy()` calls as hints, and optimizes them away, so internally it simply works with PyTorch tensors without moving the memory at all. When we keep the tensors in CUDA and perform the computations in `float32`, we see a **200x speed-up** over the initial NumPy implementation on `float32` arrays.

**Mixing NumPy and PyTorch**. In this example, we had to write a small adaptor to convert tensors to ndarrays and then back to tensors. In programs that mix PyTorch and NumPy, converting a tensor into an ndarray is often implemented as `x.detach().cpu().numpy()`, or simply `x.numpy(force=True)`. Since when running under `torch.compile` we can run NumPy code in CUDA, we can implement this conversion pattern as a call to `x.numpy()`, as we did above. Doing so and running the resulting code under `device("cuda")` will generate efficient CUDA code from the original NumPy calls without copying the data from CUDA to CPU at all. Note that the resulting code does not run without `torch.compile`. For it to run in eager mode one would need to roll back to `x.numpy(force=True)`.

## Further Speed-up tricks

**General advice**. The CUDA code we have shown is already quite efficient, but it is true that the running example is rather short. When dealing with larger programs, we may need to tweak parts of them to make them more efficient. A good place to start is the multiple [tutorials and FAQs for torch.compile](https://pytorch.org/docs/main/torch.compiler.html#read-more). These showcase a number of ways to inspect the tracing process and how to identify problematic code that may cause slowdowns.

**Advice when compiling NumPy code**. NumPy, even if rather similar to PyTorch, is often used very differently. It is rather common to perform computations in NumPy and then do an if/else depending on values within the array, or perform operations in-place, perhaps via boolean masks. These constructions, while supported by `torch.compile`, hamper its performance. Changes like writing the code in a branchless way to avoid graph breaks, or avoiding in-place ops, can go a long way.

To write fast NumPy code, it is best to avoid loops, but sometimes they are unavoidable. When tracing through a loop, `torch.compile` will try to fully unroll it. This is sometimes desirable, but sometimes it may not even be possible, for example when we have a dynamic stopping condition, as in a while loop. In these cases, it may be best to just compile the body of the loop, perhaps a few iterations at a time (loop unrolling).

**Debugging NumPy code**. Debugging is rather tricky when a compiler is involved. To figure out whether an error you are hitting is a `torch.compile` error, or an error from the program, you can execute your NumPy program without `torch.compile` by replacing the NumPy import by `import torch._numpy as np`. This should just be used for **debugging purposes** and is in no way a replacement for the PyTorch API, as it is **much slower** and, as a private API, **may change without notice**. See also [this FAQ](https://pytorch.org/docs/stable/torch.compiler_faq.html#does-numpy-work-with-torch-compile) for other tricks.

## Differences between NumPy and `torch.compile` NumPy

**NumPy scalars**. NumPy returns NumPy scalars in almost any case where PyTorch would return a 0-D tensor (e.g. from `np.sum`). Under `torch.compile`, NumPy scalars are treated as 0-D arrays. This is just fine in most cases.
The only case when their behavior diverges is when NumPy scalars are implicitly used as Python scalars. For example,

```
>>> np.asarray(2) * [1, 2, 3] # 0-D array is an array-like
array([2, 4, 6])
>>> u = np.int32(2)
>>> u * [1, 2, 3] # scalar decays into a Python int
[1, 2, 3, 1, 2, 3]
>>> torch.compile(lambda: u * [1, 2, 3])()
array([2, 4, 6]) # acts as a 0-D array, not as a scalar ?!?!
```

If we compile the first two lines, we see that `torch.compile` treats `u` as a 0-D array. To recover the eager semantics, we just need to make the casting explicit

```
>>> torch.compile(lambda: int(u) * [1, 2, 3])()
[1, 2, 3, 1, 2, 3]
```

**Type promotion and versioning**. NumPy's type promotion rules may be, at times, a bit surprising

```
>>> np.zeros(1, dtype=np.int8) + 127
array([127], dtype=int8)
>>> np.zeros(1, dtype=np.int8) + 128
array([128], dtype=int16)
```

NumPy 2.0 is changing these rules to follow others that are closer to those of PyTorch. The relevant technical document is [NEP 50](https://numpy.org/neps/nep-0050-scalar-promotion.html). `torch.compile` went ahead and implemented NEP 50 rather than the about-to-be-deprecated rules.

In general, NumPy within torch.compile follows the NumPy 2.0 pre-release.

## Beyond NumPy: SciPy and scikit-learn

In parallel to this effort of making `torch.compile` understand NumPy code, other Quansight engineers have designed and proposed a way to support PyTorch tensors within scikit-learn and SciPy. This was received enthusiastically by other maintainers from these libraries, as it was shown that using PyTorch as a backend would often yield considerable speed-ups. Both projects have now merged initial support for PyTorch tensors across a number of APIs and submodules.

This lays the groundwork for a future where PyTorch tensors can be used within other libraries in the Python data ecosystem. Even more, this will enable running these other libraries on GPUs and even compiling code mixing these libraries and PyTorch, similar to what we have discussed in this post.

If you want to learn more about this effort, how to use it, or how to help move it forward, see [this other blogpost](https://labs.quansight.org/blog/array-api-support-scikit-learn).

## Conclusion

PyTorch has committed since its inception to be a framework compatible with the rest of the Python ecosystem. Enabling the compilation of NumPy programs, and establishing the tools necessary to do the same for other prominent libraries, are two more steps in this direction. Quansight and Meta continue working hand in hand, improving the compatibility between PyTorch and the rest of the ecosystem.

From Quansight, we would like to thank Mengwei, Voz, and Ed for their invaluable help in integrating our work with `torch.compile`. We would also like to thank Meta for funding this project as well as previous work on improving NumPy compatibility within PyTorch, and the project that led to supporting PyTorch within scikit-learn and SciPy. These are giant leaps towards consolidating PyTorch as the framework of choice within the open source Python data ecosystem.
\ No newline at end of file diff --git a/_posts/2023-10-17-huawei-joins-pytorch.md b/_posts/2023-10-17-huawei-joins-pytorch.md deleted file mode 100644 index c966c497dec0..000000000000 --- a/_posts/2023-10-17-huawei-joins-pytorch.md +++ /dev/null @@ -1,64 +0,0 @@ --- layout: blog_detail title: "Huawei Joins the PyTorch Foundation as a Premier Member" ---

Today, the PyTorch Foundation, a neutral home for the deep learning community to collaborate on the open source PyTorch framework and ecosystem, announced that Huawei has joined as a premier member.

Huawei has been a long-standing supporter of and contributor to the PyTorch ecosystem, and, by advancing support for diverse computing, has made it easier for more hardware vendors' backends to plug into the PyTorch ecosystem. By joining as a premier member, Huawei will continue to optimize PyTorch to fully unleash Ascend computing capabilities.

"We are delighted to join the PyTorch Foundation, and hope to further collaborate with other member companies and expand the community to a wider audience," said Zhang Dixuan, President of Huawei Ascend Computing Business. "This move benefits Huawei, PyTorch, and the wider AI ecosystem alike. It also aligns with our long-held beliefs in openness, innovation, collaboration, and shared success, and we are confident that it will spur new innovations in the global AI community."

Huawei unveiled its All Intelligence strategy to accelerate intelligence across all industries. To cater to the demand for AI computing, Huawei invests in system-level technologies, centered on open hardware and software that enable partners and foster talent. This strategy aligns with the PyTorch Foundation's mission to develop AI as part of a sustainable open source ecosystem and to make these technological innovations accessible to everyone.

PyTorch Foundation Executive Director Ibrahim Haddad said, "We are delighted to welcome Huawei to the PyTorch Foundation. Huawei is a leading force in research on computer vision, natural language processing, speech recognition, and other emerging areas, and has proven experience in the field of foundation models. We have no doubt that we will benefit from their support and guidance."

As a premier member, Huawei is granted one seat on the PyTorch Foundation Governing Board, and will help set the policies, bylaws, and mission and vision statements that define the overarching scope of the PyTorch Foundation's initiatives, technical vision, and direction.

The Board welcomes Huawei representative Fred Li, Head of Computing Open Source Development Team at Huawei. Fred leads an active and creative team in R&D and operations projects under the principle of "upstream first", which aims to make diverse computing power ubiquitous.

To learn more about how you can be a part of the PyTorch Foundation, visit our [website](https://pytorch.org/foundation).

## About Huawei

Founded in 1987, Huawei is a leading global provider of information and communications technology (ICT) infrastructure and smart devices. We have 207,000 employees and operate in over 170 countries and regions, serving more than three billion people around the world. We are committed to bringing digital to every person, home and organization for a fully connected, intelligent world.

## About PyTorch Foundation

The PyTorch Foundation is a neutral home for the deep learning community to collaborate on the open source PyTorch framework and ecosystem.
The PyTorch Foundation is supported by its members and leading contributors to the PyTorch open source project. The Foundation leverages resources provided by members and contributors to enable community discussions and collaboration. - -## About The Linux Foundation - -The Linux Foundation is the world’s leading home for collaboration on open source software, hardware, standards, and data. Linux Foundation projects are critical to the world’s infrastructure including Linux, Kubernetes, Node.js, ONAP, PyTorch, RISC-V, SPDX, OpenChain, and more. The Linux Foundation focuses on leveraging best practices and addressing the needs of contributors, users, and solution providers to create sustainable models for open collaboration. For more information, please visit us at linuxfoundation.org. The Linux Foundation has registered trademarks and uses trademarks. For a list of trademarks of The Linux Foundation, please see its [trademark usage page](https://www.linuxfoundation.org/legal/trademark-usage). Linux is a registered trademark of Linus Torvalds. - -
        - -华为成为PyTorch基金会Premier会员 - -PyTorch 基金会是深度学习社区在开源 PyTorch 框架和生态系统上进行协作的中立家园,今天宣布华为已作为Premier会员加入。 - -华为长期以来一直是PyTorch生态系统的支持者和贡献者,通过推进多样性算力支持与改进,帮助更多厂商后端能够更加轻松地接入PyTorch生态,并积极致力于PyTorch优化,从而充分释放昇腾的算力。 - -“通过加入PyTorch基金会,我们可以进一步与其他成员公司共同协作,加速PyTorch社区的发展。”华为昇腾计算业务总裁张迪煊表示,“我们相信这对华为和 PyTorch 生态系统是互惠互利的,也符合我们长期以来开放创新,协作共赢的开源理念,为全球人工智能社区带来更多的兴奋和创新。” - -华为发布全面智能化战略,加速千行万业智能化的转型,持续通过系统级持续创新,坚持硬件开放、软件开源、使能伙伴、发展人才,以满足各行各业多样性的AI算力需求。这与 PyTorch 基金会的使命完美契合且相互补充,即通过培育和维持开源生态系统来推动人工智能的发展,并使每个人都能使用这些技术创新。 - -“华为在计算机视觉、自然语言处理、语音识别等领域进行了广泛的研究,并且在大模型领域也积累了成熟的研究经验。我们相信 PyTorch 基金会将从他们对我们的成员和生态系统的支持中受益匪浅。”PyTorch 基金会执行董事 Ibrahim Haddad 说道。 - -作为 Premier 会员,华为获得了 PyTorch 基金会董事会的一个席位。董事会通过我们的章程、使命和愿景声明制定政策,描述基金会计划、技术愿景和方向的总体范围。 - -我们很高兴欢迎华为计算开源业务总经理李永乐加入我们的董事会。李永乐目前负责华为计算产品线开源业务,他领导着一支极具创新又充满活力的技术和运营团队,他们秉持着“Upstream first”的原则,让多样性算力无处不在。 - -要了解有关如何成为 PyTorch 基金会一部分的更多信息,请访问我们的[网站](https://pytorch.org/foundation)。 - -关于华为 - -华为创立于1987年,是全球领先的ICT(信息与通信)基础设施和智能终端提供商。我们的20.7万员工遍及170多个国家和地区,为全球30多亿人口提供服务。我们致力于把数字世界带入每个人、每个家庭、每个组织,构建万物互联的智能世界。 - -关于PyTorch基金会 - -PyTorch 基金会是深度学习社区在开源 PyTorch 框架和生态系统上进行协作的中立家园。 PyTorch 基金会得到其成员和 PyTorch 开源项目主要贡献者的支持。基金会利用成员和贡献者提供的资源来促进社区讨论和协作。 - -关于Linux基金会 - -Linux 基金会是世界领先的开源软件、硬件、标准和数据协作中心。 Linux 基金会项目对世界基础设施至关重要,包括 Linux、Kubernetes、Node.js、ONAP、PyTorch、RISC-V、SPDX、OpenChain 等。 Linux 基金会专注于利用最佳实践并满足贡献者、用户和解决方案提供商的需求,以创建可持续的开放协作模型。欲了解更多信息,请访问我们的 linuxfoundation.org。 Linux 基金会已注册商标并使用商标。有关 Linux 基金会的商标列表,请参阅其商标使用页面:www.linuxfoundation.org/trademark-usage。 Linux 是 Linus Torvalds 的注册商标。 diff --git a/_posts/2023-10-17-lightning-ai-joins-pytorch.md b/_posts/2023-10-17-lightning-ai-joins-pytorch.md deleted file mode 100644 index ca3949257351..000000000000 --- a/_posts/2023-10-17-lightning-ai-joins-pytorch.md +++ /dev/null @@ -1,32 +0,0 @@ ---- -layout: blog_detail -title: "Lightning AI Joins the PyTorch Foundation as a Premier Member" ---- - -The PyTorch Foundation, a neutral home for the deep learning community to collaborate on the open source PyTorch framework and ecosystem, is announcing today that Lightning AI has joined as a premier member. - -Lightning AI is the company behind PyTorch Lightning, the platform and open-source framework for companies to build and deploy AI products leveraging the latest generative AI models. - -“This is a very important milestone for Lightning AI and the PyTorch Lightning community,” remarks Luca Antiga, Chief Technology Officer of Lightning AI. “By joining the PyTorch Foundation, we are strengthening our commitment to boost the adoption of PyTorch across industries. We look forward to partnering with the Foundation to push the vision of PyTorch forward.” - -PyTorch Lightning is one of the leading projects in the PyTorch ecosystem, allowing developers to build, train, fine-tune and deploy AI models at scale. PyTorch Lightning is helping drive the rapid adoption of PyTorch by both the research community and the enterprise. - -“Lightning AI has been a great steward of the AI community, and notably a key contributor to PyTorch over the years,” said PyTorch Foundation Executive Director Ibrahim Haddad. “Their goal of making AI research scalable directly aligns with our mission at the foundation.” - -As a premier member, Lightning AI is granted one seat to the PyTorch Foundation Governing Board. The Board sets policy through our bylaws, mission and vision statements, describing the overarching scope of foundation initiatives, technical vision, and direction. 
- -We’re happy to welcome Luca Antiga, Chief Technology Officer at Lightning AI, to our board. Luca joined the Lightning AI team in April 2021 when the Tensorwerk team joined Grid AI. Prior to joining Lightning AI, Luca co-founded Orobix, an applied AI company, and Tensorwerk. He was an early core contributor to PyTorch and co-authored Deep Learning with PyTorch (Manning). - -To learn more about how you can be a part of the PyTorch Foundation, visit our [website](https://pytorch.org/foundation). - -## About Lightning AI - -Lightning AI is the creator of PyTorch Lightning, the deep learning platform and open-source framework of choice for developers and companies seeking to build and deploy AI products. - -## About PyTorch Foundation - -The PyTorch Foundation is a neutral home for the deep learning community to collaborate on the open source PyTorch framework and ecosystem. The PyTorch Foundation is supported by its members and leading contributors to the PyTorch open source project. The Foundation leverages resources provided by members and contributors to enable community discussions and collaboration. - -## About The Linux Foundation - -The Linux Foundation is the world’s leading home for collaboration on open source software, hardware, standards, and data. Linux Foundation projects are critical to the world’s infrastructure including Linux, Kubernetes, Node.js, ONAP, PyTorch, RISC-V, SPDX, OpenChain, and more. The Linux Foundation focuses on leveraging best practices and addressing the needs of contributors, users, and solution providers to create sustainable models for open collaboration. For more information, please visit us at linuxfoundation.org. The Linux Foundation has registered trademarks and uses trademarks. For a list of trademarks of The Linux Foundation, please see its [trademark usage page](https://www.linuxfoundation.org/legal/trademark-usage). Linux is a registered trademark of Linus Torvalds. \ No newline at end of file diff --git a/_posts/2023-10-17-pytorch-edge.md b/_posts/2023-10-17-pytorch-edge.md deleted file mode 100644 index 34173b1e49c3..000000000000 --- a/_posts/2023-10-17-pytorch-edge.md +++ /dev/null @@ -1,37 +0,0 @@ ---- -layout: blog_detail -title: "PyTorch Edge: Enabling On-Device Inference Across Mobile and Edge Devices with ExecuTorch" -author: the PyTorch Edge Team ---- - -We are excited to announce ExecuTorch, our all-new solution for enabling on-device inference capabilities across mobile and edge devices with the backing of industry leaders like Arm, Apple, and Qualcomm Innovation Center. - -As part of PyTorch Edge's vision for the future of the on-device AI stack and ecosystem, ExecuTorch addresses the fragmentation in the on-device AI ecosystem. It offers a design that provides extension points for seamless third-party integration to accelerate ML models on specialized hardware. Our partners have contributed custom delegate implementations to optimize model inference execution on their respective hardware platforms. - -We have created extensive documentation that provides more details about ExecuTorch’s architecture, its high-level components, example ML models running on ExecuTorch, and end-to-end tutorials for exporting and running a model on various hardware devices. We are excited to see all of the innovative use cases of ExecuTorch built by the community. 
- - -## Key Components of ExecuTorch - -ExecuTorch offers a compact runtime with a lightweight operator registry to cover the PyTorch ecosystem of models, and a streamlined path to execute PyTorch programs on edge devices. These devices range from mobile phones to embedded hardware powered by specific delegates built by our partners. In addition, ExecuTorch ships with a Software Developer Kit (SDK) and toolchain that provide an ergonomic UX for ML Developers to go from model authoring to training and device delegation in a single PyTorch workflow. This suite of tools enables ML developers to perform on-device model profiling and better ways of debugging the original PyTorch model. - -ExecuTorch is architected from the ground up in a composable manner to allow ML developers to make decisions on what components to leverage as well as entry points to extend them if needed. This design provides the following benefits to the ML community: - -* **Portability**: Compatibility with a wide variety of computing platforms, from high-end mobile phones to highly constrained embedded systems and microcontrollers. -* **Productivity**: Enabling developers to use the same toolchains and SDK from PyTorch model authoring and conversion, to debugging and deployment to a wide variety of platforms, resulting in productivity gains. -* **Performance**: Providing end users with a seamless and high-performance experience due to a lightweight runtime as well as its ability to utilize full hardware capabilities, including general purpose CPUs and specialized purpose microprocessors such as NPUs and DSPs. - - -## PyTorch Edge: from PyTorch Mobile to ExecuTorch - -Bringing research and production environments closer together is a fundamental goal of PyTorch. ML engineers increasingly use PyTorch to author and deploy machine learning models in highly dynamic and ever-evolving environments, from servers to edge devices such as mobile phones and embedded hardware. - -With the increasing adoption of AI in Augmented Reality (AR), Virtual Reality (VR), Mixed Reality (MR), Mobile, IoT and other domains, there is a growing need for an end-to-end on-device solution that is extensible, modular, and aligned with the PyTorch stack. - -PyTorch Edge builds on the same fundamental principle of improving research to production by enabling the deployment of various ML models (spanning vision, speech, NLP, translation, ranking, integrity and content creation tasks) to edge devices via a low-friction development and deployment process. It provides a framework stack that spans the universe of on-device use-cases that the PyTorch community cares about. - -PyTorch Edge provides portability of core components that is required to reach a wide spectrum of devices which are characterized by differing hardware configurations, performance and efficiency. Such portability is achieved by allowing optimization that are custom developed for the target use-cases, and developer productivity via well defined entry-points, representations, and tools to tie all this together into a thriving ecosystem. - -PyTorch Edge is the future of the on-device AI stack and ecosystem for PyTorch. We are excited to see what the community builds with ExecuTorch’s on-device inference capabilities across mobile and edge devices backed by our industry partner delegates. - -[Learn more about PyTorch Edge and ExecuTorch](https://pytorch.org/executorch/stable/index.html). 
diff --git a/_posts/2023-10-31-amd-extends-support-for-pt-ml.md b/_posts/2023-10-31-amd-extends-support-for-pt-ml.md deleted file mode 100644 index 0e878b19c754..000000000000 --- a/_posts/2023-10-31-amd-extends-support-for-pt-ml.md +++ /dev/null @@ -1,63 +0,0 @@ ---- -layout: blog_detail -title: "AMD Extends Support for PyTorch Machine Learning Development on Select RDNA™ 3 GPUs with ROCm™ 5.7" -author: AMD ---- - -Researchers and developers working with Machine Learning (ML) models and algorithms using PyTorch can now use AMD ROCm 5.7 on Ubuntu® Linux® to tap into the parallel computing power of the Radeon™ RX 7900 XTX and the Radeon™ PRO W7900 graphics cards which are based on the AMD RDNA™ 3 GPU architecture. - -A client solution built on these two high-end GPUs enables a local, private, and cost-effective workflow for ML training and inference for those who previously relied on cloud-based solutions alone. - - -![ML Development on Desktop](/assets/images/2281965-ROCm-development-radeon.jpg){:style="width:100%; display: block; margin-left: auto; margin-right: auto"} - - -## Accelerate Machine Learning With Pytorch On Your Desktop - - - -* A local PC or workstation system running PyTorch with a Radeon 7900 series GPU presents a capable, yet affordable solution to address these growing workflow challenges thanks to large GPU memory sizes of 24GB and even 48GB. - -## Unified Software Stack For The Desktop And The Datacenter - - - -* The latest AMD ROCm 5.7 software stack for GPU programming unlocks the massively parallel compute power of these RDNA™ 3 architecture-based GPUs for use with PyTorch, one of the leading ML frameworks. The same unified software stack also supports the CDNA™ GPU architecture of the AMD Instinct™ MI series accelerators. - -## Freedom To Customize - -* The AMD ROCm platform is primarily Open-Source Software (OSS). It allows developers the freedom to customize and tailor their GPU software for their own needs while collaborating with a community of other developers, and helping each other find solutions in an agile, flexible, and rapid manner. The AMD ROCm platform’s goal is to allow users to maximize their GPU hardware investment. The AMD ROCm platform is designed to help develop, test, and deploy GPU accelerated HPC, AI, scientific computing, CAD, and other applications in a free, open source, integrated and secure software ecosystem. - -As the industry moves towards an ecosystem that supports a broad set of systems, frameworks and accelerators, AMD is determined to continue to make AI more accessible to PyTorch developers and researchers that benefit from a local client-based setup for ML development using RDNA™ 3 architecture-based desktop GPUs. 
- -## Learn More - -[https://www.amd.com/en/developer/resources/ml-radeon.html](https://www.amd.com/en/developer/resources/ml-radeon.html) - -## Download Software - -[https://www.amd.com/en/support/linux-drivers](https://www.amd.com/en/support/linux-drivers) - - - -## Visit the Documentation Portal to get started training ML models on your local desktop - -[https://rocm.docs.amd.com/projects/radeon/en/latest/](https://rocm.docs.amd.com/projects/radeon/en/latest/) - -## Prerequisites - -[https://rocm.docs.amd.com/projects/radeon/en/latest/docs/prerequisites.html](https://rocm.docs.amd.com/projects/radeon/en/latest/docs/prerequisites.html) - -## How to Guide - -[https://rocm.docs.amd.com/projects/radeon/en/latest/docs/install/howto.html](https://rocm.docs.amd.com/projects/radeon/en/latest/docs/install/howto.html) - - - -© 2023 Advanced Micro Devices, Inc. All rights reserved. AMD, the AMD Arrow logo, CDNA, Radeon, ROCm, and combinations thereof are trademarks of Advanced Micro Devices, Inc. Linux® is the registered trademark of Linus Torvalds in the U.S. and other countries. Microsoft and Windows are registered trademarks of Microsoft Corporation in the US and/or other countries. PyTorch, the PyTorch logo and any related marks are trademarks of The Linux Foundation. TensorFlow, the TensorFlow logo and any related marks are trademarks of Google Inc. Ubuntu and the Ubuntu logo are registered trademarks of Canonical Ltd. Other product names used in this publication are for identification purposes only and may be trademarks of their respective owners. - -Radeon™ AI technology is compatible with all AMD Radeon 7000 Series graphics cards and newer. Please check with your system manufacturer for feature availability prior to purchase. GD-232. - - - -1. Based on AMD internal measurements, November 2022, comparing the Radeon RX 7900 XTX at 2.5GHz boost clock with 96 CUs issuing 2X the Bfloat16 math operations per clocks vs. the RX 6900 XT GPU at 2.25 GHz boost clock and 80 CUs issue 1X the Bfloat16 math operations per clock. RX-821 diff --git a/_posts/2023-11-02-accelerating-inference.md b/_posts/2023-11-02-accelerating-inference.md deleted file mode 100644 index 0983a6e98c83..000000000000 --- a/_posts/2023-11-02-accelerating-inference.md +++ /dev/null @@ -1,47 +0,0 @@ ---- -layout: blog_detail -title: "Accelerating Inference on x86-64 Machines with oneDNN Graph" -author: Intel ---- - -_Supported in PyTorch 2.0 as a beta feature, oneDNN Graph leverages aggressive fusion patterns to accelerate inference on x86-64 machines, especially Intel® Xeon® Scalable processors._ - -[oneDNN Graph API](http://spec.oneapi.io/onednn-graph/latest/introduction.html) extends [oneDNN](http://spec.oneapi.io/versions/latest/elements/oneDNN/source/index.html) with a flexible graph API to maximize the optimization opportunity for generating efficient code on AI hardware. It automatically identifies the graph partitions to be accelerated via fusion. The [fusion patterns](http://github.com/oneapi-src/oneDNN/blob/dev-graph/doc/programming_model/ops_and_patterns.md#fusion-patterns) focus on fusing compute-intensive operations such as convolution, matmul, and their neighbor operations for both inference and training use cases. - -In PyTorch 2.0 and beyond, oneDNN Graph can help accelerate inference on x86-64 CPUs (primarily, Intel Xeon processor-based machines) with Float32 and BFloat16 (with PyTorch’s Automatic Mixed Precision support) datatypes. 
With BFloat16, speedup is limited to machines that support AVX512_BF16 ISA (Instruction Set Architecture), as well as machines that also support AMX_BF16 ISA. - - -## oneDNN Graph Usage - -From a user’s perspective, the usage is quite simple and intuitive, [with the only change in code being an API invocation](http://pytorch.org/tutorials/recipes/recipes/tuning_guide.html#use-onednn-graph-with-torchscript-for-inference). To leverage oneDNN Graph with [JIT-tracing](http://pytorch.org/docs/stable/generated/torch.jit.trace.html), a model is profiled with an example input as shown below in Figure 1. - - -![Figure 1. A code-snippet that demonstrates using oneDNN Graph](/assets/images/f1-onednn-graph-api-code-snippet.png){:style="width:100%;"} - -

_Fig. 1: A code-snippet that demonstrates using oneDNN Graph_
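Since Figure 1 is an image, a small hedged sketch of the same flow is reproduced below; the model and input are placeholders rather than the exact snippet in the figure, and the sequence of calls should be checked against the linked tuning-guide recipe:

```
import torch
import torchvision

# Placeholder model and example input, purely for illustration.
model = torchvision.models.resnet50(weights=None).eval()
example_input = torch.rand(32, 3, 224, 224)

torch.jit.enable_onednn_fusion(True)            # turn on oneDNN Graph fusion
with torch.no_grad():
    traced = torch.jit.trace(model, example_input)
    traced = torch.jit.freeze(traced)
    for _ in range(3):                          # warm-up runs trigger the fusion passes
        traced(example_input)
    output = traced(example_input)
```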

        - - -oneDNN Graph receives the model’s graph and identifies candidates for operator-fusion with respect to the input shape of the example input. Currently, only static shapes are supported. This means that any other input shape would neither be supported nor receive any performance-benefit. - - -## Measurements - -To ensure reproducibility of results, we used a [fork](http://github.com/sanchitintel/benchmark/tree/onednn-graph-preview2) of [TorchBench](http://github.com/pytorch/benchmark) to measure inference speed-up of some Vision models on an [AWS m7i.16xlarge](http://aws.amazon.com/ec2/instance-types/m7i/) instance, which uses 4th Gen Intel® Xeon® Scalable processors. - -The baseline for comparison was [torch.jit.optimize_for_inference](http://pytorch.org/docs/stable/generated/torch.jit.optimize_for_inference.html) which only supports Float32 datatype. The batch-size for each model was based on the respective batch size being used for them in TorchBench. - -In Figure 2, we depict the inference speedup of using oneDNN Graph over PyTorch alone. The geomean speedup with oneDNN Graph **for Float32 datatype was 1.24x**, and the geomean speedup **for BFloat16 datatype was 3.31x**1. - - -![Figure 2. Inference speedup with oneDNN Graph over default CPU JIT Fuser (which only uses Float32 datatype)](/assets/images/f2-inference-speedup-with-onednn-graph.png){:style="width:100%;"} - -

        Fig. 2: Inference speedup with oneDNN Graph over default CPU JIT Fuser (which only uses Float32 datatype)

        - - -## Future work - -oneDNN Graph is currently supported in PyTorch through TorchScript, but work is already underway by Intel to integrate it with the Inductor-CPU backend as a prototype feature in a future PyTorch release and Dynamo make supporting dynamic shapes easier with PyTorch, and we would like to introduce Dynamic shape support with Inductor-CPU. We also plan to add int8 quantization support. - -## Acknowledgements - -The results presented in this blog are a joint effort between Meta and the Intel PyTorch team. Special thanks to Elias Ellison from Meta who spent precious time thoroughly reviewing the PRs and gave us helpful feedback. \ No newline at end of file diff --git a/_posts/2023-11-06-high-performance-llama-2.md b/_posts/2023-11-06-high-performance-llama-2.md deleted file mode 100644 index bd047db8fe8e..000000000000 --- a/_posts/2023-11-06-high-performance-llama-2.md +++ /dev/null @@ -1,521 +0,0 @@ ---- -layout: blog_detail -title: "High-Performance Llama 2 Training and Inference with PyTorch/XLA on Cloud TPUs" -author: "Jiewen Tan, Jon Bolin, Yeounoh Chung, Liyang Lu, Siyuan Liu, Wonjoo Lee, Manfei Bai, Meghan Cowan, Jack Cao, Milad Mohammadi, Shauheen Zahirazami, Alex Spiridonov" ---- - -In a landscape where AI innovation is accelerating at an unprecedented pace, Meta’s [Llama](https://ai.meta.com/llama/) family of open sourced large language models (LLMs) stands out as a notable breakthrough. [Llama](https://ai.meta.com/blog/large-language-model-llama-meta-ai/) marked a significant step forward for LLMs, demonstrating the power of pre-trained architectures for a wide range of applications. [Llama 2](https://about.fb.com/news/2023/07/llama-2/) further pushed the boundaries of scale and capabilities, inspiring advancements in language understanding, generation, and beyond. - -Shortly after the announcement of Llama, we published a [blog post](https://pytorch.org/blog/path-achieve-low-inference-latency/) showcasing ultra-low inference latency for Llama using PyTorch/XLA on Cloud TPU v4. Building on these results, today, we are proud to share Llama 2 training and inference performance using [PyTorch/XLA](https://github.com/pytorch/xla) on Cloud TPU v4 and our newest AI supercomputer, [Cloud TPU v5e](https://cloud.google.com/blog/products/compute/announcing-cloud-tpu-v5e-and-a3-gpus-in-ga). - -In this blog post, we use Llama 2 as an example model to demonstrate the power of PyTorch/XLA on Cloud TPUs for LLM training and inference. We discuss the computation techniques and optimizations used to improve inference throughput and training model FLOPs utilization (MFU). **For Llama 2 70B parameters, we deliver 53% training MFU, 17 ms/token inference latency, 42 tokens/s/chip throughput powered by PyTorch/XLA on Google Cloud TPU.** We offer a [training user guide](https://github.com/pytorch-tpu/transformers/blob/llama2-google-next-training/SPMD_USER_GUIDE.md) and an [inference user guide](https://github.com/pytorch-tpu/llama/blob/llama2-google-next-inference/TORCH_XLA_USER_GUIDE.md) for reproducing the results in this article. Additionally, you may find our [Google Next 2023 presentation here](https://www.youtube.com/watch?v=PSpmRtWuMs8). - - -## Model Overview - -Llama 2 comes in various sizes, ranging from 7B to 70B parameters, catering to different needs, computational resources, and training / inference budgets. Whether it's small-scale projects or large-scale deployments, Llama models offer versatility and scalability to accommodate a wide range of applications. 
- -Llama 2 is an auto-regressive language model that uses an optimized transformer architecture. The largest, 70B model, uses grouped-query attention, which speeds up inference without sacrificing quality. [Llama 2 is trained on 2 trillion tokens](https://arxiv.org/pdf/2307.09288.pdf) (40% more data than Llama) and has the context length of 4,096 tokens for inference (double the context length of Llama), which enables more accuracy, fluency, and creativity for the model. - -Llama 2 is a state-of-the-art LLM that outperforms many other open source language models on many benchmarks, including reasoning, coding, proficiency, and knowledge tests. The model’s scale and complexity place many demands on AI accelerators, making it an ideal benchmark for LLM training and inference performance of PyTorch/XLA on Cloud TPUs. - - -## Performance Challenge of LLMs - -Large-scale distributed training for LLMs such as Llama 2 introduces technical challenges that require practical solutions to make the most efficient use of TPUs. Llama’s size can strain both memory and processing resources of TPUs. To address this, we use model sharding, which involves breaking down the model into smaller segments, each fitting within the capacity of a single TPU core. This enables parallelism across multiple TPUs, improving training speed while reducing communication overhead. - -Another challenge is managing the large datasets required for training Llama 2 efficiently, which requires effective data distribution and synchronization methods. Additionally, optimizing factors like learning rate schedules, gradient aggregation, and weight synchronization across distributed TPUs is crucial for achieving convergence. - -After pretraining or fine-tuning Llama 2, running inference on the model checkpoint creates additional technical challenges. All of the challenges discussed in our [previous blog post](https://pytorch.org/blog/path-achieve-low-inference-latency/), such as autoregressive decoding, variable input prompt lengths, and the need for model sharding and quantization still apply for Llama 2. In addition, Llama 2 introduced two new capabilities: grouped-query attention and early stopping. We discuss how PyTorch/XLA handles these challenges to enable high-performance, cost-efficient training and inference of Llama 2 on Cloud TPU v4 and v5e. - - -## Large-Scale Distributed Training - -PyTorch/XLA offers two major ways of doing large-scale distributed training: [SPMD](https://pytorch.org/blog/pytorch-xla-spmd/), which utilizes the XLA compiler to transform and partition a single-device program into a multi-device distributed program; and [FSDP](https://pytorch.org/blog/large-scale-training-hugging-face/), which implements the widely-adopted [Fully Sharded Data Parallel](https://engineering.fb.com/2021/07/15/open-source/fsdp/) algorithm. - -In this blog post, we show how to use the SPMD API to annotate the [HuggingFace (HF) Llama 2](https://huggingface.co/blog/llama2) implementation to maximize performance. For comparison, we also show our FSDP results with the same configurations; read about [PyTorch/XLA FSDP API here](https://github.com/pytorch/xla/blob/master/docs/fsdp.md). - - -### SPMD Overview - -Let’s briefly review the fundamentals of SPMD. For details, please refer to our [blog post](https://pytorch.org/blog/pytorch-xla-spmd/) and [user guide](https://github.com/pytorch/xla/blob/master/docs/spmd.md). 
- - -#### Mesh - -A multidimensional array that describes the logical topology of the TPU devices: - -``` -# Assuming you are running on a TPU host that has 8 devices attached -num_devices = xr.global_runtime_device_count() -# mesh shape will be (4,2) in this example -mesh_shape = (num_devices // 2, 2) -device_ids = np.array(range(num_devices)) -# axis_names 'x' and 'y' are optional -mesh = Mesh(device_ids, mesh_shape, ('x', 'y')) -``` - -#### Partition Spec - -A tuple that describes how the corresponding tensor’s dimensions are sharded across the mesh: - -``` -partition_spec = ('x', 'y') -``` - -#### Mark Sharding - -An API that takes a mesh and a partition_spec, and then generates a sharding annotation for the XLA compiler. - -``` -tensor = torch.randn(4, 4).to('xla') -# Let's resue the above mesh and partition_spec. -# It means the tensor's 0th dim is sharded 4 way and 1th dim is sharded 2 way. -xs.mark_sharding(tensor, mesh, partition_spec) -``` - -### 2D Sharding with SPMD - -In our [SPMD blog post](https://pytorch.org/blog/pytorch-xla-spmd/), we demonstrated using 1D FSDP style sharding. Here, we introduce a more powerful sharding strategy, called [2D sharding](https://arxiv.org/pdf/2105.04663.pdf), where both the parameters and activations are sharded. This new sharding strategy not only allows fitting a larger model but also boosts the MFU to up to **54.3%**. For more details, read the Benchmarks section. - -This section introduces a set of general rules that applies to most LLMs, and for convenience we directly reference the variable names and configuration names from [HF Llama](https://github.com/pytorch-tpu/transformers/blob/llama2-google-next-training/src/transformers/models/llama/modeling_llama.py). - -First, let’s create a 2D Mesh with corresponding axis names: data and model. The data axis is usually where we distribute the input data, and the model axis is where we further distribute the model. - -``` -mesh = Mesh(device_ids, mesh_shape, ('data', 'model')) -``` - -The `mesh_shape` can be a hyper-parameter that is tuned for different model sizes and hardware configurations. The same mesh will be reused in all following sharding annotations. In the next few sections, we will cover how to use the mesh to shard parameters, activations and input data. - - -#### Parameter Sharding - -Below is a table that summarizes all parameters of HF Llama 2 and corresponding partition specifications. Example HF code can be found [here](https://github.com/pytorch-tpu/transformers/blob/llama2-google-next-training/examples/pytorch/language-modeling/run_clm.py#L572). - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
| Parameter Name | Explanation | Parameter Shape | Partition Spec |
|---|---|---|---|
| embed_tokens | embedding layer | (vocab_size, hidden_size) | (model, data) |
| q_proj | attention weights | (num_heads x head_dim, hidden_size) | (data, model) |
| k_proj / v_proj | attention weights | (num_key_value_heads x head_dim, hidden_size) | (data, model) |
| o_proj | attention weights | (hidden_size, num_heads x head_dim) | (model, data) |
| gate_proj / up_proj | MLP weights | (intermediate_size, hidden_size) | (model, data) |
| down_proj | MLP weights | (hidden_size, intermediate_size) | (data, model) |
| lm_head | HF output embedding | (vocab_size, hidden_size) | (model, data) |
        - - -**Table 1: SPMD 2D Sharding Parameter Partition Spec** - -The rule is to shard the `hidden_size` dim of any weights except QKVO projections according to the `data` axis of the mesh, then shard the other dim with the remaining `model` axis. For QKVO, do the opposite. This model-data axis rotation methodology is similar to that of [Megatron-LM](https://arxiv.org/pdf/1909.08053.pdf) to reduce communication overhead. For `layernorm` weights, we implicitly mark them as replicated across different devices given they are 1D tensors. - - -#### Activation Sharding - -In order to better utilize the device memory, very often we need to annotate the output of some memory bound ops. That way the compiler is forced to only keep partial output on devices instead of the full output. In Llama 2, we explicitly annotate all `torch.matmul` and `nn.Linear` outputs. Table 2 summarizes the corresponding annotations; the example HF code can be found [here](https://github.com/pytorch-tpu/transformers/blob/llama2-google-next-training/src/transformers/models/llama/modeling_llama.py#L235). - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
| Output Name | Explanation | Output Shape | Partition Spec |
|---|---|---|---|
| inputs_embeds | embedding layer output | (batch_size, sequence_length, hidden_size) | (data, None, model) |
| query_states | attention nn.Linear output | (batch_size, sequence_length, num_heads x head_dim) | (data, None, model) |
| key_states / value_states | attention nn.Linear output | (batch_size, sequence_length, num_key_value_heads x head_dim) | (data, None, model) |
| attn_weights | attention weights | (batch_size, num_attention_heads, sequence_length, sequence_length) | (data, model, None, None) |
| attn_output | attention layer output | (batch_size, sequence_length, hidden_size) | (data, None, model) |
| up_proj / gate_proj / down_proj | MLP nn.Linear outputs | (batch_size, sequence_length, intermediate_size) | (data, None, model) |
| logits | HF output embedding output | (batch_size, sequence_length, hidden_size) | (data, None, model) |
        - - -**Table 2: SPMD 2D Sharding Activation Partition Spec** - -The rule is to shard the `batch_size` dim of any outputs according to the `data` axis of the mesh, then replicate the length dims of any outputs, and finally shard the last dim along the `model` axis. - - -#### Input Sharding - -For input sharding, the rule is to shard the batch dim along the `data` axis of the mesh, and replicate the `sequence_length` dim. Below is the example code, and the corresponding HF change may be found [here](https://github.com/pytorch-tpu/transformers/blob/llama2-google-next-training/src/transformers/trainer.py#L1456). - -``` -partition_spec = ('data', None) -sharding_spec = xs.ShardingSpec(mesh, partition_spec) -# MpDeviceLoader will shard the input data before sending to the device. -pl.MpDeviceLoader(dataloader, self.args.device, input_sharding=sharding_spec, ...) -``` - -Now, all the data and model tensors that require sharding are covered! - - -#### Optimizer States & Gradients - -You may be wondering whether it is necessary to shard the optimizer states and gradients as well. Great news: the sharding propagation feature of the XLA compiler automates the sharding annotation in these two scenarios, without needing more hints to improve performance. - -It is important to note that optimizer states are typically initialized within the first iteration of the training loop. From the standpoint of the XLA compiler, the optimizer states are the outputs of the first graph, and therefore have the sharding annotation propagated. For subsequent iterations, the optimizer states become inputs to the second graph, with the sharding annotation propagated from the first one. This is also why PyTorch/XLA typically produces two graphs for the training loops. If the optimizer states are somehow initialized before the first iteration, users will have to manually annotate them, just like the model weights. - -Again, all concrete examples of the above sharding annotation can be found in our fork of HF Transformers [here](https://github.com/pytorch-tpu/transformers/tree/llama2-google-next-training). The repo also contains code for our experimental feature [MultiSlice](https://cloud.google.com/blog/products/compute/using-cloud-tpu-multislice-to-scale-ai-workloads), including `HybridMesh` and `dcn` axis, which follows the same principles mentioned above. - - -### Caveats - -While using SPMD for training, there are a few important things to pay attention to: - - - -* Use `torch.einsum` instead of `torch.matmul`; `torch.matmul` usually flattens tensors and does a `torch.mm` at the end, and that’s bad for SPMD when the combined axes are sharded. The XLA compiler will have a hard time determining how to propagate the sharding. -* PyTorch/XLA provides patched [nn.Linear](https://github.com/pytorch/xla/blob/master/torch_xla/experimental/xla_sharding.py#L570) to overcome the above constraint: - -``` -import torch_xla.experimental.xla_sharding as xs -from torch_xla.distributed.fsdp.utils import apply_xla_patch_to_nn_linear - - model = apply_xla_patch_to_nn_linear(model, xs.xla_patched_nn_linear_forward) -``` - -* Always reuse the same mesh across all shardings -* Always specify --dataloader_drop_last yes. The last smaller data is hard to annotate. -* Large models which are initialized on the host can induce host-side OOM. 
One way to avoid this issue is to initialize parameters on the [meta device](https://github.com/pytorch-tpu/transformers/blob/llama2-google-next-training/examples/pytorch/language-modeling/run_clm.py#L501), then create and shard real tensors layer-by-layer. - - -### Infrastructure Improvements - -Besides the above modeling techniques, we have developed additional features and improvements to maximize performance, including: - - - -* We enable asynchronous collective communication. This requires enhancements on the XLA compiler’s latency hiding scheduler to better optimize for the Llama 2 PyTorch code. -* We now allow sharding annotations in the middle of the IR graph, just like JAX’s [jax.lax.with_sharding_constraint](https://jax.readthedocs.io/en/latest/_autosummary/jax.lax.with_sharding_constraint.html). Previously, only graph inputs were annotated. -* We also propagate replicated sharding spec from the compiler to the graph outputs. This allows us to shard the optimizer states automatically. - - -## Inference Optimizations - -All the PyTorch/XLA [optimizations](https://pytorch.org/blog/path-achieve-low-inference-latency/) implemented for Llama inference are applied to Llama 2 as well. That includes [Tensor Parallelism + Dynamo (torch.compile) using torch-xla collective ops](https://pytorch.org/blog/path-achieve-low-inference-latency/#fairscale-sharding), [autoregressive decoding logic improvement to avoid recompilation](https://pytorch.org/blog/path-achieve-low-inference-latency/#autoregressive-decoding-on-pytorchxla), [bucketized prompt length](https://pytorch.org/blog/path-achieve-low-inference-latency/#input-prompt-optimization), [KV-cache with compilation friendly index ops](https://pytorch.org/blog/path-achieve-low-inference-latency/#kv-cache-optimization). Llama 2 introduces two new changes: Grouped Query Attention, and Early Stopping when eos is reached for all prompts. We applied corresponding changes to promote better performance and flexibility with PyTorch/XLA. - - -### Grouped Query Attention - -Llama 2 enables [Grouped Query Attention](https://arxiv.org/pdf/2305.13245.pdf) for the 70B models. It allows the number of Key and Value heads to be smaller than the number of Query heads, while still supporting KV-cache sharding up to the number of KV heads. For the 70B models, the `n_kv_heads` is 8, which limits the tensor parallelism to be less or equal to 8. In order to shard the model checkpoint to run on more devices, the K, V projection weights need to be replicated first, and then split into multiple pieces. For example, to shard the 70B model checkpoint from 8 pieces to 16 pieces, the K, V projection weights are duplicated and split into 2 pieces for each shard. We provide a [reshard_checkpoints.py](https://github.com/pytorch-tpu/llama/blob/llama2-google-next-inference/reshard_checkpoints.py) script to handle that, and to make sure the sharded checkpoint performs mathematically identical to the original checkpoint. - - -### EOS Early Stopping - -The Llama 2 generation code added [the early stopping logic](https://github.com/facebookresearch/llama/blob/ea9f33d6d3ea8ed7d560d270986407fd6c2e52b7/llama/generation.py#L159). A `eos_reached` tensor is used to track the completion of all the prompt generations, and if the `eos` token is reached for all the prompts in the batch, the generation would stop early. The similar change is incorporated in the PyTorch/XLA optimized version as well, with some minor tweaks. 
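A minimal sketch of that early-stopping logic is shown below; the function and variable names are ours, and the single-step `decode_one_token` call is a placeholder rather than the actual generation loop from the repository. The reduced check frequency is explained in the next paragraph.

```
import torch

def generate(decode_one_token, tokens, eos_id, max_new_tokens, check_every=10):
    # Track, per sequence in the batch, whether the eos token has been produced.
    eos_reached = torch.zeros(tokens.shape[0], dtype=torch.bool, device=tokens.device)
    for step in range(max_new_tokens):
        next_token = decode_one_token(tokens)           # shape: (batch_size, 1)
        tokens = torch.cat([tokens, next_token], dim=1)
        eos_reached |= next_token.squeeze(1) == eos_id
        # Reading the tensor's value forces a device-to-host transfer, so only
        # check every `check_every` steps instead of after every new token.
        if (step + 1) % check_every == 0 and bool(eos_reached.all()):
            break
    return tokens
```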
- -In PyTorch/XLA, checking the value of a tensor like `eos_reached` as part of the control flow condition would invoke a blocking device-to-host transfer. The tensor would be transferred from device memory to CPU memory to evaluate its value, while all other logics are waiting. This introduced a delay on the scale of ms after every new token generation. As a trade-off, we reduce the rate of checking the `eos_reached` value to be [once every 10 new token generations](https://github.com/pytorch-tpu/llama/blob/b89dd0f2351c42fef367670d9d2c5b65cd0ae932/llama/generation.py#L268C13-L270C26). With this change, the impact of the blocking device-to-host transfer would be reduced by 10x, while the early stopping would still be effective, and at most 9 unnecessary tokens would be generated after each sequence reaches the `eos` token. - - -### Model Serving - -PyTorch/XLA is working on a serving strategy to enable the PyTorch community to serve their deep learning applications via [Torch.Export](https://pytorch.org/docs/stable/export.html), [StableHLO](https://github.com/openxla/stablehlo), and [SavedModel](https://www.tensorflow.org/guide/saved_model). PyTorch/XLA Serving is an experimental feature in [PyTorch/XLA 2.1 release](https://github.com/pytorch/xla/releases); for details visit our [serving user guide](https://github.com/pytorch/xla/blob/r2.1/docs/stablehlo.md#convert-saved-stablehlo-for-serving). Users can take advantage of TorchServe to run their single-host workloads. - - -## Benchmarks - - -### Metrics - -To measure training performance, we use the industry-standard metric: [Model FLOPS Utilization (MFU)](https://arxiv.org/abs/2204.02311). Model FLOPS are the floating point operations required to perform a single forward and backward pass. Model FLOPs are hardware and implementation independent and only depend on the underlying model. MFU measures how effectively the model is using the actual hardware during training. Achieving 100% MFU means that the model is using the hardware perfectly. - -To measure inference performance, we use the industry-standard metric of throughput. First, we measure latency per token when the model has been compiled and loaded. Then, we calculate throughput by dividing batch size (BS) over latency per chip. As a result, throughput measures how the model is performing in production environments regardless of how many chips are used. - - -### Results - - -#### Training Evaluation - -Figure 1 shows Llama 2 SPMD 2D sharding training results on a range of Google TPU v4 hardware with [PyTorch/XLA FSDP](https://pytorch.org/blog/scaling-pytorch-models-on-cloud-tpus-with-fsdp/) as the baseline. We increased MFU by **28%** across all sizes of Llama 2 compared to FSDP running on the same hardware configuration. This performance improvement is largely due to: 1) 2D Sharding has less communication overhead than FSDP, and 2) asynchronous collective communication is enabled in SPMD which allows communication and computation overlapping. Also note that as the model size scales, we maintain the high MFU. Table 3 shows all the hardware configurations plus some hyperparameters used in the training benchmarks. - - -![Figure 1. Llama 2 Training MFU on TPU v4 Hardware](/assets/images/high-performance-llama-2/fig1.jpg){:style="width:100%;"} - -

        Fig. 1: Llama 2 Training MFU on TPU v4 Hardware
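For reference, here is a minimal sketch of how the MFU values reported in Fig. 1 relate to measured step time, using the standard definition from the Metrics section below; the inputs are placeholders to be measured or looked up, not our benchmark numbers.

```
def model_flops_utilization(model_flops_per_step: float,
                            step_time_s: float,
                            peak_flops_per_s: float) -> float:
    """MFU: achieved model FLOP/s divided by the accelerator's peak FLOP/s."""
    achieved_flops_per_s = model_flops_per_step / step_time_s
    return achieved_flops_per_s / peak_flops_per_s

# Placeholder example: a step requiring 1e15 model FLOPs that takes 2 s on
# hardware with a 1e15 FLOP/s peak corresponds to 50% MFU.
print(model_flops_utilization(1e15, 2.0, 1e15))  # 0.5
```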

        - - -The results in Figure 1 are produced with sequence length 1,024. Figure 2 shows how the performance behaves with larger sequence lengths. It shows our performance also scales linearly with sequence lengths. The MFU is expected to decrease a little as a smaller per device batch size is needed to accommodate the additional memory pressure introduced by the larger sequence length since the sequence length axis is not sharded in 2D sharding. And TPU is very sensitive to batch size. For Llama 2, 70B parameters, the performance decrease is as low as **4%**. At the time of preparing these results, [Hugging Face Llama 2 tokenizer](https://github.com/pytorch-tpu/transformers/blob/llama2-google-next-training/src/transformers/models/llama/tokenization_llama.py#L48) limits the max model input to 2,048, preventing us from evaluating larger sequence lengths. - - - -![Figure 2. Llama 2 SPMD Training MFU on TPU v4 with Different Sequence Lengths](/assets/images/high-performance-llama-2/fig2.jpg){:style="width:100%;"} - -

        Fig. 2: Llama 2 SPMD Training MFU on TPU v4 with Different Sequence Lengths

| Model Size | TPU NumCores | Mesh Shape | Seq Len | Global Batch | Per Device Batch |
|---|---|---|---|---|---|
| 7B | V4-32 | (16, 1) | 1,024 | 256 | 16 |
| 7B | V4-32 | (16, 1) | 2,048 | 128 | 8 |
| 13B | V4-64 | (32, 1) | 1,024 | 256 | 8 |
| 13B | V4-64 | (32, 1) | 2,048 | 128 | 4 |
| 70B | V4-256 | (32, 4) | 1,024 | 512 | 16 |
| 70B | V4-256 | (32, 4) | 2,048 | 256 | 8 |
        - - -**Table 3: Llama 2 SPMD Training Benchmark TPU Configurations and Hyperparameters** - -One last thing to call out is that we use [adafactor](https://arxiv.org/abs/1804.04235) as the optimizer for better memory utilization. And once again, here is the [user guide](https://github.com/pytorch-tpu/transformers/blob/llama2-google-next-training/SPMD_USER_GUIDE.md) to reproduce the benchmark results listed above. - - -#### Inference Evaluation - -In this section, we extend our [previous evaluation of Llama on Cloud v4 TPU](https://pytorch.org/blog/path-achieve-low-inference-latency/). Here, we demonstrate the performance properties of TPU v5e for inference applications. - -We define inference throughput as the number of tokens produced by a model per second per TPU chip. Figure 3 shows Llama 2 70B throughput on a v5e-16 TPU node. Given Llama is a memory bound application, we see that applying weight-only quantization unblocks extending the model batch size to 32. Higher throughput results would be possible on larger TPU v5e hardware up to the point where the ICI network bandwidth between chips throttle the TPU slice from delivering higher throughput. Exploring the upper bound limits of TPU v5e on Llama 2 was outside of the scope of this work. Notice, to make the Llama 2 70B model run on v5e-16, we replicated the attention heads to have one head per chip as discussed in the Inference section above. As discussed [previously](https://pytorch.org/blog/path-achieve-low-inference-latency/), with increasing model batch size, per-token latency grows proportionally; quantization improves overall latency by reducing memory I/O demand. - - -![Figure 3. Llama 2 70B Inference Per-Chip Throughput on TPU v5e vs. Batch Size](/assets/images/high-performance-llama-2/fig3.jpg){:style="width:100%;"} - -

        Fig. 3: Llama 2 70B Inference Per-Chip Throughput on TPU v5e vs. Batch Size
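As a reference for the metric plotted in Figures 3 and 4, the sketch below spells out one reading of the per-chip throughput definition from the Metrics section (batch size divided by per-token latency, normalized by chip count); the numbers are placeholders, not measured results.

```
def tokens_per_second_per_chip(batch_size: int,
                               latency_per_token_s: float,
                               num_chips: int) -> float:
    # Each decoding step emits one token for every sequence in the batch.
    return batch_size / latency_per_token_s / num_chips

# Placeholder example: batch 16 at 20 ms/token on a 16-chip v5e slice.
print(tokens_per_second_per_chip(16, 0.020, 16))  # 50.0 tokens/s/chip
```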

        - - -Figure 4 shows inference throughput results across different model sizes. These results highlight the largest throughput given the hardware configuration when using `bf16` precision. With weight only quantization, this throughput reaches 42 on the 70B model. As mentioned above, increasing hardware resources may lead to performance gains. - -![Figure 4. Llama 2 Inference Per-Chip Throughput on TPU v5e](/assets/images/high-performance-llama-2/fig4.jpg){:style="width:100%;"} - -

        Fig. 4: Llama 2 Inference Per-Chip Throughput on TPU v5e

        - - -Figure 5 shows the cost of serving Llama 2 models (from Figure 4) on Cloud TPU v5e. We report the TPU v5e per-chip cost based on the 3-year commitment (reserved) price in the `us-west4` region. All model sizes use maximum sequence length of 2,048 and maximum generation length of 1,000 tokens. Note that with quantization, the cost for the 70B model drops to **$0.0036 per 1,000 tokens**. - - -![Figure 5. Llama 2 Inference Per-Chip Cost on TPU v5e](/assets/images/high-performance-llama-2/fig5.jpg){:style="width:100%;"} - -

        Fig. 5: Llama 2 Inference Per-Chip Cost on TPU v5e

        - -Figure 6 summarizes our best Llama 2 inference latency results on TPU v5e. Llama 2 7B results are obtained from our non-quantized configuration (BF16 Weight, BF16 Activation) while the 13B and 70B results are from the quantized (INT8 Weight, BF16 Activation) configuration. We attribute this observation to the inherent memory saving vs. compute overhead tradeoff of quantization; as a result, for smaller models, quantization may not lead to lower inference latency. - -Additionally, prompt length has a strong effect on the memory requirements of LLMs. For instance, we observe a latency of 1.2ms / token (i.e. 201 tokens / second / chip) when `max_seq_len=256` at batch size of 1 with no quantization on v5e-4 running Llama2 7B. - - -![Figure 6. Llama 2 Inference Latency on TPU v5e](/assets/images/high-performance-llama-2/fig6.jpg){:style="width:100%;"} - -

        Fig. 6: Llama 2 Inference Latency on TPU v5e

        - - -## Final Thoughts - -The recent wave of AI innovation has been nothing short of transformative, with breakthroughs in LLMs at the forefront. Meta's Llama and Llama 2 models stand as notable milestones in this wave of progress. PyTorch/XLA uniquely enables high-performance, cost-efficient training and inference for Llama 2 and other LLMs and generative AI models on Cloud TPUs, including the new Cloud TPU v5e. Looking forward, PyTorch/XLA will continue to push the performance limits on Cloud TPUs in both throughput and scalability and at the same time maintain the same PyTorch user experience. - -We are ecstatic about what’s ahead for PyTorch/XLA and invite the community to join us. PyTorch/XLA is developed fully in open source. So, please file issues, submit pull requests, and send RFCs to [GitHub](https://github.com/pytorch/xla) so that we can openly collaborate. You can also [try out](https://colab.sandbox.google.com/github/pytorch/xla/blob/master/contrib/colab/getting-started.ipynb) PyTorch/XLA for yourself on various XLA devices including TPUs and GPUs. - -We would like to extend our special thanks to Marcello Maggioni, Tongfei Guo, Andy Davis, Berkin Ilbeyi for their support and collaboration in this effort. - -Cheers, -The PyTorch/XLA Team at Google diff --git a/_posts/2023-11-07-pytorch-compile-to-speed-up-inference.md b/_posts/2023-11-07-pytorch-compile-to-speed-up-inference.md deleted file mode 100644 index baa9dcd861ce..000000000000 --- a/_posts/2023-11-07-pytorch-compile-to-speed-up-inference.md +++ /dev/null @@ -1,181 +0,0 @@ ---- -layout: blog_detail -title: "PyTorch compile to speed up inference on Llama 2" -author: "IBM Research: Antoni Viros i Martin, Brian Vaughan, Davis Wertheimer, Joshua Rosenkranz, Mudhakar Srivatsa, Nelson Mimura Gonzalez, Raghu Ganti, Supriyo Chakraborty, Zhuoran Liu -Meta: Geeta Chauhan, Hamid Shojanazeri" ---- - -In this blog, we discuss how to improve the inference latencies of the Llama 2 family of models using PyTorch native optimizations such as native fast kernels, compile transformations from torch compile, and tensor parallel for distributed inference. Our approach results in 29ms/token latency for single user requests on the 70B LLaMa model (as measured on 8 A100 GPUs). We are excited to share our findings with the community and make our code available [here](https://github.com/foundation-model-stack/foundation-model-stack). - - - -## Background - -We are amid a generative AI revolution with large language models of tens of billions of parameters becoming commoditized and available for use. However, it is well recognized in the community that deploying these large models in a cost-efficient manner remains a key challenge. Many different approaches have been attempted with varying degrees of success and offering different trade-offs. Hardware-specific optimizations (e.g., Faster Transformer from NVIDIA) are restricted to specific target hardware whereas approaches that rely on layers of abstraction (e.g., ONNX) enable arbitrary models but suffer from loss of efficiency. With the introduction of PyTorch compile last year, IBM and the PyTorch team started exploring the use of model compilation for inference optimizations with the goal of reducing the latency per token for generative models. - - -## Model Choice - -We choose to benchmark on the Llama 2 family of models, given their popularity. 
The models we are interested in, along with their hyperparameters relevant to this blog, are given in the table below:
| Model size | Hidden dimension | Num heads | Num layers | Attention type |
|---|---|---|---|---|
| 7B | 4096 | 32 | 32 | MHA |
| 13B | 5120 | 40 | 40 | MHA |
| 70B | 8192 | 64 | 80 | GQA |
        - - -These models are decoder only, which means that tokens get generated in a serialized manner, which is typically sped up using KV caching. We take a similar approach in our latency and throughput measurements. - - -## Inference Approach - -Our goal for inference is to provide a path for achieving the best possible latencies rapidly, to keep up with the velocity with which new model architectures are emerging in the community. A PyTorch native approach is appealing as it allows for the maximum flexibility in terms of “coverage” of models. We note that there are four orthogonal techniques that provide acceleration in inference: (a) Kernel fusion using compile, (b) Faster kernels, (c) Tensor parallel for larger models, and (d) Quantization. In our approach, we use the first three of these four levers - compile natively working with faster kernels from SDPA and a custom tensor parallel implementation that all work hand-in-glove to achieve inference latencies of 29ms/token on a 70B model as measured on 8 NVIDIA A100 GPUs with single user. - - -### Compile all the way! - -PyTorch Compile leverages tracing and graph capture to reduce the CPU overhead and in an ideal scenario results in a single graph execution/instruction from CPU to GPU. However, often compile introduces graph breaks due to model architecture and ops unsupported by compile. For example, complex operations such as einops are not supported by compile today. Similarly, tensor parallel inference can introduce graph breaks at each layer, since compile requires the tensor parallel implementation to use traceable communication collectives. If these graph breaks are not removed, the performance of the compiled artifacts will be hampered and could even be lower compared to eager mode execution. To get full benefit of the compiled artifacts, the graph breaks need to be removed. - -Below, we describe how we went about doing this for the 70b Llama 2 model and the challenges we had to overcome to get compile to work all the way through. - -Our first attempt was to try using torch.compile to compile the out-of-box Llama 2 model, but it failed because complex ops were not supported. Using TORCH_COMPILE_DEBUG = 1 we identified the RoPE positional encodings was using complex number functions resulting in graph breaks and significant slowdowns. We rewrote the RoPE function to bypass torch.einsum (Original implementation uses torch.polar that also conflicts with compile) and use torch.cos and torch.sin instead. - -``` -self.cached_freqs[dev_idx][alpha] = torch.stack( - [ - torch.cos(freqs), - -torch.sin(freqs), - torch.sin(freqs), - torch.cos(freqs), - ], - dim=2, - ).view(*freqs.shape, 2, 2) -``` - -_Our implementation of the frequencies computation_ -

        -``` -t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype) -t = t / self.scaling_factor - -freqs = torch.einsum("i,j->ij", t, self.inv_freq) -# Different from paper, but it uses a different permutation in order to obtain the same calculation -emb = torch.cat((freqs, freqs), dim=-1) -self.register_buffer("cos_cached", emb.cos()[None, None, :, :].to(dtype), persistent=False) -self.register_buffer("sin_cached", emb.sin()[None, None, :, :].to(dtype), persistent=False) -``` - -_Hugging Face implementation of the frequencies computation_ - -Once RoPE was fixed, we were able to get 7B and 13B models to compile without ANY graph breaks on a single A100 GPU. - -We used SDPA, the PyTorch native implementation of efficient attention computation with tracing enabled (for compile). To avoid graph breaks related to forcing a single algorithm choice using a Python context, the recommended way, we had to use the `torch.backends.cuda.enable_*_sdp `functions. - -``` -attn = torch.nn.functional.scaled_dot_product_attention( - queries, - keys_e, - values_e, - attn_mask=attn_mask, - dropout_p=self.p_dropout if self.training else 0.0, - is_causal=is_causal_mask, -) -``` - -_Attention computation using SDPA_ - -Next we ran the same steps for the larger 70B model and found that even with half precision, the model does not fit in a single GPU and requires tensor parallel inference. Using torch.compile for the 70B model resulted in 162 graph breaks due to two all-reduces per layer, one all-gather for forward embedding, and one all-gather for reverse embedding. Due to this, we saw no significant improvement in inference latencies. We could not use the distributed tensor implementation from PyTorch at the time of writing this blog as it did not support compile. We rewrote the tensor parallel code from scratch so that it only depends on traceable collectives to make it work with compile. After this last change, PyTorch compiler did not introduce any graph breaks and we saw a significant speedup in inference latencies. Specifically, we measured latencies for the Llama 70B model at 29ms/token when using 8 A100 GPUs, a 2.4x improvement over unoptimized inference. - - -### Serving aspects - -Finally, a point to note here is that simply performing compile on a model is not sufficient to serve the model in a production setting. To realize the above performance with high throughput, we need to support dynamic batching, nested tensors, as well as have a warm up phase where we pre-compile for bucketized sequence lengths. We are working on these aspects to realize such performance in a production setting. - - -## Experiments and Measurements - -We use nodes with 8 A100 NVIDIA GPUs with 80G cards for all our measurements in two different environments (IBM Cloud and AWS, both running OpenShift). First, we compare the various techniques – eager mode, with SDPA Flash kernel, with Compile, and with Compile and SDPA. For the 70B model, we run it in Tensor Parallel mode with compile and SDPA. For this experiment, we use 512 tokens as input length with 50 token generation. For 7 and 13B models, we use single A100 for measurement of latencies, whereas we use 8 A100s for the 70B model. In addition, for the 70B model we use the reduce-overhead option in PyTorch compile that uses CudaGraphs to reduce CPU to GPU kernel launching overheads; the use of CudaGraphs in the 7B and 13B models did not show any benefits (and are thus not reported here). 
We observe from Figure 1 that compile and SDPA provide very low latencies, with 70B Llama 2 model at 29ms/token. - - -![Figure 1. Median latency across different techniques with sequence length 512 (measured on IBM Cloud A100 servers)](/assets/images/pytorch-compile-to-speed-up-inference/fig1.jpg){:style="width:100%;"} - -

        Fig. 1: Median latency across different techniques with sequence length 512 (measured on IBM Cloud A100 servers)
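For reference, the reduce-overhead compile mode mentioned above is enabled as sketched below; the small `nn.Sequential` module is a stand-in for the (tensor-parallel) Llama 2 implementation, and this is not the exact benchmarking harness.

```
import torch
import torch.nn as nn

# Placeholder module standing in for the Llama 2 model under test.
model = nn.Sequential(nn.Linear(4096, 4096), nn.GELU(), nn.Linear(4096, 4096)).eval()

# mode="reduce-overhead" uses CUDA graphs to cut CPU-side kernel launch
# overhead; per the text above, this only helped for the 70B configuration.
compiled_model = torch.compile(model, mode="reduce-overhead")

with torch.no_grad():
    out = compiled_model(torch.randn(1, 4096))
```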

        - - -Next, we examine the impact of sequence length, where we increase it from 1024 to 4096 and observe that the median latency per token increases sub-linearly, demonstrating that when we increase context to large documents, we do not sacrifice response times. - -![Figure 2. Median latency for compile+SDPA with different sequence lengths (Measured on A100s on AWS)](/assets/images/pytorch-compile-to-speed-up-inference/fig2.jpg){:style="width:100%;"} - -

        Fig. 2: Median latency for compile+SDPA with different sequence lengths (Measured on A100s on AWS)

        - - -Finally, with increased batch sizes, we observe that the response latencies increase sub-linearly. For the 13B model, at batch size 8, we encounter an OOM. For the 70B model, given that it is running on 8 GPUs with tensor parallel, we do not see any such OOM issues. - -![Figure 3. Median latency for compile+SDPA with different batch sizes and sequence length fixed at 4096 (Measured on A100s on AWS)](/assets/images/pytorch-compile-to-speed-up-inference/fig3.jpg){:style="width:100%;"} - -

        Fig. 3: Median latency for compile+SDPA with different batch sizes and sequence length fixed at 4096 (Measured on A100s on AWS)

        - - -## Final Thoughts - -We have demonstrated how a PyTorch compile pathway for inference demonstrates ultra low latencies for 70B model inference. The next steps are to enable dynamic batching and nested tensors with the above levers. - -Special thanks to Edward Yang, Elias Ellison, Driss Guessous, Will Feng, Will Constable, Horace He, Less Wright, and Andrew Gu from Team PyTorch, whose PRs reviews and code contributions made it possible for us to realize the latencies using PyTorch native approach. We thank the broader Team PyTorch that have been tirelessly working to make PyTorch better, special shout outs to the SDPA team for enabling tracing and compile on fast kernels, the compile team that has been closely guiding us on how to work around as well as fix issues (including identifying and raising NVIDIA driver bugs in CUDA graphs). - -Inference latency has been one of the roadblocks for LLM adoption in critical enterprise workflows, but another major one is the need for safety, trustworthiness and governance. IBM’s guide for AI safety and LLM risk can be found [here](https://www.ibm.com/downloads/cas/E5KE5KRZ) and Meta’s responsible user guide for LLaMa can be found [here](https://ai.meta.com/llama/responsible-use-guide/). - - -## References - -* GitHub resources: [https://ibm.biz/fm-stack](https://ibm.biz/fm-stack) -* [The Path to Achieve Ultra-Low Inference Latency With LLaMa 65B on PyTorch/XLA](https://pytorch.org/blog/path-achieve-low-inference-latency/) -* [Speed, Python: Pick Two. How CUDA Graphs Enable Fast Python Code for Deep Learning](https://blog.fireworks.ai/speed-python-pick-two-how-cuda-graphs-enable-fast-python-code-for-deep-learning-353bf6241248) -* IBM’s resources on AI Ethics and Trust: [https://www.ibm.com/downloads/cas/E5KE5KRZ](https://www.ibm.com/downloads/cas/E5KE5KRZ) -* Meta LLaMa responsible user guide: [https://ai.meta.com/llama/responsible-use-guide/](https://ai.meta.com/llama/responsible-use-guide/) \ No newline at end of file diff --git a/_posts/2023-11-16-accelerating-generative-ai.md b/_posts/2023-11-16-accelerating-generative-ai.md deleted file mode 100644 index a5199809361c..000000000000 --- a/_posts/2023-11-16-accelerating-generative-ai.md +++ /dev/null @@ -1,333 +0,0 @@ ---- -layout: blog_detail -title: "Accelerating Generative AI with PyTorch: Segment Anything, Fast" ---- - -This post is the first part of a multi-series blog focused on how to accelerate generative AI models with pure, native PyTorch. We are excited to share a breadth of newly released PyTorch performance features alongside practical examples of how these features can be combined to see how far we can push PyTorch native performance. - -As announced during the [PyTorch Developer Conference 2023](https://www.youtube.com/watch?v=IWpM_9AsC-U), the PyTorch team [rewrote Meta’s Segment Anything (“SAM”) Model](https://github.com/facebookresearch/segment-anything) **resulting in 8x faster code** than [the original implementation](https://github.com/facebookresearch/segment-anything), with no loss of accuracy, all using native PyTorch optimizations. 
We leverage a breadth of new PyTorch features: - - - -* [Torch.compile](https://pytorch.org/tutorials/intermediate/torch_compile_tutorial.html): A compiler for PyTorch models -* [GPU quantization](https://github.com/pytorch-labs/ao/tree/main#torchao): Accelerate models with reduced precision operations -* [Scaled Dot Product Attention (SDPA)](https://pytorch.org/tutorials/intermediate/scaled_dot_product_attention_tutorial.html): Memory efficient attention implementations -* [Semi-Structured (2:4) Sparsity:](https://pytorch.org/tutorials/prototype/semi_structured_sparse.html) A GPU optimized sparse memory format -* [Nested Tensor:](https://pytorch.org/tutorials/prototype/nestedtensor.html) Batch together non-uniformly sized data into a single Tensor, such as images of different sizes. -* **Custom operators with Triton:** Write GPU operations using Triton Python DSL and easily integrate it into PyTorch's various components with custom operator registration. - -We encourage readers to copy-paste code from [our implementation of SAM on Github](https://github.com/pytorch-labs/segment-anything-fast) and [ask us questions](https://github.com/pytorch-labs/segment-anything-fast/issues) on Github. - - -![A quick glimpse of increasing throughput and decreasing memory overhead](/assets/images/accelerating-generative-ai/bar_chart_7.png){:style="width:100%;"} - - -_A quick glimpse of increasing throughput and decreasing memory overhead with our newly released, PyTorch native, features. Benchmarks run on p4d.24xlarge instance (8x A100s)._ - - -## SegmentAnything Model - -[SAM](https://github.com/facebookresearch/segment-anything) is a zero-shot vision model for generating promptable image masks. - - -![sam image masks](/assets/images/accelerating-generative-ai/intro_image.jpg){:style="width:100%;display: block;max-width:600px; margin-left:auto; margin-right:auto;"} - - -The SAM architecture [described[ in its paper](https://arxiv.org/abs/2304.02643)] includes multiple prompt and image encoders based on the Transformer architecture. Of this, we measured performance across the smallest and largest vision transformer backbones: [ViT-B](https://dl.fbaipublicfiles.com/segment_anything/sam_vit_b_01ec64.pth) and [ViT-H](https://dl.fbaipublicfiles.com/segment_anything/sam_vit_h_4b8939.pth). And for simplicity, we only show traces for the ViT-B model. - - -## Optimizations - -Below we tell the story of optimizing SAM: profiling, identifying bottlenecks, and building new features into PyTorch that solve these problems. Throughout, we showcase our new PyTorch features: **torch.compile, SDPA, Triton kernels, Nested Tensor and semi-structured sparsity.** The following sections are progressively built upon each other, ending with our SAM-fast, now [available on Github](https://github.com/pytorch-labs/segment-anything-fast). We motivate each feature using real kernel and memory traces, using fully PyTorch native tooling, and visualize these traces with [Perfetto UI](https://perfetto.dev/). - - -### Baseline - -Our SAM baseline is Facebook Research’s [unmodified model](https://github.com/facebookresearch/segment-anything), using float32 dtype and a batch size of 1. After some initial warmup, we can look at a kernel trace using the [PyTorch Profiler](https://pytorch.org/tutorials/recipes/recipes/profiler_recipe.html): - -![kernel trace](/assets/images/accelerating-generative-ai/baseline_trace.jpg){:style="width:100%;"} - - -We notice two areas ripe for optimization. 
The first is long calls to aten::index, the underlying call resulting from a Tensor index operation (e.g., []). While the actual GPU time spent on aten::index is relatively low, aten::index launches two kernels, and a blocking cudaStreamSynchronize happens in between. This means the CPU is waiting for the GPU to finish processing until it launches the second kernel. To optimize SAM, we should aim to remove blocking GPU syncs causing idle time.

The second is significant time spent on GPU in matrix multiplication (dark green on stream 7 above). This is common in Transformers. We can significantly speed up SAM if we can reduce the amount of GPU time spent on matrix multiplication.

We can measure the throughput (img/s) and memory overhead (GiB) of out-of-the-box SAM to establish a baseline:

![throughput (img/s) and memory overhead (GiB) from out of the box SAM](/assets/images/accelerating-generative-ai/bar_chart_0.png){:style="width:100%;"}


### Bfloat16 Half precision (+GPU syncs and batching)

To address the second issue of significant time spent in matrix multiplication, we can turn to [bfloat16](https://en.wikipedia.org/wiki/Bfloat16_floating-point_format). Bfloat16 is a commonly used half-precision type. Through less precision per parameter and activation, we can save significant time and memory in computation. When reducing the precision of parameters, it’s critical to validate end-to-end model accuracy.


![replacing padding dtypes with half precision, bfloat16](/assets/images/accelerating-generative-ai/bfloat16_snippet.jpg){:style="width:100%;"}

_Shown here is an example of replacing padding dtypes with half precision, bfloat16. [Code is here](https://github.com/pytorch-labs/segment-anything-fast/blame/main/segment_anything_fast/modeling/prompt_encoder.py#L86)._

Beyond simply setting `model.to(torch.bfloat16)`, we have to change a few small places that assume the default dtype.

Now, in order to remove GPU syncs, we need to audit the operations that cause them. We can find these pieces of code by searching the GPU traces for calls to `cudaStreamSynchronize`. In fact, we found two locations that we were able to rewrite to be sync-free.


![code sample 1](/assets/images/accelerating-generative-ai/code1.jpg){:style="width:100%;"}


![replacing padding dtypes with half precision, bfloat16](/assets/images/accelerating-generative-ai/bfloat16_snippet2.jpg){:style="width:100%;"}


Specifically, we see that within SAM’s image encoder, there are variables acting as coordinate scalers, q_coords and k_coords. These are both allocated and processed on the CPU. However, once these variables are used to index into rel_pos_resized, the index operation automatically moves them to the GPU. This copy causes the GPU sync we observed above. We notice a second call to index in SAM’s prompt encoder; we can use torch.where to rewrite this as shown above.

**Kernel trace**

After applying these changes, we begin to see significant gaps between individual kernel calls. This is typically observed with small batch sizes (1 here) due to the GPU overhead of launching kernels. To get a closer look at practical areas for optimization, we can start to profile SAM inference with batch size 8:


![profile SAM inference with batch size 8](/assets/images/accelerating-generative-ai/bfloat16_trace.jpg){:style="width:100%;"}

Looking at the time spent per kernel, we observe that most of SAM’s GPU time is now spent on elementwise kernels and the softmax operation.
With this we now see that matrix multiplications have become a much smaller relative overhead. - -![matrix multiplications have become a much smaller relative overhead](/assets/images/accelerating-generative-ai/bfloat16_kernels.jpg){:style="width:100%;"} - - -Taken the GPU sync and bfloat16 optimizations together, we have now pushed SAM performance by up to 3x - -![SAM performance by up to 3x](/assets/images/accelerating-generative-ai/bar_chart_1.png){:style="width:100%;"} - - - -### Torch.compile (+graph breaks and CUDA graphs) - -When observing a large number of small operations, such as the elementwise kernels profiled above, turning to a compiler to fuse operations can have strong benefits. PyTorch’s recently released **torch.compile** does a great job optimizing by: - - - -1. Fusing together sequences of operations such as nn.LayerNorm or nn.GELU into a single GPU kernel that is called and -2. Epilogues: fusing operations that immediately follow matrix multiplication kernels to reduce the number of GPU kernel calls. - -Through these optimizations, we reduce the number of GPU global memory roundtrips, thus speeding up inference. We can now try torch.compile on SAM’s [image encoder](https://github.com/pytorch-labs/segment-anything-fast/blob/3bd74614fe7285de4de3d763d8ec2e951c4c589c/experiments/eval_combo.py#L196-L201). To maximize performance we use a few advanced compile techniques such as: - - - -* using torch.compile’s max-autotune mode enables [CUDA graphs](https://pytorch.org/blog/accelerating-pytorch-with-cuda-graphs/) and shape-specific kernels with custom epilogues -* By setting TORCH_LOGS="graph_breaks,recompiles" we can manually verify that we are not running into [graph breaks](https://pytorch.org/docs/main/torch.compiler_faq.html#graph-breaks) or recompiles. -* Padding the batch of images input to the encoder with zeros ensures compile accepts static shapes thus being able to always use shape-specific optimized kernels with custom epilogues without recompilations. - -``` -predictor.model.image_encoder = \ - torch.compile(predictor.model.image_encoder, mode=use_compile) -``` - - - -**Kernel trace** - - - -![Kernel trace](/assets/images/accelerating-generative-ai/compile_trace.jpg){:style="width:100%;"} - - -torch.compile is working beautifully. We launch a single CUDA graph, which makes up a significant portion of GPU time within the timed region. Let's run our profile again and look at the percentage of GPU time spent in specific kernels: - - - -![the percentage of GPU time spent in specific kernels](/assets/images/accelerating-generative-ai/compile_kernels.jpg){:style="width:100%;"} - - -We now see softmax makes up a significant portion of the time followed by various GEMM variants. In summary we observe the following measurements for batch size 8 and above changes. - - - -![measurements for batch size 8 and above](/assets/images/accelerating-generative-ai/bar_chart_2.png){:style="width:100%;"} - - - -### SDPA: scaled_dot_product_attention - -Next up, we can tackle one of the most common areas for transformer performance overhead: the attention mechanism. Naive attention implementations scale quadratically in time and memory with sequence length. 
PyTorch’s [scaled_dot_product_attention](https://pytorch.org/docs/master/generated/torch.nn.functional.scaled_dot_product_attention.html?highlight=scaled_dot_product_attention#torch.nn.functional.scaled_dot_product_attention) operation built upon the principles of [Flash Attention](https://arxiv.org/pdf/2205.14135.pdf), [FlashAttentionV2](https://github.com/Dao-AILab/flash-attention) and [xFormer's memory efficient attention](https://github.com/facebookresearch/xformers) can significantly speed up GPU attention. Combined with torch.compile, this operation allows us to express and fuse a common pattern within variants of MultiheadAttention. After [a small set of changes](https://github.com/facebookresearch/segment-anything/compare/50cb459d080bcd783a4b481d3bde4150d35ac497...7dc75fdf283693f73606f2fe7fdcb693afcb16b9) we can adapt the model to use scaled_dot_product_attention. - - -![PyTorch native attention implementation](/assets/images/accelerating-generative-ai/sdpa_snippet.jpg){:style="width:100%;display: block;max-width:600px; margin-left:auto; margin-right:auto;"} - - -_PyTorch native attention implementation, [see code here](https://github.com/pytorch-labs/segment-anything-fast/blob/main/segment_anything_fast/modeling/image_encoder.py#L236)._ - -**Kernel trace** - -We can now see that in particular the memory efficient attention kernel is taking up a large amount of computational time on the GPU: - - -![memory efficient attention kernel is taking up a large amount of computational time on the GPU](/assets/images/accelerating-generative-ai/sdpa_kernels.jpg){:style="width:100%;display: block;max-width:600px; margin-left:auto; margin-right:auto;"} - - -Using PyTorch’s native scaled_dot_product_attention, we can significantly increase the batch size. We now observe the following measurements for batch size 32 and above changes. - - -![batch size 32 and above](/assets/images/accelerating-generative-ai/bar_chart_3.png){:style="width:100%;"} - - -### Triton: Custom SDPA for fused relative positional encoding - -Transitioning away from inference throughput for a moment, we started profiling overall SAM memory. Within the image encoder, we saw significant spikes in memory allocation: - - - -![spikes in memory allocation](/assets/images/accelerating-generative-ai/triton_trace.png){:style="width:100%;"} - - -Zooming in, we see this allocation happens within add_decomposed_rel_pos, [on the following line:](https://github.com/pytorch-labs/segment-anything-fast/blob/main/segment_anything_fast/modeling/image_encoder.py#L373) - - - -![we see this allocation happens within add_decomposed_rel_pos](/assets/images/accelerating-generative-ai/triton_snippet.jpg){:style="width:100%;display: block;max-width:500px; margin-left:auto; margin-right:auto;"} - - - -The attn variable here is the addition of two smaller tensors: rel_h of shape (B, q_h, q_w, k_h, 1) and rel_w of shape (B, q_h, q_w, 1, k_w). - -It's not surprising that the memory efficient attention kernel (used via SDPA) is taking a long time with an attention bias size over 3.0GiB. If instead of allocating this large attn tensor, we thread into SDPA the two smaller rel_h and rel_w tensors, and only construct attn as needed, we'd anticipate significant performance gain. - -Unfortunately this is not a trivial modification; SDPA kernels are highly optimized and written in CUDA. 
We can turn to Triton, with their easy to understand and use [tutorial on a FlashAttention implementation](https://triton-lang.org/main/getting-started/tutorials/06-fused-attention.html). After some significant digging and in close collaboration with xFormer's Daniel Haziza we found one case of input shapes where it is relatively straightforward to implement a fused version of the kernel. The [details have been added to the repository](https://github.com/pytorch-labs/segment-anything-fast/blob/main/segment_anything_fast/flash_4.py). Surprisingly this can be done in under 350 lines of code for the inference case. - -This is a great example of extending PyTorch with a new kernel, straightforwardly built with Triton code. - -**Kernel trace** - - -![kernel trace](/assets/images/accelerating-generative-ai/triton_kernels.jpg){:style="width:100%;display: block;max-width:600px; margin-left:auto; margin-right:auto;"} - - -With our custom positional Triton kernel we observe the following measurements for batch size 32. - - - -![we observe the following measurements for batch size 32](/assets/images/accelerating-generative-ai/bar_chart_4.png){:style="width:100%;"} - - - - -### NT: NestedTensor and batching predict_torch - -We have spent a lot of time on the image encoder. This makes sense, since it takes up the most amount of computational time. At this point however it is fairly well optimized and the operator that takes the most time would require significant additional investment to be improved. - -We discovered an interesting observation with the [mask prediction pipeline](https://github.com/pytorch-labs/segment-anything-fast/blob/7cd6ba3cea451602acb7d36d176da06c70ac68f1/experiments/eval_combo.py#L137-L157): for each image we have there is an associated size, coords, and fg_labels Tensor. Each of these tensors are of different batch sizes. Each image itself is also of a different size. This representation of data looks like [Jagged Data](https://en.wikipedia.org/wiki/Jagged_array). With PyTorch’s recently released [NestedTensor](https://pytorch.org/tutorials/prototype/nestedtensor.html), we can modify our data pipeline batch coords and fg_labels Tensors into a single NestedTensor. This can have significant performance benefits for the prompt encoder and mask decoder that follow the image encoder. Invoking: - - -``` -torch.nested.nested_tensor(data, dtype=dtype, layout=torch.jagged) -``` - - -**Kernel trace** - - - -![Kernel trace](/assets/images/accelerating-generative-ai/trace1.jpg){:style="width:100%;"} - - - -![we can launch kernels much faster from the CPU than the GPU can process](/assets/images/accelerating-generative-ai/nt_kernel.jpg){:style="width:100%;display: block;max-width:600px; margin-left:auto; margin-right:auto;"} - - -We can see now that we can launch kernels much faster from the CPU than the GPU can process and that it spends a long time waiting at the end of our timed region for the GPU to finish (cudaDeviceSynchronize). We also don't see any more idle time (white space) between kernels on the GPU. - -With Nested Tensor, we observe the following measurements for batch size 32 and above changes. - - -![batch size 32 and above changes](/assets/images/accelerating-generative-ai/bar_chart_5.png){:style="width:100%;"} - -### int8: quantization and approximating matmul - -We notice in the above trace, that significant time is now spent in GEMM kernels. We’ve optimized enough that we now see matrix multiplication account for more time in inference than scaled dot product attention. 
- -Building on earlier learnings going from fp32 to bfloat16, let’s go a step further, emulating even lower precision with int8 quantization. Looking at quantization methods, we focus on [Dynamic quantization](https://pytorch.org/tutorials/recipes/quantization.html) wherein our model observes the range of possible inputs and weights of a layer, and subdivides the expressible int8 range to uniformly “spread out” observed values. Ultimately each float input will be mapped to a single integer in the range [-128, 127]. For more information see PyTorch’s [tutorial on quantization](https://pytorch.org/tutorials/recipes/quantization.html) - -Reducing precision can immediately lead to peak memory savings, but to realize inference speedups, we have to make full use of int8 through SAM’s operations. This requires building an efficient int8@int8 matrix multiplication kernel, as well as casting logic to translate from high to low precision (quantization) as well as reversing back from low to high (dequantization). Utilizing the power of torch.compile, we can compile and fuse together these quantization and dequantization routines into efficient single kernels and epilogues of our matrix multiplication. The resulting implementation is [fairly short and less than 250 lines of code](https://github.com/pytorch-labs/segment-anything-fast/blob/21b0208ae46eefc5659f7f200a2bf447add8765b/segment_anything_fast/dynamic_quant.py). For more information on the APIs and usage, see [pytorch-labs/ao](https://github.com/pytorch-labs/ao/tree/main#torchao). - -While it’s common to see some accuracy regression when quantizing models at inference time, SAM has been particularly robust to lower precision inference with minimal loss of accuracy. With quantization added, we now observe the following measurements for **batch size 32** and above changes. - - -![batch size 32 and above changes](/assets/images/accelerating-generative-ai/bar_chart_6.png){:style="width:100%;"} - -### sparse: Semi-structured (2:4) sparsity - -Matrix multiplications are still our bottleneck. We can turn to the model acceleration playbook with another classic method to approximate matrix multiplication: sparsification. By sparsifying our matrices (i.e., zeroing out values), we could theoretically use fewer bits to store weight and activation tensors. The process by which we decide which weights in the tensor to set to zero is called pruning. The idea behind pruning is that small weights in a weight tensor contribute little to the net output of a layer, typically the product of weights with activations. Pruning away small weights can potentially reduce model size without significant loss of accuracy. - -Methods for pruning are varied, from completely unstructured, wherein weights are greedily pruned to highly structured, wherein large sub-components of a tensor are pruned a time. Choice of method is not trivial. While unstructured pruning may have the theoretically least impact on accuracy, GPUs are also highly efficient with multiplying large, dense matrices and may suffer significant performance degradation in sparse regimes. One recent pruning method supported in PyTorch seeks to strike a balance, called semi-structured (or 2:4) sparsity. This sparse storage reduces the original tensor by a significant 50%, while simultaneously resulting in a dense tensor output that can leverage highly performant, 2:4 GPU kernels. See the following picture for an illustration. 
- - -![dense tensor output that can leverage highly performant, 2:4 GPU kernels](/assets/images/accelerating-generative-ai/sparse_image.png){:style="width:100%;display: block;max-width:600px; margin-left:auto; margin-right:auto;"} - - -From [developer.nvidia.com/blog/exploiting-ampere-structured-sparsity-with-cusparselt](https://developer.nvidia.com/blog/exploiting-ampere-structured-sparsity-with-cusparselt) - -In order to use this sparse storage format and the associated fast kernels we need to prune our weights such that they adhere to the constraints for the format. We pick the two smallest weights to prune in a 1 by 4 region, measuring the performance vs accuracy tradeoff. It is easy to change a weight from its default PyTorch (“strided”) layout to this new, semi-structured sparse layout. To implement `apply_sparse(model)` we only require 32 lines of Python code: - - -``` -import torch -from torch.sparse import to_sparse_semi_structured, SparseSemiStructuredTensor - -# Sparsity helper functions -def apply_fake_sparsity(model): - """ - This function simulates 2:4 sparsity on all linear layers in a model. - It uses the torch.ao.pruning flow. - """ - # torch.ao.pruning flow - from torch.ao.pruning import WeightNormSparsifier - sparse_config = [] - for name, mod in model.named_modules(): - if isinstance(mod, torch.nn.Linear): - sparse_config.append({"tensor_fqn": f"{name}.weight"}) - - sparsifier = WeightNormSparsifier(sparsity_level=1.0, - sparse_block_shape=(1,4), - zeros_per_block=2) - sparsifier.prepare(model, sparse_config) - sparsifier.step() - - sparsifier.step() - sparsifier.squash_mask() - - -def apply_sparse(model): - apply_fake_sparsity(model) - for name, mod in model.named_modules(): - if isinstance(mod, torch.nn.Linear): - mod.weight = torch.nn.Parameter(to_sparse_semi_structured(mod.weight)) -``` - - -With 2:4 sparsity, we observe peak performance on SAM with vit_b and batch size 32: - - -![With 2:4 sparsity, we observe peak performance on SAM with vit_b and batch size 32](/assets/images/accelerating-generative-ai/bar_chart_7.png){:style="width:100%;"} - - - -### Conclusion - -Wrapping up, we are excited to have[ announced](https://www.youtube.com/watch?v=IWpM_9AsC-U) our fastest implementation of [Segment Anything](https://github.com/facebookresearch/segment-anything) to date. 
We rewrote Meta’s original SAM in pure PyTorch with no loss of accuracy using a breadth of newly released features: - - - -* **Torch.compile** PyTorch’s native JIT compiler, providing fast, automated fusion of PyTorch operations [[tutorial](https://pytorch.org/tutorials/intermediate/torch_compile_tutorial.html)] -* **GPU quantization** accelerate models with reduced precision operations [[api](https://github.com/pytorch-labs/ao/tree/main#torchao)] -* **Scaled Dot Product Attention (SDPA)** a new, memory efficient implementation of Attention [[tutorial](https://pytorch.org/tutorials/intermediate/scaled_dot_product_attention_tutorial.html)] -* **Semi-Structured (2:4) Sparsity** accelerate models with fewer bits to store weights and activations [[tutorial](https://pytorch.org/tutorials/prototype/semi_structured_sparse.html)] -* **Nested Tensor** Highly optimized, ragged array handling for non-uniform batch and image sizes [[tutorial](https://pytorch.org/tutorials/prototype/nestedtensor.html)] -* **Triton kernels.** Custom GPU operations, easily built and optimized via Triton - -For more details on how to reproduce the data presented in this blog post, check out [the experiments folder of segment-anything-fast](https://github.com/pytorch-labs/segment-anything-fast/tree/main/experiments). Please don't hesitate to contact us or [open an issue](https://github.com/pytorch-labs/segment-anything-fast/issues/new) if you run into any technical issues. - -In our next post, we are excited to share similar performance gains with our PyTorch natively authored LLM! - - -### Acknowledgements - -We would like to thank Meta’s [xFormers](https://github.com/facebookresearch/xformers) team including Daniel Haziza and Francisco Massa for authoring SDPA kernels and helping us design our custom one-off Triton kernel. \ No newline at end of file diff --git a/_posts/2023-11-16-pytorch-docathon-h2-2023-wrap.md b/_posts/2023-11-16-pytorch-docathon-h2-2023-wrap.md deleted file mode 100644 index 3f21769bb058..000000000000 --- a/_posts/2023-11-16-pytorch-docathon-h2-2023-wrap.md +++ /dev/null @@ -1,24 +0,0 @@ ---- -layout: blog_detail -title: "🎉 PyTorch Docathon H2 2023 Wrap-up 🎉" ---- -We are thrilled to announce the successful completion of the Fall 2023 PyTorch Docathon! The event was a resounding success, and we want to extend our heartfelt gratitude to all the participants who made it possible. Dedication, expertise, and tireless efforts of our open-source contributors have once again helped us to improve PyTorch documentation. - -This Docathon ran from Nov 1 through Nov 15 with more than 170 registrants. The energy and enthusiasm were palpable, and entrants were judged on the difficulty of submissions that resulted in over TBA merged pull requests. We have fixed the PyTorch docstrings and made them compatible with the PEP 257 Python Docstring Conventions guidelines. We also have fixed multiple bugs in the pytorch/tutorials repo. - -We want to give a special shout-out to our top contributors, who went above and beyond during this event. Your dedication and expertise have been invaluable in enhancing the PyTorch documentation and empowering developers worldwide. 
- -Meet the top contributors: - -- First place: [ahoblitz](https://github.com/ahoblitz), [spzala](https://github.com/spzala), [alperenunlu](https://github.com/alperenunlu), [ChanBong](https://github.com/ChanBong) -- Second place: [nvs-abhilash](https://github.com/nvs-abhilash), [bjhargrave](https://github.com/bjhargrave), [zabboud](https://github.com/zabboud) -- Third place: [guptaaryan16](https://github.com/guptaaryan16), [min-jean-cho](https://github.com/min-jean-cho), [markstur](https://github.com/markstur) -- Honorable mentions: [RustyGrackle](https://github.com/RustyGrackle), [Viditagarwal7479](https://github.com/Viditagarwal7479), [Skylion007](https://github.com/Skylion007) - -You can see the full docathon leaderboard published [here](https://github.com/pytorch/tutorials/blob/main/docathon-leaderboard.md). - -As we bring this Docathon to a close, we encourage each and every one of you to stay inspired and keep contributing to PyTorch documentation and code, and pushing the boundaries of what's possible with PyTorch. Your collective efforts are shaping the landscape of deep learning and fostering innovation in the PyTorch community. - -Thank you again for your participation and support. We look forward to seeing what you will achieve next! - -Team PyTorch \ No newline at end of file diff --git a/_posts/2023-11-29-new-features-for-ai.md b/_posts/2023-11-29-new-features-for-ai.md deleted file mode 100644 index c285d708378f..000000000000 --- a/_posts/2023-11-29-new-features-for-ai.md +++ /dev/null @@ -1,649 +0,0 @@ ---- -layout: blog_detail -title: "PyTorch 2.1 Contains New Performance Features for AI Developers" -author: Intel ---- - -We are excited to see the release of PyTorch 2.1. In this blog, we discuss the five features for which Intel made significant contributions to PyTorch 2.1: - -1. TorchInductor-CPU optimizations including Bfloat16 inference path for torch.compile -2. CPU dynamic shape inference path for torch.compile -3. C++ wrapper (prototype) -4. Flash-attention-based scaled dot product algorithm for CPU -5. PyTorch 2 export post-training auantization with an x86 back end through an inductor - -At Intel, we are delighted to be part of the PyTorch community and appreciate the collaboration with and feedback from our colleagues at Meta* as we co-developed these features. - -Let’s get started. - -## TorchInductor-CPU Optimizations - -This feature optimizes bfloat16 inference performance for TorchInductor. The 3rd and 4th generation Intel® Xeon® Scalable processors have built-in hardware accelerators for speeding up dot-product computation with the bfloat16 data type. Figure 1 shows a code snippet of how to specify the BF16 inference path. - -``` -user_model = ... - -user_model.eval() -with torch.no_grad(), torch.autocast("cpu"): - compiled_model = torch.compile(user_model) - y = compiled_model(x) -``` - -Figure 1. Code snippet showing the use of BF16 inference with TorchInductor \ - - - -We measured the performance on three TorchInductor benchmark suites—TorchBench, Hugging Face*, and TIMM—and the results are as follows in Table 1. Here we see that performance in graph mode (TorchInductor) outperforms eager mode by factors ranging from 1.25x to 2.35x.* - -Table 1. Bfloat16 performance geometric mean speedup in graph mode, compared with eager mode - - - - - - - - - - - - - - - - - -
**Bfloat16 Geometric Mean Speedup (Single-Socket Multithreads)**

| Compiler | torchbench | huggingface | timm_models |
|----------|------------|-------------|-------------|
| inductor | 1.81x | 1.25x | 2.35x |

**Bfloat16 Geometric Mean Speedup (Single-Core Single-Thread)**

| Compiler | torchbench | huggingface | timm_models |
|----------|------------|-------------|-------------|
| inductor | 1.74x | 1.28x | 1.29x |
        - - -Developers can fully deploy their models on 4th generation Intel Xeon processors to take advantage of the Intel® Advanced Matrix Extensions (Intel® AMX) feature to get peak performance for `torch.compile`. Intel AMX has two primary components: tiles and tiled matrix multiplication (TMUL). The tiles store large amounts of data in eight two-dimensional registers, each one kilobyte in size. TMUL is an accelerator engine attached to the tiles that contain instructions to compute larger matrices in a single operation. - - -## CPU Dynamic Shapes Inference Path for torch.compile - - -Dynamic shapes is one of the key features in PyTorch 2.0. PyTorch 2.0 assumes everything is static by default. If we recompile because a size changed, we will instead attempt to recompile that size as being dynamic (sizes that have changed are likely to change in the future). Dynamic shapes support is required for popular models like large language models (LLM). Dynamic shapes that provide support for a broad scope of models can help users get more benefit from torch.compile. For dynamic shapes, we provide the post-op fusion for conv/gemm operators and vectorization code-gen for non-conv/gemm operators. - -Dynamic shapes is supported by both the inductor Triton back end for CUDA* and the C++ back end for CPU. The scope covers improvements for both functionality (as measured by model passing rate) and performance (as measured by inference latency/throughput). Figure 2 shows a code snippet for the use of dynamic shape inference with TorchInductor. - - -``` -user_model = ... - -# Training example -compiled_model = torch.compile(user_model) -y = compiled_model(x_size1) -# Here trigger the recompile because the input size changed -y = compiled_model(x_size2) - - -# Inference example -user_model.eval() -compiled_model = torch.compile(user_model) -with torch.no_grad(): - y = compiled_model(x_size1) - # Here trigger the recompile because the input size changed - y = compiled_model(x_size2) -``` - -Figure 2. Code snippet showing the use of dynamic shape inference with TorchInductor - -We again measured the performance on the three TorchInductor benchmark suites—TorchBench, Hugging Face, and TIMM—and the results are in Table 2. Here we see that performance in graph mode outperforms eager mode by factors ranging from 1.15x to 1.79x. - -Table 2. Dynamic shape geometric mean speedup compared with Eager mode - - - - - - - - - - - - - - - - - -
**Dynamic Shape Geometric Mean Speedup (Single-Socket Multithreads)**

| Compiler | torchbench | huggingface | timm_models |
|----------|------------|-------------|-------------|
| inductor | 1.35x | 1.15x | 1.79x |

**Dynamic Shape Geometric Mean Speedup (Single-Core Single-Thread)**

| Compiler | torchbench | huggingface | timm_models |
|----------|------------|-------------|-------------|
| inductor | 1.48x | 1.15x | 1.48x |
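For workloads where the input sizes are known to vary from the start, dynamism can also be requested up front instead of being discovered through a recompile. The following is a minimal sketch (reusing the `user_model` placeholder from Figure 2); the automatic behavior shown in Figure 2 is usually sufficient, and the `dynamic=True` flag is simply an explicit opt-in:

```
user_model.eval()

# Ask the compiler to treat input shapes as dynamic from the first compilation,
# rather than waiting for a size change to trigger a recompile.
compiled_model = torch.compile(user_model, dynamic=True)

with torch.no_grad():
    y = compiled_model(x_size1)
    # A different input size alone should not force a recompilation here.
    y = compiled_model(x_size2)
```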
        - - -## C++ Wrapper (Prototype) - -The feature generates C++ code instead of Python* code to invoke the generated kernels and external kernels in TorchInductor to reduce Python overhead. It is also an intermediate step to support deployment in environments without Python. - -To enable this feature, use the following configuration: - - -``` -import torch -import torch._inductor.config as config -config.cpp_wrapper = True -``` - -For light workloads where the overhead of the Python wrapper is more dominant, C++ wrapper demonstrates a higher performance boost ratio. We grouped the models in TorchBench, Hugging Face, and TIMM per the average inference time of one iteration and categorized them into small, medium, and large categories. Table 3 shows the geometric mean speedups achieved by the C++ wrapper in comparison to the default Python wrapper. - -Table 3. C++ wrapper geometric mean speedup compared with Eager mode - - - - - - - - - - - - - - - - - -
**FP32 Static Shape Mode Geometric Mean Speedup (Single-Socket Multithreads)**

| Compiler | Small (t <= 0.04s) | Medium (0.04s < t <= 1.5s) | Large (t > 1.5s) |
|----------|--------------------|----------------------------|------------------|
| inductor | 1.06x | 1.01x | 1.00x |

**FP32 Static Shape Mode Geometric Mean Speedup (Single-Core Single-Thread)**

| Compiler | Small (t <= 0.04s) | Medium (0.04s < t <= 1.5s) | Large (t > 1.5s) |
|----------|--------------------|----------------------------|------------------|
| inductor | 1.13x | 1.02x | 1.01x |

**FP32 Dynamic Shape Mode Geometric Mean Speedup (Single-Socket Multithreads)**

| Compiler | Small (t <= 0.04s) | Medium (0.04s < t <= 1.5s) | Large (t > 1.5s) |
|----------|--------------------|----------------------------|------------------|
| inductor | 1.05x | 1.01x | 1.00x |

**FP32 Dynamic Shape Mode Geometric Mean Speedup (Single-Core Single-Thread)**

| Compiler | Small (t <= 0.04s) | Medium (0.04s < t <= 1.5s) | Large (t > 1.5s) |
|----------|--------------------|----------------------------|------------------|
| inductor | 1.14x | 1.02x | 1.01x |

**BF16 Static Shape Mode Geometric Mean Speedup (Single-Socket Multithreads)**

| Compiler | Small (t <= 0.04s) | Medium (0.04s < t <= 1.5s) | Large (t > 1.5s) |
|----------|--------------------|----------------------------|------------------|
| inductor | 1.09x | 1.03x | 1.04x |

**BF16 Static Shape Mode Geometric Mean Speedup (Single-Core Single-Thread)**

| Compiler | Small (t <= 0.04s) | Medium (0.04s < t <= 1.5s) | Large (t > 1.5s) |
|----------|--------------------|----------------------------|------------------|
| inductor | 1.17x | 1.04x | 1.03x |
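To make the configuration concrete, the following is a minimal sketch of combining the C++ wrapper setting with `torch.compile`; the small stand-in model is used here only for illustration, and any `nn.Module` works the same way:

```
import torch
import torch._inductor.config as config

# Generate C++ (rather than Python) wrapper code for the compiled model.
config.cpp_wrapper = True

# Small stand-in model, for illustration only.
user_model = torch.nn.Sequential(torch.nn.Linear(64, 64), torch.nn.GELU())
user_model.eval()

compiled_model = torch.compile(user_model)
x = torch.randn(8, 64)
with torch.no_grad():
    y = compiled_model(x)
```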
        - - -## Flash-Attention-Based Scaled Dot Product Algorithm for CPU - -Scaled dot product attention (SDPA) is one of the flagship features of PyTorch 2.0 that helps speed up transformer models. It is accelerated with optimal CUDA kernels while still lacking optimized CPU kernels. This flash-attention implementation targets both training and inference, with both FP32 and Bfloat16 data types supported. There is no front-end use change for users to leverage this SDPA optimization. When calling SDPA, a specific implementation will be chosen automatically, including this new implementation. - - -We have measured the SDPA-related models in Hugging Face, and they are proven effective when compared to the unfused SDPA. Shown in Table 4 are the geometric mean speedups for SDPA optimization. \ - -Table 4. SDPA optimization performance geometric mean speedup - - - - - - - - - - - - - - - -
**SDPA Geometric Mean Speedup (Single-Socket Multithreads)**

| Compiler | Geometric Speedup FP32 | Geometric Speedup BF16 |
|----------|------------------------|------------------------|
| inductor | 1.15x, 20/20 | 1.07x, 20/20 |

**SDPA Geometric Mean Speedup (Single-Core Single-Thread)**

| Compiler | Geometric Speedup FP32 | Geometric Speedup BF16 |
|----------|------------------------|------------------------|
| inductor | 1.02x, 20/20 | 1.04x, 20/20 |
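As noted above, no front-end change is needed to pick up the optimized CPU kernels. A minimal sketch of calling SDPA directly follows; the shapes are illustrative only:

```
import torch
import torch.nn.functional as F

# Illustrative shapes: (batch, heads, sequence length, head dimension)
query = torch.randn(1, 16, 512, 64)
key = torch.randn(1, 16, 512, 64)
value = torch.randn(1, 16, 512, 64)

# An appropriate implementation (including the flash-attention-based CPU path)
# is chosen automatically based on the device, dtype, and inputs.
with torch.no_grad():
    out = F.scaled_dot_product_attention(query, key, value)
```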
        - - - -## PyTorch 2 Export Post-Training Quantization with x86 Back End through Inductor - - -PyTorch provides a new quantization flow in the PyTorch 2.0 export. This feature uses TorchInductor with an x86 CPU device as the back end for post-training static quantization with this new quantization flow. An example code snippet is shown in Figure 3. - - -``` -import torch -import torch._dynamo as torchdynamo -from torch.ao.quantization.quantize_pt2e import convert_pt2e, prepare_pt2e -import torch.ao.quantization.quantizer.x86_inductor_quantizer as xiq - -model = ... - -model.eval() -with torch.no_grad(): - # Step 1: Trace the model into an FX graph of flattened ATen operators - exported_graph_module, guards = torchdynamo.export( - model, - *copy.deepcopy(example_inputs), - aten_graph=True, - ) - - # Step 2: Insert observers or fake quantize modules - quantizer = xiq.X86InductorQuantizer() - operator_config = xiq.get_default_x86_inductor_quantization_config() - quantizer.set_global(operator_config) - prepared_graph_module = prepare_pt2e(exported_graph_module, quantizer) - - # Doing calibration here. - - # Step 3: Quantize the model - convert_graph_module = convert_pt2e(prepared_graph_module) - - # Step 4: Lower Quantized Model into the backend - compile_model = torch.compile(convert_graph_module) -``` - -Figure 3. Code snippet showing the use of Inductor as back end for PyTorch 2 export post-training quantization - -All convolutional neural networks (CNN) models from the TorchBench test suite have been measured and proven effective when compared with the Inductor FP32 inference path. Performance metrics are shown in Table 5. - - - - - - - - - - - - - -
| Compiler | Geometric Speedup | Geometric Related Accuracy Loss |
|----------|-------------------|---------------------------------|
| inductor | 3.25x, 12/12 | 0.44%, 12/12 |
        - - -## Next Steps - - -### Get the Software - - -Try out [PyTorch 2.1](https://github.com/pytorch/pytorch/releases/tag/v2.1.0) and realize the performance benefits for yourself from these features contributed by Intel. - -We encourage you to check out Intel’s other [AI Tools](https://www.intel.com/content/www/us/en/developer/topic-technology/artificial-intelligence/tools.html) and [framework](https://www.intel.com/content/www/us/en/developer/tools/frameworks/overview.html) optimizations and learn about the open, standards-based [oneAPI](https://www.intel.com/content/www/us/en/developer/tools/oneapi/overview.html) multiarchitecture, multivendor programming model that forms the foundation of Intel’s AI software portfolio. - -For more details about the 4th generation Intel Xeon Scalable processor, visit the [AI platform](https://www.intel.com/content/www/us/en/developer/topic-technology/artificial-intelligence/platform.html) where you can learn how Intel is empowering developers to run high-performance, efficient end-to-end AI pipelines. - - -### PyTorch Resources - -* [PyTorch Get Started](http://pytorch.org/get-started/pytorch-2.0/) -* [Dev Discussions](http://dev-discuss.pytorch.org/t/pytorch-release-2-0-execution-update/1077) -* [Documentation](http://pytorch.org/docs/2.0/) - -### Product and Performance Information - -1 Amazon EC2* m7i.16xlarge: 1-node, Intel Xeon Platinum 8488C processor with 256 GB memory (1 x 256 GB DDR5 4800 MT/s), microcode 0x2b000461, hyperthreading on, turbo on, Ubuntu* 22.04.3 LTS, kernel 6.2.0-1011-aws, GCC* 11.3.0, Amazon Elastic Block Store 200 GB, BIOS Amazon EC2 1.0 10/16/2017; Software: [PyTorch 2.1.0_rc4](https://github.com/pytorch/pytorch/tree/release/2.1), [Intel® oneAPI Deep Neural Network Library (oneDNN) version 3.1.1](https://github.com/oneapi-src/oneDNN/tree/v3.1.1), [TorchBench](https://github.com/pytorch/benchmark/commit/ffbbebb9), [TorchVision](https://github.com/pytorch/vision/commit/8636bf3), [TorchText](https://github.com/pytorch/text/commit/142d029), [TorchAudio](https://github.com/pytorch/audio/commit/475b6ae), [TorchData](https://github.com/pytorch/data/commit/eb9bf61), [TorchDynamo Benchmarks](https://github.com/pytorch/pytorch/tree/release/2.1/benchmarks/dynamo), tested by Intel on 9/12/2023. - - -2 Amazon EC2 c6i.16xlarge: 1-node, Intel Xeon Platinum 8375C processor with 128 GB memory (1 x 128 GB DDR4 3200 MT/s), microcode 0xd0003a5, hyperthreading on, turbo on, Ubuntu 22.04.2 LTS, kernel 6.2.0-1011-aws, gcc 11.3.0, Amazon Elastic Block Store 200 GB, BIOS Amazon EC2 1.010/16/2017; Software: [PyTorch 2.1.0_rc4](https://github.com/pytorch/pytorch/tree/release/2.1), [oneDNN version 3.1.1](https://github.com/oneapi-src/oneDNN/tree/v3.1.1), [TorchBench](https://github.com/pytorch/benchmark/commit/ffbbebb9), [TorchVision](https://github.com/pytorch/vision/commit/8636bf3), [TorchText](https://github.com/pytorch/text/commit/142d029), [TorchAudio](https://github.com/pytorch/audio/commit/475b6ae), [TorchData](https://github.com/pytorch/data/commit/eb9bf61), [TorchDynamo Benchmarks](https://github.com/pytorch/pytorch/tree/release/2.1/benchmarks/dynamo), [TorchBench cpu userbenchmark](https://github.com/pytorch/benchmark/tree/chuanqiw/inductor_quant/userbenchmark/cpu), tested by Intel on 9/12/2023. 
diff --git a/_posts/2023-11-30-accelerating-generative-ai-2.md b/_posts/2023-11-30-accelerating-generative-ai-2.md deleted file mode 100644 index d899c753c513..000000000000 --- a/_posts/2023-11-30-accelerating-generative-ai-2.md +++ /dev/null @@ -1,325 +0,0 @@ ---- -layout: blog_detail -title: "Accelerating Generative AI with PyTorch II: GPT, Fast" -featured-img: 'assets/images/accelerating-generative-ai-2/social-share.jpg' ---- - -This post is the second part of a multi-series blog focused on how to accelerate generative AI models with pure, native PyTorch. We are excited to share a breadth of newly released PyTorch performance features alongside practical examples to see how far we can push PyTorch native performance. In part one, we showed how to accelerate [Segment Anything over 8x](https://pytorch.org/blog/accelerating-generative-ai/) using only pure, native PyTorch. In this blog we’ll focus on LLM optimization. - -Over the past year, generative AI use cases have exploded in popularity. Text generation has been one particularly popular area, with lots of innovation among open-source projects such as [llama.cpp](https://github.com/ggerganov/llama.cpp), [vLLM](https://github.com/vllm-project/vllm), and [MLC-LLM](https://github.com/mlc-ai/mlc-llm). - -While these projects are performant, they often come with tradeoffs in ease of use, such as requiring model conversion to specific formats or building and shipping new dependencies. This begs the question: **how fast can we run transformer inference with only pure, native PyTorch?** - -As announced during our recent [PyTorch Developer Conference](https://www.youtube.com/watch?v=IWpM_9AsC-U), the PyTorch team wrote a from-scratch LLM **almost 10x faster than baseline,** with no loss of accuracy, all using native PyTorch optimizations. We leverage a breadth of optimizations including: - - -* **[Torch.compile](https://pytorch.org/tutorials/intermediate/torch_compile_tutorial.html)**: A compiler for PyTorch models -* **[GPU quantization](https://github.com/pytorch-labs/ao/tree/main#torchao)**: Accelerate models with reduced precision operations -* **[Speculative Decoding](https://github.com/pytorch-labs/gpt-fast/blob/main/generate.py#L76)**: Accelerate LLMs using a small “draft” model to predict large “target” model’s output -* **[Tensor Parallelism](https://github.com/pytorch-labs/gpt-fast/blob/main/tp.py)**: Accelerate models by running them across multiple devices. - -And, even better, we can do it in **less than 1000 lines of native PyTorch code**. - -If this excites you enough to jump straight into the code, check it out at [https://github.com/pytorch-labs/gpt-fast](https://github.com/pytorch-labs/gpt-fast)! - - -![Screen recording](/assets/images/accelerating-generative-ai-2/screen-recording.gif){:style="width:100%;"} - -_Note: We will be focusing on latency (i.e. batch size=1) for all of these benchmarks. Unless otherwise specified, all benchmarks are run on an A100-80GB, power limited to 330W._ - - -## Starting Point (25.5 tok/s) - -Let’s start off with an extremely basic and simple implementation. - -![simple implementation](/assets/images/accelerating-generative-ai-2/image23.png){:style="width:100%;display: block;max-width:600px; margin-left:auto; margin-right:auto;"} - -Sadly, this does not perform very well. But why? Looking at a trace reveals the answer - it’s heavily **CPU overhead bound**! What this means is that our CPU is not able to tell the GPU what to do fast enough for the GPU to be fully utilized. 
- - -![trace](/assets/images/accelerating-generative-ai-2/image14.png){:style="width:100%;"} - - -Imagine the GPU as this super massive factory with a ridiculous amount of compute available. Then, imagine the CPU as some messenger shuttling instructions back and forth to the GPU. Remember, in large scale deep learning systems, the GPU is responsible for doing 100% of the work! In such systems, the only role of the CPU is to tell the GPU what work it should be doing. - - -![factory](/assets/images/accelerating-generative-ai-2/image16.png){:style="width:100%;display: block;max-width:500px; margin-left:auto; margin-right:auto;"} - - -So, the CPU runs over and tells the GPU to do an “add”, but by the time the CPU can give the GPU another chunk of work, the GPU has long finished the previous chunk of work. - -Despite the fact that the GPU needs to perform thousands of computations while the CPU only needs to do orchestration work, this is surprisingly common! There’s a variety of reasons for this, ranging from the fact that the CPU is likely running some single-threaded Python to the fact that GPUs are just incredibly fast nowadays. - -Regardless of the reason, we now find ourselves in the **overhead-bound regime**. So, what can we do? One, we could rewrite our implementation in C++, perhaps even eschew frameworks entirely and write raw CUDA. Or.... we could just send more work to the GPU at once. - - -![factory](/assets/images/accelerating-generative-ai-2/image3.png){:style="width:100%;display: block;max-width:500px; margin-left:auto; margin-right:auto;"} - - -By just sending a massive chunk of work at once, we can keep our GPU busy! Although during training, this may just be accomplished by increasing your batch size, how do we do this during inference? - -Enter torch.compile. - - -## Step 1: Reducing CPU overhead through torch.compile and a static kv-cache (107.0 tok/s) - -Torch.compile allows us to capture a larger region into a single compiled region, and particularly when run with mode=”reduce-overhead”, is very effective at reducing CPU overhead. Here, we also specify fullgraph=True, which validates that there are no “graph breaks” in your model (i.e. portions that torch.compile cannot compile). In other words, it ensures that torch.compile is running to its fullest potential. - -To apply it, we [simply wrap a function (or a module) with it](https://github.com/pytorch-labs/gpt-fast/blob/main/generate.py#L296). - - - -``` -torch.compile(decode_one_token, mode="reduce-overhead", fullgraph=True) -``` - - -However, there are a couple of nuances here that make it somewhat nontrivial for folks to get significant performance boosts from applying torch.compile to text generation. - -The first obstacle is the kv-cache. The kv-cache is an inference-time optimization that caches the activations computed for the previous tokens (see [here](https://www.dipkumar.dev/becoming-the-unbeatable/posts/gpt-kvcache/) for a more in-depth explanation). However, as we generate more tokens, the “logical length” of the kv-cache grows. This is problematic for two reasons. One is that reallocating (and copying!) the kv-cache every time the cache grows is simply expensive. The other one is that this dynamism makes it harder to reduce the overhead, as we are no longer able to leverage approaches like cudagraphs. 
- -To resolve this, we use a[ “static” kv-cache](https://github.com/pytorch-labs/gpt-fast/blob/0afae1ace441ce4c5d02ef11a72da28cf7ca4795/generate.py#L154), which means that we statically allocate the maximum size of the kv-cache, and then mask out the unused values in the attention portion of the computation. - -![code](/assets/images/accelerating-generative-ai-2/image2.png){:style="width:100%;"} - -The second obstacle is the prefill phase. Transformer text generation is best thought of as a two phase process: 1. The prefill where the entire prompt is processed, and 2. Decoding where each token is generated autoregressively. - -Although decoding can be made entirely static once the kv-cache is made static, the prefill stage still requires significantly more dynamism, due to having a variable prompt length. Thus, we actually need to compile the two stages with separate compilation strategies. - -![compile](/assets/images/accelerating-generative-ai-2/image9.png){:style="width:100%;"} - - - -Although these details are a bit tricky, the actual implementation is not very difficult at all (see gpt-fast)! And the performance boost is dramatic. - - -![chart](/assets/images/accelerating-generative-ai-2/image28.png){:style="width:100%;display: block;max-width:600px; margin-left:auto; margin-right:auto;"} - - - -All of a sudden, our performance improves by more than 4x! Such performance gains are often common when one’s workload is overhead bound. - - -## Sidenote: How is torch.compile helping? - -It is worth disentangling how exactly torch.compile is improving performance. There’s 2 main factors leading to torch.compile’s performance. - -The first factor, like mentioned above, is overhead reduction. Torch.compile is able to reduce overhead through a variety of optimizations, but one of the most effective ones is called [CUDAGraphs](https://pytorch.org/blog/accelerating-pytorch-with-cuda-graphs/). Although torch.compile applies this automatically for you when “reduce-overhead” is set, saving the extra work and code you need to write when doing this yourself manually without torch.compile. - -The second factor, however, is that torch.compile simply generates faster kernels. In the decoding benchmark above, torch.compile actually generates every single kernel from scratch, including both the matrix multiplications and the attention! And even cooler, these kernels are actually faster than the built in alternatives (CuBLAS and FlashAttention2)! - -This may sound implausible to many of you, considering how hard it is to write efficient matrix multiplication/attention kernels, and how much manpower has been put into CuBLAS and FlashAttention. The key here, however, is that transformer decoding has very unusual computational properties. In particular, because of the KV-cache, for BS=1 _every single matrix multiplication in a transformer is actually a matrix vector multiplication_. - -This means that the computations are completely _memory-bandwidth bound_, and as such, are well within the range of compilers to automatically generate. And in fact, when we benchmark torch.compile’s matrix-vector multiplications against CuBLAS, we find that torch.compile’s kernels are actually quite a bit faster! 
- - -![code](/assets/images/accelerating-generative-ai-2/image24.png){:style="width:100%;display: block;max-width:600px; margin-left:auto; margin-right:auto;"} - - - - -![code](/assets/images/accelerating-generative-ai-2/image17.png){:style="width:100%;display: block;max-width:600px; margin-left:auto; margin-right:auto;"} - - - - -## Step 2: Alleviating memory bandwidth bottleneck through int8 weight-only quantization (157.4 tok/s) - -So, given that we’ve already seen massive speedups from applying torch.compile, is it possible to do even better? One way to think about this problem is to compute how close we are to the theoretical peak. In this case, the largest bottleneck is the cost of loading the weights from GPU global memory to registers. In other words, each forward pass requires us to “touch” every single parameter on the GPU. So, how fast can we theoretically “touch” every single parameter in a model? - - -![weights](/assets/images/accelerating-generative-ai-2/image11.png){:style="width:100%;display: block;max-width:500px; margin-left:auto; margin-right:auto;"} - - -To measure this, we can use **Model Bandwidth Utilization (MBU).** This measures what percentage of our memory bandwidth we’re able to use during inference. - -Computing it is pretty simple. We simply take the total size of our model (# params * bytes per param) and multiply it by the number of inferences we can do per second. Then, we divide this by the peak bandwidth of the GPU to get our MBU. - -![MBU](/assets/images/accelerating-generative-ai-2/image8.png){:style="width:100%;"} - - - -For example, for our above case, we have a 7B parameter model. Each parameter is stored in fp16 (2 bytes per parameter), and we achieved 107 tokens/s. Finally, our A100-80GB has a theoretical 2 TB/s of memory bandwidth. - - -![MBU](/assets/images/accelerating-generative-ai-2/image25.png){:style="width:100%;"} - - - -Putting this all together, we get **72% MBU! **This is quite good, considering that even just copying memory struggles to break 85%. - -But... it does mean that we’re pretty close to the theoretical limit here, and that we’re clearly bottlenecked on just loading our weights from memory. It doesn’t matter what we do - without changing the problem statement in some manner, we might only be able to eek out another 10% in performance. - -Let’s take another look at the above equation. We can’t really change the number of parameters in our model. We can’t really change the memory bandwidth of our GPU (well, without paying more money). But, we **can** change how many bytes each parameter is stored in! - -![MBU](/assets/images/accelerating-generative-ai-2/image18.png){:style="width:100%;"} - - - -Thus, we arrive at our next technique - int8 quantization. The idea here is simple. If loading our weights from memory is our main bottleneck, why don’t we just make the weights smaller? - -![MBU](/assets/images/accelerating-generative-ai-2/image7.png){:style="width:100%;"} - - - -Note that this is quantizing _only_ the weights - the computation itself is still done in bf16. This makes this form of quantization easy to apply with very little to no accuracy degradation. - -Moreover, torch.compile can also easily generate efficient code for int8 quantization. Let’s look again at the above benchmark, this time with int8 weight-only quantization included. 
- - -![code](/assets/images/accelerating-generative-ai-2/image1.png){:style="width:100%;display: block;max-width:600px; margin-left:auto; margin-right:auto;"} - - - -![code](/assets/images/accelerating-generative-ai-2/image27.png){:style="width:100%;display: block;max-width:600px; margin-left:auto; margin-right:auto;"} - - - -As you can see from the dark blue line (torch.compile + int8), there is a significant performance improvement when using torch.compile + int8 weight-only quantization! Moreover, the light-blue line (no torch.compile + int8) is actually much worse than even the fp16 performance! This is because in order to take advantage of the perf benefits of int8 quantization, we need the kernels to be fused. This shows one of the benefits of torch.compile - these kernels can be automatically generated for the user! - -[Applying int8 quantization to our model](https://github.com/pytorch-labs/gpt-fast/blob/main/quantize.py#L314), we see a nice 50% performance improvement, bringing us up to 157.4 tokens/s! - - -![chart](/assets/images/accelerating-generative-ai-2/image19.png){:style="width:100%;display: block;max-width:600px; margin-left:auto; margin-right:auto;"} - - - - -## Step 3: Reframing the problem using speculative decoding - -Even after using techniques like quantization, we’re still faced with another problem. In order to generate 100 tokens, we must load our weights 100 times. - -![diagram](/assets/images/accelerating-generative-ai-2/image5.png){:style="width:100%;"} - - - -Even if the weights are quantized, we still must load our weights over and over, once for each token we generate! Is there any way around this? - -At first glance, the answer might seem like no - there’s a strict serial dependency in our autoregressive generation. However, as it turns out, by utilizing [speculative decoding](https://arxiv.org/abs/2211.17192), we’re able to break this strict serial dependency and obtain speedups! - -![engineers](/assets/images/accelerating-generative-ai-2/image21.png){:style="width:100%;display: block;max-width:600px; margin-left:auto; margin-right:auto;"} - - - -Imagine you had a senior engineer (called Verity), who makes the right technical decisions but is rather slow at writing code. However, you also have a junior engineer (called Drake), who doesn’t always make the right technical decisions but can write code much faster (and cheaper!) than Verity. How can we take advantage of Drake (the junior engineer) to write code faster while ensuring that we are still making the right technical decisions? - - -![engineers](/assets/images/accelerating-generative-ai-2/image6.png){:style="width:100%;display: block;max-width:600px; margin-left:auto; margin-right:auto;"} - - - -First, Drake goes through the labor-intensive process of writing the code, making technical decisions along the way. Next, we give the code to Verity to review. - -![engineers](/assets/images/accelerating-generative-ai-2/image15.png){:style="width:100%;display: block;max-width:600px; margin-left:auto; margin-right:auto;"} - - - -Upon reviewing the code, Verity might decide that the first 3 technical decisions Drake made are correct, but the last 2 need to be redone. So, Drake goes back, throws away his last 2 decisions, and restarts coding from there. - -Notably, although Verity (the senior engineer) has only looked at the code once, we are able to generate 3 pieces of validated code identical to what she would have written! 
Thus, assuming Verity is able to review the code faster than it would have taken her to write those 3 pieces herself, this approach comes out ahead. - -In the context of transformer inference, Verity would be played by the role of the larger model whose outputs we want for our task, called the **verifier model**. Similarly, Drake would be played by a smaller model that’s able to generate text much faster than the larger model, called the **draft model**. So, we would generate 8 tokens using the draft model, and then process all eight tokens in parallel using the verifier model, throwing out the ones that don’t match. - -Like mentioned above, one crucial property of speculative decoding is that **it does not change the quality of the output**. As long as the time it takes for generating the tokens using the draft model + verifying the tokens is less than it would have taken to generate those tokens, we come out ahead. - -One of the great things about doing this all in native PyTorch is that this technique is actually really easy to implement! Here’s the [entirety of the implementation](https://github.com/pytorch-labs/gpt-fast/blob/main/generate.py#L76), in about 50 lines of native PyTorch. - - -![code](/assets/images/accelerating-generative-ai-2/image10.png){:style="width:100%;"} - - - -Although speculative decoding guarantees that we have mathematically identical results compared to regular generation, it does have the property that the runtime performance varies depending on the generated text, as well as how aligned the draft and verifier model are. For example, when running CodeLlama-34B + CodeLlama-7B, we’re able to obtain a 2x boost in tokens/s for generating code. On the other hand, when using Llama-7B + TinyLlama-1B, we’re only able to obtain about a 1.3x boost in tokens/s. - - -## Sidenote: Running this on AMD - -Like mentioned above, every single kernel in decoding is generated from scratch by torch.compile, and is converted into OpenAI Triton. As AMD has a [torch.compile backend](https://pytorch.org/blog/experience-power-pytorch-2.0/) (and also a Triton backend), we can simply go through all of the optimizations above... but on an AMD GPU! With int8 quantization, we’re able to achieve 102.5 tokens/s with one GCD (i.e. one half) of a MI250x! - -![chart](/assets/images/accelerating-generative-ai-2/image4.png){:style="width:100%;display: block;max-width:600px; margin-left:auto; margin-right:auto;"} - - - - -## Step 4: Reducing the size of the weights even more with int4 quantization and GPTQ (202.1 tok/s) - -Of course, if reducing the weights down from 16 bits to 8 bits allows for speedups by reducing the number of bytes we need to load, reducing the weights down to 4 bits would result in even larger speedups! - -Unfortunately, when reducing weights down to 4-bits, the accuracy of the model starts to become a much larger concern. From our preliminary evals, we see that although using int8 weight-only quantization has no perceptible accuracy degradation, using int4 weight-only quantization does. - - -![table](/assets/images/accelerating-generative-ai-2/image13.png){:style="width:100%;"} - - - -There are 2 main tricks we can use to limit the accuracy degradation of int4 quantization. - -The first one is to have a more granular scaling factor. One way to think about the scaling factor is that when we have a quantized tensor representation, it is on a sliding scale between a floating point tensor (each value has a scaling factor) and an integer tensor (no values have a scaling factor). 
For example, with int8 quantization, we had one scaling factor per row. If we want higher accuracy, however, we can change that to “one scaling factor per 32 elements”. We choose a group size of 32 to minimize accuracy degradation, and this is also a common choice among the community. - -The other one is to use a more advanced quantization strategy than simply rounding the weights. For example, approaches like [GPTQ](https://arxiv.org/abs/2210.17323) leverage example data in order to calibrate the weights more accurately. In this case, we prototype an implementation of GPTQ in the repository based off of PyTorch’s recently released [torch.export](https://pytorch.org/tutorials/intermediate/torch_export_tutorial.html). - -In addition, we need kernels that fuse int4 dequantize with the matrix vector multiplication. In this case, torch.compile is unfortunately not able to generate these kernels from scratch, so we leverage some handwritten CUDA kernels in PyTorch. - -These techniques require some additional work, but putting them all together results in even better performance! - - -![chart](/assets/images/accelerating-generative-ai-2/image12.png){:style="width:100%;display: block;max-width:600px; margin-left:auto; margin-right:auto;"} - - -## Step 5: Combining everything together (244.7 tok/s) - -Finally, we can compose all of the techniques together to achieve even better performance! - - -![chart](/assets/images/accelerating-generative-ai-2/image22.png){:style="width:100%;display: block;max-width:600px; margin-left:auto; margin-right:auto;"} - - -## Step 6: Using Tensor Parallelism - -So far, we’ve been restricting ourselves to minimizing latency while on a single GPU. In many settings, however, we have access to multiple GPUs. This allows us to improve our latency further! - -To get an intuitive sense of why this would allow us to improve our latency, let’s take a look at the prior equation for MBU, particularly the denominator. Running on multiple GPUs gives us access to more memory bandwidth, and thus, higher potential performance. - -![MBU](/assets/images/accelerating-generative-ai-2/image8.png){:style="width:100%;"} - - -As for which parallelism strategy to pick, note that in order to reduce our latency for one example, we need to be able to leverage our memory bandwidth across more devices simultaneously. This means that we need to split the processing of one token across multiple devices. In other words, we need to use tensor parallelism. - -Luckily, PyTorch also provides low-level tools for tensor-parallelism that compose with torch.compile. We are also working on higher-level APIs for expressing tensor parallelism, stay tuned for those! - -However, even without a higher-level API, it’s actually still quite easy to add tensor parallelism. Our implementation comes in at [150 lines of code](https://github.com/pytorch-labs/gpt-fast/blob/main/tp.py), and doesn’t require any model changes. - - -![code](/assets/images/accelerating-generative-ai-2/image20.png){:style="width:100%;"} - - -We are still able to take advantage of all the optimizations mentioned previously, which all can continue to compose with tensor parallelism. Combining these together, we’re able to serve Llama-70B at 55 tokens/s with int8 quantization! - -![chart](/assets/images/accelerating-generative-ai-2/image26.png){:style="width:100%;"}{:style="width:100%;display: block;max-width:600px; margin-left:auto; margin-right:auto;"} - - - -## Conclusion - -Let’s take a look at what we’re able to accomplish. - - - -1. 
Simplicity: Ignoring quantization, [model.py](https://github.com/pytorch-labs/gpt-fast/blob/main/model.py) (244 LOC) + [generate.py](https://github.com/pytorch-labs/gpt-fast/blob/main/generate.py) (371 LOC) + [tp.py](https://github.com/pytorch-labs/gpt-fast/blob/main/tp.py) (151 LOC) comes out to 766 LOC to implement fast inference + speculative decoding + tensor-parallelism. -2. Performance: With Llama-7B, we’re able to use compile + int4 quant + speculative decoding to reach 241 tok/s. With llama-70B, we’re able to also throw in tensor-parallelism to reach 80 tok/s. These are both close to or surpassing SOTA performance numbers! - -PyTorch has always allowed for simplicity, ease of use, and flexibility. However, with torch.compile, we can throw in performance as well. - -The code can be found here: [https://github.com/pytorch-labs/gpt-fast](https://github.com/pytorch-labs/gpt-fast). We hope that the community finds it useful. Our goal with this repo is not to provide another library or framework for people to import. Instead, we encourage users to copy-paste, fork, and modify the code in the repo. - - -## Acknowledgements - -We would like to thank the vibrant open source community for their continual support of scaling LLMs, including: - - - -* Lightning AI for supporting pytorch and work in flash attention, int8 quantization, and LoRA fine-tuning. -* GGML for driving forward fast, on device inference of LLMs -* Andrej Karpathy for spearheading simple, interpretable and fast LLM implementations -* MLC-LLM for pushing 4-bit quantization performance on heterogenous hardware diff --git a/_posts/2023-12-05-snowflake-joins-pytorch.md b/_posts/2023-12-05-snowflake-joins-pytorch.md deleted file mode 100644 index 00f2246ce520..000000000000 --- a/_posts/2023-12-05-snowflake-joins-pytorch.md +++ /dev/null @@ -1,35 +0,0 @@ ---- -layout: blog_detail -title: "Snowflake Joins the PyTorch Foundation as a General Member" ---- - -![Snowflake logo](/assets/images/snowflake-logo.svg){:style="max-width:350px;float:right;margin: 20px;"} - -The PyTorch Foundation, a neutral home for the deep learning community to collaborate on the open source PyTorch framework and ecosystem, is announcing today that Snowflake has joined as a general member. - -Snowflake enables thousands of organizations to unite siloed data, discover and securely share data, power data applications, and execute diverse AI/ML and analytic workloads across multiple clouds and geographies. - -"By joining the PyTorch community, we know that Snowflake will help accelerate data warehousing solutions and cutting-edge AI frameworks. This showcases the commitment to advancing innovation for data and artificial intelligence,” said Ibrahim Haddad, Executive Director, PyTorch Foundation. “We are thrilled to have Snowflake join the PyTorch Foundation, marking a significant stride in the convergence of data management and deep learning technologies." - -Snowflake enables collaboration with AI technologies to handle the storage and analysis of large datasets generated by machine learning and AI applications through scalability and SQL support. - -With the integrated repository of Python libraries from Anaconda in Snowpark, Snowflake users have always had a streamlined experience to deploy pre-trained PyTorch models in Snowflake to easily and securely make them a part of applications. 
Now with the addition of GPU instances in Snowpark Container Services (in private preview), training and other computationally intensive processing using PyTorch will also be streamlined, providing teams with an end-to-end solution for AI development and deployment. - -"Most if not all of our customers incorporate open source software as part of their data stacks, so it is critical for us to work with open source ecosystems like the PyTorch Foundation, alongside incorporating open source to meet the needs of our customers," said Adrien Treuille, Co-Founder of Streamlit, Director of Product Management at Snowflake. "As AI developers continue to integrate their models as part of applications, the power of Snowflake and PyTorch — coupled with Streamlit as the powerful front-end — creates near-limitless innovation for developers looking to build next-generation apps and unlock even more use cases." - -To learn more about the power of Snowflake and PyTorch, tune into Snowflake’s developer conference for AI and apps, BUILD. - -To learn more about how you can be a part of the PyTorch Foundation, visit our [website](https://pytorch.org/join). - -## About Snowflake - -Snowflake enables every organization to mobilize their data with Snowflake’s Data Cloud. Customers use the Data Cloud to unite siloed data, discover and securely share data, power data applications, and execute diverse AI/ML and analytic workloads. Wherever data or users live, Snowflake delivers a single data experience that spans multiple clouds and geographies. Thousands of customers across many industries, including 639 of the 2023 Forbes Global 2000 (G2K) as of July 31, 2023, use Snowflake Data Cloud to power their businesses. Learn more at [snowflake.com](https://www.snowflake.com/). - -## About PyTorch Foundation - -The PyTorch Foundation is a neutral home for the deep learning community to collaborate on the open source PyTorch framework and ecosystem. The PyTorch Foundation is supported by its members and leading contributors to the PyTorch open source project. The Foundation leverages resources provided by members and contributors to enable community discussions and collaboration. - - -## About The Linux Foundation - -The Linux Foundation is the world’s leading home for collaboration on open source software, hardware, standards, and data. Linux Foundation projects are critical to the world’s infrastructure including Linux, Kubernetes, Node.js, ONAP, PyTorch, RISC-V, SPDX, OpenChain, and more. The Linux Foundation focuses on leveraging best practices and addressing the needs of contributors, users, and solution providers to create sustainable models for open collaboration. For more information, please visit us at linuxfoundation.org. The Linux Foundation has registered trademarks and uses trademarks. For a list of trademarks of The Linux Foundation, please see its [trademark usage page](https://www.linuxfoundation.org/trademark-usage). Linux is a registered trademark of Linus Torvalds. 
\ No newline at end of file diff --git a/_posts/2023-12-12-dinosaurs-to-seismic-imaging.md b/_posts/2023-12-12-dinosaurs-to-seismic-imaging.md deleted file mode 100644 index a2bca4c7b0d8..000000000000 --- a/_posts/2023-12-12-dinosaurs-to-seismic-imaging.md +++ /dev/null @@ -1,123 +0,0 @@ ---- -layout: blog_detail -title: "From PyTorch Conference 2023: From Dinosaurs to Seismic Imaging with Intel" -author: Ramya Ravi, Susan Kahler at Intel ---- - -![Dinosaur fossil](/assets/images/hunting-dinosaurs-with-intel-ai-fig1.jpeg){:style="width:100%;"} - - -## Lightning Talk 1: Seismic Data to Subsurface Models with OpenFWI - -Speaker: Benjamin Consolvo, AI Software Engineering Manager, Intel, [LinkedIn](https://linkedin.com/in/bconsolvo) - -### Session Overview - -In this session, Ben begins with an overview of seismic imaging and full waveform inversion (FWI). Seismic imaging and FWI helps us to explore land for important subsurface minerals necessary for human thriving. To find those crucial subsurface minerals, we need to image the subsurface with a high degree of accuracy at a low cost, which involves two main challenges. He explains the solutions for those challenges using AI, which are summarized below. - - - - - - - - - - - - - - - -
| Challenges | Solutions using AI |
|---|---|
| Traditional physics based FWI requires an accurate starting model. | Data-driven deep learning solutions do not require an accurate starting model. |
| GPUs are typically used for fine-tuning neural networks but are often unavailable and expensive. | CPUs are highly available, inexpensive, and viable for AI fine-tuning. The new 4th Gen Intel® Xeon® Scalable processor has the built-in AI accelerator engine called Intel® AMX (Intel® Advanced Matrix Extensions) that helps to accelerate AI training and inference performance. |

Next, he shows the wave propagation for the subsurface model and the corresponding seismic shot gathers. In his example, the shot gathers are synthetically generated, time-sampled sound recordings from a shot (such as a dynamite explosion or a vibroseis truck) captured by geophones spread across a large area. For this application, the training data consists of pairs of subsurface model images and seismic shot gather images, where the model is predicted from the shot gather.

|  | Number of Seismic Shot Images | Number of Subsurface Model Images |
|---|---|---|
| Train | 120,000 | 24,000 |
| Test | 25,000 | 5,000 |
| Validation | 5,000 | 1,000 |
        - - -In this application, the algorithm used during training was InversionNET (encoder-decoder convolutional neural network). Check out the implementation details for InversionNET architecture in [Deng et al. (2021)](https://arxiv.org/abs/2111.02926). - -He then shows the results: - - - -1. Prediction versus ground truth model after one epoch and at 50 epochs. After training InversionNET, the predicted model is much closer to the ground truth image. -2. Training loss and validation loss curves decreasing over time across 50 epochs. - -Finally, Ben concludes his talk by highlighting that he was able to successfully fine-tune a deep neural network without an accurate starting model to obtain subsurface model on a 4th generation Intel® Xeon® Scalable processor. - -Watch the [full video recording here](https://www.youtube.com/watch?v=TPp_Zyco6X4&list=PL_lsbAsL_o2BivkGLiDfHY9VqWlaNoZ2O&index=56) and download the [presentation](https://static.sched.com/hosted_files/pytorch2023/57/20231017_Consolvo_Seismic_PyTorchConf.pdf). More details can be found in this [blog](https://medium.com/better-programming/seismic-data-to-subsurface-models-with-openfwi-bcca0218b4e8). - -### About the Speaker - -![Ben Consolvo](/assets/images/ben-consolvo.jpg){:style="max-width:220px;float:right;margin-left: 20px;"} - -Ben Consolvo is an AI Solutions Engineering Manager at Intel. He has been building a team and a program around Intel’s AI technology paired with Intel’s hardware offerings. He brings a background and passion in data science, particularly in deep learning (DL) and computer vision. He has applied his skills in DL in the cybersecurity industry to automatically identify phishing websites, as well as to the oil and gas industry to identify subsurface features for geophysical imaging. - -## Lightning Talk 2: Dinosaur Bone Hunt - -Speaker: Bob Chesebrough, Sr Solution Architect, Intel, [LinkedIn](https://www.linkedin.com/in/robertchesebrough/) - -### Session Overview - -In this session, Bob starts the presentation by explaining his interest in collecting dinosaur bones and gives an overview of [Intel AI Software portfolio](https://www.intel.com/content/www/us/en/developer/topic-technology/artificial-intelligence/overview.html). - -He then explains the steps to create a dinosaur site treasure map or dinosaur bone likelihood map: - - - -1. Collect data and create training data (New Mexico aerial photos of the Morrison Formation - a famous dinosaur bone bed in the Western United States and the GPS coordinates for small bone fragments discovered) -2. Train a simple ResNet 18 model using [Intel® Extension for PyTorch](https://www.intel.com/content/www/us/en/developer/tools/oneapi/optimization-for-pytorch.html#gs.1jggir) -3. Score the model on Utah photos and create a heat map - -Finally, Bob shows the results that dinosaur bones were discovered in Utah using dinosaur bone likelihood map. Go to the [GitHub repository](https://github.com/intelsoftware/jurassic) to access the code sample and try out the sample using Intel Extension for PyTorch. - -Watch the [full video recording here](https://www.youtube.com/watch?v=Q_soyAhduKk&list=PL_lsbAsL_o2BivkGLiDfHY9VqWlaNoZ2O&index=67) and download the [presentation](https://static.sched.com/hosted_files/pytorch2023/86/PyTorch_Conf_Chesebrough_2023_PPT.pdf). More details can be found in this [blog](https://www.intel.com/content/www/us/en/developer/articles/technical/intel-ai-step-by-step-guide-for-hunting-dinosaurs.html). 
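
Both lightning talks fine-tune convolutional networks on Xeon CPUs, and the second talk explicitly uses Intel Extension for PyTorch for its ResNet-18. As a rough sketch of what that setup can look like (the actual code lives in the linked GitHub repositories; the model choice, bfloat16 dtype, and training-loop details below are illustrative placeholders):

```
# Illustrative sketch only; see the linked GitHub repositories for the real samples.
# Assumes torch, torchvision, and intel_extension_for_pytorch are installed.
import torch
import intel_extension_for_pytorch as ipex
from torchvision import models

model = models.resnet18(num_classes=2)  # e.g. "bone" vs. "no bone" tiles (placeholder)
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3, momentum=0.9)
criterion = torch.nn.CrossEntropyLoss()

# ipex.optimize applies CPU-side optimizations (operator fusion, and bf16 paths
# that can use Intel AMX on 4th Gen Xeon processors).
model.train()
model, optimizer = ipex.optimize(model, optimizer=optimizer, dtype=torch.bfloat16)

def train_step(images, labels):
    with torch.autocast(device_type="cpu", dtype=torch.bfloat16):
        loss = criterion(model(images), labels)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    return loss.item()
```
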
- -### About the Speaker - -![Bob Chesebrough](/assets/images/bob-chesebrough.jpg){:style="max-width:220px;float:right;margin-left: 20px;"} - -Bob Chesebrough's industry experience is software development/AI solution engineering for fortune 100 companies and national laboratories for over three decades. He is also a hobbyist who has logged over 800 miles and 1000 hours in the field finding dinosaur bones. He and his sons discovered an important fossil of the only known crocodilian from the Jurassic in New Mexico, they have also discovered and logged into the museum 2000+ bones localities and described a new mass bone bed in New Mexico. \ No newline at end of file diff --git a/_posts/2023-12-14-understanding-gpu-memory-1.md b/_posts/2023-12-14-understanding-gpu-memory-1.md deleted file mode 100644 index 166558d49ceb..000000000000 --- a/_posts/2023-12-14-understanding-gpu-memory-1.md +++ /dev/null @@ -1,391 +0,0 @@ ---- -layout: blog_detail -title: "Understanding GPU Memory 1: Visualizing All Allocations over Time" -author: Aaron Shi, Zachary DeVito ---- - -During your time with PyTorch on GPUs, you may be familiar with this common error message: - - -``` -torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 512.00 MiB. GPU 0 has a total capacity of 79.32 GiB of which 401.56 MiB is free. -``` - - -In this series, we show how to use memory tooling, including the Memory Snapshot, the Memory Profiler, and the Reference Cycle Detector to debug out of memory errors and improve memory usage. - -![Memory Timeline](/assets/images/understanding-gpu-memory-1/fig1.png){:style="width:100%;"} - -The **Memory Snapshot** tool provides a fine-grained GPU memory visualization for debugging GPU OOMs. Captured memory snapshots will show memory events including allocations, frees and OOMs, along with their stack traces. - -In a snapshot, each tensor’s memory allocation is color coded separately. The x axis is over time, and the y axis is the amount of GPU memory in MB. The snapshot is interactive, so we can observe the stack trace for any allocation by mousing over. Try it yourself at [https://github.com/pytorch/pytorch.github.io/blob/site/assets/images/understanding-gpu-memory-1/snapshot.html](https://github.com/pytorch/pytorch.github.io/blob/site/assets/images/understanding-gpu-memory-1/snapshot.html). - - -In this snapshot, there are 3 peaks showing the memory allocations over 3 training iterations (this is configerable). When looking at the peaks, it is **easy to see the rise of memory in the forward** **pass** and the **fall during the backward pass** as the gradients are computed. It is also possible to see that the program has the **same pattern of memory use iteration to iteration**. One thing that stands out is the many **tiny spikes in memory**, by mousing over them, we see that they are buffers used temporarily by convolution operators. - - -### Capturing Memory Snapshots - -The API to capture memory snapshots is fairly simple and available in torch.cuda.memory: - - - -* **Start:** `torch.cuda.memory._record_memory_history(max_entries=100000)` -* **Save:** `torch.cuda.memory._dump_snapshot(file_name)` -* **Stop:** `torch.cuda.memory._record_memory_history(enabled=None)` - -**Code Snippet** (for full code sample, see **Appendix A**): - - -``` - # Start recording memory snapshot history, initialized with a buffer - # capacity of 100,000 memory events, via the `max_entries` field. 
- torch.cuda.memory._record_memory_history( - max_entries=MAX_NUM_OF_MEM_EVENTS_PER_SNAPSHOT - ) - - # Run your PyTorch Model. - # At any point in time, save a snapshot to file for later. - for _ in range(5): - pred = model(inputs) - loss_fn(pred, labels).backward() - optimizer.step() - optimizer.zero_grad(set_to_none=True) - - # In this sample, we save the snapshot after running 5 iterations. - # - Save as many snapshots as you'd like. - # - Snapshots will save last `max_entries` number of memory events - # (100,000 in this example). - try: - torch.cuda.memory._dump_snapshot(f"{file_prefix}.pickle") - except Exception as e: - logger.error(f"Failed to capture memory snapshot {e}") - - # Stop recording memory snapshot history. - torch.cuda.memory._record_memory_history(enabled=None) -``` - - -To visualize the snapshot file, we have a tool hosted at [https://pytorch.org/memory_viz](https://pytorch.org/memory_viz). There, you can drag and drop your saved snapshot file and it will plot each allocation over time. **Privacy Note:** The tool will not save your snapshot. - - -![Memory Timeline](/assets/images/understanding-gpu-memory-1/fig2.png){:style="width:100%;"} - - -Alternatively, you can generate an HTML from a .pickle by using the script at pytorch/torch/cuda/_memory_viz.py, here is an example: - - -``` -python torch/cuda/_memory_viz.py trace_plot snapshot.pickle -o snapshot.html -``` - - -## Debugging CUDA OOMs - -Let’s look at how we can use the memory snapshot tool to answer: - - - -1. Why did a **CUDA OOM** happen? -2. Where is the **GPU Memory being used**? - - -### ResNet50 with a bug - -We’ve taken a look at a properly working model in the first snapshot. Now, let’s take a look at a training example with a bug, see snapshot: - - -![Memory Timeline](/assets/images/understanding-gpu-memory-1/fig3.png){:style="width:100%;"} - - -Notice how the **second iteration uses far more memory** than the first iteration. If this model were much larger, it could have **CUDA OOM'd in the second iteration** without much more insight into why. - - -![Memory Timeline](/assets/images/understanding-gpu-memory-1/fig4.png){:style="width:100%;"} - - -When examining this snapshot further, we can clearly see that several tensors are staying alive from the first iteration to the second and later iterations. If we mouse over one of these tensors, it would show a **stack trace suggesting that these were gradient tensors**. - -And indeed if we go to the code, we can see that **it doesn’t clear the gradient tensors**, when it could have **cleared them before the forward**. - - -Before: -``` - for _ in range(num_iters): - pred = model(inputs) - loss_fn(pred, labels).backward() - optimizer.step() -``` - -After: -``` - for _ in range(num_iters): - pred = model(inputs) - loss_fn(pred, labels).backward() - optimizer.step() - # Add this line to clear grad tensors - optimizer.zero_grad(set_to_none=True) -``` - - -We can simply add an `optimizer.zero_grad(set_to_none=True)` instruction to clear the gradient tensors from iteration to iteration (more details about why we need to zero the gradients here: [https://pytorch.org/tutorials/recipes/recipes/zeroing_out_gradients.html](https://pytorch.org/tutorials/recipes/recipes/zeroing_out_gradients.html)). - -This is a simplification of a bug we've found in more complicated programs using this tool. We encourage you to try out the Memory Snapshot on your GPU memory problems and let us know how it goes. 
- - -### ResNet50 after bug fix - -After applying the fix, the snapshot seems to be clearing the gradients now. - -![Memory Timeline](/assets/images/understanding-gpu-memory-1/fig5.png){:style="width:100%;"} - - -We now have the snapshot of a properly working ResNet50 model. Try out the code yourself (see code sample in **Appendix A**). - -But you may be wondering, **why is there still an increase in memory after the first iteration?** To answer this, let’s visit the **Memory Profiler** in the next section. - - -## Categorized Memory Usage - -The **Memory Profiler** is an added feature of the PyTorch Profiler that **categorizes** memory usage over time. We still rely on the Memory Snapshot for stack traces for deep dives into memory allocations. - -To generate a memory timeline, here is a code snippet (full code sample in **Appendix B**): - - -``` - # Initialize the profiler context with record_shapes, profile_memory, - # and with_stack set to True. - with torch.profiler.profile( - activities=[ - torch.profiler.ProfilerActivity.CPU, - torch.profiler.ProfilerActivity.CUDA, - ], - schedule=torch.profiler.schedule(wait=0, warmup=0, active=6, repeat=1), - record_shapes=True, - profile_memory=True, - with_stack=True, - on_trace_ready=trace_handler, - ) as prof: - # Run the PyTorch Model inside the profile context. - for _ in range(5): - prof.step() - with record_function("## forward ##"): - pred = model(inputs) - - with record_function("## backward ##"): - loss_fn(pred, labels).backward() - - with record_function("## optimizer ##"): - optimizer.step() - optimizer.zero_grad(set_to_none=True) - - # Construct the memory timeline HTML plot. - prof.export_memory_timeline(f"{file_prefix}.html", device="cuda:0") -``` - - -For further reference, see [https://pytorch.org/docs/main/profiler.html](https://pytorch.org/docs/main/profiler.html). - -The Memory Profiler automatically generates categories based on the graph of tensor operations recorded during profiling. - - -![Memory Timeline](/assets/images/understanding-gpu-memory-1/fig6.png){:style="width:100%;"} - - -In this Memory Timeline collected using the Memory Profiler, we have the same training example as before. We can observe the **gradients in blue are now being cleared** from iteration to iteration. We can also notice that the **optimizer state in yellow is allocated after the first iteration**, and is kept constant for the rest of the job. - -This optimizer state is the reason behind the increase of GPU memory from the first iteration to the second. Try out the code yourself (see code sample in **Appendix B**). The Memory Profiler helps to improve training **memory understanding** so that model authors can figure out which categories are using the most GPU memory. - - -## Where can I find these tools? - -We hope that these tools will greatly improve your ability to debug CUDA OOMs and to understand your memory usage by category. - -The Memory Snapshot and the Memory Profiler are available in the v2.1 release of PyTorch as experimental features. - - - -* More information about the Memory Snapshot can be found in the [PyTorch Memory docs here](https://pytorch.org/docs/main/torch_cuda_memory.html). -* More details about the Memory Profiler can be found in the [PyTorch Profiler docs here](https://pytorch.org/docs/main/profiler.html). - - -## Feedback - -We look forward to hearing from you about any enhancements, bugs or memory stories that our tools helped to solve! As always, please feel free to open new issues on PyTorch’s Github page. 
- -We are also open to contributions from the OSS community, feel free to tag [Aaron Shi](https://github.com/aaronenyeshi) and [Zachary DeVito](https://github.com/zdevito) in any Github PRs for reviews. - - -## Acknowledgements - -Really appreciate the content reviewers, [Mark Saroufim](mailto:marksaroufim@meta.com) and [Gregory Chanan](mailto:gchanan@meta.com), for reviewing this post and improving its readability. - -Really appreciate the code reviews and feedback from [Adnan Aziz](mailto:adnanaziz@meta.com) and [Lei Tian](mailto:ltian@meta.com). - -## Appendix - - -### Appendix A - ResNet50 Memory Snapshot Code Example - - -``` -# (c) Meta Platforms, Inc. and affiliates. -import logging -import socket -from datetime import datetime, timedelta - -import torch - -from torchvision import models - -logging.basicConfig( - format="%(levelname)s:%(asctime)s %(message)s", - level=logging.INFO, - datefmt="%Y-%m-%d %H:%M:%S", -) -logger: logging.Logger = logging.getLogger(__name__) -logger.setLevel(level=logging.INFO) - -TIME_FORMAT_STR: str = "%b_%d_%H_%M_%S" - -# Keep a max of 100,000 alloc/free events in the recorded history -# leading up to the snapshot. -MAX_NUM_OF_MEM_EVENTS_PER_SNAPSHOT: int = 100000 - -def start_record_memory_history() -> None: - if not torch.cuda.is_available(): - logger.info("CUDA unavailable. Not recording memory history") - return - - logger.info("Starting snapshot record_memory_history") - torch.cuda.memory._record_memory_history( - max_entries=MAX_NUM_OF_MEM_EVENTS_PER_SNAPSHOT - ) - -def stop_record_memory_history() -> None: - if not torch.cuda.is_available(): - logger.info("CUDA unavailable. Not recording memory history") - return - - logger.info("Stopping snapshot record_memory_history") - torch.cuda.memory._record_memory_history(enabled=None) - -def export_memory_snapshot() -> None: - if not torch.cuda.is_available(): - logger.info("CUDA unavailable. Not exporting memory snapshot") - return - - # Prefix for file names. - host_name = socket.gethostname() - timestamp = datetime.now().strftime(TIME_FORMAT_STR) - file_prefix = f"{host_name}_{timestamp}" - - try: - logger.info(f"Saving snapshot to local file: {file_prefix}.pickle") - torch.cuda.memory._dump_snapshot(f"{file_prefix}.pickle") - except Exception as e: - logger.error(f"Failed to capture memory snapshot {e}") - return - -# Simple Resnet50 example to demonstrate how to capture memory visuals. -def run_resnet50(num_iters=5, device="cuda:0"): - model = models.resnet50().to(device=device) - inputs = torch.randn(1, 3, 224, 224, device=device) - labels = torch.rand_like(model(inputs)) - optimizer = torch.optim.SGD(model.parameters(), lr=1e-2, momentum=0.9) - loss_fn = torch.nn.CrossEntropyLoss() - - # Start recording memory snapshot history - start_record_memory_history() - - for _ in range(num_iters): - pred = model(inputs) - loss_fn(pred, labels).backward() - optimizer.step() - optimizer.zero_grad(set_to_none=True) - - # Create the memory snapshot file - export_memory_snapshot() - - # Stop recording memory snapshot history - stop_record_memory_history() - -if __name__ == "__main__": - # Run the resnet50 model - run_resnet50() -``` - - - -### Appendix B - ResNet50 Memory Profiler Code Example - - -``` -# (c) Meta Platforms, Inc. and affiliates. 
-import logging -import socket -from datetime import datetime, timedelta - -import torch - -from torch.autograd.profiler import record_function -from torchvision import models - -logging.basicConfig( - format="%(levelname)s:%(asctime)s %(message)s", - level=logging.INFO, - datefmt="%Y-%m-%d %H:%M:%S", -) -logger: logging.Logger = logging.getLogger(__name__) -logger.setLevel(level=logging.INFO) - -TIME_FORMAT_STR: str = "%b_%d_%H_%M_%S" - -def trace_handler(prof: torch.profiler.profile): - # Prefix for file names. - host_name = socket.gethostname() - timestamp = datetime.now().strftime(TIME_FORMAT_STR) - file_prefix = f"{host_name}_{timestamp}" - - # Construct the trace file. - prof.export_chrome_trace(f"{file_prefix}.json.gz") - - # Construct the memory timeline file. - prof.export_memory_timeline(f"{file_prefix}.html", device="cuda:0") - -def run_resnet50(num_iters=5, device="cuda:0"): - model = models.resnet50().to(device=device) - inputs = torch.randn(1, 3, 224, 224, device=device) - labels = torch.rand_like(model(inputs)) - optimizer = torch.optim.SGD(model.parameters(), lr=1e-2, momentum=0.9) - loss_fn = torch.nn.CrossEntropyLoss() - - with torch.profiler.profile( - activities=[ - torch.profiler.ProfilerActivity.CPU, - torch.profiler.ProfilerActivity.CUDA, - ], - schedule=torch.profiler.schedule(wait=0, warmup=0, active=6, repeat=1), - record_shapes=True, - profile_memory=True, - with_stack=True, - on_trace_ready=trace_handler, - ) as prof: - for _ in range(num_iters): - prof.step() - with record_function("## forward ##"): - pred = model(inputs) - - with record_function("## backward ##"): - loss_fn(pred, labels).backward() - - with record_function("## optimizer ##"): - optimizer.step() - optimizer.zero_grad(set_to_none=True) - -if __name__ == "__main__": - # Warm up - run_resnet50() - # Run the resnet50 model - run_resnet50() -``` \ No newline at end of file diff --git a/_posts/2023-12-15-empowering-models-performance.md b/_posts/2023-12-15-empowering-models-performance.md deleted file mode 100644 index 5bc2c88059ac..000000000000 --- a/_posts/2023-12-15-empowering-models-performance.md +++ /dev/null @@ -1,102 +0,0 @@ ---- -layout: blog_detail -title: "Empowering Models with Performance: The Art of Generalized Model Transformation Approach" -author: Jackie (Jiaqi) Xu, Yanbo Liang, Jason Ansel, Chunzhi Yang, Jade Nie, Yuzhen Huang, CK Luk, Xiaodong Wang, Lu Fang, Menglu Yu, Jinwon Lee, Daohang Shi, Flavio Sales Truzzi ---- - -## Introduction - -[PyTorch 2.0](https://pytorch.org/get-started/pytorch-2.0/) (PT2) offers a compiled execution mode which rewrites Python bytecode to extract sequences of PyTorch operations, translating them into a Graph IR. The IR is then just-in-time compiled through a customizable back end, improving training performance without user interference. Often, production models may go through multiple stages of optimization/lowering to hit performance targets. Therefore, having a compiled mode is desirable as it can separate the work of improving model performance from direct modification of the PyTorch model implementation. Thus, the compiled mode becomes more important, enabling Pytorch users to enhance model performance without modifying the PyTorch code implementation. This feature is particularly valuable for optimizing complex models, including large-scale and production-ready ones. 
In our previous [blog post](https://pytorch.org/blog/optimizing-production-pytorch-performance-with-graph-transformations/), we outlined how heuristic model transformation rules are employed to optimize intricate production models. While these rules enabled substantial performance gains for some pilot models, they lacked universal adaptability; they did not consistently perform well across different models, or sometimes even across different sections of a single model.


![Fig.1 PT1 Graph mode vs PT2 Compile mode.](/assets/images/empowering-models-performance/fig1.jpg){:style="width:100%;"}

        Fig. 1: PT1 Graph mode vs PT2 Compile mode.

In this blog post, we propose a more generalized model transformation solution that serves as a plugin to the PT2 compiler, as shown in Fig.1. It is more general, more performant, and more user-friendly, bringing performance improvements to both model training and inference without manual effort. As illustrated in Fig.2, by incorporating the previously user-defined transformations into the compiler, we have streamlined the production stack. This work has already been incorporated into PT2 and is ready for use, so its benefits extend to a broad range of PyTorch models beyond just Meta's models.


![Fig.2 Simplified stack with PT2 compile mode.](/assets/images/empowering-models-performance/fig2.jpg){:style="width:100%;"}

        Fig. 2: Simplified stack with PT2 compile mode.

## Guiding Principle: Atomic Rules

Traditionally, people might use predefined heuristic rules to replace a model subgraph with another, more performant subgraph to reduce launch overhead, minimize memory bandwidth, and fully occupy SMs. However, this approach doesn’t scale well, as it is hard to craft a set of rules that fits all models perfectly.

Instead of grappling with bulky, complex rules, we can break them down into smaller, more digestible pieces – what we call '**atomic rules**'. These tiny powerhouses of efficiency each target the transformation of individual operators, conducting one step of the fusion/transformation. This makes them easy to handle and apply, offering a straightforward path to optimizing models. So, with these atomic rules in hand, optimizing any model for top-tier performance becomes a breeze!

We will walk through some simple examples to demonstrate how we use a chain of atomic rules to replace complicated heuristic rules.


### Case 1: Horizontal fusion of computation chains started with accesses to embedding tables

Horizontal fusion means fusing parallel operators into one so as to reduce the number of kernels to be launched and improve performance. In our previous blog ([Section 3.2](https://pytorch.org/blog/optimizing-production-pytorch-performance-with-graph-transformations/#32-horizontal-fusion-of-computation-chains-started-with-accesses-to-embedding-tables)), we described model transformations that fused layernorm and activation functions after embedding bags, as shown in the figure provided. However, this method had limitations:



1. It only worked with layernorm and activation functions after embedding.
2. It was restricted to models with specific architecture rules, causing various issues in our production stack, including parameter changes and inference disruptions.

To improve, we can use three atomic rules, as shown in Fig.3, to replace the complicated heuristic rule:



* Fuse layernorms that follow the same split nodes horizontally.
* Then, fuse tanh functions following the same split nodes horizontally.
* Lastly, fuse vertical split-cat nodes.

These atomic rules offer a clean and streamlined way to simplify and optimize the model.

![Fig.3 Before, we optimized the model in one go by replacing subgraphs. Now, with atomic rules, we optimize step-by-step, covering more cases.](/assets/images/empowering-models-performance/fig3.jpg){:style="width:100%;"}

        Fig. 3: Before, we optimized the model in one go by replacing subgraphs. Now, with atomic rules, we optimize step-by-step, covering more cases.

        - - - -### Case 2: Fuse horizontal MLP - -[MLP](https://en.wikipedia.org/wiki/Multilayer_perceptron)s (Multilayer Perceptrons) are fundamental components of deep neural networks, often consisting of linear, normalization, and activation functions. In complex models, there’s often a need to fuse many horizontal MLPs. Traditional methods find and replace parallel MLPs with a fused module as shown in Fig.4, but this isn’t always straightforward. Some models might not have normalization, or they might use different activation functions, making it hard to apply a one-size-fits-all rule. - -This is where our atomic rules come in handy. These simplified rules target individual operators one at a time, making the process easier and more manageable. We use the following atomic rules for horizontal MLP fusion: - - - -* Fusing horizontal linear operators -* Fusing horizontal layernorms. -* Fusing horizontal activation functions. - - -![Fig.4 Pseudocode for fusing MLP. Traditional optimizations need manual Python code changes.](/assets/images/empowering-models-performance/fig4.jpg){:style="width:100%;"} - -

        Fig. 4: Pseudocode for fusing MLP. Traditional optimizations need manual Python code changes.
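
To make the first rule concrete, here is a hand-written, module-level illustration of fusing horizontal linear operators. This is not the compiler pass itself (the real transformation rewrites the captured Graph IR inside PT2), and the class and variable names are invented for illustration:

```
# Illustrative only: a module-level version of the "fuse horizontal linear
# operators" atomic rule. The real pass rewrites the Graph IR; names are made up.
import torch
import torch.nn as nn

class HorizontalLinears(nn.Module):
    """N independent linear branches applied to the same input."""
    def __init__(self, in_dim=128, out_dim=64, num_branches=4):
        super().__init__()
        self.branches = nn.ModuleList(
            nn.Linear(in_dim, out_dim) for _ in range(num_branches)
        )

    def forward(self, x):
        # Launches num_branches separate GEMM kernels.
        return [branch(x) for branch in self.branches]

class FusedHorizontalLinears(nn.Module):
    """The same computation expressed as one batched contraction."""
    def __init__(self, parallel: HorizontalLinears):
        super().__init__()
        # Stack per-branch parameters: weight (N, out, in), bias (N, out).
        self.weight = nn.Parameter(torch.stack([b.weight.detach() for b in parallel.branches]))
        self.bias = nn.Parameter(torch.stack([b.bias.detach() for b in parallel.branches]))

    def forward(self, x):
        # x: (batch, in) -> (N, batch, out) in a single batched matmul.
        out = torch.einsum("bi,noi->nbo", x, self.weight) + self.bias.unsqueeze(1)
        return list(out.unbind(0))

x = torch.randn(32, 128)
parallel = HorizontalLinears()
fused = FusedHorizontalLinears(parallel)
assert all(torch.allclose(a, b, atol=1e-5) for a, b in zip(parallel(x), fused(x)))
```

The layernorm and activation rules stack parameters in the same way, which is why chaining small rules composes into full horizontal MLP fusion.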

The beauty of these rules is that they’re not limited to one case; they can be applied broadly. Since PyTorch models are built from torch operators, focusing on a smaller set of operators simplifies the process. This approach is not only more manageable but also more general than writing a specific, large pattern-replacement rule, making it easier to optimize various models efficiently.


## Compile-time Graph Search

Our principle is to use chained atomic rules to replace heuristic rules. While this approach covers a wider range of cases, it entails a longer time for graph search and pattern matching. The next question is: how can we minimize compilation time while performing compile-time graph searches efficiently?

We design a two-step greedy algorithm, illustrated in Fig. 5. The first step is to identify the target nodes by following certain rules, e.g., identifying all linear operations with the same input shapes. Once the targets are identified, we use a Breadth-First Search (BFS) strategy to separate these nodes into different sets, so that nodes within a set have no data dependencies on each other. The nodes within each set are independent and can therefore be fused horizontally (a simplified sketch of this grouping follows Fig. 5).


![Fig.5 Process of model transformation with graph IR.](/assets/images/empowering-models-performance/fig5.jpg){:style="width:100%;"}

        Fig. 5: Process of model transformation with graph IR.
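
The figure above outlines the transformation flow; below is a simplified, illustrative sketch of the two-step grouping itself: identify candidate nodes, then use BFS over the dependency graph to split them into sets with no data dependencies. It operates on a generic node/edge representation and is not the actual compiler implementation; names are made up.

```
from collections import deque

def group_independent_nodes(candidates, deps):
    """Partition `candidates` into sets whose members have no data dependencies
    on each other, so that each set can be fused horizontally."""
    # reaches[n] holds every node n transitively depends on, discovered via BFS.
    reaches = {}
    for n in candidates:
        seen = set()
        queue = deque(deps.get(n, ()))
        while queue:
            m = queue.popleft()
            if m in seen:
                continue
            seen.add(m)
            queue.extend(deps.get(m, ()))
        reaches[n] = seen

    groups = []
    for n in candidates:
        placed = False
        for group in groups:
            # n may join a group only if neither side depends on the other.
            if all(n not in reaches[m] and m not in reaches[n] for m in group):
                group.append(n)
                placed = True
                break
        if not placed:
            groups.append([n])
    return groups

# Toy usage: linear1 and linear2 are independent; linear3 consumes linear1's output.
deps = {"linear1": {"input"}, "linear2": {"input"}, "linear3": {"linear1"}}
print(group_independent_nodes(["linear1", "linear2", "linear3"], deps))
# -> [['linear1', 'linear2'], ['linear3']]
```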

        - - -With our approach, the search time is roughly 60 seconds for one of our largest internal models, which is manageable for on-the-fly tasks. - - -## In the End - -In our tests with internal ranking models, we observed approximately 5% to 15% training performance improvement across five models on top of the performance gain brought by torch.compile. We have enabled the optimization in PT2 compiler stack and landed it as default when users choose Inductor as the backend ([config](https://github.com/pytorch/pytorch/blob/53acdb66f7ed31919cf69cf62e6ee0f13287be7e/torch/_inductor/config.py#L90)). We expect our generalized transformation approach could benefit models beyond Meta, and look forward to more discussion and improvement through this compiler level transformation framework. - - -## Acknowledgements - -Many thanks to Mark Saroufim, Gregory Chanan, Adnan Aziz, and Rocky Liu for their detailed and insightful reviews. diff --git a/_posts/2023-12-18-training-production-ai-models.md b/_posts/2023-12-18-training-production-ai-models.md deleted file mode 100644 index cb437bb55a2f..000000000000 --- a/_posts/2023-12-18-training-production-ai-models.md +++ /dev/null @@ -1,152 +0,0 @@ ---- -layout: blog_detail -title: "Training Production AI Models with PyTorch 2.0" -author: CK Luk, Daohang Shi, Yuzhen Huang, Jackie (Jiaqi) Xu, Jade Nie, Zhou Wang, Lu Fang, Flavio Sales Truzzi, Devashish Shankar, Dima Ivashchenko, Chunzhi Yang, Nicolas Macchioni, David Berard, Yu Guo, Xiaodong Wang, Bert Maher, Yanbo Liang, Edward Yang, Brian Hirsh, Michael Voznesensky, Animesh Jain, Michael Anderson ---- - - - -## 1. Introduction - -[PyTorch 2.0](https://pytorch.org/get-started/pytorch-2.0/) (abbreviated as PT2) can significantly improve the training and inference performance of an AI model using a compiler called _torch.compile_ while being 100% backward compatible with PyTorch 1.x. There have been reports on how PT2 improves the performance of common _benchmarks_ (e.g., [huggingface’s diffusers](https://huggingface.co/docs/diffusers/optimization/torch2.0)). In this blog, we discuss our experiences in applying PT2 to _production_ AI models at Meta. - - -## 2. Background - - -### 2.1 Why is automatic performance optimization important for production? - -Performance is particularly important for production—e.g, even a 5% reduction in the training time of a heavily used model can translate to substantial savings in GPU cost and data-center _power_. Another important metric is _development efficiency_, which measures how many engineer-months are required to bring a model to production. Typically, a significant part of this bring-up effort is spent on _manual_ performance tuning such as rewriting GPU kernels to improve the training speed. By providing _automatic_ performance optimization, PT2 can improve _both_ cost and development efficiency. - - -### 2.2 How PT2 improves performance - -As a compiler, PT2 can view _multiple_ operations in the training graph captured from a model (unlike in PT1.x, where only one operation is executed at a time). Consequently, PT2 can exploit a number of performance optimization opportunities, including: - - - -* **Fusing multiple operations into a single GPU kernel:** - * A typical type of performance overhead in running a GPU program is the CPU overhead of launching small GPU kernels. By fusing multiple operations into a single GPU kernel, PT2 can significantly reduce the kernel-launching overhead on the CPU. For instance, consider the PyTorch program in Figure 1(a). 
When it is executed on GPU with PT1, it has three GPU kernels (two for the two sin() ops and one for the addition op). With PT2, there is only one kernel generated, which fuses all three ops. - * After fusing some operations, certain operations in the graph may become dead and hence can be optimized away. This can save both compute and memory bandwidth on the GPU. For instance, in Figure 1(b), one of the duplicated sin() ops can be optimized away. - * In addition, fusion can also reduce GPU device memory reads/writes (by composing pointwise kernels) and help improve hardware utilization. - - - -![Fig.1 How PT2 improves performance with fusion and dead-code elimination.](/assets/images/training-production-ai-models/blog-fig1.jpg){:style="width:100%;"} - -

        Fig. 1: How PT2 improves performance with fusion and dead-code elimination.
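
To make this concrete, here is a hedged reconstruction of the kind of toy program described for Figure 1; the exact code in the figure is not reproduced, and the tensor size is arbitrary:

```
import torch

def f(x):
    # Eagerly, this launches three GPU kernels: sin, sin, and add.
    return torch.sin(x) + torch.sin(x)

compiled_f = torch.compile(f)

x = torch.randn(2**20, device="cuda")
y = compiled_f(x)  # Inductor emits a single fused kernel, and the duplicated
                   # sin is computed only once.
```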

        - - - -* **Reducing the type conversion overhead for using lower-precision data types:** - * PyTorch 1.x supports [Automatic Mixed Precision (AMP)](https://pytorch.org/tutorials/recipes/recipes/amp_recipe.html). While AMP can reduce the compute time of an op, it introduces type conversion overhead before and after the op. PT2 can increase AMP performance by optimizing away unnecessary type conversion code, significantly reducing its overhead. As an example, Figure 2(a) converts three 32-bit input tensors (a32, b32, c32) to bf16 before doing the matrix multiplications. Nevertheless, in this example, a32 and c32 are actually the same tensor (a_float32). So, there is no need to convert a_float32 twice, as shown in the code generated by torch.compile in Figure 2(b). Note that while both this example and the previous one optimize away redundant computations, they are different in the sense that the type conversion code in this example is _implicit_ via torch.autocast, unlike in the previous example where the torch.sin(x).cuda() is _explicit_ in user code. - - -![Fig.2 How PT2 reduces type conversion overhead when using AMP.](/assets/images/training-production-ai-models/blog-fig2.jpg){:style="width:100%;"} - -

        Fig. 2: How PT2 reduces type conversion overhead when using AMP.
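
Similarly, here is a hedged reconstruction of the AMP pattern described for Figure 2, with tensor names following the prose (the same tensor is passed as both a32 and c32); the figure's actual code is not reproduced:

```
import torch

def mm_twice(a_float32, b_float32, c_float32):
    with torch.autocast(device_type="cuda", dtype=torch.bfloat16):
        x = torch.mm(a_float32, b_float32)
        y = torch.mm(c_float32, b_float32)
    return x, y

a = torch.randn(1024, 1024, device="cuda")
b = torch.randn(1024, 1024, device="cuda")

# c32 is the same tensor as a32, so eager AMP inserts a bf16 cast for each use,
# while torch.compile can reuse a single cast of a_float32.
eager_out = mm_twice(a, b, a)
compiled_out = torch.compile(mm_twice)(a, b, a)
```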

        - - - - -* **Reusing buffers on the GPU:** - * With a global view, the scheduler in torch.compile can reuse buffers on the GPU, thereby reducing both memory allocation time and memory consumption. Figure 3 shows the driver program that calls the Triton kernels generated for the program in Figure 2(a). We can see that `buf1` is reused as` buf4`. - - -![Fig.3 Reuse of buffers.](/assets/images/training-production-ai-models/blog-fig3.jpg){:style="width:100%;"} - -

        Fig. 3: Reuse of buffers.

        - - - - - - -* **Autotuning:** - * PT2 has options to enable autotuning (via Triton) on matrix-multiply ops, pointwise ops, and reduction ops. Tunable parameters include block size, number of stages, and number of warps. With autotuning, the most performant implementation of an op can be found empirically. - - -## 3. Production environment considerations - -In this section, we describe a number of important considerations in applying PT2 to production. - - -### 3.1 Ensuring no model quality degradation with torch.compile - -Applying torch.compile to a model will cause numerical changes because of (1) reordering of floating-point ops during various optimizations such as fusion and (2) use of lower precision data types like bf16 if AMP is enabled. Therefore 100% bitwise compatibility with PT 1.x is not expected. Nevertheless, we still need to make sure that the model quality (measured in some form of numeric scores) is preserved after applying torch.compile. Typically, each production model will have its own range of acceptable scores (e.g., percentage change must be within 0.01%). - -In case of a model-quality drop caused by torch.compile, we need to do a deep-dive debug. - -One useful technique for debugging a torch.compile-related numeric issue is to apply torch.compile with different backends, in particular “eager” and “aot_eager”, in addition to “inductor”: - - - -* If the numeric issue happens with the “eager” backend, then the forward graph constructed by torch.compile is likely incorrect; -* If the numeric issue doesn’t happen with “eager” but happens with “aot_eager”, then the backward graph constructed by torch.compile is likely incorrect; -* If the numeric issue doesn’t happen with either “eager” or “aot_eager” but happens with “inductor”, then the code generation inside the inductor is likely incorrect. - - -### 3.2 Autotuning in production - -By default, the autotuning in torch.inductor is done _online_ while the model is executed. For some production models, we find that the autotuning time can take several hours, which is not acceptable for production. Therefore, we add _offline autotuning_ which works as depicted in Figure 4. The very first time that a model is run, the details (e.g., input tensor shape, data type etc) on all ops that require tuning will be logged to a database. Then, a tuning process for these ops is run overnight to search for the most performant implementation of each op; the search result is updated to a persistent cache (implemented as a source file of torch.inductor). Next time when the model is run again, the tuned implementation of each op will be found in the cache and chosen for execution. - - -![Fig.4 The offline autotuning used in production.](/assets/images/training-production-ai-models/blog-fig4.jpg){:style="width:100%;"} - -

        Fig. 4: The offline autotuning used in production.
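
The offline flow in Figure 4 is an internal production workflow. For open source users, the online autotuning mentioned in Section 2.2 can be requested through torch.compile's mode argument; a minimal, hedged example with a placeholder model:

```
import torch
import torch.nn as nn

model = nn.Sequential(nn.Linear(1024, 4096), nn.ReLU(), nn.Linear(4096, 1024)).cuda()

# "max-autotune" asks Inductor to benchmark candidate Triton configurations
# (e.g. block sizes, number of stages and warps) and cache the fastest choice.
compiled_model = torch.compile(model, mode="max-autotune")
out = compiled_model(torch.randn(64, 1024, device="cuda"))
```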

### 3.3 Profiling support for torch.compile

As we previously discussed in this [blog](https://pytorch.org/blog/performance-deb), a profiler is essential for debugging the performance of production models. We have enhanced the profiler to display torch.compile-related events on the timeline. The most useful events mark which parts of the model are running compiled code, so that we can quickly validate whether the parts of the model that are supposed to be compiled are actually compiled by torch.compile. For example, the trace in Figure 5 has two compiled regions (with the label “CompiledFunction”). Other useful events show the time spent on compilation and on accessing the compiler’s code cache.


![Fig.5 A trace with two compiled regions.](/assets/images/training-production-ai-models/blog-fig5.jpg){:style="width:100%;"}

        Fig. 5: A trace with two compiled regions.
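
A hedged sketch of capturing such a trace around a compiled model with the PyTorch profiler (the model and output file name are placeholders; labels such as “CompiledFunction” come from the profiler itself):

```
import torch
from torch.profiler import ProfilerActivity, profile

model = torch.nn.Linear(1024, 1024).cuda()
compiled_model = torch.compile(model)
x = torch.randn(64, 1024, device="cuda")

compiled_model(x)  # warm up so compilation itself doesn't dominate the trace

with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA]) as prof:
    compiled_model(x)

prof.export_chrome_trace("compiled_model_trace.json")  # placeholder file name
```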

### 3.4 Controlling just-in-time compilation time

torch.compile uses just-in-time compilation. The compilation happens when the first batch of data is trained. In our production setting, there is an upper limit on how much time is allowed for a training job to reach its first batch, aka _Time-To-First-Batch (TTFB)_. We need to make sure that enabling torch.compile will not increase TTFB to over the limit. This could be challenging because production models are large and torch.compile can take substantial compilation time. We enable _parallel compilation_ to keep the compile time under control (this is controlled by the global variable `compile_threads` inside `torch/_inductor/config.py`, which is already set to the CPU count on OSS Linux). A model is decomposed into one or more computational graphs; each graph is decomposed into multiple Triton kernels. If parallel compilation is enabled, all the Triton kernels in the same graph can be compiled simultaneously (nevertheless, kernels from different graphs are still compiled in serial). Figure 6 illustrates how parallel compilation helps.


![Fig.6 Using parallel compilation in production.](/assets/images/training-production-ai-models/blog-fig6.jpg){:style="width:100%;"}

        Fig. 6: Using parallel compilation in production.
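
As noted above, parallel compilation is governed by `compile_threads` in `torch/_inductor/config.py`. On OSS Linux it already defaults to the CPU count, so the override below is purely illustrative:

```
import torch
import torch._inductor.config as inductor_config

# Illustrative override only; the default on OSS Linux is already the CPU count.
inductor_config.compile_threads = 16

compiled_model = torch.compile(torch.nn.Linear(512, 512).cuda())
out = compiled_model(torch.randn(8, 512, device="cuda"))
```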

        - - -## 4. Results - -In this section, we use three production models to evaluate PT2. First we show the training time speedups with PT2, using different optimization configs. Second, we show the importance of parallel compilation on the compilation time. - - -### 4.1 Training-time speedup with torch.compile - -Figure 7 reports the training-time speedup with PT2. For each model, we show four cases: (i) no-compile with bf16, (ii) compile with fp32, (iii) compile with bf16, (iv) compile with bf16 and autotuning. The y-axis is the speedup over the baseline, which is no-compile with fp32. Note that no-compile with bf16 is actually slower than no-compile with fp32, due to the type conversion overhead. In contrast, compiling with bf16 achieves much larger speedups by reducing much of this overhead. Overall, given that these models are already heavily optimized by hand, we are excited to see that torch.compile can still provide 1.14-1.24x speedup. - - -![Fig.7 Training-time speedup with torch.compile (note: the baseline, no-compile/fp32, is omitted in this figure).](/assets/images/training-production-ai-models/blog-fig7.jpg){:style="width:100%;"} - -

        Fig. 7: Training-time speedup with torch.compile (note: the baseline, no-compile/fp32, is omitted in this figure).

        - - - -### 4.2 Compilation-time reduction with parallel compilation - -Figure 8 shows the compilation time with and without parallel compilation. While there is still room for improvement on the serial compilation time, parallel compilation has reduced the compilation overhead on TTFB to an acceptable level. Models B and C benefit more from parallel compilation than Model A does because they have more distinct Triton kernels per graph. - - -![Fig.8 PT2 compilation time.](/assets/images/training-production-ai-models/blog-fig8.jpg){:style="width:100%;"} - -

        Fig. 8: PT2 compilation time.

        - - - -## 5. Concluding Remarks - -In this blog, we demonstrate that PT2 can significantly accelerate the training of large and complex production AI models with reasonable compilation time. In our next blog, we will discuss how PT2 can do general graph transformations. - - -## 6. Acknowledgements - -Many thanks to [Mark Saroufim](mailto:marksaroufim@meta.com), [Adnan Aziz](mailto:adnanaziz@fb.com), and [Gregory Chanan](mailto:gchanan@meta.com) for their detailed and insightful reviews. \ No newline at end of file diff --git a/_posts/2023-12-19-understanding-gpu-memory-2.md b/_posts/2023-12-19-understanding-gpu-memory-2.md deleted file mode 100644 index 7a655778091d..000000000000 --- a/_posts/2023-12-19-understanding-gpu-memory-2.md +++ /dev/null @@ -1,383 +0,0 @@ ---- -layout: blog_detail -title: "Understanding GPU Memory 2: Finding and Removing Reference Cycles" -author: Aaron Shi, Zachary DeVito ---- - -This is part 2 of the Understanding GPU Memory blog series. Our first post [Understanding GPU Memory 1: Visualizing All Allocations over Time](/blog/understanding-gpu-memory-1/) shows how to use the memory snapshot tool. In this part, we will use the Memory Snapshot to visualize a GPU memory leak caused by reference cycles, and then locate and remove them in our code using the Reference Cycle Detector. - -Sometimes when we were using the Memory Snapshot, we saw plots of GPU memory that looked similar to this. - - - -![GPU memory](/assets/images/understanding-gpu-memory-1/memory_leak_oom.jpg){:style="width:100%;"} - - -In this snapshot, each peak shows GPU tensors building up over time and then several tensors getting released at once. In addition, a CUDA OOM happens on the right side causing all the tensors to be released. Seeing the tensors accumulate like this is a **clear indication of a problem, but it doesn't immediately suggest why**. - - -## Tensors in Reference Cycles - -During early debugging, we dug in further to find that this **pattern happens a lot when your Python code has objects with reference cycles. ** Python will clean up non-cyclic objects immediately using reference counting. However objects in reference cycles are only cleaned up later by a cycle collector. If these cycles refer to a GPU tensor, the GPU tensor will stay alive until that cycle collector runs and removes the reference cycle. Let’s take a look at a simplified example. - - - -![Simple reference cycle](/assets/images/understanding-gpu-memory-1/simple_reference_cycle.png){:style="width:100%; max-width:400px; margin-right: auto; margin-left: auto; display: block;"} - - -**Code Snippet behind the snapshot (full code in Appendix A):** - - -``` - def leak(tensor_size, num_iter=100000, device="cuda:0"): - class Node: - def __init__(self, T): - self.tensor = T - self.link = None - - for _ in range(num_iter): - A = torch.zeros(tensor_size, device=device) - B = torch.zeros(tensor_size, device=device) - a, b = Node(A), Node(B) - - # A reference cycle will force refcounts to be non-zero. - a.link, b.link = b, a - # Python will eventually garbage collect a & b, but will - # OOM on the GPU before that happens (since python - # runtime doesn't know about CUDA memory usage). -``` - - -In this code example, the tensors A and B are created, where A has a link to B and vice versa. This forces a non-zero reference count when A and B go out of scope. When we run this for 100,000 iterations, we expect the automatic garbage collection to free the reference cycles when going out of scope. 
However, this will actually CUDA OOM. - - -### Why doesn’t automatic garbage collection work? - -The automatic garbage collection works well when there is a lot of extra memory as is common on CPUs because it amortizes the expensive garbage collection by using [Generational Garbage Collection](https://en.wikipedia.org/wiki/Tracing_garbage_collection#Generational_GC_(ephemeral_GC)). But to amortize the collection work, it defers some memory cleanup making the maximum memory usage higher, which is less suited to memory constrained environments. The Python runtime also has no insights into CUDA memory usage, so it cannot be triggered on high memory pressure either. It’s even more challenging as GPU training is almost always memory constrained because we will often raise the batch size to use any additional free memory. - -The CPython’s garbage collection frees unreachable objects held in reference cycles via the **mark-and-sweep**. The garbage collection is automatically run when the number of objects exceeds certain thresholds. There are **3 generations of thresholds** to help amortize the expensive costs of running garbage collection on every object. The later generations are less frequently run. This would explain why automatic collections will only clear several tensors on each peak, however there are still tensors that leak resulting in the CUDA OOM. Those tensors were held by reference cycles in later generations. - - -## Explicitly calling gc.collect() - -One way to fix this is by explicitly calling the garbage collector frequently. Here we can see that the GPU memory for tensors out of scope gets cleaned up when we explicitly call the garbage collector every 100 iterations. This also controls the maximum GPU peak memory held by leaking tensors. - - -![memory leak](/assets/images/understanding-gpu-memory-1/memory_leak_gc_collect.jpg){:style="width:100%;"} - - -Although this works and fixes the CUDA OOM issue, calling gc.collect() too frequently can cause other issues including **QPS regressions**. Therefore we cannot simply increase the frequency of garbage collection on every training job. **It's best to just avoid creating reference cycles in the first place**. More on this in section, Reference Cycle Detector. - - -## Sneaky Memory Leak in Callback - -Real examples are more complicated, so let’s look at a more realistic example that has a similar behavior. In this snapshot, we can observe the same behavior of tensors being accumulated and freed during automatic garbage collection, until we hit a CUDA OOM. - - - -![memory leak](/assets/images/understanding-gpu-memory-1/memory_leak_awaitable.jpg){:style="width:100%;"} - - -**Code Snippet behind this snapshot (full code sample in Appendix A):** - - -``` - class AwaitableTensor: - def __init__(self, tensor_size): - self._tensor_size = tensor_size - self._tensor = None - - def wait(self): - self._tensor = torch.zeros(self._tensor_size, device="cuda:0") - return self._tensor - - class AwaitableTensorWithViewCallback: - def __init__(self, tensor_awaitable, view_dim): - self._tensor_awaitable = tensor_awaitable - self._view_dim = view_dim - # Add a view filter callback to the tensor. 
- self._callback = lambda ret: ret.view(-1, self._view_dim) - - def wait(self): - return self._callback(self._tensor_awaitable.wait()) - - async def awaitable_leak( - tensor_size=2**27, num_iter=100000, - ): - for _ in range(num_iter): - A = AwaitableTensor(tensor_size) - AwaitableTensorWithViewCallBack(A, 4).wait() -``` - - -In this code, we define two classes. The class AwaitableTensor will create a tensor when waited upon. Another class AwaitableTensorWithViewCallback will apply a view filter on the AwaitableTensor via callback lambda. - -When running awaitable_leak, which creates tensor A (512 MB) and applies a view filter for 100,000 iterations, we expect that A should be reclaimed each time it goes out of scope because the reference count should reach 0. However, this will actually OOM! - -While we know there is a reference cycle here, it isn't clear from the code where the cycle is created. **To help with these situations, we have created a tool to locate and report these cycles.** - - -## Reference Cycle Detector - -Introducing the **Reference Cycle Detector**, which helps us find reference cycles keeping GPU tensors alive. The API is fairly simple: - - - -* During model initialization: - * **Import:** `from torch.utils.viz._cycles import warn_tensor_cycles` - * **Start:** `warn_tensor_cycles()` - -The Reference Cycle Detector will issue warnings every time that the cycle collector runs and finds a CUDA tensor that gets freed. The warning provides an **object graph** showing how the reference cycle refers to the GPU tensor. - - - -![object graph](/assets/images/understanding-gpu-memory-1/awaitable_leak_cycle.png){:style="width:100%;"} - - -For instance in this object graph, we can easily observe that there is a **circular dependency on the outer circle of the graph**, and **highlighted in red is the GPU tensor kept alive**. - -Most cycles are pretty easy to fix once they are discovered. For instance here we can remove the reference to self created by self._view_dim in the callback. - - - -![code snippet](/assets/images/understanding-gpu-memory-1/awaitable_code_snippet.png){:style="width:100%;"} - - -We've spent some time fixing cycles in existing models using these tools. For example in TorchRec, we’ve found and removed a reference cycle in [PR#1226](https://github.com/pytorch/torchrec/pull/1226). - - - -![code snippet](/assets/images/understanding-gpu-memory-1/torchrec_code_snippet.png){:style="width:100%;"} - - -Once we’ve removed the reference cycles, the code will **no longer issue a CUDA OOM nor show any memory leaks** in their snapshots. - - -### What are the other benefits of using the Reference Cycle Detector? - -Removing these cycles will also directly **lower the maximum GPU memory usage** as well as make it **less likely for memory to fragment** because the allocator returns to the same state after each iteration. - - -## Where can I find these tools? - -We hope that the Reference Cycle Detector will greatly improve your ability to find and remove memory leaks caused by reference cycles. The Reference Cycle Detector is available in the v2.1 release of PyTorch as experimental features and More information about the Reference Cycle Detector can be found in the [PyTorch Memory docs here](https://pytorch.org/docs/main/torch_cuda_memory.html). - - -## Feedback - -We look forward to hearing from you about any enhancements, bugs or memory stories that our tools helped to solve! As always, please feel free to open new issues on PyTorch’s Github page. 
- -We are also open to contributions from the OSS community, feel free to tag [Aaron Shi](https://github.com/aaronenyeshi) and [Zachary DeVito](https://github.com/zdevito) in any Github PRs for reviews. - - -## Acknowledgements - -Really appreciate the content reviewers, [Mark Saroufim](mailto:marksaroufim@meta.com), [Gregory Chanan](mailto:gchanan@meta.com), and [Adnan Aziz](mailto:adnanaziz@meta.com) for reviewing this post and improving its readability. - - -## Appendix - - -### Appendix A - Code Sample - -This code snippet was used to generate the plots and examples shown. Here are the arguments to reproduce the sections: - - - -* Introduction: `python sample.py` -* Explicitly calling gc.collect(): `python sample.py --gc_collect_interval=100` -* Sneaky Memory Leak in Callback: `python sample.py --workload=awaitable` -* Ref Cycle Detector: `python sample.py --workload=awaitable --warn_tensor_cycles` - -**sample.py:** - - -``` -# (c) Meta Platforms, Inc. and affiliates. -import argparse -import asyncio -import gc -import logging -import socket -from datetime import datetime, timedelta - -import torch - -logging.basicConfig( - format="%(levelname)s:%(asctime)s %(message)s", - level=logging.INFO, - datefmt="%Y-%m-%d %H:%M:%S", -) -logger: logging.Logger = logging.getLogger(__name__) -logger.setLevel(level=logging.INFO) - -TIME_FORMAT_STR: str = "%b_%d_%H_%M_%S" - -# Keep a max of 100,000 alloc/free events in the recorded history -# leading up to the snapshot. -MAX_NUM_OF_MEM_EVENTS_PER_SNAPSHOT: int = 100000 - -def start_record_memory_history() -> None: - if not torch.cuda.is_available(): - logger.info("CUDA unavailable. Not recording memory history") - return - - logger.info("Starting snapshot record_memory_history") - torch.cuda.memory._record_memory_history( - max_entries=MAX_NUM_OF_MEM_EVENTS_PER_SNAPSHOT - ) - -def stop_record_memory_history() -> None: - if not torch.cuda.is_available(): - logger.info("CUDA unavailable. Not recording memory history") - return - - logger.info("Stopping snapshot record_memory_history") - torch.cuda.memory._record_memory_history(enabled=None) - -def export_memory_snapshot() -> None: - if not torch.cuda.is_available(): - logger.info("CUDA unavailable. Not exporting memory snapshot") - return - - # Prefix for file names. - host_name = socket.gethostname() - timestamp = datetime.now().strftime(TIME_FORMAT_STR) - file_prefix = f"{host_name}_{timestamp}" - - try: - logger.info(f"Saving snapshot to local file: {file_prefix}.pickle") - torch.cuda.memory._dump_snapshot(f"{file_prefix}.pickle") - except Exception as e: - logger.error(f"Failed to capture memory snapshot {e}") - return - -# This function will leak tensors due to the reference cycles. -def simple_leak(tensor_size, gc_interval=None, num_iter=30000, device="cuda:0"): - class Node: - def __init__(self, T): - self.tensor = T - self.link = None - - for i in range(num_iter): - A = torch.zeros(tensor_size, device=device) - B = torch.zeros(tensor_size, device=device) - a, b = Node(A), Node(B) - # A reference cycle will force refcounts to be non-zero, when - # a and b go out of scope. - a.link, b.link = b, a - # Python will eventually gc a and b, but may OOM on the CUDA - # device before that happens (since python runtime doesn't - # know about CUDA memory usage). - - # Since implicit gc is not called frequently enough due to - # generational gc, adding an explicit gc is necessary as Python - # runtime does not know about CUDA memory pressure. 
- # https://en.wikipedia.org/wiki/Tracing_garbage_collection#Generational_GC_(ephemeral_GC) - if gc_interval and i % int(gc_interval) == 0: - gc.collect() - -async def awaitable_leak( - tensor_size, gc_interval=None, num_iter=100000, device="cuda:0" -): - class AwaitableTensor: - def __init__(self, tensor_size, device) -> None: - self._tensor_size = tensor_size - self._device = device - self._tensor = None - - def wait(self) -> torch.Tensor: - self._tensor = torch.zeros(self._tensor_size, device=self._device) - return self._tensor - - class AwaitableTensorWithViewCallBack: - def __init__( - self, - tensor_awaitable: AwaitableTensor, - view_dim: int, - ) -> None: - self._tensor_awaitable = tensor_awaitable - self._view_dim = view_dim - # Add a view filter callback to the tensor. - self._callback = lambda ret: ret.view(-1, self._view_dim) - - def wait(self) -> torch.Tensor: - return self._callback(self._tensor_awaitable.wait()) - - for i in range(num_iter): - # Create an awaitable tensor - a_tensor = AwaitableTensor(tensor_size, device) - - # Apply a view filter callback on the awaitable tensor. - AwaitableTensorWithViewCallBack(a_tensor, 4).wait() - - # a_tensor will go out of scope. - - if gc_interval and i % int(gc_interval) == 0: - gc.collect() - -if __name__ == "__main__": - parser = argparse.ArgumentParser(description="A memory_leak binary instance") - parser.add_argument( - "--gc_collect_interval", - default=None, - help="Explicitly call GC every given interval. Default is off.", - ) - parser.add_argument( - "--workload", - default="simple", - help="Toggle which memory leak workload to run. Options are simple, awaitable.", - ) - parser.add_argument( - "--warn_tensor_cycles", - action="store_true", - default=False, - help="Toggle whether to enable reference cycle detector.", - ) - args = parser.parse_args() - - if args.warn_tensor_cycles: - from tempfile import NamedTemporaryFile - - from torch.utils.viz._cycles import observe_tensor_cycles - - logger.info("Enabling warning for Python reference cycles for CUDA Tensors.") - - def write_and_log(html): - with NamedTemporaryFile("w", suffix=".html", delete=False) as f: - f.write(html) - logger.warning( - "Reference cycle includes a CUDA Tensor see visualization of cycle %s", - f.name, - ) - - observe_tensor_cycles(write_and_log) - else: - # Start recording memory snapshot history - start_record_memory_history() - - # Run the workload with a larger tensor size. - # For smaller sizes, we will not CUDA OOM as gc will kick in often enough - # to reclaim reference cycles before an OOM occurs. 
- size = 2**26 # 256 MB - try: - if args.workload == "awaitable": - size *= 2 - logger.info(f"Running tensor_size: {size*4/1024/1024} MB") - asyncio.run( - awaitable_leak(tensor_size=size, gc_interval=args.gc_collect_interval) - ) - elif args.workload == "simple": - logger.info(f"Running tensor_size: {size*4/1024/1024} MB") - simple_leak(tensor_size=size, gc_interval=args.gc_collect_interval) - else: - raise Exception("Unknown workload.") - except Exception: - logger.exception(f"Failed to allocate {size*4/1024/1024} MB") - - # Create the memory snapshot file - export_memory_snapshot() - - # Stop recording memory snapshot history - stop_record_memory_history() -``` \ No newline at end of file diff --git a/_posts/2024-01-03-accelerating-generative-ai-3.md b/_posts/2024-01-03-accelerating-generative-ai-3.md deleted file mode 100644 index e51062ad6fa1..000000000000 --- a/_posts/2024-01-03-accelerating-generative-ai-3.md +++ /dev/null @@ -1,344 +0,0 @@ ---- -layout: blog_detail -title: "Accelerating Generative AI Part III: Diffusion, Fast" -author: Sayak Paul and Patrick von Platen (Hugging Face 🤗) ---- - -This post is the third part of a multi-series blog focused on how to accelerate generative AI models with pure, native PyTorch. We are excited to share a breadth of newly released PyTorch performance features alongside practical examples to see how far we can push PyTorch native performance. In part one, we showed how to accelerate [Segment Anything over 8x](https://pytorch.org/blog/accelerating-generative-ai/) using only pure, native PyTorch. In part two, we showed how to accelerate [Llama-7B by almost 10x](https://pytorch.org/blog/accelerating-generative-ai-2/) using only native PyTorch optimizations. In this blog, we’ll focus on speeding up text-to-image diffusion models by upto 3x. - -We will leverage an array of optimizations including: - - - -* Running with the bfloat16 precision -* scaled_dot_product_attention (SPDA) -* torch.compile -* Combining q,k,v projections for attention computation -* Dynamic int8 quantization - -We will primarily focus on Stable Diffusion XL (SDXL), demonstrating a latency improvement of 3x. These techniques are PyTorch-native, which means you don’t have to rely on any third-party libraries or any C++ code to take advantage of them. - -Enabling these optimizations with the 🤗Diffusers library takes just a few lines of code. If you’re already feeling excited and cannot wait to jump to the code, check out the accompanying repository here: [https://github.com/huggingface/diffusion-fast](https://github.com/huggingface/diffusion-fast). - - - -![SDXL Chart](/assets/images/accelerating-generative-ai-3/fg1.png){:style="width:100%;"} - - -_(The discussed techniques are not SDXL-specific and can be used to speed up other text-to-image diffusion systems, as shown later.)_ - -Below, you can find some blog posts on similar topics: - - - -* [Accelerated Diffusers with PyTorch 2.0](https://pytorch.org/blog/accelerated-diffusers-pt-20/) -* [Exploring simple optimizations for SDXL](https://huggingface.co/blog/simple_sdxl_optimizations) -* [Accelerated Generative Diffusion Models with PyTorch 2](https://pytorch.org/blog/accelerated-generative-diffusion-models/) - - -## Setup - -We will demonstrate the optimizations and their respective speed-up gains using the 🤗[Diffusers library](https://github.com/huggingface/diffusers). 
Apart from that, we will make use of the following PyTorch-native libraries and environments: - - - -* Torch nightly (to benefit from the fastest kernels for efficient attention; 2.3.0.dev20231218+cu121) -* 🤗 PEFT (version: 0.7.1) -* torchao (commit SHA: 54bcd5a10d0abbe7b0c045052029257099f83fd9) -* CUDA 12.1 - -For an easier reproduction environment, you can also refer to this [Dockerfile](https://github.com/huggingface/sdxl-fast/blob/main/Dockerfile). The benchmarking numbers presented in this post come from a 400W 80GB A100 GPU (with its clock rate set to its maximum capacity). - -Since we use an A100 GPU (Ampere architecture) here, we can specify `torch.set_float32_matmul_precision("high")` to benefit from the [TF32 precision format](https://blogs.nvidia.com/blog/tensorfloat-32-precision-format/). - - -## Run inference using a reduced precision - -Running SDXL in Diffusers just takes a few lines of code: - - -``` -from diffusers import StableDiffusionXLPipeline - -## Load the pipeline in full-precision and place its model components on CUDA. -pipe = StableDiffusionXLPipeline.from_pretrained("stabilityai/stable-diffusion-xl-base-1.0").to("cuda") - -## Run the attention ops without efficiency. -pipe.unet.set_default_attn_processor() -pipe.vae.set_default_attn_processor() - -prompt = "Astronaut in a jungle, cold color palette, muted colors, detailed, 8k" -image = pipe(prompt, num_inference_steps=30).images[0] -``` - - -But this isn’t very practical as it takes **7.36 seconds** to generate a single image with 30 steps. This is our baseline which we will try to optimize one step at a time. - - -![SDXL Chart](/assets/images/accelerating-generative-ai-3/fg2.png){:style="width:100%;"} - - -Here, we’re running the pipeline with the full precision. We can immediately cut down the inference time by using a reduced precision such as [bfloat16](https://cloud.google.com/tpu/docs/bfloat16). Besides, modern GPUs come with dedicated cores for running accelerated computation benefiting from reduced precision. To run the computations of the pipeline in the bfloat16 precision, we just need to specify the data type while initializing the pipeline: - - -``` -from diffusers import StableDiffusionXLPipeline - -pipe = StableDiffusionXLPipeline.from_pretrained( - "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.bfloat16 -).to("cuda") - -## Run the attention ops without efficiency. -pipe.unet.set_default_attn_processor() -pipe.vae.set_default_attn_processor() -prompt = "Astronaut in a jungle, cold color palette, muted colors, detailed, 8k" -image = pipe(prompt, num_inference_steps=30).images[0] -``` - - -![SDXL Chart](/assets/images/accelerating-generative-ai-3/fg3.png){:style="width:100%;"} - - -By using a reduced precision, we’re able to cut down the inference latency from **7.36 seconds to 4.63 seconds**. - -**Some notes on the use of bfloat16** - - - -* Using a reduced numerical precision (such as float16, bfloat16) to run inference doesn’t affect the generation quality but significantly improves latency. -* The benefits of using the [bfloat16](https://cloud.google.com/tpu/docs/bfloat16) numerical precision as compared to float16 are hardware-dependent. Modern generations of GPUs tend to favor bfloat16. -* Furthermore, in our experiments, we bfloat16 to be much more resilient when used with quantization in comparison to float16. 
- -_(We later ran the experiments in float16 and found out that the recent versions of torchao do not incur numerical problems from float16.)_ - - -## Use SDPA for performing attention computations - -By default, Diffusers uses `scaled_dot_product_attention` (SDPA) for performing attention-related computations when using PyTorch 2. SDPA provides faster and more efficient kernels to run intensive attention-related operations. To run the pipeline SDPA, we simply don’t set any attention processor like so: - - -``` -from diffusers import StableDiffusionXLPipeline - -pipe = StableDiffusionXLPipeline.from_pretrained( - "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.bfloat16 -).to("cuda") - -prompt = "Astronaut in a jungle, cold color palette, muted colors, detailed, 8k" -image = pipe(prompt, num_inference_steps=30).images[0] -``` - - -SDPA gives a nice boost from **4.63 seconds to 3.31 seconds**. - - -![SDXL Chart](/assets/images/accelerating-generative-ai-3/fg4.png){:style="width:100%;"} - - - -## Compiling the UNet and VAE - -We can ask PyTorch to perform some low-level optimizations (such as operator fusion and launching faster kernels with CUDA graphs) by using `torch.compile`. For the `StableDiffusionXLPipeline`, we compile the denoiser (UNet) and the VAE: - - -``` -from diffusers import StableDiffusionXLPipeline -import torch - -pipe = StableDiffusionXLPipeline.from_pretrained( - "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.bfloat16 -).to("cuda") - -## Compile the UNet and VAE. -pipe.unet = torch.compile(pipe.unet, mode="max-autotune", fullgraph=True) -pipe.vae.decode = torch.compile(pipe.vae.decode, mode="max-autotune", fullgraph=True) - -prompt = "Astronaut in a jungle, cold color palette, muted colors, detailed, 8k" - -## First call to `pipe` will be slow, subsequent ones will be faster. -image = pipe(prompt, num_inference_steps=30).images[0] -``` - - -Using SDPA attention and compiling both the UNet and VAE reduces the latency from **3.31 seconds to 2.54 seconds**. - -![SDXL Chart](/assets/images/accelerating-generative-ai-3/fg5.png){:style="width:100%;"} - - -**Notes on torch.compile** - -`torch.compile` offers different backends and modes. As we’re aiming for maximum inference speed, we opt for the inductor backend using the “max-autotune”. “max-autotune” uses CUDA graphs and optimizes the compilation graph specifically for latency. Using CUDA graphs greatly reduces the overhead of launching GPU operations. It saves time by using a mechanism to launch multiple GPU operations through a single CPU operation. - -Specifying `fullgraph` to be `True` ensures that there are no graph breaks in the underlying model, ensuring the fullest potential of `torch.compile`. In our case, the following compiler flags were also important to be explicitly set: - - -``` -torch._inductor.config.conv_1x1_as_mm = True -torch._inductor.config.coordinate_descent_tuning = True -torch._inductor.config.epilogue_fusion = False -torch._inductor.config.coordinate_descent_check_all_directions = True -``` - - -For the full list of compiler flags, refer to [this file.](https://github.com/pytorch/pytorch/blob/main/torch/_inductor/config.py) - -We also change the memory layout of the UNet and the VAE to “channels_last” when compiling them to ensure maximum speed: - - -``` -pipe.unet.to(memory_format=torch.channels_last) -pipe.vae.to(memory_format=torch.channels_last) -``` - - -In the next section, we’ll show how to improve the latency even further. 
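Before moving on, a quick note on measurement: the per-image latencies quoted in this post come from the benchmarking scripts in the diffusion-fast repository. The helper below is only a rough sketch of how such numbers can be reproduced; the function name and the warm-up/run counts are our own illustrative choices. The warm-up calls keep the one-time `torch.compile` and autotuning cost out of the timed runs.


```
import time
import torch

def benchmark_pipeline(pipe, prompt, num_inference_steps=30, warmup=3, runs=5):
    # Warm-up calls absorb one-time compilation and autotuning overhead.
    for _ in range(warmup):
        _ = pipe(prompt, num_inference_steps=num_inference_steps)

    torch.cuda.synchronize()
    start = time.perf_counter()
    for _ in range(runs):
        _ = pipe(prompt, num_inference_steps=num_inference_steps)
    torch.cuda.synchronize()
    return (time.perf_counter() - start) / runs

# Example usage (assumes `pipe` was built as shown above):
# latency = benchmark_pipeline(pipe, "Astronaut in a jungle, cold color palette, muted colors, detailed, 8k")
# print(f"Average latency: {latency:.2f} s per image")
```
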
- - -## Additional optimizations - - -### No graph breaks during `torch.compile` - -Ensuring that the underlying model/method can be fully compiled is crucial for performance (`torch.compile` with `fullgraph=True`). This means having no graph breaks. We did this for the UNet and VAE by changing how we access the returning variables. Consider the following example: - - -![code example](/assets/images/accelerating-generative-ai-3/fg5b.jpg){:style="width:100%;"} - - -### Getting rid of GPU syncs after compilation - -During the iterative reverse diffusion process, we [call](https://github.com/huggingface/diffusers/blob/1d686bac8146037e97f3fd8c56e4063230f71751/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py#L1228) `step()` on the scheduler each time after the denoiser predicts the less noisy latent embeddings. Inside `step()`, the `sigmas` variable is [indexed](https://github.com/huggingface/diffusers/blob/1d686bac8146037e97f3fd8c56e4063230f71751/src/diffusers/schedulers/scheduling_euler_discrete.py#L476). If the `sigmas` array is placed on the GPU, indexing causes a communication sync between the CPU and GPU. This causes a latency, and it becomes more evident when the denoiser has already been compiled. - -But if the `sigmas `array always stays on the CPU (refer to [this line](https://github.com/huggingface/diffusers/blob/35a969d297cba69110d175ee79c59312b9f49e1e/src/diffusers/schedulers/scheduling_euler_discrete.py#L240)), this sync doesn’t take place, hence improved latency. In general, any CPU <-> GPU communication sync should be none or be kept to a bare minimum as it can impact inference latency. - - -### Using combined projections for attention ops - -Both the UNet and the VAE used in SDXL make use of Transformer-like blocks. A Transformer block consists of attention blocks and feed-forward blocks. - -In an attention block, the input is projected into three sub-spaces using three different projection matrices – Q, K, and V. In the naive implementation, these projections are performed separately on the input. But we can horizontally combine the projection matrices into a single matrix and perform the projection in one shot. This increases the size of the matmuls of the input projections and improves the impact of quantization (to be discussed next). - -Enabling this kind of computation in Diffusers just takes a single line of code: - - -``` -pipe.fuse_qkv_projections() -``` - - -This will make the attention operations for both the UNet and the VAE take advantage of the combined projections. For the cross-attention layers, we only combine the key and value matrices. To learn more, you can refer to the official documentation [here](https://huggingface.co/docs/diffusers/main/en/api/pipelines/stable_diffusion/stable_diffusion_xl#diffusers.StableDiffusionXLPipeline.fuse_qkv_projections). It’s worth noting that we [leverage](https://github.com/huggingface/diffusers/blob/35a969d297cba69110d175ee79c59312b9f49e1e/src/diffusers/models/attention_processor.py#L1356) PyTorch’s `scaled_dot_product_attention` here internally. - -These additional techniques improved the inference latency from **2.54 seconds to 2.52 seconds**. - -![SDXL Chart](/assets/images/accelerating-generative-ai-3/fg6.png){:style="width:100%;"} - - - -## Dynamic int8 quantization - -We selectively apply [dynamic int8 quantization](https://pytorch.org/tutorials/recipes/recipes/dynamic_quantization.html) to both the UNet and the VAE. 
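To make the combined-projection idea concrete, here is a minimal, self-contained sketch of fusing three projection matrices into one larger matmul. This is purely illustrative; in Diffusers you get this behavior via `pipe.fuse_qkv_projections()`, and the dimensions below are placeholder values.


```
import torch
import torch.nn as nn

hidden_dim = 640  # placeholder width, not an SDXL-specific value
x = torch.randn(2, 77, hidden_dim)

# Naive: three separate projections, i.e. three smaller matmuls.
q_proj = nn.Linear(hidden_dim, hidden_dim, bias=False)
k_proj = nn.Linear(hidden_dim, hidden_dim, bias=False)
v_proj = nn.Linear(hidden_dim, hidden_dim, bias=False)
q, k, v = q_proj(x), k_proj(x), v_proj(x)

# Fused: stack the three weight matrices and run a single larger matmul,
# then split the output back into q, k, v.
qkv_proj = nn.Linear(hidden_dim, 3 * hidden_dim, bias=False)
with torch.no_grad():
    qkv_proj.weight.copy_(torch.cat([q_proj.weight, k_proj.weight, v_proj.weight], dim=0))
q_f, k_f, v_f = qkv_proj(x).chunk(3, dim=-1)

assert torch.allclose(q, q_f, atol=1e-5) and torch.allclose(v, v_f, atol=1e-5)
```


The single larger, better-shaped matmul is also what improves the impact of the dynamic int8 quantization discussed in the next section.
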
This is because quantization adds additional conversion overhead to the model that is hopefully made up for by faster matmuls (dynamic quantization). If the matmuls are too small, these techniques may degrade performance. - -Through experimentation, we found that certain linear layers in the UNet and the VAE don’t benefit from dynamic int8 quantization. You can check out the full code for filtering those layers [here](https://github.com/huggingface/diffusion-fast/blob/0f169640b1db106fe6a479f78c1ed3bfaeba3386/utils/pipeline_utils.py#L16) (referred to as `dynamic_quant_filter_fn` below). - -We leverage the ultra-lightweight pure PyTorch library [torchao](https://github.com/pytorch-labs/ao) to use its user-friendly APIs for quantization: - - -``` -from torchao.quantization import apply_dynamic_quant - -apply_dynamic_quant(pipe.unet, dynamic_quant_filter_fn) -apply_dynamic_quant(pipe.vae, dynamic_quant_filter_fn) -``` - - -Since this quantization support is limited to linear layers only, we also turn suitable pointwise convolution layers into linear layers to maximize the benefit. We also specify the following compiler flags when using this option: - - -``` -torch._inductor.config.force_fuse_int_mm_with_mul = True -torch._inductor.config.use_mixed_mm = True -``` - - -To prevent any numerical issues stemming from quantization, we run everything in the bfloat16 format. - -Applying quantization this way improved the latency from **2.52 seconds to 2.43 seconds**. - - -![SDXL Chart](/assets/images/accelerating-generative-ai-3/fg7.png){:style="width:100%;"} - - - -## Resources - -We welcome you to check out the following codebases to reproduce these numbers and extend the techniques to other text-to-image diffusion systems as well: - - - -* [diffusion-fast](https://github.com/huggingface/diffusion-fast) (repository providing all the code to reproduce the numbers and plots above) -* [torchao library](https://github.com/pytorch-labs/ao) -* [Diffusers library](https://github.com/huggingface/diffusers) -* [PEFT library](https://github.com/huggingface/peft) - -**Other links** - - - -* [SDXL: Improving Latent Diffusion Models for High-Resolution Image Synthesis](https://huggingface.co/papers/2307.01952) -* [Fast diffusion documentation](https://huggingface.co/docs/diffusers/main/en/tutorials/fast_diffusion) - - -## Improvements in other pipelines - -We applied these techniques to other pipelines to test the generality of our approach. Below are our findings: - - -### [SSD-1B](https://huggingface.co/segmind/SSD-1B) - - -![SSD-1B Chart](/assets/images/accelerating-generative-ai-3/fg8.png){:style="width:100%;"} - - - -### [Stable Diffusion v1-5](https://huggingface.co/runwayml/stable-diffusion-v1-5) - - -![Stable Diffusion v1-5 chart](/assets/images/accelerating-generative-ai-3/fg9.png){:style="width:100%;"} - - - -### [PixArt-alpha/PixArt-XL-2-1024-MS](https://huggingface.co/PixArt-alpha/PixArt-XL-2-1024-MS) - -It’s worth noting that PixArt-Alpha uses a Transformer-based architecture as its denoiser for the reverse diffusion process instead of a UNet. - - -![PixArt-alpha/PixArt-XL-2-1024-MS chart](/assets/images/accelerating-generative-ai-3/fg10.png){:style="width:100%;"} - - -Note that for Stable Diffusion v1-5 and PixArt-Alpha, we didn’t explore the best shape combination criteria for applying dynamic int8 quantization. It might be possible to get better numbers with a better combination. 
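To make the idea of shape-based filtering concrete, a filter in the spirit of `dynamic_quant_filter_fn` simply inspects each module and decides whether its matmuls are large enough for int8 dynamic quantization to pay off. The sketch below is illustrative only: the function name and threshold are placeholders, not the tuned criteria used for the numbers above (those live in the diffusion-fast repository).


```
import torch.nn as nn

def example_dynamic_quant_filter(module, *args):
    # Placeholder heuristic: only quantize linear layers whose matmuls are
    # large enough that the int8 speedup outweighs the quant/dequant overhead.
    # The exact shape criteria used in diffusion-fast differ from this value.
    return isinstance(module, nn.Linear) and module.in_features > 16

# apply_dynamic_quant(pipe.unet, example_dynamic_quant_filter)
# apply_dynamic_quant(pipe.vae, example_dynamic_quant_filter)
```
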
- -Collectively, the methods we presented offer substantial speedup over the baseline without degradation in the generation quality. Furthermore, we believe that these methods should complement other optimization methods popular in the community (such as [DeepCache](https://github.com/horseee/DeepCache), [Stable Fast](https://github.com/chengzeyi/stable-fast), etc.). - - -## Conclusion and next steps - -In this post, we presented a basket of simple yet effective techniques that can help improve the inference latency of text-to-image Diffusion models in pure PyTorch. In summary: - - - -* Using a reduced precision to perform our computations -* Scaled-dot product attention for running the attention blocks efficiently -* torch.compile with “max-autotune” to improve for latency -* Combining the different projections together for computing attention -* Dynamic int8 quantization - -We believe there’s a lot to be explored in terms of how we apply quantization to a text-to-image diffusion system. We didn’t exhaustively explore which layers in the UNet and the VAE tend to benefit from dynamic quantization. There might be opportunities to further speed things up with a better combination of the layers being targeted for quantization. - -We kept the text encoders of SDXL untouched other than just running them in bfloat16. Optimizing them might also lead to improvements in latency. - - -## Acknowledgements - -Thanks to [Ollin Boer Bohan](https://madebyoll.in/) whose [VAE](https://huggingface.co/madebyollin/sdxl-vae-fp16-fix) was used throughout the benchmarking process as it is numerically more stable under reduced numerical precisions. - -Thanks to Hugo Larcher from Hugging Face for helping with infrastructure. diff --git a/_posts/2024-01-09-amazon-sagemaker-w-torchserve.md b/_posts/2024-01-09-amazon-sagemaker-w-torchserve.md deleted file mode 100644 index 9efd2bc5357f..000000000000 --- a/_posts/2024-01-09-amazon-sagemaker-w-torchserve.md +++ /dev/null @@ -1,584 +0,0 @@ ---- -layout: blog_detail -title: "Accelerate AI models on GPU using Amazon SageMaker multi-model endpoints with TorchServe, saving up to 75% on inference costs" -author: James Wu, Ankith Gunapal, Li Ning, Subhash Talluri, and Saurabh Trikande ---- - -Multi-model endpoints (MMEs) are a powerful feature of [Amazon SageMaker](https://aws.amazon.com/sagemaker/) designed to simplify the deployment and operation of machine learning (ML) models. With MMEs, you can host multiple models on a single serving container and host all the models behind a single endpoint. The SageMaker platform automatically manages the loading and unloading of models and scales resources based on traffic patterns, reducing the operational burden of managing a large quantity of models. This feature is particularly beneficial for deep learning and generative AI models that require accelerated compute. The cost savings achieved through resource sharing and simplified model management makes SageMaker MMEs an excellent choice for you to host models at scale on AWS. - -Recently, generative AI applications have captured widespread attention and imagination. Customers want to deploy generative AI models on GPUs but at the same time are conscious of costs. SageMaker MMEs support GPU instances and is a great option for these types of applications. Today, we are excited to announce TorchServe support for SageMaker MMEs. 
This new model server support gives you the advantage of all the benefits of MMEs while still using the serving stack that TorchServe customers are most familiar with. In this post, we demonstrate how to host generative AI models, such as Stable Diffusion and Segment Anything Model, on SageMaker MMEs using TorchServe and build a language-guided editing solution that can help artists and content creators develop and iterate their artwork faster. - - -## Solution overview - -Language-guided editing is a common cross-industry generative AI use case. It can help artists and content creators work more efficiently to meet content demand by automating repetitive tasks, optimizing campaigns, and providing a hyper-personalized experience for the end customer. Businesses can benefit from increased content output, cost savings, improved personalization, and enhanced customer experience. In this post, we demonstrate how you can build language-assisted editing features using MME TorchServe that allow you to erase any unwanted object from an image and modify or replace any object in an image by supplying a text instruction. - -The user experience flow for each use case is as follows: - -* To remove an unwanted object, select the object from the image to highlight it. This action sends the pixel coordinates and the original image to a generative AI model, which generates a segmentation mask for the object. After confirming the correct object selection, you can send the original and mask images to a second model for removal. A detailed illustration of this user flow is shown below. - - - - - - - - - - - - -
        - -Dog on a bench with mouse pointer clicking the dog - - - -Dog on a bench highlighted - - - -A bench without the dog - -
        Step 1: Select an object (“dog”) from the image - Step 2: Confirm the correct object is highlighted - Step 3: Erase the object from the image -
        - - - - -* To modify or replace an object, select and highlight the desired object, following the same process as described above. Once you confirm the correct object selection, you can modify the object by supplying the original image, the mask, and a text prompt. The model will then change the highlighted object based on the provided instructions. A detailed illustration of this second user flow is as follows. - - - - - - - - - - - - -
        - -A vase with a cactus and mouse pointer - - - -A vase highlighted - - - -A rounded vase with a cactus - -
        Step 1: Select an object (“vase”) from the image - Step 2: Confirm the correct object is highlighted - Step 3: Provide a text prompt (“futuristic vase”) to modify the object -
        - - -To power this solution, we use three generative AI models: Segment Anything Model (SAM), Large Mask Inpainting Model (LaMa), and Stable Diffusion Inpaint (SD). Here is how these models are utilized in the user experience workflow: - - - - - - - - - - - -
        To remove an unwanted object - To modify or replace an object -
        - -flow diagram - - - -flow diagram - -
        - - - - -1. Segment Anything Model (SAM) is used to generate a segment mask of the object of interest. Developed by Meta Research, SAM is an open-source model that can segment any object in an image. This model has been trained on a massive dataset known as SA-1B, which comprises over 11 million images and 1.1 billion segmentation masks. For more information on SAM, refer to their [website](https://advimman.github.io/lama-project/) and [research paper](https://arxiv.org/abs/2109.07161). -2. LaMa is used to remove any undesired objects from an image. LaMa is a Generative Adversarial Network (GAN) model specializes in fill missing parts of images using irregular masks. The model architecture incorporates image-wide global context and a single-step architecture that uses Fourier convolutions, enabling it to achieve state-of-the-art results at a faster speed. For more details on LaMa, visit their [website](https://advimman.github.io/lama-project/) and [research paper](https://arxiv.org/abs/2109.07161). -3. SD 2 inpaint model from Stability AI is used to modify or replace objects in an image. This model allows us to edit the object in the mask area by providing a text prompt. The inpaint model is based on the text-to-image SD model, which can create high-quality images with a simple text prompt. It provides additional arguments such as original and mask images, allowing for quick modification and restoration of existing content. To learn more about Stable Diffusion models on AWS, refer to [Create high-quality images with Stable Diffusion models and deploy them cost-efficiently with Amazon SageMaker.](https://aws.amazon.com/blogs/machine-learning/create-high-quality-images-with-stable-diffusion-models-and-deploy-them-cost-efficiently-with-amazon-sagemaker/) - -All three models are hosted on SageMaker MMEs, which reduces the operational burden from managing multiple endpoints. In addition to that, using MME eliminates concerns about certain models being underutilized because resources are shared. You can observe the benefit from improved instance saturation, which ultimately leads to cost savings. The following architecture diagram illustrates how all three models are served using SageMaker MMEs with TorchServe. - - -flow diagram - -We have published the code to implement this solution architecture in our [GitHub repository](https://github.com/lxning/amazon-sagemaker-examples/tree/feat/torchserve-mme-gpu/inference/torchserve/mme-gpu). To follow along with the rest of the post, use the notebook file. It is recommended to run this example on a SageMaker notebook instance using the `conda_python3` (Python 3.10.10) kernel. - - -## Extend the TorchServe container - -The first step is to prepare the model hosting container. SageMaker provides a managed PyTorch Deep Learning Container (DLC) that you can retrieve using the following code snippet: - - -``` -# Use SageMaker PyTorch DLC as base image -baseimage = sagemaker.image_uris.retrieve( - framework="pytorch", - region=region, - py_version="py310", - image_scope="inference", - version="2.0.0", - instance_type="ml.g5.2xlarge", -) -print(baseimage) -``` - - -Because the models require resources and additional packages that are not on the base PyTorch DLC, you need to build a Docker image. This image is then uploaded to [Amazon Elastic Container Registry](http://aws.amazon.com/ecr/) (Amazon ECR) so we can access directly from SageMaker. 
The custom installed libraries are listed in the Docker file: - - -``` -ARG BASE_IMAGE - -FROM $BASE_IMAGE - -#Install any additional libraries -RUN pip install segment-anything-py==1.0 -RUN pip install opencv-python-headless==4.7.0.68 -RUN pip install matplotlib==3.6.3 -RUN pip install diffusers -RUN pip install tqdm -RUN pip install easydict -RUN pip install scikit-image -RUN pip install xformers -RUN pip install tensorflow -RUN pip install joblib -RUN pip install matplotlib -RUN pip install albumentations==0.5.2 -RUN pip install hydra-core==1.1.0 -RUN pip install pytorch-lightning -RUN pip install tabulate -RUN pip install kornia==0.5.0 -RUN pip install webdataset -RUN pip install omegaconf==2.1.2 -RUN pip install transformers==4.28.1 -RUN pip install accelerate -RUN pip install ftfy -``` - - -Run the shell command file to build the custom image locally and push it to Amazon ECR: - - -``` -%%capture build_output - -reponame = "torchserve-mme-demo" -versiontag = "genai-0.1" - -# Build our own docker image -!cd workspace/docker && ./build_and_push.sh {reponame} {versiontag} {baseimage} {region} {account} -``` - - - -## Prepare the model artifacts - -The main difference for the new MMEs with TorchServe support is how you prepare your model artifacts. The code repo provides a skeleton folder for each model (models folder) to house the required files for TorchServe. We follow the same four-step process to prepare each model `.tar` file. The following code is an example of the skeleton folder for the SD model: - - -``` -workspace -|--sd - |-- custom_handler.py - |-- model-config.yaml -``` - - -The first step is to download the pre-trained model checkpoints in the models folder: - - -``` -import diffusers -import torch -import transformers - -pipeline = diffusers.StableDiffusionInpaintPipeline.from_pretrained( - "stabilityai/stable-diffusion-2-inpainting", torch_dtype=torch.float16 -) - -sd_dir = "workspace/sd/model" -pipeline.save_pretrained(sd_dir) -``` - - -The next step is to define a `custom_handler.py` file. This is required to define the behavior of the model when it receives a request, such as loading the model, preprocessing the input, and postprocessing the output. The `handle` method is the main entry point for requests, and it accepts a request object and returns a response object. It loads the pre-trained model checkpoints and applies the `preprocess` and `postprocess` methods to the input and output data. The following code snippet illustrates a simple structure of the `custom_handler.py` file. For more detail, refer to the [TorchServe handler API.](https://github.com/pytorch/serve/blob/4e2126277cff57e61e455097987c3be7d625f384/docs/custom_service.md?plain=1#L10) - - -``` -def initialize(self, ctx: Context): - -def preprocess(self, data): - -def inference(self, data): - -def handle(self, data, context): - requests = self.preprocess(data) - responses = self.inference(requests) - - return responses -``` - - -The last required file for TorchServe is `model-config.yaml`. The file defines the configuration of the model server, such as number of workers and batch size. The configuration is at a per-model level, and an example config file is shown in the following code. For a complete list of parameters, refer to the [GitHub repo](https://github.com/pytorch/serve/blob/master/frontend/archive/src/main/java/org/pytorch/serve/archive/model/ModelConfig.java#L14). 
- - -``` -minWorkers: 1 -maxWorkers: 1 -batchSize: 1 -maxBatchDelay: 200 -responseTimeout: 300 -``` - - -The final step is to package all the model artifacts into a single .tar.gz file using the `torch-model-archiver` module: - - -``` -!torch-model-archiver --model-name sd --version 1.0 --handler workspace/sd/custom_handler.py --extra-files workspace/sd/model --config-file workspace/sam/model-config.yaml --archive-format no-archive!cd sd && tar cvzf sd.tar.gz . -``` - - - -## Create the multi-model endpoint - -The steps to create a SageMaker MME are the same as before. In this particular example, you spin up an endpoint using the SageMaker SDK. Start by defining an [Amazon Simple Storage Service](http://aws.amazon.com/s3) (Amazon S3) location and the hosting container. This S3 location is where SageMaker will dynamically load the models base on invocation patterns. The hosting container is the custom container you built and pushed to Amazon ECR in the earlier step. See the following code: - - -``` -# This is where our MME will read models from on S3. -multi_model_s3uri = output_path -``` - - -Then you want to define a `MulitDataModel` that captures all the attributes like model location, hosting container, and permission access: - - -``` -print(multi_model_s3uri) -model = Model( - model_data=f"{multi_model_s3uri}/sam.tar.gz", - image_uri=container, - role=role, - sagemaker_session=smsess, - env={"TF_ENABLE_ONEDNN_OPTS": "0"}, -) - -mme = MultiDataModel( - name="torchserve-mme-genai-" + datetime.now().strftime("%Y-%m-%d-%H-%M-%S"), - model_data_prefix=multi_model_s3uri, - model=model, - sagemaker_session=smsess, -) -print(mme) -``` - - -The `deploy()` function creates an endpoint configuration and hosts the endpoint: - - -``` -mme.deploy( - initial_instance_count=1, - instance_type="ml.g5.2xlarge", - serializer=sagemaker.serializers.JSONSerializer(), - deserializer=sagemaker.deserializers.JSONDeserializer(), -) -``` - - -In the example we provided, we also show how you can list models and dynamically add new models using the SDK. The `add_model()` function copies your local model `.tar` files into the MME S3 location: - - -``` -# Only sam.tar.gz visible! -list(mme.list_models()) - -models = ["sd/sd.tar.gz", "lama/lama.tar.gz"] -for model in models: - mme.add_model(model_data_source=model) -``` - - - -## Invoke the models - -Now that we have all three models hosted on an MME, we can invoke each model in sequence to build our language-assisted editing features. To invoke each model, provide a `target_model` parameter in the `predictor.predict()` function. The model name is just the name of the model `.tar` file we uploaded. 
The following is an example code snippet for the SAM model that takes in a pixel coordinate, a point label, and dilate kernel size, and generates a segmentation mask of the object in the pixel location: - - -``` -img_file = "workspace/test_data/sample1.png" -img_bytes = None - -with Image.open(img_file) as f: - img_bytes = encode_image(f) - -gen_args = json.dumps(dict(point_coords=[750, 500], point_labels=1, dilate_kernel_size=15)) - -payload = json.dumps({"image": img_bytes, "gen_args": gen_args}).encode("utf-8") - -response = predictor.predict(data=payload, target_model="/sam.tar.gz") -encoded_masks_string = json.loads(response.decode("utf-8"))["generated_image"] -base64_bytes_masks = base64.b64decode(encoded_masks_string) - -with Image.open(io.BytesIO(base64_bytes_masks)) as f: - generated_image_rgb = f.convert("RGB") - generated_image_rgb.show() -``` - - -To remove an unwanted object from an image, take the segmentation mask generated from SAM and feed that into the LaMa model with the original image. The following images show an example. - - - - - - - - - - - - - -
        -Dog on a bench - - - -White mask of dog on black background - - - -Just a bench - -
        Sample image - Segmentation mask from SAM - Erase the dog using LaMa -
        - - -To modify or replace any object in an image with a text prompt, take the segmentation mask from SAM and feed it into SD model with the original image and text prompt, as shown in the following example. - - - - - - - - - - - - - -
        -Dog on a bench - - -White mask of dog on black background - - -Hamster on a bench - -
        Sample image - Segmentation mask from SAM - Replace using SD model with text prompt -
        - “a hamster on a bench” -
        - - - -## Cost savings - -The benefits of SageMaker MMEs increase based on the scale of model consolidation. The following table shows the GPU memory usage of the three models in this post. They are deployed on one `g5.2xlarge` instance by using one SageMaker MME. - - - - - - - - - - - - - - - - - - - -
        Model - GPU Memory (MiB) -
        Segment Anything Model - 3,362 -
        Stable Diffusion Inpaint - 3,910 -
        LaMa - 852 -
        - - -You can see cost savings when hosting the three models with one endpoint, and for use cases with hundreds or thousands of models, the savings are much greater. - -For example, consider 100 Stable Diffusion models. Each of the models on its own could be served by an `ml.g5.2xlarge` endpoint (4 GiB memory), costing $1.52 per instance hour in the US East (N. Virginia) Region. To provide all 100 models using their own endpoint would cost $218,880 per month. With a SageMaker MME, a single endpoint using `ml.g5.2xlarge` instances can host four models simultaneously. This reduces production inference costs by 75% to only $54,720 per month. The following table summarizes the differences between single-model and multi-model endpoints for this example. Given an endpoint configuration with sufficient memory for your target models, steady state invocation latency after all models have been loaded will be similar to that of a single-model endpoint. - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
        - Single-model endpoint - Multi-model endpoint -
        Total endpoint price per month - $218,880 - $54,720 -
        Endpoint instance type - ml.g5.2xlarge - ml.g5.2xlarge -
        CPU Memory capacity (GiB) - 32 - 32 -
        GPU Memory capacity (GiB) - 24 - 24 -
        Endpoint price per hour - $1.52 - $1.52 -
        Number of instances per endpoint - 2 - 2 -
        Endpoints needed for 100 models - 100 - 25 -
        - - - -## Clean up - -After you are done, please follow the instructions in the cleanup section of the notebook to delete the resources provisioned in this post to avoid unnecessary charges. Refer to [Amazon SageMaker Pricing](https://aws.amazon.com/sagemaker/pricing/) for details on the cost of the inference instances. - - -## Conclusion - -This post demonstrates the language-assisted editing capabilities made possible through the use of generative AI models hosted on SageMaker MMEs with TorchServe. The example we shared illustrates how we can use resource sharing and simplified model management with SageMaker MMEs while still utilizing TorchServe as our model serving stack. We utilized three deep learning foundation models: SAM, SD 2 Inpainting, and LaMa. These models enable us to build powerful capabilities, such as erasing any unwanted object from an image and modifying or replacing any object in an image by supplying a text instruction. These features can help artists and content creators work more efficiently and meet their content demands by automating repetitive tasks, optimizing campaigns, and providing a hyper-personalized experience. We invite you to explore the example provided in this post and build your own UI experience using TorchServe on a SageMaker MME. - -To get started, see [Supported algorithms, frameworks, and instances for multi-model endpoints using GPU backed instances](https://docs.aws.amazon.com/sagemaker/latest/dg/multi-model-endpoints.html#multi-model-support). - - ---- - - -### About the authors - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
        -James Wu - -James Wu is a Senior AI/ML Specialist Solution Architect at AWS, helping customers design and build AI/ML solutions. James’s work covers a wide range of ML use cases, with a primary interest in computer vision, deep learning, and scaling ML across the enterprise. Prior to joining AWS, James was an architect, developer, and technology leader for over 10 years, including 6 years in engineering and 4 years in marketing & advertising industries. -
        -Li Ning - - -Li Ning is a senior software engineer at AWS with a specialization in building large-scale AI solutions. As a tech lead for TorchServe, a project jointly developed by AWS and Meta, her passion lies in leveraging PyTorch and AWS SageMaker to help customers embrace AI for the greater good. Outside of her professional endeavors, Li enjoys swimming, traveling, following the latest advancements in technology, and spending quality time with her family. -
        -Ankith Gunapal - -Ankith Gunapal is an AI Partner Engineer at Meta (PyTorch). He is passionate about model optimization and model serving, with experience ranging from RTL verification, embedded software, computer vision, to PyTorch. He holds a Master’s in Data Science and a Master’s in Telecommunications. Outside of work, Ankith is also an electronic dance music producer. - -
        -Saurabh Trikande - -Saurabh Trikande is a Senior Product Manager for Amazon SageMaker Inference. He is passionate about working with customers and is motivated by the goal of democratizing machine learning. He focuses on core challenges related to deploying complex ML applications, multi-tenant ML models, cost optimizations, and making deployment of deep learning models more accessible. In his spare time, Saurabh enjoys hiking, learning about innovative technologies, following TechCrunch and spending time with his family. - -
        -Subhash Talluri - -Subhash Talluri is a Lead AI/ML solutions architect of the Telecom Industry business unit at Amazon Web Services. He’s been leading development of innovative AI/ML solutions for Telecom customers and partners worldwide. He brings interdisciplinary expertise in engineering and computer science to help build scalable, secure, and compliant AI/ML solutions via cloud-optimized architectures on AWS. - -
        \ No newline at end of file diff --git a/_posts/2024-01-10-finetune-llms.md b/_posts/2024-01-10-finetune-llms.md deleted file mode 100644 index bd198ab9486e..000000000000 --- a/_posts/2024-01-10-finetune-llms.md +++ /dev/null @@ -1,199 +0,0 @@ ---- -layout: blog_detail -title: "Finetune LLMs on your own consumer hardware using tools from PyTorch and Hugging Face ecosystem" -author: Younes Belkada, Marc Sun, Titus von Köller, Sourab Mangrulkar, Benjamin Bossan, Lysandre Debut, Steven Liu ---- - -We demonstrate how to finetune a 7B parameter model on a typical consumer GPU (NVIDIA T4 16GB) with LoRA and tools from the PyTorch and Hugging Face ecosystem with complete reproducible Google Colab notebook. - - -## Introduction - -Large Language Models (LLMs) have shown impressive capabilities in industrial applications. Often, developers seek to tailor these LLMs for specific use-cases and applications to fine-tune them for better performance. However, LLMs are large by design and require a large number of GPUs to be fine-tuned. - -Let’s focus on a specific example by trying to fine-tune a Llama model on a free-tier Google Colab instance (1x NVIDIA T4 16GB). Llama-2 7B has 7 billion parameters, with a total of 28GB in case the model is loaded in full-precision. Given our GPU memory constraint (16GB), the model cannot even be loaded, much less trained on our GPU. This memory requirement can be divided by two with negligible performance degradation. You can read more about running models in half-precision and mixed precision for training [here](https://huggingface.co/docs/transformers/v4.15.0/performance#forward-vs-backward-execution-speed). - - -## What makes our Llama fine-tuning expensive? - -In the case of full fine-tuning with Adam optimizer using a half-precision model and mixed-precision mode, we need to allocate per parameter: - -* 2 bytes for the weight -* 2 bytes for the gradient -* 4 + 8 bytes for the Adam optimizer states - -→ With a total of 16 bytes per trainable parameter, this makes a total of **112GB** (excluding the intermediate hidden states). Given that the largest GPU available today can have up to 80GB GPU VRAM, it makes fine-tuning challenging and less accessible to everyone. To bridge this gap, Parameter Efficient Fine-Tuning (PEFT) methods are largely adopted today by the community. - - -## Parameter Efficient Fine-Tuning (PEFT) methods - -PEFT methods aim at drastically reducing the number of trainable parameters of a model while keeping the same performance as full fine-tuning. - -They can be differentiated by their conceptual framework: does the method fine-tune a subset of existing parameters, introduce new parameters, introduce trainable prompts, etc.? We recommend readers to have a look at the paper shared below that extensively compares existing PEFT methods. - -![Venn diagram](/assets/images/finetune-llms/fg1.png){:style="width:100%;"} - - -_Image taken from the paper: [Scaling Down to Scale Up: A Guide to Parameter-Efficient Fine-Tuning](https://arxiv.org/pdf/2303.15647.pdf)_ - -For this blog post, we will focus on Low-Rank Adaption for Large Language Models (LoRA), as it is one of the most adopted PEFT methods by the community. - - -## Low-Rank Adaptation for Large Language Models (LoRA) using 🤗 PEFT - -[The LoRA method](https://arxiv.org/pdf/2106.09685.pdf) by Hu et al. from the Microsoft team came out in 2021, and works by attaching extra trainable parameters into a model(that we will denote by _base model_). 
- -To make fine-tuning more efficient, LoRA decomposes a large weight matrix into two smaller, low-rank matrices (called update matrices). These new matrices can be trained to adapt to the new data while keeping the overall number of changes low. The original weight matrix remains frozen and doesn’t receive any further adjustments. To produce the final results, both the original and the adapted weights are combined. - -This approach has several advantages: - - - -* LoRA makes fine-tuning more efficient by drastically reducing the number of trainable parameters. -* The original pre-trained weights are kept frozen, which means you can have multiple lightweight and portable LoRA models for various downstream tasks built on top of them. -* LoRA is orthogonal to many other parameter-efficient methods and can be combined with many of them. -* The performance of models fine-tuned using LoRA is comparable to the performance of fully fine-tuned models. -* LoRA does not add any inference latency when adapter weights are merged with the base model - -In principle, LoRA can be applied to any subset of weight matrices in a neural network to reduce the number of trainable parameters. However, for simplicity and further parameter efficiency, in Transformer models LoRA is typically applied to attention blocks only. The resulting number of trainable parameters in a LoRA model depends on the size of the low-rank update matrices, which is determined mainly by the rank r and the shape of the original weight matrix. - - -![Animated diagram that show how LoRA works in practice](/assets/images/finetune-llms/fg2.gif){:style="width:100%;max-width:600px; margin-left: auto; margin-right: auto; display:block;"} - - -_Animated diagram that show how LoRA works in practice - original content adapter from the figure 1 of LoRA [original paper](https://arxiv.org/abs/2106.09685)_ - - -Below is a code snippet showing how to train LoRA model using Hugging Face PEFT library: - -![code snippet showing how to train LoRA model using Hugging Face PEFT library](/assets/images/finetune-llms/fg3.png){:style="width:100%;"} - - -## The base model can be in any `dtype`: leveraging SOTA LLM quantization and loading the base model in 4-bit precision - -According to the LoRA formulation, the base model can be compressed in any data type (_'dtype'_) as long as the hidden states from the base model are in the same dtype as the output hidden states from the LoRA matrices. - -Compressing and quantizing large language models has recently become an exciting topic as SOTA models become larger and more difficult to serve and use for end users. Many people in the community proposed various approaches for effectively compressing LLMs with minimal performance degradation. - -This is where the [`bitsandbytes`](https://github.com/TimDettmers/bitsandbytes) library comes in. Its purpose is to make cutting-edge research by Tim Dettmers, a leading academic expert on quantization and the use of deep learning hardware accelerators, accessible to the general public. - - -## QLoRA: One of the core contributions of `bitsandbytes` towards the democratization of AI - -Quantization of LLMs has largely focused on quantization for inference, but the [QLoRA](https://arxiv.org/abs/2305.14314) (Quantized model weights + Low-Rank Adapters) paper showed the breakthrough utility of using backpropagation through frozen, quantized weights at large model scales. 
- -With QLoRA we are matching 16-bit fine-tuning performance across all scales and models, while reducing fine-tuning memory footprint by more than 90%— thereby allowing fine-tuning of SOTA models on consumer-grade hardware. - -In this approach, LoRA is pivotal both for purposes of fine-tuning and the correction of minimal, residual quantization errors. Due to the significantly reduced size of the quantized model it becomes possible to generously place low-rank adaptors at every network layer, which together still make up just 0.2% of the original model's weight memory footprint. Through such usage of LoRA, we achieve performance that has been shown to be equivalent to 16-bit full model finetuning. - -![System diagram](/assets/images/finetune-llms/fg4.png){:style="width:100%;"} - -In addition to generous use of LoRA, to achieve high-fidelity fine-tuning of 4-bit models, QLoRA uses 3 further algorithmic tricks: - - - -1. 4-bit NormalFloat (NF4) quantization, a custom data type exploiting the property of the normal distribution of model weights and distributing an equal number of weights (per block) to each quantization bin—thereby enhancing information density. -2. Double Quantization, quantization of the quantization constants (further savings). -3. Paged Optimizers, preventing memory spikes during gradient checkpointing from causing out-of-memory errors. - -An interesting aspect is the dequantization of 4-bit weights in the GPU cache, with matrix multiplication performed as a 16-bit floating point operation. In other words, we use a _low-precision storage data type_ (in our case 4-bit, but in principle interchangeable) and one normal precision _computation data type_. This is important because the latter defaults to 32-bit for hardware compatibility and numerical stability reasons, but should be set to the optimal BFloat16 for newer hardware supporting it to achieve the best performance. - -To conclude, through combining these refinements to the quantization process and generous use of LoRA, we compress the model by over 90% and retain full model performance without the usual quantization degradation, while also retaining full fine-tuning capabilities with 16-bit LoRA adapters at every layer. - - -## Using QLoRA in practice - -These SOTA quantization methods come packaged in the `bitsandbytes` library and are conveniently integrated with HuggingFace 🤗 Transformers. For instance, to use LLM.int8 and QLoRA algorithms, respectively, simply pass `load_in_8bit` and `load_in_4bit` to the `from_pretrained` method. 
- - -``` -import torch -from transformers import AutoModelForCausalLM, AutoTokenizer - -model_id = "facebook/opt-125m" -# For LLM.int8() -# model = AutoModelForCausalLM.from_pretrained(model_id, load_in_8bit=True) - -# For QLoRA -model = AutoModelForCausalLM.from_pretrained(model_id, load_in_4bit=True) -``` - - -You can read more about quantization features in this specific section of the documentation: [https://huggingface.co/docs/transformers/main_classes/quantization](https://huggingface.co/docs/transformers/main_classes/quantization) - -When using QLoRA with Adam optimizer using a 4-bit base model and mixed-precision mode, we need to allocate per parameter: - - - -* ~0.5 bytes for the weight -* 2 bytes for the gradient -* 4 + 8 bytes for the Adam optimizer states - -Giving a total of 14 bytes per trainable parameter times 0.0029 as we end up having only 0.29% trainable parameters with QLoRA, this makes the QLoRA training setup cost around 4.5GB to fit, but requires in practice ~7-10GB to include intermediate hidden states which are always in half-precision (7 GB for a sequence length of 512 and 10GB for a sequence length of 1024) in the Google Colab demo shared in the next section. - -Below is the code snippet showing how to train QLoRA model using Hugging Face PEFT: - - -![code snippet showing how to train QLoRA model using Hugging Face PEFT](/assets/images/finetune-llms/fg5.png){:style="width:100%;"} - -## Using TRL for LLM training - -Models such as ChatGPT, GPT-4, and Claude are powerful language models that have been fine-tuned using a method called Reinforcement Learning from Human Feedback (RLHF) to be better aligned with how we expect them to behave and would like to use them. The finetuning goes through 3 steps: - - - -* Supervised Fine-tuning (SFT) -* Reward / preference modeling (RM) -* Reinforcement Learning from Human Feedback (RLHF) - - -![Process diagram](/assets/images/finetune-llms/fg6.png){:style="width:100%;"} - - -_From InstructGPT paper: Ouyang, Long, et al. "Training language models to follow instructions with human feedback." arXiv preprint arXiv:2203.02155 (2022)._ - -Here, we will only focus on the supervised fine-tuning step. We train the model on the new dataset following a process similar to that of pretraining. The objective is to predict the next token (causal language modeling). Multiple techniques can be applied to make the training more efficient: - - - -* **Packing**: Instead of having one text per sample in the batch and then padding to either the longest text or the maximal context of the model, we concatenate a lot of texts with an End-Of-Sentence (EOS) token in between and cut chunks of the context size to fill the batch without any padding. This approach significantly improves training efficiency as each token processed by the model contributes to training. - -![Sample diagram](/assets/images/finetune-llms/fg7.png){:style="width:100%;"} - - - - -* **Train on completion only**: We want the model to be able to understand the prompt and generate an answer/. Instead of training the model on the whole input (prompt + answer), the training will be more efficient if we only train the model on completion. 
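Since the PEFT training snippets above are shown as screenshots, here is a rough, self-contained sketch of the corresponding setup: load the base model in 4-bit and attach LoRA adapters before handing the model to the trainer below. The model ID and hyperparameters are illustrative placeholders, not the exact values used in the Colab notebook.


```
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import LoraConfig, get_peft_model

model_id = "facebook/opt-125m"  # placeholder; swap in your Llama checkpoint

# Load the frozen base model with 4-bit (QLoRA-style) weights.
model = AutoModelForCausalLM.from_pretrained(model_id, load_in_4bit=True)
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Attach small trainable low-rank adapters on top of the frozen base model.
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()  # only a fraction of a percent of parameters is trainable
```
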
- -You can perform supervised fine-tuning with these techniques using SFTTrainer: - - -``` -from trl import SFTTrainer - -trainer = SFTTrainer( - model=model, - args=training_arguments, - train_dataset=train_dataset, - dataset_text_field="text", - max_seq_length=1024, - packing=True, -) -``` - - -Since SFTTrainer back-end is powered by 🤗[accelerate](https://github.com/huggingface/accelerate), you can easily adapt the training to your hardware setup in one line of code! - -For example, with you have 2 GPUs, you can perform Distributed Data Parallel training with using the following command: - - -``` -accelerate launch --num_processes=2 training_llama_script.py -``` - - -## Putting all the pieces together - -We made a complete reproducible Google Colab notebook that you can check through[ this link](https://colab.research.google.com/drive/1vIjBtePIZwUaHWfjfNHzBjwuXOyU_ugD?usp=sharing). We use all the components shared in the sections above and fine-tune a llama-7b model on UltraChat dataset using QLoRA. As it can be observed through the screenshot below, when using a sequence length of 1024 and a batch size od 4, the memory usage remains very low (around 10GB). - -![Memory usage diagram](/assets/images/finetune-llms/fg8.png){:style="width:100%;max-width:600px; margin-left: auto; margin-right: auto; display:block;"} diff --git a/_posts/2024-01-16-accelerating-triton.md b/_posts/2024-01-16-accelerating-triton.md deleted file mode 100644 index e86cfaa9c6ff..000000000000 --- a/_posts/2024-01-16-accelerating-triton.md +++ /dev/null @@ -1,237 +0,0 @@ ---- -layout: blog_detail -title: "Accelerating Triton Dequantization Kernels for GPTQ" -author: Less Wright, Adnan Hoque (IBM) ---- - -## TL;DR - -Leveraging a first principles approach, we showcase a step by step process undertaken to accelerate the current Triton GPTQ kernels by 3x (core GPTQ) and 6x (AutoGPTQ). Example: 275us to 47us on a typical Llama style inference input. The goal is to provide a helpful template for accelerating any given Triton kernel. We provide a background on Triton and GPTQ quantization and dequantization process, showcase the impact of coalesced memory access to improve shared and global memory throughput, highlight changes made to reduce warp stalling to improve total throughput, and an overview on integrating Triton kernels into PyTorch code. Longer term, we hope to surpass the existing CUDA native GPTQ kernel with our Triton kernel. 
- - - -![Fig 1: Performance benchmarking the optimized AutoGTPQ kernel vs the current AutoGPTQ kernel on H100](/assets/images/accelerating-triton/fg1.png){:style="width:100%;max-width:600px; margin-left: auto; margin-right: auto; display:block;"} - - -_Fig 1: Performance benchmarking the optimized AutoGTPQ kernel vs the current AutoGPTQ kernel on H100_ - - - -![Fig 2: Performance benchmarking the newly optimized AutoGTPQ kernel vs the current AutoGPTQ kernel on A100](/assets/images/accelerating-triton/fg2.png){:style="width:100%;max-width:600px; margin-left: auto; margin-right: auto; display:block;"} - - -_Fig 2: Performance benchmarking the newly optimized AutoGTPQ kernel vs the current AutoGPTQ kernel on A100_ - -![Fig 3: Even with these improvements, there remains a gap between our optimized Triton kernel and the CUDA native AutoGTPQ kernel on A100.](/assets/images/accelerating-triton/fg3.png){:style="width:100%;max-width:600px; margin-left: auto; margin-right: auto; display:block; margin-top: 60px"} - -_Fig 3: Even with these improvements, there remains a gap between our optimized Triton kernel and the CUDA native AutoGTPQ kernel on A100. More to come…_ - - -## 1.0 Introduction to Triton - -The [Triton framework](https://openai.com/research/triton) provides a hardware agnostic way of programming and targeting GPUs, currently supporting both NVIDIA and AMD, with support for additional hardware vendors in progress. Triton is now a mainstay for PyTorch 2.0 as torch.compile decomposes eager PyTorch and re-assembles it into a high percentage of Triton kernels with PyTorch connecting code. - -As Triton becomes more widely adopted, it will be essential that programmers understand how to systematically step through the Triton stack (from the high level Python down to the low-level SASS) to address performance bottlenecks in order to optimize GPU efficiency for algorithms that go beyond torch.compile generated kernels. - -In this post, we will introduce some core concepts of the Triton programming language, how to identify common performance limiters in GPU kernels, and in parallel, tune a quantization kernel used in AutoGPTQ that can be used for high throughput inference applications. - - -### Intro to GPTQ Quantization and Dequantization - - -[GPTQ](https://arxiv.org/abs/2210.17323) is a quantization algorithm that is able to compress ultra-large (175B+) LLMs efficiently to int4 bit representation, via approximate second order information (Hessian inverse). [AutoGPTQ](https://github.com/PanQiWei/AutoGPTQ) is a framework built on GPTQ, allowing for rapid dequantization and inference/serving of LLMs that have been quantized with GPTQ. - -As part of the AutoGPTQ stack, they provide a Triton GPTQ kernel to handle the dequantization of a model for inference. - -The basic process for INT quantization is shown below and involves determining the scale and zero point, and then computing the quantized 4bit Weight using the Scale and Zero point: - - -![The basic process for INT quantization](/assets/images/accelerating-triton/fg4.jpg){:style="width:100%;max-width:400px; margin-left: auto; margin-right: auto; display:block;"} - - - -We thus store the 4 Bit weights along with the meta information of Scale and ZeroPoint for each group of weights. 
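As a rough numerical sketch of this storage scheme (illustrative only, not the actual AutoGPTQ packing code; the group size and the int32 nibble layout are assumptions):

```
import torch

def quantize_group(w: torch.Tensor, n_bits: int = 4):
    """Asymmetric quantization of one group of weights: int4 codes + scale + zero point."""
    qmax = 2**n_bits - 1
    scale = (w.max() - w.min()) / qmax
    zero = torch.round(-w.min() / scale)
    q = torch.clamp(torch.round(w / scale) + zero, 0, qmax).to(torch.int32)
    return q, scale, zero

def pack_int4(q: torch.Tensor) -> torch.Tensor:
    """Pack eight 4-bit codes into each int32 word (assumed layout)."""
    q = q.view(-1, 8)
    packed = torch.zeros(q.shape[0], dtype=torch.int32)
    for i in range(8):
        # Each code occupies its own nibble, so OR-ing the shifted codes assembles the word.
        # The packed word is interpreted as raw bits; a set top nibble makes it appear negative.
        packed |= q[:, i] << (4 * i)
    return packed

group = torch.randn(128)               # one group of weights
q, scale, zero = quantize_group(group)
packed = pack_int4(q)                  # 128 int4 codes -> 16 int32 words
```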
- -To ‘dequant’ these weights, we do the following: - - -![To ‘dequant’ these weights](/assets/images/accelerating-triton/fg5.png){:style="width:100%;max-width:400px; margin-left: auto; margin-right: auto; display:block;"} - - -And then proceed to **Matrix Multiply** the dequantized weights with the dense input feature matrix for this linear layer. - - -## 2.0 Identify the Bottlenecks - Optimizing Matrix Multiplication - -As it turns out, making a fast matrix multiplication kernel is not trivial. A naively implemented matrix multiply will rarely reach peak throughput performance on highly parallel machines like GPUs. So – we need to tackle our compute and memory subsystems in our GPU in an hierarchical fashion to make sure we are maximally utilizing each resource. - - -We start our optimization process, by running the unoptimized Triton Kernel, through the Nvidia Nsight Compute tool and taking a note of some important metrics and warnings: - - -![some important metrics and warnings](/assets/images/accelerating-triton/fg6.png){:style="width:100%;"} - - -Fig xy (todo) - -![some important metrics and warnings](/assets/images/accelerating-triton/fg7.png){:style="width:100%;max-width:300px; margin-left: auto; margin-right: auto; display:block;"} - - -We notice first that both compute and memory throughput are low, 7.40% and 21.19% respectively (fig xy) . Knowing that for typical inference matrix problem sizes, we are in the memory bound regime, we will attempt to optimize the kernel by applying code changes that target the memory subsystem of our A100 GPU. - -The three topics this post will cover are: - -1. L2 Optimization -2. Vectorized Load -3. Warp Stalling - - -Let’s walk through each topic, make the appropriate changes, and see its corresponding impact on our Triton Kernel. This Triton kernel is a fused dequantization kernel that dequantizes a packed int32 weight (we will refer to this as the B Matrix) Tensor into int4 weights, performs matrix multiplication with the activation tensor (refer to as the A matrix) in FP16 mode, and then stores the results back to a matrix C. - -The above is referred to as W4A16 quantization. Keep in mind that the process we describe can and should be used for the development of any GPU kernel, as these are common bottlenecks in any unoptimized kernel. - - -## 3.0 L2 Optimization - -This optimization already exists in the AutoGPTQ kernel, but we’d like to dedicate a section to this to help readers better understand how mapping and execution order of thread blocks is handled in Triton. Thus, we will step through a naive mapping and then a more optimal mapping to see its corresponding impact. - -Let’s build up our kernel naively, starting with a “linear” load from global memory and then compare it to a more optimized “swizzled” load. Linear vs Swizzled determines the execution order of our grid of work on the GPU. Let’s take a look at the hints that the [Nvidia Nsight Compute Tool](https://developer.nvidia.com/nsight-compute) provides regarding our kernels shared memory access pattern in the naive case: - -![the hints from the Nvidia Nsight Compute Tool](/assets/images/accelerating-triton/fg8.jpg){:style="width:100%;"} - - -To tackle this issue we can use an approach referred to as “tile-swizzling.” The idea of this method is to launch our thread blocks in a more L2 cache friendly order. - -Let’s take a step back and familiarize ourselves with some Triton semantics and make a simple CUDA analogy to understand the concept better. Triton kernels launch “programs”. 
These so-called programs map to the concept of a Thread Block in CUDA and it is the basic unit of parallelism in a Triton Kernel. Every program has with it associated a “pid” and all the threads in a program are guaranteed to be executing the same instruction. - -The Triton programs will be distributed onto your SMs in a naive-way if you do a simple linear mapping of “pid” to a 2D grid location of your output matrix C. - -This 2D grid location is determined by pid_m and pid_n in Triton. We would like to exploit data and cache locality in the L2 cache of our GPU, when we distribute our grid of work. To do this in Triton we can make the following changes: - -![To do this in Triton](/assets/images/accelerating-triton/fg9.png){:style="width:100%;"} - - -The code highlighted in red would be the naive “linear” tile ordering, and the code highlighted in green is the “swizzled” tile ordering. This type of launch promotes a sense of locality. Here is a visual to help understand this better. - -![a sense of locality](/assets/images/accelerating-triton/fg10.jpg){:style="width:100%;max-width:600px; margin-left: auto; margin-right: auto; display:block;"} - - -After incorporating this change, the profiler no longer complains about uncoalesced memory accesses. Let’s take a look at how our memory throughput has changed: - - -![how our memory throughput has changed](/assets/images/accelerating-triton/fg11.jpg){:style="width:100%;"} - - -This change was tested on a simple load store kernel. Looking at the GPU speed of light statistics section in the profiler we also see a 112.07% increase in the memory throughput of the simple load kernel, which is what we were after with this optimization. Again, this optimization already exists in the AutoGPTQ kernel, but is the boilerplate logic that every Triton Kernel programmer will have to write in the beginning of their kernel, before any of the exciting dequantization or matrix multiply logic. It is thus important to understand that: - -1. This mapping is not unique - -2. Triton does not automatically handle this kind of optimization for the programmer, and careful thought must be taken to ensure your kernel is optimally handling shared memory accesses - -These are not obvious for those new to Triton, as much of the shared memory access optimization is handled by the Triton compiler. However, in the cases where these are not handled by the compiler, it is important to be able to understand what tools and methods are available to us to be able to influence memory behavior. - - -## 4.0 Vectorized Load - -Now, back to the original complaints of our unoptimized kernel. We want to optimize the global memory access pattern of our kernel. From the details page of the Nvidia Nsight compute tool, we see the following note, where the profiler is complaining about uncoalesced global memory accesses. - -Let’s dig deeper and take a look at the SASS (Assembly) Code load for an unoptimized memory read: - - -![an unoptimized memory read](/assets/images/accelerating-triton/fg12.png){:style="width:100%;"} - - -This load operation resulted in 32 global load operations that are 16 bit wide. This is not optimal. - -We would like to do our global memory loads in a vectorized way so that it results in the least amount of load instructions. To combat this we can give the Triton Compiler some help. - -![code block](/assets/images/accelerating-triton/fg13.png){:style="width:100%;"} - - -The green highlighted lines above act as a compiler hint. 
It tells the compiler that these elements are contiguous in memory and that this load operation can be coalesced. - -Let’s see the effect in assembly after adding these lines. - -![the effect in assembly after adding these lines](/assets/images/accelerating-triton/fg14.png){:style="width:100%;"} - - -The load is now performed in 4 global load operations that are each 128 bit wide, instead of 32 16 bit global load operations. This means 28 fewer memory fetch instructions, and importantly a coalesced memory access. This can be seen from the fact that a single thread is not accessing consecutive memory addresses, which without the compiler hint, was the behavior. - -The resulting effect is 73x speedup in an isolated load operation, and after incorporating it in the full dequantization kernel we were able to see another 6% speedup. Another step in the right direction! - - -## 5.0 Warp Stalling - - -![performance limiter, warp stalling](/assets/images/accelerating-triton/fg15.png){:style="width:100%;"} - - -Now putting all the changes back into our full dequantization kernel, we see the following performance limiter, warp stalling. - -These warp stalls are mostly caused by ‘Long Scoreboard’ stalls, accounting for 92.63% of the total. - -At a high level, [long scoreboard stalls](https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#metrics-reference) happen when a warp requires data that may not be ready yet in order to be in the “issued” state. In other words GPUs are throughput machines, and we need to hide the latency of load instructions with compute instructions. By loading more data and rearranging where the load instructions are in the script we can take care of this problem. - -In an ideal scenario, each warp scheduler would be able to issue 1 instruction every clock cycle. Note - Every SM on an A100 GPU has 4 warp schedulers. - -However – our kernel has bottlenecks and is spending 4.4 cycles in the stall state with the block size that AutoGPTQ Triton kernel deems as optimal given the presets it has. - -**How do we improve this?** - -We want to be able to increase our memory throughput so that we can increase the chance that when a warp issues an instruction, we won’t be waiting for loads to be stored in SRAM so that they can be used for computation. We played around with multiple parameters (such as number of pipeline stages, and number of warps) and the one that had the biggest impact was increasing the block size by a factor of 2 in the k dimension. - -These changes yield an immediate impact on both compute and memory throughput. - -![an immediate impact on both compute and memory throughput](/assets/images/accelerating-triton/fg16.jpg){:style="width:100%;"} - -We also see the long scoreboard wait time at the step where we shift and scale the quantized weights drop significantly, which is what we identified as the original bottleneck in the source code. While there are still stalls at this line, only 68% of them are caused by long scoreboard stalls, compared to the original 92%. Ideally, we do not observe ANY stalls, so there is still work to be done here, but a reduction in the amount of stalls caused by long scoreboard tells us that our data is at this point ready to be used (in L1TEX) memory by an instruction that a warp wants to execute, at a higher frequency then the original kernel. 
- -![1.4x speedup in the execution time of our kernel](/assets/images/accelerating-triton/fg17.png){:style="width:100%;"} - - -The corresponding impact is a 1.4x speedup in the execution time of our kernel. - - -## 6.0 Results - -By tackling all these problem areas methodically our resulting kernel is 6x faster on the Nvidia A100 GPU than if you were to use the Triton kernel AutoGPTQ provides out-of-the-box. - -Taking a relevant Llama inference sample data point, the [Triton kernel we’ve developed ](https://github.com/foundation-model-stack/foundation-model-stack/tree/triton/triton/kernels)takes 47us to perform dequantization and matrix multiplication compared to the 275us taken by the AutoGPTQ kernel for the same matrix size. - -By replicating this step-by-step approach it should be possible to get similar speedups in other kernels, and help build understanding on common GPU bottlenecks and how to tackle them. - -It is important to note that while strides have been made in improving the performance of the AutoGPTQ Triton Kernel, we have still not closed the gap on the current exllamaV2 CUDA kernels found in AutoGPTQ. - -More research is required to understand how we can further optimize this kernel to match equivalent custom CUDA kernel performance. - - -## Summary and Future work - -Triton extends PyTorch by allowing low level GPU optimizations to be done at a higher level of abstraction than CUDA programming, with the net result that adding optimized Triton kernels can help PyTorch models run faster. - -Our goal in this post was to show an example of accelerating the GPTQ dequant kernel and provide a template workflow for how the accelerations were achieved. - -For future work, SplitK work decomposition for the matrix multiplication is a potential speed up we’ll investigate. - - -## Integrating custom Triton Kernels into PyTorch - -Given the acceleration shown above, a common question is how to actually use a custom kernel in a given PyTorch codebase. - -A triton kernel will contain at least two parts - the actual Triton kernel code which will be compiled by the Triton compiler: - -![the actual Triton kernel code which will be compiled by the Triton compiler](/assets/images/accelerating-triton/fg18.png){:style="width:100%;"} - - -Along with the actual kernel code is a python wrapper, that may or may not subclass the PyTorch autograd class - depending if it’s going to support a backwards pass (i.e. for training purposes or only for inference purposes). - -You simply import the python class into your PyTorch code where you want to use it much like any other Python / PyTorch function. - -![import the python class into your PyTorch code](/assets/images/accelerating-triton/fg19.png){:style="width:100%;"} - -In this case, simply importing and then using ‘fast_qlinear’ would invoke the underlying Triton kernel with the speed-ups we’ve shown above applied to your PyTorch model. - - -## Acknowledgements - -Thanks to Jamie Yang and Hao Yu from IBM Research for their technical guidance in the collection of these results. 
diff --git a/_posts/2024-01-18-accelerate-pytorch-models.md b/_posts/2024-01-18-accelerate-pytorch-models.md deleted file mode 100644 index 1e5a90b8b297..000000000000 --- a/_posts/2024-01-18-accelerate-pytorch-models.md +++ /dev/null @@ -1,237 +0,0 @@ ---- -layout: blog_detail -title: "Accelerate PyTorch Models Using Quantization Techniques with Intel Extension for PyTorch" -author: Intel ---- - -## Overview - -PyTorch is a Python-based framework for developing deep learning models. It is one of the most popular industry-standard AI frameworks and is used for a wide variety of computer vision and natural language processing applications. PyTorch was developed by Meta and is now part of The Linux Foundation. Intel works with the open source PyTorch project to optimize the PyTorch framework for Intel® hardware. The newest optimizations and features are first released in Intel® Extension for PyTorch before upstreaming them into PyTorch. The Intel extension provides quantization features to deliver good accuracy results for large deep learning models. - -This article introduces quantization, types of quantization, and demonstrates a code sample on how to accelerate PyTorch-based models by applying Intel Extension for PyTorch quantization. - - -## What Is Quantization? - -Quantization is a systematic reduction of the precision of all or several layers within the model. This means a higher-precision type (like single precision floating-point (FP32) that is mostly used in deep learning) is converted into a lower-precision type, such as FP16 (16 bits) or int8 (8 bits). - -This helps to achieve: - -* Lower memory bandwidth -* Lower storage -* Higher performance with minimum to zero accuracy loss - -Quantization is especially important with large models such as those based on the Transformer architecture (like BERT or GPT). - -There are two types of quantization: - -* Static: This quantizes the weights and activations of the model, and is used when memory bandwidth and compute savings are important. -* Dynamic: The weights are quantized ahead of time, but the activations are dynamically quantized during inference. - - -## How to Perform Static Quantization and Dynamic Quantization - -The Intel extension extends PyTorch with up-to-date features and optimizations for an extra performance boost on Intel hardware. - -[Installation Instructions for Intel Extension for PyTorch](https://github.com/intel/intel-extension-for-pytorch#installation) - -The extension can be loaded as a Python module or linked as a C++ library. Python users can enable it dynamically by importing **intel_extension_for_pytorch**. The extension provides built-in quantization to deliver good statistical accuracy for most popular deep learning workloads including convolutional neural networks (CNN), natural language processing (NLP), and recommendation models. The quantization functionality in the Intel extension currently supports post-training quantization. - -**To quantize the existing FP32 model to an int8 model using static quantization:** - -1. Prepare the quantization configuration. For default static quantization configuration, use **ipex.quantization.default_static_qconfig**. -2. Prepare the model for calibration using the **ipex.quantization.prepare** method. -3. Perform calibration against the dataset. This calibration is specific for static quantization as it needs the representative dataset to determine the optimal quantization parameters, so the user should provide data to the model in batches to calibrate it. -4. 
Convert the model from FP32 to int8 using the **ipex.quantization.convert** method. This function converts the FP32 model to int8 based on the applied calibration and configuration. - -**To quantize the existing FP32 model to an int8 model using dynamic quantization, which is similar to static quantization:** - -1. Prepare the quantization configuration. For default dynamic quantization configuration, use **ipex.quantization.default_dynamic_qconfig**. -2. Prepare the FP32 model by using the **ipex.quantization.prepare** method. Provide the parameters, such as FP32 model to quantize, the prepared configuration, example inputs, and information. -3. Convert the model from FP32 to int8 using the **ipex.quantization.convert** method. The input model is the model prepared in Step 2. - - -## Code Sample - - -### Dataset - -For static quantization, the model is calibrated with the [CIFAR-10 dataset](https://www.cs.toronto.edu/~kriz/cifar.html). The CIFAR-10 is a subset of the 80 million [tiny images dataset](https://groups.csail.mit.edu/vision/TinyImages/) collected by Alex Krizhevsky, Vinod Nair, and Geoffrey Hinton. - -This dataset contains 60,000 images in 10 classes (airplane, automobile, bird, cat, deer, dog, frog, horse, ship, and track). Every class has exactly 6,000 images. All images are 32 x 32 pixels and are colored. Also, the classes are completely mutually exclusive, which means there is no overlapping between classes. - - -### Implementation - -The [code sample](https://github.com/oneapi-src/oneAPI-samples/tree/master/AI-and-Analytics/Features-and-Functionality/IntelPytorch_Quantization) demonstrates how to quantize (using static and dynamic quantization) a ResNet*-50 model using Intel Extension for PyTorch. The following steps are implemented in the code sample: - - -#### Download and Prepare the Dataset - -Here, we use the CIFAR-10 dataset available in torchvision. - -1. To make data fit the model: - -* Transform the data. -* Change the size of the images from 32 x 32 pixels to 224 x 224 pixels. -* Convert them to tensors. -* Normalize them. - -{:start="2"} -2. Prepare transformations of the dataset as shown: - -``` -transform = torchvision.transforms.Compose([ -torchvision.transforms.Resize((224, 224)), -torchvision.transforms.ToTensor(), -torchvision.transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))]) - -``` - -{:start="3"} -3. Initialize the dataset. - -``` -test_dataset = torchvision.datasets.CIFAR10(root=DATA, train=False, transform=transform, download=Ture) -``` - - -#### Prepare the Data Loader - -To load a dataset for static quantization calibration in specific size batches, create the loader as shown: - - -``` -calibration_data_loader = torch.utils.data.DataLoader( -dataset=test_dataset, -batch_size=128 -) -``` - - - -#### Create the Model - -Use the pretrained ResNet-50 model available in the Torchvision library with default weights. The prepared model is FP32. - - -``` -model_fp32 = torchvision.models.resnet50(weights=torchvision.models.ResNet50_Weights.DEFAULT) -``` - - - -#### Apply Static Quantization - -Create a **staticQuantize** function that implements the steps described previously. - - - -1. To perform static quantization, we need: - -* FP32 model loaded earlier -* Example data -* Calibration dataset - -{:start="2"} -2. Prepare the quantization configuration: - -``` -config_static = ipex.quantization.default_static_qconfig -``` - -In this code sample, we are using the default quantization configuration, but you can also define your own. 
\ - -{:start="3"} -3. Prepare the model using the declared configuration: - - -``` -prepared_model_static = prepare(model_fp32, -qconfig_static, -example_inputs=data, -inplace=False) -``` - -{:start="4"} -4. Calibrate the model with the calibration dataset. Feed the model with successive batches of data from the dataset. - - -``` -for batch_idx, (data, target) in enumerate(calibration_data_loader): -prepared_model_static(data) -if batch_idx % 10 == 0: -print("Batch %d/%d complete, continue ..." %(batch_idx+1, len(calibration_data_loader))) -``` - -{:start="5"} -5. Convert the model. - -``` -converted_model_static = convert(prepared_model_static) -``` - - -#### Apply Dynamic Quantization - -Create the **dynamicQuantize** function similar to the **staticQuantize** function. - -1. To perform dynamic quantization, we only need: - -* The FP32 model loaded earlier -* Example data - -{:start="2"} -2. Prepare the quantization configuration: - -``` -qconfig_dynamic = ipex.quantization.default_dynamic_qconfig -``` - -{:start="3"} -3. Prepare the model. - -``` -prepared_model_dynamic = prepare(model_fp32, -qconfig_dynamic, -example_inputs=data, -inplace=False) -``` - -{:start="4"} -4. Convert the model from FP32 to int8. - -``` -converted_model_dynamic = convert(prepared_model_dynamic) -``` - -In this way, two functions are created to take advantage of the optimizations that quantization offers: - -* **DynamicQuantize** for dynamic quantization of models -* **StaticQuantize** for static model quantization - - -## Next Steps - -Get started with Intel Extension for PyTorch quantization today and use it to achieve better accuracy results for deep learning workloads. Additionally, [Intel® Neural Compressor](https://www.intel.com/content/www/us/en/developer/tools/oneapi/neural-compressor.html?cid=sem&source=sa360&campid=2023_q2_iags_us_iagsoapie_iagsoapiee_awa_text-link_exact_cd_dpd-oneapi-intel_neural_compressor_3500107853_google_div_oos_non-pbm_intel&ad_group=ai_model_compression_exact&intel_term=neural+compressor&sa360id=43700076378213630&gclid=CjwKCAjw-IWkBhBTEiwA2exyO1pBoV7k3j16OANdyEOMVYDUvy4MZK3WQX6zzhymBxz7Pikqq0ndwBoCHvUQAvD_BwE&gclsrc=aw.ds#gs.2t5hw6) provides [quantization](https://intel.github.io/neural-compressor/latest/docs/source/quantization.html) to improve the speed of inference. - -Check out and incorporate Intel’s other [AI and machine learning framework optimizations](https://www.intel.com/content/www/us/en/developer/tools/frameworks/overview.html) and [end-to-end portfolio of tools](https://www.intel.com/content/www/us/en/developer/topic-technology/artificial-intelligence/tools.html) into your AI workflow. - -Learn about the unified, open, standards-based [oneAPI](https://www.intel.com/content/www/us/en/developer/tools/oneapi/overview.html) programming model that forms the foundation of Intel’s [AI Software Portfolio](https://www.intel.com/content/www/us/en/developer/topic-technology/artificial-intelligence/overview.html) to help you prepare, build, deploy, and scale your AI solutions. - -For more details about the 4th gen Intel® Xeon® Scalable processors, visit the [Intel® AI platform overview](https://www.intel.com/content/www/us/en/developer/topic-technology/artificial-intelligence/platform.html) where you can learn how Intel is empowering developers to run end-to-end AI pipelines on these powerful CPUs. 
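For reference, the static and dynamic quantization steps walked through above can be collected into two small helper functions. The following is a minimal sketch that assumes the FP32 model, the example `data` batch, and the calibration data loader defined earlier, with `prepare` and `convert` imported from the extension's quantization module:

```
import torch
import intel_extension_for_pytorch as ipex
from intel_extension_for_pytorch.quantization import prepare, convert

def static_quantize(model_fp32, data, calibration_data_loader):
    qconfig = ipex.quantization.default_static_qconfig
    prepared = prepare(model_fp32, qconfig, example_inputs=data, inplace=False)
    with torch.no_grad():
        for batch, _ in calibration_data_loader:
            prepared(batch)                 # calibration pass over representative data
    return convert(prepared)

def dynamic_quantize(model_fp32, data):
    qconfig = ipex.quantization.default_dynamic_qconfig
    prepared = prepare(model_fp32, qconfig, example_inputs=data, inplace=False)
    return convert(prepared)                # no calibration needed for dynamic quantization
```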
- - -## Additional Resources - -* [Accelerate AI Workloads with Intel® Advanced Matrix Extensions (Intel® AMX)](https://www.intel.com/content/www/us/en/products/docs/accelerator-engines/advanced-matrix-extensions/ai-solution-brief.html) -* [AI and Machine Learning Development Tools and Resources](https://www.intel.com/content/www/us/en/developer/topic-technology/artificial-intelligence/overview.html) -* [AI Frameworks](https://www.intel.com/content/www/us/en/developer/tools/frameworks/overview.html#gs.2t503z) -* [Computer Vision](https://www.intel.com/content/www/us/en/developer/topic-technology/artificial-intelligence/training/computer-vision.html) -* [Intel Hardware for AI](https://www.intel.com/content/www/us/en/artificial-intelligence/hardware.html) -* [Intel Neural Compressor](https://www.intel.com/content/www/us/en/developer/tools/oneapi/neural-compressor.html?cid=sem&source=sa360&campid=2023_q2_iags_us_iagsoapie_iagsoapiee_awa_text-link_exact_cd_dpd-oneapi-intel_neural_compressor_3500107853_google_div_oos_non-pbm_intel&ad_group=ai_model_compression_exact&intel_term=neural+compressor&sa360id=43700076378213630&gclid=CjwKCAjw-IWkBhBTEiwA2exyO1pBoV7k3j16OANdyEOMVYDUvy4MZK3WQX6zzhymBxz7Pikqq0ndwBoCHvUQAvD_BwE&gclsrc=aw.ds#gs.2t5hw6) -* [oneAPI Unified Programming Model](https://www.intel.com/content/www/us/en/developer/tools/oneapi/overview.html#gs.h7kofh) -* [PyTorch Foundation](https://pytorch.org/foundation) -* [PyTorch Optimizations from Intel](https://www.intel.com/content/www/us/en/developer/tools/oneapi/optimization-for-pytorch.html) -* [PyTorch Quantization Code Sample](https://github.com/oneapi-src/oneAPI-samples/tree/master/AI-and-Analytics/Features-and-Functionality/IntelPytorch_Quantization) -* [Quantization Using Intel Neural Compressor](https://intel.github.io/neural-compressor/latest/docs/source/quantization.html) diff --git a/_posts/2024-01-23-accelerating-generative-ai-4.md b/_posts/2024-01-23-accelerating-generative-ai-4.md deleted file mode 100644 index 1a5da5cad87c..000000000000 --- a/_posts/2024-01-23-accelerating-generative-ai-4.md +++ /dev/null @@ -1,214 +0,0 @@ ---- -layout: blog_detail -title: "Accelerating Generative AI with PyTorch IV: Seamless M4T, fast" -author: Yejin Lee, Carole-Jean Wu, Christian Puhrsch, Joel Schlosser, Driss Guessous, Jeffrey Wan, Joe Isaacson, Can Balioglu, Juan Pino ---- - -This post is the fourth part of a multi-series blog focused on how to accelerate generative AI models with pure, native PyTorch. To skip to the code, check out our github ([seamless_communication](https://github.com/facebookresearch/seamless_communication/pull/328), [fairseq2](https://github.com/facebookresearch/fairseq2/pull/272)). We are excited to share a breadth of newly released PyTorch performance features alongside practical examples to see how far we can push PyTorch native performance. In part one, we showed how to accelerate [Segment Anything over 8x](https://pytorch.org/blog/accelerating-generative-ai/) using only pure, native PyTorch. In part two, we showed how to accelerate [Llama-7B by almost 10x](https://pytorch.org/blog/accelerating-generative-ai-2/) using only native PyTorch optimizations. In part three, we showed how to accelerate [text-to-image diffusion models up to 3x](https://pytorch.org/blog/accelerating-generative-ai-3/) using only native Pytorch optimizations. 
- -In this blog, we’ll focus on speeding up FAIR’s Seamless M4T-v2 model resulting in **2x speedup for text decoder module _and_ 30x for vocoder module, resulting in 2.7x speedup for end-to-end inference**, with no loss of accuracy by using CUDA Graph and native PyTorch optimization: - -* [torch.compile](https://pytorch.org/tutorials/intermediate/torch_compile_tutorial.html) - - -![End to End Inference Speedup](/assets/images/accelerating-generative-ai-4/fg1.png){:style="width:100%;"} - - - -## Introduction - -Seamless M4T is an open-source foundational speech/text translation and transcription technology developed by FAIR. Seamless M4T is a massively multilingual and multimodal machine translation model, with the [latest version](https://github.com/facebookresearch/seamless_communication) (Seamless M4T-v2) released on November 30th, 2023. The high-level model architecture of Seamless M4T-v2 is illustrated in Figure 1. - - -![Model Architecture of Seamless M4T-v2](/assets/images/accelerating-generative-ai-4/fg2.png){:style="width:100%;max-width:600px; display:block; margin-left: auto; margin-right: auto;"} - - -**Figure 1.** Model Architecture of Seamless M4T-v2. - -Accelerating inference latency is crucial for translation models to improve user experience through faster communication across languages. In particular, batch_size=1 is often used for fast translation where latency matters a lot in applications such as chatbots, speech translation, and live subtitling. Therefore, we conducted the performance analysis on inference with batch_size=1, as shown in Figure 2 to understand the Amdahl’s Law bottleneck. Our results indicate that the text decoder and vocoder are the most time-consuming modules, accounting for 61% and 23% of the inference time, respectively. - - -![Text decoder and vocoder are the most time consuming module. Breakdown of inference time by modules for English-Spanish S2ST (Speech-to-Speech-Text) task for batch_size=1 on A100 GPU.](/assets/images/accelerating-generative-ai-4/fg3.png){:style="width:100%;"} - - -**Figure 2.** Text decoder and vocoder are the most time consuming module. Breakdown of inference time by modules for English-Spanish S2ST (Speech-to-Speech-Text) task for batch_size=1 on A100 GPU. - -To take a closer look at the performance bottleneck of the text decoder and vocoder, we analyzed GPU traces for the text decoder and vocoder for the 8th sample for the English-Spanish translation example of [FLEURS](https://huggingface.co/datasets/google/fleurs) dataset as shown in Figure 3. It revealed that the **text decoder and vocoder are heavily CPU-bound modules.** We observed a significant gap incurred by CPU overhead that delayed the launch of GPU kernels, resulting in a substantial increase in the execution time for both the modules. - - -![CPU and GPU trace for Text Decoder](/assets/images/accelerating-generative-ai-4/fg4.jpg){:style="width:100%;"} - - - -**(a)** CPU and GPU trace for Text Decoder - - -![CPU and GPU trace for Vocoder](/assets/images/accelerating-generative-ai-4/fg5.jpg){:style="width:100%;"} - - -**(b)** CPU and GPU trace for Vocoder - -**Figure 3.** Text Decoder and Vocoder are heavily CPU-bound modules. CPU and GPU trace for (a) Text Decoder (b) Vocoder for the 8th sample for English-Spanish translation example of [FLEURS](https://huggingface.co/datasets/google/fleurs) dataset. The trace is obtained by running inference with batch_size=1 on A100 gpu. 
- -Based on the real-system performance analysis results that text_decoder and vocoder are heavily CPU bound modules in Seamless M4T-v2, we enabled torch.compile + CUDA Graph to those modules. In this post, we share modifications required to enable torch.compile + CUDA Graph on each module for batch_size=1 inference scenario, discussion on CUDA Graph and next step plans. - - -## Torch.compile with CUDA Graph - -`torch.compile` is a PyTorch API that allows users to compile PyTorch models into a standalone executable or script which is generally used for optimizing model performance by removing unnecessary overhead. - -CUDA Graph is a feature provided by NVIDIA that allows for the optimization of kernel launches in CUDA applications. It creates an execution graph of CUDA kernels, which can be pre-processed and optimized by the driver before being executed on the GPU. The main advantage of using CUDA Graph is that it reduces the overhead associated with launching individual kernels, as the graph can be launched as a single unit, reducing the number of API calls and data transfers between the host and device. This can lead to significant performance improvements, especially for applications that have a large number of small kernels or repeat the same set of kernels multiple times. If this is something you are interested in learning more about, check out this paper that highlights the important role of data for accelerated computing: **[Where is the data? Why you cannot debate CPU vs. GPU performance without the answer](https://ieeexplore.ieee.org/abstract/document/5762730)** by our own Kim Hazelwood! This is when NVIDIA was heavily investing in general-purpose GPU (GPGPUs) and before deep learning revolutionized the computing industry! - -However, because CUDA Graph operates on 1) fixed memory pointer, 2) fixed shape of tensors, that are recorded at the compile time, we introduced the following improvements for CUDA Graph to be reused across multiple sizes of inputs to _prevent CUDA Graph generation for each iteration_ and let the data inside CUDA Graph be reused across different runs _to share KV Cache for multiple decoding steps_. - - -## Text Decoder - -The Text Decoder in Seamless is a decoder from NLLB [[1](https://ai.meta.com/research/no-language-left-behind/)] that performs T2TT (Text to Text Translation). Also, this module is a CPU-bound model where gpu execution time is not long enough to hide CPU overhead because of **the nature of auto-regressive generation that requires sequential processing of tokens**, which limits the amount of parallelism that can be achieved on the GPU. Based on this observation, we enabled torch.compile + CUDA Graph for the text decoders to reduce the dominating CPU overhead as shown in Figure 4. - - -![CPU and GPU trace for Text Decoder after torch.compile + CUDA Graph are enabled](/assets/images/accelerating-generative-ai-4/fg6.png){:style="width:100%;"} - - -**Figure 4.** CPU and GPU trace for Text Decoder after torch.compile + CUDA Graph are enabled. - - -### 1. Updating and retrieving KV cache - -During inference, the text decoder has two computation phases: a prefill phase that consumes the prompt and an incremental generation phase that generates output tokens one by one. Given a high enough batch size or input length, prefill operates on a sufficiently high number of tokens in parallel — GPU performance is the bottleneck and the CPU overheads do not impact performance significantly. 
On the other hand, incremental token generation is always executed with sequence length 1 and it is often executed with a small batch size (even 1), e.g. for interactive use cases. Thus, incremental generation can be limited by the CPU speed and thus is a good candidate for torch.compile + CUDA Graph. - -However, during the incremental token generation phase, the sequence_length dimension of key and value involved in the attention computation increases by one with each step while the sequence length of query always remains 1. Specifically, key/value are generated by appending the newly computed key/value of sequence length 1 to the key/value stored in the KV cache so far. But as mentioned above, CUDA Graph records all the shapes of tensors during compilation and replay with the recorded shapes. Thus, few modifications have been made to address this issue following the great work [here](https://blog.fireworks.ai/speed-python-pick-two-how-cuda-graphs-enable-fast-python-code-for-deep-learning-353bf6241248). - -a) We modify the KV-cache handling to take the indices in which to write new values in a CUDA Tensor (i.e., `valid_seq_pos`) rather than a Python integer. - -![Modification to KV cache append and get](/assets/images/accelerating-generative-ai-4/fg7.png){:style="width:100%;"} - -**Figure 5.** Modification to KV cache `append` and `get` - -b) We also modify attention to work with the fixed shape of key and value over the `max_seq_length`. We only compute softmax over the sequence positions up to the current decoding step (i.e., `valid_seq_pos`) . To mask out sequence positions > current decoding step (i.e., `valid_seq_pos)`, we create a boolean mask tensor (i.e., `mask`) where sequence positions > `valid_seq_pos` are set to False. - - -![Helper function to generate valid_seq_pos and mask](/assets/images/accelerating-generative-ai-4/fg8.png){:style="width:100%;"} - - **Figure 6.** Helper function to generate `valid_seq_pos` and `mask` - -It's important to post that these modifications result in an increase in the amount of computation required, as we compute attention over more sequence positions than necessary (up to `max_seq_length`). However, despite this drawback, our results demonstrate that torch.compile + CUDA Graph still provide significant performance benefits compared to standard PyTorch code. - -c) As different inference samples have different sequence length, it also generates different shapes of inputs that are to be projected to key and value for the cross attention layers. Thus, we pad the input to have a static shape and generate a padding mask to mask out padded output. - - -### 2. Memory Pointer Management - -As CUDA Graph records memory pointers along with the shape of tensors, it is important to make different inference samples to correctly reference the recorded memory pointer (e.g., KV cache) to avoid compiling CUDA Graph for each inference sample. However, some parts of the Seamless codebase made different inference samples to refer to different memory addresses, so we made modifications to improve the memory implications. - -e) Seamless adopts beam search as a text decoding strategy. In the beam search process, we need to perform KV cache reordering for all the attention layers for each incremental decoding step to make sure each selected beam performs with corresponding KV cache as shown in the code snippet below. 
- - -![KV cache reordering operation for beam search decoding strategy](/assets/images/accelerating-generative-ai-4/fg8b.png){:style="width:100%;"} - -**Figure 8.** KV cache reordering operation for beam search decoding strategy. - -The above code allocates new memory space and overwrites the original memory pointer for `cache_k` and `cache_v`. Thus we modified KV cache reordering to keep the memory pointer of each cache as was recorded during compilation by using [copy_](https://pytorch.org/docs/stable/generated/torch.Tensor.copy_.html) operator. - - -![In-place update for KV cache using copy_ operator](/assets/images/accelerating-generative-ai-4/fg9.png){:style="width:100%;"} - -**Figure 9.** In-place update for KV cache using `copy_` operator - -f) After enabling torch.compile + CUDA Graph to text decoder by modifying the code as mentioned above, the overhead of text decoder shifts to KV cache reordering as shown in Figure 10. KV cache reordering repeatedly calls index_select 96 times (assuming 24 decoder layers where each layer consists of two types of attention layers with cache for key and value). - -![CPU and GPU trace for Text Decoder after enabling torch.compile + CUDA Graph](/assets/images/accelerating-generative-ai-4/fg10.png){:style="width:100%;"} - -**Figure 10.** CPU and GPU trace for Text Decoder after enabling torch.compile + CUDA Graph. - -As part of accelerating text decoder, we additionally applied torch.compile to KV cache reordering to benefit from fusing kernels as shown in Figure 11. Note that we cannot use CUDA Graph here (`mode='max-autotune'`) here, because `copy_` operation modifies the inputs which violates the static input requirement of CUDA graph version in torch.compile. - -![Applying torch.compile to KV Cache reordering](/assets/images/accelerating-generative-ai-4/fg11.png){:style="width:100%;"} - - -**Figure 11.** Applying torch.compile to KV Cache reordering. - -As a result of enabling torch.compile to KV cache reordering, the gpu kernels that were launched separately (Figure 12(a)) are now fused so there are much fewer gpu kernels to launch (Figure 12(b)). - -![CPU and GPU trace for KV cache reordering before enabling torch.compile](/assets/images/accelerating-generative-ai-4/fg12.png){:style="width:100%;"} - -**(a)** CPU and GPU trace for KV cache reordering **before** enabling torch.compile - -![CPU and GPU trace for KV cache reordering after enabling torch.compile](/assets/images/accelerating-generative-ai-4/fg13.png){:style="width:100%;"} - -**(b)** CPU and GPU trace for KV cache reordering **after** enabling torch.compile - -**Figure 12.** CPU and GPU trace for KV cache reordering (a) before and (b) after enabling torch.compile - - -## Vocoder - -Vocoder in Seamless is a HiFi-GAN unit-vocoder that converts generated units to waveform output where an unit is a representation of speech that combines different aspects such as phonemes and syllables, which can be used to generate sounds that are audible to humans. Vocoder is a relatively simple module that consists of Conv1d and ConvTranspose1d layers and is a CPU bound module as shown in FIgure 3. Based on this observation, we decided to enable torch.compile + CUDA Graph for vocoder to reduce the disproportionally large CPU overhead as shown in Figure 10. But there were several fixes to be made. 
- -![CPU and GPU trace for Vocoder after torch.compile + CUDA Graph are enabled](/assets/images/accelerating-generative-ai-4/fg14.png){:style="width:100%;"} - -**Figure 13.** CPU and GPU trace for Vocoder after torch.compile + CUDA Graph are enabled. - -a) The input tensor shape of the vocoder is different across different inference samples. But as CUDA Graph records the shape of tensors and replays them, we had to pad the input to the fixed size with zeros. Since vocoder only consists of Conv1d layers, we do not need an additional padding mask, and padding with zeros is sufficient. - -b) Vocoder consists of conv1d layers wrapped with `torch.nn.utils.weight_norm` (see [here](https://github.com/facebookresearch/seamless_communication/blob/main/src/seamless_communication/models/vocoder/hifigan.py#L37-L112)). However, applying torch.compile directly to Vocoder incurs graph break as below, which leads to suboptimal performance improvement. This graph break happens inside the hook handling part in the PyTorch code of `weight_norm`. - - -``` -[1/0_2] torch._dynamo.symbolic_convert.__graph_breaks: [DEBUG] Graph break: setattr(UserDefinedObjectVariable) from user code at: -[1/0_2] torch._dynamo.symbolic_convert.__graph_breaks: [DEBUG] File "/mnt/fsx-home/yejinlee/yejinlee/seamless_communication/src/seamless_communication/models/vocoder/vocoder.py", line 49, in forward -[1/0_2] torch._dynamo.symbolic_convert.__graph_breaks: [DEBUG] return self.code_generator(x, dur_prediction) # type: ignore[no-any-return]1/0_2] torch._dynamo.symbolic_convert.__graph_breaks: [DEBUG] File "/data/home/yejinlee/mambaforge/envs/fairseq2_12.1/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl -[1/0_2] torch._dynamo.symbolic_convert.__graph_breaks: [DEBUG] return forward_call(*args, **kwargs) -[2023-12-13 04:26:16,822] [1/0_2] torch._dynamo.symbolic_convert.__graph_breaks: [DEBUG] File "/mnt/fsx-home/yejinlee/yejinlee/seamless_communication/src/seamless_communication/models/vocoder/codehifigan.py", line 101, in forward -[1/0_2] torch._dynamo.symbolic_convert.__graph_breaks: [DEBUG] return super().forward(x) -[1/0_2] torch._dynamo.symbolic_convert.__graph_breaks: [DEBUG] File "/mnt/fsx-home/yejinlee/yejinlee/seamless_communication/src/seamless_communication/models/vocoder/hifigan.py", line 185, in forward -[1/0_2] torch._dynamo.symbolic_convert.__graph_breaks: [DEBUG] x = self.ups[i](x) -[1/0_2] torch._dynamo.symbolic_convert.__graph_breaks: [DEBUG] File "/data/home/yejinlee/mambaforge/envs/fairseq2_12.1/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1550, in _call_impl -[1/0_2] torch._dynamo.symbolic_convert.__graph_breaks: [DEBUG] args_result = hook(self, args) -[1/0_2] torch._dynamo.symbolic_convert.__graph_breaks: [DEBUG] File "/data/home/yejinlee/mambaforge/envs/fairseq2_12.1/lib/python3.8/site-packages/torch/nn/utils/weight_norm.py", line 65, in __call__ -[1/0_2] torch._dynamo.symbolic_convert.__graph_breaks: [DEBUG] setattr(module, self.name, self.compute_weight(module)) -[1/0_2] torch._dynamo.symbolic_convert.__graph_breaks: [DEBUG] -``` - - -Since the weights of layers do not change during the inference, we do not need weight normalization. So we simply removed weight normalization for Vocoder as shown in Figure 14, by utilizing `remove_weight_norm` function which is already provided at the Seamless codebase ([here](https://github.com/facebookresearch/seamless_communication/blob/main/src/seamless_communication/models/vocoder/hifigan.py#L198-L205)). 
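In generic form, stripping `weight_norm` from every submodule before compiling can be sketched as below. This is safe here because inference never updates the weights; the function is purely illustrative, since the Seamless codebase already provides its own helper for this.

```
import torch

def strip_weight_norm(module: torch.nn.Module) -> None:
    """Remove the weight_norm reparameterization from all submodules (inference only)."""
    for m in module.modules():
        try:
            torch.nn.utils.remove_weight_norm(m)
        except ValueError:
            pass  # submodule was not wrapped with weight_norm
```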
- -![Removing weight_norm for Vocoder](/assets/images/accelerating-generative-ai-4/fg15.png){:style="width:100%;"} - -**Figure 14.** Removing `weight_norm` for Vocoder - - -## Performance Evaluation + Impact of CUDA Graph - -Figure 15 shows the speedup result when enabling torch.compile(mode=”max-autotune”) + CUDA Graph on the text decoder and vocoder. We achieve **2x speedup for the text decoder and 30x speedup for vocoder, leading to 2.7x faster end-to-end inference time.** - - - - - - -
        - - -**Figure 15.** Inference time speedup of text decoder and vocoder of applying torch.compile and torch.compile + CUDA Graph - -We also report the speedups for text decoder and vocoder using torch.compile without CUDA Graph, which is supported by torch.compile’s API (i.e., `torch.compile(mode="max-autotune-no-cudagraphs")`), to identify the impact of CUDA Graph on the performance. Without CUDA Graph, the speedup for text decoder and vocoder reduces to 1.17x and 18.4x. While still quite significant, it indicates the important role of CUDA Graph. We conclude that Seamless M4T-v2 is exposed to a lot of time launching CUDA kernels, especially when we use small batch size (e.g., 1) where the GPU kernel execution time is not long enough to amortize the GPU kernel launch time. - - -![End-to-end inference speedup of applying torch.compile and CUDA graph incrementally](/assets/images/accelerating-generative-ai-4/fg1.png){:style="width:100%;"} - - -**Figure 16.** End-to-end inference speedup of applying torch.compile and CUDA graph incrementally. **a)** “Inc. Decoding”: Apply torch.compile only to the text decoder **b)** “Inc. Decoding w/ CUDA Graph”: Apply torch.compile + CUDA Graph to the text decoder **c)** “+KV Cache Reordering”: Additionally apply torch.compile to KV cache reordering operation upon b) **d)** “+Vocoder”: Additionally apply torch.compile to the vocoder upon c) **e)** “+Vocoder w/ CUDA Graph”: Additionally apply torch.compile + CUDA Graph to the vocoder upon d). - -Figure 16 represents the cumulative effect of applying torch.compile with and without CUDA Graph to the modules. The results indicate a significant improvement in the end-to-end inference speedup, demonstrating the effectiveness of these techniques in optimizing the overall latency. As a result, we gain **2.7x** end-to-end inference speedup for Seamless M4T-v2 with batch_size=1. - - -## Acknowledgements - -We thank the PyTorch team and Seamless team for their tremendous support with this work. \ No newline at end of file diff --git a/_posts/2024-01-30-pytorch2-2-lib-updates.md b/_posts/2024-01-30-pytorch2-2-lib-updates.md deleted file mode 100644 index 2b6f2400eb45..000000000000 --- a/_posts/2024-01-30-pytorch2-2-lib-updates.md +++ /dev/null @@ -1,133 +0,0 @@ ---- -layout: blog_detail -title: "New Library Updates in PyTorch 2.2" ---- - -## Summary - -We are bringing a number of improvements to the current PyTorch libraries, alongside the PyTorch 2.2 release. These updates demonstrate our focus on developing common and extensible APIs across all domains to make it easier for our community to build ecosystem projects on PyTorch. - - - - - - - - - - - - - - - - - - - - - -
| Latest Stable Library Versions (Full List)* | | |
| --- | --- | --- |
| TorchArrow 0.1.0 | TorchRec 0.6.0 | TorchVision 0.17 |
| TorchAudio 2.2.0 | TorchServe 0.9.0 | TorchX 0.7.0 |
| TorchData 0.7.1 | TorchText 0.17.0 | PyTorch on XLA Devices 2.1 |
        - - -*To see [prior versions](https://pytorch.org/docs/stable/index.html) or (unstable) nightlies, click on versions in the top left menu above ‘Search Docs’. - - -## TorchRL - -### Feature: TorchRL’s Offline RL Data Hub - -TorchRL now provides one of the largest dataset hubs for offline RL and imitation learning, and it all comes under a single data format (TED, for TorchRL Episode Data format). This makes it possible to easily swap from different sources in a single training loop. It is also now possible to easily combine datasets of different sources through the ReplayBufferEnsemble class. The data processing is fully customizable. Sources include simulated tasks (Minari, D4RL, VD4RL), robotic datasets (Roboset, OpenX Embodied dataset) and gaming (GenDGRL/ProcGen, Atari/DQN). Check these out in the [documentation](https://pytorch.org/rl/reference/data.html#datasets). - -Aside from these changes, our replay buffers can now be dumped on disk using the `.dumps()` method which will serialize the buffers on disk using the TensorDict API which is faster, safer and more efficient than using torch.save. - -Finally, replay buffers can now be read and written from separate processes on the same machine without any extra code needed from the user! - - -### TorchRL2Gym environment API - -To facilitate TorchRL’s integration in existing code-bases and enjoy all the features of TorchRL’s environment API (execution on device, batched operations, transforms…) we provide a TorchRL-to-gym API that allows users to register any environment they want in gym or gymnasium. This can be used in turn to make TorchRL a universal lib-to-gym converter that works across stateless (eg, dm_control) and stateless (Brax, Jumanji) environments. The feature is thoroughly detailed in the [doc](https://pytorch.org/rl/reference/generated/torchrl.envs.EnvBase.html#torchrl.envs.EnvBase.register_gym). The info_dict reading API has also been improved. - - -### Environment speedups - -We added the option of executing environments on a different environment than the one used to deliver data in ParallelEnv. We also speeded up the GymLikeEnv class to a level that now makes it competitive with gym itself. - - -### Scaling objectives - -The most popular objectives for RLHF and training at scale (PPO and A2C) are now compatible with FSDP and DDP models! - - -## TensorDict - - -### Feature: MemoryMappedTensor to replace MemmapTensor - -We provide a much more efficient mmap backend for TensorDict; MemoryMappedTensor, which directly subclasses torch.Tensor. It comes with a bunch of utils to be constructed, such as `from_tensor`, `empty` and many more. MemoryMappedTensor is now much safer and faster than its counterpart. The library remains fully compatible with the previous class to facilitate transition. - -We also introduce a new set of multithreaded serialization methods that make tensordict serialization highly competitive with torch.save, with serialization and deserialization speeds for LLMs more than [3x faster than with torch.save](https://github.com/pytorch/tensordict/pull/592#issuecomment-1850761831). - - -### Feature: Non-tensor data within TensorDict - -It is not possible to carry non-tensor data through the `NonTensorData` tensorclass. This makes it possible to build tensordicts with metadata. The `memmap`-API is fully compatible with these values, allowing users to seamlessly serialize and deserialize such objects. To store non-tensor data in a tensordict, simply assign it using the `__setitem__` method. 
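A minimal sketch of this usage (assuming a recent `tensordict` release; the exact wrapping and retrieval behavior of non-tensor values may differ slightly across versions):

```
import torch
from tensordict import TensorDict

td = TensorDict({"obs": torch.zeros(3, 4)}, batch_size=[3])
td["collector_note"] = "gathered with policy v2"  # plain string stored via __setitem__
print(td["collector_note"])                       # retrieved alongside the tensor entries
```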
- - -### Efficiency improvements - -Several methods runtime have been improved, such as unbind, split, map or even TensorDict instantiation. Check our [benchmarks](https://pytorch.org/tensordict/dev/bench/)! - - -## TorchRec/fbgemm_gpu - - -### VBE - -TorchRec now natively supports VBE (variable batched embeddings) within the `EmbeddingBagCollection` module. This allows variable batch size per feature, unlocking sparse input data deduplication, which can greatly speed up embedding lookup and all-to-all time. To enable, simply initialize `KeyedJaggedTensor `with `stride_per_key_per_rank` and `inverse_indices` fields, which specify batch size per feature and inverse indices to reindex the embedding output respectively. - -In addition to the TorchRec library changes, [fbgemm_gpu](https://pytorch.org/FBGEMM/) has added the support for variable batch size per feature in TBE. [VBE](https://github.com/pytorch/FBGEMM/pull/1752) is enabled on split TBE training for both weighted and unweighted cases. To use VBE, please make sure to use the latest fbgemm_gpu version. - - -### Embedding offloading - -This technique refers to using CUDA UVM to cache ‘hot’ embeddings (i.e. store embedding tables on host memory with cache on HBM memory), and prefetching the cache. Embedding offloading allows running a larger model with fewer GPUs, while maintaining competitive performance. Use the prefetching pipeline ([PrefetchTrainPipelineSparseDist](https://github.com/pytorch/torchrec/blob/main/torchrec/distributed/train_pipeline.py?#L1056)) and pass in [per-table cache load factor](https://github.com/pytorch/torchrec/blob/main/torchrec/distributed/types.py#L457) and the [prefetch_pipeline](https://github.com/pytorch/torchrec/blob/main/torchrec/distributed/types.py#L460) flag through constraints in the planner to use this feature. - -Fbgemm_gpu has introduced [UVM cache pipeline prefetching](https://github.com/pytorch/FBGEMM/pull/1893) in [v0.5.0](https://github.com/pytorch/FBGEMM/releases/tag/v0.5.0) for TBE performance speedup. This allows cache-insert to be executed in parallel with TBE forward/backward. To enable this feature, please be sure to use the latest fbgemm_gpu version. - - -### Trec.shard/shard_modules - -These APIs replace embedding submodules with its sharded variant. The shard API applies to an individual embedding module while the shard_modules API replaces all embedding modules and won’t touch other non-embedding submodules. - -Embedding sharding follows similar behavior to the prior TorchRec DistributedModuleParallel behavior, except the ShardedModules have been made composable, meaning the modules are backed by [TableBatchedEmbeddingSlices](https://github.com/pytorch/torchrec/blob/main/torchrec/distributed/composable/table_batched_embedding_slice.py#L15) which are views into the underlying TBE (including .grad). This means that fused parameters are now returned with named_parameters(), including in DistributedModuleParallel. - - -## TorchVision - - -### The V2 transforms are now stable! - - -The `torchvision.transforms.v2` namespace was still in BETA stage until now. It is now stable! Whether you’re new to Torchvision transforms, or you’re already experienced with them, we encourage you to start with [Getting started with transforms v2](https://pytorch.org/vision/stable/auto_examples/transforms/plot_transforms_getting_started.html#sphx-glr-auto-examples-transforms-plot-transforms-getting-started-py) in order to learn more about what can be done with the new v2 transforms. 
- -Browse our [main docs](https://pytorch.org/vision/stable/transforms.html#) for general information and performance tips. The available transforms and functionals are listed in the [API reference](https://pytorch.org/vision/stable/transforms.html#v2-api-ref). Additional information and tutorials can also be found in our [example gallery](https://pytorch.org/vision/stable/auto_examples/index.html#gallery), e.g. [Transforms v2: End-to-end object detection/segmentation example](https://pytorch.org/vision/stable/auto_examples/transforms/plot_transforms_e2e.html#sphx-glr-auto-examples-transforms-plot-transforms-e2e-py) or [How to write your own v2 transforms](https://pytorch.org/vision/stable/auto_examples/transforms/plot_custom_transforms.html#sphx-glr-auto-examples-transforms-plot-custom-transforms-py). - - -### Towards `torch.compile()` support - -We are progressively adding support for `torch.compile()` to torchvision interfaces, reducing graph breaks and allowing dynamic shape. - -The torchvision ops (`nms`, `[ps_]roi_align`, `[ps_]roi_pool` and `deform_conv_2d`) are now compatible with `torch.compile` and dynamic shapes. - -On the transforms side, the majority of [low-level kernels](https://github.com/pytorch/vision/blob/main/torchvision/transforms/v2/functional/__init__.py) (like `resize_image()` or `crop_image()`) should compile properly without graph breaks and with dynamic shapes. We are still addressing the remaining edge-cases, moving up towards full functional support and classes, and you should expect more progress on that front with the next release. diff --git a/_posts/2024-01-30-pytorch2-2.md b/_posts/2024-01-30-pytorch2-2.md deleted file mode 100644 index 23dcdc2dd30a..000000000000 --- a/_posts/2024-01-30-pytorch2-2.md +++ /dev/null @@ -1,130 +0,0 @@ ---- -layout: blog_detail -title: "PyTorch 2.2: FlashAttention-v2 integration, AOTInductor" ---- - -We are excited to announce the release of PyTorch® 2.2 ([release note](https://github.com/pytorch/pytorch/releases/tag/v2.2.0))! PyTorch 2.2 offers ~2x performance improvements to _[scaled_dot_product_attention](https://pytorch.org/docs/2.2/generated/torch.nn.functional.scaled_dot_product_attention.html)_ via [FlashAttention-v2](https://arxiv.org/abs/2307.08691) integration, as well as _AOTInductor_, a new ahead-of-time compilation and deployment tool built for non-python server-side deployments. - -This release also includes improved _torch.compile_ support for Optimizers, a number of new inductor optimizations, and a new logging mechanism called TORCH_LOGS. - -Please note that we are [deprecating macOS x86 support](https://github.com/pytorch/pytorch/issues/114602), and PyTorch 2.2.x will be the last version that supports macOS x64. - -Along with 2.2, we are also releasing a series of updates to the PyTorch domain libraries. More details can be found in the library updates blog. - -This release is composed of 3,628 commits and 521 contributors since PyTorch 2.1. We want to sincerely thank our dedicated community for your contributions. As always, we encourage you to try these out and report any issues as we improve 2.2. More information about how to get started with the PyTorch 2-series can be found at our [Getting Started](https://pytorch.org/get-started/pytorch-2.0/) page. 
- -Summary: - -* _[scaled_dot_product_attention](https://pytorch.org/docs/2.2/generated/torch.nn.functional.scaled_dot_product_attention.html)_ (SDPA) now supports _[FlashAttention-2](https://arxiv.org/abs/2307.08691)_, yielding around 2x speedups compared to previous versions. -* PyTorch 2.2 introduces a new ahead-of-time extension of [TorchInductor](https://dev-discuss.pytorch.org/t/torchinductor-a-pytorch-native-compiler-with-define-by-run-ir-and-symbolic-shapes/747) called _[AOTInductor](https://pytorch.org/docs/main/torch.compiler_aot_inductor.html)_, designed to compile and deploy PyTorch programs for non-Python server-side deployment. -* _torch.distributed_ supports a new abstraction for initializing and representing ProcessGroups called _[device_mesh](https://pytorch.org/tutorials/recipes/distributed_device_mesh.html)_. -* PyTorch 2.2 ships a standardized, configurable logging mechanism called [TORCH_LOGS](https://pytorch.org/tutorials/recipes/torch_logs.html). -* A number of _torch.compile_ improvements are included in PyTorch 2.2, including improved support for compiling Optimizers and improved TorchInductor fusion and layout optimizations. -* Please note that we are [deprecating macOS x86 support](https://github.com/pytorch/pytorch/issues/114602), and PyTorch 2.2.x will be the last version that supports macOS x64.

| Stable | Beta | Performance Improvements |
|---|---|---|
|  | FlashAttention-2 Integration | Inductor optimizations |
|  | AOTInductor | aarch64 optimizations |
|  | TORCH_LOGS |  |
|  | device_mesh |  |
|  | Optimizer compilation |  |
- - -*To see a full list of public feature submissions click [here](https://docs.google.com/spreadsheets/d/1TzGkWuUMF1yTe88adz1dt2mzbIsZLd3PBasy588VWgk/edit?usp=sharing). - - -## Beta Features - -### [Beta] FlashAttention-2 support in _torch.nn.functional.scaled_dot_product_attention_ - -_[torch.nn.functional.scaled_dot_product_attention](https://pytorch.org/docs/2.2/generated/torch.nn.functional.scaled_dot_product_attention.html)_ (SDPA) now supports FlashAttention-2, yielding around 2x speedups (compared to the previous version) and reaching ~50-73% of theoretical maximum FLOPs/s on A100 GPUs. - -More information on FlashAttention-2 is available in [this paper](https://arxiv.org/abs/2307.08691). - -For a tutorial on how to use SDPA, please see [this tutorial](https://pytorch.org/tutorials/intermediate/scaled_dot_product_attention_tutorial.html). - -### [Beta] AOTInductor: ahead-of-time compilation and deployment for torch.export-ed programs - -AOTInductor is an extension of [TorchInductor](https://dev-discuss.pytorch.org/t/torchinductor-a-pytorch-native-compiler-with-define-by-run-ir-and-symbolic-shapes/747), designed to process exported PyTorch models, optimize them, and produce shared libraries as well as other relevant artifacts. These compiled artifacts can be deployed in non-Python environments, which are frequently employed for inference on the server side. Note that AOTInductor supports the same backends as Inductor, including CUDA, ROCm, and CPU. - -For more information, please see the [AOTInductor tutorial](https://pytorch.org/docs/main/torch.compiler_aot_inductor.html). - -### [Beta] Fine-grained configurable logging via TORCH_LOGS - -PyTorch now ships a standardized, configurable logging mechanism that can be used to analyze the status of various subsystems such as compilation and distributed operations. - -Logs can be enabled via the TORCH_LOGS environment variable. For example, to set the log level of TorchDynamo to logging.ERROR and the log level of TorchInductor to logging.DEBUG, pass _TORCH_LOGS="-dynamo,+inductor"_ to PyTorch. - -For more information, please see the logging [documentation](https://pytorch.org/docs/2.2/logging.html) and [tutorial](https://pytorch.org/tutorials/recipes/torch_logs.html). - -### [Beta] torch.distributed.device_mesh - -PyTorch 2.2 introduces a new abstraction for representing the ProcessGroups involved in distributed parallelisms called _torch.distributed.device_mesh_. This abstraction allows users to represent inter-node and intra-node process groups via an N-dimensional array where, for example, one dimension can represent data parallelism in FSDP while another could represent tensor parallelism within FSDP. - -For more information, see the [device_mesh tutorial](https://pytorch.org/tutorials/recipes/distributed_device_mesh.html). - -### [Beta] Improvements to _torch.compile_-ing Optimizers - -A number of improvements have been made to torch.compile-ing Optimizers, including reduced overhead and support for CUDA graphs. - -More technical details of the improvements are available on [dev-discuss](https://dev-discuss.pytorch.org/t/compiling-the-optimizer-with-pt2/1669), and a recipe for _torch.compile_-ing optimizers is available [here](https://pytorch.org/tutorials/recipes/compiling_optimizer.html).
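As a rough sketch of the pattern described in the recipe linked above (the model, optimizer and shapes below are placeholders, not part of the release notes):

```python
import torch

device = "cuda" if torch.cuda.is_available() else "cpu"
model = torch.nn.Linear(1024, 1024, device=device)
opt = torch.optim.AdamW(model.parameters(), lr=1e-3)

@torch.compile(fullgraph=False)
def opt_step():
    # only the optimizer step is compiled here
    opt.step()

for _ in range(10):
    loss = model(torch.randn(32, 1024, device=device)).sum()
    loss.backward()
    opt_step()
    opt.zero_grad()
```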
- - -## Performance Improvements - -### Inductor Performance Optimizations - -A number of performance optimizations have been added to TorchInductor including [horizontal fusion support for torch.concat](https://github.com/pytorch/pytorch/pull/111437), [improved convolution layout optimizations](https://github.com/pytorch/pytorch/pull/114600), and improved _scaled_dot_product_attention_ [pattern](https://github.com/pytorch/pytorch/pull/109156) [matching](https://github.com/pytorch/pytorch/pull/110001). - -For a complete list of inductor optimizations, please see the [Release Notes](https://github.com/pytorch/pytorch/tree/v2.2.0). - -### aarch64 Performance Optimizations - -PyTorch 2.2 includes a number of performance enhancements for aarch64 including support for [mkldnn weight pre-packing](https://github.com/pytorch/pytorch/pull/115037/files), improved [ideep](https://github.com/intel/ideep) [primitive caching](https://github.com/intel/ideep/pull/261), and improved inference speed via [fixed format kernel improvements](https://github.com/oneapi-src/oneDNN/pull/1590) to [OneDNN](https://github.com/oneapi-src/oneDNN/). - -For a complete list of aarch64 optimizations, please see the [Release Notes](https://github.com/pytorch/pytorch/tree/v2.2.0). \ No newline at end of file diff --git a/_posts/2024-02-01-new-in-docs.md b/_posts/2024-02-01-new-in-docs.md deleted file mode 100644 index fd272b0ef635..000000000000 --- a/_posts/2024-02-01-new-in-docs.md +++ /dev/null @@ -1,25 +0,0 @@ ---- -layout: blog_detail -title: "What's New in PyTorch Documentation" ---- - -Greetings to the PyTorch community! Here is a quick update on PyTorch docs. - -In November 2023, we successfully conducted a [PyTorch Docathon](https://pytorch.org/blog/pytorch-docathon-h2-2023-wrap/), a community event where PyTorch community members gathered together to improve PyTorch documentation and tutorials. This event saw a global participation of contributors who dedicated their time and effort to enhance our docs. We extend our sincere gratitude to everyone involved. - -A key accomplishment of the Docathon was the comprehensive work carried out on docstrings. Our community contributors meticulously reviewed and improved the docstrings based on the provided tasks. - -In addition to that, we've added three new tutorials that showcase real-world applications of PyTorch. We are particularly proud that two of these tutorials were contributed by PyTorch ecosystem partners. - -Here is the new tutorials for you to explore: - -* [Whole Slide Image Classification Using PyTorch and TIAToolbox](https://pytorch.org/tutorials/intermediate/tiatoolbox_tutorial.html) —This tutorial demonstrates how to classify Whole Slide Images (WSIs) using PyTorch deep learning models with TIAToolbox, which are images of human tissue samples used by pathologists and researchers to study diseases like cancer at the microscopic level. -* [Semi-Supervised Learning using USB built upon PyTorch](https://pytorch.org/tutorials/advanced/usb_semisup_learn.html) – This tutorial introduces USB, a flexible and modular semi-supervised learning framework based on PyTorch, demonstrating its ease of use in training a FreeMatch/SoftMatch model on CIFAR-10 using pre-trained ViT and its adaptability to various algorithms and imbalanced datasets. 
-* [Deploying a PyTorch Stable Diffusion model as a Vertex AI Endpoint](https://pytorch.org/tutorials/recipes/torchserve_vertexai_tutorial.html) – This tutorial provides a step-by-step guide on how to streamline the deployment of a PyTorch Stable Diffusion model (v1.5) using Vertex AI, a fully-managed machine learning platform, by creating a custom TorchServe handler, uploading model artifacts to Google Cloud Storage, creating a Vertex AI model with the model artifacts and a prebuilt PyTorch container image, and finally deploying the model onto an endpoint. - -We're planning more community events this year, so stay tuned! - -And finally, we just published new 2.2 PyTorch [documentation](https://pytorch.org/docs/) and [tutorials](https://pytorch.org/tutorials/). Check it out! - -Best regards, -The PyTorch Team \ No newline at end of file diff --git a/_posts/2024-02-06-pytorch-2-paper-tutorial.md b/_posts/2024-02-06-pytorch-2-paper-tutorial.md deleted file mode 100644 index 541a802c9af7..000000000000 --- a/_posts/2024-02-06-pytorch-2-paper-tutorial.md +++ /dev/null @@ -1,26 +0,0 @@ ---- -layout: blog_detail -title: "PyTorch 2 paper and tutorial @ ASPLOS 2024" ---- - -The PyTorch team is excited to share that our paper on PyTorch 2 has been accepted for presentation at the ACM International Conference on Architectural Support for Programming Languages and Operating Systems (ASPLOS), scheduled to take place from April 27 to May 1, 2024, in San Diego, CA, USA. - -The paper delves into the implementation of torch.compile and highlights the key technologies driving it, including TorchDynamo (graph capture), TorchInductor (backend compiler), and Dynamic Shape support. - -During the ASPLOS conference, we'll be conducting a tutorial on Saturday, April 27, focusing on the inner workings of PyTorch 2 and how systems researchers can leverage and build upon it. Stay tuned for more details as the event approaches – we look forward to your participation! - -A preview of the paper is attached below: - -Title: **PyTorch 2: Faster Machine Learning Through Dynamic Python Bytecode Transformation and Graph Compilation.** [**Full Paper PDF**](/assets/pytorch2-2.pdf) - -### Abstract -This paper introduces two extensions to the popular PyTorch machine learning framework, TorchDynamo and TorchInductor, which implement the torch.compile feature released in PyTorch 2. TorchDynamo is a Python-level just-in-time (JIT) compiler that enables graph compilation in PyTorch programs without sacrificing the flexibility of Python. It achieves this by dynamically modifying Python bytecode before execution and extracting sequences of PyTorch operations into an FX graph, which is then JIT compiled using one of many extensible backends. TorchInductor is the default compiler backend for TorchDynamo, which translates PyTorch programs into OpenAI's Triton for GPUs and C++ for CPUs. Results show that TorchDynamo is able to capture graphs more robustly than prior approaches while adding minimal overhead, and TorchInductor is able to provide a 2.27x inference and 1.41x training geometric mean speedup on an NVIDIA A100 GPU across 180+ real-world models, which outperforms six other compilers. These extensions provide a new way to apply optimizations through compilers in eager mode frameworks like PyTorch. 
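For readers who have not yet used the feature the paper describes, the user-facing entry point is a single wrapper around an ordinary Python function; the snippet below is an illustrative sketch, not code taken from the paper:

```python
import torch

def fn(x, y):
    # TorchDynamo captures this function into an FX graph by rewriting its bytecode;
    # TorchInductor then JIT-compiles the graph (Triton on GPU, C++ on CPU)
    return torch.nn.functional.gelu(x @ y) + x

compiled_fn = torch.compile(fn)

x, y = torch.randn(128, 128), torch.randn(128, 128)
out = compiled_fn(x, y)  # first call triggers compilation; later calls reuse the compiled code
```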
- - -### Authors - -Jason Ansel (Meta); Edward Yang (Meta); Horace He (Meta); Natalia Gimelshein (OpenAI); Animesh Jain (Meta); Michael Voznesensky (Meta); Bin Bao (Meta); Peter Bell (Quansight); David Berard (Meta); Evgeni Burovski Quansight; Geeta Chauhan (Meta); Anjali Chourdia (Meta); Will Constable (Meta); Alban Desmaison (Meta); Zachary DeVito (Meta); Elias Ellison (Meta); Will Feng (Meta); Jiong Gong (Intel); Michael Gschwind (Meta); Brian Hirsh (Meta); Sherlock Huang (Meta); Kshiteej Kalambarkar (Quansight); Laurent Kirsch (Meta); Michael Lazos (Meta); Mario Lezcano (Quansight); Yanbo Liang (Meta); Jason Liang (Meta); Yinghai Lu (Meta); CK Luk (Meta); Bert Maher (Meta); Yunjie Pan (University of Michigan); Christian Puhrsch (Meta); Matthias Reso (Meta); Mark Saroufim (Meta); Marcos Yukio Siraichi (Quansight); Helen Suk (Meta); Michael Suo (Meta); Phil Tillet (OpenAI); Eikan Wang (Intel); Xiaodong Wang (Meta); William Wen (Meta); Shunting Zhang (Meta); Xu Zhao (Meta); Keren Zhou (OpenAI & George Mason University); Richard Zou (Meta); Ajit Mathews (Meta); Gregory Chanan (Meta); Peng Wu (Meta); Soumith Chintala (Meta) - -### ASPLOS'24 - Full Day Tutorial Schedule - -Full schedule for the ASPLOS'24 PyTorch 2 Tutoral on Saturday, April 27th is available [here](https://github.com/pytorch/workshops/tree/master/ASPLOS_2024) diff --git a/_posts/2024-03-13-maximizing-training.md b/_posts/2024-03-13-maximizing-training.md deleted file mode 100644 index f12d7216f958..000000000000 --- a/_posts/2024-03-13-maximizing-training.md +++ /dev/null @@ -1,470 +0,0 @@ ---- -layout: blog_detail -title: "Maximizing training throughput using PyTorch FSDP" -author: Team PyTorch at IBM and Team PyTorch at Meta ---- - -In this blog, we demonstrate the scalability of FSDP with a pre-training exemplar, a 7B model trained for 2T tokens, and share various techniques we used to achieve a rapid training speed of 3,700 tokens/sec/GPU, or 40B tokens/day on 128 A100 GPUs. This translates to a model FLOPS utilization (MFU) and hardware FLOPS utilization (HFU) of 57%. Additionally, we have observed near linear scaling of FSDP to 512 GPUs, implying that training a 7B model on 512 GPUs to 2T tokens using this method would take just under two weeks. - -IBM researchers trained a Meta Llama 2 7B architecture to 2T tokens, which we will refer to as LlamaT(est). This model demonstrates comparable model quality as Llama 2 on various academic benchmarks. All of the [training code](https://github.com/foundation-model-stack/fms-fsdp), along with our methodology to achieve this throughput, can be found in this blog. We also share the configuration knobs that work well for the Llama 2 models – 7B, 13B, 34B, and 70B for A100s and H100s. - -In this process, we also propose a _new _selective activation checkpointing mechanism that applies to FSDP which gives us a 10% boost beyond out-of-the box FSDP. We have open sourced the [training code base](https://github.com/foundation-model-stack/fms-fsdp) and an associated scalable data loader as the methodology to achieve this throughput. - -One key benefit of a PyTorch native pathway for training is the ability to seamlessly train on multiple hardware backends. For example, the recent end-to-end stack for training that was released by AllenAI through OLMo also leverages PyTorch FSDP for training on AMD and NVIDIA GPUs. There are three main components that we leverage from FSDP to achieve our throughput: - -1. 
[SDPA Flash attention](https://pytorch.org/tutorials/intermediate/scaled_dot_product_attention_tutorial.html), that enables fused attention kernels and efficient attention computation -2. [Overlap](https://engineering.fb.com/2021/07/15/open-source/fsdp/) in computation and communication allows for better utilization of the GPU -3. [Selective activation checkpointing](https://arxiv.org/pdf/2205.05198.pdf) enables us to tradeoff between GPU memory and compute - -IBM has been working closely with Team PyTorch at Meta on [PyTorch FSDP](https://arxiv.org/abs/2304.11277) for nearly two years: introducing the [rate limiter](https://pytorch.org/blog/scaling-pytorch-fsdp-for-training-foundation-models-on-ibm-cloud/) for achieving better throughput on Ethernet interconnects, [distributed checkpointing](https://pytorch.org/blog/performant-distributed-checkpointing/) to improve the checkpoint times by an order of magnitude, and implementing the early version of checkpointing for the hybrid sharding mode of FSDP. Late last year, we used FSDP to train a model end-to-end. - - -## Training Details - -The 7B model is trained on 128 A100 GPUs with 400Gbps network connectivity and GPU direct RDMA. We use SDPA FlashAttention v2 for attention computation, and for this model we turned off activation checkpointing that limits the batch size, but provides the highest throughput – batch size is 1 million tokens per batch for 128 GPUs and improves throughput by about 10% when compared to activation checkpointing. With these parameters, we have an almost full overlap in computation and communication. We use the AdamW optimizer in 32-bit with beta1 of 0.9 and beta2 of 0.95, weight decay of 0.1, and a learning rate ending at 3e-5 with a warmup to max learning rate of 3e-4 and a cosine schedule to reduce to 3e-5 over 2T tokens. The training was performed using mixed precision bf16 on an internal dataset. The training stack is using IBM’s [Foundation Model Stack](https://github.com/foundation-model-stack/foundation-model-stack/blob/main/fms/models/llama.py) for model architecture and PyTorch nightlies post-2.2 release for FSDP and SDPA. We tried a few different nightlies during the time period of Nov 2023 through Feb 2024 and we observed an improvement in the throughput. - - -### Selective activation checkpointing - -We jointly implemented a simple and effective mechanism of selective activation checkpointing (AC). In FSDP, the common practice is to checkpoint each transformer block. A simple extension is to checkpoint every _n _blocks and reduce the amount of recomputation, while increasing the memory needed. This is quite effective for the 13B model size, increasing the throughput by 10%. For the 7B model size, we did not need activation checkpointing at all. Future versions of FSDP will provide selective activation checkpointing at an operator level, enabling an optimal compute-memory tradeoff. The code for the above is implemented [here](https://github.com/foundation-model-stack/fms-fsdp/blob/main/fms_fsdp/policies/ac_handler.py). - - -### Throughput and MFU, HFU computation - -While we only trained the 7B model to 2T tokens, we performed numerous experiments on the other model sizes to provide the best configuration options. This is summarized in the table below for two types of infrastructure — an A100 cluster with 128 GPUs and 400Gbps inter-node interconnect, and an H100 cluster with 96 GPUs and 800Gbps inter-node interconnect. 
| Model size | Batch size | Activation checkpoint | Throughput tokens/sec/GPU (A100 80GB and 400Gbps interconnect) | MFU % (A100 80GB) | HFU % (A100 80GB) | Throughput tokens/sec/GPU (H100 80GB and 800Gbps interconnect) | MFU % (H100 80GB) | HFU % (H100 80GB) |
|---|---|---|---|---|---|---|---|---|
| 7B | 2 | No | 3700 | 0.57 | 0.57 | 7500 | 0.37 | 0.37 |
| 13B | 2 | Selective | 1800 | 0.51 | 0.59 | 3800 | 0.35 | 0.40 |
| 34B | 2 | Yes | 700 | 0.47 | 0.64 | 1550 | 0.32 | 0.44 |
| 70B | 2 | Yes | 370 | 0.50 | 0.67 | 800 | 0.34 | 0.45 |
        - -_Table 1: Model and Hardware FLOPS utilization of various model sizes on A100 and H100 GPUs_ - -HFU numbers are computed using the [PyTorch FLOP counter](https://github.com/pytorch/pytorch/blob/2240018c03744ee34ea14ad53481db934c37e384/torch/utils/flop_counter.py#L336) and the theoretical bf16 performance of A100 and H100 GPUs, whereas MFU numbers are computed using the methodology outlined in [NanoGPT](https://github.com/karpathy/nanoGPT) and the [PaLM paper](https://github.com/pytorch/pytorch/blob/2240018c03744ee34ea14ad53481db934c37e384/torch/utils/flop_counter.py#L336). We also note that the batch sizes we use for the larger models are intentionally kept at 2 per GPU to mimic choices made in training models of 4k sequence length and achieve this up to 512 GPUs without exceeding the 4M tokens popular batch size. Beyond that, we would need tensor parallelism or sequence parallelism. - -We note in the table above that for A100s, that activation recomputation causes the MFU to reduce, while HFU increases! With the introduction of better activation checkpointing schemes, we expect MFU to increase and catch up with HFU. However, we observe that for H100s, both MFU and HFU are relatively low. We analyze the PyTorch profile traces on H100 and observe that there is a 10% gap due to network “peeking” out. In addition, we hypothesize that the HBM bandwidth of H100s is the cause for the reduced HFU/MFU on H100s and not being able to obtain the 3x improvement (H100s are theoretically 3x faster than A100s - [312 vs 989TFLOPS](https://github.com/stas00/ml-engineering/tree/master/compute/accelerator#tflops-comparison-table), but only have <2x the HBM bandwidth than A100s - [2.0 vs 3.35TBps](https://github.com/stas00/ml-engineering/tree/master/compute/accelerator#accelerator-memory-size-and-speed)). We plan to try out other configuration options like Tensor Parallel to improve the knobs for the 70B model on H100s. - - -### Model details - -The loss curve for training is shown in the below figure. - - -![loss curve for training](/assets/images/maximizing-training/loss_curve.png){:style="width:100%;display: block; max-width: 600px; margin-right: auto; margin-left: auto"} - - -_Figure 1: LlamaT training loss curve_ - -The 2T checkpoint is converted to Hugging Face format by a script that is provided in the repository and we then use [lm-evaluation-harness](https://github.com/EleutherAI/lm-evaluation-harness) to compute key academic benchmarks and compare that by running it on Llama2-7B. These results are captured in the below table. - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
| Evaluation metric | Llama2-7B (baseline) | LlamaT-7B |
|---|---|---|
| MMLU (zero shot) | 0.41 | 0.43 |
| MMLU (5-shot weighted avg) | 0.47 | 0.50 |
| Arc challenge | 0.46 | 0.44 |
| Arc easy | 0.74 | 0.71 |
| Boolq | 0.78 | 0.76 |
| Copa | 0.87 | 0.83 |
| Hellaswag | 0.76 | 0.74 |
| Openbookqa | 0.44 | 0.42 |
| Piqa | 0.79 | 0.79 |
| Sciq | 0.91 | 0.91 |
| Winogrande | 0.69 | 0.67 |
| Truthfulqa | 0.39 | 0.39 |
| GSM8k (8-shot) | 0.13 | 0.11 |
- - -_Table 2: LM eval harness scores_ - -We observe that the model performs competitively with Llama2 (higher is better). - - -### Training chronicles - -Training was stable with no crashes, though we did observe a few hiccups: - -**0-200B tokens**: We observed a slowdown in the iteration time (time taken to execute one training step). We stopped the job to ensure that the data loader was not causing any slowdowns and that the checkpointing was performant and accurate. We did not find any issues. By this time, HSDP checkpointing code was available in PyTorch, and we took this opportunity to make the switch to PyTorch checkpointing code. - -**200B tokens-1.9T**: We did not do any manual intervention in the job in late December. When we came back in early January, disk space had been exhausted and checkpoints were failing to be written, although the training job continued. The last known checkpoint was at 1.5T tokens. - -**1.5T-1.7T**: We evaluated the 1.5T checkpoint with lm-evaluation-harness and discovered that the model had been trained with an extra special token between two documents, because the Hugging Face tokenizer introduced a separator token and our dataloader also appended its own document separator. We modified the dataloader to eliminate the extra special token, and continued training with the modified dataloader from 1.7T tokens onwards. - -**1.7T-2T**: The loss initially spiked due to the change in the special tokens, but recovered within a few billion tokens. The training finished without any other manual intervention! - - -### Key takeaways and even more speed - -We demonstrated how one can use FSDP to train a model to 2T tokens at an excellent throughput of 3,700 tokens/sec/GPU while producing a good quality model. As part of this exercise, we open sourced all our code for training and the knobs to achieve this throughput. These knobs can be leveraged not only by large-scale runs, but also by smaller-scale tuning runs. You can find the code [here](https://github.com/foundation-model-stack/fms-fsdp). - -FSDP APIs implement the [ZeRO](https://pytorch.org/docs/stable/fsdp.html) algorithms in a PyTorch native manner and allow for tuning and training of large models. In the past, we have seen FSDP proof points ([Stanford Alpaca](https://github.com/tatsu-lab/stanford_alpaca), [Hugging Face](https://huggingface.co/blog/ram-efficient-pytorch-fsdp), [Llama 2 recipes](https://github.com/facebookresearch/llama-recipes)) on tuning a variety of LLMs (such as Meta Llama 2 7B to 70B) using simple training loops and achieving good throughputs and training times. - -Finally, we note that there are several levers for speeding up training: - -1. Node optimizations that can speed up specific operations (e.g., attention computation using Flash Attention V2) -2. Graph optimizations (e.g., fusing kernels, torch.compile) -3. Overlap in compute-communications -4. Activation recomputation - -We have leveraged 1, 3, and a variation of 4 in this blog and are working closely with Team PyTorch at Meta to get torch.compile (2) as well as a more advanced version of 4 with per-operator selective activation recomputation. We plan to share simple formatting code and example data to ingest into our data loader to enable others to use the code base for training of models. - - -## Acknowledgements - -There are several teams that have been involved in reaching this proof point and we would like to thank the teams across Meta and IBM.
Specifically, we extend our gratitude to the PyTorch distributed team, Facebook Research and Applied AI teams that built the [FSDP APIs](https://arxiv.org/abs/2304.11277) and made enhancements based on our feedback. We also wish to thank the data team at IBM Research that curated the data corpus used in this exercise and the infrastructure team at IBM Research (especially, Claudia Misale, Shweta Salaria, and Seetharami Seelam) that optimized NCCL and network configurations. By building and leveraging all of these components, we have successfully demonstrated the LlamaT proof point. - -The selective activation checkpointing was conceptualized at IBM by Linsong Chu, Davis Wertheimer, Mudhakar Srivatsa, and Raghu Ganti and implemented by Less Wright at Meta. - -Special thanks to [Stas Bekman](https://www.linkedin.com/in/stasbekman/?originalSubdomain=ca) and [Minjia Zhang](https://minjiazhang.github.io/), who provided extensive feedback and helped improve the blog. Their insights have been invaluable in highlighting key aspects of optimizing the training and exploring further enhancements. - - -## Appendix - - -### Communication computation overlap - -Another key aspect of training in a multi-node setting is the ability to overlap communication and computation. In FSDP, there are multiple opportunities for overlapping – during the FSDP unit gathering phase at forward pass as well as the backward pass computation. Overlapping the gather during forward pass while the computation of the previous unit and overlapping backward computation with the next unit gathering and gradient scattering help improve GPU utilization by nearly 2x. We illustrate this on the 400Gbps network interconnect with A100 80GB GPUs. In the case of HSDP, there is no inter-node traffic during the pre-fetch stage for forward pass and the overlap is only for the backward gradient computation phase. Of course, HSDP is feasible only when the model can be sharded within a single node, limiting the size of models to around 30B parameters. - -The below figure shows three steps in FSDP with the communication between nodes at the bottom and the compute stream at the top of the second half of the image. For the 7B model with no activation recomputation, we observe the overlap to be complete. In practice, the overlap percentage possible is 90% since the first block during forward pass and the last block during backward pass are not able to overlap. - -![three steps in FSDP with the communication between nodes at the bottom and the compute stream at the top of the second half](/assets/images/maximizing-training/overlap_zoomed_out.png){:style="width:100%;"} - - -A zoomed in view of the above three-step process is shown below for a single step. We can clearly see the granularity of the computation and communication and how they overlap in an interleaved manner. 
- -![zoomed in view of the above three-step process](/assets/images/maximizing-training/overlap_zoomed_in.png){:style="width:100%;"} diff --git a/_posts/2024-04-04-accelerating-moe-model.md b/_posts/2024-04-04-accelerating-moe-model.md deleted file mode 100644 index 182ebd9e4938..000000000000 --- a/_posts/2024-04-04-accelerating-moe-model.md +++ /dev/null @@ -1,131 +0,0 @@ ---- -layout: blog_detail -title: "Accelerating MoE model inference with Locality-Aware Kernel Design" -author: Adnan Hoque, Less Wright, Antoni Virós Martin, Chih-Chieh Yang ---- - -## 1.0 Summary - -We show that by implementing column-major scheduling to improve data locality, we can accelerate the core Triton GEMM (General Matrix-Matrix Multiply) kernel for MoEs (Mixture of Experts) up to 4x on A100, and up to 4.4x on H100 Nvidia GPUs. This post demonstrates several different work decomposition and scheduling algorithms for MoE GEMMs and shows, at the hardware level, why column-major scheduling produces the highest speedup. - -Repo and code available at: [https://github.com/pytorch-labs/applied-ai/tree/main/kernels/triton/inference/col_major_moe_gemm](https://github.com/pytorch-labs/applied-ai/tree/main/kernels/triton/inference/col_major_moe_gemm). - - -![Figure 1A. Optimized Fused MoE GEMM Kernel TFLOPs on A100 for varying Batch Sizes M](/assets/images/accelerating-moe-model/fig-7.png){:style="width:100%;display: block; max-width: 600px; margin-right: auto; margin-left: auto"} - -_Figure 1A. Optimized Fused MoE GEMM Kernel TFLOPs on **A100** for varying Batch Sizes M_ - - -![Figure 1B. Optimized Fused MoE GEMM Kernel TFLOPs on H100 for varying Batch Sizes M](/assets/images/accelerating-moe-model/fig-8.png){:style="width:100%;display: block; max-width: 600px; margin-right: auto; margin-left: auto; margin-top: 40px;"} - -_Figure 1B. Optimized Fused MoE GEMM Kernel TFLOPs on **H100** for varying Batch Sizes M_ - -## 2.0 Background - -[OpenAI’s Triton](https://github.com/openai/triton) is a hardware-agnostic language and compiler that as our prior [blog post](https://pytorch.org/blog/accelerating-triton/) has shown can be used to accelerate quantization workflows. We also showed that in terms of kernel development, much of the same learnings and performance analysis tools from CUDA can be leveraged to provide similar insights into how Triton kernels work under-the-hood and subsequent measures to speedup these kernels in latency sensitive environments. As Triton becomes increasingly adopted in production settings, it is important that developers understand the common tips and tricks to developing performant kernels as well as the generality of these methods to various different architectures and workflows. Thus, this post will explore how we optimized the Triton kernel developed by [vLLM ](https://github.com/vllm-project/vllm)for the popular Mixture of Experts (MoE) Mixtral model using classical techniques and how these techniques can be implemented in Triton to achieve performance gain. - -[Mixtral 8x7B](https://arxiv.org/abs/2401.04088) is a sparse Mixture of Experts Language Model. Unlike the classical dense transformer architecture, each transformer block houses 8 MLP layers where each MLP is an ‘expert’. As a token flows through, a router network selects which 2 of the 8 experts should process that token and the results are then combined. The selected experts for the same token vary at each layer. As a result, while Mixtral 8x7B has a total of 47B params, during inference only 13B params are active. 
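A simplified sketch of this top-2 routing step is shown below (shapes and names are illustrative and not taken from the vLLM kernel):

```python
import torch

num_experts, top_k, hidden = 8, 2, 4096
tokens = torch.randn(16, hidden)                      # 16 tokens in the batch
router = torch.nn.Linear(hidden, num_experts, bias=False)

logits = router(tokens)                               # [16, 8] router scores
weights, expert_idx = torch.topk(logits.softmax(dim=-1), k=top_k, dim=-1)
weights = weights / weights.sum(dim=-1, keepdim=True) # renormalize the top-2 weights

# expert_idx plays the role of the mapping array that the fused MoE GEMM uses to
# gather the right expert weight slices from the stacked weight matrix per token.
```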
- - -The MoE GEMM (General Matrix-Matrix Multiply) kernel receives a stacked weight matrix containing all the experts, and must subsequently route each token to the TopK (2 for Mixtral) experts by utilizing a mapping array produced by the resultant scores of the router network. In this post, we provide methods to efficiently parallelize this computation during inference time, specifically during autoregression (or decoding stages). - - -## 3.0 Work Decomposition - SplitK - -We have previously shown that for the matrix problem sizes found in LLM inference, specifically in the context of W4A16 quantized inference, GEMM kernels can be accelerated by applying a [SplitK work decomposition](https://arxiv.org/abs/2402.00025). Thus, we started our MoE acceleration research by implementing SplitK in the [vLLM MoE Kernel](https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/layers/fused_moe/fused_moe.py), which produced speedups of approximately 18-20% over the Data Parallel approach. - -This result shows that the SplitK optimization can be used as a part of a more formulaic approach to improving/developing Triton kernels in inference settings. To build intuition about these different work decompositions, let’s consider a simple example for the multiplication of two 4x4 matrices and SplitK=2. - -In the data parallel GEMM kernel shown below, the computation for a single block of the output matrix will be handled by 1 threadblock, TB0. - - - -![Figure 2. Data Parallel GEMM](/assets/images/accelerating-moe-model/fig-1.gif){:style="width:100%;display: block; max-width: 600px; margin-right: auto; margin-left: auto"} - -_Figure 2. Data Parallel GEMM_ - -In contrast, in the SplitK kernel, the work required to compute 1 block in the output matrix, is “split” or shared amongst 2 thread blocks TB0 and TB1. This provides better load balancing and increased parallelism. - - - -![Figure 3. SplitK GEMM](/assets/images/accelerating-moe-model/fig.gif){:style="width:100%;display: block; max-width: 600px; margin-right: auto; margin-left: auto"} - -_Figure 3. SplitK GEMM_ - -The key idea is that we’ve increased our parallelism from M*N to M*N*SplitK. This approach does incur some costs such as adding inter-threadblock communication via atomic operations. However, these costs are minimal compared to the savings of other constrained GPU resources like shared memory and registers. Most importantly, the SplitK strategy provides superior load balancing characteristics for skinny matrices, (as is the case in MoE inference) and is the common matrix profile during decoding and inference. - -## 4.0 GEMM Hardware Scheduling - Column Major - -To improve upon the ~20% speedup with SplitK we focused our investigation on the logic that controls the hardware scheduling of the GEMM in Triton Kernels. Our profiling of the vLLM MoE kernel showed a low L2 cache hit rate, thus we investigated three scheduling options - column-major, row-major and grouped launch. Due to some intrinsic properties of MoE models, such as large expert matrices, and having to dynamically load TopK (2 for Mixtral) matrices during the duration of the kernel, cache reuse/hit rate becomes a bottleneck that this optimization will target. - - -For background, in our previous [blog](https://pytorch.org/blog/accelerating-triton/), we touched on the concept of “tile swizzling”, a method to achieve greater L2 cache hit rate. This concept relates to how the software _schedules_ the GEMM onto the SMs of a GPU. 
In Triton, this schedule is determined by the pid_m and pid_n calculations. Our key insight is that for skinny matrix multiplications, a column-major ordering ensures optimal reuse of the columns of the weight matrix, B. To illustrate this, let’s take a look at a snippet of what a column major computation of pid_m, and pid_n would look like: - - - -![Figure 4. Column Major ordering in PyTorch](/assets/images/accelerating-moe-model/fig-6.png){:style="width:100%;display: block; max-width: 500px; margin-right: auto; margin-left: auto"} - -_Figure 4. Column Major ordering in PyTorch_ - -From above, we note that with this mapping, we schedule the GEMM such that we calculate the output blocks of C in the following order: C(0, 0), C(1, 0), C(2, 0),… etc. To understand the implications we provide the following illustration: - - - -![Activation matrix / Weight matrix](/assets/images/accelerating-moe-model/fig-5.png){:style="width:100%;display: block; max-width: 500px; margin-right: auto; margin-left: auto"} - - - -![L1/L2 Cache](/assets/images/accelerating-moe-model/fig-4.png){:style="width:100%;display: block; max-width: 300px; margin-right: auto; margin-left: auto"} - - - -![C - Output Matrix](/assets/images/accelerating-moe-model/fig-3.png){:style="width:100%;display: block; max-width: 300px; margin-right: auto; margin-left: auto"} - -_Figure 5. Cache Reuse Pattern for a Column-Major GEMM Schedule_ - -In the above simplified view of a column-major schedule, let’s assume for a GEMM with skinny activation matrix A, that the entire matrix can fit in the GPU cache which is a reasonable assumption to make for the type of problem sizes we encounter in MoE inference. This allows for maximal _reuse_ of the columns of the weight matrix B, due to the fact that the B column can be re-used for the corresponding output tile calculations, C(0,0), C(1, 0) and C(2, 0). Consider instead, a row-major schedule, C(0,0), C(0,1), C(0, 2) etc. We would have to evict the column of B, and issue multiple load instructions to DRAM to calculate the same amount of output blocks. - -An important design consideration when optimizing kernels is a memory access pattern that results in the least amount of global load instructions. This optimal memory access pattern is achieved with the column-major schedule. The results below showcase the performance of the three schedules we investigated: - - - -![Figure 6. Comparison of GEMM Schedules on A100 for varying Batch Sizes M](/assets/images/accelerating-moe-model/fig-2.png){:style="width:100%;display: block; max-width: 600px; margin-right: auto; margin-left: auto"} - - -_Figure 6. Comparison of GEMM Schedules on A100 for varying Batch Sizes M_ - -The column-major schedule provides up to a 4x speedup over the other patterns, and as we’ll show in the next section, provides an optimal memory access pattern due to greatly improved data locality. - -## 5.0 Nsight Compute Analysis - Throughput and Memory Access Pattern - -For performance analysis, we focus on the **M = 2** case for the H100. A similar study can be done for the A100 as many of the same observations carry over. We note the following salient results, that showcase the impact of our optimizations. - - -![Figure 7. H100 Memory Throughput Chart for M = 2. Note the very large increase in the cache hit rates L1 cache hit rate (+2696%) and L2 cache hit rate (+254%).](/assets/images/accelerating-moe-model/fig-1.png){:style="width:100%;"} - -_Figure 7. H100 Memory Throughput Chart for M = 2. 
Note the very large increase in the cache hit rates L1 cache hit rate (+2696%) and L2 cache hit rate (+254%)._ - - - -![Figure 8. H100 Memory Instruction Statistics M = 2. Note the 49% reduction in global memory loads.](/assets/images/accelerating-moe-model/fig.png){:style="width:100%;margin-top: 40px;"} - -_Figure 8. H100 Memory Instruction Statistics M = 2. Note the 49% reduction in global memory loads._ - -These statistics show that our optimizations had the intended effect, which can be seen in the reduced cache misses, reduced memory accesses and the resultant 2.7x speedup. More concretely, the trace shows us a 2.54x increase in L2 hit rate (Figure 7), and a ~50% reduction in DRAM accesses (Figure 8). - -These improvements ultimately yield the reduced latency, with the optimized kernel being 2.7x faster for bs=2 and 4.4x for bs=512. - -## 6.0 Future Work - -Our kernel was tested in FP16, which showcases the numerics and performance of the column major scheduling for MoE, but most production models are using BFloat16. We encountered a limitation in Triton such that tl.atomic_add does not support Bfloat16 and hit launch latency concerns which would require cuda graph support for column major production use. In initial testing this translated to a 70% end-to-end speedup but, we encountered some expert mapping inconsistencies in an end to end environment that are not reflected in the test environment, so further work is needed to fully realize these speedups. \ - - -For future work, we intend to move this into a CUDA kernel which will ensure full BFloat16 support and reduced launch latency relative to Triton, and potentially resolve the expert routing inconsistency. We’ve also previously [published work](https://arxiv.org/abs/2402.00025) on enabling GPTQ W4A16 with Triton GEMM kernels, so natural follow-on work would include fusing dequantization into this kernel to allow for a GPTQ quantized inference path. - -## 7.0 Reproducibility - -We have [open sourced](https://github.com/pytorch-labs/applied-ai/tree/main/kernels/triton/inference/col_major_moe_gemm) the Triton kernel code along with an easy to run performance benchmark for readers interested in comparing or verifying the performance on their own GPU. - -## Acknowledgements - -We want to thank Daniel Han, Raghu Ganti, Mudhakar Srivatsa, Bert Maher, Gregory Chanan, Eli Uriegas, and Geeta Chauhan for their review of the presented material and Woosuk from the vLLM team as we built on his implementation of the Fused MoE kernel. \ No newline at end of file diff --git a/_posts/2024-04-16-torchtune-fine-tune-llms.md b/_posts/2024-04-16-torchtune-fine-tune-llms.md deleted file mode 100644 index 256dc025b63a..000000000000 --- a/_posts/2024-04-16-torchtune-fine-tune-llms.md +++ /dev/null @@ -1,54 +0,0 @@ ---- -layout: blog_detail -title: "torchtune: Easily fine-tune LLMs using PyTorch" ---- - -We’re pleased to announce the alpha release of torchtune, a PyTorch-native library for easily fine-tuning large language models. - -Staying true to PyTorch's design principles, torchtune provides composable and modular building blocks along with easy-to-extend training recipes to fine-tune popular LLMs on a variety of consumer-grade and professional GPUs. - -torchtune supports the full fine-tuning workflow from start to finish, including - -* Downloading and preparing datasets and model checkpoints. 
-* Customizing the training with composable building blocks that support different model architectures, parameter-efficient fine-tuning (PEFT) techniques, and more. -* Logging progress and metrics to gain insight into the training process. -* Quantizing the model post-tuning. -* Evaluating the fine-tuned model on popular benchmarks. -* Running local inference for testing fine-tuned models. -* Checkpoint compatibility with popular production inference systems. - -To get started, jump right into the [code](https://www.github.com/pytorch/torchtune) or walk through our many [tutorials](https://pytorch.org/torchtune/main/)! - - -## Why torchtune? - -Over the past year there has been an explosion of interest in open LLMs. Fine-tuning these state of the art models has emerged as a critical technique for adapting them to specific use cases. This adaptation can require extensive customization from dataset and model selection all the way through to quantization, evaluation and inference. Moreover, the size of these models poses a significant challenge when trying to fine-tune them on consumer-level GPUs with limited memory. - -Existing solutions make it hard to add these customizations or optimizations by hiding the necessary pieces behind layers of abstractions. It’s unclear how different components interact with each other and which of these need to be updated to add new functionality. torchtune empowers developers to adapt LLMs to their specific needs and constraints with full control and visibility. - - -## torchtune’s Design - -torchtune was built with the following principles in mind - -* **Easy extensibility** - New techniques emerge all the time and everyone’s fine-tuning use case is different. torchtune’s recipes are designed around easily composable components and hackable training loops, with minimal abstraction getting in the way of fine-tuning your fine-tuning. Each [recipe](https://github.com/pytorch/torchtune/tree/main/recipes) is self-contained - no trainers or frameworks, and is designed to be easy to read - less than 600 lines of code! -* **Democratize fine-tuning** - Users, regardless of their level of expertise, should be able to use torchtune. Clone and modify configs, or get your hands dirty with some code! You also don’t need beefy data center GPUs. Our memory efficient recipes have been tested on machines with a single 24GB gaming GPU. -* **Interoperability with the OSS LLM ecosystem** - The open source LLM ecosystem is absolutely thriving, and torchtune takes advantage of this to provide interoperability with a wide range of offerings. This flexibility puts you firmly in control of how you train and use your fine-tuned models. - -Over the next year, open LLMs will become even more powerful, with support for more languages (multilingual), more modalities (multimodal) and more tasks. As the complexity of these models increases, we need to pay the same attention to “how” we design our libraries as we do to the features provided or performance of a training run. Flexibility will be key to ensuring the community can maintain the current pace of innovation, and many libraries/tools will need to play well with each other to power the full spectrum of use cases. torchtune is built from the ground up with this future in mind. - -In the true PyTorch spirit, torchtune makes it easy to get started by providing integrations with some of the most popular tools for working with LLMs. 
- - - -* **[Hugging Face Hub](https://huggingface.co/docs/hub/en/index)** - Hugging Face provides an expansive repository of open source models and datasets for fine-tuning. torchtune seamlessly integrates through the `tune download` CLI command so you can get started right away with fine-tuning your first model. -* **[PyTorch FSDP](https://pytorch.org/tutorials/intermediate/FSDP_tutorial.html)** - Scale your training using PyTorch FSDP. It is very common for people to invest in machines with multiple consumer level cards like the 3090/4090 by NVidia. torchtune allows you to take advantage of these setups by providing distributed recipes powered by FSDP. -* **[Weights & Biases](https://wandb.ai/site)** - torchtune uses the Weights & Biases AI platform to log metrics and model checkpoints during training. Track your configs, metrics and models from your fine-tuning runs all in one place! -* **[EleutherAI’s LM Evaluation Harness](https://github.com/EleutherAI/lm-evaluation-harness)** - Evaluating fine-tuned models is critical to understanding whether fine-tuning is giving you the results you need. torchtune includes a simple evaluation recipe powered by EleutherAI’s LM Evaluation Harness to provide easy access to a comprehensive suite of standard LLM benchmarks. Given the importance of evaluation, we will be working with EleutherAI very closely in the next few months to build an even deeper and more “native” integration. -* **[ExecuTorch](https://pytorch.org/executorch-overview)** - Models fine-tuned with torchtune can be [easily exported](https://github.com/pytorch/executorch/tree/main/examples/models/llama2#optional-finetuning) to ExecuTorch, enabling efficient inference to be run on a wide variety of mobile and edge devices. -* **[torchao](https://github.com/pytorch-labs/ao)** - Easily and efficiently quantize your fine-tuned models into 4-bit or 8-bit using a simple [post-training recipe](https://github.com/pytorch/torchtune/blob/main/recipes/quantize.py) powered by the quantization APIs from torchao. - - -## What’s Next? - -This is just the beginning and we’re really excited to put this alpha version in front of a vibrant and energetic community. In the coming weeks, we’ll continue to augment the library with more models, features and fine-tuning techniques. We’d love to hear any feedback, comments or feature requests in the form of GitHub issues on our repository, or on our [Discord channel](https://discord.com/invite/4Xsdn8Rr9Q). As always, we’d love any contributions from this awesome community. Happy Tuning! \ No newline at end of file diff --git a/_posts/2024-04-24-pytorch2-3.md b/_posts/2024-04-24-pytorch2-3.md deleted file mode 100644 index ca7ddee0a4d2..000000000000 --- a/_posts/2024-04-24-pytorch2-3.md +++ /dev/null @@ -1,106 +0,0 @@ ---- -layout: blog_detail -title: "PyTorch 2.3 Release Blog" ---- - -We are excited to announce the release of PyTorch® 2.3 ([release note](https://github.com/pytorch/pytorch/releases/tag/v2.3.0))! PyTorch 2.3 offers support for user-defined Triton kernels in torch.compile, allowing for users to migrate their own Triton kernels from eager without experiencing performance regressions or graph breaks. Tensor Parallelism improves the experience for training Large Language Models using native PyTorch functions, which has been validated on training runs for 100B parameter models. As well, semi-structured sparsity implements semi-structured sparsity as a Tensor subclass, with observed speedups of up to 1.6 over dense matrix multiplication. 
- -This release is composed of 3393 commits and 426 contributors since PyTorch 2.2. We want to sincerely thank our dedicated community for your contributions. As always, we encourage you to try these out and report any issues as we improve 2.3. More information about how to get started with the PyTorch 2-series can be found at our [Getting Started](https://pytorch.org/get-started/pytorch-2.0/) page.

| Beta | Prototype | Performance Improvements |
|---|---|---|
| User-defined Triton kernels in torch.compile | torch.export adds new API to specify dynamic_shapes | Weight-Only-Quantization introduced into Inductor CPU backend |
| Tensor parallelism within PyTorch Distributed | Asynchronous checkpoint generation |  |
| Support for semi-structured sparsity |  |  |
        - - -*To see a full list of public feature submissions click [here](https://docs.google.com/spreadsheets/d/1TzGkWuUMF1yTe88adz1dt2mzbIsZLd3PBasy588VWgk/edit?usp=sharing). - - -## Beta Features - - -### [Beta] Support for User-defined Triton kernels in _torch.compile_ - -Allows for PyTorch code that contains triton kernels to be executed natively using torch.compile. This enables users to migrate code containing triton kernels from eager PyTorch to _torch.compile_ without running into performance regressions or graph breaks. Native support also creates an opportunity for Torch Inductor to precompile the user-defined Triton kernel as well as better organize code around the Triton kernel allowing for further optimizations. - -You can find more information about how to utilize user defined Triton kernels in torch.compile within [this tutorial](https://pytorch.org/tutorials/recipes/torch_compile_user_defined_triton_kernel_tutorial.html). - - -### [Beta] Tensor Parallelism introduces more efficient ways to train LLMs - -The Tensor Parallel API facilitates various tensor manipulations across GPUs/hosts and integrates with FSDP for 2D Parallelism (Tensor parallelism across devices + Data Parallelism across hosts). It also offers a low-level API for constructing higher-level Tensor parallel APIs. This API has been validated to support the training of transformer models with over 100 billion parameters. - -You can find more information on how to utilize this within your workflows within [this tutorial](https://pytorch.org/tutorials/intermediate/TP_tutorial.html). - - -### [Beta] Semi-structured sparsity provides users with a way to take advantage of accelerated sparse inference and memory savings - -_torch.sparse.SparseSemiStructuredTensor_ implements semi-structured sparsity as a Tensor subclass, which have observed speedups of up to 1.6 over dense matrix multiplication. - -In particular it adds: - - - -* Additional support for quantization composability (mixed dtype, dequant fusion) -* Updated cuSPARSELt and CUTLASS kernels -* torch.compile support - -You can find more information on how to take advantage of semi-structured sparsity [here](https://pytorch.org/tutorials/advanced/semi_structured_sparse.html). - - -## Prototype Features - - -### [PROTOTYPE] _torch.export_ adds new API to specify _dynamic_shapes_ - -You can now use _torch.export.Dim_ to better represent dynamic shapes by enabling developers to specify ranges (min and max values) that can be reused across different input dimensions that are constrained to be equal. - -To learn more about _torch.export.Dim_ as well as how it can be used to express more interesting relationships (such as linear arithmetic expressions) check out the tutorial [here](https://pytorch.org/tutorials/intermediate/torch_export_tutorial.html#constraints-dynamic-shapes). - - -### [PROTOTYPE] Asynchronous checkpoint generation - -Asynchronous checkpoint generation allows users to continue their training loops while checkpoints are being generated, essentially offloading much of the checkpointing cost. - -You can find out how to utilize this within your own workflows with this [example](https://github.com/pytorch/pytorch/blob/release/2.3/torch/distributed/checkpoint/examples/async_checkpointing_example.py). - - -## Performance Improvements - - -### [PROTOTYPE] Weight-Only-Quantization introduced into Inductor CPU backend - -PyTorch 2.3 enhances LLM inference performance on torch inductor CPU backend. 
The project [gpt-fast](https://github.com/pytorch-labs/gpt-fast) offers a simple and efficient PyTorch native acceleration for transformer text generation with _torch.compile_. Prior to 2.3 only CUDA devices were supported and this feature enables the CPU counterpart by providing highly optimized kernels for the int4 and int8 weight only quantization Linear. - -For more information / how to utilize this feature please refer to the [gpt-fast README](https://github.com/pytorch-labs/gpt-fast#quantization). \ No newline at end of file diff --git a/_posts/2024-04-30-executorch-alpha.md b/_posts/2024-04-30-executorch-alpha.md deleted file mode 100644 index 4c9a8649e7eb..000000000000 --- a/_posts/2024-04-30-executorch-alpha.md +++ /dev/null @@ -1,51 +0,0 @@ ---- -layout: blog_detail -title: "ExecuTorch Alpha: Taking LLMs and AI to the Edge with Our Community and Partners" ---- - -We are excited to announce the release of [ExecuTorch alpha](https://github.com/pytorch/executorch), focused on deploying large language models (LLMs) and large ML models to the edge, stabilizing the API surface, and improving our installation processes. It has been an exciting few months [from our 0.1 (preview) release](https://pytorch.org/blog/pytorch-edge/) in collaboration with our partners at Arm, Apple, and Qualcomm Technologies, Inc. - -In this post we’ll discuss our full support for Meta’s Llama 2, early support for Meta’s Llama 3, broad model support in ExecuTorch, and highlight the important work our partners have done to move us forward. - -## Large Language Models on Mobile - -Mobile devices are highly constrained for compute, memory, and power. To bring LLMs to these devices, we heavily leverage quantization and other techniques to pack these models appropriately. - -ExecuTorch alpha supports 4-bit post-training quantization using GPTQ. We've provided broad device support on CPU by landing dynamic shape support and new dtypes in XNNPack. We've also made significant improvements in export and lowering, reduced memory overhead and improved runtime performance. This enables running Llama 2 7B efficiently on iPhone 15 Pro, iPhone 15 Pro Max, Samsung Galaxy S22, S23, and S24 phones and other edge devices. [Early support](https://github.com/pytorch/executorch/releases/tag/v0.2.0) for [Llama 3 8B](https://ai.meta.com/blog/meta-llama-3/) is also included. We are always improving the token/sec on various edge devices and you can visit GitHub for the [latest performance numbers](https://github.com/pytorch/executorch/blob/main/examples/models/llama2/README.md). - -We're working closely with our partners at Apple, Arm, and Qualcomm Technologies to delegate to GPU and NPU for performance through Core ML, MPS, TOSA, and Qualcomm AI Stack backends respectively. - -## Supported Models - -We remain committed to supporting an ever-expanding list of models with ExecuTorch. Since preview, we have significantly expanded our tested models across NLP, vision and speech, with full details [in our release notes](https://github.com/pytorch/executorch/releases/tag/v0.2.0). Although support for on-device LLMs is early, we anticipate most traditional models to function seamlessly out of the box, with delegation to XNNPACK, Core ML, MPS, TOSA, and HTP for performance. If you encounter any problems please open [a GitHub issue](https://github.com/pytorch/executorch/issues) with us. 
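For readers who want a concrete picture of the ahead-of-time flow described above, the sketch below walks a tiny eager module through capture, lowering to the Edge dialect, and serialization to a `.pte` program. It is only an illustration based on the alpha-era documentation: `TinyModel` is a stand-in, module paths can shift between ExecuTorch releases, and backend delegation (for example to XNNPACK) is omitted for brevity.

```python
# Rough sketch of the ExecuTorch ahead-of-time export flow (alpha-era APIs;
# exact module paths may differ across releases). TinyModel is a placeholder.
import torch
from executorch.exir import to_edge

class TinyModel(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.linear = torch.nn.Linear(16, 4)

    def forward(self, x):
        return torch.relu(self.linear(x))

model = TinyModel().eval()
example_inputs = (torch.randn(1, 16),)

exported = torch.export.export(model, example_inputs)  # capture the graph
edge_program = to_edge(exported)                        # lower to the Edge dialect
et_program = edge_program.to_executorch()               # plan memory and serialize

with open("tiny_model.pte", "wb") as f:                 # program consumed by the on-device runtime
    f.write(et_program.buffer)
```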
- -## Productivity - -Deploying performant models tuned for specific platforms often require deep visualization into the on-device runtime data to determine the right changes to make in the original PyTorch model. With ExecuTorch alpha, we provide a powerful SDK with observability throughout the process from model authoring to deployment, including delegate and hardware-level information. - -The ExecuTorch SDK was enhanced to include better debugging and profiling tools. Because ExecuTorch is built on PyTorch, the debugging capabilities include the ability to map from operator nodes back to original Python source code for more efficient anomaly resolution and performance tuning for both delegated and non-delegated model instances. You can learn more about the ExecuTorch SDK [here](https://github.com/pytorch/executorch/blob/main/examples/sdk/README.md). - -## Partnerships - -ExecuTorch has only been possible because of strong collaborations across Arm, Apple, and Qualcomm Technologies. The collaboration for the initial launch of ExecuTorch continues as we support LLMs and large AI models on the edge for PyTorch. As we’ve seen with this early work for ExecuTorch alpha, there are unique challenges with these larger models and we’re excited to develop in the open. - -We also want to highlight the great partnership with Google on [XNNPACK](https://github.com/google/XNNPACK) for CPU performance. The teams continue to work together upstreaming our changes and across the TensorFlow and PyTorch teams to make sure we can all support generative AI models on the edge with SOTA performance. - -Lastly, our hardware partner MediaTek has been doing work enabling the Llama collection of models with ExecuTorch on their SoCs. We'll have more to share in the future. - -## Alpha and Production Usage - -With our alpha release, we have production-tested ExecuTorch. Meta is using ExecuTorch for hand tracking on Meta Quest 3 and a variety of models on Ray-Ban Meta Smart Glasses. In addition, we have begun the rollout of ExecuTorch with Instagram and are integrating with other Meta products. We are excited to see how ExecuTorch can be used for other edge experiences. - -## Community - -We are excited to see various efforts in the community to adopt or contribute to ExecuTorch. For instance, Unity recently [shared their work](https://schedule.gdconf.com/session/unity-developer-summit-drive-better-gameplay-experiences-on-user-devices-with-ai-presented-by-unity/903634) at the Game Developers Conference ([GDC](https://gdconf.com/)) on leveraging ExecuTorch and Edge IR to run PyTorch models with their neural network inference library Sentis. Leveraging ExecuTorch's hackability and extensibility, Unity introduced their own custom backend that serializes ExecuTorch’s Edge Dialect IR into Sentis’ native serialized format enabling developers to begin using PyTorch models easily in their games and apps. - -We’ve been building and innovating with ExecuTorch in the open. Our north star is to empower the community to deploy any ML model on edge devices painlessly and efficiently. Whether you are a hobbyist or this is your day job, we’d love for you to [jump in to bring your ML models to the edge](https://pytorch.org/executorch/stable/getting-started-setup.html). We are looking for your help to: - -1. Use ExecuTorch to [run your LLM models locally](https://github.com/pytorch/executorch/blob/main/docs/source/llm/getting-started.md) on various deployment targets and share your feedback -2. 
Expand our supported models, including bug reports -3. Expand our quantization schemes -4. Help us build out delegates to GPU and NPU - -To all individual contributors and early adopters of ExecuTorch, a big thank you as well. We can’t wait to have more of you [join us](https://github.com/pytorch/executorch)! \ No newline at end of file diff --git a/_posts/2024-05-01-accelerating-llama3.md b/_posts/2024-05-01-accelerating-llama3.md deleted file mode 100644 index acf2f0df829f..000000000000 --- a/_posts/2024-05-01-accelerating-llama3.md +++ /dev/null @@ -1,151 +0,0 @@ ---- -layout: blog_detail -title: "Accelerating Llama3 FP8 Inference with Triton Kernels" -author: Adnan Hoque, Less Wright, Chih Chieh Yang ---- - -## 1.0 Summary - -We present an optimized Triton FP8 GEMM (General Matrix-Matrix Multiply) kernel TK-GEMM, which leverages SplitK parallelization. For small batch size inference, TK-GEMM delivers up to **1.94x** over the base Triton matmul implementation, **1.87x** speedup over cuBLAS FP8 and **1.71x** over cuBLAS FP16 for Llama3-70B inference problem sizes on NVIDIA H100 GPUs. - - -![TK-GEMM Speedup over PyTorch (calling cuBLAS) for Llama3-70B Attention Layer Matrix Shapes (N=K=8192)](/assets/images/accelerating-llama3/fig1.png){:style="width:100%;"} - -**Figure 1.** TK-GEMM Speedup over PyTorch (calling cuBLAS) for Llama3-70B Attention Layer Matrix Shapes (N=K=8192) - -In this blog, we will cover how we designed an optimized kernel using [Triton](https://github.com/openai/triton) for FP8 inference and tuned it for Lama3-70B inference. We will cover FP8 (8-bit floating point), a new datatype supported by Hopper generation GPUs (SM90), the key SM90 features that Triton supports, and how we modified the parallelization to be able to maximize memory throughput for memory-bound (inference) problem sizes. - -We also dedicate a section on CUDA graphs, an important technology that will help materialize kernel level speedups and enable developers who want to use Triton kernels in production settings to get additional performance gain. - -Repo and code available at: [https://github.com/pytorch-labs/applied-ai](https://github.com/pytorch-labs/applied-ai) - -## 2.0 FP8 Datatype - -The FP8 datatype was [introduced](https://arxiv.org/pdf/2209.05433.pdf) jointly by Nvidia, Arm and Intel and serves as a successor to 16-bit floating point types. With half the bit count, it has the potential to provide significant throughput improvements over its predecessors for Transformer networks. The FP8 datatype consists of 2 formats: - -**E4M3** (4-bit exponent and 3-bit mantissa). Able to store +/ 448 and nan. -**E5M2** (5-bit exponent and 2-bit mantissa). Able to store +/- 57,334, nan and inf. - - - -![BF16, FP16, FP8 E4M3 and FP8 E5M2](/assets/images/accelerating-llama3/fig2.png){:style="width:100%;"} - -**Above:** _BF16, FP16, FP8 E4M3 and FP8 E5M2._ -_To show precision differences, the closest representation to 0.3952 is shown in each format._ -_Image Credit: [Nvidia](https://docs.nvidia.com/deeplearning/transformer-engine/user-guide/examples/fp8_primer.html)_ - -We use E4M3 in inference and forward pass training due its higher precision and E5M2 in training backward pass due to its higher dynamic range. Nvidia has designed their H100 FP8 Tensor Core to provide a peak of 3958 TFLOPS, **2x** the FLOPS of the FP16 Tensor Core. 
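Both formats are exposed as tensor dtypes in recent PyTorch releases, so the trade-off between range and precision is easy to inspect directly. The short sketch below is purely illustrative and assumes a PyTorch build that ships the FP8 dtypes:

```python
import torch

# Compare dynamic range across FP8 and 16-bit floating point formats.
for dtype in (torch.float8_e4m3fn, torch.float8_e5m2, torch.float16, torch.bfloat16):
    info = torch.finfo(dtype)
    print(f"{str(dtype):24} max={info.max:g}  smallest normal={info.tiny:g}")

# Casting shows the precision difference: 0.3952 has no exact FP8 representation,
# and E5M2 (2 mantissa bits) rounds more coarsely than E4M3 (3 mantissa bits).
x = torch.tensor(0.3952)
print(x.to(torch.float8_e4m3fn).float().item())
print(x.to(torch.float8_e5m2).float().item())
```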
- -We designed our Triton kernel with these hardware innovations in mind and in the rest of the blog we will discuss methods to leverage and verify that these features are indeed being utilized by the Triton compiler. - -## 3.0 Triton Hopper Support and FP8 Tensor Core Instruction - -The Hopper GPU architecture has added the following [new features](https://arxiv.org/abs/2402.13499) that we can expect will accelerate FP8 GEMM. - - - -* TMA (Tensor Memory Accelerator) Hardware Unit -* WGMMA (Warp Group Matrix Multiply-Accumulate Instruction) -* Threadblock Clusters - -Triton currently takes advantage of one of these features, the _wgmma_ instruction, whereas PyTorch (calling cuBLAS) leverages all 3 which makes these speedups even more impressive. To fully take advantage of the Hopper FP8 Tensor Core, the wgmma is necessary even though the older mma.sync instruction is still supported. - -The key difference between the mma and wgmma instructions is that instead of 1 CUDA warp being responsible for an output shard, an entire warp group, 4 CUDA warps, _asynchronously_ contributes to an output shard. - -To see what this instruction looks like in practice, and to verify that our Triton Kernel is indeed utilizing this feature we analyzed the PTX and SASS assembly using [nsight compute](https://developer.nvidia.com/nsight-compute). - - -![PTX Assembly](/assets/images/accelerating-llama3/fig3.png){:style="width:100%;display:block;max-width:600px;margin-left:auto;margin-right:auto;"} - -**Figure 2.** PTX Assembly - -This instruction is further lowered into a QGMMA instruction in SASS. - -![SASS Assembly](/assets/images/accelerating-llama3/fig4.png){:style="width:100%;display:block;max-width:600px;margin-left:auto;margin-right:auto;"} - -**Figure 3.** SASS Assembly - -Both instructions tell us that we are multiplying two FP8 E4M3 input tensors and accumulating in F32, which confirms that the TK-GEMM Kernel is utilizing the FP8 Tensor Core and the lowering is being done correctly. - -## 4.0 SplitK Work Decomposition - - -![TK-GEMM vs Base Triton GEMM TFLOPS for M = 1-64](/assets/images/accelerating-llama3/fig5.png){:style="width:100%;display:block;max-width:600px;margin-left:auto;margin-right:auto;"} - -**Figure 4.** TK-GEMM vs Base Triton GEMM TFLOPS for M = 1-64 - - -The base Triton FP8 GEMM implementation does [not perform](https://github.com/openai/triton/issues/3104) well for the small M regime, where for a matrix multiplication of A (_MxN_) x B (_NxK_), _M_ < _N_, _K_. To optimize for this type matrix profile we applied a SplitK work decomposition instead of the Data Parallel decomposition found in the base Triton kernel. This greatly improved latencies for the small M regime. - -For background, SplitK launches additional thread blocks along the k dimension to calculate partial output sums. The partial results from each thread block are then summed using an atomic reduction. This allows for finer grained work decomposition with resultant performance improvements. More details on SplitK are available in our [arxiv paper](https://arxiv.org/abs/2402.00025). - - -After carefully tuning the other relevant hyperparameters for our kernel such as tile sizes, number of warps and the number of pipeline stages to Llama3-70B problem sizes we were able to produce up to **1.94x** speedup over the Triton [base implementation](https://triton-lang.org/main/getting-started/tutorials/03-matrix-multiplication.html). 
For a more comprehensive introduction to hyperparameter tuning, see our [blog](https://pytorch.org/blog/accelerating-moe-model/#30-work-decomposition---splitk). - - -![NCU profiler times for TK-GEMM under varying batch sizes, and compared with PyTorch (calling cuBLAS) FP8 and FP16.](/assets/images/accelerating-llama3/fig6.png){:style="width:100%;"} - - -**Above**: _NCU profiler times for TK-GEMM under varying batch sizes, and compared with PyTorch (calling cuBLAS) FP8 and FP16._ - -Note that starting at M=32, the cuBLAS FP8 kernel starts to outperform TK-GEMM. For M >= 32, we suspect that hyperparameters we found are not optimal, and thus another set of experiments is required to determine the optimal parameters for the mid-sized M regime. - -## 5.0 CUDA Graphs to Enable End-to-End Speedup - -To be able to realize these speedups in an end-to-end setting, we must take into account both the kernel execution time (GPU duration) as well as the wall time (CPU+GPU) duration. Triton kernels, which are handwritten (as opposed to torch compile generated) are known to suffer from high-kernel launch latencies. If we use [torch profiler](https://pytorch.org/docs/stable/profiler.html) to trace the TK-GEMM kernel we can see the call stack on the CPU side to pinpoint exactly what is causing the slowdown. - - -![CPU Launch Overhead: 2.413ms](/assets/images/accelerating-llama3/fig7.png){:style="width:100%;"} - -**Figure 5.** CPU Launch Overhead: 2.413ms - -From above, we see that the majority of the wall time of our optimized kernel is dominated by JIT (Just-in-Time) compilation overhead. To combat this we can use CUDA graphs. - - -![CUDA Graphs Visualization](/assets/images/accelerating-llama3/fig8.png){:style="width:100%;"} - -**Figure 6.** CUDA Graphs Visualization -_Image Credit: [PyTorch](https://pytorch.org/blog/accelerating-pytorch-with-cuda-graphs/)_ - -The key idea is instead of multiple kernel launches, we instead can create and instantiate a graph (1 time cost) and then submit that instance of the graph for execution. To illustrate this point we simulate a Llama3-70B Attention layer, As shown in the below figure generated using [nsight systems](https://developer.nvidia.com/nsight-systems), the time between each GEMM is **_165us_** compared to the **_12us_** spent on the actual matmul due the CPU kernel launch overhead. This means that **_92%_** of the time of the time in an Attention layer the GPU is idle and not doing any work. - - -![Simulated Llama3-70B Attention Layer with TK-GEMM](/assets/images/accelerating-llama3/fig9.png){:style="width:100%;"} - -**Figure 7.** Simulated Llama3-70B Attention Layer with TK-GEMM - -To show the impact of CUDA graphs, we then created a graph of the TK-GEMM kernel in the toy Attention layer and replayed the graph. Below, we can see that the gaps between kernel executions are reduced to 6.65us. - - -![Simulated Llama3-70B Attention Layer with TK-GEMM and CUDA Graphs](/assets/images/accelerating-llama3/fig10.png){:style="width:100%;"} - -**Figure 8.** Simulated Llama3-70B Attention Layer with TK-GEMM and CUDA Graphs - -In practice, this optimization would result in a **6.4x** speedup of a single attention layer in Llama3-70B, over naively using TK-GEMM in a model without CUDA graphs. 
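For readers who want to apply the same capture-and-replay trick to their own kernels, the snippet below shows the generic PyTorch CUDA Graphs pattern being described here; `tk_gemm` is just a placeholder callable standing in for the compiled Triton kernel, and the shapes are illustrative.

```python
import torch

def tk_gemm(a, b):
    # Placeholder for the compiled Triton kernel launch.
    return a @ b

static_a = torch.randn(16, 8192, device="cuda")
static_b = torch.randn(8192, 8192, device="cuda")

# Warm up on a side stream so one-time lazy initialization is not captured.
s = torch.cuda.Stream()
s.wait_stream(torch.cuda.current_stream())
with torch.cuda.stream(s):
    for _ in range(3):
        tk_gemm(static_a, static_b)
torch.cuda.current_stream().wait_stream(s)

# Capture a single launch into a graph (one-time cost).
g = torch.cuda.CUDAGraph()
with torch.cuda.graph(g):
    static_out = tk_gemm(static_a, static_b)

# Replay: refresh the static input buffers, then launch the whole captured
# graph with one cheap CPU call instead of re-launching kernels one by one.
for _ in range(100):
    static_a.copy_(torch.randn_like(static_a))
    g.replay()
    # static_out now holds the result for this iteration.
```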
- -## 6.0 Potential Future Optimization Paths - -![TMA Hardware Unit](/assets/images/accelerating-llama3/fig11.png){:style="width:100%;"} - -**Figure 9.** TMA Hardware Unit -_Image Credit: [Nvidia](https://developer.nvidia.com/blog/nvidia-hopper-architecture-in-depth/)_ - -The Nvidia H100 features a TMA hardware unit. The dedicated TMA unit frees up registers and threads to do other work, as address generation is completely handled by the TMA. For memory bound problem sizes, this can provide even further gain when Triton enables support for this feature. - - -![Tensor Core Utilization (Arrows Indicate Degrees of Freedom)](/assets/images/accelerating-llama3/fig12.png){:style="width:100%;"} - -**Figure 10.** Tensor Core Utilization (Arrows Indicate Degrees of Freedom) - -To identify how well we are utilizing the Tensor Core, we can analyze the roofline chart. Notice that we are in the memory-bound region as expected for small M. To improve kernel latency we can either increase the arithmetic intensity, which with a fixed problem size can only be achieved through exploiting data locality and other loop [optimizations](https://www.codee.com/is-your-algorithm-running-at-peak-performance-the-roofline-model/) or increasing the memory throughput. This requires either a more optimal parallel algorithm specialized for the FP8 datatype as well as the type of problem size characteristics we expect to see in FP8 inference. - - -![DRAM Throughput Circled, 1.65TB/s vs Peak 3.35TB/s on H100 (M=16, N=8192, K=8192)](/assets/images/accelerating-llama3/fig13.png){:style="width:100%;"} - -**Figure 11.** DRAM Throughput Circled, 1.65TB/s vs Peak 3.35TB/s on H100 (M=16, N=8192, K=8192) - -Lastly, we can see that we are only achieving around **50%** of peak DRAM throughput on the NVIDIA H100. High performance GEMM kernels typically achieve around **70-80%** of peak throughput. This means that there is still a lot of room to improve and the techniques mentioned above (loop unrolling, optimized parallelization) are needed for additional gain. - -## 7.0 Future Work - -For future research, we would like to explore [CUTLASS](https://github.com/NVIDIA/cutlass/tree/main) 3.x and [CuTe](https://github.com/NVIDIA/cutlass/tree/main/media/docs/cute) to leverage more direct control over Hopper features especially in terms of obtaining direct TMA control and exploring pingpong architectures, which have shown promising results for FP8 GEMM. \ No newline at end of file diff --git a/_posts/2024-05-02-docathon-june-2024.md b/_posts/2024-05-02-docathon-june-2024.md deleted file mode 100644 index 43b00f9a290d..000000000000 --- a/_posts/2024-05-02-docathon-june-2024.md +++ /dev/null @@ -1,27 +0,0 @@ ---- -layout: blog_detail -title: "Announcing PyTorch Docathon June, 2024" ---- - -We are thrilled to announce the upcoming PyTorch Docathon in June! The Docathon, akin to a hackathon, is an event dedicated to enhancing the quality of the PyTorch documentation with the invaluable assistance of our community. Documentation is a vital component of any technology. By refining it, we can simplify the process for new users to get started with PyTorch, guide them in effectively utilizing its features, and ultimately expedite the transition from research to production in machine learning. See our previous events [here](https://pytorch.org/blog/announcing-docathon/) and [here](https://pytorch.org/blog/announcing-docathon-h2-2023/). 
- -## Why Participate - -The Docathon is an inclusive event designed to be accessible to newcomers, requiring only a basic understanding of Python, PyTorch, and Machine Learning, with some tasks not even requiring these skills. It offers a rewarding experience as participants can see the direct impact of their contributions on the project's usability and accessibility. The Docathon promotes a collaborative environment, allowing participants to work with other contributors and PyTorch maintainers, fostering the exchange of ideas and networking. It also provides a rich learning experience, offering the opportunity to explore PyTorch modules, update docstrings, and test tutorials. - - - -## Event Details - -**June 4**: Kick-off -**June 4 - 16**: Submissions and Feedback -**June 17 - 18**: Final Reviews -**June 20**: Winner Announcements - -Further details for the Docathon will be announced at the Kick-off call on June 4. - -[Please register to join this year’s event](https://community.linuxfoundation.org/events/details/lfhq-pytorch-foundation-presents-pytorch-docathon-june-4-20th-2024/). - - - - \ No newline at end of file diff --git a/_posts/2024-05-02-hitchhikers-guide-speculative-decoding.md b/_posts/2024-05-02-hitchhikers-guide-speculative-decoding.md deleted file mode 100644 index 79860bd1eab7..000000000000 --- a/_posts/2024-05-02-hitchhikers-guide-speculative-decoding.md +++ /dev/null @@ -1,104 +0,0 @@ ---- -layout: blog_detail -title: "A Hitchhiker’s Guide to Speculative Decoding" -author: Team PyTorch at IBM ---- - -Speculative decoding is an optimization technique for inference that makes educated guesses about future tokens while generating the current token, all within a single forward pass. It incorporates a verification mechanism to ensure the correctness of these speculated tokens, thereby guaranteeing that the overall output of speculative decoding is identical to that of vanilla decoding. Optimizing the cost of inference of large language models (LLMs) is arguably one of the most critical factors in reducing the cost of generative AI and increasing its adoption. Towards this goal, various inference optimization techniques are available, including custom kernels, dynamic batching of input requests, and quantization of large models. - -In this blog post, we provide a guide to speculative decoding and demonstrate how it can coexist with other optimizations. We are proud to open source the following, which includes the first speculator for Llama3 models: - - - -1. Speculator models for [Meta Llama3 8B](https://huggingface.co/ibm-fms/llama3-8b-accelerator), [IBM Granite 7B lab](https://huggingface.co/ibm/granite-7b-lab-accelerator), [Meta Llama2 13B](https://huggingface.co/ibm-fms/codellama-13b-accelerator), and [Meta Code Llama2 13B](https://huggingface.co/ibm-fms/codellama-13b-accelerator). -2. [The code for inference via IBM’s fork of HF TGI.](https://github.com/IBM/text-generation-inference/pull/79) -3. [The code for training your own speculators and corresponding recipes.](https://github.com/foundation-model-stack/fms-fsdp/pull/35) - -We have deployed these speculators in an internal production-grade environment with thousands of daily users and observed 2x speedup on language models - Llama3 8B, Llama2 13B, and IBM Granite 7B and 3x speedup on IBM’s Granite 20B code models. We provide a detailed explanation of our approach in this [technical report](https://arxiv.org/abs/2404.19124) and are planning in-depth analysis in an upcoming ArXiv paper. 
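Before getting into the serving-stack details, the minimal sketch below illustrates the speculate-then-verify idea in isolation. To stay short it departs from our production setup in several ways: it uses a separate small draft model instead of the attached speculator heads described later, it performs greedy decoding only, it recomputes the full prefix rather than maintaining a KV-cache, and it assumes Hugging Face-style causal LMs that return `.logits`. It is a conceptual illustration, not the TGIS implementation.

```python
import torch

@torch.no_grad()
def speculative_greedy_generate(target, draft, input_ids, k=3, max_new_tokens=64):
    """Draft k candidate tokens cheaply, verify them with a single forward pass
    of the target model, and keep only the prefix the target agrees with."""
    ids, start = input_ids, input_ids.shape[-1]
    while ids.shape[-1] - start < max_new_tokens:
        # 1) Speculate: the draft model proposes k tokens autoregressively.
        draft_ids = ids
        for _ in range(k):
            next_tok = draft(draft_ids).logits[:, -1].argmax(-1, keepdim=True)
            draft_ids = torch.cat([draft_ids, next_tok], dim=-1)
        candidates = draft_ids[:, ids.shape[-1]:]                     # (1, k)

        # 2) Verify: one target forward pass scores every candidate position.
        logits = target(draft_ids).logits[:, ids.shape[-1] - 1 : -1]  # (1, k, vocab)
        target_choice = logits.argmax(-1)                             # greedy tokens the target would emit

        # 3) Accept the longest agreeing prefix, plus the target's own token
        #    at the first disagreement, so output matches vanilla greedy decoding.
        n_match = int((target_choice == candidates).long().cumprod(-1).sum())
        accepted = torch.cat(
            [candidates[:, :n_match], target_choice[:, n_match : n_match + 1]], dim=-1
        )
        ids = torch.cat([ids, accepted], dim=-1)
    return ids[:, : start + max_new_tokens]
```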
- - -## Speculative decoding: Inference - -We run IBM TGIS in our internal production environment that has optimizations such as continuous batching, fused kernels, and quantization kernels. To enable speculative decoding in TGIS, we modified the paged attention kernel from [vLLM](https://github.com/vllm-project/vllm). In what follows, we will describe the key changes to the inference engine to enable speculative decoding. - -Speculative decoding is based on the premise that the model is powerful enough to predict multiple tokens in a single forward pass. However, the current inference servers are optimized to predict only a single token at a time. In our approach, we attach multiple speculative heads (in addition to the usual one) to the LLM to predict _N+1-, N+2-, N+3-th …_ token. For example, 3 heads will predict 3 additional tokens. Details of the speculator architecture are explained in a later part of this blog. There are two challenges to achieve _efficiency_ and _correctness_ during inference - one is to predict without replicating KV-cache and the other is to verify that the predictions match the original model’s outcomes. - -In a typical generation loop, after the prompt is processed in a single forward step, a sequence length of 1 (next token predicted) is fed into the forward pass of the model along with the kv-cache. In a naive speculative decoding implementation, each speculative head would have its own kv-cache, but instead we modify the paged attention kernel developed in the vLLM project to enable efficient kv-cache maintenance. This ensures that throughput does not reduce at larger batch sizes. Further, we modify the attention masks to enable verification of the _N+1’th_ token and thus enable speculative decoding without deviating from the original model’s output. The details of this implementation are captured [here](https://github.com/foundation-model-stack/fms-extras). - - -## Results - -We illustrate the speedup obtained with the Meta’s chat versions of Llama2 13B using a simple prompt. - - -![Visual illustration of the non-speculative generation (left) compared to speculative generation (right)](/assets/images/hitchhikers-guide-speculative-decoding/fig1.gif){:style="width:100%;"} - -_Figure 2: Visual illustration of the non-speculative generation (left) compared to speculative generation (right)_ - -We deployed the above solution in an internal production environment. The figure below reports two metrics – time to first token (TTFT) and inter-token latency (ITL) with different numbers of concurrent users (which is captured in the numbers on the graph lines). We observe that the speculative decoding version is nearly twice as fast for the Llama2 13B chat model and nearly thrice as fast for the Granite 20B code model compared to the non-speculative version for all batch sizes. We observe similar behavior for the smaller models - IBM’s Granite 7B and Meta Llama3 8B models. 
- -![Time to first token (TTFT - left) and Inter-token latency (ITL - right) for Llama 13B with number of concurrent users indicated on the graph](/assets/images/hitchhikers-guide-speculative-decoding/fig2.png){:style="width:100%;"} - -_Figure 3: Time to first token (TTFT - left) and Inter-token latency (ITL - right) for Llama 13B with number of concurrent users indicated on the graph_ - - -![Time to first token (TTFT - left) and Inter-token latency (ITL - right) for Granite 20B Code with number of concurrent users indicated on the graph](/assets/images/hitchhikers-guide-speculative-decoding/fig3.png){:style="width:100%;"} - - -_Figure 4: Time to first token (TTFT - left) and Inter-token latency (ITL - right) for Granite 20B Code with number of concurrent users indicated on the graph_ - - -### Note on efficiency - -We performed numerous experiments to determine the right configuration for speculator training. These are: - -1. **Speculator architecture**: The current approach allows for the number of heads to be modified, which maps to the number of tokens that we can look ahead. Increasing the number of heads also increases the amount of extra compute needed and complexity of training. In practice, for language models, we find 3-4 heads works well in practice, whereas we found that code models can reap benefits from 6-8 heads. -2. **Compute**: Increasing the number of heads results in increased compute in two dimensions, one is that of increased latency for a single forward pass as well as the compute needed for multiple tokens. If the speculator is not accurate with more heads, it will result in wasted compute increasing the latency and reducing the throughput. -3. **Memory**: The increased compute is offset by the roundtrips to HBM that need to be done for each forward pass. Note that if we get 3 tokens lookahead correct, we have saved three round trip times on HBM. - -We settled on 3-4 heads for the language models and 6-8 heads for the code models and across different model sizes ranging from 7B to 20B, we observed significant latency improvements without throughput loss compared to non-speculative decoding. We begin to observe throughput reduction beyond a batch size of 64, which happens rarely in practice. - - -## Speculative decoding: Training - -There are two broad approaches for speculative decoding, one is to leverage a smaller model (e.g., Llama 7B as a speculator for Llama 70B) and the other is to attach speculator heads (and train them). In our experiments, we find the approach of attaching speculator heads to be more effective both in model quality and latency gains. - - -### Speculator architecture - -[Medusa](https://arxiv.org/abs/2401.10774) made speculative decoding popular; their approach is to add a head to the existing model which is then trained to do speculation. We modify the Medusa architecture by making the “heads” hierarchical, where each head stage predicts a single token and then feeds it to the next head stage. These multi-stage heads are depicted in the below figure. We are exploring ways of minimizing the embeddings table by sharing these across the multiple stages and base model. - - -![A simple architecture diagram for a 3-headed multi-stage speculator. Z is the state from the base model.](/assets/images/hitchhikers-guide-speculative-decoding/fig4.png){:style="width:100%;display:block;max-width:300px;margin-left:auto;margin-right:auto;"} - - -_Figure 4: A simple architecture diagram for a 3-headed multi-stage speculator. 
Z is the state from the base model._ - - -#### Speculator training - -We have a two-phase approach to training a speculator for efficiency reasons. In the first phase, we train on small batches with long sequence lengths (4k tokens) and use the standard causal LM approach for training. In phase 2, we use large batches with short sequence lengths (256 tokens) generated from the base model. In this training phase, we tune the heads to match the output of the base model. Through numerous experiments, we find that a 5:2 ratio of steps for phase 1 vs phase 2 works well. We depict the progress of these phases in the below figure. We use PyTorch FSDP and [IBM FMS](https://github.com/foundation-model-stack/fms-fsdp) for the training of speculators. - - -![Per-head training loss curves for Llama2-13B speculator training, phase 1 and 2](/assets/images/hitchhikers-guide-speculative-decoding/fig5.jpg){:style="width:100%;"} - -_Figure 5: Per-head training loss curves for Llama2-13B speculator training, phase 1 and 2_ - - -## Conclusion and Future Work - -Through this blog, we are releasing a new approach for speculative decoding and the following assets: - -1. Models for improving the inter-token latencies for a range of models - Llama3 8B, Llama2 13B, Granite 7B, and CodeLlama 13B -2. Production quality code for inference -3. Recipes for training speculators - -We are working on training speculators for Llama3 70B and Mistral models and invite the community to contribute as well as help improve on our framework. We would also love to work with major open source serving frameworks such as [vLLM](https://github.com/vllm-project/vllm) and [TGI](https://github.com/huggingface/text-generation-inference) to contribute back our speculative decoding approach to benefit the community. - - -#### Acknowledgements - -There are several teams that helped us get to these latency improvements for inference. We would like to thank the vLLM team for creating the paged attention kernel in a clean and reusable manner. We extend our gratitude to the Team PyTorch at Meta that helped provide feedback on this blog as well as continued efforts on optimal usage of PyTorch. Special thanks to our internal production teams at IBM Research who took this prototype to production and hardened it. A shout out to Stas Bekman for providing insightful comments on the blog resulting in an improved explanation of the tradeoffs between compute, memory, and speculator effectiveness. - -The paged attention kernel was integrated into IBM FMS by Josh Rosenkranz and Antoni Viros i Martin. The speculator architecture and training was done by Davis Wertheimer, Pavithra Ranganathan, and Sahil Suneja. The integration of the modeling code with the inference server was done by Thomas Parnell, Nick Hill, and Prashant Gupta. - \ No newline at end of file diff --git a/_posts/2024-05-11-enhancing-deep-learning.md b/_posts/2024-05-11-enhancing-deep-learning.md deleted file mode 100644 index 456ba8b9e658..000000000000 --- a/_posts/2024-05-11-enhancing-deep-learning.md +++ /dev/null @@ -1,106 +0,0 @@ ---- -layout: blog_detail -title: "Enhancing Deep Learning Workflows: PyTorch Ecosystem Tools" -hidden: true ---- - -Welcome to the thriving PyTorch ecosystem, where a wealth of tools and libraries await, purpose-built to elevate your experience in deep learning as a developer or researcher. The Ecosystem Tools pages host many projects from experts spanning academia, industry, application development, and machine learning. 
- -Initially, PyTorch aimed to establish a thriving community, enabling developers to access each other's tools, engage in meaningful discussions, and explore the wealth of resources available within the community. - -Today, the PyTorch ecosystem has grown to feature over 100 projects tailored to your needs, providing robust support, enhanced speed, and effortless integration with PyTorch. If your project aligns with our mission, we invite you to [submit](https://github.com/pytorch-fdn/ecosystem) it and join this dynamic ecosystem. - -New this month, we’ve moved all of our Ecosystem blogs over to our PyTorch.org website to host a space where our community can show off the latest innovations with our users. Read on to hear about the latest projects in the ecosystem! - -## Explore the Latest Tools and Frameworks in the Ecosystem - -As we continue into 2024, we're thrilled to showcase an impressive array of ecosystem tools that significantly enrich the PyTorch community. These tools cover a wide range of domains, including pose estimation, profiling, and even quantum computing. Let's explore each one to witness firsthand how they are reshaping the PyTorch landscape, opening up exciting possibilities for developers. - - -### [Anomalib](https://github.com/openvinotoolkit/anomalib) - - -Anomalib is a deep learning library that aims to collect state-of-the-art anomaly detection algorithms for benchmarking on both public and private datasets. Anomalib provides several ready-to-use implementations of anomaly detection algorithms described in the recent literature, as well as a set of tools that facilitate the development and implementation of custom models. The library has a strong focus on image-based anomaly detection, where the goal of the algorithm is to identify anomalous images, or anomalous pixel regions within images in a dataset. Anomalib is constantly updated with the latest algorithms and training/inference extensions. - -### [Diffusers](https://huggingface.co/docs/diffusers) - -Diffusers is the go-to library for state-of-the-art pretrained diffusion models for generating images, audio, and even 3D structures of molecules. Whether you're looking for a simple inference solution or training your own diffusion models, Diffusers is a modular toolbox that supports both. - -### [Pomegranate](https://pomegranate.readthedocs.io/en/latest/) - -Pomegranate is a versatile machine learning library that integrates seamlessly with PyTorch. It provides a wide range of probabilistic models and tools for probabilistic modeling tasks. Pomegranate empowers users to build complex models such as hidden Markov models (HMMs), Bayesian networks, and Gaussian mixture models (GMMs). By combining the strengths of PyTorch and Pomegranate, developers can leverage the power of deep learning and probabilistic modeling to tackle various machine learning challenges. - - -### [PyPose](https://pypose.org/) - -PyPose is a PyTorch-based library designed for pose estimation tasks. With PyPose, developers can efficiently train and deploy models for human pose estimation, a fundamental computer vision problem. By leveraging PyTorch's flexibility and performance, PyPose simplifies the process of building accurate pose estimation models. Its intuitive APIs and pre-trained models make it an excellent choice for researchers and developers exploring human pose estimation applications. 
- - -### [PyPOTS](https://github.com/WenjieDu/PyPOTS) - -A python toolbox/library for data mining on partially-observed time series with PyTorch, including SOTA models supporting tasks of imputation, classification, clustering, and forecasting on incomplete (irregularly-sampled) multivariate time series with missing values. - -### [OctoML Profiler](https://github.com/octoml/octoml-profile) - -OctoML Profiler is a performance profiling tool that aids in optimizing PyTorch models. This tool helps developers identify performance bottlenecks and inefficiencies within their deep learning models. By providing insights into memory usage, compute time, and data movement, the OctoML Profiler enables developers to fine-tune their models for improved efficiency. With this valuable feedback, developers can optimize their models for deployment on various hardware platforms. - -### [Open Compass](https://github.com/open-compass/opencompass) - -OpenCompass is a one-stop platform for large model evaluation, aiming to provide a fair, open, and reproducible benchmark for large model evaluation. Its main features include: Comprehensive support for models and datasets, efficient distributed evaluation, diversified evaluation paradigms, modular design with high extensibility and experiment management and reporting mechanism. - -### [Renate](https://renate.readthedocs.io/en/latest/) - -Renate is a PyTorch-based library for neural architecture search (NAS). It simplifies the process of automatically searching for optimal neural network architectures tailored to specific tasks. Renate leverages techniques like reinforcement learning and evolutionary algorithms to efficiently explore the architecture space. By using Renate, developers can save significant time and resources while discovering highly performant models. - - -### [RoMa](https://github.com/naver/roma) - - -RoMa is a standalone library to handle rotation representations with PyTorch (rotation matrices, quaternions, rotation vectors, etc). It aims for robustness, ease-of-use, and efficiency. - - -### [Substra](https://github.com/Substra) - -Substra is an open source federated learning (FL) software. It enables the training and validation of machine learning models on distributed datasets. It provides a flexible Python interface and a web application to run federated learning training at scale. Substra's main usage is in production environments. It has already been deployed and used by hospitals and biotech companies. Substra can also be used on a single machine to perform FL simulations and debug code. - -### [TorchQuantum](https://hanruiwanghw.wixsite.com/torchquantum) - -TorchQuantum is a powerful library that combines the PyTorch framework with quantum computing concepts. It enables developers to explore quantum machine learning algorithms and build hybrid classical-quantum models. By integrating the principles of quantum computing into PyTorch, TorchQuantum opens up new possibilities for solving complex problems that traditional deep learning approaches may struggle with. - -### [TIAToolbox](https://github.com/TissueImageAnalytics/tiatoolbox) - -The TIAToolbox (Text-Image-Augmentation Toolbox) is a PyTorch library designed to augment text and image data for deep learning tasks. It offers a comprehensive set of tools for data augmentation, including transformations, noise injection, and image/text synthesis. 
By applying TIAToolbox, developers can enrich their training datasets, improve model generalization, and enhance the robustness of their deep learning models. - -### [torchdistill](https://github.com/yoshitomo-matsubara/torchdistill) - -torchdistill is a coding-free framework built on PyTorch for reproducible deep learning and knowledge distillation studies. The framework is designed to enable users to design experiments by declarative PyYAML configuration files and supports high-level module abstractions. - -### [TorchOpt](https://torchopt.readthedocs.io/en/latest/#) - -TorchOpt is a PyTorch library focused on optimization algorithms for deep learning. It provides a collection of state-of-the-art optimization techniques, such as stochastic gradient descent (SGD) variants, adaptive learning rate methods, and optimization schedules. TorchOpt empowers developers to fine-tune their models efficiently, converge faster, and achieve better performance in various deep learning tasks. - -### [USB](https://usb.readthedocs.io/) - -USB, or Unified Speech-to-Text Benchmark, is a PyTorch-based toolkit for training and evaluating speech recognition models. It provides standardized datasets and evaluation metrics to facilitate fair and accurate comparisons between different speech recognition architectures. By using USB, researchers and developers can benchmark their models against state-of-the-art systems and drive advancements in the field of automatic speech recognition. - -### [Zeus](https://github.com/ml-energy/zeus) - -Zeus is the current state-of-the-art in deep learning energy measurement and optimization. It has monitor components that allow users to measure GPU energy consumption and optimizer components that automatically optimize DNN or GPU knobs based on measurements from the monitor component. - - -## Be Part of Our Ecosystem - -Our diverse ecosystem tools are instrumental in PyTorch's success.. They provide essential support for tasks such as pose estimation, probabilistic modeling, performance profiling, model interpretability, speech recognition, quantum computing, data augmentation, optimization, and neural architecture search. - -Leveraging these tools empowers developers and researchers to accelerate their deep learning workflows and unlock new possibilities in the field of AI. - -Have a tool that would be a good fit for the [PyTorch Ecosystem](https://pytorch.org/ecosystem/)? If you can answer the below questions, we’d love for you to [submit your tool for review](https://github.com/pytorch-fdn/ecosystem). - - - -1. Does your project complement PyTorch, enhancing user experience, introducing new capabilities, or accelerating training and inference processes? - * Examples could include visualization tools, a kernel library or a framework that sits on top to enable research in a particular area such as NLP. -2. Is the project ready for broad developer usage? - * For example, is the project stable, will it be maintained, and is there adequate supporting infrastructure, documentation, and technical support to allow a developer to successfully use it? - -Thank you to all of our contributors and collaborators in our ecosystem! Here’s to a great 2024. 
diff --git a/_posts/2024-05-11-introducing-depyf.md b/_posts/2024-05-11-introducing-depyf.md deleted file mode 100644 index 13ade59debf5..000000000000 --- a/_posts/2024-05-11-introducing-depyf.md +++ /dev/null @@ -1,217 +0,0 @@ ---- -layout: blog_detail -title: "Introducing depyf: mastering torch.compile with ease" -hidden: true -author: Kaichao You ---- - -![depyf logo](/assets/images/depyf.png){:style="width:100%;display: block; max-width: 400px; margin-right: auto; margin-left: auto"} - - -We are thrilled to introduce `depyf`, a new project to the PyTorch ecosystem designed to help users understand, learn, and adapt to `torch.compile`! - - -## Motivation - -`torch.compile` is a cornerstone of PyTorch 2.x, offering a straightforward path to accelerate machine learning workflows with just a single line of code for both training and inference. The mere inclusion of `@torch.compile` can[ dramatically enhance the performance of your code](https://pytorch.org/get-started/pytorch-2.0/). However, identifying the optimal insertion point for `torch.compile` is not easy, not to mention the complexity of adjusting various knobs for maximum efficiency. - -The intricacies of the `torch.compile` stack, encompassing Dynamo, AOTAutograd, Inductor, and more, present a **steep learning curve**. These components, essential for deep learning performance optimization, can be daunting without a solid foundation in the subject. - - -_Note: For an introductory example of how torch.compile works, please refer to this[ walk-through explanation](https://depyf.readthedocs.io/en/latest/walk_through.html)._ - - -## A common tool: `TORCH_COMPILE_DEBUG` - -To demystify `torch.compile`, the common approach involves leveraging the `TORCH_COMPILE_DEBUG` environment variable. While it provides more information, deciphering the output remains a formidable task. - -For example, when we have the following code: - - -``` -# test.py -import torch -from torch import _dynamo as torchdynamo -from typing import List - -@torch.compile -def toy_example(a, b): - x = a / (torch.abs(a) + 1) - if b.sum() < 0: - b = b * -1 - return x * b - -def main(): - for _ in range(100): - toy_example(torch.randn(10), torch.randn(10)) - -if __name__ == "__main__": - main() -``` - - -And run it with `TORCH_COMPILE_DEBUG=1 python test.py` , we will get a directory named `torch_compile_debug/run_2024_02_05_23_02_45_552124-pid_9520` , under which there are these files: - - -``` -. -├── torchdynamo -│ └── debug.log -└── torchinductor - ├── aot_model___0_debug.log - ├── aot_model___10_debug.log - ├── aot_model___11_debug.log - ├── model__4_inference_10.1 - │ ├── fx_graph_readable.py - │ ├── fx_graph_runnable.py - │ ├── fx_graph_transformed.py - │ ├── ir_post_fusion.txt - │ ├── ir_pre_fusion.txt - │ └── output_code.py - ├── model__5_inference_11.2 - │ ├── fx_graph_readable.py - │ ├── fx_graph_runnable.py - │ ├── fx_graph_transformed.py - │ ├── ir_post_fusion.txt - │ ├── ir_pre_fusion.txt - │ └── output_code.py - └── model___9.0 - ├── fx_graph_readable.py - ├── fx_graph_runnable.py - ├── fx_graph_transformed.py - ├── ir_post_fusion.txt - ├── ir_pre_fusion.txt - └── output_code.py -``` - - -The generated files and logs often raise more questions than they answer, leaving developers puzzled over the meaning and relationships within the data. Common puzzles for `TORCH_COMPILE_DEBUG` include: - - - -* What does `model__4_inference_10.1` mean? -* I have one function but three `model__xxx.py` in the directory, what is their correspondence? 
-* What are those `LOAD_GLOBAL` stuff in `debug.log` ? - - -## A better tool: `depyf` comes to rescue - -Let’s see how `depyf` can help developers to resolve the above challenges. To use `depyf` , simply execute `pip install depyf` or follow the project page[ https://github.com/thuml/depyf](https://github.com/thuml/depyf) to install the latest version, and then surround the main code within `with depyf.prepare_debug` . - - -``` -# test.py -import torch -from torch import _dynamo as torchdynamo -from typing import List - -@torch.compile -def toy_example(a, b): - x = a / (torch.abs(a) + 1) - if b.sum() < 0: - b = b * -1 - return x * b - -def main(): - for _ in range(100): - toy_example(torch.randn(10), torch.randn(10)) - -if __name__ == "__main__": - import depyf - with depyf.prepare_debug("depyf_debug_dir"): - main() -``` - - -After executing `python test.py` , `depyf` will produce a directory named `depyf_debug_dir` (the argument of the `prepare_debug` function). Under the directory, there would be these files: - - -``` -. -├── __compiled_fn_0 AFTER POST GRAD 0.py -├── __compiled_fn_0 Captured Graph 0.py -├── __compiled_fn_0 Forward graph 0.py -├── __compiled_fn_0 kernel 0.py -├── __compiled_fn_3 AFTER POST GRAD 0.py -├── __compiled_fn_3 Captured Graph 0.py -├── __compiled_fn_3 Forward graph 0.py -├── __compiled_fn_3 kernel 0.py -├── __compiled_fn_4 AFTER POST GRAD 0.py -├── __compiled_fn_4 Captured Graph 0.py -├── __compiled_fn_4 Forward graph 0.py -├── __compiled_fn_4 kernel 0.py -├── __transformed_code_0_for_torch_dynamo_resume_in_toy_example_at_8.py -├── __transformed_code_0_for_toy_example.py -├── __transformed_code_1_for_torch_dynamo_resume_in_toy_example_at_8.py -└── full_code_for_toy_example_0.py -``` - - -And there are two obvious benefits: - - - -1. The long and difficult-to-understand `torchdynamo/debug.log` is gone. Its content is cleaned up and shown as human-readable source code, in `full_code_for_xxx.py` and `__transformed_code_{n}_for_xxx.py` . It is worth to note, that the most tedious and difficult job of `depyf` is to decompile the bytecode inside `torchdynamo/debug.log` into Python source code, freeing developers from intimidating internals of Python. -2. The correspondence between function names and computation graphs are respected. For example, in `__transformed_code_0_for_toy_example.py` , we can see a function named `__compiled_fn_0` , and we will immediately know its corresponding computation graphs are in `__compiled_fn_0_xxx.py` , because they share the same `__compiled_fn_0` prefix name. - -Starting with full_code_for_xxx.py , and following the functions involved, users will have a clear view of what torch.compile does to their code. - - -## One more thing: step-through debuggability - -Stepping through code line by line using debuggers is a great way to understand how code works. However, under `TORCH_COMPILE_DEBUG` , those files are only for users’ information, and cannot be executed with the data users concern. - - -_Note: By “debug”, we mean the process of inspecting and improving a program, rather than correcting buggy code._ - -A standout feature of depyf is its capability to facilitate step-through debugging for torch.compile: all of the files it generates are linked with runtime code objects inside Python interpreter, and we can set breakpoints in these files. 
The usage is simple, just add one context manager with depyf.debug() , and it should do the trick: - - -``` -# test.py -import torch -from torch import _dynamo as torchdynamo -from typing import List - -@torch.compile -def toy_example(a, b): - x = a / (torch.abs(a) + 1) - if b.sum() < 0: - b = b * -1 - return x * b - -def main(): - for _ in range(100): - toy_example(torch.randn(10), torch.randn(10)) - -if __name__ == "__main__": - import depyf - with depyf.prepare_debug("depyf_debug_dir"): - main() - with depyf.debug(): - main() -``` - - -Just one caveat: the workflow of debugging `torch.compile` deviates from standard debugging workflow. With `torch.compile`, many codes are **dynamically** generated. Therefore, we need to: - - - -1. launch the program -2. when the program exits `with depyf.prepare_debug("depyf_debug_dir")` , code will be available in `depyf_debug_dir`. -3. when the program enters `with depyf.debug()` , it will automatically set a breakpoint internally, so that the program is paused. -4. navigate to `depyf_debug_dir` to set breakpoints. -5. continue to run the code, and debuggers will hit these breakpoints! - - -![depyf screenshot](/assets/images/depyf-screenshot.png){:style="width:100%;"} - - -Here is a screenshot of what it looks like. All code and tensor variables are live, and we can inspect any variable, and step through the code, as in our daily debugging workflow now! The only difference is that we are debugging `torch.compile` generated code rather than human-written code. - - -## Conclusion - -`torch.compile` serves as an invaluable tool for accelerating PyTorch code effortlessly. For those looking to delve deeper into `torch.compile`, whether to leverage its full potential or to integrate custom operations, the learning curve can be very steep though. `depyf` is designed to lower this barrier, offering a user-friendly experience to understand, learn, and adapt to `torch.compile`. - -Do explore `depyf` and experience its benefits firsthand! The project is open-source and readily available at[ https://github.com/thuml/depyf](https://github.com/thuml/depyf). Installation is straightforward via `pip install depyf`. We hope `depyf` can enhance everyone’s development workflow with `torch.compile`. \ No newline at end of file diff --git a/_posts/2024-05-11-zeus.md b/_posts/2024-05-11-zeus.md deleted file mode 100644 index e861653a8d3b..000000000000 --- a/_posts/2024-05-11-zeus.md +++ /dev/null @@ -1,182 +0,0 @@ ---- -layout: blog_detail -title: "Deep Learning Energy Measurement and Optimization" -hidden: true -author: Jae-Won Chung ---- - -![Zeus logo](/assets/images/zeus/fig1.png){:style="width:100%;display: block; max-width: 400px; margin-right: auto; margin-left: auto"} - -_This post is authored by [Jae-Won Chung](https://jaewonchung.me/about), a PhD student at the University of Michigan and the lead of the [ML.ENERGY Initiative](https://ml.energy)._ - -Deep learning consumes quite a bit of energy. For instance, training a single 200B LLM on AWS p4d instances consumed around 11.9 GWh (source: [CIDR 2024 keynote](https://mvdirona.com/jrh/talksandpapers/JamesHamiltonCIDR2024.pdf)), which is an amount that can single-handedly power more than a thousand [average US households](https://www.eia.gov/tools/faqs/faq.php?id=97&t=3) for a year. - -[Zeus](https://github.com/ml-energy/zeus) is an open-source toolbox for measuring and optimizing the energy consumption of deep learning workloads. 
Our goal is to make energy optimization based on accurate measurements as easy as possible for diverse deep learning workloads and setups by offering composable tools with minimal assumptions. - -Zeus largely provides two types of tools: - - - -1. Programmatic and command line GPU energy **measurement** tools -2. Several energy **optimization** tools that find the best ML and/or GPU configurations - -Zeus can benefit those who would like to - - - -* measure and optimize their electricity cost -* reduce heat dissipation from their GPUs (by lowering power draw) -* report energy usage from research and development -* reduce carbon footprint from electricity usage - - -## Part 1: Measuring Energy - -Just like performance optimization, accurate measurement is the basis of effective energy optimization. Popular proxies for estimating power consumption like the maximum power draw of the hardware [can sometimes be vastly off](https://ml.energy/blog/energy/measurement/measuring-gpu-energy-best-practices/) compared to actual measurement. - -To make energy measurement as easy and transparent as possible, the core utility Zeus offers is the `ZeusMonitor` class. Let’s take a look at the actual snippet: - -```python -from zeus.monitor import ZeusMonitor - -# All four GPUs are measured simultaneously. -monitor = ZeusMonitor(gpu_indices=[0,1,2,3]) - -# Measure total time and energy within the window. -monitor.begin_window("training") -for e in range(100): - - # Measurement windows can arbitrarily be overlapped. - monitor.begin_window("epoch") - for x, y in train_dataloader: - y_hat = model(x) - loss = criterion(y, y_hat) - loss.backward() - optim.step() - measurement = monitor.end_window("epoch") - print(f"Epoch {e}: {measurement.time} s, {measurement.total_energy} J") - -measurement = monitor.end_window("training") -print(f"Entire training: {measurement.time} s, {measurement.total_energy} J") -``` - -What you see above is a typical PyTorch training loop which uses four GPUs for data parallel training. Inside, we created an instance of `ZeusMonitor` and passed in a list of GPU indices to monitor. Then, using the monitor, we can measure the time and energy consumption of arbitrary execution _windows_ within the training script by pairing calls to `begin_window` and `end_window`. Multiple windows can overlap and nest in arbitrary ways without affecting the measurement of each, as long as their names are different. - -`ZeusMonitor` adds very little overhead – typically single digit milliseconds – around the window. This allows `ZeusMonitor` to be used in various applications. For instance: - - - -* [The ML.ENERGY Leaderboard](https://ml.energy/leaderboard): The first open-source benchmark on how much energy LLM text generation consumes. -* [The ML.ENERGY Colosseum](https://ml.energy/leaderboard): An online service that lets users compare LLM responses side-by-side based on response quality _and_ energy consumption. - -See our [blog post](https://ml.energy/blog/energy/measurement/measuring-gpu-energy-best-practices/) for a deeper technical dive into accurate GPU energy measurement. - - -## Part 2: Optimizing Energy - -Let me introduce you to two of the energy optimizers provided by Zeus. - - -### GlobalPowerLimitOptimizer - - - -GPUs allow users to configure its maximum power draw, called _power limit_. Typically, as you lower the GPU’s power limit from the default maximum, computation may get slightly slower, but you’ll save disproportionately more energy. 
The `GlobalPowerLimitOptimizer` in Zeus automatically finds the optimal GPU power limit globally across all GPUs. - -```python -from zeus.monitor import ZeusMonitor -from zeus.optimizer.power_limit import GlobalPowerLimitOptimizer - -# The optimizer measures time and energy through the ZeusMonitor. -monitor = ZeusMonitor(gpu_indices=[0,1,2,3]) -plo = GlobalPowerLimitOptimizer(monitor) - -for e in range(100): - plo.on_epoch_begin() - for x, y in train_dataloader: - plo.on_step_begin() - - y_hat = model(x) - loss = criterion(y, y_hat) - loss.backward() - optim.step() - - plo.on_step_end() - plo.on_epoch_end() -``` - -In our familiar PyTorch training loop, we have instantiated `GlobalPowerLimitOptimizer` and passed it an instance of the `ZeusMonitor`, through which the optimizer sees the GPUs. Then, we just need to let the optimizer know about training progress (step and epoch boundaries), and the optimizer will transparently do all the necessary profiling and converge to the optimal power limit. - -If you’re using the HuggingFace [Trainer](https://huggingface.co/docs/transformers/main_classes/trainer) or [SFTTrainer](https://huggingface.co/docs/trl/main/en/sft_trainer), integration is even easier: - -```python -from zeus.monitor import ZeusMonitor -from zeus.optimizer.power_limit import HFGlobalPowerLimitOptimizer - -# ZeusMonitor actually auto-detects CUDA_VISIBLE_DEVICES. -monitor = ZeusMonitor() -pl_optimizer = HFGlobalPowerLimitOptimizer(monitor) - -# Pass in the optimizer as a Trainer callback. Also works for SFTTrainer. -trainer = Trainer( - model=model, - train_dataset=train_dataset, - ..., - callbacks=[pl_optimizer], -) -``` - -The `HFGlobalPowerLimitOptimizer` wraps `GlobalPowerLimitOptimizer` so that it automatically detects step and epoch boundaries. We have example integrations [here](https://github.com/ml-energy/zeus/tree/master/examples/huggingface), including running Gemma 7B supervised fine-tuning with QLoRA. - -Now, we know how to integrate the optimizer, but what is the _optimal_ power limit? We know different users can have different preferences regarding trading off time and energy, so we allow users to specify an `OptimumSelector` (basically the [Strategy Pattern](https://en.wikipedia.org/wiki/Strategy_pattern)) to express their needs. - -```python -# Built-in strategies for selecting the optimal power limit. -from zeus.optimizer.power_limit import ( - GlobalPowerLimitOptimizer, - Time, - Energy, - MaxSlowdownConstraint, -) - -# Minimize energy while tolerating at most 10% slowdown. -plo = GlobalPowerLimitOptimizer( - monitor, - MaxSlowdownConstraint(factor=1.1), -) - -``` - -Some of the built-in strategies include “Minimize time” ([Time](https://ml.energy/zeus/reference/optimizer/power_limit/#zeus.optimizer.power_limit.Time), this might still reduce the power limit from the default since some workloads exhibit almost no slowdown even on lower power limits), “Minimize energy” ([Energy](https://ml.energy/zeus/reference/optimizer/power_limit/#zeus.optimizer.power_limit.Energy)), “Somewhere in between” ([ZeusCost](https://ml.energy/zeus/reference/optimizer/power_limit/#zeus.optimizer.power_limit.ZeusCost)), and “Minimize energy given maximum slowdown” ([MaxSlowdownConstraint](https://ml.energy/zeus/reference/optimizer/power_limit/#zeus.optimizer.power_limit.MaxSlowdownConstraint)). Users can also create their own optimum selectors as needed. 
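For needs the built-in strategies don't cover, a custom selector can be sketched roughly as below. This is only an illustration: the `select()` hook and the `power_limit`/`time`/`energy` attribute names are assumptions about the interface, so please check the Zeus documentation for the exact `OptimumSelector` API before relying on it.

```python
from zeus.optimizer.power_limit import GlobalPowerLimitOptimizer, OptimumSelector

# Hypothetical strategy: among power limits whose measured energy is within 5%
# of the minimum, pick the fastest one. Method and attribute names below are
# illustrative assumptions, not the verified Zeus interface.
class NearMinEnergyFastest(OptimumSelector):
    def select(self, measurements):
        min_energy = min(m.energy for m in measurements)
        near_optimal = [m for m in measurements if m.energy <= 1.05 * min_energy]
        return min(near_optimal, key=lambda m: m.time).power_limit

# `monitor` is the ZeusMonitor instance from the snippets above.
plo = GlobalPowerLimitOptimizer(monitor, NearMinEnergyFastest())
```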
- - -### PipelineFrequencyOptimizer - -The pipeline frequency optimizer, based on our research paper [Perseus](https://ml.energy/zeus/research_overview/perseus), is our latest work on energy optimization for large model training, like GPT-3. Perseus can reduce the energy consumption of large model training with no or negligible training throughput degradation. We’ll briefly talk about how. - -![one iteration of training with four stage pipeline parallelism](/assets/images/zeus/fig2.png){:style="width:100%;"} - - -The above is a visualization of one iteration of training with four stage _pipeline parallelism_ running with the 1F1B schedule. Each box is either a forward or a backward computation, and is colored with its power consumption. - -The key observation here is that when models are partitioned into pipeline stages, it’s very difficult to slice them in perfectly equal sizes. This leads to forward/backward boxes of varying widths and therefore computation _idle time_ between boxes. You would notice that those smaller boxes can run slightly slower than wider boxes and the overall critical path (blue line) will not change at all. - -![one iteration of training with four stage pipeline parallelism](/assets/images/zeus/fig3.png){:style="width:100%;"} - -That’s what Perseus automatically does. Based on profiling, it identifies computation boxes that are not on the critical path and figures out the precise amount of slowdown for each box that minimizes energy consumption. When done correctly, computations we slowed down will consume less power & energy, but the overall iteration time of the pipeline does not change. - -See [our guide](https://ml.energy/zeus/optimize/pipeline_frequency_optimizer/) to get started with Perseus! - - -## Final Words - -For users who run their own on-premise compute, energy consumption and the resulting electricity bill is not something that can be easily overlooked. On a larger scale, energy consumption is not just about electricity bills, but also about data center power delivery. With thousands of GPUs running in clusters, finding stable, affordable, and sustainable electricity sources to power data centers is becoming [increasingly challenging](https://www.cbre.com/insights/reports/north-america-data-center-trends-h1-2023). Finding ways to reduce energy disproportionately more than slowdown leads to lower average power consumption, which can help with the power delivery challenge. - -With Zeus, we hope to take the first step towards deep learning energy measurement and optimization. - -Wondering where to go from here? 
Here are a couple helpful links: - -* [Zeus homepage/documentation](https://ml.energy/zeus) -* [Zeus GitHub repository](https://github.com/ml-energy/zeus) -* [Zeus usage and integration examples](https://github.com/ml-energy/zeus/tree/master/examples) -* [ML.ENERGY Initiative](https://ml.energy) (i.e., the people building Zeus) \ No newline at end of file diff --git a/_posts/2024-05-14-speeding-up-vits.md b/_posts/2024-05-14-speeding-up-vits.md deleted file mode 100644 index 54cc16100610..000000000000 --- a/_posts/2024-05-14-speeding-up-vits.md +++ /dev/null @@ -1,133 +0,0 @@ ---- -layout: blog_detail -title: "Speeding up ViTs using Block Sparsity" -author: "FAIR at Meta: Mostafa Elhoushi, Sensors and Systems at Meta Reality Labs Research: Syed Shakib Sarwar, Aaryan Kothapalli, Mia Kasperek, Barbara De Salvo, PyTorch at Meta: Christian Puhrsch, Jesse Cai, Joe Isaacson, Quantsight: Andrew James, Pearu Peterson, Nikita Vedeneev" ---- - -**TLDR:** We show promising results of up to a **1.46x speedup with <2% drop in accuracy** on float32 Vision Transformers on A100 GPUs by applying block sparsity on MLP module’s weights. This approach can potentially be applied to other types of transformers including large language models. Our implementation and benchmarks to reproduce our results are available at [https://github.com/pytorch-labs/superblock](https://github.com/pytorch-labs/superblock). - - -## Introduction - -PyTorch has landed a lot of improvements to CUDA kernels that implement block sparse matrix multiplications. Recent updates to Pytorch can lead up to [4.8x speedup](https://gist.github.com/cpuhrsch/7fec60079cbe2daeff59c0577f933320) on large matrix multiplication shapes with high sparsity levels over dense baselines. - -In this blog, we show the promising results of applying block sparsity on weights of linear layers of MLP (multi-layer perceptron) layers in vision transformers (ViTs) and show end-to-end model speedups on A100 Nvidia GPUs. - -As a recap, block sparsity sparsifies weights in tiles of blocks of predetermined size, rather than sparsifying individual elements. This particular sparsity pattern is interesting because it is amenable to GPU acceleration via fast sparse kernels. For more information about the differences between different sparsity patterns, or about sparsity as a whole, please check out [torchao](https://github.com/pytorch/ao/tree/main/torchao/sparsity). - - -![Illustrations of different types of sparsity.](/assets/images/speeding-up-vits/fig1.png){:style="width:100%;"} - - -_Illustrations of different types of sparsity._ - - -## Approach - -Our approach can be broken down into two distinct steps: - - - -1. Training the model from scratch using block sparse masks subnets. -2. Folding these masks into our weights to accelerate them for inference. - -We explain our training and inference steps below - - -### Training - -Starting with an uninitialized Vision Transformer, we apply random trainable masks with a specified block size and sparsity level on the weights of output projection linear layer of attention blocks, the weights of the two linear layers inside the MLP, a.k.a., FFN (feed forward networks), as well as the final linear classification layer. The forward pass during training follows the [supermask approach](https://arxiv.org/abs/2207.00670), as each mask is converted to binary map using a tuned threshold based on sparsity requirements, e.g., if we want 80% sparsity, we will have the threshold automatically tuned to keep top 20% weights. 
The masks are of a square <block size>x<block size> elements, where <block size> is a hyperparameter. The priority of the weights is dependent on the mask value or score which is trained. We [multiply the binary masks of each layer with the weights](https://github.com/pytorch-labs/superblock/blob/7a469210c7bcb846dd8b6bfa848d104312312126/supermask.py#L130) to sparsify the model. - -![Illustration of the Supermask sparsification approach](/assets/images/speeding-up-vits/fig2.png){:style="width:100%;"} - -_Illustration of the [Supermask](https://arxiv.org/abs/2207.00670) sparsification approach._ - - -### Inference - -After training, the [dense weights can be turned to sparse weights by multiplying with the mask](https://github.com/pytorch-labs/superblock/blob/7a469210c7bcb846dd8b6bfa848d104312312126/supermask.py#L122-L125) and stored for inference. At this stage, although the weights have a high percentage of zero values, they are still stored in dense format. We use PyTorch's [to_sparse_bsr()](https://pytorch.org/docs/stable/generated/torch.Tensor.to_sparse_bsr.html) API to to convert the weights to [Block Sparse Representation](https://pytorch.org/docs/stable/sparse.html#sparse-bsr-docs) (BSR) format that stores only the non-zero values and the indices of their blocks. This step only needs to be done once and the results can be cached for runtime. - -During runtime, no changes in code are required. We just pass any input tensor to the model, and when the forward() function of the sparsified linear layers are invoked, PyTorch takes care of invoking the optimized matrix multiplication for block sparse weights. This should work for A100 as well as H100 NVIDIA GPUs. - - -## Results: Microbenchmarks - -To validate the viability of block sparsity from a performance standpoint, we first ran a series of microbenchmarks using this [simple script](https://github.com/pytorch/ao/blob/73f8efce1e950235f58dc917ee204517ec74bba0/benchmarks/benchmark_gpu_sparsity.py). Using the linear shapes from ViT-b, we compared the speedup of our block sparse kernels across a single linear layer as we varied the sparsity level and block size of the weight matrix. - -We run using PyTorch 2.3.0.dev20240305+cu121 nightly on NVIDIA A100s and report the speedup of each sparsity configuration compared to dense baseline. We observed positive speedups when block size >=32 or sparsity level >= 0.8 for float32, while for bfloat16 we observe smaller speedups and usually for block size 64 and higher sparsities. Hence, for end-to-end speedups on the model, we will focus in this blog on float32 and leave bfloat16 for future work. - - - -![Micro benchmarking results on linear layers of ViT-b-16.](/assets/images/speeding-up-vits/fig3.png){:style="width:100%;"} - - - - -![Micro benchmarking results on linear layers of ViT-b-16.](/assets/images/speeding-up-vits/fig4.png){:style="width:100%;"} - - -_Micro benchmarking results on linear layers of ViT-b-16._ - - -## Results: Vision Transformers - -Once we confirmed that we were able to show speedups over the linear layers, we focused on showing end-to-end speedups on [ViT_B_16](https://pytorch.org/vision/main/models/generated/torchvision.models.vit_b_16.html). - -We trained this model from scratch on ImageNet dataset using the standard [ViT_B_16 recipe](https://github.com/pytorch/vision/tree/main/references/classification#vit_b_16). We show speedups for sparsifying MLP modules and leave sparsifying weights of input and output projections of attention for future work. 
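To make the inference-time conversion described in the Inference section above more concrete, here is a minimal sketch. The layer shape and the random block mask are placeholders for illustration only (the real flow folds the trained Supermask into the weights), and it assumes a float32 model on an A100/H100 as discussed above:

```python
import torch

torch.manual_seed(0)
blocksize = 64

# Stand-in for one MLP linear layer of ViT-b (shape chosen for illustration).
linear = torch.nn.Linear(768, 3072).cuda().float()

# Stand-in for a trained binary block mask at roughly 80% sparsity.
out_f, in_f = linear.weight.shape
block_mask = (torch.rand(out_f // blocksize, in_f // blocksize, device="cuda") > 0.8).float()
mask = block_mask.repeat_interleave(blocksize, 0).repeat_interleave(blocksize, 1)

with torch.no_grad():
    # Fold the mask into the dense weights, then keep only the non-zero blocks.
    sparse_weight = (linear.weight * mask).to_sparse_bsr((blocksize, blocksize))
    linear.weight = torch.nn.Parameter(sparse_weight, requires_grad=False)

# No further code changes: the forward pass dispatches to the block-sparse matmul.
x = torch.randn(256, 768, device="cuda")
y = linear(x)
```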
- -We looked at wall-clock inference speedup, focusing on batch size 256. We found that: - - - -* For 90% sparsity we can get 1.24x, 1.37x, 1.65x speedups for block sizes 16, 32, and 64 respectively. -* To obtain speedup, the minimum sparsity for block sizes 16, 32, and 64 are 0.86, 0.82, and 0.7 respectively. Hence, as expected, the larger the block size, the smaller sparsity we need to obtain speedup. - -We note a limitation of the `sparse_bsr()` API: that layers need to be multiples of the block size. Since the dimensions of the last FC classification layer in ViT was not a multiple of the block size, they were not converted to BSR representation in our experiments. - - -![Speedup on ViT-b-16 with batch size 256 on MLP modules across different batch sparsities and block sizes.](/assets/images/speeding-up-vits/fig5.png){:style="width:100%;"} - -_Speedup on ViT-b-16 with batch size 256 on MLP modules across different batch sparsities and block sizes._ - -We also explored the speedup for different batch sizes for 90% sparsity. We observed a speedup over the baseline for batch sizes starting from 16 and upwards. While bigger block sizes have bigger speedups at the largest batch sizes, the smallest possible batch size to obtain >1 speedup is smaller for smaller block sizes. - -We believe on-device hardware can obtain speedups for batch size 1 as they - unlike server GPUs - can be fully utilized at such small batch sizes. - - -![Speedup on ViT-b-16 with 90% sparsity on MLP modules across different batch sizes and block sizes.](/assets/images/speeding-up-vits/fig6.png){:style="width:100%;"} - - -_Speedup on ViT-b-16 with 90% sparsity on MLP modules across different batch sizes and block sizes._ - -Looking at the Top-1 accuracy on ImageNet=blurred test set of the sparsified models for different block sizes and sparsities, we see a few expected results: - - - -* low levels of sparsity (<=70%) have no meaningful regression in accuracy -* mid levels of sparsity (>=80% to <90%) have limited regression in accuracy -* high levels of sparsity (>=90%) removes so many weights that accuracy is significantly impacted - -More research could be done to improve accuracies of higher sparsities and larger block sizes. We hope that the block sparsity support in PyTorch and the illustrated speedups in this blog will encourage researchers to explore more accurate sparsification approaches. - - -![Accuracies on training ViT-b-16 on ImageNet-blurred using the SuperMask approach.](/assets/images/speeding-up-vits/fig7.png){:style="width:100%;"} - -_Accuracies on training ViT-b-16 on ImageNet-blurred using the SuperMask approach._ - - -## Next Steps - -We have shown promising speedups for block sparsifying MLP modules ViT in float32 precision. There is still more work to be done in order to observe speedups on bfloat16 and we hope to obtain progress on that soon. Possible next steps to further optimize block sparsity on vision transformers and transformers in general: - - - -* Perform block sparsity on attention input and output projections. -* Perform block sparsity during finetuning rather than training from scratch. -* Perform further optimizations on the matmul kernels for ViT's linear operator specific shapes (especially for 80% and lower sparsity). 
-* Combine with other optimizations such as int8 and torch.compile() -* Explore other weight sparsification algorithms, e.g., [Spartan](https://arxiv.org/abs/2205.14107), to improve accuracy -* Explore selecting weights to sparsify (e.g., specific transformer layers) - -Please reach out to [melhoushi@meta.com](mailto:melhoushi@meta.com) if you have questions or are interested in contributing to block sparsification! - -Additionally if you’re broadly interested in sparsity please feel free to reach out to [@jcaip](https://github.com/jcaip) / [jessecai@meta.com](mailto:jessecai@meta.com) and please come check out [torchao](https://github.com/pytorch/ao), a community we’re building for architecture optimization techniques like quantization and sparsity. diff --git a/_posts/2024-05-15-achieving-sustainability-goals.md b/_posts/2024-05-15-achieving-sustainability-goals.md deleted file mode 100644 index abb112ac0f53..000000000000 --- a/_posts/2024-05-15-achieving-sustainability-goals.md +++ /dev/null @@ -1,66 +0,0 @@ ---- -layout: blog_detail -title: "Achieving Sustainability Goals with PyTorch and Intel AI" ---- -This post was contributed by Intel AI in partnership with the PyTorch Foundation. - -In 2017, the UN Global Compact emphasized digital technology, particularly open source, as crucial for achieving Sustainable Development Goals (SDGs), projecting a potential $2.1 trillion boost to the tech sector by 2030. The SDGs, part of the "2030 Agenda for Sustainable Development," address global prosperity across various sectors. - -The [Linux Foundation's Sustainability Initiative](https://www.linuxfoundation.org/projects/sustainability) aligns projects with sustainable development goals. By assessing project impact, resources can be better allocated for enhancement. Intel is also a contributor to this initiative, and recently presented three use cases with PyTorch and Intel AI to address UN SDG-aligned issues. - -![Sustainability Goals](/assets/images/achieving-sustainability-goals.png){:style="width:100%;"} - -## SDG 15: Life on Land - -* Using a bone likelihood map to pinpoint dinosaur bones, which paves the way for transfer learning to tackle contemporary challenges like wildfire prediction. -* Employing transfer learning for wildfire prediction and generating data with Stable Diffusion. - -## SDG 9: Industry, Innovation, Infrastructure - -* Identifying crucial minerals, oil, and gas through subsurface models. - -Here are the key highlights from the workshops. Read below for a summary, and be sure to watch the full workshop videos and visit the GitHub repositories. - -## Session 1: Introduction to Dinosaur Bone Bed Maps - -Bob Chesebrough recently led a PyTorch workshop demonstrating how to create a dinosaur bone bed map for Dinosaur National Monument. He shared footage of his discoveries and explained his AI-driven approach, utilizing geological data to pinpoint possible bone-rich areas. - -Attendees learned to set up JupyterLab, access the training section, and launch a BASH shell. Bob's classification model, applied to aerial images, facilitated heatmap generation to identify potential bone locations, refined through field data. The GitHub repo "Jurassic" guided participants through directory setup and model optimization steps. - -Rahul Unnikrishnan Nair demonstrated the use of PyTorch, focusing on performance enhancements. The workshop covered modeling best practices, such as data transformations, class distribution, dropout layers, and efficient training methods. 
Training and scoring procedures were examined, with a focus on model accuracy and transportability to other regions. Heatmap creation involved cutting images into tiles, considering context for accurate environmental identification. - -Watch the [full workshop video here ](https://www.youtube.com/watch?v=w4JmPkqnD0E)and visit the [GitHub repository ](https://github.com/intelsoftware/jurassic)to access the code sample and experiment with the code using [Intel ® Extension for PyTorch](https://pytorch.org/tutorials/recipes/recipes/intel_extension_for_pytorch.html). Try it out with PyTorch and explore what works best for you. Happy dinosaur bone hunting! - -## Session 2: Seismic Data to Subsurface Models with OpenFWI: Training an AI Model with PyTorch - -Seismic exploration is crucial for subsurface imaging in mineral and oil/gas exploration. Full waveform inversion (FWI) recreates subsurface sound wave velocities, akin to ultrasound for the Earth. - -Ben Consolvo, an AI Software Engineering Manager at Intel, presented training AI models directly from seismic data using PyTorch on Intel high-performance processors. FWI, though accurate, is computationally intensive and relies on precise initial models. AI models offer an alternative approach, learning directly from data without the need for precise initializations. Ben explained the challenges of AI models, highlighting the need for diverse datasets and the potential use of CPUs for fine-tuning. He also discussed FWI's surprising medical applications. - -Watch the[ full video here](https://www.youtube.com/watch?v=zvk3Rr-OjU0) and go to the[ paper](https://betterprogramming.pub/seismic-data-to-subsurface-models-with-openfwi-bcca0218b4e8) for more details. The GitHub repo is[ OpenFWI](https://github.com/lanl/OpenFWI). - -## Session 3: Using PyTorch to Aid Wildfire Prediction - -Forest fires pose significant threats to ecosystems, wildlife, and communities. Machine learning presents a promising approach to enhance prediction accuracy. In this Earth Day webinar, Bob Chesebrough and Rahul Unnikrishnan Nair demonstrated image analysis techniques using the MODIS dataset which was used to predict early forest fire probabilities. Through fine-tuning a ResNet18 model with the Intel® Extension for PyTorch, pre-trained models were adjusted with aerial photos, utilizing geo-spatial and color data for fire risk assessment. - -Emphasizing the temporal and geographical filtering requirements for dataset analysis, showcasing images from fire-affected areas like Paradise, CA, the model's adaptability to different hardware configurations was highlighted, along with the utilization of Stable Diffusion for data synthesis when real datasets were unavailable. The presenters encouraged audience engagement in PyTorch experimentation for early fire detection by extending a challenge to leverage these tools for critical predictive tasks. Join them in this endeavor to enhance wildfire prevention and protection efforts. - -Watch the[ full video here](https://www.youtube.com/watch?v=gSC_IHyx0IM) and go to the[ paper](https://www.intel.com/content/www/us/en/developer/articles/technical/predicting-forest-fires-using-pytorch.html) for more details. The GitHub repo is[ ForestFirePrediction](https://github.com/IntelSoftware/ForestFirePrediction). 
- -## About the Intel Speakers - -[Bob Chesebrough](https://www.linkedin.com/in/robertchesebrough/), Sr Solutions Architect - -Bob Chesebrough’s industry experience is software development/AI solution engineering for Fortune 100 companies and national laboratories for over three decades. He is also a hobbyist who has logged over 800 miles and 1000 hours in the field finding dinosaur bones. He and his sons discovered an important fossil of the only known crocodilian from the Jurassic in New Mexico, they have also discovered and logged into the museum 2000+ bones localities and described a new mass bone bed in New Mexico. - -[Rahul Unnikrishnan Nair](https://www.linkedin.com/in/rahulunair/), Architect in Applied AI and the Engineering Lead at Intel® Liftoff - -In his current role at Intel® Liftoff for Startups program, Rahul Nair brings his extensive experience in applied AI and engineering to mentor early-stage AI startups. His dedication lies in helping these startups transform their innovative ideas into fully-fledged, market-ready products with a strong emphasis on use-case-driven, practical engineering and optimization. - -[Ben Consolvo](https://www.linkedin.com/in/bconsolvo/), AI Software Engineering Manager - -Ben Consolvo is an AI Solutions Engineering Manager at Intel. He has been building a team and a program around Intel’s AI technology paired with Intel’s hardware offerings. He brings a background and passion in data science, particularly in deep learning (DL) and computer vision. He has applied his skills in DL in the cybersecurity industry to automatically identify phishing websites, as well as to the oil and gas industry to identify subsurface features for geophysical imaging. - -[Kelli Belcher](https://www.linkedin.com/in/kelli-belcher/), AI Solutions Engineer - -Kelli Belcher is an AI Solutions Engineer at Intel with over 5 years of experience across the financial services, healthcare, and tech industries. In her current role, Kelli helps build Machine Learning solutions using Intel’s portfolio of open AI software tools. Kelli has experience with Python, R, SQL, and Tableau, and holds a Master of Science in Data Analytics from the University of Texas. diff --git a/_posts/2024-05-21-maximizing-training-throughput.md b/_posts/2024-05-21-maximizing-training-throughput.md deleted file mode 100644 index 63ffc8e19ab4..000000000000 --- a/_posts/2024-05-21-maximizing-training-throughput.md +++ /dev/null @@ -1,216 +0,0 @@ ---- -layout: blog_detail -title: "Maximizing Training Throughput Using PyTorch FSDP and Torch.compile" -author: Team PyTorch at IBM and Team PyTorch at Meta ---- - -[Recently](https://pytorch.org/blog/maximizing-training/), we demonstrated how FSDP and selective activation checkpointing can be used to achieve **57% MFU (Model Flops Utilization)** for training a 7B model on A100 GPUs. We also demonstrated how it can train a high quality model, which we open sourced as [Granite 7B base model](https://huggingface.co/ibm/granite-7b-base) on Hugging Face Hub under the Apache v2.0 license. - -We continued our quest to improve the utilization of GPUs by leveraging torch.compile. Using torch.compile and the selective activation checkpointing from our previous work, we achieve a **MFU of 68%** for the 7B model on A100 GPUs! torch.compile improves training MFU between 10% and 23% for various model sizes. 
- -This blog is organized into three parts: (1) Challenges addressed in order to train using torch.compile, (2) Numerical parity of compile with no-compile, and (3) MFU report. - -We open sourced all the code and updated it in the [fms-fsdp repository.](https://github.com/foundation-model-stack/fms-fsdp) We are also working with Team PyTorch at Meta to contribute these to the newly released [torch titan](https://github.com/pytorch/torchtitan) repository for pre-training. - - -## Challenges of using torch.compile - -torch.compile is a graph compilation technique that improves GPU utilization. For details on how torch compile works, we refer the readers to the recent [PyTorch paper](https://pytorch.org/blog/pytorch-2-paper-tutorial/) and associated tutorials. A key challenge in getting torch.compile to perform well is to minimize (or eliminate) graph breaks. We initially started with the Llama implementation provided by Meta, but compiling it caused too many graph breaks resulting in reduced training throughput. - -Several portions of the model architecture had to be fixed, with the most important one being the positional embedding layer (RoPE). The typical RoPE implementation uses complex numbers, which was not supported in torch.compile at the time of testing. We implemented RoPE using einops while maintaining parity with the original model architecture implementation. We had to properly cache the frequencies so that we did not run into graph breaks within the RoPE implementation. - -Compiling an FSDP model does result in graph breaks, which the PyTorch team at Meta is working to remove. However, these graph breaks as of PyTorch 2.3 are at FSDP unit boundaries and do not affect throughput significantly. - -When using custom kernels, we need to wrap each kernel by exposing its API to torch.compile. This involves indicating what parameters are modified in-place, how they are modified, and what shapes and strides will their return values have based on the inputs. In our case, SDPA Flash attention is already integrated appropriately and we were able to get that kernel to work with torch.compile with no graph breaks. - -We also noticed that when increasing the amount of data from 2T to 6T tokens, the data loader became a bottleneck. A key reason for this is the fact that previously, we implemented document shuffling in our dataloader naively, by having each worker maintain a list of shuffled document pointers. - -With the larger dataset, these pointer lists were growing to hundreds of thousands of entries per worker. Maintaining pointer lists at this scale became expensive enough that cpu contention throttled our training throughput. We re-implemented document shuffling without any pointer lists using a [Linear Congruential Generator](https://en.wikipedia.org/wiki/Linear_congruential_generator). LCG is a pseudorandom number generator algorithm that implements a random walk over a population, providing sampling without replacement. - -We leveraged the same idea to produce implicit bijective mappings from ordered to shuffled document indices. This enables us to shrink those annoying lists of hundreds of thousands of pointers down to a single integer state for the LCG. This eliminated 80% of the bottleneck and provided a significant boost to our performance. We will devote a separate blog to go into all the details of our performant pre-training data loader. 
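To make the idea concrete, here is a toy sketch of the trick (not the actual fms-fsdp data loader): a full-period LCG visits every index in a range exactly once, so each worker can walk a pseudorandom permutation of its documents while storing only a single integer of state.

```python
def lcg_document_order(n_docs: int, seed: int = 0):
    """Toy sketch: yield a pseudorandom permutation of range(n_docs) with O(1) state."""
    # Round the modulus up to a power of two so the Hull-Dobell conditions below
    # (odd increment, multiplier congruent to 1 mod 4) guarantee a full period.
    m = 1
    while m < n_docs:
        m *= 2
    a = 5                      # multiplier: a % 4 == 1 (real implementations pick better mixers)
    c = 2 * seed + 1           # increment: must be odd for a power-of-two modulus
    state = seed % m
    for _ in range(m):
        state = (a * state + c) % m
        if state < n_docs:     # skip states that fall outside the document range
            yield state

# Example: a shuffled pass over 10 documents, resumable from just the integer `state`.
print(list(lcg_document_order(10, seed=3)))
```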
- - -## Numerical Parity of torch.compile and torch.no-compile - -We had previously observed parity issues when training with compile and no-compile options, with one of these being related to the use of SDPA. After a few days of intense debugging sessions between the PyTorch teams at Meta and IBM, we were able to achieve parity between PyTorch compile and no-compile modes. To document and verify this parity, we take a mini-Llama model architecture of 1.4B size and train it to 100B tokens in four variations – no-compile, compile with no activation checkpointing, compile with selective activation checkpointing, and compile with full activation checkpointing. - -We plot the loss curves and gradient norm for these options below: - -![Figure 1: Loss curve and gradient norm for various compile options](/assets/images/max-training-chart.jpg){:style="width:100%;"} - -_Figure 1: Loss curve and gradient norm for various compile options_ - -Further, we run the lm-evaluation-harness and compare the various model scores on different benchmarks and observe no major differences between compile and no-compile, which is shown below. - - -![Figure 2: lm-evaluation-harness comparison of various benchmarks between compile and no-compile](/assets/images/max-training-table.png){:style="width:100%;"} - - -_Figure 2: lm-evaluation-harness comparison of various benchmarks between compile and no-compile_ - -We observe from all these results that compile with all its variants is equal to no-compile option, thus demonstrating parity between compile and no-compile. - - -## MFU report - -Finally, like our previous blog, we compute the MFU for four different model sizes on two clusters. One cluster is 128 A100 GPUs with 400 Gbps inter-node connectivity, and the other is 464 H100 GPUs with 3.2 Tbps inter-node connectivity. We use the selective activation checkpointing that we covered [in the prior blog](https://pytorch.org/blog/maximizing-training/) in addition to compile. We capture the results in the table below. - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
| Model size | Batch size | MFU no-compile | MFU compile | Percentage gain (%) |
|------------|------------|----------------|-------------|---------------------|
| 7B         | 2          | 0.57           | 0.68        | 20                  |
| 13B        | 2          | 0.51           | 0.60        | 17                  |
| 34B        | 2          | 0.47           | 0.54        | 15                  |
| 70B        | 2          | 0.50           | 0.55        | 10                  |

_Table 1: MFU results with compile and no compile for Llama2 model architectures on 128 A100 80GB GPUs with 400Gbps internode interconnect_
| Model size | Batch size | MFU no-compile | MFU compile | Percentage gain (%) |
|------------|------------|----------------|-------------|---------------------|
| 7B         | 2          | 0.37           | 0.45        | 21                  |
| 13B        | 2          | 0.35           | 0.43        | 23                  |
| 34B        | 2          | 0.32           | 0.38        | 19                  |
| 70B        | 2          | 0.32           | 0.38        | 19                  |
        - - -_Table 2: MFU results with compile and no compile for Llama2 model architectures on 464 H100 80GB GPUs with 3.2Tbps internode interconnect_ - -We also had an internal production run on 448 GPUs using a Llama2 7B architecture. Using compile and selective activation checkpointing, with a global batch size of 3.7M, we trained for 4T tokens in 13 days 10 hours! - -During training, the data center cooling had to kick in with extra air conditioning and our training team was alerted to this, since we were using the GPUs quite effectively ☺ - -One key observation from the tables 1 and 2 is that the MFU numbers do not linearly scale with model size. There are two possible explanations that we are actively investigating, one is the scalability of FSDP as model size increases and when tensor parallel needs to be enabled to more effectively use the GPU and the other is batch size, which can be increased further to get better MFU. We plan to explore FSDP v2 and selective operator checkpointing along with the tensor parallel feature to study the scaling laws of FSDP with model size. - - -## Future Work - -We plan to start testing FSDP v2 which will be released as part of PyTorch 2.4. FSDP2 provides per parameter sharding and selective operator checkpointing feature that can potentially provide even better memory-compute tradeoffs. - -We have also been engaged with the PyTorch team at Meta to evaluate the new asynchronous checkpointing feature that can further improve the GPU utilization by reducing the time to write checkpoints. - -We are exploring extending various Triton kernels currently used in inference to perform backward operations to gain speedups beyond inference only. - -Finally, as recent work on use of fp8 is emerging, we plan to explore how we can even further accelerate model training using the new data type that promises a 2x acceleration. - - -## Acknowledgements - -There are several teams that have been involved in reaching this proof point and we would like to thank the teams across Meta and IBM. Specifically, we extend our gratitude to the Meta PyTorch distributed and compiler teams and IBM Research. - -Multiple people were extensively involved in the effort of achieving torch.compile numerical parity with our models, and we wish to acknowledge the key folks involved in this effort; Animesh Jain and Less Wright at Meta, and Linsong Chu, Davis Wertheimer, Brian Vaughan, Antoni i Viros Martin, Mudhakar Srivatsa, and Raghu Ganti at IBM Research. - -Special thanks to [Stas Bekman](https://www.linkedin.com/in/stasbekman/?originalSubdomain=ca), who provided extensive feedback and helped improve this blog. Their insights have been invaluable in highlighting key aspects of optimizing the training and exploring further enhancements. diff --git a/_posts/2024-06-04-docathon-kickoff-h1-2024.md b/_posts/2024-06-04-docathon-kickoff-h1-2024.md deleted file mode 100644 index a1cb1035f02a..000000000000 --- a/_posts/2024-06-04-docathon-kickoff-h1-2024.md +++ /dev/null @@ -1,59 +0,0 @@ ---- -layout: blog_detail -title: "Ready, Set, Contribute: PyTorch Docathon Kickoff H1 2024" ---- - -The PyTorch Docathon is now live! This event is dedicated to enhancing the quality of the PyTorch documentation with the invaluable assistance of our community. Our hope with this Docathon is to simplify the process for new users to get started with PyTorch, guide them in effectively utilizing its features, and ultimately expedite the transition from research to production in machine learning. - -

**JOIN THE KICK-OFF EVENT on June 4th at 10 AM PT**
        - - -## Event Details - -* June 4: Kick-off - join a 30-minutes livestream kick off event on Discord on June 4th at 10 AM PT [here](https://discord.com/events/878249534336167987/1245440397510180907). If you can't join the kick-off event, watch our [welcome video](https://youtu.be/2D0aej50umA) on YouTube -* June 4-June 16: Submissions and Feedback -* June 17-18: Final Reviews -* June 20: Winner Announcements - -## How to Contribute - -Review the Docathon H1 2024 issue in the [pytorch/pytorch](https://github.com/pytorch/pytorch/issues/127345) or[ pytorch/tutorials ](https://github.com/pytorch/tutorials/issues/2894)repo that contain all the necessary information on participating in the Docathon and highlights the specific issues to work on. Remember to sign the CLA in your first PR and adhere to the Code of Conduct guidelines. - -## Read the Code of Conduct - -Take a moment to review the PyTorch code of conduct found [here](https://docs.github.com/en/site-policy/github-terms/github-community-code-of-conduct). This document outlines the expectations for behavior and communication within our team, and it is important that everyone is aware of and adheres to these guidelines. - -## Join our Discord - -This channel serves as the main communication hub during the Docathon. You can join it using by using this link: - -

        JOIN DISCORD SERVER

        - -When you first join the server, you will have limited access. To gain full access to our Discord PyTorch Docathon Channel: - -1. Enter the server and navigate to the #self-roles channel. -2. In the #self-roles channel, click on the 'Join Docathon' button in the relevant post to assign yourself the docathon role. -3. After assigning the role, you will see the 'PyTorch Docathon H1 2024 Section' in the left-hand menu for discussions. -4. To help prevent spam we are asking that you change your server username to your GitHub username or the email username you registered with. - -## Explore the GitHub Issues - -All the Docathon issues are posted on GitHub. You can find them by the docathon-h1-2024 label in the following participating repositories: - -* [pytorch/pytorch](https://github.com/pytorch/pytorch/issues?q=is%3Aissue+is%3Aopen+sort%3Aupdated-desc+label%3Adocathon-h1-2024) -* [pytorch/tutorials](https://github.com/pytorch/tutorials/issues?q=is%3Aissue+is%3Aopen+sort%3Aupdated-desc+label%3Adocathon-h1-2024) -* [pytorch/xla](https://github.com/pytorch/xla/issues?q=is%3Aissue+is%3Aopen+sort%3Aupdated-desc+label%3A%22docathon-h1-2024%22) -* [pytorch-labs/torchfix](https://github.com/pytorch-labs/torchfix/issues?q=is%3Aissue+is%3Aopen+sort%3Aupdated-desc+label%3Adocathon-h1-2024) - -The issues are categorized into three levels of difficulty: easy, medium, and advanced. If this is your first time contributing to PyTorch, we recommend starting with an issue at the easy level. - -## Prizes for Winners - -We will have a leaderboard throughout the duration of the Docathon. The more you contribute, the higher you’ll get on the board! Our top three winners will get free admission to [PyTorch Conference 2024](https://events.linuxfoundation.org/pytorch-conference/). - -## Thank you to our Partners - -This year, we’re thrilled to work with the PyTorch Teams at Meta, Google and Snowflake to help us put on a successful event. We’ll also be at [Snowflake Dev Day](https://u20934166.ct.sendgrid.net/ls/click?upn=u001.I0np5LuiPyKW16wNY408Oo-2Bbsl8LTTFpumobGm2Rp-2F8zbPScG1hdhIF5oBHMWg96q8d7q31-2B1wRrzDpeTgLDCD6FBreEsZKA-2Fi2T-2BA2upbvMD1hmAnpEutE9KOQuxVZnSlQQ1xyyTXB16qRa8VpgLD4ScqnpqqdSffoL4rKHjpyoVMVMoxCsEuKcHJfDWGiMECgZDoegcf9j7bnJPdx0qXN4phU37F77vSXetvmrfy1t6YtpSvgX2eACaznmpmFNfyRtlDRsfdY8YgoZN3rkCMoL0bwCFXdt-2FvIZawhnUV2qppyS0ZgmSCGl-2BufBL0mn8HJIew4LxuXuUB3gfsWD1K6Hb2oiECdgEL-2BK-2BZ6OVUBbdfsupYb6tlGTA2Ic6jTTOODdc5-2B1RKMxIBw1-2FcC8Dzd4uxjdIqgrF4vQ0zdGsVXr2rsWISdAzRwwZ6HxvPYiL8vq_eKG3f-2BVkGDs-2F-2BQAgHZwkvtAslantAPMulCPu-2FMezvzmhqnecR6Zi4bBfzHqa-2BAIYrvYiOg1COidzFz394ty6L-2FhCaPa85b-2BvNqP8-2BnHbrX2cxmOcMxJolrQzGKT9AZVkURRXc3AUORnPJRfwJ32v8Dp-2Fpx9b2Kf973NO10Vsxu9GvjAtGFxlGl-2Bgjhs0tu8Jlhj-2BCG2lXGsFGXnxI1t1hw-3D-3D) on June 6 where you can hear from Meta’s Matthias Reso, and check out our PyTorch booth. - -Happy contributing! 
\ No newline at end of file diff --git a/_posts/2024-06-06-int4-decoding.md b/_posts/2024-06-06-int4-decoding.md deleted file mode 100644 index 9a1b28ca262f..000000000000 --- a/_posts/2024-06-06-int4-decoding.md +++ /dev/null @@ -1,3485 +0,0 @@ ---- -layout: blog_detail -title: "INT4 Decoding GQA CUDA Optimizations for LLM Inference" -author: Sarunya Pumma, Jongsoo Park, Jianyu Huang, Amy Yang, Jaewon Lee, Daniel Haziza, Grigory Sizov, Jeremy Reizenstein, Jeff Johnson, Ying Zhang ---- - -#### An efficient decoding Grouped-Query Attention with low-precision KV cache - -## Introduction - -Generative AI has taken the world by storm with its ability to generate content like humans. Many of these generative AI tools are powered by large language models (LLMs), like Meta [Llama](https://llama.meta.com/llama3/) models and OpenAI’s [ChatGPT](https://openai.com/gpt-4). One of the main challenges of LLMs is supporting large “context lengths” (also known as “sequence lengths”). The context length refers to the number of tokens that the model uses to understand the input context and generate responses. Longer context lengths generally translate into higher precision and quality in the responses. However, long context lengths are compute and memory intensive. This is mainly due to the following reasons: - - - -* The computational complexity of attention layers increases proportionally with the context length (the growth rate depends on the attention algorithm). As a result, when using long context lengths, the attention layers can become a bottleneck, particularly during the prefill phase where attentions are compute bound. -* The KV cache size grows linearly with the context length, thus, putting higher pressure on the memory requirement and consequently slowing down the already memory-bound attention decoding. Moreover, since the memory capacity is limited, the batch size reduces when the KV cache gets bigger, which generally results in a drop in throughput. - -The computational complexity growth is difficult to solve compared to the other problem mentioned above. One way to address the KV cache size growth problem is to use low precision KV cache. From our experiments, group-wise INT4 quantization provides comparable results in terms of accuracy compared to BF16 KV cache during the decode phase in Meta Llama 2 inference. However, we did not observe any latency improvement, despite reading 4x lesser data in attention decoding layers. This means that the INT4 attention is 4x less efficient at utilizing precious HBM bandwidth than BF16 attention. - -In this note, we discuss the CUDA optimizations that we applied to INT4 GQA (grouped-query attention – the attention layer that we use in the LLM inference phase) to improve its performance by up to **1.8x on the NVIDIA A100 GPU** and **1.9x on the NVIDIA H100 GPU**. - - - -* The **optimized CUDA INT4 GQA** outperformed [INT4 Flash-Decoding GQA](https://pytorch.org/blog/flash-decoding/) (the best performing INT4 GQA that we used in the experiment mentioned above) by **1.4x-1.7x on A100** and **1.09x-1.3x on H100.** -* The **optimized CUDA INT4 GQA** performs better than **BF16 Flash-Decoding GQA** by **1.5x-1.7x on A100 and 1.4x-1.7x on H100.** - - -## Background - - -### GQA for LLM Inference - -[Grouped-Query Attention (GQA)](https://arxiv.org/abs/2305.13245) is a variant of multi-head attention (MHA) where each KV cache head is shared across a group of query heads. 
Our LLM inference adopts GQA as an attention layer in both the prefill and decode phases in order to reduce the capacity requirement for the KV cache. We use multiple GPUs in inference where the KV cache and query heads are distributed across GPUs. Each GPU runs an attention layer with a single KV head and a group of Q heads. Therefore, when viewed from a single GPU perspective, the GQA component can also be described as [MQA (Multi-Query Attention)](https://arxiv.org/abs/1911.02150). - -The simplified workflow of decoding GQA is illustrated in Figure 1. GQA takes three main inputs: input query (denoted `Q`), K cache (denoted `K`), and V cache (denoted `V`). Our current GQA inference uses BF16 for `Q`, `K`, and `V`. - - - -* `Q` is a 4D BF16 tensor of shape (`B`, `1`, HQ, `D`) -* `K` is a 4D BF16 tensor of shape (`B`, Tmax, HKV, `D`) -* `V` is a 4D BF16 tensor of shape (`B`, Tmax, HKV, `D`) - -_where_ - - - -* `B` is the batch size (the number of input prompts) -* HQ is the number of query heads -* HKV is the number of KV heads (HQ must be divisible by HKV) -* Tmax is the maximum context length -* `D` is the head dimension (fixed to 128) - -GQA is simply bmm(softmax(bmm(Q, KT) / sqrt(D)), V). This yields a single output tensor (denoted as `O`) which is a 4D BF16 tensor that has the same shape as `Q`. Note that matrix multiplications are performed using BF16, however, accumulation and `softmax` are carried out in FP32. We call this “BF16 GQA” as the KV cache is BF16. - - -![Figure 1: The simplified workflow of BF16 GQA for LLM inference](/assets/images/int4-decoding/fg1.png){:style="width:100%;display:block;max-width:500px;margin-left:auto;margin-right:auto;"} - -**Figure 1** The simplified workflow of BF16 GQA for LLM inference - - -### INT4 GQA - -To further reduce the size of the KV cache, we explore the possibility of using INT4 for KV cache instead of BF16. We estimate the potential performance improvement by calculating the computational intensity (CI) of INT4 GQA and comparing it to that of BF16 GQA, as CI represents FLOPS per byte. We compute the CI for QKT and `PV` (as shown in Equation 1) as they take KV cache as an operand. Note that we disregard the `Q` load as it is negligible compared to the KV cache. We also ignore any intermediate data loads/stores that are not on global memory. Thus, the CI only takes into account the computation FLOPS and KV cache loads. - - -![Equation 1](/assets/images/int4-decoding/eq.jpg){:style="width:100%;display:block;max-width:400px;margin-left:auto;margin-right:auto;"} - -**Equation (1)** - - -Assuming that HQ = 8 and HKV = 1, CI for BF16 KV cache is 8 while CI for INT4 KV cache is 32. The CIs indicate that both BF16 and INT4 GQAs are memory bound (the peak CIs for BF16 tensor cores for A100 and H100 are [312 TF / 2 TB/s = 141](https://www.nvidia.com/content/dam/en-zz/Solutions/Data-Center/a100/pdf/a100-80gb-datasheet-update-nvidia-us-1521051-r2-web.pdf) and [990 TF / 3.35 TB/s = 269](https://www.nvidia.com/en-us/data-center/h100/); note that these TF numbers are without sparsity). Moreover, with INT4 KV cache, we should expect up to 4x performance improvement compared to BF16 GQA. - -To enable INT4 KV cache support in GQA, we can dequantize the KV cache from INT4 to BF16 before passing it to the BF16 GQA operator. However, since KV cache is typically large, copying it from/to global memory can be costly. Moreover, decoding GQA is a memory bound operation (the memory unit is utilized much more heavily than the compute unit). 
Figure 2 shows the NCU profile of the [FMHA CUTLASS BF16 GQA kernel in xFormers](https://github.com/facebookresearch/xformers/blob/9f6abadabdec17cd4b5c301632a44bf8216a7f35/xformers/csrc/attention/cuda/fmha/autogen/impl/cutlassF_bf16_aligned.cu#L33), which is one of the state of the art implementations of GQA. From the figure, it is obvious that memory is a bottleneck. - - -![Figure 2: The NCU profile of the FMHA CUTLASS BF16 kernel in xFormers](/assets/images/int4-decoding/fg2.png){:style="width:100%"} - -**Figure 2** The NCU profile of the [FMHA CUTLASS BF16 kernel in xFormers](https://github.com/facebookresearch/xformers/blob/9f6abadabdec17cd4b5c301632a44bf8216a7f35/xformers/csrc/attention/cuda/fmha/autogen/impl/cutlassF_bf16_aligned.cu#L33) - -A more efficient alternative is to fuse INT4 dequantization with the GQA operation (shown in Figure 3). In other words, having GQA read INT4 KV cache directly and perform the INT4 to BF16 conversion within the kernel. This change can potentially reduce the amount of global memory reads required for the KV cache, which could lead to a decrease in latency. We call this “INT4 GQA.” - - -![Figure 3: The workflow of fused INT4 GQA](/assets/images/int4-decoding/fg3.png){:style="width:100%;display:block;max-width:500px;margin-left:auto;margin-right:auto;"} - -**Figure 3** The workflow of fused INT4 GQA - -We list the state of the art implementations of GQA in the table below along with their features in Table 1. - -**Table 1** State of the art GQA implementations - - - - - - - - - - - - - - - - - - - - - - - - - - -
| Implementation | Denote | BF16 GQA | Fused INT4 GQA |
|----------------|--------|----------|----------------|
| Flash-Decoding (Triton implementation) | FD | Yes | Yes |
| Flash Attention (v2.3.3) | FA | Yes | No |
| CUDA baseline | CU | Yes | Yes |

All implementations, except for CU, support both split-K and non split-K. CU only has the split-K implementation. Only FA has a heuristic in the backend to determine whether to run the split-K or non split-K kernel. For other implementations, users must explicitly choose which version to run. In this note, we focus on long context lengths (in our experiments, we use a context length of 8192) and therefore opt for the split-K version wherever possible.

As the baseline, we measured the performance of the state of the art GQA implementations on NVIDIA A100 and H100 GPUs. The latency (time in microseconds) and achieved bandwidth (GB/s) are reported in Table 2. Note that we ran a range of split-Ks (from 2 to 128 splits) and reported the best performance for each implementation. For all experiments, we use a context length of 8192. For INT4 GQA, we used row-wise quantization (i.e., num quantized groups = 1).

**Table 2** Baseline GQA performance

On A100
**Time (us)**

| Batch size | BF16 FD | BF16 FA | BF16 CU | INT4 FD | INT4 FA | INT4 CU |
|------------|---------|---------|---------|---------|---------|---------|
| 32         | 139     | 133     | 183     | 137     | -       | 143     |
| 64         | 245     | 229     | 335     | 234     | -       | 257     |
| 128        | 433     | 555     | 596     | 432     | -       | 455     |
| 256        | 826     | 977     | 1127    | 815     | -       | 866     |
| 512        | 1607    | 1670    | 2194    | 1581    | -       | 1659    |
**Effective Bandwidth (GB/s)**

| Batch size | BF16 FD | BF16 FA | BF16 CU | INT4 FD | INT4 FA | INT4 CU |
|------------|---------|---------|---------|---------|---------|---------|
| 32         | 965     | 1012    | 736     | 262     | -       | 250     |
| 64         | 1097    | 1175    | 802     | 305     | -       | 278     |
| 128        | 1240    | 968     | 901     | 331     | -       | 314     |
| 256        | 1301    | 1100    | 954     | 351     | -       | 331     |
| 512        | 1338    | 1287    | 980     | 362     | -       | 345     |

On H100
**Time (us)**

| Batch size | BF16 FD | BF16 FA | BF16 CU | INT4 FD | INT4 FA | INT4 CU |
|------------|---------|---------|---------|---------|---------|---------|
| 32         | 91      | 90      | 114     | 70      | -       | 96      |
| 64         | 148     | 146     | 200     | 113     | -       | 162     |
| 128        | 271     | 298     | 361     | 205     | -       | 294     |
| 256        | 515     | 499     | 658     | 389     | -       | 558     |
| 512        | 1000    | 1011    | 1260    | 756     | -       | 1066    |
**Effective Bandwidth (GB/s)**

| Batch size | BF16 FD | BF16 FA | BF16 CU | INT4 FD | INT4 FA | INT4 CU |
|------------|---------|---------|---------|---------|---------|---------|
| 32         | 1481    | 1496    | 1178    | 511     | -       | 371     |
| 64         | 1815    | 1840    | 1345    | 631     | -       | 443     |
| 128        | 1982    | 1802    | 1487    | 699     | -       | 487     |
| 256        | 2087    | 2156    | 1634    | 736     | -       | 513     |
| 512        | 2150    | 2127    | 1706    | 757     | -       | 537     |
        - - -First, let’s discuss the BF16 GQA performance: CU ranks last in terms of performance among all implementations. FD and FA have comparable performance. When the batch size is less than or equal to 64, FA utilizes the split-K kernel and performs slightly better than FD. However, when the batch size is greater than 64, FD performs better. - -The same trend holds true for INT4 GQAs. However, we did not measure the performance of FA as it does not support INT4 KV cache. FD outperforms CU for all cases. - -When comparing the latencies of FD between BF16 and INT4 GQAs, we find that they are almost identical. This suggests that _INT4 GQA is highly inefficient_, which can be further confirmed by the significantly lower achievable bandwidth for INT4 GQA compared to BF16 GQA. The same trend is also true when looking at the performance of CU. - - -### CUDA with Tensor Cores INT4 GQA Implementation - -In this section, we briefly describe our baseline implementation which is CUDA with tensor cores INT4 GQA (CU). Each thread block processes only one KV head and a group of query heads from one input prompt. Therefore, each thread block performs mm(softmax(mm(Q, KT) / sqrt(D)), V); notice that `mm` is being performed not `bmm`. Moreover, since this is a split-K implementation, tokens in the KV cache are split among different thread blocks. Note that each thread block contains 4 warps (each warp contains 32 threads for NVIDIA A100 and H100 GPUs). Work in each thread block is split among warps. Within each warp, we use the [WMMA](https://bruce-lee-ly.medium.com/nvidia-tensor-core-introduction-to-wmma-api-programming-21bcfee4ec45) API to compute matrix multiplication on tensor cores. Figure 4 demonstrates the work partitioning in CU. - - -![Figure 4: CU work partitioning](/assets/images/int4-decoding/fg4.jpg){:style="width:100%"} - - -**Figure 4** CU work partitioning - - -## Optimizing CUDA with Tensor Cores Kernel of INT4 GQA - -In this note, we discuss the optimizations that we have applied to the CUDA with tensor cores implementation of INT4 GQA (CU). The ideal goal is to improve the INT4 GQA performance by 4 times based on the CI analysis in the previous section. Note that the query size is negligible compared to the KV cache size when the context length is long. - -In our analysis, we used the [NVIDIA Nsight Compute (NCU)](https://docs.nvidia.com/nsight-compute/NsightComputeCli/index.html) as the main profiler. Our general bottleneck elimination approach is to minimize the stall cycles. We applied 10 optimizations to INT4 GQA, three of which are specific for NVIDIA A100/H100 GPUs. These optimizations are well known CUDA optimization techniques which can be generalized to many applications. - -It is worth noting that the reason that we choose to optimize the CUDA implementation rather than the Flash-Decoding implementation (FD) (which is Triton based) is because with CUDA, we have a better control of how the low-level instructions are being generated. Many optimization techniques that we apply such as, operating on tensor core fragments directly (Optimizations 7-9), cannot be done through Triton since it does not expose low-level details to developers. However, these optimizations can be integrated into the compiler-based solution to make the optimizations available to broader operators, which is indeed a part of our future plan. 
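Before diving into the individual optimizations, it may help to keep in mind what the kernel has to undo for every group of KV-cache values. The following is only an eager-mode PyTorch illustration of row-wise (one group per row) asymmetric INT4 quantization and dequantization; the exact scheme, packing, and permuted layout used by the real kernel (see `dequantize_permuted_int4` below) differ, and the function names here are ours.

```python
import torch

def quantize_int4_rowwise(row: torch.Tensor):
    """Illustrative asymmetric INT4 quantization of one KV-cache row (num groups = 1)."""
    lo, hi = row.min(), row.max()
    scale = torch.clamp(hi - lo, min=1e-6) / 15.0          # 4-bit codes span 0..15
    q = torch.clamp(torch.round((row - lo) / scale), 0, 15).to(torch.uint8)
    packed = q[0::2] | (q[1::2] << 4)                       # two INT4 values per byte
    return packed, scale.float(), lo.float()

def dequantize_int4_rowwise(packed, scale, zero_point):
    """Unpack two INT4 values per byte and map them back to BF16."""
    low = (packed & 0x0F).to(torch.float32)
    high = (packed >> 4).to(torch.float32)
    q = torch.stack((low, high), dim=-1).flatten()          # restore original ordering
    return (q * scale + zero_point).to(torch.bfloat16)

k_row = torch.randn(128, dtype=torch.bfloat16)              # D = 128
packed, scale, zp = quantize_int4_rowwise(k_row.float())
k_row_hat = dequantize_int4_rowwise(packed, scale, zp)
print((k_row.float() - k_row_hat.float()).abs().max())      # quantization error
```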
- - -### Optimization 1: Unroll `K` Loads - -**Problem Analysis:** - -The NCU profile shows that during `K` loading, there are only 2 global loads followed by _memory stalls_ at `dequantize_permuted_int4`. The memory stalls are the long scoreboard stalls which indicates the waits for global memory access. This suggests that the kernel does not issue sufficient memory loads - -to hide the global load latency. The kernel issues data loading, and then waits to consume the data immediately causing the global load latency to be exposed. The stalls are shown in Figure 5. - - -![Figure 5: K loading before unrolling](/assets/images/int4-decoding/fg5.png){:style="width:100%"} - -**Figure 5** K loading before unrolling (the numbers that the arrows point to are stall cycles caused by global memory wait) - -**Solution:** - -In the baseline implementation, we use `uint32_t` to load 8 INT4 `K` values in a single load and we perform 2 `uint32_t` loads in each iteration, which is 16 INT4 K values. To allow for a better global load latency hiding, we issue 8 `uint32_t` loads instead of two before consuming the `K` values in `dequantize_permuted_int4`. This allows the compiler to unroll the loads as well as reorder the instructions to hide the global load latency better. Figure 6 shows the NCU profile of `K` loading after unrolling. Comparing Figure 5 and Figure 6, we effectively reduce the stall cycles by unrolling the `K` loads. - - -![Figure 6: K loading after unrolling](/assets/images/int4-decoding/fg6.png){:style="width:100%"} - -**Figure 6** K loading after unrolling (the numbers that the arrows point to are stall cycles caused by global memory wait) - -**Results:** - -**Table 3** Performance of Optimization 1 for INT4 GQA (row-wise quantization) - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

| Batch size | FD time (us) | CU baseline time (us) | CU Opt 1 time (us) | FD bandwidth (GB/s) | CU baseline bandwidth (GB/s) | CU Opt 1 bandwidth (GB/s) | Speed up vs FD | Speed up vs CU baseline |
| --- | --- | --- | --- | --- | --- | --- | --- | --- |
| 32 | 137 | 143 | 134 | 262 | 250 | 267 | 1.02 | 1.07 |
| 64 | 234 | 257 | 237 | 305 | 278 | 302 | 0.99 | 1.09 |
| 128 | 432 | 455 | 422 | 331 | 314 | 339 | 1.02 | 1.08 |
| 256 | 815 | 866 | 806 | 351 | 331 | 355 | 1.01 | 1.07 |
| 512 | 1581 | 1659 | 1550 | 362 | 345 | 369 | 1.02 | 1.07 |

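As a side note on the data layout behind these loads, each `uint32_t` word holds 8 INT4 values. The following sketch shows one way such a word can be unpacked; the nibble ordering and sign handling here are our assumptions for illustration and are not necessarily what the kernel does:

```py
import torch

# Unpack 8 INT4 values from one 32-bit word (a stand-in for a uint32_t K load).
def unpack_8_int4(words: torch.Tensor) -> torch.Tensor:
    shifts = torch.arange(0, 32, 4, dtype=torch.int64)
    nibbles = (words.unsqueeze(-1) >> shifts) & 0xF   # [..., 8] values in [0, 15]
    return nibbles.to(torch.float32) - 8.0            # re-center; per-row scaling applied separately

words = torch.randint(0, 2**31 - 1, (2,), dtype=torch.int64)  # two packed words
print(unpack_8_int4(words).shape)                             # torch.Size([2, 8])
```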
        - - - -### Optimization 2: Improve `P` Type Casting (FP32->BF16) - -**Problem Analysis:** - -Since the product of softmax(bmm(Q, KT) / sqrt(D)) is FP32 (denoted as `P` in Figure 3), the kernel has to convert `P` from FP32 to BF16 before feeding it to the next `bmm` computation. The kernel performs the FP32 to BF16 conversion of `P` by copying the FP32 data from one location in shared memory to another location in shared memory. This causes stalls during the shared memory access (shown in Figure 7) which might be caused by (1) the shared memory indirection; and (2) the shared memory bank conflict since each thread accesses an 16-bit element (because of this, two threads can access the same memory bank simultaneously). - - -![Figure 7: P type casting before Optimization 2](/assets/images/int4-decoding/fg7.png){:style="width:100%"} - - -**Figure 7** `P` type casting before Optimization 2 (the number that the arrow points to is stall cycles caused by shared memory wait) - -**Solution:** - -We use all threads in the thread block to do in-place type conversion. Each thread operates on two consecutive elements in order to avoid the shared memory bank conflict when storing BF16. All threads work on the same head (`h`) at the same time to guarantee correctness of the conversion. The in-place conversion steps are as follows: - - - -1. Each thread loads 2 FP32 token elements from the same head from the shared memory into registers -2. Call `__syncthreads()` to make sure that every thread finishes reading the data -3. Each thread converts its data to 2 BF16 token elements and then stores the results to the same shared memory - -Some optimizations that we apply to the implementation: - - - -* Use vector types (especially `nv_bfloat2`) -* Unroll data loading/storing, i.e., performing multiple loads before calling `__syncthreads()` and performing multiple stores after `__syncthreads()` - -After this optimization, long stalls are not observed during `P` type casting as shown in Figure 8. - -![Figure 8: P type casting after Optimization 2](/assets/images/int4-decoding/fg8.png){:style="width:100%"} - -**Figure 8** `P` type casting after Optimization 2 (the numbers that the arrow points to are stall cycles caused by shared memory wait) - -**Culprits:** - -Since we unroll data loading/storing by using registers as an intermediate storage, the number of registers per thread increases resulting in reduced occupancy. - -**Results:** - -**Table 4** Performance of Optimization 2 for INT4 GQA (row-wise quantization) - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

| Batch size | FD time (us) | CU baseline time (us) | CU Opt 2 time (us) | FD bandwidth (GB/s) | CU baseline bandwidth (GB/s) | CU Opt 2 bandwidth (GB/s) | Speed up vs FD | Speed up vs CU baseline |
| --- | --- | --- | --- | --- | --- | --- | --- | --- |
| 32 | 137 | 143 | 126 | 262 | 250 | 285 | 1.09 | 1.14 |
| 64 | 234 | 257 | 221 | 305 | 278 | 324 | 1.06 | 1.16 |
| 128 | 432 | 455 | 395 | 331 | 314 | 362 | 1.09 | 1.15 |
| 256 | 815 | 866 | 749 | 351 | 331 | 382 | 1.09 | 1.16 |
| 512 | 1581 | 1659 | 1435 | 362 | 345 | 399 | 1.10 | 1.16 |

        - - - -### Optimization 3: Remove Local Memory Usage for max QKT computation - -**Problem Analysis:** - -During the softmax computation, the kernel has to compute max QKT for each head. It uses a temporary "thread-local" storage for storing per-thread max QKT results (one float value for each head). Depending on the compiler, the thread-local storage can be allocated on registers (on chip) or the local memory (off chip == global memory). Unfortunately, in the baseline, the thread-local storage resides in the local memory which is much slower than the registers (shown in Figure 9). We suspect that this is because the compiler cannot determine the indices of thread-local storage at compile time (since the number of heads (`H`) in the kernel is a runtime variable). Accessing local memory as if accessing registers can hurt the performance of the kernel. - - -![Figure 9: Local memory access during max QKT computation](/assets/images/int4-decoding/fg9.png){:style="width:100%"} - -**Figure 9** Local memory access during max QKT computation - -**Solution:** - -We realize that we do not need `H` (number of heads) floats as temporary storage per thread since each thread can compute max QKT for only one head instead of all the heads. Thus, we only need one float per thread, which can be easily stored in a register. To accumulate the max results among warps, we use shared memory. This optimization eliminates the local memory usage during max QKT computation. - -**Results:** - -**Table 5** Performance of Optimization 3 for INT4 GQA (row-wise quantization) - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

| Batch size | FD time (us) | CU baseline time (us) | CU Opt 3 time (us) | FD bandwidth (GB/s) | CU baseline bandwidth (GB/s) | CU Opt 3 bandwidth (GB/s) | Speed up vs FD | Speed up vs CU baseline |
| --- | --- | --- | --- | --- | --- | --- | --- | --- |
| 32 | 137 | 143 | 119 | 262 | 250 | 300 | 1.14 | 1.20 |
| 64 | 234 | 257 | 206 | 305 | 278 | 348 | 1.14 | 1.25 |
| 128 | 432 | 455 | 368 | 331 | 314 | 389 | 1.17 | 1.24 |
| 256 | 815 | 866 | 696 | 351 | 331 | 411 | 1.17 | 1.24 |
| 512 | 1581 | 1659 | 1338 | 362 | 345 | 428 | 1.18 | 1.24 |

        - - - -### Optimization 4: Remove local memory usage for row sum - -**Problem Analysis:** - -Similar to[ ](https://www.internalfb.com/diff/D50183201)Optimization 3, the local memory usage problem is also observed during the row sum computation in the `softmax` computation. Since local memory is off chip, accessing it as if accessing registers can hurt the performance of the kernel. - -**Solution**: - -We apply the same solution as the max QKT computation for the row sum computation. That is to have each thread compute a row sum of only one head, which requires only one float per thread. This eliminates the need for local memory. - -**Results:** - -**Table 6** Performance of Optimization 4 for INT4 GQA (row-wise quantization) - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

| Batch size | FD time (us) | CU baseline time (us) | CU Opt 4 time (us) | FD bandwidth (GB/s) | CU baseline bandwidth (GB/s) | CU Opt 4 bandwidth (GB/s) | Speed up vs FD | Speed up vs CU baseline |
| --- | --- | --- | --- | --- | --- | --- | --- | --- |
| 32 | 137 | 143 | 118 | 262 | 250 | 302 | 1.15 | 1.21 |
| 64 | 234 | 257 | 204 | 305 | 278 | 351 | 1.15 | 1.26 |
| 128 | 432 | 455 | 364 | 331 | 314 | 393 | 1.19 | 1.25 |
| 256 | 815 | 866 | 688 | 351 | 331 | 416 | 1.18 | 1.26 |
| 512 | 1581 | 1659 | 1328 | 362 | 345 | 431 | 1.19 | 1.25 |

        - - - -### Optimization 5: Add prefetch for `V` load - -**Problem Analysis:** - -The same issue as `K` loading is observed when loading `V`. That is, the kernel issues data loading, and then waits to consume the data immediately causing the global load latency to be exposed. However, when using the unrolling technique mentioned above, the compiler allocates the temporary buffer on local memory instead of registers causing a large slow down. - -**Solution:** - -We adopt the data prefetching technique for `V` loading. We load the next iteration `V` values immediately after the current iteration values are consumed. This allows the data loading to be overlapped with the `PK` computation resulting in better kernel performance. - -**Results:** - -**Table 7** Performance of Optimization 5 for INT4 GQA (row-wise quantization) - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

| Batch size | FD time (us) | CU baseline time (us) | CU Opt 5 time (us) | FD bandwidth (GB/s) | CU baseline bandwidth (GB/s) | CU Opt 5 bandwidth (GB/s) | Speed up vs FD | Speed up vs CU baseline |
| --- | --- | --- | --- | --- | --- | --- | --- | --- |
| 32 | 137 | 143 | 109 | 262 | 250 | 327 | 1.25 | 1.31 |
| 64 | 234 | 257 | 194 | 305 | 278 | 370 | 1.21 | 1.33 |
| 128 | 432 | 455 | 345 | 331 | 314 | 414 | 1.25 | 1.32 |
| 256 | 815 | 866 | 649 | 351 | 331 | 441 | 1.26 | 1.33 |
| 512 | 1581 | 1659 | 1244 | 362 | 345 | 460 | 1.27 | 1.33 |

        - - - -### Optimization 6: Add Group-Wise INT4 (Groups = 4) with Vector Load - -**Problem Analysis:** - -Prior to this optimization, CU only supported row-wise INT4 quantization. That is, every column in each row shares the same scales. The scales of each row are stored in the first 4 bytes of each row as shown in Figure 10. In the kernel, each thread loads only one row at a time. Since each row contains 68 bytes (4 bytes for scales and 64 bytes for data), it cannot guarantee that every row aligns with a size of any vector type. Thus, vector loads cannot be used for loading the KV cache. - - -![Figure 10: The layout of each row of INT4 KV cache with row-wise quantization](/assets/images/int4-decoding/fg10.jpg){:style="width:100%;display:block;max-width:500px;margin-left:auto;margin-right:auto;"} - - -**Figure 10** The layout of each row of INT4 KV cache with row-wise quantization - -**Solution:** - -We have implemented support for group-wise INT4 quantization with num groups = 4. In this case, columns in each row in the KV cache tensor are divided into 4 equal groups. Columns within the same group share the same scales for quantization/dequantization. The data layout for INT4 KV cache is shown in Figure 11. The scales for all groups are serialized and stored at the beginning of each row. The INT4 data is also serialized and laid out next to the scales. - -Because the number of bytes in each row now becomes 80 bytes, we can use a vector type, i.e., `uint2` in our case, to load data. (We **do not** use `uint4` since each thread loads only 16 INT4s at a time due to the tensor core fragment size.) Vector load is generally better than scalar load since it does not cause extra byte loads. - - -![Figure 11: The layout of each row of INT4 KV cache with row-wise quantization](/assets/images/int4-decoding/fg11.jpg){:style="width:100%;display:block;max-width:500px;margin-left:auto;margin-right:auto;"} - -**Figure 11** The layout of each row of INT4 KV cache with row-wise quantization - -**Results:** - -**Table 8** Performance of Optimization 6 for INT4 GQA (row-wise quantization) - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

| Batch size | FD time (us) | CU baseline time (us) | CU Opt 6 time (us) | FD bandwidth (GB/s) | CU baseline bandwidth (GB/s) | CU Opt 6 bandwidth (GB/s) | Speed up vs FD | Speed up vs CU baseline |
| --- | --- | --- | --- | --- | --- | --- | --- | --- |
| 32 | 137 | 143 | 111 | 262 | 250 | 322 | 1.23 | 1.29 |
| 64 | 234 | 257 | 192 | 305 | 278 | 372 | 1.22 | 1.34 |
| 128 | 432 | 455 | 346 | 331 | 314 | 414 | 1.25 | 1.32 |
| 256 | 815 | 866 | 642 | 351 | 331 | 446 | 1.27 | 1.35 |
| 512 | 1581 | 1659 | 1244 | 362 | 345 | 460 | 1.27 | 1.33 |

**Table 9** Performance of Optimization 6 for INT4 GQA (group-wise quantization with num groups = 4)

| Batch size | FD time (us) | CUDA_WMMA Opt 6 time (us) | FD bandwidth (GB/s) | CUDA_WMMA Opt 6 bandwidth (GB/s) | Speed up vs FD |
| --- | --- | --- | --- | --- | --- |
| 32 | 129 | 116 | 325 | 364 | 1.31 |
| 64 | 219 | 195 | 385 | 431 | 1.36 |
| 128 | 392 | 347 | 429 | 484 | 1.39 |
| 256 | 719 | 638 | 468 | 527 | 1.41 |
| 512 | 1375 | 1225 | 489 | 550 | 1.43 |

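To make the group-wise layout concrete, the sketch below quantizes one row of the KV cache (head dimension 128, i.e. 64 bytes of packed INT4 data) into 4 groups with a per-group FP16 scale and zero point. Encoding the 4 scale bytes per group as one FP16 scale plus one FP16 zero point is our assumption for illustration; it is consistent with the 80-byte row size mentioned above, but the kernel's exact encoding may differ:

```py
import torch

def quantize_row_groupwise(row: torch.Tensor, num_groups: int = 4):
    """Group-wise INT4 quantization of one KV cache row: scales first, packed data after."""
    groups = row.float().reshape(num_groups, -1)                 # [4, 32] for D = 128
    g_min = groups.min(dim=1, keepdim=True).values
    g_max = groups.max(dim=1, keepdim=True).values
    scale = (g_max - g_min).clamp(min=1e-8) / 15.0               # 16 INT4 levels per group
    q = ((groups - g_min) / scale).round().clamp(0, 15).to(torch.uint8)
    packed = (q[:, 0::2] | (q[:, 1::2] << 4)).flatten()          # 2 INT4 values per byte -> 64 bytes
    # one FP16 scale + one FP16 zero point per group (4 bytes/group), then the packed data
    return scale.half(), g_min.half(), packed

row = torch.randn(128, dtype=torch.bfloat16)                     # one row, head dim 128
scale, zero_point, packed = quantize_row_groupwise(row)
print(scale.shape, zero_point.shape, packed.shape)               # [4, 1], [4, 1], [64]
```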
        - - - -### Optimization 7: Compute max QKT From WMMA Fragment Directly (A100/H100 specific) - -**Problem Analysis:** - -We observe large stalls due to shared memory accessing during the max QKT computation (showing as large short scoreboard stalls) as shown in Figure 12. - - -![Figure 12: Stalls due to shared memory access during max QKT computation](/assets/images/int4-decoding/fg12.png){:style="width:100%"} - -**Figure 12** Stalls due to shared memory access during max QKT computation (the number that the arrow points to is stall cycles caused by shared memory wait) - -**Solution:** - -We bypass shared memory when computing max QKT by computing it from the WMMA fragment (i.e., the tensor core fragment) directly. The layout of the WMMA fragment is specific to the GPU architecture. In this optimization, we only enabled this optimization for the NVIDIA A100/H100 GPUs. Other GPUs will still use shared memory for the max QKT computation. By bypassing shared memory, we effectively eliminate the stalls caused by shared memory access. The tensor core layout of the `C` fragment which is used for storing the QKT results is shown in Figure 13. - - -![Figure 13: C fragment (QKT storage) tensor core layout on A100/H100](/assets/images/int4-decoding/fg13.jpg){:style="width:100%"} - -**Figure 13** `C` fragment (QKT storage) tensor core layout on A100/H100 - -**Table 10** Performance of Optimization 7 for INT4 GQA (row-wise quantization) - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

| Batch size | FD time (us) | CU baseline time (us) | CU Opt 7 time (us) | FD bandwidth (GB/s) | CU baseline bandwidth (GB/s) | CU Opt 7 bandwidth (GB/s) | Speed up vs FD | Speed up vs CU baseline |
| --- | --- | --- | --- | --- | --- | --- | --- | --- |
| 32 | 137 | 143 | 107 | 262 | 250 | 333 | 1.27 | 1.33 |
| 64 | 234 | 257 | 183 | 305 | 278 | 391 | 1.28 | 1.40 |
| 128 | 432 | 455 | 333 | 331 | 314 | 430 | 1.30 | 1.37 |
| 256 | 815 | 866 | 620 | 351 | 331 | 461 | 1.31 | 1.40 |
| 512 | 1581 | 1659 | 1206 | 362 | 345 | 475 | 1.31 | 1.38 |

**Table 11** Performance of Optimization 7 for INT4 GQA (group-wise quantization with num groups = 4)

| Batch size | FD time (us) | CUDA_WMMA Opt 6 time (us) | CUDA_WMMA Opt 7 time (us) | FD bandwidth (GB/s) | CUDA_WMMA Opt 6 bandwidth (GB/s) | CUDA_WMMA Opt 7 bandwidth (GB/s) | Speed up vs FD | Speed up vs CUDA_WMMA Opt 6 |
| --- | --- | --- | --- | --- | --- | --- | --- | --- |
| 32 | 129 | 116 | 111 | 325 | 364 | 380 | 1.17 | 1.04 |
| 64 | 219 | 195 | 187 | 385 | 431 | 449 | 1.17 | 1.04 |
| 128 | 392 | 347 | 333 | 429 | 484 | 506 | 1.18 | 1.04 |
| 256 | 719 | 638 | 615 | 468 | 527 | 547 | 1.17 | 1.04 |
| 512 | 1375 | 1225 | 1184 | 489 | 550 | 569 | 1.16 | 1.03 |

        - - - -### Optimization 8: Write FP32->BF16 Results to `P` Fragment Directly (A100/H100 specific) - -**Problem Analysis:** - -During the FP32-BF16 conversion for the `P` fragment, the kernel loads the FP32 data from shared memory, does the conversion and then stores the BF16 data back to shared memory. Moreover, the conversion requires many thread block synchronizations (`__syncthreads()`). - -**Solution:** - -Due to the data partitioning design of the kernel, each warp performs only one pass through the `P` fragment. Thus, we do not have to write the conversion results back to the shared memory for future usage. To avoid writing the BF16 data to the shared memory and thread block synchronizations, we have each warp load the FP32 data of the `P` WMMA fragment from the shared memory, do the conversion and then write the BF16 data directly to the `P` fragment. - -Note that this optimization is applied to only the NVIDIA A100 and H100 GPUs because the WMMA fragment layout is architecture dependent. For non-A100/H100 GPUs, the kernel will fallback to the original path. - -The `P` fragment tensor core layout is shown in Figure 14. Note that this layout is specific to the NVIDIA A100/H100 GPU. - -![Figure 14: P fragment tensor core layout on A100/H100](/assets/images/int4-decoding/fg14.jpg){:style="width:100%"} - -**Figure 14** `P` fragment tensor core layout on A100/H100 - -**Table 12** Performance of Optimization 8 for INT4 GQA (row-wise quantization) - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

| Batch size | FD time (us) | CU baseline time (us) | CU Opt 8 time (us) | FD bandwidth (GB/s) | CU baseline bandwidth (GB/s) | CU Opt 8 bandwidth (GB/s) | Speed up vs FD | Speed up vs CU baseline |
| --- | --- | --- | --- | --- | --- | --- | --- | --- |
| 32 | 137 | 143 | 101 | 262 | 250 | 353 | 1.35 | 1.41 |
| 64 | 234 | 257 | 174 | 305 | 278 | 410 | 1.34 | 1.47 |
| 128 | 432 | 455 | 317 | 331 | 314 | 451 | 1.36 | 1.43 |
| 256 | 815 | 866 | 590 | 351 | 331 | 485 | 1.38 | 1.47 |
| 512 | 1581 | 1659 | 1143 | 362 | 345 | 501 | 1.38 | 1.45 |

**Table 13** Performance of Optimization 8 for INT4 GQA (group-wise quantization with num groups = 4)

| Batch size | FD time (us) | CUDA_WMMA Opt 6 time (us) | CUDA_WMMA Opt 8 time (us) | FD bandwidth (GB/s) | CUDA_WMMA Opt 6 bandwidth (GB/s) | CUDA_WMMA Opt 8 bandwidth (GB/s) | Speed up vs FD | Speed up vs CUDA_WMMA Opt 6 |
| --- | --- | --- | --- | --- | --- | --- | --- | --- |
| 32 | 129 | 116 | 106 | 325 | 364 | 396 | 1.22 | 1.09 |
| 64 | 219 | 195 | 180 | 385 | 431 | 467 | 1.21 | 1.08 |
| 128 | 392 | 347 | 319 | 429 | 484 | 528 | 1.23 | 1.09 |
| 256 | 719 | 638 | 596 | 468 | 527 | 565 | 1.21 | 1.07 |
| 512 | 1375 | 1225 | 1138 | 489 | 550 | 591 | 1.21 | 1.08 |

        - - - -### Optimization 9: Swizzle P Shared Memory Layouts (A100/H100 specific) - -**Problem Analysis:** - -We observe large shared memory bank conflicts during `P` loading. The amount of bank conflict depends on the memory access stride. For instance, for split-Ks = 32 and max seq length = 8192, we observed that only 4 out of 32 banks are being accessed in parallel (memory access stride = 256). From Figure 14, when all threads access element 0, threads that have the same `threadIdx.x % 4` access the same bank. - - -![Figure 15: P fragment in shared memory before swizzling](/assets/images/int4-decoding/fg15.jpg){:style="width:100%"} - - -**Figure 15** P fragment in shared memory before swizzling - -**Solution:** - -We shuffle the layout of `P` load/store in the shared memory in such a way that avoids bank conflicts. In other words, we store the QKT results (`C` fragment) and load them (`P` fragment) using the swizzled layout. Moreover, instead of using the original memory access stride which is dependent on the number of tokens per thread block, we use the fragment's column size as the stride which is constant. Thus, the load and store of the `P` fragment is always contiguous. - -The new layouts for the C and P fragments are shown in Figure 16. With the new layout, it is guaranteed that 16 banks are being accessed in parallel as shown in Figure 17. - - -![Figure 16: The swizzled layouts of C and P fragments](/assets/images/int4-decoding/fg16.jpg){:style="width:100%"} - -**Figure 16** The swizzled layouts of C and P fragments - - - - -![Figure 17: P fragment in shared memory after swizzling](/assets/images/int4-decoding/fg17.jpg){:style="width:100%"} - - -**Figure 17** P fragment in shared memory after swizzling - -**Table 14** Performance of Optimization 9 for INT4 GQA (row-wise quantization) - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

| Batch size | FD time (us) | CU baseline time (us) | CU Opt 9 time (us) | FD bandwidth (GB/s) | CU baseline bandwidth (GB/s) | CU Opt 9 bandwidth (GB/s) | Speed up vs FD | Speed up vs CU baseline |
| --- | --- | --- | --- | --- | --- | --- | --- | --- |
| 32 | 137 | 143 | 98 | 262 | 250 | 365 | 1.39 | 1.46 |
| 64 | 234 | 257 | 167 | 305 | 278 | 429 | 1.41 | 1.54 |
| 128 | 432 | 455 | 299 | 331 | 314 | 479 | 1.45 | 1.52 |
| 256 | 815 | 866 | 549 | 351 | 331 | 521 | 1.48 | 1.58 |
| 512 | 1581 | 1659 | 1060 | 362 | 345 | 540 | 1.49 | 1.56 |

**Table 15** Performance of Optimization 9 for INT4 GQA (group-wise quantization with num groups = 4)

| Batch size | FD time (us) | CUDA_WMMA Opt 6 time (us) | CUDA_WMMA Opt 9 time (us) | FD bandwidth (GB/s) | CUDA_WMMA Opt 6 bandwidth (GB/s) | CUDA_WMMA Opt 9 bandwidth (GB/s) | Speed up vs FD | Speed up vs CUDA_WMMA Opt 6 |
| --- | --- | --- | --- | --- | --- | --- | --- | --- |
| 32 | 129 | 116 | 105 | 325 | 364 | 400 | 1.23 | 1.10 |
| 64 | 219 | 195 | 174 | 385 | 431 | 484 | 1.26 | 1.12 |
| 128 | 392 | 347 | 302 | 429 | 484 | 558 | 1.30 | 1.15 |
| 256 | 719 | 638 | 560 | 468 | 527 | 601 | 1.28 | 1.14 |
| 512 | 1375 | 1225 | 1065 | 489 | 550 | 632 | 1.29 | 1.15 |

        - - - -### Optimization 10: Pad Shared Memory for INT4 Dequantization - -**Problem Analysis:** - -Once the kernel reads the INT4 `K` or `V` cache from global memory, it performs dequantization and stores the results (BF16) in the shared memory. Then, the BF16 data is loaded to the WMMA fragment from shared memory (via the WMMA interface). We observed a large number of bank conflicts for both `K` and `V` accesses. For instance, for `K` stores, only 4 out of 32 banks are being accessed in parallel. For `K` loads, 16 banks are being accessed in parallel. The same also occurs for `V` stores and loads. See the figures in the solution section. - -**Solution:** - -We pad the shared memory to reduce the bank conflict. Specifically, we pad each row by 2. That is, the row stride of `K` becomes `F_K` + 2 and the row stride of V becomes `F_N` + 2 (`F_K` and `F_N` are the fixed widths of the `K` and `V` WMMA fragments, respectively). With this optimization, we are able to reduce the bank conflict by 1.8x as shown in Figure 18. - - -![Figure 18: Bank conflicts before and after Optimization 10](/assets/images/int4-decoding/fg18.png){:style="width:100%"} - - -**Figure 18** Bank conflicts before and after Optimization 10 - -After Optimization 10, for `K` stores, 32 banks are being accessed in parallel (shown in Figure 19), while for `K` loads, 29 banks are accessed in parallel (shown in Figure 20). - -![Figure 19: K fragment store shared memory layout without and with padding](/assets/images/int4-decoding/fg19.jpg){:style="width:100%"} - - -**Figure 19** K fragment store shared memory layout without and with padding - -![Figure 20: K fragment load shared memory layout without and with padding](/assets/images/int4-decoding/fg20.jpg){:style="width:100%"} - - -**Figure 20** K fragment load shared memory layout without and with padding - -**Table 16** Performance of Optimization 10 for INT4 GQA (row-wise quantization) - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

| Batch size | FD time (us) | CU baseline time (us) | CU Opt 10 time (us) | FD bandwidth (GB/s) | CU baseline bandwidth (GB/s) | CU Opt 10 bandwidth (GB/s) | Speed up vs FD | Speed up vs CU baseline |
| --- | --- | --- | --- | --- | --- | --- | --- | --- |
| 32 | 137 | 143 | 94 | 262 | 250 | 380 | 1.45 | 1.52 |
| 64 | 234 | 257 | 151 | 305 | 278 | 475 | 1.55 | 1.71 |
| 128 | 432 | 455 | 266 | 331 | 314 | 538 | 1.63 | 1.71 |
| 256 | 815 | 866 | 489 | 351 | 331 | 586 | 1.67 | 1.77 |
| 512 | 1581 | 1659 | 930 | 362 | 345 | 616 | 1.70 | 1.79 |

**Table 17** Performance of Optimization 10 for INT4 GQA (group-wise quantization with num groups = 4)

| Batch size | FD time (us) | CUDA_WMMA Opt 6 time (us) | CUDA_WMMA Opt 10 time (us) | FD bandwidth (GB/s) | CUDA_WMMA Opt 6 bandwidth (GB/s) | CUDA_WMMA Opt 10 bandwidth (GB/s) | Speed up vs FD | Speed up vs CUDA_WMMA Opt 6 |
| --- | --- | --- | --- | --- | --- | --- | --- | --- |
| 32 | 129 | 116 | 99 | 325 | 364 | 425 | 1.31 | 1.17 |
| 64 | 219 | 195 | 161 | 385 | 431 | 523 | 1.36 | 1.21 |
| 128 | 392 | 347 | 282 | 429 | 484 | 598 | 1.39 | 1.23 |
| 256 | 719 | 638 | 509 | 468 | 527 | 662 | 1.41 | 1.25 |
| 512 | 1375 | 1225 | 965 | 489 | 550 | 698 | 1.43 | 1.27 |

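The effect of the padding can be sanity-checked with simple bank arithmetic (32 banks, 4 bytes per bank, 2-byte BF16 elements). Taking a fragment width of `F_K` = 16 elements as an illustrative assumption (the post does not spell out the exact fragment sizes), 32 threads reading down a column touch only 4 distinct banks with the unpadded row stride, and all 32 banks with the `F_K + 2` stride, matching the store-side numbers quoted above:

```py
# Shared memory model: 32 banks, 4 bytes per bank; elements are 2-byte BF16 values.
ELEM_BYTES, BANK_BYTES, NUM_BANKS = 2, 4, 32

def banks_hit(row_stride_elems: int, col: int = 0, num_threads: int = 32) -> int:
    """Number of distinct banks touched when thread r accesses element (r, col)."""
    banks = {
        ((r * row_stride_elems + col) * ELEM_BYTES // BANK_BYTES) % NUM_BANKS
        for r in range(num_threads)
    }
    return len(banks)

F_K = 16                      # illustrative fragment width (assumption)
print(banks_hit(F_K))         # 4  -> heavy bank conflicts without padding
print(banks_hit(F_K + 2))     # 32 -> conflict-free with the +2 element padding
```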
        - - - -## Performance Evaluation - - -### Microbenchmark results - -We also evaluated BF16 GQA performance using our optimized kernel (as shown in Table 19). CU still performs generally worse than FD and FA for BF16. This is expected since our optimizations are INT4 focused. - -While INT4 GQA is still not as efficient as BF16 GQA (see the achieved bandwidths), it is important to note that when comparing FD BF16 GQA performance against CU INT4 GQA performance, **we can see that the latency of INT4 is smaller than that of BF16**. - -**Table 19** Performance of BF16 GQA and INT GQA after CU optimizations - -**On A100** - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

_Time (us)_

| Batch size | FD (BF16) | FA (BF16) | CU before (BF16) | CU after (BF16) | FD (INT4) | FA (INT4) | CU before (INT4) | CU after (INT4) |
| --- | --- | --- | --- | --- | --- | --- | --- | --- |
| 32 | 139 | 133 | 183 | 163 | 137 | n/a | 143 | 94 |
| 64 | 245 | 229 | 335 | 276 | 234 | n/a | 257 | 151 |
| 128 | 433 | 555 | 596 | 517 | 432 | n/a | 455 | 266 |
| 256 | 826 | 977 | 1127 | 999 | 815 | n/a | 866 | 489 |
| 512 | 1607 | 1670 | 2194 | 1879 | 1581 | n/a | 1659 | 930 |

_Effective Bandwidth (GB/s)_

| Batch size | FD (BF16) | FA (BF16) | CU before (BF16) | CU after (BF16) | FD (INT4) | FA (INT4) | CU before (INT4) | CU after (INT4) |
| --- | --- | --- | --- | --- | --- | --- | --- | --- |
| 32 | 965 | 1012 | 736 | 824 | 262 | n/a | 250 | 380 |
| 64 | 1097 | 1175 | 802 | 972 | 305 | n/a | 278 | 475 |
| 128 | 1240 | 968 | 901 | 1039 | 331 | n/a | 314 | 538 |
| 256 | 1301 | 1100 | 954 | 1075 | 351 | n/a | 331 | 586 |
| 512 | 1338 | 1287 | 980 | 1144 | 362 | n/a | 345 | 616 |

**On H100**

_Time (us)_

| Batch size | FD (BF16) | FA (BF16) | CU before (BF16) | CU after (BF16) | FD (INT4) | FA (INT4) | CU before (INT4) | CU after (INT4) |
| --- | --- | --- | --- | --- | --- | --- | --- | --- |
| 32 | 91 | 90 | 114 | 100 | 70 | n/a | 96 | 64 |
| 64 | 148 | 146 | 200 | 183 | 113 | n/a | 162 | 101 |
| 128 | 271 | 298 | 361 | 308 | 205 | n/a | 294 | 170 |
| 256 | 515 | 499 | 658 | 556 | 389 | n/a | 558 | 306 |
| 512 | 1000 | 1011 | 1260 | 1066 | 756 | n/a | 1066 | 575 |

_Effective Bandwidth (GB/s)_

| Batch size | FD (BF16) | FA (BF16) | CU before (BF16) | CU after (BF16) | FD (INT4) | FA (INT4) | CU before (INT4) | CU after (INT4) |
| --- | --- | --- | --- | --- | --- | --- | --- | --- |
| 32 | 1481 | 1496 | 1178 | 1341 | 511 | n/a | 371 | 560 |
| 64 | 1815 | 1840 | 1345 | 1470 | 631 | n/a | 443 | 710 |
| 128 | 1982 | 1802 | 1487 | 1743 | 699 | n/a | 487 | 844 |
| 256 | 2087 | 2156 | 1634 | 1934 | 736 | n/a | 513 | 935 |
| 512 | 2150 | 2127 | 1706 | 2015 | 757 | n/a | 537 | 996 |

        - - - -### E2E results - -We evaluated our optimized INT4 GQA kernel in Llama 2 70B on 8 H100 GPUs. We ran the model end-to-end, but only reported the decode latency. We use FP8 FFN (feed forward network) to emphasize the attention performance in the decoding phase. We vary the batch size from 1 to 256 and the context length from 2,048 (2K) to 16,384 (16K). The E2E performance results are shown in the figure below. - -![Figure 21: Meta Llama 2 decode latency (ms) comparison](/assets/images/int4-decoding/fg21.png){:style="width:100%"} - - -**Figure 21** Meta Llama 2 decode latency (ms) comparison (BF16 GQA runs out of memory in large batch size configurations) - - -## Code - -If you are interested, please checkout our code [here](https://github.com/pytorch/FBGEMM/tree/main/fbgemm_gpu/experimental/gen_ai). If you have any questions, please feel free to open an issue on GitHub, and we will be happy to help. Your contributions are welcome! \ No newline at end of file diff --git a/_posts/2024-06-11-new-executive-director.md b/_posts/2024-06-11-new-executive-director.md deleted file mode 100644 index c37fb79c2de6..000000000000 --- a/_posts/2024-06-11-new-executive-director.md +++ /dev/null @@ -1,17 +0,0 @@ ---- -layout: blog_detail -title: "PyTorch Foundation Welcomes New Executive Director" ---- - -![Matt White](/assets/images/matt-white.jpg){:style="max-width:220px;float:right;margin-left: 20px;"} -The PyTorch Foundation is excited to welcome Matt White, our new executive director. The PyTorch Foundation formed in 2022 with the goal to drive adoption of AI tooling by fostering and sustaining an ecosystem of open source, vendor-neutral projects with PyTorch. Over the past 2 years, we’ve seen excellent growth across the project – with both contributor and member growth. - -“I am honored to be a part of the PyTorch Foundation, working with such a passionate and skilled community," said Matt White. “I am looking forward to working with our contributors and members to advance the PyTorch ecosystem through research, cutting edge technologies and open source best practices.” - -Matt is a career technologist, researcher and innovator and has over 25 years of experience in AI, data, autonomous systems and simulations. He is the Co-founder and Chair of the Open Metaverse Foundation, a part of the Linux Foundation. Previously, Matt was the Director of the Generative AI Commons at the Linux Foundation, leading the advancement of open science and open-source artificial intelligence projects. He is also the GM of AI at the Linux Foundation. 
- -## Learn more about the PyTorch Foundation: - -* Join as a [member](https://pytorch.org/join) -* Read our latest [announcements](https://pytorch.org/blog/) -* Access technical resources on [GitHub](https://github.com/pytorch/pytorch) \ No newline at end of file diff --git a/_posts/2024-06-12-reducing-checkpointing-times.md b/_posts/2024-06-12-reducing-checkpointing-times.md deleted file mode 100644 index 84fc4e9665e3..000000000000 --- a/_posts/2024-06-12-reducing-checkpointing-times.md +++ /dev/null @@ -1,84 +0,0 @@ ---- -layout: blog_detail -title: "Reducing Model Checkpointing Times by Over 10x with PyTorch Distributed Asynchronous Checkpointing" -author: "Meta: Lucas Pasqualin, Less Wright, Iris Zhang (PyTorch), Chien-Chin Huang; IBM Research: Swaminathan Sundararaman, Saransh Gupta, Raghu Ganti" ---- - -**Summary:** With PyTorch distributed’s new asynchronous checkpointing feature, developed with feedback from IBM, we show how IBM Research Team is able to implement and reduce effective checkpointing time by a factor of 10-20x. Example: 7B model ‘down time’ for a checkpoint goes from an average of 148.8 seconds to 6.3 seconds, or 23.62x faster. - -This directly translates into either more net training progress for every given 24 hour period while continuing to robustly checkpoint or more frequent checkpoints to shorten recovery window/time. - -In this note, we showcase the usage code and architecture that makes asynchronous checkpointing possible, along with timing results verified by IBM’s Research team. - - -![Async Checkpointing vs Standard Checkpointing](/assets/images/reducing-checkpointing-times/fg1.png){:style="width:100%"} - - -Model checkpointing is a vital part of large model training, but checkpointing is an expensive process as each checkpoint process involves blocking training progress in order to save out the latest model weights. However, not checkpointing or reducing checkpointing frequency can result in a significant loss in training progress. For example, failures such as a deadlock, straggler, and gpu errors require the training process to be restarted. In order to restart from a failure, all (training) workers must stop their training process and be restarted from the last saved checkpoint. - -Thus, the inherent tension between robustness to failures vs training progress plays out as a tradeoff, but now with asynchronous checkpointing, PyTorch Distributed is able to significantly reduce this tension and enable frequent checkpoint with minimal impact to the overall training time. - -For background, it was almost exactly [a year ago](https://pytorch.org/blog/performant-distributed-checkpointing/) that we showcased how distributed checkpointing had massively sped up checkpointing times from the original torch.save() functionality. As IBM Research had noted, torch.save could take up to 30 minutes to checkpoint a single 11B model (PyTorch 1.13). - -With advancements in distributed checkpointing, checkpoints could be done in under 4 minutes for up to 30B model sizes. - -With asynchronous checkpointing, the training time lost due to checkpointing now moves to under 30 seconds, and often as short as 6 seconds. - -To be clear, asynchronous checkpointing does not compress the actual serialization checkpointing time as the previous update showcased. Rather it moves the final checkpointing process off the critical path (to cpu threads) to allow GPU training to continue while finalizing the checkpoint under separate threads. 
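A rough sketch of what this looks like from a training loop, assuming the `torch.distributed.checkpoint.async_save` API described in this post (the concrete snippets appear in the figures further below) together with a dedicated gloo-backed process group for the checkpointing collectives:

```py
import torch
import torch.distributed as dist
import torch.distributed.checkpoint as dcp

# Launch with torchrun. nccl serves the training collectives; gloo backs the CPU-side
# checkpointing threads, and a separate group keeps the two sets of collectives apart.
dist.init_process_group(backend="cpu:gloo,cuda:nccl")
checkpoint_group = dist.new_group(backend="gloo")

model = torch.nn.Linear(1024, 1024).cuda()
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)

for step in range(100):
    loss = model(torch.randn(8, 1024, device="cuda")).sum()
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()

    if step % 10 == 0:
        # Returns once the weights are staged to CPU; serialization to disk finishes
        # on background threads while GPU training continues.
        future = dcp.async_save(
            {"model": model.state_dict()},
            checkpoint_id=f"/tmp/ckpt_step_{step}",
            process_group=checkpoint_group,
        )
```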
- -However, to the user, the effect is nearly the same in that down time for training due to checkpointing is substantially reduced, in many cases by 10x or even 20x. - - -![Async Dist Checkpointing](/assets/images/reducing-checkpointing-times/fg2.png){:style="width:100%"} - - -As the above speedup chart shows, asynchronous checkpointing produces a 10x to 23x further improvement over the previous large improvements from a year ago. - - -## How does Asynchronous Checkpointing work? - -Asynchronous checkpointing modularizes the checkpointing process into two parts rather than one monolithic process. The first phase copies the data from each gpu/rank from GPU to CPU. This is the visible downtime to the user and can take from 6 - 14 seconds for 7B-13B model sizes. The second phase asynchronously copies the data from CPU memory to disk to persist the checkpoint. - - -Once data is copied to CPU in the first phase, the GPU is free to immediately resume training. Hence with asynchronous checkpointing the downtime for checkpointing is simply the time needed to copy over the latest model states to CPU. - -At the same time that training resumes, non-blocking CPU threads work with the freshly arrived data in memory to complete the full checkpointing/serialization process to disk (i.e. persistent save). - -![flow diagram](/assets/images/reducing-checkpointing-times/fg3.png){:style="width:100%"} - - - -Note that PyTorch’s Distributed Checkpointer relies on collective communication calls for per-rank metadata necessary to optimize saves, as well as a final synchronization which marks checkpointing as complete and makes the action atomic. This can interfere with distributed training (as distributed training also relies upon similar calls to synchronize training across multiple GPUs) if the Checkpointing thread utilizes the same process group used for training. - -Specifically, a race condition between the calls could potentially cause training and asynch checkpointing save threads to wait on collective calls at the same time, resulting in a true collective hang. - -We avoided this scenario by initializing a separate process group for async checkpointing. This separates the checkpointing collectives into their own logical process group, which thus ensures it will not interfere with collective calls in the main training threads. - - -## How do I use Asynchronous Checkpointing in my training? - -Usage of Asynchronous checkpointing is relatively straightforward. Using the latest nightly version of PyTorch, you will want to initialize your process group with both nccl and gloo. Gloo is required for the cpu threads portion. - -From there, create a duplicate process group which the asynchronous checkpointing will utilize. -Then train as usual but at the point when you want to checkpoint, use the asynchronous save api, passing in the states to save, the checkpoint id and the checkpoint process group. - -![Code snippet](/assets/images/reducing-checkpointing-times/fg4.png){:style="width:100%"} - - - - -Asynchronous checkpointing is also fully implemented in [torchtitan](https://github.com/pytorch/torchtitan). Here, it is implemented for use with pre-training your own Llama2 or Lllama3 model. Using it is as simple as updating the toml config file: - -![Code snippet](/assets/images/reducing-checkpointing-times/fg5.png){:style="width:100%"} - - - -## Future work - -Checkpointing has made huge strides over the past year. 
Moving from almost half an hour checkpoints to under 5 minutes with distributed checkpointing and now to under 30 seconds with asynchronous checkpointing. - -The last frontier - zero overhead checkpointing where even the < 30 seconds is eliminated by streaming the updated weights during the backward pass such that checkpoint data is already on cpu at the point asynchronous checkpointing would kick in. - -This would effectively move large model training to where checkpointing has no disruption or downtime enabling both more robustness (as checkpoints could be taken more frequently) and faster training progress due to no downtime for checkpointing. - -Source code link: [https://github.com/pytorch/pytorch/blob/main/torch/distributed/checkpoint/state_dict_saver.py](https://github.com/pytorch/pytorch/blob/main/torch/distributed/checkpoint/state_dict_saver.py) diff --git a/_posts/2024-06-20-accelerating-neural-network-training.md b/_posts/2024-06-20-accelerating-neural-network-training.md deleted file mode 100644 index fc1f7319690f..000000000000 --- a/_posts/2024-06-20-accelerating-neural-network-training.md +++ /dev/null @@ -1,248 +0,0 @@ ---- -layout: blog_detail -title: "Accelerating Neural Network Training with Semi-Structured (2:4) Sparsity" -author: Jesse Cai, Daniel Haziza, Supriya Rao ---- - -Over the past year, we’ve added support for semi-structured (2:4) sparsity into PyTorch. With just a few lines of code, we were able to show a 10% end-to-end inference speedup on [segment-anything](https://github.com/pytorch/ao/tree/main/torchao/sparsity#segment-anything) by replacing dense matrix multiplications with sparse matrix multiplications. - -However, matrix multiplications are not unique to neural network inference - they happen during training as well. By expanding on the core primitives we used earlier to accelerate inference, we were also able to accelerate model training. We wrote a replacement nn.Linear layer, `SemiSparseLinear`, that is able to achieve a 1.3x [speedup](https://github.com/pytorch/ao/tree/main/torchao/sparsity/training#benchmarking) across the forwards + backwards pass of the linear layers in the MLP block of ViT-L on a NVIDIA A100. - -**End-to-end, we see a wall time reduction of 6% for a [DINOv2 ViT-L](https://github.com/facebookresearch/dinov2) training, with virtually no accuracy degradation out of the box (82.8 vs 82.7 on ImageNet top-1 accuracy).** - - -![2 strategies for training a ViT model](/assets/images/accelerating-neural-network-training/fg1.png){:style="width:100%"} - - -_We compare 2 strategies for training a ViT model for 125k iterations on 4x NVIDIA A100s: either fully dense (blue), or sparse for 70% of the training, then dense (orange). Both achieve similar results on the benchmarks, but the sparse variant trains 6% faster. For both experiments, we evaluate the intermediate checkpoints with and without sparsity._ - -As far as we are aware, **this is the first OSS implementation of accelerated sparse training** and we’re excited to provide a user API in [torchao](https://github.com/pytorch/ao/tree/main/torchao/sparsity/training#benchmarking). 
You can try accelerating your own training runs with just a few lines of code: - -```py -# Requires torchao and pytorch nightlies and CUDA compute capability 8.0+ -import torch -from torchao.sparsity.training import ( - SemiSparseLinear, - swap_linear_with_semi_sparse_linear, -) - -model = torch.nn.Sequential(torch.nn.Linear(1024, 4096)).cuda().half() - -# Specify the fully-qualified-name of the nn.Linear modules you want to swap -sparse_config = { - "seq.0": SemiSparseLinear -} - -# Swap nn.Linear with SemiSparseLinear, you can run your normal training loop after this step -swap_linear_with_semi_sparse_linear(model, sparse_config) -``` - -## How does this work? - -The general idea behind sparsity is simple: skip calculations involving zero-valued tensor elements to speed up matrix multiplication. However, simply setting weights to zero isn't enough, as the dense tensor still contains these pruned elements and dense matrix multiplication kernels will continue to process them, incurring the same latency and memory overhead. To achieve actual performance gains, we need to replace dense kernels with sparse kernels that intelligently bypass calculations involving pruned elements. - -These kernels work on sparse matrices, which remove the pruned elements and store the specified elements in a compressed format. There are many different sparse formats, but we’re particularly interested in **semi-structured sparsity,** also known as **2:4 structured sparsity** or **fine-grained structured sparsity** or more generally **N:M structured sparsity**. - -![2:4 sparse compressed representation](/assets/images/accelerating-neural-network-training/fg2.png){:style="width:100%;display:block;max-width:600px;margin-left:auto;margin-right:auto;"} - - -_2:4 sparse compressed representation. Original [Source](https://developer.nvidia.com/blog/structured-sparsity-in-the-nvidia-ampere-architecture-and-applications-in-search-engines/)_ - -A 2:4-sparse matrix is a matrix where at most 2 elements are non-zero for every 4 elements, as illustrated in the image above. Semi-structured sparsity is attractive because it exists in a goldilocks spot of performance and accuracy: - - -1. NVIDIA GPUs since Ampere offer hardware acceleration and library support ([cuSPARSELt](https://docs.nvidia.com/cuda/cusparselt/index.html)) for this format, with matrix multiplication being up to 1.6x faster -2. Pruning models to fit this sparsity pattern does not degrade accuracy as much as other patterns. NVIDIA’s [whitepaper](https://arxiv.org/pdf/2104.08378) shows pruning then retraining is able to recover accuracy for most vision models. - -![Illustration of 2:4 (sparse) matrix multiplication on NVIDIA GPUs](/assets/images/accelerating-neural-network-training/fg3.png){:style="width:100%"} - -_Illustration of 2:4 (sparse) matrix multiplication on NVIDIA GPUs. Original [source](https://arxiv.org/pdf/2104.08378)_ - -Accelerating inference with semi-structured sparsity is straightforward. Since our weights are fixed during inference, we can prune and compress the weight ahead of time (offline) and store the compressed sparse representation instead of our dense tensor. - - -![flow chart](/assets/images/accelerating-neural-network-training/fg4.png){:style="width:100%"} - -Then, instead of dispatching to dense matrix multiplication we dispatch to sparse matrix multiplication, passing in the compressed sparse weight instead of the normal dense one. 
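As a rough sketch of that inference flow, using the prototype `to_sparse_semi_structured` API in core PyTorch (a CUDA GPU with compute capability 8.0+ is required; the exact dtype and shape constraints are covered in the tutorial linked below):

```py
import torch
from torch.sparse import to_sparse_semi_structured

# Offline: zero out weights in a 2:4 pattern (a fixed mask here for illustration;
# magnitude-based pruning would normally pick which 2 of every 4 to keep), then compress.
linear = torch.nn.Linear(4096, 4096).half().cuda().eval()
mask = torch.tensor([1, 1, 0, 0], dtype=torch.bool, device="cuda").tile(4096, 1024)
linear.weight = torch.nn.Parameter(to_sparse_semi_structured(linear.weight.detach() * mask))

# Online: the same nn.Linear call now dispatches to the 2:4 sparse matmul kernel.
x = torch.randn(8, 4096, dtype=torch.float16, device="cuda")
with torch.inference_mode():
    y = linear(x)
```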
For more information about accelerating models for inference using 2:4 sparsity, please refer to our [tutorial](https://pytorch.org/tutorials/advanced/semi_structured_sparse.html?highlight=beta). - - -## Extending sparse inference acceleration to training - -In order to use sparsity to reduce the training time of our models, we need to consider when the mask is calculated, as once we store the compressed representation the mask is fixed. - -Training with a fixed mask applied to an existing trained dense model (also known as **pruning**) does not degrade accuracy, but this requires two training runs - one to obtain the dense model and another to make it sparse, offering no speedups. - -Instead we’d like to train a sparse model from scratch (**dynamic sparse training**), but training from scratch with a fixed mask will lead to a significant drop in evaluations, as the sparsity mask would be selected at initialization, when the model weights are essentially random. - -To maintain the accuracy of the model when training from scratch, we prune and compress the weights at runtime, so that we can calculate the optimal mask at each step of the training process. - -Conceptually you can think of our approach as an approximate matrix multiplication technique, where we `prune_and_compress` and dispatch to `sparse_GEMM` in less time than a `dense_GEMM` call would take. This is difficult because the native pruning and compression functions are too slow to show speedups. - -Given the shapes of our ViT-L training matrix multiplications (13008x4096x1024), we measured the runtime of a dense and sparse GEMM respectively at 538us and 387us. In other words, the pruning and compression step of the weight matrix must run in less than 538-387=151us to have any efficiency gain. Unfortunately, the compression kernel provided in cuSPARSELt already takes 380us (without even considering the pruning step!). - - -Given the max NVIDIA A100 memory IO (2TB/s), and considering that a prune and compress kernel would be memory bound, we could theoretically prune and compress our weight (4096x1024x2 bytes=8MB) in 4us (8MB / 2TB/s)! And in fact, we were able to write a kernel that prunes and compresses a matrix into 2:4-sparse format, and runs in 36 us (10x faster than the compression kernel in cuSPARSELt), making the entire GEMM (including the sparsification) faster. Our kernel is [available](https://github.com/pytorch/pytorch/pull/122350) for use in PyTorch. - - -![Our custom sparsification kernel](/assets/images/accelerating-neural-network-training/fg5.png){:style="width:100%"} - -_Our custom sparsification kernel, which includes pruning + compression, is ~30% faster across a linear layer forward+backward. Benchmarks run on a NVIDIA A100-80GB GPU._ - - -### Writing a performant runtime sparsification kernel - -There were multiple challenges we faced in order to implement a performant runtime sparsification kernel, which we will explore below. - - -#### 1) Handling the backwards pass - -For the backwards pass, we need to calculate dL/dX and dL/dW for the gradient update and the subsequent layer, which means we need to calculate xWT and xTW respectively. 
- - -![Overview of runtime sparsification for training acceleration (FW + BW pass)](/assets/images/accelerating-neural-network-training/fg6.png){:style="width:100%"} - -_Overview of runtime sparsification for training acceleration (FW + BW pass)_ - -However this is problematic, because the compressed representation cannot be transposed, since there’s no guarantee that the tensor is 2:4 sparse in both directions. - - -![Both matrices are valid 2:4 matrices. However, the right one is no longer a valid 2:4 matrix once transposed because one column contains more than 2 elements](/assets/images/accelerating-neural-network-training/fg7.png){:style="width:100%;display:block;max-width:500px;margin-left:auto;margin-right:auto;"} - - -_Both matrices are valid 2:4 matrices. However, the right one is no longer a valid 2:4 matrix once transposed because one column contains more than 2 elements_ - -Therefore, we prune a 4x4 tile, instead of a 1x4 strip. We greedily preserve the largest values, ensuring that we take at most 2 values for each row / column. While this approach is not guaranteed to be optimal, as we sometimes only preserve 7 values instead of 8, it efficiently calculates a tensor that is 2:4 sparse both row-wise and column-wise. - -We then compress both the packed tensor and the packed transpose tensor, storing the transpose tensor for the backwards pass. By calculating both the packed and packed transpose tensor at the same time, we avoid a secondary kernel call in the backwards pass. - -![Our kernel prunes the weight matrix in registers](/assets/images/accelerating-neural-network-training/fg8.png){:style="width:100%"} - - -_Our kernel prunes the weight matrix in registers, and writes the compressed values in global memory. It also prunes at the same time W.t, which is needed for the backward pass, minimizing the memory IO_ - -There’s some additional transpose trickery needed to handle the backwards pass - the underlying hardware only supports operations where the first matrix is sparse. For weight sparsification during inference, when we need to calculate xWT we rely on transpose properties to swap the order of the operands. - -![Math formula](/assets/images/accelerating-neural-network-training/fg1.jpg){:style="width:100%;display:block;max-width:300px;margin-left:auto;margin-right:auto;"} - -During inference, we use `torch.compile` to fuse the outer transpose into subsequent pointwise ops in order to avoid paying a performance penalty. - -However in the case of the backwards pass of training, we have no subsequent pointwise op to fuse with. Instead, we fuse the transposition into our matrix multiplication by taking advantage of cuSPARSELt’s ability to specify the row / column layout of the result matrix. - - -#### 2) Kernel tiling for efficient memory-IO - -In order for our kernel to be as efficient as possible, we want to coalesce our reads / writes, as we found that memory IO to be the main bottleneck. This means that within a CUDA thread, we want to read/write chunks of 128 bytes at a time, so that multiple parallel reads/writes can be coalesced into a single request by the GPU memory controller. - -Therefore, instead of a thread handling a single 4x4 tile, which is only 4x4x2 = 32 bytes, we decided that each thread will handle 4 4x4 tiles (aka an 8x8 tile), which allows us to operate 8x8x2 =128 byte chunks. 
- -![Kernel tiling for efficient memory-IO](/assets/images/accelerating-neural-network-training/fg9.png){:style="width:100%"} - - -#### 3) Sorting elements in a 4x4 tile without warp-divergence - -For each individual 4x4 tile within our thread we calculate a bitmask that specifies which elements to prune and which elements to keep. To do this we sort all 16 elements and greedily preserve elements, so long as they do not break our 2:4 row / col constraint. This preserves only the weights with the largest values. - -Crucially we observe that we are only ever sorting a fixed number of elements, so by using a branchless [sorting network](https://en.wikipedia.org/wiki/Sorting_network), we can avoid warp divergence. - -![Sorting network diagram](/assets/images/accelerating-neural-network-training/fg10.png){:style="width:100%"} - -_For clarity, the transposed packed tensor and metadata are omitted. Sorting network diagram taken from [Wikipedia](https://en.wikipedia.org/wiki/Sorting_network)._ - -Warp divergence occurs when we have conditional execution inside across a thread block. In CUDA, work items in the same work group (thread block) are dispatched at the hardware level in batches (warps). If we have conditional execution, such that some work-items in the same batch run different instructions, then they are masked when the warp is dispatched, or dispatched sequentially. - -For example, if we have some code like `if (condition) do(A) else do(B)`, where condition is satisfied by all the odd-numbered work items, then the total runtime of this conditional statement is `do(A) + do(B)`, since we would dispatch `do(A)` for all odd-numbered work-items, masking out even-numbered work-items, and `do(B)` for all even numbered work-items, masking out odd-numbered work-items. This [answer](https://www.reddit.com/r/CUDA/comments/gkpjxe/what_is_warp_divergence/#:~:text=Warp%20divergence%20is%20a%20%22Compute,later%20processed%20using%20different%20instructions.) provides more information about warp divergence. - - -#### 4) Writing the compressed matrices and metadata - -Once the bitmask has been computed, the weight data has to be written back in a compressed format in global memory. This is not trivial, because the data needs to stay in registers, and it’s not possible to index registers (eg `C[i++] = a` prevents us from storing `C` in registers). Furthermore, we found that `nvcc` was using many more registers than we expected, which caused register spilling and impacted global performance. We write this compressed matrix to global memory in Column-Major format to make the writes more efficient. - - -![compressed matrix to global memory in Column-Major format](/assets/images/accelerating-neural-network-training/fg11.png){:style="width:100%"} - - -We also need to write the cuSPARSELt metadata as well. This metadata layout is quite similar to the one from the open-source CUTLASS library and is optimized for being loaded efficiently through shared-memory in the GEMM kernel with the PTX `ldmatrix` instruction. - -However, this layout is not optimized to be written efficiently: the first 128 bits of the metadata tensor contains metadata about the first 32 columns of the rows 0, 8, 16 and 24. Recall that each thread handles an 8x8 tile, which means that this information is scattered across 16 threads. - -We rely on a series of warp-shuffle operations, once for the original and transposed representation respectively to write the metadata. 
Fortunately, this data represents less than 10% of the total IO, so we can afford to not fully coalesce the writes. - - -## DINOv2 Sparse Training: Experimental Setup and Results - -For our experiments, the ViT-L model is trained on ImageNet for 125k steps using the DINOv2 method. All our experiments were run on 4x AMD EPYC 7742 64-core CPUs and 4x NVIDIA A100-80GB GPUs. During sparse training, the model is trained with 2:4 sparsity enabled for the first part of the training, where only half of the weights are enabled. This sparsity mask on the weights is dynamically recomputed at every step, as weights are continuously updated during the optimization. For the remaining steps, the model is trained densely, producing a final model without 2:4 sparsity (except the 100% sparse training setup), which is then evaluated. - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

| Training setup | ImageNet 1k log-regression |
| --- | --- |
| 0% sparse (125k dense steps, baseline) | 82.8 |
| 40% sparse (50k sparse -> 75k dense steps) | 82.9 |
| 60% sparse (75k sparse -> 50k dense steps) | 82.8 |
| 70% sparse (87.5k sparse -> 37.5k dense steps) | 82.7 |
| 80% sparse (100k sparse -> 25k dense steps) | 82.7 |
| 90% sparse (112.5k sparse -> 12.5k dense steps) | 82.0 |
| 100% sparse (125k sparse steps) | 82.3 (2:4-sparse model) |

        - - -![sparsity training diagrams](/assets/images/accelerating-neural-network-training/fg12.png){:style="width:100%"} - - -During the sparse training steps, in the backward pass we obtain a dense gradient for the sparse weights. For the gradient descent to be sound, we should also sparsify this gradient before using it in the optimizer to update the weights. Instead of doing that, we use the full dense gradient to update the weights - we found this to work better in practice: this is the STE ([Straight Through Estimator](https://arxiv.org/pdf/1903.05662)) strategy. In other words, we update all the parameters at every step, even the ones we don’t use. - - -## Conclusion and Future Work - -In this blog post, we’ve shown how to accelerate neural network training with semi-structured sparsity and explained some of the challenges we faced. We were able to achieve a 6% end to end speedup on DINOv2 training with a small 0.1 pp accuracy drop. - -There are several areas of expansion for this work: - - - -* **Expansion to new sparsity patterns:** Researchers have created new sparsity patterns like [V:N:M](https://arxiv.org/pdf/2310.02065) sparsity that use the underlying semi-structured sparse kernels to allow for more flexibility. This is especially interesting for applying sparsity to LLMs, as 2:4 sparsity degrades accuracy too much, but we have seen some positive [results](https://arxiv.org/pdf/2310.06927) for more general N:M pattern. -* **Performance optimizations for sparse fine-tuning:** This post covers sparse training from scratch, but oftentimes we want to fine-tune a foundational model. In this case, a static mask may be sufficient to preserve accuracy which would enable us to make additional performance optimizations. -* **More experiments on pruning strategy:** We calculate the mask at each step of the network, but calculating the mask every n steps may yield better training accuracy. Overall, figuring out the best strategy to use semi-structured sparsity during training is an open area of research. -* **Compatibility with fp8:** The hardware also supports fp8 semi-structured sparsity, and this approach should work similarly with fp8 in principle. In practice, we would need to write similar sparsification kernels, and could possibly fuse them with the scaling of the tensors. -* **Activation Sparsity:** Efficient sparsification kernels also enable to sparsify the activations during training. Because the sparsification overhead grows linearly with the sparsified matrix size, setups with large activation tensors compared to the weight tensors could benefit more from activation sparsity than weight sparsity. Furthermore, activations are naturally sparse because of the usage of ReLU or GELU activation functions, reducing accuracy degradation. - -If you are interested in these problems, please feel free to open an issue / PR in [torchao](https://github.com/pytorch/ao), a community we’re building for architecture optimization techniques like quantization and sparsity. 
Additionally, if you have general interest in sparsity please reach out in [CUDA-MODE](discord.gg/cudamode) (#sparsity) \ No newline at end of file diff --git a/_posts/2024-06-20-pytorch-docathon-h2-2024-wrap-up.md b/_posts/2024-06-20-pytorch-docathon-h2-2024-wrap-up.md deleted file mode 100644 index 3f0ad5e10ab0..000000000000 --- a/_posts/2024-06-20-pytorch-docathon-h2-2024-wrap-up.md +++ /dev/null @@ -1,25 +0,0 @@ ---- -layout: blog_detail -title: "🎉 PyTorch Docathon H1 2024 Wrap-up 🎉" ---- - -We are thrilled to announce the successful completion of the H1 2024 PyTorch Docathon! The event was a resounding success, and we want to extend our heartfelt gratitude to all the participants who made it possible. Dedication, expertise, and tireless efforts of our open-source contributors have once again helped us to improve PyTorch documentation. - -This Docathon ran from June 4 through June 20 with more than 176 registrants. The energy and enthusiasm were palpable, and entrants were judged on the difficulty of submissions that resulted in over 50 merged pull requests. - -We want to give a special shout-out to our top contributors, who went above and beyond during this event. Your dedication and expertise have been invaluable in enhancing the PyTorch documentation and empowering developers worldwide. - -## Meet the top contributors - -* First place: [ahoblitz](https://github.com/ahoblitz), [afrittoli](https://github.com/afrittoli), [kiszk](https://github.com/kiszk) -* Second place: [loganthomas](https://github.com/loganthomas), [ignaciobartol](https://github.com/ignaciobartol), [arunppsg](https://github.com/arunppsg), [alperenunlu](https://github.com/alperenunlu) -* Third place: [anandptl84](https://github.com/anandptl84), [GdoongMathew](https://github.com/GdoongMathew), [ZailiWang](https://github.com/ZailiWang), [ZhaoqiongZ](https://github.com/ZhaoqiongZ), [jingxu10](https://github.com/jingxu10), [sitamgithub-MSIT](https://github.com/sitamgithub-MSIT) -* Honorable mentions for contributing to PyTorch XLA and torchfix libraries: [arunppsg](https://github.com/arunppsg) and [sitamgithub-MSIT](https://github.com/sitamgithub-MSIT) - -For the full list of participants, see [here](https://github.com/pytorch/tutorials/blob/main/docathon-leaderboard.md). - -As we bring this Docathon to a close, we encourage each and every one of you to stay inspired and keep contributing to PyTorch documentation and code, and pushing the boundaries of what’s possible with PyTorch. Your collective efforts are shaping the landscape of deep learning and fostering innovation in the PyTorch community. - -Thank you again for your participation and support. We look forward to seeing what you will achieve next! - -Team PyTorch diff --git a/_posts/2024-06-23-training-moes.md b/_posts/2024-06-23-training-moes.md deleted file mode 100644 index fee1999f3fbc..000000000000 --- a/_posts/2024-06-23-training-moes.md +++ /dev/null @@ -1,88 +0,0 @@ ---- -layout: blog_detail -title: "Training MoEs at Scale with PyTorch" -author: Brian Chu, Mihir Patel, Less Wright, Vitaliy Chiley, Evan Racah, Wanchao Liang, Iris Zhang, Andrew Gu ---- - -Over the past year, Mixture of Experts (MoE) models have surged in popularity, fueled by powerful open-source models like [DBRX](https://www.databricks.com/blog/introducing-dbrx-new-state-art-open-llm), [Mixtral](https://mistral.ai/news/mixtral-of-experts/), [DeepSeek](https://github.com/deepseek-ai/DeepSeek-V2), and many more. 
At Databricks, we've worked closely with the PyTorch team to scale training of MoE models. In this blog post, we’ll talk about how we scale to over three thousand GPUs using [PyTorch Distributed](https://pytorch.org/tutorials/beginner/dist_overview.html) and [MegaBlocks](https://github.com/databricks/megablocks), an efficient open-source MoE implementation in PyTorch. - - -## What is a MoE? - -A MoE model is a model architecture that uses multiple expert networks to make predictions. A gating network is used to route and combine the outputs of experts, ensuring each expert is trained on a different, specialized distribution of tokens. The architecture of a transformer-based large language model typically consists of an embedding layer that leads into multiple transformer blocks (Figure 1, Subfigure A). Each transformer block contains an attention block and a dense feed forward network (Figure 1, Subfigure B). These transformer blocks are stacked such that the output of one transformer block leads to the input of the next block. The final output goes through a fully connected layer and softmax to obtain probabilities for the next token to output. - -When using a MoE in LLMs, the dense feed forward layer is replaced by a MoE layer which consists of a gating network and a number of experts (Figure 1, Subfigure D). The gating network, typically a linear feed forward network, takes in each token and produces a set of weights that determine which tokens are routed to which experts. The experts themselves are typically implemented as a feed forward network as well. During training, the gating network adapts to assign inputs to the experts, enabling the model to specialize and improve its performance. The router outputs are then used to weigh expert outputs to give the final output of the MoE layer. - - -![Figure 1: Using Mixture of Experts in a transformer block](/assets/images/training-moes/fg1.png){:style="width:100%"} - -_Figure 1: Using Mixture of Experts in a transformer block_ - -Compared to dense models, MoEs provide more efficient training for a given compute budget. This is because the gating network only sends tokens to a subset of experts, reducing the computational load. As a result, the capacity of a model (its total number of parameters) can be increased without proportionally increasing the computational requirements. During inference, only some of the experts are used, so a MoE is able to perform faster inference than a dense model. However, the entire model needs to be loaded in memory, not just the experts being used. - -The sparsity in MoEs that allows for greater computational efficiency comes from the fact that a particular token will only be routed to a subset of experts. The number of experts and how experts are chosen depends on the implementation of the gating network, but a common method is top k. The gating network first predicts a probability value for each expert, then routes the token to the top k experts to obtain the output. However, if all tokens always go to the same subset of experts, training becomes inefficient and the other experts end up undertrained. To alleviate this problem, a load balancing loss is introduced that encourages even routing to all experts. - -The number of experts and choosing the top k experts is an important factor in designing MoEs. A higher number of experts allows scaling up to larger models without increasing computational cost. 
This means that the model has a higher capacity for learning; however, past a certain point the performance gains tend to diminish. The number of experts chosen needs to be balanced with the inference costs of serving the model since the entire model needs to be loaded in memory. Similarly, when choosing top k, a lower top k during training results in smaller matrix multiplications, leaving free computation on the table if communication costs are large enough. During inference, however, a higher top k generally leads to slower inference speed. - - -## MegaBlocks - -[MegaBlocks](https://github.com/databricks/megablocks) is an efficient MoE implementation that uses sparse matrix multiplication to compute expert outputs in parallel despite uneven token assignment. MegaBlocks implements a dropless MoE that avoids dropping tokens while using GPU kernels that maintain efficient training. Prior to MegaBlocks, dynamic routing formulations forced a tradeoff between model quality and hardware efficiency. Previously, users had to either drop tokens from computation or waste computation and memory on padding. Experts can receive a variable number of tokens and the expert computation can be performed efficiently using block sparse matrix multiplication. We’ve [integrated MegaBlocks into LLM Foundry](https://www.databricks.com/blog/bringing-megablocks-databricks) to enable scaling MoE training to thousands of GPUs. - - -![Figure 2: Matrix multiplication for expert computations](/assets/images/training-moes/fg2.png){:style="width:100%"} - -_Figure 2: Matrix multiplication for expert computations_ - - -### Expert Parallelism - -As models scale to larger sizes and fail to fit on a single GPU, we require more advanced forms of parallelism. Expert parallelism is a form of model parallelism where we place different experts on different GPUs for better performance. Instead of expert weights being communicated across all GPUs, tokens are sent to the device that contains the expert. By moving data instead of weights, we can aggregate data across multiple machines for a single expert. The router determines which tokens from the input sequence should be sent to which experts. This is typically done by computing a gating score for each token-expert pair, and then routing each token to the top-scoring experts. Once the token-to-expert assignments are determined, an all-to-all communication step is performed to dispatch the tokens to the devices hosting the relevant experts. This involves each device sending the tokens assigned to experts on other devices, while receiving tokens assigned to its local experts. - -The key advantage of expert parallelism is processing a few larger matrix multiplications instead of several small matrix multiplications. As each GPU only has a subset of experts, it only has to do computation for those experts. Correspondingly, as we aggregate tokens across multiple GPUs, the size of each matrix is proportionally larger. As GPUs are optimized for large-scale parallel computations, larger operations can better exploit their capabilities, leading to higher utilization and efficiency. A more in-depth explanation of the benefits of larger matrix multiplications can be found [here](https://www.thonking.ai/p/what-shapes-do-matrix-multiplications). Once the computation is complete, another all-to-all communication step is performed to send the expert outputs back to their original devices.
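To make the gating and top-k routing just described concrete, here is a minimal PyTorch sketch of a router. The class name and shapes are illustrative assumptions; this is not the MegaBlocks or LLM Foundry implementation, which additionally handles load balancing and the all-to-all dispatch.

```
import torch
import torch.nn.functional as F

class TopKRouter(torch.nn.Module):
    """Gating network: score every token against every expert and keep the top-k."""
    def __init__(self, d_model: int, num_experts: int, top_k: int = 2):
        super().__init__()
        self.gate = torch.nn.Linear(d_model, num_experts, bias=False)
        self.top_k = top_k

    def forward(self, tokens: torch.Tensor):
        # tokens: (num_tokens, d_model) -> scores: (num_tokens, num_experts)
        scores = F.softmax(self.gate(tokens), dim=-1)
        # Route each token to its top-k experts and renormalize the kept weights
        weights, expert_ids = scores.topk(self.top_k, dim=-1)
        weights = weights / weights.sum(dim=-1, keepdim=True)
        return weights, expert_ids

router = TopKRouter(d_model=16, num_experts=8, top_k=2)
weights, expert_ids = router(torch.randn(4, 16))  # 4 tokens -> (4, 2) weights and expert indices
```

In expert parallelism, the `expert_ids` produced by such a router are what drive the all-to-all step that sends each token to the device hosting its selected experts.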
- - -![Figure 3: Token routing in expert parallelism](/assets/images/training-moes/fg3.png){:style="width:100%"} - - -_Figure 3: Token routing in expert parallelism_ - -We leverage PyTorch’s [DTensor](https://github.com/pytorch/pytorch/blob/main/torch/distributed/_tensor/README.md), a low-level abstraction for describing how tensors are sharded and replicated, to effectively implement expert parallelism. We first manually place experts on different GPUs, typically sharding across a node to ensure we can leverage NVLink for fast GPU communication when we route tokens. We can then build a [device mesh](https://pytorch.org/tutorials/recipes/distributed_device_mesh.html) on top of this layout, which lets us succinctly describe the parallelism across the entire cluster. We can use this device mesh to easily checkpoint or rearrange experts when we need alternate forms of parallelism. - - -### Scaling ZeRO-3 with PyTorch FSDP - -In conjunction with expert parallelism, we use data parallelism for all other layers, where each GPU stores a copy of the model and optimizer and processes a different chunk of data. After each GPU has completed a forward and backward pass, gradients are accumulated across GPUs for a global model update. - -ZeRO-3 is a form of data parallelism where weights and optimizers are sharded across each GPU instead of being replicated. Each GPU now only stores a subset of the full model, dramatically reducing memory pressure. When a part of the model is needed for computation, it is gathered across all the GPUs, and after the computation is complete, the gathered weights are discarded. We use PyTorch’s implementation of ZeRO-3, called [Fully Sharded Data Parallel (FSDP)](https://pytorch.org/blog/introducing-pytorch-fully-sharded-data-parallel-api/). - -As we scale to thousands of GPUs, the cost of communication across devices increases, slowing down training. Communication increases due to the need to synchronize and share model parameters, gradients, and optimizer states across all GPUs which involves all-gather and reduce-scatter operations. To mitigate this issue while keeping the benefits of FSDP, we utilize Hybrid Sharded Data Parallel (HSDP) to shard the model and optimizer across a set number of GPUs and replicate this multiple times to fully utilize the cluster. With HSDP, an additional all reduce operation is needed in the backward pass to sync gradients across replicas. This approach allows us to balance memory efficiency and communication cost during large scale distributed training. To use HSDP we can extend our previous device mesh from expert parallelism and let PyTorch do the heavy lifting of actually sharding and gathering when needed. - - -![Figure 4: FSDP and HSDP](/assets/images/training-moes/fg4.png){:style="width:100%"} - -_Figure 4: FSDP and HSDP_ - -With PyTorch, we can effectively combine these two types of parallelism, leveraging FSDP’s higher level API while using the lower-level [DTensor](https://github.com/pytorch/pytorch/blob/main/torch/distributed/_tensor/README.md) abstraction when we want to implement something custom like expert parallelism. We now have a 3D device mesh with expert parallel shard dimension, ZeRO-3 shard dimension, and a replicate dimension for pure data parallelism. Together, these techniques deliver near linear scaling across very large clusters, allowing us to achieve MFU numbers over 40%. 
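As a rough sketch of how a device mesh and HSDP-style wrapping fit together, the snippet below builds a simplified 2D replicate-by-shard mesh rather than the full 3D mesh with the expert-parallel dimension described above. It assumes a 32-GPU distributed launch (for example via torchrun) and a recent PyTorch release in which FSDP accepts a `device_mesh` argument; the mesh sizes and model are placeholders.

```
import torch
from torch.distributed.device_mesh import init_device_mesh
from torch.distributed.fsdp import FullyShardedDataParallel as FSDP, ShardingStrategy

# 2D mesh over 32 GPUs: shard parameters within each group of 8 (ZeRO-3 style),
# and replicate across the 4 groups (pure data parallelism)
mesh_2d = init_device_mesh("cuda", (4, 8), mesh_dim_names=("replicate", "shard"))

model = torch.nn.Transformer(d_model=512, nhead=8).cuda()

# HYBRID_SHARD = HSDP: shard inside each group, all-reduce gradients across groups
model = FSDP(
    model,
    device_mesh=mesh_2d,
    sharding_strategy=ShardingStrategy.HYBRID_SHARD,
)
```

Sharding along the inner dimension keeps the ZeRO-3 memory savings, while gradients are all-reduced across the outer replicate dimension, which is the memory-versus-communication trade-off discussed above.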
- - -### Elastic Checkpointing with Torch Distributed - -Fault tolerance is crucial for ensuring that LLMs can be trained reliably over extended periods, especially in distributed environments where node failures are common. To avoid losing progress when jobs inevitably encounter failures, we checkpoint the state of the model, which includes parameters, optimizer states, and other necessary metadata. When a failure occurs, the system can resume from the last saved state rather than starting over. To ensure robustness to failures, we need to checkpoint often and save and load checkpoints in the most performant way possible to minimize downtime. Additionally, if too many GPUs fail, our cluster size may change. Accordingly, we need the ability to elastically resume on a different number of GPUs. - -PyTorch supports elastic checkpointing through its distributed training framework, which includes utilities for both saving and loading checkpoints across different cluster configurations. PyTorch Distributed Checkpoint ensures the model's state can be saved and restored accurately across all nodes in the training cluster in parallel, regardless of any changes in the cluster's composition due to node failures or additions. - -Additionally, when training very large models, the size of checkpoints may be very large, leading to very slow checkpoint upload and download times. PyTorch Distributed Checkpoint supports sharded checkpoints, which enables each GPU to save and load only its portion of the model. When combining sharded checkpointing with elastic training, each GPU reads the metadata file to determine which shards to download on resumption. The metadata file contains information on what parts of each tensor are stored in each shard. The GPU can then download the shards for its part of the model and load that part of the checkpoint. - - -![Figure 5: Checkpointing saving and resumption resharded on additional GPUs](/assets/images/training-moes/fg5.png){:style="width:100%"} - - -_Figure 5: Checkpointing saving and resumption resharded on additional GPUs_ - -By parallelizing checkpointing across GPUs, we can spread out network load, improving robustness and speed. When training a model with 3000+ GPUs, network bandwidth quickly becomes a bottleneck. We take advantage of the replication in HSDP to first download checkpoints on one replica and then send the necessary shards to other replicas. With our integration in [Composer](https://github.com/mosaicml/composer), we can reliably upload checkpoints to cloud storage as frequently as every 30 minutes and automatically resume from the latest checkpoint in the event of a node failure in less than 5 minutes. - - -## Conclusion - -We’re very excited to see how PyTorch is enabling training state-of-the-art LLMs with great performance. In our post, we’ve shown how we implemented efficient MoE training through Pytorch Distributed and MegaBlocks on Foundry. Furthermore, Pytorch elastic checkpointing allowed us to quickly resume training on a different number of GPUs when node failures occurred. Using Pytorch HSDP has allowed us to scale training efficiently as well as improve checkpointing resumption times. We look forward to continuing building on a strong and vibrant open-source community to help bring great AI models to everyone. Come join us in building great models at [LLM Foundry](https://github.com/mosaicml/llm-foundry) and [PyTorch](https://github.com/pytorch/pytorch). 
\ No newline at end of file diff --git a/_posts/2024-06-25-pytorch-documentary.md b/_posts/2024-06-25-pytorch-documentary.md deleted file mode 100644 index 2fcd41f772e7..000000000000 --- a/_posts/2024-06-25-pytorch-documentary.md +++ /dev/null @@ -1,52 +0,0 @@ ---- -layout: blog_detail -title: "Powering the AI Revolution: The PyTorch Documentary" -author: The PyTorch Foundation ---- - -Now live: The official [PyTorch Documentary](https://documentary.pytorch.org/)! This film unveils the authentic narrative of PyTorch’s inception, attributing its existence to a dedicated group of unsung heroes driving technological innovation. - -The documentary shares the strength of the PyTorch community, resonating with our communities across the globe. We hope this story of PyTorch inspires greater contributions, attracts more contributors to the project, and fosters widespread recognition of PyTorch’s significance in the open source community. - - - -We couldn’t have produced this without the support of our PyTorch Foundation members and sponsors: - -![company logos](/assets/images/doc-logos.jpg){:style="width:100%"} - - -### AMD - -“PyTorch’s growth and adoption in the AI community is a testament to open collaboration. The collective efforts of all the contributors have helped propel PyTorch as one of the most widely adopted AI frameworks in the industry. AMD is proud to be a part of this movement - making sure that the future of AI is open - and we are excited to continue contributing to this vibrant ecosystem.” - -**– Niles Burbank, AMD** - -### AWS - -“The release of the PyTorch Documentary showcases the innovation and real-world impact of one of the most widely adopted open source machine learning frameworks. By supporting and contributing to the PyTorch community, AWS helps enable cutting-edge machine learning research that drives advancements in AI capabilities. We are excited about the documentary as it highlights the power of collaboration in propelling PyTorch to the forefront of machine learning and empowering developers and data scientists to create groundbreaking models. At AWS, we celebrate frameworks like PyTorch that foster environments where open source machine learning technologies can grow and benefit the community at-large, as well as our customers.” - -**– Brian Granger, AWS** - -### Google Cloud - -“Google recognizes the impact of PyTorch on the AI community, providing researchers and developers with powerful, flexible tools for innovation. This documentary not only celebrates the remarkable achievements of the PyTorch community but also highlights the collaborative spirit driving advancements in AI. We look forward to continuing our support for PyTorch and fostering an open ecosystem that accelerates machine learning research and application.” - -**– Dwarak Rajagopal, Google** - -### Meta - -“We have been so impressed with the growth and collaboration that PyTorch has created over the years. From very humble beginnings at Meta to a cornerstone in AI research and development, the documentary showcases the dedication of our contributors since the start. It’s an honor to be a part of something so impactful, and now it’s been documented for our community to take part in.” - -**– Soumith Chintala, Meta** - -### Microsoft Azure - -“We're truly excited about the premiere of the PyTorch Documentary. At Microsoft, PyTorch has been our default deep learning framework for building AI solutions including Microsoft Copilot. 
Additionally, we have made significant investments to create an optimized environment for our customers to develop, train, fine-tune and deploy their PyTorch workloads on Azure and Windows, furthering our commitment to democratize AI.” - -**– Eric Boyd, Microsoft** - -### PyTorch Foundation - -“The release of the PyTorch documentary marks a significant milestone for our community, showcasing the incredible journey and rapid evolution of PyTorch. We are excited to share these stories and achievements with the world, and we look forward to continuing to foster innovation and growth of the PyTorch community and PyTorch’s evolving ecosystem.” - -**– Matt White, PyTorch Foundation** \ No newline at end of file diff --git a/_posts/2024-07-03-hacker-cup.md b/_posts/2024-07-03-hacker-cup.md deleted file mode 100644 index 77a34ac9c533..000000000000 --- a/_posts/2024-07-03-hacker-cup.md +++ /dev/null @@ -1,8 +0,0 @@ ---- -layout: blog_detail -title: "Announcing Hacker Cup AI Track at NeurIPS 2024" ---- - -The PyTorch team, in partnership with Meta Hacker Cup and Microsoft Research, is excited to announce the Hacker Cup AI Track at NeurIPS 2024. This will be the first AI track for the popular Meta Hacker Cup programming competition designed to assess the capabilities of Generative AI in performing autonomous code generation tasks. We aim to test the limits of AI in complex coding challenges and measure the performance gap between AI systems and human programmers. We will provide access to all Hacker Cup problems since 2011 alongside their respective solutions in a multimodal (image and text) format, and utilize the existing Hacker Cup infrastructure for competitor evaluation. Featuring both _open evaluation, open model_ and _open evaluation, closed model_ tracks, this competition invites diverse participation from research institutions of varied interests and resource constraints, including academic labs, AI startups, large technology companies, and AI enthusiasts. Our goal is to develop and democratize meaningful advancements in code automation with the very first open evaluation process for competitive AI programmers. Registration will begin in **Early August**, with our first qualification round on **September 20th.** - -For more information please visit our website at [https://www.facebook.com/codingcompetitions/hacker-cup/](https://www.facebook.com/codingcompetitions/hacker-cup/) **and join our Discord** at [discord.gg/wWeN9hTH32](https://discord.com/invite/wWeN9hTH32) diff --git a/_posts/2024-07-09-accelerated-pytorch-inference.md b/_posts/2024-07-09-accelerated-pytorch-inference.md deleted file mode 100644 index bae115987d08..000000000000 --- a/_posts/2024-07-09-accelerated-pytorch-inference.md +++ /dev/null @@ -1,438 +0,0 @@ ---- -layout: blog_detail -title: "Accelerated PyTorch inference with torch.compile on AWS Graviton processors" -author: Sunita Nadampalli ---- - -## Summary - -Originally, PyTorch used an eager mode where each PyTorch operation that forms the model is run independently as soon as it’s reached. PyTorch 2.0 introduced [torch.compile](https://pytorch.org/tutorials/intermediate/torch_compile_tutorial.html) to speed up PyTorch code over the default eager mode. In contrast to eager mode, torch.compile pre-compiles the entire model into a single graph in a manner that’s optimal for running on a given hardware platform.
AWS optimized the PyTorch torch.compile feature for [AWS Graviton3 processors](https://aws.amazon.com/about-aws/whats-new/2022/05/amazon-ec2-c7g-instances-powered-aws-graviton3-processors/). This optimization results in up to 2x better performance for [Hugging Face](https://huggingface.co/models) model inference (based on geomean of performance improvement for 33 models) and up to 1.35x better performance for [TorchBench](https://github.com/pytorch/benchmark) model inference (geomean of performance improvement for 45 models) compared to the default eager mode inference across several natural language processing (NLP), computer vision (CV), and recommendation models on AWS Graviton3-based Amazon EC2 instances. Starting with PyTorch 2.3.1, the optimizations are available in torch Python [wheels](https://pypi.org/project/torch/2.3.1/) and AWS Graviton PyTorch [deep learning container (DLC)](https://github.com/aws/deep-learning-containers/blob/master/available_images.md#ec2-framework-graviton-containers-ec2-ecs-and-eks-support-only). - -In this blog post, we show how we optimized torch.compile performance on AWS Graviton3-based EC2 instances, how to use the optimizations to improve inference performance, and the resulting speedups. - - -## Why torch.compile and what’s the goal? - -In eager mode, operators in a model are run immediately as they are encountered. It’s easier to use, more suitable for machine learning (ML) researchers, and hence is the default mode. However, eager mode incurs runtime overhead because of redundant kernel launch and memory read overhead. Whereas in torch compile mode, operators are first synthesized into a graph, wherein one operator is merged with another to reduce and localize memory reads and total kernel launch overhead. - -The goal for the AWS Graviton team was to optimize torch.compile backend for Graviton3 processors. PyTorch eager mode was already optimized for Graviton3 processors with [Arm Compute Library (ACL)](https://github.com/ARM-software/ComputeLibrary) kernels using oneDNN (also known as MKLDNN). So, the question was, how to reuse those kernels in torch.compile mode to get the best of graph compilation and the optimized kernel performance together? - - -## Results - -The AWS Graviton team extended the torch inductor and oneDNN primitives that reused the ACL kernels and optimized compile mode performance on Graviton3 processors. Starting with PyTorch 2.3.1, the optimizations are available in the torch Python wheels and AWS Graviton DLC. Please see the **Running an inference** section that follows for the instructions on installation, runtime configuration, and how to run the tests. - -To demonstrate the performance improvements, we used NLP, CV, and recommendation models from [TorchBench](https://github.com/pytorch/benchmark) and the most downloaded NLP models from [Hugging Face](https://huggingface.co/models) across Question Answering, Text Classification, Token Classification, Translation, Zero-Shot Classification, Translation, Summarization, Feature Extraction, Text Generation, Text2Text Generation, Fill-Mask, and Sentence Similarity tasks to cover a wide variety of customer use cases. - -We started with measuring TorchBench model inference latency, in milliseconds (msec), for the eager mode, which is marked 1.0 with a red dotted line in the following graph. Then we compared the improvements from torch.compile for the same model inference, the normalized results are plotted in the graph. 
You can see that for the 45 models we benchmarked, there is a 1.35x latency improvement (geomean for the 45 models). - - -![PyTorch model inference performance improvement with torch.compile on AWS Graviton3-based c7g instance using TorchBench framework](/assets/images/accelerated-pytorch-inference/fg1.png){:style="width:100%"} - -_**Image 1**: PyTorch model inference performance improvement with torch.compile on AWS Graviton3-based c7g instance using TorchBench framework. The reference eager mode performance is marked as 1.0. (higher is better)_ - -Similar to the preceding TorchBench inference performance graph, we started with measuring the Hugging Face NLP model inference latency, in msec, for the eager mode, which is marked 1.0 with a red dotted line in the following graph. Then we compared the improvements from torch.compile for the same model inference, the normalized results are plotted in the graph. You can see that for the 33 models we benchmarked, there is around 2x performance improvement (geomean for the 33 models). - - -![Hugging Face NLP model inference performance improvement with torch.compile on AWS Graviton3-based c7g instance using Hugging Face example scripts](/assets/images/accelerated-pytorch-inference/fg2.png){:style="width:100%"} - -_**Image 2**: Hugging Face NLP model inference performance improvement with torch.compile on AWS Graviton3-based c7g instance using Hugging Face example scripts. The reference eager mode performance is marked as 1.0. (higher is better)_ - - -## Running an inference - -Starting with PyTorch 2.3.1, the optimizations are available in the torch Python wheel and in AWS Graviton PyTorch DLC. This section shows how to run inference in eager and torch.compile modes using torch Python wheels and benchmarking scripts from Hugging Face and TorchBench repos. - -To successfully run the scripts and reproduce the speedup numbers mentioned in this post, you need an instance from the Graviton3 family (`c7g/r7g/m7g/hpc7g`) of hardware. For this post, we used the [c7g.4xl (16 vcpu) instance](https://aws.amazon.com/ec2/instance-types/c7g/). The instance, the AMI details, and the required torch library versions are mentioned in the following snippet. - -``` -Instance: c7g.4xl instance -Region: us-west-2 -AMI: ami-05cc25bfa725a144a (Ubuntu 22.04/Jammy with 6.5.0-1017-aws kernel) - -# Install Python -sudo apt-get update -sudo apt-get install -y python3 python3-pip - -# Upgrade pip3 to the latest version -python3 -m pip install --upgrade pip - -# Install PyTorch and extensions -python3 -m pip install torch==2.3.1 torchvision==0.18.1 torchaudio==2.3.1 -``` - -The generic runtime tunings implemented for eager mode inference are equally applicable for the torch.compile mode, so, we set the following environment variables to further improve the torch.compile performance on AWS Graviton3 processors. - -``` -# Enable the fast math GEMM kernels, to accelerate fp32 inference with bfloat16 gemm -export DNNL_DEFAULT_FPMATH_MODE=BF16 - -# Enable Linux Transparent Huge Page (THP) allocations, -# to reduce the tensor memory allocation latency -export THP_MEM_ALLOC_ENABLE=1 - -# Set LRU Cache capacity to cache the primitives and avoid redundant -# memory allocations -export LRU_CACHE_CAPACITY=1024 -``` - -#### TORCHBENCH BENCHMARKING SCRIPTS - -TorchBench is a collection of open source benchmarks used to evaluate PyTorch performance. We benchmarked 45 models using the scripts from the TorchBench repo. 
Following code shows how to run the scripts for the eager mode and the compile mode with inductor backend. - - -``` -# Set OMP_NUM_THREADS to number of vcpus, 16 for c7g.4xl instance -export OMP_NUM_THREADS=16 - -# Install the dependencies -sudo apt-get install -y libgl1-mesa-glx -sudo apt-get install -y libpangocairo-1.0-0 -python3 -m pip install psutil numpy transformers pynvml numba onnx onnxruntime scikit-learn timm effdet gym doctr opencv-python h5py==3.10.0 python-doctr - -# Clone pytorch benchmark repo -git clone https://github.com/pytorch/benchmark.git -cd benchmark -# PyTorch benchmark repo doesn't have any release tags. So, -# listing the commit we used for collecting the performance numbers -git checkout 9a5e4137299741e1b6fb7aa7f5a6a853e5dd2295 - -# Setup the models -python3 install.py - -# Colect eager mode performance using the following command. The results will be -# stored at .userbenchmark/cpu/metric-.json. -python3 run_benchmark.py cpu --model BERT_pytorch,hf_Bert,hf_Bert_large,hf_GPT2,hf_Albert,hf_Bart,hf_BigBird,hf_DistilBert,hf_GPT2_large,dlrm,hf_T5,mnasnet1_0,mobilenet_v2,mobilenet_v3_large,squeezenet1_1,timm_efficientnet,shufflenet_v2_x1_0,timm_regnet,resnet50,soft_actor_critic,phlippe_densenet,resnet152,resnet18,resnext50_32x4d,densenet121,phlippe_resnet,doctr_det_predictor,timm_vovnet,alexnet,doctr_reco_predictor,vgg16,dcgan,yolov3,pytorch_stargan,hf_Longformer,timm_nfnet,timm_vision_transformer,timm_vision_transformer_large,nvidia_deeprecommender,demucs,tts_angular,hf_Reformer,pytorch_CycleGAN_and_pix2pix,functorch_dp_cifar10,pytorch_unet --test eval --metrics="latencies,cpu_peak_mem" - -# Collect torch.compile mode performance with inductor backend -# and weights pre-packing enabled. The results will be stored at -# .userbenchmark/cpu/metric-.json -python3 run_benchmark.py cpu --model BERT_pytorch,hf_Bert,hf_Bert_large,hf_GPT2,hf_Albert,hf_Bart,hf_BigBird,hf_DistilBert,hf_GPT2_large,dlrm,hf_T5,mnasnet1_0,mobilenet_v2,mobilenet_v3_large,squeezenet1_1,timm_efficientnet,shufflenet_v2_x1_0,timm_regnet,resnet50,soft_actor_critic,phlippe_densenet,resnet152,resnet18,resnext50_32x4d,densenet121,phlippe_resnet,doctr_det_predictor,timm_vovnet,alexnet,doctr_reco_predictor,vgg16,dcgan,yolov3,pytorch_stargan,hf_Longformer,timm_nfnet,timm_vision_transformer,timm_vision_transformer_large,nvidia_deeprecommender,demucs,tts_angular,hf_Reformer,pytorch_CycleGAN_and_pix2pix,functorch_dp_cifar10,pytorch_unet --test eval --torchdynamo inductor --freeze_prepack_weights --metrics="latencies,cpu_peak_mem" -``` - -On successful completion of the inference runs, the script stores the results in JSON format. The following is the sample output: - -``` -{ - "name": "cpu" - "environ": { - "pytorch_git_version": "d44533f9d073df13895333e70b66f81c513c1889" - }, - - "metrics": { - "BERT_pytorch-eval_latency": 56.3769865, - "BERT_pytorch-eval_cmem": 0.4169921875 - } -} -``` - -#### HUGGING FACE BENCHMARKING SCRIPTS - -Google T5 Small Text Translation model is one of the around 30 Hugging Face models we benchmarked. We’re using it as a sample model to demonstrate how to run inference in eager and compile modes. The additional configurations and APIs required to run it in compile mode are highlighted in **BOLD**. Save the following script as `google_t5_small_text_translation.py`. - -
        import argparse
        -from transformers import T5Tokenizer, T5Model
        -import torch
        -from torch.profiler import profile, record_function, ProfilerActivity
        -import torch._inductor.config as config
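-# Inductor options used for the compile-mode run: pre-pack weights and freeze the graph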
        -config.cpp.weight_prepack=True
        -config.freezing=True
        -
        -def test_inference(mode, num_iter):
        -    tokenizer = T5Tokenizer.from_pretrained("t5-small")
        -    model = T5Model.from_pretrained("t5-small")
        -
        -    input_ids = tokenizer(
        -        "Studies have been shown that owning a dog is good for you", return_tensors="pt"
        -    ).input_ids  # Batch size 1
        -    decoder_input_ids = tokenizer("Studies show that", return_tensors="pt").input_ids  # Batch size 1
        -
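-    # Compile mode only: wrap the model with torch.compile (TorchInductor is the default backend)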
        -    if (mode == 'compile'):
        -        model = torch.compile(model)
        -
        -    with torch.no_grad():
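-        # Warm-up iterations; in compile mode the first calls also trigger graph compilation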
        -        for _ in range(50):
        -            outputs = model(input_ids=input_ids, decoder_input_ids=decoder_input_ids)
        -
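-        # Profile CPU time for the measured iterations only (the warm-up above is excluded)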
        -        with profile(activities=[ProfilerActivity.CPU]) as prof:
        -            with record_function("model_inference"):
        -                for _ in range(num_iter):
        -                    outputs = model(input_ids=input_ids, decoder_input_ids=decoder_input_ids)
        -
        -    print(prof.key_averages().table(sort_by="self_cpu_time_total"))
        -
        -def main() -> None:
        -    global m, args
        -    parser = argparse.ArgumentParser(__doc__)
        -    parser.add_argument(
        -        "-m",
        -        "--mode",
        -        choices=["eager", "compile"],
        -        default="eager",
        -        help="Which test to run.",
        -    )
        -    parser.add_argument(
        -        "-n",
        -        "--number",
        -        type=int,
        -        default=100,
        -        help="how many iterations to run.",
        -    )
        -    args = parser.parse_args()
        -    test_inference(args.mode, args.number)
        -
        -if __name__ == "__main__":
        -    main()
        -
        - -Run the script with the following steps: - -``` -# Set OMP_NUM_THREADS to number of vcpus to 4 because -# the scripts are running inference in sequence, and -# they don't need large number of vcpus -export OMP_NUM_THREADS=4 - -# Install the dependencies -python3 -m pip install transformers - -# Run the inference script in Eager mode -# using number of iterations as 1 just to show the torch profiler output -# but for the benchmarking, we used 1000 iterations. -python3 google_t5_small_text_translation.py -n 1 -m eager - -# Run the inference script in torch compile mode -python3 google_t5_small_text_translation.py -n 1 -m compile -``` - -On successful completion of the inference runs, the script prints the torch profiler output with the latency breakdown for the torch operators. The following is the sample output from torch profiler: - -``` -# Torch profiler output for the eager mode run on c7g.xl (4vcpu) ------------------------- ------------ ------------ ------------ ------------ ------------ ------------ - Name Self CPU % Self CPU CPU total % CPU total CPU time avg # of Calls ------------------------- ------------ ------------ ------------ ------------ ------------ ------------ - aten::mm 40.71% 12.502ms 40.71% 12.502ms 130.229us 96 - model_inference 26.44% 8.118ms 100.00% 30.708ms 30.708ms 1 - aten::bmm 6.85% 2.102ms 9.47% 2.908ms 80.778us 36 - aten::matmul 3.73% 1.146ms 57.26% 17.583ms 133.205us 132 - aten::select 1.88% 576.000us 1.90% 583.000us 0.998us 584 - aten::transpose 1.51% 464.000us 1.83% 563.000us 3.027us 186 ------------------------- ------------ ------------ ------------ ------------ ------------ ------------------- -Self CPU time total: 30.708ms - -# Torch profiler output for the compile mode run for the same model on the same instance ---------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ - Name Self CPU % Self CPU CPU total % CPU total CPU time avg # of Calls ---------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ - mkldnn::_linear_pointwise 37.98% 5.461ms 45.91% 6.602ms 68.771us 96 - Torch-Compiled Region 29.56% 4.251ms 98.53% 14.168ms 14.168ms 1 - aten::bmm 14.90% 2.143ms 21.73% 3.124ms 86.778us 36 - aten::select 4.51% 648.000us 4.62% 665.000us 1.155us 576 - aten::view 3.29% 473.000us 3.29% 473.000us 1.642us 288 - aten::empty 2.53% 364.000us 2.53% 364.000us 3.165us 115 ---------------------------------- ------------ ------------ ------------ ------------ ------------ -------------------- -Self CPU time total: 14.379ms -``` - -## Technical deep dive: What are the challenges and optimization details - -Underpinning torch.compile are new technologies – TorchDynamo, AOTDispatcher, and TorchInductor. - -**TorchDynamo** captures PyTorch programs safely using Python Frame Evaluation Hooks -**AOTDispatcher** overloads PyTorch’s autograd engine as a tracing autodiff for generating ahead-of-time backward traces. -**TorchInductor** is a deep learning compiler that generates fast code for multiple accelerators and backends. - -![The PyTorch compilation process source](/assets/images/accelerated-pytorch-inference/fg3.png){:style="width:100%"} - -_**Image 3**: The PyTorch compilation process_ - -When torch.compile is invoked, torch dynamo rewrites Python bytecode to extract sequences of PyTorch operations into an [FX](https://pytorch.org/docs/stable/fx.html) [Graph](https://pytorch.org/docs/stable/fx.html), which is then compiled with inductor backend. 
For a typical inference scenario where the graph is frozen and gradient calculations are disabled, the inductor invokes platform specific optimizations like graph rewrite into more performant operators, operator fusion, and weights pre-packing. - -However, on Graviton3, the inductor wasn’t able to perform any of those optimizations because there was no aarch64 backend defined. To fix this, we extended the inductor’s FX passes to pick oneDNN operators for linear layer compilation on Graviton3 processors with ACL backend. The code snippet for this follows: - -``` -packed_weight_op = ( - mkldnn._reorder_linear_weight - if (is_bf16_weight or mkldnn._is_mkldnn_acl_supported()) - -packed_linear_inputs: Tuple[Any, ...] = (input, packed_weight_node) -if is_bf16_weight or mkldnn._is_mkldnn_acl_supported(): - packed_linear_inputs += (bias, "none", [], "") - packed_linear_op = mkldnn._linear_pointwise.default -``` - -After this was done, the FX pass was successful in compiling the `matmul `operators to `linear_pointwise `. The following snippet highlights the matmul operator in the original model: - -``` - %attention_scores : [num_users=1] = call_function[target=torch.matmul](args = (%query_layer, %transpose), kwargs = {}) - %attention_scores_1 : [num_users=1] = call_function[target=operator.truediv](args = (%attention_scores, 8.0), kwargs = {}) - %attention_scores_2 : [num_users=1] = call_function[target=operator.add](args = (%attention_scores_1, %extended_attention_mask_3), kwargs = {}) - ``` - -The following snippet highlights the linear_pointwise operator in the compiled graph: - -``` -%_linear_pointwise_default_140 : [num_users=2] = call_function[target=torch.ops.mkldnn._linear_pointwise.default](args = (%add_7, %_frozen_param278, %_frozen_param16, none, [], ), kwargs = {}) -%mul_5 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%_linear_pointwise_default_140, 0.5), kwargs = {}) -%mul_6 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%_linear_pointwise_default_140, 0.7071067811865476), kwargs = {}) -%erf : [num_users=1] = call_function[target=torch.ops.aten.erf.default](args = (%mul_6,), kwargs = {}) -%add_8 : [num_users=1] = call_function[target=torch.ops.aten.add.Tensor](args = (%erf, 1), kwargs = {}) -``` - -This completes the torch inductor changes required to compile the graph into optimized operators on AWS Graviton3 processors. Next comes the actual inference where the compiled graph is dispatched to be run. OneDNN with ACL was the backend we chose during the inductor compilation, so, the new operators were dispatched to oneDNN as expected, for example, `mkldnn._linear_pointwise`. However, due to gaps in oneDNN ACL primitives, the operators were run with C++ reference kernels instead of the optimized ACL kernels. Hence, the compile performance was still significantly behind the eager mode performance. - -There were mainly three areas where oneDNN ACL primitives lack support for torch.compile mode. The following section talks about them in detail. - -**1. ACL primitives didn’t have support for weights in blocked layout** - -ACL primitives originally designed for eager mode supported weights only in the standard channels last ([NHWC](https://oneapi-src.github.io/oneDNN/dev_guide_understanding_memory_formats.html#nhwc)) format, without any pre-packing. 
Whereas weights pre-packing into blocked layout is one of the main optimizations in the inductor compilation passes where the weights are reordered into blocks specific to the runtime platform. This avoids the redundant and on-the-fly reorders when running the General Matrix Multiplication (GEMM), which otherwise would be the bottleneck for inference performance. But the ACL primitives didn’t have support for blocked layout and hence the operators were run with oneDNN C++ reference kernels instead. - -**2. Mixed precision primitives weren’t supported in oneDNN** - -AWS Graviton3 processors support [bfloat16 MMLA instructions](https://developer.arm.com/documentation/ddi0596/2020-12/SVE-Instructions/BFMMLA--BFloat16-floating-point-matrix-multiply-accumulate-) which can be used to accelerate fp32 inference with bfloat16 GEMM as a mixed precision compute. ACL supports bfloat16 mixed precision GEMM kernels, which are integrated into oneDNN as a fast math compute option for the existing fp32 operators. However, the fast math approach didn’t work for compile mode because of weights pre-packing optimization. The compile mode requires explicit mixed precision primitive implementation in oneDNN in order to use bfloat16 acceleration. - -**3. ACL primitives didn’t support fused kernels for some of the activation functions** - -In eager mode, operators are dispatched individually because the model is run independently as soon as it’s reached. Whereas in compile mode, operator fusion is another important optimization where the operators are fused for runtime efficiency. For example, Gaussian Error Linear Unit ([GELU](https://arxiv.org/pdf/1606.08415.pdf#%3A~%3Atext%3DWe%20propose%20the%20Gaussian%20Error%2Cstandard%20Gaussian%20cumulative%20distribution%20function)) is one of the most widely used activation functions in transformers-based neural network architectures. So, it’s typical to have a linear layer (with matrix multiplications) followed by GELU activation. As part of compiling the model into efficient operators, the torch inductor fuses matmul and GELU into a single linearpointwise+gelu operator. However, oneDNN ACL primitives didn’t have support for fused kernels with GELU. - -We addressed these gaps by extending oneDNN primitives to handle the additional layouts and new primitive definitions. The following sections talk about the optimizations in detail. - - -### Optimization 1: Extended ACL primitives to accept weight tensors in blocked layout - -We extended the ACL primitives to accept blocked layout in addition to the standard NHWC format. The code snippet for this is as follows: - -``` -const bool is_weights_md_format_ok - = utils::one_of(weights_format_kind_received, - format_kind::any, format_kind::blocked); - - -const memory_desc_t weights_md_received = weights_md_; -acl_utils::reorder_to_weight_format(aip.wei_tensor_info, - weights_md_, expected_weight_format, inner_dim, o_dim, - remaining_dims, {}); - -ACL_CHECK_SUPPORT( - (weights_format_kind_received == format_kind::blocked) - && !(dnnl_memory_desc_equal( - &weights_md_received, &weights_md_)), - "specified blocked format not supported by ACL, use " - "format_kind_t::any to find a supported blocked format for " - "your platform"); -``` - -### Optimization 2: Defined new ACL primitives to handle mixed precision operators (weights in bfloat16 and activations in fp32) - -We defined mixed precision primitive definitions and updated the existing oneDNN ACL fp32 primitives to handle bfloat16 tensors.
- -``` -{% raw %} /* With graph compilation, we are able to reorder and pre-pack the weights during the model load - * and compilation phase itself so that redundant and on-the-fly reorders can be avoided. - * This primitive definition is to support gemm fastmath mode for the compile scenario where src is - * in fp32 and weights are in bf16 - */ - {{forward, f32, bf16, f32}, { - CPU_INSTANCE_AARCH64_ACL(acl_inner_product_fwd_t) - nullptr, - }},{% endraw %} -``` - -### Optimization 3: Disabled operator fusion pass in torch inductor - -We bypassed the operator fusion pass in torch inductor so that the compiled graph doesn’t contain GELU fused operators. This is a temporary solution to enable ACL kernels in torch.compile. There is a work in progress to enable operator fusion pass for the future PyTorch releases. With this workaround, we were able to successfully dispatch the linear layer to ACL. As shown in the following torch.profiler output, the `aten::addmm `(one of the variants of the matmul operator) and `aten::gelu `in the original model (as highlighted in _Image 4_) was compiled to `mkldnn::_linear_pointwise `without `gelu `operator fusion (as highlighted in _Image 5_). - -``` ---------------------------- ------------ ------------ ------------ ------------ ------------ ------------ - Name Self CPU % Self CPU CPU total % CPU total CPU time avg # of Calls ---------------------------- ------------ ------------ ------------ ------------ ------------ ------------ - aten::addmm 73.32% 46.543ms 74.49% 47.287ms 647.767us 73 - model_inference 9.92% 6.296ms 100.00% 63.479ms 63.479ms 1 - aten::bmm 4.37% 2.776ms 5.46% 3.467ms 144.458us 24 - aten::copy_ 1.74% 1.102ms 1.74% 1.102ms 8.103us 136 - aten::gelu 1.50% 950.000us 1.50% 950.000us 79.167us 12 -``` - -_**Image 4**: torch.profiler output for Hugging Face bert base model inference in Eager mode, showing addmm and gelu operators_ -
         
        - -``` ------------------------------------------------------ ------------ ------------ ------------ ------------ ------------ ------------ - Name Self CPU % Self CPU CPU total % CPU total CPU time avg # of Calls ------------------------------------------------------ ------------ ------------ ------------ ------------ ------------ ------------ - mkldnn::_linear_pointwise 53.61% 15.529ms 57.53% 16.665ms 228.288us 73 - Torch-Compiled Region 36.95% 10.705ms 99.31% 28.769ms 28.769ms 1 - aten::_scaled_dot_product_flash_attention_for_cpu 3.67% 1.064ms 4.43% 1.284ms 107.000us 12 - aten::view 1.97% 572.000us 1.97% 572.000us 2.509us 228 - aten::empty 1.38% 399.000us 1.38% 399.000us 3.270us 122 -``` - - -_**Image 5**: torch.profiler output for Hugging Face Bert base model inference in torch.compile mode, showing linear_pointwise operator without gelu fusion_ - -Lastly, the `gelu `operator was compiled into `erf `(error function) and was dispatched to an inductor auto vectorization backend. The following snippets show the `erf `operator in the compiled graph and running it using `libm.so`. - -``` -%_linear_pointwise_default_140 : [num_users=2] = call_function[target=torch.ops.mkldnn._linear_pointwise.default](args = (%add_7, %_frozen_param278, %_frozen_param16, none, [], ), kwargs = {}) -%mul_5 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%_linear_pointwise_default_140, 0.5), kwargs = {}) -%mul_6 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%_linear_pointwise_default_140, 0.7071067811865476), kwargs = {}) -%erf : [num_users=1] = call_function[target=torch.ops.aten.erf.default](args = (%mul_6,), kwargs = {}) -%add_8 : [num_users=1] = call_function[target=torch.ops.aten.add.Tensor](args = (%erf, 1), kwargs = {}) -``` - -_**Image 6**: snippet after post grad pass showing erf function in the compiled graph_ -
         
        - -``` - 0.82% 0.40% python3 libm.so.6 [.] erff32 - 0.05% 0.00% python3 libtorch_python.so [.] torch::autograd::THPVariable_erf - 0.05% 0.00% python3 libtorch_cpu.so [.] at::_ops::erf::call -``` - -_**Image 7**: Linux perf report showing erf dispatch to libm.so_ - -With this work, we were able to optimize `torch.compile `performance on Graviton3 processors by using inductor graph compilation along with the oneDNN+ACL backend. - - -### TorchBench enhancements - -To demonstrate the torch.compile performance improvements on AWS Graviton3 processors, we extended TorchBench framework to add a new argument to enable graph freeze and weights pre-packing and disable torch auto grad for eval test mode. The code snippet for this is as follows: - -``` -parser.add_argument( - "—freeze_prepack_weights", - action='store_true', - help="set to freeze the graph and prepack weights", - ) - -if args.freeze_prepack_weights: - torch._inductor.config.freezing=True - torch._inductor.config.cpp.weight_prepack=True -``` - -_**Image 8**: Added freeze_prepack_weights option for torchdynamo backend in TorchBench to demonstrate torch.compile performance improvements on AWS Graviton3 processors_ - -We have upstreamed all the optimizations, and starting with PyTorch 2.3.1, these are supported in torch Python wheels and AWS Graviton PyTorch DLC. - - -## What’s next - -Next, we’re extending the torch inductor CPU backend support to compile Llama model, and adding support for fused GEMM kernels to enable torch inductor operator fusion optimization on AWS Graviton3 processors. - - -## Conclusion - -In this tutorial, we covered how we optimized torch.compile performance on AWS Graviton3-based EC2 instances, how to use the optimizations to improve PyTorch model inference performance, and demonstrated the resulting speedups. We hope that you will give it a try! If you need any support with ML software on Graviton, please open an issue on the AWS Graviton Technical Guide [GitHub](https://github.com/aws/aws-graviton-getting-started). - - -## Acknowledgements - -We would like to thank the PyTorch community for the baseline torch.compile framework and their continued efforts to optimize it further. - -References: [https://pytorch.org/assets/pytorch2-2.pdf](https://pytorch.org/assets/pytorch2-2.pdf) - - -## Author - -Sunita Nadampalli is a Software Development Manager and AI/ML expert at AWS. She leads AWS Graviton software performance optimizations for AI/ML and HPC workloads. She is passionate about open source software development and delivering high-performance and sustainable software solutions for SoCs based on the Arm ISA. diff --git a/_posts/2024-07-10-develop-android-applications.md b/_posts/2024-07-10-develop-android-applications.md deleted file mode 100644 index 20f33e2b428f..000000000000 --- a/_posts/2024-07-10-develop-android-applications.md +++ /dev/null @@ -1,41 +0,0 @@ ---- -layout: blog_detail -title: "Learn how to develop Android applications with ExecuTorch and Llama models" -author: Arm ---- -_This blog is courtesy of the PyTorch team at Arm. More details can be found [here](https://learn.arm.com/learning-paths/smartphones-and-mobile/build-llama3-chat-android-app-using-executorch-and-xnnpack/?utm_source=twitter&utm_medium=social-organic&utm_content=landingpage&utm_campaign=mk24_developer_na)._ - -Arm’s compute platform is delivering GenAI applications on phones, laptops, and servers. 
Cost, privacy, performance, security, and energy efficiency are just some of the reasons developers are investigating on-device AI. - -A new Learning Path explaining how to leverage the capabilities of large language models (LLMs) on Android using ExecuTorch and XNNPACK is now available. - -Here's a summary of what you'll learn: - -* Development Environment setup - - The Learning Path begins by guiding you through setting up your development environment, ensuring you have all the necessary tools installed, including Android Studio, the Android NDK, Java JDK, and Python. - -* ExecuTorch and XNNPACK - - You'll learn about the core technologies: ExecuTorch, a framework for deploying PyTorch models to edge devices, and XNNPACK, a high-performance library for executing neural networks on Arm-based platforms. - -* Llama models - - The Learning Path explores Llama, a family of powerful LLMs, focusing specifically on the 8B Llama 3 model. You'll learn about quantization techniques, which are essential for optimizing model size and performance on mobile devices. - -* Prepare Llama models for ExecuTorch - - You'll be guided through the process of downloading, exporting, and evaluating Llama models, ensuring they are ready for deployment using ExecuTorch. - -* Check model performance on Android - - The Learning Path walks you through cross-compiling the Llama runner binary for Android, allowing you to test your model's performance on your phone. - -* Build and run an Android Chat App - - Finally, you'll learn how to build a native Android chat app using the `LlamaDemo` application from the ExecuTorch repository. This hands-on experience allows you to put your knowledge into practice and create a real-world application. - - -Explore this Learning Path if you want to learn how to leverage the power of LLMs on your Android phone, and gain expertise in tools for on-device machine learning. - -Dig into the excitement of building Android chat apps and understand more about how they work on the [Arm Developer Hub](https://learn.arm.com/learning-paths/smartphones-and-mobile/build-llama3-chat-android-app-using-executorch-and-xnnpack/?utm_source=twitter&utm_medium=social-organic&utm_content=landingpage&utm_campaign=mk24_developer_na). diff --git a/_posts/2024-07-11-flashattention-3.md b/_posts/2024-07-11-flashattention-3.md deleted file mode 100644 index 5768e0cadec6..000000000000 --- a/_posts/2024-07-11-flashattention-3.md +++ /dev/null @@ -1,132 +0,0 @@ ---- -layout: blog_detail -title: "FlashAttention-3: Fast and Accurate Attention with Asynchrony and Low-precision" -author: Jay Shah and Ganesh Bikshandi, Colfax Research, Ying Zhang, Meta, Vijay Thakkar and Pradeep Ramani, NVIDIA, Tri Dao, TogetherAI and Princeton University ---- - -Attention, as a core layer of the ubiquitous Transformer architecture, is a bottleneck for large language models and long-context applications. FlashAttention (and FlashAttention-2) pioneered an approach to speed up attention on GPUs by minimizing memory reads/writes, and is now used by most [libraries](https://pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html) to accelerate Transformer training and inference. This has contributed to a massive increase in LLM context length in the last two years, from 2-4K (GPT-3, OPT) to 128K (GPT-4), or even 1M ([Llama 3](https://huggingface.co/gradientai/Llama-3-8B-Instruct-Gradient-1048k)). 
However, despite its success, FlashAttention has yet to take advantage of new capabilities in modern hardware, with FlashAttention-2 achieving only 35% utilization of theoretical max FLOPs on the H100 GPU. In this blogpost, we describe three main techniques to speed up attention on Hopper GPUs: exploiting asynchrony of the Tensor Cores and TMA to (1) overlap overall computation and data movement via warp-specialization and (2) interleave block-wise matmul and softmax operations, and (3) incoherent processing that leverages hardware support for FP8 low-precision. - -We’re excited to release FlashAttention-3 that incorporates these techniques. It’s 1.5-2.0x faster than FlashAttention-2 with FP16, up to 740 TFLOPS, i.e., 75% utilization of H100 theoretical max FLOPS. With FP8, FlashAttention-3 reaches close to 1.2 PFLOPS, with 2.6x smaller error than baseline FP8 attention. - -FlashAttention-3 is available at: [https://github.com/Dao-AILab/flash-attention](https://github.com/Dao-AILab/flash-attention) -[Paper](https://tridao.me/publications/flash3/flash3.pdf) - - -## FlashAttention Recap - -[FlashAttention](https://arxiv.org/abs/2205.14135) is an algorithm that reorders the attention computation and leverages tiling and recomputation to significantly speed it up and reduce memory usage from quadratic to linear in sequence length. We use tiling to load blocks of inputs from HBM (GPU memory) to SRAM (fast cache), perform attention with respect to that block, and update the output in HBM. By not writing the large intermediate attention matrices to HBM, we reduce the amount of memory reads/writes, which brings 2-4x wallclock time speedup. - -Here we show a diagram of FlashAttention forward pass: with tiling and softmax rescaling, we operate by blocks and avoid having to read/write from HBM, while obtaining the correct output with no approximation. - -![math equations](/assets/images/flashattention-3/fg1.png){:style="width:100%"} - - -## New hardware features on Hopper GPUs - WGMMA, TMA, FP8 - -While FlashAttention-2 can achieve up to 70% theoretical max FLOPS on Ampere (A100) GPUs, it does not yet take advantage of new features on Hopper GPUs to maximize performance. We describe some of the new Hopper-specific features here, and why they are important. - - - -1\. WGMMA (Warpgroup Matrix Multiply-Accumulate). This new feature makes use of the new Tensor Cores on Hopper, with much higher throughput[^1] than the older mma.sync instruction in Ampere (image from the [H100 white paper)](https://resources.nvidia.com/en-us-tensor-core/gtc22-whitepaper-hopper?ncid=no-ncid). - -![image from the H100 white paper](/assets/images/flashattention-3/fg2.png){:style="width:100%"} - - -2\. TMA (Tensor Memory Accelerator). This is a special hardware unit that accelerates the transfer of data between global memory and shared memory, taking care of all index calculation and out-of-bound predication. This frees up registers, which is a valuable resource to increase tile size and efficiency. - -![block diagram](/assets/images/flashattention-3/fg3.png){:style="width:100%"} - - -3\. Low-precision with FP8. This doubles the Tensor Core throughput (e.g. 989 TFLOPS with FP16 and 1978 TFLOPS with FP8), but trades off accuracy by using fewer bits to represent floating point numbers. 
- - -![6x throughput](/assets/images/flashattention-3/fg4.png){:style="width:100%"} - - - -FlashAttention-3 makes use of all of these new features of Hopper, using powerful abstractions from [NVIDIA’s CUTLASS](https://github.com/NVIDIA/cutlass) library. \ - \ -By rewriting FlashAttention to use these new features, we can already significantly speed it up (e.g., from 350 TFLOPS in FlashAttention-2 FP16 forward pass to around 540-570 TFLOPS). However, the asynchronous nature of the new instructions on Hopper (WGMMA and TMA) opens up additional algorithmic opportunities to overlap operations and thereby extract even greater performance. For this blogpost, we’ll explain two such techniques specific to attention. The generic technique of warp specialization, with separate producer and consumer warps doing TMA and WGMMA, is [well-covered elsewhere](https://github.com/NVIDIA/cutlass/blob/main/media/docs/efficient_gemm.md#warp-specialization) in the context of GEMM and works the same here. - - -## Asynchrony: Overlapping GEMM and Softmax - -Why overlap? - -Attention has GEMMs (those matmuls between Q and K and between attention probability P and V) and softmax as its two main operations. Why do we need to overlap them? Isn’t most of the FLOPS in the GEMMs anyway? As long as the GEMMs are fast (e.g., computed using WGMMA instructions), shouldn’t the [GPU be going brrrr](https://horace.io/brrr_intro.html)? - -The problem is that non-matmul operations are much slower than matmul operations on modern accelerators. Special functions such as exponential (for the softmax) have even lower throughput than floating point multiply-add; they are evaluated by the multi-function unit, a unit separate from floating point multiply-add or matrix multiply-add. As an example, the H100 GPU SXM5 has 989 TFLOPS of FP16 matrix multiply, but only 3.9 TFLOPS (256x less throughput) for special functions[^2]! For head dimension 128, there are 512x more matmul FLOPS than exponential, which means that exponential can take 50% of the time compared to matmul. The situation is even worse for FP8, where the matmul FLOPS are twice as fast yet exponential FLOPS stay the same speed. Ideally we want matmul and softmax to operate in parallel. While the Tensor Cores are busy with matmul, the multi-function units should be calculating exponential! - - -### Inter-warpgroup overlapping with pingpong scheduling - -The first and easiest way to overlap GEMM and softmax is to do nothing at all! The warp schedulers already try to schedule warps so that if some warps are blocked (e.g., waiting for GEMM results), other warps can run. That is, the warp schedulers do some of this overlapping for us, for free. - -However, we can improve on this by doing some of the scheduling manually. As an example, if we have 2 warpgroups (labeled 1 and 2 – each warpgroup is a group of 4 warps), we can use synchronization barriers (bar.sync) so that warpgroup 1 first does its GEMMs (e.g., GEMM1 of one iteration and GEMM0 of the next iteration), and then warpgroup 2 does its GEMMs while warpgroup 1 does its softmax, and so on. This “pingpong” schedule is illustrated in the figure below, where the same color denotes the same iteration. - - -![block chart](/assets/images/flashattention-3/fg5.png){:style="width:100%"} - - -This would allow us to perform the softmax in the shadow of the GEMMs of the other warpgroup. Of course, this figure is just a caricature; in practice the scheduling is not really this clean. 
Nevertheless, pingpong scheduling can improve FP16 attention forward pass from around 570 TFLOPS to 620 TFLOPS (head dim 128, seqlen 8K). - - -### Intra-warpgroup overlapping of GEMM and Softmax - -Even within one warpgroup, we can have some part of softmax running while the GEMMs of that warpgroup is running. This is illustrated in this figure, where the same color denotes the same iteration. - - -![block chart](/assets/images/flashattention-3/fg6.png){:style="width:100%"} - - -This pipelining increases throughput from around 620 TFLOPS to around 640-660 TFLOPS for FP16 attention forward, at the cost of higher register pressure. We need more registers to hold both accumulators of the GEMMs, and the input/output of softmax. Overall, we find this technique to offer a favorable tradeoff. - - -## Low-precision: reduce quantization error with incoherent processing - -LLM activation can have [outliers](https://arxiv.org/abs/2208.07339) with much larger magnitude than the rest of the features. These outliers make it difficult to quantize, producing much larger quantization errors. We leverage incoherent processing, a technique used in the quantization literature (e.g. from [QuIP](https://arxiv.org/abs/2307.13304)) that multiplies the query and key with a random orthogonal matrix to “spread out” the outliers and reduce quantization error. In particular, we use the Hadamard transform (with random signs), which can be done per attention head in O(d log d) instead of O(d^2) time, where d is the head dimension. Since the Hadamard transform is memory-bandwidth bound, it can be fused with previous operations such as rotary embedding (also memory-bandwidth bound) “for free”. - -In our experiment where Q, K, V are generated from a standard normal distribution but 0.1% of the entries have large magnitudes (to simulate outliers), we found that incoherent processing can reduce the quantization error by 2.6x. We show numerical error comparison in the table below. Please see the paper for details. - - -![text diagram](/assets/images/flashattention-3/fg6a.png){:style="width:100%"} - - - -## Attention benchmark - -We show some results with FlashAttention-3, and compare it to FlashAttention-2, as well as the implementation in Triton and cuDNN (both of which already use new hardware features of Hopper GPUs). - -For FP16, we see about 1.6x-1.8x speedup over FlashAttention-2 - - -![speed charts](/assets/images/flashattention-3/fg7.png){:style="width:100%"} - - -![speed charts](/assets/images/flashattention-3/fg8.png){:style="width:100%"} - -For FP8, we can reach close to 1.2 PFLOPS! - - -![speed charts](/assets/images/flashattention-3/fg9.png){:style="width:100%"} - - - -## Discussion - -This blogpost highlights some of the optimizations for FlashAttention available on Hopper GPUs. Other optimizations (e.g., variable length sequences, persistent kernel, and in-kernel transpose for FP8) are covered in the paper. - -We have seen that designing algorithms that take advantage of the hardware they run on can bring significant efficiency gains and unlock new model capabilities such as long context. We look forward to future work on optimization for LLM inference, as well as generalizing our techniques to other hardware architectures. - -We also look forward to FlashAttention-3 being integrated in a future release of PyTorch. 
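To make the incoherent processing idea above concrete, below is a small standalone sketch. It is purely illustrative: it uses a plain int8 symmetric fake-quantizer as a stand-in for FP8 and an explicit Hadamard matrix rather than a fused kernel. It checks the two properties discussed earlier: a random-sign Hadamard rotation of Q and K leaves the pre-softmax attention scores unchanged, and it noticeably shrinks the quantization error when a few entries are outliers.

```python
import torch

def hadamard(n: int) -> torch.Tensor:
    # Sylvester construction; n must be a power of two
    H = torch.ones(1, 1)
    while H.shape[0] < n:
        H = torch.cat([torch.cat([H, H], dim=1), torch.cat([H, -H], dim=1)], dim=0)
    return H

def fake_quant_int8(x: torch.Tensor) -> torch.Tensor:
    # simple per-tensor symmetric quantizer, used here only as a stand-in for FP8
    scale = x.abs().max() / 127.0
    return (x / scale).round().clamp(-127, 127) * scale

torch.manual_seed(0)
n, d = 1024, 128
q, k = torch.randn(n, d), torch.randn(n, d)
outliers = torch.rand(n, d) < 0.001           # 0.1% large-magnitude entries
q[outliers] *= 50.0
k[outliers] *= 50.0

# Random-sign Hadamard rotation: R is orthogonal, so (Q R)(K R)^T == Q K^T
signs = torch.randint(0, 2, (d,)).float() * 2 - 1
R = (torch.diag(signs) @ hadamard(d)) / d ** 0.5

ref = q @ k.T                                 # exact pre-softmax scores
err_plain = (fake_quant_int8(q) @ fake_quant_int8(k).T - ref).abs().mean().item()
err_rot = (fake_quant_int8(q @ R) @ fake_quant_int8(k @ R).T - ref).abs().mean().item()
print(f"mean abs error without rotation: {err_plain:.3f}")
print(f"mean abs error with rotation:    {err_rot:.3f}")
```

In the actual kernel the rotation is applied per attention head and fused with the preceding rotary embedding, and the quantization is FP8, but the error-reduction mechanism is the same.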
- - - -## Notes - -[^1]: - - Without the wgmma instruction, the older mma.sync instruction can only reach about ⅔ the peak throughput of Hopper Tensor Cores: https://arxiv.org/abs/2402.13499v1 - -[^2]: - The CUDA programming guide specifies that the throughput for special functions is 16 operations per streaming multiprocessor (SM) per clock cycle. We multiply 16 by 132 SMs and 1830 Mhz (clock speed used to calculate 989 TFLOPS of FP16 matmul) to get 3.9 TFLOPS \ No newline at end of file diff --git a/_posts/2024-07-22-hopper-tma-unit.md b/_posts/2024-07-22-hopper-tma-unit.md deleted file mode 100644 index 86be657240e0..000000000000 --- a/_posts/2024-07-22-hopper-tma-unit.md +++ /dev/null @@ -1,446 +0,0 @@ ---- -layout: blog_detail -title: "Deep Dive on the Hopper TMA Unit for FP8 GEMMs" -author: Adnan Hoque, Less Wright, Chih-Chieh Yang ---- - -## Abstract - -The Hopper (H100) GPU architecture, billed as the “first truly asynchronous GPU”, includes a new, fully asynchronous hardware copy engine for bulk data movement between global and shared memory called Tensor Memory Accelerator (TMA). While CUTLASS has [built-in](https://github.com/NVIDIA/cutlass/blob/56b46e2d13875b46b8f6a03f9f5ac91e2bfdc01a/include/cute/arch/copy_sm90_tma.hpp) support for TMA via its asynchronous pipeline paradigm, Triton exposes TMA support via an [experimental API](https://github.com/triton-lang/triton/blob/538556a66ee49630e1cb0b239f93e63b968b2478/python/triton/tools/experimental_descriptor.py#L25). - -In this post, we provide a deeper dive into the details of how TMA works, for developers to understand the new async copy engine. We also show the importance of leveraging TMA for H100 kernels by building a TMA enabled FP8 GEMM kernel in Triton, which delivers from 1.4-2.2x performance gains over cuBLAS FP16 for small-to-medium problem sizes. Finally, we showcase key implementation differences between Triton and CUTLASS that may account for reports of performance regressions with TMA in Triton. We open source our implementation for reproducibility and review at [https://github.com/pytorch-labs/applied-ai/tree/main/kernels](https://github.com/pytorch-labs/applied-ai/tree/main/kernels) - -![The throughput in TFLOPs of various Triton and cuBLAS FP8 and FP16 kernels, for M=M, N=4096, K=4096. The red line is the Triton TMA, which showcases the advantages of leveraging TMA.](/assets/images/hopper-tma-unit/fg1.png){:style="width:100%"} - -_**Figure 1.** The throughput in TFLOPs of various Triton and cuBLAS FP8 and FP16 kernels, for M=M, N=4096, K=4096. The red line is the Triton TMA, which showcases the advantages of leveraging TMA._ - -## TMA Background - -TMA is an H100 hardware addition that allows applications to asynchronously and bi-directionally transfer 1D-5D tensors between GPU global and shared memory. In addition, TMA can also transfer the same data to not just the calling SM’s shared memory, but to other SM’s shared memory if they are part of the same Thread Block Cluster. This is termed ‘multicast’. - -TMA is very lightweight as only a single thread is needed to kick off a TMA transfer. By moving data directly from GMEM (global) to SMEM (shared), this avoids earlier GPU requirements of using registers for moving data between different memory spaces. - -![A100-style data movement vs H100 with TMA. 
TMA hardware eliminates the need for a large amount of threads and registers participating in bulk data transfers.](/assets/images/hopper-tma-unit/fg2.png){:style="width:100%"} - - - -_**Figure 2.** A100-style data movement vs H100 with TMA. TMA hardware eliminates the need for a large amount of threads and registers participating in bulk data transfers. (Image credit Nvidia)_ - -A single thread can issue large data movement instructions, allowing the majority of a given thread block to continue working on other instructions while data is in-flight. Combined with asynchronous pipelining, this allows memory transfers to be easily hidden and ensure the majority of any given thread block cluster can focus on computational task. - -This lightweight invocation for data movement enables the creation of warp-group specialized kernels, where warp-groups take on different roles, namely producers and consumers. Producers elect a leader thread that fires off TMA requests, which are then asynchronously coordinated with the consumer (MMA) warp-groups via an arrival barrier. Consumers then process the data using warp-group MMA, and signal back to the producers when they have finished reading from the SMEM buffer and the cycle repeats. - -Further, within threadblock clusters, producers can lower their max register requirements since they are only issuing TMA calls, and effectively transfer additional registers to MMA consumers, which helps to alleviate register pressure for consumers. - -In addition, TMA handles the address computation for the shared memory destination where the data requested should be placed. This is why calling threads (producers) can be so lightweight. - -To ensure maximum read access speed, TMA can lay out the arriving data based on swizzling instructions, to ensure the arriving data can be read as fast as possible by consumers, as the swizzling pattern helps avoid shared memory bank conflicts. - -Finally for TMA instructions that are outgoing, or moving data from SMEM to GMEM, TMA can also include reduction operations (add/min/max) and bitwise (and/or) operations. - -## TMA usage in Triton - -**Pre-Hopper Load:** - - -``` -offs_m = pid_m*block_m + tl.arange(0, block_m) -offs_n = pid_n*block_n + tl.arange(0, block_n) -offs_k = tl.arange(0, block_k) - -a_ptrs = a_ptr + (offs_am[:, None]*stride_am + offs_k[None, :]*stride_ak) -b_ptrs = b_ptr + (offs_k[:, None]*stride_bk + offs_bn[None, :]*stride_bn) - -a = tl.load(a_ptrs) -b = tl.load(b_ptrs) -``` - -_**Figure 3.** Traditional style bulk load from global to shared memory in Triton_ - - -In the above Triton example showing a pre-Hopper load, we see how the data for tensors a and b are loaded by each thread block computing global offsets (a_ptrs, b_ptrs) from their relevant program_id (pid_m, pid_n, k) and then making a request to move blocks of memory into shared memory for a and b. - -Now let’s examine how to perform a load using TMA in Triton. - -The TMA instruction requires a special data structure called a tensor map, in contrast to the above where we directly pass pointers to global memory. To build the tensor map, we first create a TMA descriptor on the CPU. The descriptor handles the creation of the tensor map by using the [cuTensorMapEncode API](https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__TENSOR__MEMORY.html#group__CUDA__TENSOR__MEMORY). 
The tensor map holds metadata such as the global and shared memory layout of the tensor and serves as a compressed representation of the structure of the multi-dimensional tensor stored in global memory. - - -![TMA address generation via a copy descriptor](/assets/images/hopper-tma-unit/fg3.png){:style="width:100%"} - - -_**Figure 4.** TMA address generation via a copy descriptor (Image credit: Nvidia)_ - -The TMA descriptor holds the tensor’s key properties: - -1. Base Pointer -2. Shape and Block Size -3. Datatype - -The TMA descriptor is created on the host before the kernel, and then moved to device by passing the descriptor to a torch tensor. Thus, in Triton, the GEMM kernel receives a global pointer to the tensor map. - -## Triton Host Code - - -``` - desc_a = np.empty(TMA_SIZE, dtype=np.int8) - desc_b = np.empty(TMA_SIZE, dtype=np.int8) - desc_c = np.empty(TMA_SIZE, dtype=np.int8) - - triton.runtime.driver.active.utils.fill_2d_tma_descriptor(a.data_ptr(), m, k, block_m, block_k, a.element_size(), desc_a) - - triton.runtime.driver.active.utils.fill_2d_tma_descriptor(b.data_ptr(), n, k, block_n, block_k, b.element_size(), desc_b) - - triton.runtime.driver.active.utils.fill_2d_tma_descriptor(c.data_ptr(), m, n, block_m, block_n, c.element_size(), desc_c) - - desc_a = torch.tensor(desc_a, device='cuda') - desc_b = torch.tensor(desc_b, device='cuda') - desc_c = torch.tensor(desc_c, device='cuda') -``` - - -This is the code that is used to set up the descriptors in the kernel invoke function. - -## Triton Device Code - -**Offsets/Pointer Arithmetic:** - - -``` - offs_am = pid_m * block_m - offs_bn = pid_n * block_n - offs_k = 0 -``` - - -**Load:** - - -``` - a = tl._experimental_descriptor_load(a_desc_ptr, [offs_am, offs_k], [block_m, block_k], tl.float8e4nv) - b = tl._experimental_descriptor_load(b_desc_ptr, [offs_bn, offs_k], [block_n, block_k], tl.float8e4nv) -``` - - -**Store:** - - -``` - tl._experimental_descriptor_store(c_desc_ptr, accumulator, [offs_am, offs_bn]) -``` - - -We no longer need to calculate a pointer array for both load and store functions in the kernel. Instead, we pass a single descriptor pointer, the offsets, block size and the input datatype. This simplifies address calculation and reduces register pressure, as we no longer have to do complex pointer arithmetic in software and dedicate CUDA cores for address computation. - -## TMA Performance Analysis - -Below, we discuss the PTX instructions for different load mechanisms on Hopper. - -**PTX for Loading Tile (cp.async) - H100 no TMA** - - -``` -add.s32 %r27, %r100, %r8; -add.s32 %r29, %r100, %r9; -selp.b32 %r30, %r102, 0, %p18; - - -@%p1 cp.async.cg.shared.global [ %r27 + 0 ], [ %rd20 + 0 ], 0x10, %r30; -@%p1 cp.async.cg.shared.global [ %r29 + 0 ], [ %rd21 + 0 ], 0x10, %r30; - - -cp.async.commit_group ; -``` - - -Here, we observe the older cp.async instruction responsible for global memory copies. From the traces below we can see that both loads bypass the L1 cache. A major difference in the newer TMA load is that before tiles from A and B were ready to be consumed by the Tensor Core we would need to execute an ldmatrix instruction that operated on data contained in register files. On Hopper, the data can now be directly reused from shared memory. 
- -![H100 Memory Chart showing GMEM Throughput = 910.22 GB/s](/assets/images/hopper-tma-unit/fg4.png){:style="width:100%"} - - - -_**Figure 5.** H100 Memory Chart showing GMEM Throughput = 910.22 GB/s (Triton GEMM **without** TMA) for M=128, N=4096, K=4096_ - -By leveraging TMA through the Triton API changes we mentioned above, we can investigate the PTX that Triton generates for a single 2D tile load with TMA. - -**PTX for Loading Tile (cp.async.bulk.tensor) - H100 using TMA** - - -``` -bar.sync 0; -shr.u32 %r5, %r4, 5; -shfl.sync.idx.b32 %r66, %r5, 0, 31, -1; - -elect.sync _|%p7, 0xffffffff; - - -add.s32 %r24, %r65, %r67; -shl.b32 %r25, %r66, 7; - -@%p8 -{% raw %}cp.async.bulk.tensor.2d.shared::cluster.global.mbarrier::complete_tx::bytes [%r24], [%rd26, {%r25,%r152}], [%r19];{% endraw %} -``` - - -The cp.async.bulk.tensor.2d.shared TMA instruction is passed the destination address in shared memory, a pointer to the tensor map, the tensor map coordinates and a pointer to the mbarrier object, respectively. - - -![H100 Memory Chart GMEM Throughput =1.45 TB/s](/assets/images/hopper-tma-unit/fg5.png){:style="width:100%"} - - - -_**Figure 6.** H100 Memory Chart GMEM Throughput =1.45 TB/s (Triton GEMM **with** TMA) for M=128, N=4096, K=4096_ - -For optimal performance we tuned the TMA GEMM kernel extensively. Amongst other parameters such as tile sizes, number of warps and number of pipeline stages, the biggest increase in memory throughput was observed when we increased the TMA_SIZE (descriptor size) from 128 to 512. From the above NCU profiles, we can see that the final tuned kernel has increased global memory transfer throughput from 910 GB/s to 1.45 TB/s, a **59%** increase in GMEM throughput, over the non-TMA Triton GEMM kernel. - -**Comparison of CUTLASS and Triton FP8 GEMM and TMA Implementation - Kernel Architecture** - -![Triton vs CUTLASS Ping-Pong FP8 GEMM TFLOPs, M=M, N=4096, K=4096](/assets/images/hopper-tma-unit/fg6.png){:style="width:100%"} - - - -_**Figure 7.** Triton vs CUTLASS Ping-Pong FP8 GEMM TFLOPs, M=M, N=4096, K=4096_ - -The above chart shows the performance of a CUTLASS [Ping-Pong GEMM kernel](https://github.com/NVIDIA/cutlass/blob/637b15906358191cb4238af419d408a65819d7ec/include/cutlass/gemm/kernel/sm90_gemm_tma_warpspecialized_pingpong.hpp) against Triton. The Ping-Pong kernel leverages TMA differently than Triton. It makes use of all of its HW and SW software capabilities, while Triton currently does not. Specifically, CUTLASS supports the below TMA features that help explain the performance gaps in pure GEMM performance:. - - -1. TMA Multicast - - - Enables copy of data from GMEM to multiple SMs - -2. Warp Specialization - - - Enables warp groups within a threadblock to take on different roles - -3. Tensor Map (TMA Descriptor) Prefetch - - - Enables prefetching the Tensor Map object from GMEM, which allows pipelining of TMA loads - - -To put the performance numbers in perspective, below we show a ‘speed-up’ chart highlighting the latency differences on a percentage basis: - -![% Speedup of CUTLASS Ping-Pong vs Triton FP8 with TMA.](/assets/images/hopper-tma-unit/fg7.png){:style="width:100%"} - - -_**Figure 8:** % Speedup of CUTLASS Ping-Pong vs Triton FP8 with TMA._ - -This speedup is purely kernel throughput, not including E2E launch overhead which we will discuss below. 
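As a side note before comparing end-to-end behavior: the TFLOPs figures in these charts follow the standard GEMM accounting, 2\*M\*N\*K floating point operations divided by the measured kernel time. The snippet below is not the harness used for the plots above; it is a minimal sketch that times the cuBLAS FP16 path through `torch.matmul` with CUDA events, which corresponds to the cuBLAS FP16 baseline here and can be used to sanity-check numbers on your own hardware.

```python
import torch

def gemm_tflops(M: int, N: int, K: int, dtype=torch.float16, iters: int = 100) -> float:
    """Time an (M, K) x (K, N) matmul on the GPU and return achieved TFLOPs."""
    a = torch.randn(M, K, device="cuda", dtype=dtype)
    b = torch.randn(K, N, device="cuda", dtype=dtype)
    for _ in range(10):                      # warmup
        a @ b
    start = torch.cuda.Event(enable_timing=True)
    end = torch.cuda.Event(enable_timing=True)
    start.record()
    for _ in range(iters):
        a @ b
    end.record()
    torch.cuda.synchronize()
    ms_per_iter = start.elapsed_time(end) / iters
    return 2 * M * N * K / (ms_per_iter * 1e-3) / 1e12   # 2*M*N*K FLOPs per GEMM

if __name__ == "__main__":
    for m in (1, 16, 128):
        print(f"M={m}: {gemm_tflops(m, 4096, 4096):.1f} TFLOPs")
```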
- -**TMA Descriptor movement - a key difference between Triton and CUTLASS with E2E performance implications** - -As noted previously, creation of a 2D+ dimensional TMA descriptor takes place on the host and is then transferred to the device. However, this transfer process takes place very differently depending on the implementation. - -Here we showcase the differences between how Triton transfers TMA descriptors compared with CUTLASS. - -Recall, TMA transfers require a special data structure, a tensor map to be created on CPU through the cuTensorMap API, which for an FP8 GEMM Kernel means creating three descriptors, one for each A, B and C. We see below that for both the Triton and CUTLASS Kernels the same CPU procedures are invoked. - - -![Calls to cuTensorMapEncodeTiled (Both Triton and CUTLASS use this path)](/assets/images/hopper-tma-unit/fg8.png){:style="width:100%"} - - - -_**Figure 7.** Calls to cuTensorMapEncodeTiled (Both Triton and CUTLASS use this path)_ - -However, for Triton, each descriptor is transferred in its own distinct copy kernel, which adds a significant amount of overhead and serves as a barrier to use this kernel in an end-to-end use inference scenario. - - -![Three H2D Copy Kernels are launched before the kernel execution, for A, B and C](/assets/images/hopper-tma-unit/fg9.png){:style="width:100%"} - - - -_**Figure 8.** Three H2D Copy Kernels are launched before the kernel execution, for A, B and C_ - -These copies are not observed in the CUTLASS implementation, due to the way that TMA descriptors are passed to the kernel. We can see from the PTX below that with Cutlass, tensor maps are passed-by-value to the kernel. - - -``` -.entry _ZN7cutlass13device_kernelIN49_GLOBAL__N__8bf0e19b_16_scaled_mm_c3x_cu_2bec3df915cutlass_3x_gemmIaNS_6half_tENS1_14ScaledEpilogueEN4cute5tupleIJNS5_1CILi64EEENS7_ILi128EEES9_EEENS6_IJNS7_ILi2EEENS7_ILi1EEESC_EEENS_4gemm32KernelTmaWarpSpecializedPingpongENS_8epilogue18TmaWarpSpecializedEE10GemmKernelEEEvNT_6ParamsE( - -.param .align 64 .b8 _ZN7cutlass13device_kernelIN49_GLOBAL__N__8bf0e19b_16_scaled_mm_c3x_cu_2bec3df915cutlass_3x_gemmIaNS_6half_tENS1_14ScaledEpilogueEN4cute5tupleIJNS5_1CILi64EEENS7_ILi128EEES9_EEENS6_IJNS7_ILi2EEENS7_ILi1EEESC_EEENS_4gemm32KernelTmaWarpSpecializedPingpongENS_8epilogue18TmaWarpSpecializedEE10GemmKernelEEEvNT_6ParamsE_param_0[1024] - - -mov.b64 %rd110, _ZN7cutlass13device_kernelIN49_GLOBAL__N__8bf0e19b_16_scaled_mm_c3x_cu_2bec3df915cutlass_3x_gemmIaNS_10bfloat16_tENS1_14ScaledEpilogueEN4cute5tupleIJNS5_1CILi64EEES8_NS7_ILi256EEEEEENS6_IJNS7_ILi1EEESB_SB_EEENS_4gemm24KernelTmaWarpSpecializedENS_8epilogue18TmaWarpSpecializedEE10GemmKernelEEEvNT_6ParamsE_param_0; - -add.s64 %rd70, %rd110, 704; -cvta.param.u64 %rd69, %rd70; - -{% raw %}cp.async.bulk.tensor.2d.global.shared::cta.bulk_group [%rd69, {%r284, %r283}], [%r1880];{% endraw %} -``` - - -_**Figure 9.** CUTLASS kernel PTX showing pass-by-value_ - -By directly passing the TMA Descriptor as opposed to passing a global memory pointer, the CUTLASS kernel avoids the three extra H2D copy kernels and instead these copies are included in the single device kernel launch for the GEMM. - -Because of the difference in how descriptors are moved to the device, the kernel latencies including the time to prepare the tensors to be consumed by the TMA is drastically different. For M=1-128, N=4096, K=4096 the CUTLASS pingpong kernel has an average latency of 10us Triton TMA kernels complete in an average of 4ms. 
This is a factor of ~3330x slower and appears to be directly linked to the 3 independent kernel launches for TMA descriptor transfer by Triton. - -CUDA graphs may be one way to reduce this, but given the overhead created by the H2D copies, the current Triton implementation, when measured end to end, is not competitive. A rework of how the Triton compiler manages TMA descriptors would likely resolve this gap. We thus focused on comparing the actual compute kernel throughput and not E2E in our data above. - - -## Results Summary - -![Triton FP8 TMA GEMM TFLOPs Comparison](/assets/images/hopper-tma-unit/fg10.png){:style="width:100%"} - - - -_**Figure 10.** Triton FP8 TMA GEMM TFLOPs Comparison_
| M   | Triton TMA | Triton Tutorial | Triton SplitK | cuBLAS FP8 | cuBLAS FP16 | CUTLASS Ping-Pong FP8 |
|-----|------------|-----------------|---------------|------------|-------------|-----------------------|
| 1   | 2.5        | 1               | 2.4           | 1.5        | 1.8         | 3.57                  |
| 2   | 5.1        | 2.5             | 4.8           | 3.1        | 3.6         | 5.9                   |
| 4   | 10.3       | 7.21            | 9.6           | 6.1        | 7.2         | 14.3                  |
| 8   | 21.0       | 16.5            | 19.2          | 12.3       | 14.4        | 28.6                  |
| 16  | 44.5       | 41.0            | 37.2          | 24.5       | 27.7        | 55.1                  |
| 32  | 89.7       | 81.2            | 72.2          | 71.6       | 56.8        | 114.4                 |
| 64  | 178.5      | 163.7           | 130.8         | 144.6      | 105.3       | 228.7                 |
| 128 | 359.7      | 225.9           | 160.1         | 244.0      | 189.2       | 377.7                 |
        - - -_**Figure 11.** Triton FP8 TMA GEMM TFLOPs Comparison Table_ - -The above chart and table summarize the gain we’ve been able to achieve on a single NVIDIA H100 for FP8 GEMM, by leveraging the TMA Hardware Unit, over non-TMA Triton kernels and high performance CUDA (cuBLAS) kernels. The key point to note is this kernel’s superior scaling (with the batch size) properties over the competition. The problem sizes we benchmarked on are representative of the matrix shapes found in small-to-medium batch size LLM inference. Thus, TMA GEMM kernel performance in the mid-M regime (M=32 to M=128) will be critical for those interested in leveraging this kernel for FP8 LLM deployment use cases, as the FP8 compressed data type can allow larger matrices to fit in GPUs memory. - -To summarize our analysis, the TMA implementation in Triton and CUTLASS differ in terms of full featureset support (multicast, prefetch etc.) and how the TMA Descriptor is passed to the GPU kernel. If this descriptor is passed in a manner that more closely matches the CUTLASS kernel (pass-by-value), the extraneous H2D copies could be avoided and thus the E2E performance would be greatly improved. - -## Future Work - -For future research, we plan to improve upon these results, by working with the community to incorporate the CUTLASS architecture of TMA loads into Triton as well as investigating the Cooperative Kernel for FP8 GEMM, a modified strategy to the Ping-Pong Kernel. - -In addition, once features like thread block clusters and TMA atomic operations are enabled in Triton, we may be able to get further speedups by leveraging the SplitK strategy in the TMA GEMM Kernel, as atomic operations on Hopper can be performed in Distributed Shared Memory (DSMEM) as opposed to L2 Cache. We also note the similarities of NVIDIA Hopper GPUs with other AI hardware accelerators like Google’s [TPU](https://people.csail.mit.edu/suvinay/pubs/2023.tpu.isca.pdf) and IBM’s [AIU](https://ieeexplore.ieee.org/document/9499865) which are dataflow architectures. On Hopper, data can now “flow” from GMEM to a network of connected SMs due to the additions of TMA, which we discussed extensively in this blog, and DSMEM, which we plan to cover in a future post. \ No newline at end of file diff --git a/_posts/2024-07-24-pytorch2-4.md b/_posts/2024-07-24-pytorch2-4.md deleted file mode 100644 index ac5567b2a61f..000000000000 --- a/_posts/2024-07-24-pytorch2-4.md +++ /dev/null @@ -1,162 +0,0 @@ ---- -layout: blog_detail -title: "PyTorch 2.4 Release Blog" ---- - -We are excited to announce the release of PyTorch® 2.4 ([release note](https://github.com/pytorch/pytorch/releases/tag/v2.4.0))! PyTorch 2.4 adds support for the latest version of Python (3.12) for `torch.compile`. AOTInductor freezing gives developers running AOTInductor more performance-based optimizations by allowing the serialization of MKLDNN weights. As well, a new default TCPStore server backend utilizing `libuv` has been introduced which should significantly reduce initialization times for users running large-scale jobs. Finally, a new Python Custom Operator API makes it easier than before to integrate custom kernels into PyTorch, especially for `torch.compile`. - -This release is composed of 3661 commits and 475 contributors since PyTorch 2.3. We want to sincerely thank our dedicated community for your contributions. As always, we encourage you to try these out and report any issues as we improve 2.4. 
More information about how to get started with the PyTorch 2-series can be found at our [Getting Started](https://pytorch.org/get-started/pytorch-2.0/) page.
| Beta | Prototype | Performance Improvements |
|------|-----------|--------------------------|
| Python 3.12 support for torch.compile | FSDP2: DTensor-based per-parameter-sharding FSDP | torch.compile optimizations for AWS Graviton (aarch64-linux) processors |
| AOTInductor Freezing for CPU | torch.distributed.pipelining, simplified pipeline parallelism | BF16 symbolic shape optimization in TorchInductor |
| New Higher-level Python Custom Operator API | Intel GPU is available through source build | Performance optimizations for GenAI projects utilizing CPU devices |
| Switching TCPStore’s default server backend to libuv | | |
        - - -*To see a full list of public feature submissions click [here](https://docs.google.com/spreadsheets/d/1TzGkWuUMF1yTe88adz1dt2mzbIsZLd3PBasy588VWgk/edit?usp=sharing). - - -## Beta Features - - -### [Beta] Python 3.12 support for _torch.compile_ - -`torch.compile()` previously only supported Python **3.8-3.11**. Users can now optimize models with `torch.compile()` with Python **3.12**. - - -### [Beta] AOTInductor Freezing for CPU - - -This feature enables users to turn on the freezing flag when using AOTInductor on CPU. With this feature, AOTInductor can cover the same set of op scenarios and reach on-par performance as Inductor CPP backend. Before this support, when models contain MKLDNN operators (when computation-intensive operators are involved, such as Convolution, Linear, ConvTranspose, and so on) and freezing is on, those models will fail to run since AOTInductor didn’t support serializing the MKLDNN weights which have an opaque format. - -The workflow is as explained in the AOTInductor [tutorial](https://pytorch.org/docs/main/torch.compiler_aot_inductor.html), in addition to that users could now add the freezing flag to get better performance: -``` -export TORCHINDUCTOR_FREEZING=1 -``` - - -### [Beta] New Higher-level Python Custom Operator API - -We’ve added a new higher-level Python Custom Operator API that makes it easier than before to extend PyTorch with custom operators that behave like PyTorch’s built-in operators. Operators registered using the [new high-level torch.library APIs](https://pytorch.org/docs/2.4/library.html#module-torch.library) are guaranteed to be compatible with `torch.compile` and other PyTorch subsystems; authoring a custom operator in Python using the previous [low-level torch.library APIs](https://pytorch.org/docs/2.4/library.html#low-level-apis) required deep understanding of PyTorch internals and has many footguns. - -Please see the [tutorial](https://pytorch.org/tutorials/advanced/python_custom_ops.html) for more information. - -### [Beta] Switching TCPStore’s default server backend to _libuv_ - -Introduced a new default server backend for TCPStore built with `libuv` which should introduce significantly lower initialization times and better scalability. This should ideally benefit users with a much shorter startup time when accounting for large-scale jobs. - -For more information on the motivation + fallback instructions please refer to this [tutorial](https://pytorch.org/tutorials/intermediate/TCPStore_libuv_backend.html). - - -## Prototype Features - - -### [PROTOTYPE] FSDP2: DTensor-based per-parameter-sharding FSDP - -FSDP2 is a new fully sharded data parallelism implementation that uses dim-0 per-parameter sharding to resolve fundamental composability challenges with FSDP1's flat-parameter sharding. - -For more information regarding the motivation / design for FSDP2 please refer to the [RFC on Github](https://github.com/pytorch/pytorch/issues/114299). - - -### [PROTOTYPE] _torch.distributed.pipelining_, simplified pipeline parallelism - -Pipeline Parallelism is one of the primitive parallelism techniques for deep learning. It allows the execution of a model to be partitioned such that multiple micro-batches can execute different parts of the model code concurrently. - -`torch.distributed.pipelining` provides a toolkit that allows for easy implementation of pipeline parallelism on general models while also offering composability with other common PyTorch distributed features like DDP, FSDP, or tensor parallel. 
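To build some intuition for the micro-batching that `torch.distributed.pipelining` schedules for you, here is a deliberately simplified, single-process sketch of a GPipe-style forward pass. It is not the `torch.distributed.pipelining` API (see the documentation and tutorial linked below for that), and the two-stage split and layer sizes are made up purely for illustration.

```python
import torch
import torch.nn as nn

# Hypothetical two-stage split of a small model. In a real pipeline each stage
# would live on a different device or rank so the stages can run concurrently.
stage0 = nn.Sequential(nn.Linear(64, 64), nn.ReLU())
stage1 = nn.Sequential(nn.Linear(64, 10))

def gpipe_like_forward(x: torch.Tensor, n_microbatches: int = 4) -> torch.Tensor:
    """Split the batch into micro-batches and push them through the two stages
    in a pipelined order. Single-process, so this only illustrates the schedule;
    there is no actual overlap here."""
    microbatches = x.chunk(n_microbatches)
    stage0_out, outputs = [], []
    for i, mb in enumerate(microbatches):
        stage0_out.append(stage0(mb))                   # stage 0 works on micro-batch i
        if i > 0:
            outputs.append(stage1(stage0_out[i - 1]))   # stage 1 trails by one micro-batch
    outputs.append(stage1(stage0_out[-1]))              # drain the pipeline
    return torch.cat(outputs)

print(gpipe_like_forward(torch.randn(32, 64)).shape)    # torch.Size([32, 10])
```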
- -For more information on this please refer to our [documentation](https://pytorch.org/docs/main/distributed.pipelining.html) and [tutorial](https://pytorch.org/tutorials/intermediate/pipelining_tutorial.html). - -### [PROTOTYPE] Intel GPU is available through source build - -Intel GPU in PyTorch on Linux systems offers fundamental functionalities on Intel® Data Center GPU Max Series: eager mode and torch.compile. - -For eager mode, the commonly used Aten operators are implemented by using SYCL programming language. The most performance-critical graphs and operators are highly optimized by using oneAPI Deep Neural Network (oneDNN). For torch.compile mode, Intel GPU backend is integrated to Inductor on top of Triton. - -For more information for Intel GPU source build please refer to our [blog post](https://www.intel.com/content/www/us/en/developer/articles/technical/pytorch-2-4-supports-gpus-accelerate-ai-workloads.html) and [documentation](https://pytorch.org/docs/main/notes/get_start_xpu.html). - -## Performance Improvements - - -### _torch.compile_ optimizations for AWS Graviton (aarch64-linux) processors - - -AWS optimized the PyTorch torch.compile feature for AWS Graviton3 processors. This optimization results in up to 2x better performance for Hugging Face model inference (based on geomean of performance improvement for 33 models) and up to 1.35x better performance for TorchBench model inference (geomean of performance improvement for 45 models) compared to the default eager mode inference across several natural language processing (NLP), computer vision (CV), and recommendation models on AWS Graviton3-based Amazon EC2 instances. - -For more information regarding specific technical details please refer to the [blog post](https://pytorch.org/blog/accelerated-pytorch-inference/). - -### BF16 symbolic shape optimization in TorchInductor - -Pytorch users can now experience improved quality and performance gains with the beta BF16 symbolic shape support. While static shape may afford additional optimization opportunities compared to symbolic shape, it is insufficient for scenarios such as inference services with varying batch size and sequence length, or detection models with data-dependent output shape. - -Verification using TorchBench, Huggingface, and timms_model shows a similar pass rate and comparable speedup with the BF16 static shape scenario. Combining the benefits of symbolic shape with BF16 AMX instructions hardware acceleration provided by Intel CPUs and general Inductor CPU backend optimizations applicable to both static and symbolic shape in PyTorch 2.4, the performance for BF16 symbolic shape has significantly improved compared to PyTorch 2.3. - -The API to use this feature: - -```python -model = …. -model.eval() -with torch.autocast(device_type=”cpu”, dtype=torch.bfloat16), torch.no_grad(): - compiled_model = torch.compile(model, dynamic=True) -``` - -### Performance optimizations for GenAI projects utilizing CPU devices - -Highlighting the enhanced performance of PyTorch on CPU, as demonstrated through the optimizations made for the ["Segment Anything Fast"](https://github.com/pytorch-labs/segment-anything-fast) and [“Diffusion Fast”](https://github.com/huggingface/diffusion-fast) project. However, only CUDA devices are supported in the model. We have incorporated CPU support into the projects, enabling users to leverage the increased power of CPU for running the project's experiments. 
Meanwhile, we have employed a [block-wise attention mask for SDPA](https://github.com/pytorch/pytorch/pull/126961) as well, which can significantly reduce peak memory usage and improve performance. We have also optimized a series of [layout propagation rules in Inductor CPU](https://github.com/pytorch/pytorch/pull/126961) to improve performance. - -To facilitate this, we have updated the README file. The API to use this feature is given below; simply provide `--device cpu` on the command line: - -* For Segment Anything Fast: - - ```bash - export SEGMENT_ANYTHING_FAST_USE_FLASH_4=0 - python run_experiments.py 16 vit_b - --run-experiments --num-workers 32 --device cpu - ``` -* For Diffusion Fast: - - ```bash - python run_benchmark.py --compile_unet --compile_vae --enable_fused_projections --device=cpu - ``` - -Users can follow the guidelines to run the experiments and observe the performance improvements firsthand, as well as explore the performance improvement trends across FP32 and BF16 data types. - -Additionally, users can achieve good performance using `torch.compile` and SDPA. By observing the performance trends across these different factors, users can gain a deeper understanding of how various optimizations enhance PyTorch’s performance on CPU. diff --git a/_posts/2024-07-30-quantization-aware-training.md b/_posts/2024-07-30-quantization-aware-training.md deleted file mode 100644 index 34340ccb987f..000000000000 --- a/_posts/2024-07-30-quantization-aware-training.md +++ /dev/null @@ -1,293 +0,0 @@ ---- -layout: blog_detail -title: "Quantization-Aware Training for Large Language Models with PyTorch" -author: Andrew Or, Jerry Zhang, Evan Smothers, Kartikay Khandelwal, Supriya Rao ---- - -In this blog, we present an end-to-end Quantization-Aware Training (QAT) flow for large language models in PyTorch. We demonstrate how QAT in PyTorch can **recover up to 96% of the accuracy degradation on hellaswag and 68% of the perplexity degradation on wikitext for Llama3 compared to post-training quantization (PTQ).** We present the QAT APIs in [torchao](https://github.com/pytorch/ao/) and showcase how users can leverage them for fine-tuning in [torchtune](https://github.com/pytorch/torchtune/). - - -![Llama3-8B fine-tuned on the C4 dataset (en subset) with and without QAT using int8 per token dynamic activations + int4 grouped per channel weights, evaluated on hellaswag and wikitext on an A100 GPU. Note the log scale for wikitext (lower is better).](/assets/images/quantization-aware-training/fg1.jpg){:style="width:100%"} - - -**Figure 1:** Llama3-8B fine-tuned on the C4 dataset (en subset) with and without QAT using int8 per token dynamic activations + int4 grouped per channel weights, evaluated on hellaswag and wikitext on an A100 GPU. Note the log scale for wikitext (lower is better). - -To demonstrate the effectiveness of QAT in an end-to-end flow, we further lowered the quantized model to [XNNPACK](https://github.com/google/XNNPACK), a highly optimized neural network library for backends including iOS and Android, through [executorch](https://github.com/pytorch/executorch/tree/main/examples/models/llama2). **After lowering to XNNPACK, the QAT model saw 16.8% lower perplexity than the PTQ model, while maintaining the same model size and on-device inference and generation speeds.**
| Lowered model metric | PTQ | QAT |
|----------------------|-----|-----|
| Wikitext word perplexity (↓) | 23.316 | 19.403 |
| Wikitext byte perplexity (↓) | 1.850 | 1.785 |
| Wikitext bits per byte (↓) | 0.887 | 0.836 |
| Model size | 3.881 GB | 3.881 GB |
| On-device inference speed | 5.065 tok/s | 5.265 tok/s |
| On-device generation speed | 8.369 tok/s | 8.701 tok/s |
        - - -**Table 1:** QAT achieved 16.8% lower perplexity and unchanged model sizes and on-device inference and generation speeds on the Llama3-8B model lowered to XNNPACK. Linear layers are quantized using int8 per token dynamic activations + int4 grouped per channel weights, and embeddings are additionally quantized to int4 using a group size of 32 (QAT is only applied to linear layers). Wikitext evaluation is performed using 5 samples and a max sequence length of 127 on server CPU, since evaluation is not available on device (lower is better for all wikitext results). On-device inference and generation is benchmarked on the Samsung Galaxy S22 smartphone. - - -### QAT APIs - -We are excited for users to try our [QAT API](https://github.com/pytorch/ao/blob/v0.3.0/torchao/quantization/prototype/qat.py) in torchao, which can be leveraged for both training and fine-tuning. This API involves two steps, prepare and convert: prepare applies a transformation on the linear layers in the model to simulate the numerics of quantization during training, and convert actually quantizes these layers into lower bit-widths after training. The converted model can then be used in the exact same way as the PTQ model: - -```py -import torch -from torchtune.models.llama3 import llama3 -from torchao.quantization.prototype.qat import Int8DynActInt4WeightQATQuantizer - -# Smaller version of llama3 to fit in a single GPU -model = llama3( - vocab_size=4096, - num_layers=16, - num_heads=16, - num_kv_heads=4, - embed_dim=2048, - max_seq_len=2048, -).cuda() - -# Quantizer for int8 dynamic per token activations + -# int4 grouped per channel weights, only for linear layers -qat_quantizer = Int8DynActInt4WeightQATQuantizer() - -# Insert "fake quantize" operations into linear layers. -# These operations simulate quantization numerics during -# training without performing any dtype casting -model = qat_quantizer.prepare(model) - -# Standard training loop -optimizer = torch.optim.SGD(model.parameters(), lr=0.001, momentum=0.9, weight_decay=1e-5) -loss_fn = torch.nn.CrossEntropyLoss() -for i in range(10): - example = torch.randint(0, 4096, (2, 16)).cuda() - target = torch.randn((2, 16, 4096)).cuda() - output = model(example) - loss = loss_fn(output, target) - loss.backward() - optimizer.step() - optimizer.zero_grad() - -# Convert fake quantize to actual quantize operations -# The quantized model has the exact same structure as the -# quantized model produced in the corresponding PTQ flow -# through `Int8DynActInt4WeightQuantizer` -model = qat_quantizer.convert(model) - -# inference or generate -``` - -#### Fine-tuning with torchtune - -We also integrated this QAT flow into [torchtune](https://github.com/pytorch/torchtune) and provided [recipes](https://github.com/pytorch/torchtune/blob/main/recipes/configs/llama3/8B_qat_full.yaml) to run this in a distributed setting, similar to the existing full fine-tune distributed recipe. Users can additionally apply QAT during LLM fine-tuning by running the following command. See [this README](https://github.com/pytorch/torchtune/blob/main/recipes/quantization.md) for more details. - -```py -tune run --nproc_per_node 8 qat_distributed --config llama3/8B_qat_full -``` - -## What is Quantization-Aware Training? - -Quantization-Aware Training (QAT) is a common quantization technique for mitigating model accuracy/perplexity degradation that arises from quantization. 
This is achieved by simulating quantization numerics during training while keeping the weights and/or activations in the original data type, typically float, effectively “fake quantizing” the values instead of actually casting them to lower bit-widths: - -```py -# PTQ: x_q is quantized and cast to int8 -# scale and zero point (zp) refer to parameters used to quantize x_float -# qmin and qmax refer to the range of quantized values -x_q = (x_float / scale + zp).round().clamp(qmin, qmax).cast(int8) - -# QAT: x_fq is still in float -# Fake quantize simulates the numerics of quantize + dequantize -x_fq = (x_float / scale + zp).round().clamp(qmin, qmax) -x_fq = (x_fq - zp) * scale -``` - -Since quantization involves non-differentiable operations like rounding, the QAT backward pass typically uses [straight-through estimators (STE)](https://arxiv.org/pdf/1308.3432), a mechanism to estimate the gradients flowing through non-smooth functions, to ensure the gradients passed to the original weights are still meaningful. In this manner, the gradients are computed with the knowledge that the weights will ultimately be quantized after training, effectively allowing the model to adjust for quantization noise during the training process. Note that an alternative to QAT is quantized training, which actually casts the values to lower bit dtypes during training, but [prior efforts](https://cloud.google.com/blog/products/compute/accurate-quantized-training-aqt-for-tpu-v5e) have only seen success up to 8-bits, whereas QAT is effective even at lower bit-widths. - - -### QAT in PyTorch - -We added an initial QAT flow in torchao under prototype [here](https://github.com/pytorch/ao/blob/v0.2.0/torchao/quantization/prototype/qat.py). Currently we support int8 dynamic per-token activations + int4 grouped per-channel weights (abbreviated 8da4w) for linear layers. These settings are motivated by a combination of [kernel availability on edge backends](https://github.com/pytorch/executorch/blob/main/examples/models/llama2/README.md#quantization) and [prior research on LLM quantization](https://arxiv.org/pdf/2305.17888), which found that per-token activation and per-group weight quantization achieves the best model quality for LLMs compared to other quantization schemes. - - -![torchao QAT flow. This flow involves two steps: (1) prepare, which inserts the fake quantization ops into the model’s linear layers, and (2) convert, which converts these fake quantization ops with actual quantize and dequantize ops after training.](/assets/images/quantization-aware-training/fg2.png){:style="width:100%"} - - -**Figure 2:** torchao QAT flow. This flow involves two steps: (1) prepare, which inserts the fake quantization ops into the model’s linear layers, and (2) convert, which converts these fake quantization ops with actual quantize and dequantize ops after training. - -This flow produces the exact same quantized model as the PTQ flow using the same quantization settings (through [Int8DynActInt4WeightQuantizer](https://github.com/pytorch/ao/blob/v0.3.0/torchao/quantization/GPTQ.py#L941)), but with quantized weights that achieve superior accuracies and perplexities. Thus, we can use the model converted from the QAT flow as a drop-in replacement for the PTQ model and reuse all the backend delegation logic and underlying kernels. - - -## Experimental Results - -All experiments in this blog post are performed using the torchtune QAT integration described above. 
We use 6-8 A100 GPUs with 80 GBs each to fine-tune [Llama2-7B](https://huggingface.co/meta-llama/Llama-2-7b) and [Llama3-8B](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct) on the [C4 dataset](https://huggingface.co/datasets/allenai/c4) (en subset) for 5000 steps. For all experiments, we use batch size = 2, learning rate = 2e-5, max sequence length = 4096 for Llama2 and 8192 for Llama3, [Fully Sharded Data Parallel](https://pytorch.org/docs/stable/fsdp.html) (FSDP) as our distribution strategy, and activation checkpointing to reduce memory footprint. For 8da4w experiments, we use a group size of 256 for weights. - -Since the pre-training dataset is not easily accessible, we perform QAT during the fine-tuning process. Empirically, we found that disabling fake quantization for the first N steps led to better results, presumably because doing so allows the weights to stabilize before we start introducing quantization noise to the fine-tuning process. We disable fake quantization for the first 1000 steps for all our experiments. - -We evaluate our quantized models using the [lm-evaluation-harness](https://github.com/EleutherAI/lm-evaluation-harness) integration in torchtune. We report evaluation results from a variety of tasks commonly used to evaluate LLMs, including hellaswag, a commonsense sentence completion task, wikitext, a next token/byte prediction task, and a few question-answering tasks such as arc, openbookqa, and piqa. For wikitext, perplexity refers to the inverse of how well the model can predict the next word or byte (lower is better), and `bits_per_byte` refers to how many bits are needed to predict the next byte (lower is also better here). For all other tasks, `acc_norm` refers to the accuracy normalized by the byte-length of the target string. - - -#### Int8 Dynamic Activations + Int4 Weight Quantization (8da4w) - -Starting with Llama2 8da4w quantization, we saw that QAT was able to recover 62% of the normalized accuracy degradation on hellaswag compared to PTQ, and 58% and 57% of the word and byte perplexity degradation (respectively) on wikitext. We see similar improvements for most of the other tasks. - - -![Llama2-7B 8da4w quantization with and without QAT](/assets/images/quantization-aware-training/fg3a.png){:style="width:100%"} - - -**Figure 3a:** Llama2-7B 8da4w quantization with and without QAT - - -![Llama2-7B 8da4w quantization with and without QAT, evaluated on wikitext (lower is better)](/assets/images/quantization-aware-training/fg3b.png){:style="max-width:400px; display:block; margin-left: auto; margin-right: auto; width:100%"} - - -**Figure 3b:** Llama2-7B 8da4w quantization with and without QAT, evaluated on wikitext (lower is better) - -Llama3 8da4w quantization saw even more pronounced improvements with QAT. On the hellaswag evaluation task, we were able to recover 96% of the normalized accuracy degradation on hellaswag compared to PTQ, with minimal overall degradation (<1%) compared to the non-quantized accuracy. On the wikitext evaluation task, QAT recovered 68% and 65% of the word and byte perplexity degradation (respectively). Even on arc_challenge, which was difficult for Llama2 QAT, we were able to recover 51% of the normalized accuracy degradation. 
- - -![Llama3-8B 8da4w quantization with and without QAT](/assets/images/quantization-aware-training/fg4a.png){:style="width:100%"} - - -**Figure 4a:** Llama3-8B 8da4w quantization with and without QAT - - -![Llama3-8B 8da4w quantization with and without QAT, evaluated on wikitext (lower is better)](/assets/images/quantization-aware-training/fg4b.png){:style="max-width:400px; display:block; margin-left: auto; margin-right: auto; width:100%"} - -**Figure 4b:** Llama3-8B 8da4w quantization with and without QAT, evaluated on wikitext (lower is better) - - -#### Lower Bit Weight Only Quantization - -We further extended the torchao QAT flow to 2-bit and 3-bit weight only quantization and repeated the same experiments for Llama3-8B. Quantization degradation is more severe at lower bit-widths, so we use a group size of 32 for all experiments for finer-grained quantization. - -However, this is still not enough for 2-bits PTQ, which saw wikitext perplexity explode. To mitigate this problem, we leverage knowledge from prior sensitivity analysis that the first 3 and last 2 layers of the Llama3 model are the most sensitive, and skip quantizing these layers in exchange for a moderate increase in quantized model size (1.78 GB for 2-bits and 1.65 GB for 3-bits). This brought the wikitext word perplexity down from 603336 to 6766, which is significant but still far from acceptable. To further improve the quantized model, we turn to QAT. - -![Llama3-8B 2-bit weight only quantization with and without QAT, evaluated on wikitext (lower is better). Bars with “skip” refer to skipping quantization for the first 3 and last 2 layers of the model, which are more sensitive to quantization. Note the log scale.](/assets/images/quantization-aware-training/fg5a.png){:style="max-width:400px; display:block; margin-left: auto; margin-right: auto; width:100%"} - - -**Figure 5a:** Llama3-8B 2-bit weight only quantization with and without QAT, evaluated on wikitext (lower is better). Bars with “skip” refer to skipping quantization for the first 3 and last 2 layers of the model, which are more sensitive to quantization. Note the log scale. - -We observe that applying QAT while skipping quantization for the first 3 and last 2 layers further brought the word perplexity down to a much more reasonable value of 30 (from 6766). More generally, QAT was able to recover 53% of the normalized accuracy degradation on hellaswag compared to PTQ, and 99% and 89% of the word and byte perplexity degradation (respectively) on wikitext. Without skipping the sensitive layers, however, QAT was far less effective at mitigating degradation in quantized model quality. - - -![Llama3-8B 2-bit weight only quantization with and without QAT. Bars with “skip” refer to skipping quantization for the first 3 and last 2 layers of the model, which are more sensitive to quantization.](/assets/images/quantization-aware-training/fg5b.png){:style="width:100%"} - - -**Figure 5b:** Llama3-8B 2-bit weight only quantization with and without QAT. Bars with “skip” refer to skipping quantization for the first 3 and last 2 layers of the model, which are more sensitive to quantization. - -For 3-bit weight only quantization, QAT was effective even without skipping the first 3 and last 2 layers, though skipping these layers still led to better results for both PTQ and QAT. In the skip case, QAT was able to recover 63% of the normalized accuracy degradation on hellaswag compared to PTQ, and 72% and 65% of the word and byte perplexity degradation (respectively) on wikitext. 
- -![Llama3-8B 3-bit weight only quantization with and without QAT. Bars with “skip” refer to skipping quantization for the first 3 and last 2 layers of the model, which are more sensitive to quantization.](/assets/images/quantization-aware-training/fg6a.png){:style="width:100%"} - - -**Figure 6a:** Llama3-8B 3-bit weight only quantization with and without QAT. Bars with “skip” refer to skipping quantization for the first 3 and last 2 layers of the model, which are more sensitive to quantization. - -![Llama3-8B 3-bit weight only quantization with and without QAT, evaluated on wikitext (lower is better). Bars with “skip” refer to skipping quantization for the first 3 and last 2 layers of the model, which are more sensitive to quantization. Note the log scale.](/assets/images/quantization-aware-training/fg6b.png){:style="max-width:400px; display:block; margin-left: auto; margin-right: auto; width:100%"} - - -**Figure 6b:** Llama3-8B 3-bit weight only quantization with and without QAT, evaluated on wikitext (lower is better). Bars with “skip” refer to skipping quantization for the first 3 and last 2 layers of the model, which are more sensitive to quantization. Note the log scale. - - -#### QAT Overhead - -QAT inserts many fake quantize operations throughout the model, adding considerable overhead to both the fine-tuning speed and the memory usage. For a model like Llama3-8B for example, we have (32 * 7) + 1 = 225 linear layers, each of which has at least 1 fake quantize for the weights and potentially 1 fake quantize for the input activations. Memory footprint increase is also significant, since we cannot mutate the weights in-place and so we need to clone them before applying fake quantization, though this overhead can be mostly mitigated by enabling activation checkpointing. - -In our microbenchmarks, we found that 8da4w QAT fine-tuning is ~34% slower than regular full fine-tuning. With activation checkpointing, the memory increase per GPU is around 2.35 GB. Most of these overheads are fundamental to how QAT works, though we may be able to speed up computation with [torch.compile](https://pytorch.org/get-started/pytorch-2.0/) in the future. - - - - - - - - - - - - - - - - - - -
| Per GPU statistics | Full fine-tuning | QAT fine-tuning |
|---|---|---|
| Median tokens per second | 546.314 tok/s | 359.637 tok/s |
| Median peak memory | 67.501 GB | 69.850 GB |
        - - -**Table 2:** Llama3 QAT fine-tuning overhead for int8 per token dynamic activations + int4 grouped per channel weights on 6 A100 GPUs (each with 80GB memory). - - -## Looking Ahead - -In this blog, we presented a QAT flow for LLMs through [torchao](https://github.com/pytorch/ao/), integrated this flow with the fine-tuning APIs in [torchtune](https://github.com/pytorch/torchtune/), and demonstrated its potential to recover most of the quantization degradation compared to PTQ and match non-quantized performance on certain tasks. There are many directions for future explorations: - - - -* **Hyperparameter tuning.** It is likely that extensive hyperparameter tuning can further improve the results of finetuning and QAT. In addition to the general hyperparameters like the learning rate, batch size, dataset size, and number of fine-tuning steps, we should also tune QAT-specific ones, such as when to start/stop fake quantization, how many steps to fake quantize, and regularization parameters for fake quantized values. -* **Outlier reduction techniques.** In our experiments, we found that both PTQ and QAT were susceptible to outliers. In addition to simple clamping and regularization during fine-tuning, we can explore techniques that allow the network to learn how to control these outliers (e.g. [learned quantization ranges](https://arxiv.org/pdf/1902.08153), [clipped softmax](https://arxiv.org/pdf/2306.12929), and [gated attention](https://arxiv.org/pdf/2306.12929)), or possibly even borrow outlier suppression techniques from post-training settings (e.g. [SpinQuant](https://arxiv.org/pdf/2405.16406), [SmoothQuant](https://arxiv.org/pdf/2211.10438)) and apply them sparingly throughout the fine-tuning process. -* **Mixed-precision and more complex dtypes.** Especially in the lower bit regime, we saw that skipping quantization for certain sensitive layers was effective for both PTQ and QAT. Did we need to skip quantizing these layers altogether, or can we still quantize them, just to lower bit-widths? It will be interesting to explore mixed-precision quantization in the context of QAT. Training with newer dtypes such as MX4 is another promising direction, especially given that the upcoming Blackwell GPUs will [no longer support int4 tensor cores](https://www.nvidia.com/en-us/data-center/tensor-cores/). -* **Composability with LoRA and QLoRA.** Our QAT integration in torchtune currently only supports the full fine-tuning workflow. However, many users wish to fine-tune their models using low-ranked adaptors to substantially reduce their memory footprint. Composing QAT with techniques like LoRA / QLoRA will enable users to reap the memory and performance benefits of these approaches while producing a model that will ultimately be quantized with minimal model quality degradation. -* **Composability with [torch.compile](https://pytorch.org/get-started/pytorch-2.0/).** This is another potential way to significantly speed up fake quantization computations in QAT while reducing memory footprint. torch.compile is currently not compatible with the distribution strategy used in full distributed fine-tuning recipes in torchtune (with or without QAT), but support will be added in the near future. -* **Quantizing other layers.** In this work, we only explored quantizing the linear layers. 
However, in the context of long sequence lengths, the KV cache often becomes the throughput bottleneck and can reach tens of GBs, hence [LLM-QAT](https://arxiv.org/pdf/2305.17888) explored quantizing the KV cache alongside activations and weights. [Prior work](https://arxiv.org/pdf/2109.12948) has also had success with quantizing the embedding layer down to 2-bits in other transformer-based models. -* **End-to-end evaluation on performant cuda kernels.** A natural extension of this work is to provide an end-to-end QAT flow evaluated on performant cuda kernels, similar to the existing 8da4w QAT flow lowered to XNNPACK kernels through executorch. For int4 weight only quantization, we can leverage the efficient [int4 weight mm kernel with bitpacking](https://github.com/pytorch/pytorch/blob/v2.3.1/aten/src/ATen/native/cuda/int4mm.cu#L865) for quantization, and there is ongoing work to add QAT support for this kernel: [https://github.com/pytorch/ao/pull/383](https://github.com/pytorch/ao/pull/383). For 8da4w quantization, [mixed 4-bit/8-bit GEMM](https://github.com/NVIDIA/cutlass/pull/1413) is also being added in cutlass. This will be needed to build an efficient 8da4w cuda kernel. - -The QAT code can be found [here](https://github.com/pytorch/ao/blob/v0.3.0/torchao/quantization/prototype/qat.py). Please refer to [this torchtune tutorial](https://pytorch.org/torchtune/main/tutorials/qat_finetune.html) to get started. If you have any further questions, please feel free to open an issue on the torchao [github](https://github.com/pytorch/ao/issues) or reach out to [andrewor@meta.com](mailto:andrewor@meta.com). We welcome your feedback and contributions! diff --git a/_posts/2024-07-30-torchchat-local-llm-inference.md b/_posts/2024-07-30-torchchat-local-llm-inference.md deleted file mode 100644 index e17024dd4762..000000000000 --- a/_posts/2024-07-30-torchchat-local-llm-inference.md +++ /dev/null @@ -1,163 +0,0 @@ ---- -layout: blog_detail -title: "Introducing torchchat: Accelerating Local LLM Inference on Laptop, Desktop and Mobile" ---- - -Today, we’re releasing [torchchat](https://github.com/pytorch/torchchat), a library showcasing how to seamlessly and performantly run Llama 3, 3.1, and other large language models across laptop, desktop, and mobile. - -In our previous blog posts, we [showed](https://pytorch.org/blog/accelerating-generative-ai-2/) how to use native PyTorch 2 to run LLMs with great performance using CUDA. Torchchat expands on this with more target environments, models and execution modes. Additionally it provides important functions such as export, quantization and eval in a way that’s easy to understand providing an E2E story for those who want to build a local inference solution. - -You will find the project organized into three areas: - - -* Python: Torchchat provides a [REST API](https://github.com/pytorch/torchchat?tab=readme-ov-file#server) that is called via a Python CLI or can be accessed via the browser -* C++: Torchchat produces a desktop-friendly binary using PyTorch's [AOTInductor](https://pytorch-dev-podcast.simplecast.com/episodes/aotinductor) backend -* Mobile devices: Torchchat uses [ExecuTorch](https://pytorch.org/executorch/stable/index.html) to export a .pte binary file for on-device inference - -![torchchat schema](/assets/images/torchchat.png){:style="width:100%"} - - -## Performance - -The following table tracks the performance of torchchat for Llama 3 for a variety of configurations. 
_Numbers for Llama 3.1 are coming soon._

**Llama 3 8B Instruct on Apple MacBook Pro M1 Max 64GB Laptop**

| Mode | DType | Llama 3 8B Tokens/Sec |
|------|-------|-----------------------|
| Arm Compile | float16 | 5.84 |
| | int8 | 1.63 |
| | int4 | 3.99 |
| Arm AOTI | float16 | 4.05 |
| | int8 | 1.05 |
| | int4 | 3.28 |
| MPS Eager | float16 | 12.63 |
| | int8 | 16.9 |
| | int4 | 17.15 |
**Llama 3 8B Instruct on Linux x86 and CUDA**
_Intel(R) Xeon(R) Platinum 8339HC CPU @ 1.80GHz with 180GB Ram + A100 (80GB)_

| Mode | DType | Llama 3 8B Tokens/Sec |
|------|-------|-----------------------|
| x86 Compile | bfloat16 | 2.76 |
| | int8 | 3.15 |
| | int4 | 5.33 |
| CUDA Compile | bfloat16 | 83.23 |
| | int8 | 118.17 |
| | int4 | 135.16 |
        - - -**Llama3 8B Instruct on Mobile** -Torchchat achieves > 8T/s on the Samsung Galaxy S23 and iPhone using 4-bit GPTQ via ExecuTorch. - - -## Conclusion - -We encourage you to **[clone the torchchat repo and give it a spin](https://github.com/pytorch/torchchat)**, explore its capabilities, and share your feedback as we continue to empower the PyTorch community to run LLMs locally and on constrained devices. Together, let's unlock the full potential of generative AI and LLMs on any device. Please submit [issues](https://github.com/pytorch/torchat/issues) as you see them, since we are still iterating quickly. We’re also inviting community contributions across a broad range of areas, from additional models, target hardware support, new quantization schemes, or performance improvements. Happy experimenting! \ No newline at end of file diff --git a/_posts/2024-08-07-flexattention.md b/_posts/2024-08-07-flexattention.md deleted file mode 100644 index acfc1fc40f01..000000000000 --- a/_posts/2024-08-07-flexattention.md +++ /dev/null @@ -1,482 +0,0 @@ ---- -layout: blog_detail -title: "FlexAttention: The Flexibility of PyTorch with the Performance of FlashAttention" -author: "Team PyTorch: Driss Guessous, Yanbo Liang, Joy Dong, Horace He" ---- - -![a cartoon chart flexing his muscles](/assets/images/flexattention/fg1.jpg){:style="width:100%"} - - -In theory, Attention is All You Need. In practice, however, we also need optimized attention implementations like FlashAttention. - -Although these fused attention implementations have substantially improved performance and enabled long contexts, this efficiency has come with a loss of flexibility. You can no longer try out a new attention variant by writing a few PyTorch operators \- you often need to write a new custom kernel\! This operates as a sort of “software lottery” for ML researchers \- if your attention variant doesn’t fit into one of the existing optimized kernels, you’re doomed to slow runtime and CUDA OOMs. - -For some examples of attention variants, we have Causal, [Relative Positional Embeddings](https://paperswithcode.com/method/relative-position-encodings), [Alibi](https://paperswithcode.com/method/alibi), [Sliding Window Attention](https://mistral.ai/news/announcing-mistral-7b/), [PrefixLM](https://twitter.com/andersonbcdefg/status/1800907703688339569), [Document Masking/Sample Packing/Jagged Tensors](https://github.com/pytorch/torchtune/pull/875), [Tanh Soft-Capping](https://twitter.com/LysandreJik/status/1807779471891538199), [PagedAttention](https://arxiv.org/abs/2309.06180), etc. Even worse, folks often want combinations of these\! Sliding Window Attention \+ Document Masking \+ Causal \+ Context Parallelism? Or what about PagedAttention \+ Sliding Window \+ Tanh Soft-Capping? - -The left picture below represents the state of the world today \- some combinations of masking \+ biases \+ setting have existing kernels implemented. But the various options lead to an exponential number of settings, and so overall we end up with fairly spotty support. Even worse, new attention variants researchers come up with will have *zero* support. - -![Attention variant support diagram](/assets/images/flexattention/fg2.jpg){:style="max-width:600px; display:block; margin-left: auto; margin-right: auto; width:100%"} - -To solve this hypercube problem once and for all, we introduce **FlexAttention**, a new PyTorch API. - -1. 
We provide a flexible API that allows implementing many attention variants (including all the ones mentioned in the blog post so far) in a few lines of idiomatic PyTorch code. -2. We lower this into a fused FlashAttention kernel through `torch.compile`, generating a FlashAttention kernel that doesn’t materialize any extra memory and has performance competitive with handwritten ones. -3. We also automatically generate the backwards pass, leveraging PyTorch’s autograd machinery. -4. Finally, we can also take advantage of sparsity in the attention mask, resulting in significant improvements over standard attention implementations. - -With FlexAttention, we hope that trying new attention variants will only be limited by your imagination. - -You can find many FlexAttention examples at the Attention Gym: [https://github.com/pytorch-labs/attention-gym](https://github.com/pytorch-labs/attention-gym). If you have any cool applications, feel free to submit an example\! - -PS: We also find this API very exciting since it leverages a lot of existing PyTorch infra in a fun way \- more on that in the end. - -## FlexAttention - -Here is the classic attention equation: - -![math equation](/assets/images/flexattention/fg3.png){:style="max-width:600px; display:block; margin-left: auto; margin-right: auto; width:100%"} - -In code form: - -```py -Q, K, V: Tensor[batch_size, num_heads, sequence_length, head_dim] -score: Tensor[batch_size, num_heads, sequence_length, sequence_length] = (Q @ K) / sqrt(head_dim) -probabilities = softmax(score, dim=-1) -output: Tensor[batch_size, num_heads, sequence_length, head_dim] = probabilities @ V -``` - -FlexAttention allows for an user-defined function `score_mod:` - -![math equation](/assets/images/flexattention/fg4.png){:style="width:100%"} - - -In code form: - -```py -Q, K, V: Tensor[batch_size, num_heads, sequence_length, head_dim] -score: Tensor[batch_size, num_heads, sequence_length, sequence_length] = (Q @ K) / sqrt(head_dim) -modified_scores: Tensor[batch_size, num_heads, sequence_length, sequence_length] = score_mod(score) -probabilities = softmax(modified_scores, dim=-1) -output: Tensor[batch_size, num_heads, sequence_length, head_dim] = probabilities @ V -``` - -This function allows you to *modify* the attention scores prior to softmax. Surprisingly, this ends up being sufficient for the vast majority of attention variants (examples below)\! - -Concretely, the expected signature for `score_mod` is somewhat unique. - -```py -def score_mod(score: f32[], b: i32[], h: i32[], q_idx: i32[], kv_idx: i32[]) - return score # noop - standard attention -``` - -In other words, `score` is a scalar pytorch tensor that represents the dot product of a query token and a key token. The rest of the arguments tell you *which* dot product you’re currently computing \- `b` (current element in batch), `h` (current head), `q_idx` (position in query), `kv_idx` (position in key/value tensors). - -To apply this function, we could implement it as - -```py -for b in range(batch_size): - for h in range(num_heads): - for q_idx in range(sequence_length): - for kv_idx in range(sequence_length): - modified_scores[b, h, q_idx, kv_idx] = score_mod(scores[b, h, q_idx, kv_idx], b, h, q_idx, kv_idx) -``` - -Of course, this is not how FlexAttention is implemented under the hood. Leveraging `torch.compile`, we automatically lower your function into a single *fused* FlexAttention kernel \- guaranteed or your money back\! - -This API ends up being surprisingly expressive. Let’s look at some examples. 
- -## Score Mod Examples - -### Full Attention - -Let’s first do “full attention”, or standard bidirectional attention. In this case, `score_mod` is a no-op \- it takes as input the scores and then returns them as is.. - -```py -def noop(score, b, h, q_idx, kv_idx): - return score -``` - -And to use it end to end (including both forwards *and* backwards): - -```py -from torch.nn.attention.flex_attention import flex_attention - -flex_attention(query, key, value, score_mod=noop).sum().backward() -``` - -### Relative Position Encodings - -One common attention variant is the [“relative position encoding](https://paperswithcode.com/method/relative-position-encodings)”. Instead of encoding the absolute distance in the queries and keys, relative position encoding adjusts scores based on the “distance” between the queries and keys. - -```py -def relative_positional(score, b, h, q_idx, kv_idx): - return score + (q_idx - kv_idx) -``` - -Note that unlike typical implementations, this does *not* need to materialize a SxS tensor. Instead, FlexAttention computes the bias values “on the fly” within the kernel, leading to significant memory and performance improvements. - -![relative position encoding](/assets/images/flexattention/fg5.png){:style="width:100%"} - - -### ALiBi Bias - -![alibi bias](/assets/images/flexattention/fg6.png){:style="max-width:600px; display:block; margin-left: auto; margin-right: auto; width:100%"} -

        Source: Train Short, Test Long: Attention with Linear Biases Enables Input Length Extrapolation

        - -ALiBi was introduced in [Train Short, Test Long: Attention with Linear Biases Enables Input Length Extrapolation](https://arxiv.org/abs/2108.12409), and claims to have beneficial properties for length extrapolation at inference. Notably, MosaicML has pointed to [“lack of kernel support”](https://twitter.com/jefrankle/status/1804567458092605736) as the main reason why they eventually switched from ALiBi to rotary embeddings. - -Alibi is similar to relative positional encodings with one exception \- it has a per-head factor that is typically precomputed. - -```py -alibi_bias = generate_alibi_bias() # [num_heads] - -def alibi(score, b, h, q_idx, kv_idx): - bias = alibi_bias[h] * (kv_idx - q_idx) - return score + bias -``` - -This demonstrates one interesting piece of flexibility `torch.compile` provides \- we can load from `alibi_bias` even though it *wasn’t explicitly passed in as an input*\! The generated Triton kernel will calculate the correct loads from the `alibi_bias` tensor and fuse it. Note that you could regenerate `alibi_bias` and we still wouldn’t need to recompile. - -### Soft-capping - -Soft-capping is a technique used in [Gemma2](https://huggingface.co/blog/gemma2#soft-capping-and-attention-implementations) and Grok-1 that prevents logits from growing excessively large. In FlexAttention, it looks like: - -```py -softcap = 20 -def soft_cap(score, b, h, q_idx, kv_idx): - score = score / softcap - score = torch.tanh(score) - score = score * softcap - return score -``` - -Note that we also automatically generate the backwards pass from the forwards pass here. Also, although this implementation is semantically correct, we likely want to use a tanh approximation in this case for performance reasons. See [attention-gym](https://github.com/pytorch-labs/attention-gym/blob/main/attn_gym/mods/softcapping.py) for more details. - -### Causal Mask - -Although bidirectional attention is the simplest, the original *Attention is All You Need* paper and the vast majority of LLMs use attention in a decoder-only setting where each token can only attend to the tokens prior to it. Folks often think of this as a lower-triangular mask, but with the `score_mod` API it can be expressed as: - -```py -def causal_mask(score, b, h, q_idx, kv_idx): - return torch.where(q_idx >= kv_idx, score, -float("inf")) -``` - -Basically, if the query token is “after” the key token, we keep the score. Otherwise, we mask it out by setting it to \-inf, thus ensuring it won’t participate in the softmax calculation. - -However, masking is special compared to other modifications \- if something is masked out, we can completely skip its computation\! In this case, a causal mask has about 50% sparsity, so not taking advantage of the sparsity would result in a 2x slowdown. Although this `score_mod` is sufficient to implement causal masking *correctly*, getting the performance benefits of sparsity requires another concept \- `mask_mod`. - -## Mask Mods - -To take advantage of sparsity from masking, we need to do some more work. Specifically, by passing a `mask_mod` to [`create_block_mask`](https://github.com/pytorch/pytorch/blob/e49c0acc396e89baf8c6450e1fa0571d4ce2d4ed/torch/nn/attention/flex_attention.py#L594), we can create a `BlockMask`. FlexAttention can then use `BlockMask` to take advantage of the sparsity\! - -The signature of `mask_mod` is very similar to `score_mod` \- just without the `score`. 
In particular - -```py -# returns True if this position should participate in the computation -mask_mod(b, h, q_idx, kv_idx) => bool -``` - -Note that `score_mod` is strictly *more* expressive than `mask_mod`. However, for masking, it’s recommended to use `mask_mod` and `create_block_mask`, as it’s more performant. See the FAQ on why `score_mod` and `mask_mod` are separate. - -Now, let’s take a look at how we might implement causal mask with `mask_mod`. - -### Causal Mask - -```py -from torch.nn.attention.flex_attention import create_block_mask - -def causal(b, h, q_idx, kv_idx): - return q_idx >= kv_idx - -# Because the sparsity pattern is independent of batch and heads, we'll set them to None (which broadcasts them) -block_mask = create_block_mask(causal, B=None, H=None, Q_LEN=1024, KV_LEN=1024) -# In this case, we don't need a score_mod, so we won't pass any in. -# However, score_mod can still be combined with block_mask if you need the additional flexibility. -flex_attention(query, key, value, block_mask=block_mask) -``` - -Note that `create_block_mask` is a **relatively expensive operation\!** Although FlexAttention will not need to recompile when it changes, if you aren’t careful about caching it, it can lead to significant slowdowns (check out the FAQ for suggestions on best practices). - -![flexattention performance charts](/assets/images/flexattention/fg7.png){:style="width:100%"} - -While the TFlops are roughly the same, the execution time is 2x faster for the mask\_mod version\! This demonstrates that we can leverage the sparsity that BlockMask provides us *without* losing hardware efficiency. - -### Sliding Window \+ Causal - -![Sliding Window Causal diagrams](/assets/images/flexattention/fg8.png){:style="width:100%"} -

        Source: Mistral 7B

        - - -Popularized by [Mistral](https://arxiv.org/abs/2310.06825), sliding window attention (also known as local attention) takes advantage of the intuition that the most recent tokens are the most useful. In particular, it allows the query token to only attend to, say, the 1024 most recent tokens. This is often used together with causal attention. - -```py -SLIDING_WINDOW = 1024 - -def sliding_window_causal(b, h, q_idx, kv_idx): - causal_mask = q_idx >= kv_idx - window_mask = q_idx - kv_idx <= SLIDING_WINDOW - return causal_mask & window_mask - -# If you want to be cute... -from torch.nn.attention import and_masks - -def sliding_window(b, h, q_idx, kv_idx) - return q_idx - kv_idx <= SLIDING_WINDOW - -sliding_window_causal = and_masks(causal_mask, sliding_window) -``` - -We benchmark it against `F.scaled_dot_product_attention` with a sliding window mask as well as FA2 with a causal mask (as a reference point for performance). Not only are we significantly faster than `F.scaled_dot_product_attention`, we’re *also* significantly faster than FA2 with a causal mask as this mask has significantly more sparsity. - -![execution time charts](/assets/images/flexattention/fg9.png){:style="max-width:600px; display:block; margin-left: auto; margin-right: auto; width:100%"} - -### PrefixLM - -![PrefixLM diagram](/assets/images/flexattention/fg10.png){:style="max-width:600px; display:block; margin-left: auto; margin-right: auto; width:100%"} -

        Source: PaliGemma: A versatile 3B VLM for transfer

        - -The T5 architecture, proposed in [Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer](https://arxiv.org/abs/1910.10683), describes an attention variant that performs full bidirectional attention on a “prefix”, and causal attention on the rest. We again compose two mask functions to accomplish this, one for causal masking and one that is based off of the prefix length. - -```py -prefix_length: [B] -def prefix_mask(b, h, q_idx, kv_idx): - return kv_idx <= prefix_length[b] - -prefix_lm_causal = or_masks(prefix_mask, causal_mask) -# In this case, our mask is different per sequence so we set B equal to our batch size -block_mask = create_block_mask(prefix_lm_causal, B=B, H=None, S, S) -``` - -Just like with `score_mod`, `mask_mod` allows us to refer to additional tensors that aren’t explicitly an input to the function\! However, with prefixLM, the sparsity pattern changes *per* *input*. This means that for each new input batch, we’ll need to recompute the `BlockMask`. One common pattern is to call `create_block_mask` at the beginning of your model and reuse that `block_mask` for all attention calls in your model. See *Recomputing Block Masks vs. Recompilation.* - -However, in exchange for that, we’re not only able to have an efficient attention kernel for prefixLM, we’re *also* able to take advantage of however much sparsity exists in the input\! FlexAttention will dynamically adjust its performance based off of the BlockMask data, *without* needing to recompile the kernel. - -### Document Masking/Jagged Sequences - -Another common attention variant is document masking/jagged sequences. Imagine that you have a number of sequences of varying length. You want to train on all of them together, but unfortunately, most operators only accept rectangular tensors. - -Through `BlockMask`, we can support this efficiently in FlexAttention as well\! - -1. First, we flatten all sequences into a single sequence with sum(sequence lengths) tokens. -2. Then, we compute the document\_id that each token belongs to. -3. Finally, in our `mask_mod`, we simply whether the query and kv token belong to the same document\! - -```py -# The document that each token belongs to. -# e.g. [0, 0, 0, 1, 1, 2, 2, 2, 2, 2, 2] corresponds to sequence lengths 3, 2, and 6. -document_id: [SEQ_LEN] - -def document_masking(b, h, q_idx, kv_idx): - return document_id[q_idx] == document_id[kv_idx] -``` - -And that’s it\! In this case, we see that we end up with a blockdiagonal mask. - -![blockdiagonal mask](/assets/images/flexattention/fg11.png){:style="max-width:600px; display:block; margin-left: auto; margin-right: auto; width:100%"} - - -One interesting aspect about document masking is that it’s easy to see how it might combine with an arbitrary combination of other masks . For example, we already defined `prefixlm_mask` in the previous section. Do we now need to define a `prefixlm_document_mask` function as well? - -In these cases, one pattern we’ve found quite useful is what we call a “higher level modification”. In this case, we can take an existing `mask_mod` and automatically transform it into one that works with jagged sequences\! 
- -```py -def generate_doc_mask_mod(mask_mod, document_id): - # Get unique document IDs and their counts - _, counts = torch.unique_consecutive(document_id, return_counts=True) - # Create cumulative counts (offsets) - offsets = torch.cat([torch.tensor([0], device=document_id.device), counts.cumsum(0)[:-1]]) - def doc_mask_wrapper(b, h, q_idx, kv_idx): - same_doc = document_id[q_idx] == document_id[kv_idx] - q_logical = q_idx - offsets[document_id[q_idx]] - kv_logical = kv_idx - offsets[document_id[kv_idx]] - inner_mask = mask_mod(b, h, q_logical, kv_logical) - return same_doc & inner_mask - return doc_mask_wrapper -``` - -For example, given the `prefix_lm_causal` mask from above, we can transform it into one that works on on packed documents like so: - -```py -prefix_length = torch.tensor(2, dtype=torch.int32, device="cuda") -def prefix_mask(b, h, q_idx, kv_idx): - return kv_idx < prefix_length -prefix_lm_causal = or_masks(prefix_mask, causal_mask) -doc_prefix_lm_causal_mask = generate_doc_mask_mod(prefix_lm_causal, document_id) -``` - -![blockdiagonal mask](/assets/images/flexattention/fg12.png){:style="max-width:600px; display:block; margin-left: auto; margin-right: auto; width:100%"} - -Now, this mask is “block-prefixLM-diagonal” shaped. :) - -That’s all of our examples\! There are far more attention variants than we have space to list, so check out [Attention Gym](https://github.com/pytorch-labs/attention-gym) for more examples. We hope that the community will contribute some of their favorite applications of FlexAttention as well. - -### FAQ - -##### **Q: When does FlexAttention need to recompile?** - -As FlexAttention leverages `torch.compile` for graph capture, it can actually avoid recompilation in a broad spectrum of cases. Notably, it does *not* need to recompile even if captured tensors change values\! - -```py -flex_attention = torch.compile(flex_attention) -def create_bias_mod(bias) - def bias_mod(score, b, h, q_idx, kv_idx): - return score + bias - return bias_mod -bias_mod1 = create_bias_mod(torch.tensor(0)) -flex_attention(..., score_mod=bias_mod1) # Compiles the kernel here - -bias_mod2 = create_bias_mod(torch.tensor(2)) -flex_attention(..., score_mod=bias_mod2) # Doesn't need to recompile! -``` - -Even changing the block-sparsity doesn’t require a recompile. However, if the block-sparsity changes, we do need to *recompute* the BlockMask. - -##### **Q: When should we recompute the BlockMask?** - -We need to recompute the BlockMask whenever the block-sparsity changes. Although computing the BlockMask is much cheaper than recompilation (on the order of hundreds of microseconds as opposed to seconds), you should still take care to not excessively recompute the BlockMask. - -Here are some common patterns and some recommendations on how you might approach them. - -**Mask never changes (e.g. causal mask)** -In this case, you can simply precompute the block mask and cache it globally, reusing it for all attention calls. - -```py -block_mask = create_block_mask(causal_mask, 1, 1, S,S) -causal_attention = functools.partial(flex_attention, block_mask=block_mask) -``` - -**Mask changes every batch (e.g. document masking)** -In this case, we would suggest computing the BlockMask at the beginning of the model and threading it through the model \- reusing the BlockMask for all layers. 
- -```py -def forward(self, x, doc_mask): - # Compute block mask at beginning of forwards - block_mask = create_block_mask(doc_mask, None, None, S, S) - x = self.layer1(x, block_mask) - x = self.layer2(x, block_mask) - ... - # amortize block mask construction cost across all layers - x = self.layer3(x, block_mask) - return x -``` - -**Mask changes every layer (e.g. data-dependent sparsity)** -This is the hardest setting, since we’re unable to amortize the block mask computation across multiple FlexAttention invocations. Although FlexAttention can certainly still benefit this case, the actual benefits from BlockMask depend on how sparse your attention mask is and how fast we can construct the BlockMask. That leads us to... - -##### **Q: How can we compute BlockMask quicker?** - -`create_block_mask` is unfortunately fairly expensive, both from a memory and compute perspective, as determining whether a block is completely sparse requires evaluating `mask_mod` at every single point in the block. There are a couple ways to address this: - -1. If your mask is the same across batch size or heads, make sure that you’re broadcasting over those (i.e. set them to `None` in `create_block_mask`). -2. Compile `create_block_mask`. Unfortunately, today, `torch.compile` does not work directly on `create_block_mask` due to some unfortunate limitations. However, you can set `_compile=True`, which will significantly reduce the peak memory and runtime (often an order of magnitude in our testing). -3. Write a custom constructor for BlockMask. The metadata for BlockMask is quite simple (see the [documentation](https://pytorch.org/docs/main/nn.attention.flex_attention.html#blockmask)). It’s essentially two tensors. - a. `num_blocks`: The number of KV blocks computed for each query block. - b. `indices`: The positions of the KV blocks computed for each query block. - - For example, here’s a custom BlockMask constructor for `causal_mask`. - -```py -def create_causal_mask(S): - BLOCK_SIZE = 128 - # The first query block computes one block, the second query block computes 2 blocks, etc. - num_blocks = torch.arange(S // BLOCK_SIZE, device="cuda") + 1 - # Since we're always computing from the left to the right, - # we can use the indices [0, 1, 2, ...] for every query block. - indices = torch.arange(S // BLOCK_SIZE, device="cuda").expand( - S // BLOCK_SIZE, S // BLOCK_SIZE - ) - num_blocks = num_blocks[None, None, :] - indices = indices[None, None, :] - return BlockMask(num_blocks, indices, BLOCK_SIZE=BLOCK_SIZE, mask_mod=causal_mask) -``` - -##### **Q: Why are `score_mod` and `mask_mod` different? Isn’t `mask_mod` just a special case of `score_mod`?** - -Very astute question, hypothetical audience member\! In fact, any `mask_mod` can be easily converted to a `score_mod` (we do not recommend using this function in practice\!) - -```py -def mask_mod_as_score_mod(b, h, q_idx, kv_idx): - return torch.where(mask_mod(b, h, q_idx, kv_idx), score, -float("inf")) -``` - -So, if `score_mod` can implement everything `mask_mod` can, what’s the point of having `mask_mod`? - -One immediate challenge: a `score_mod` requires the actual `score` value as an input, but when we’re precomputing the BlockMask, we don’t have the actual `score` value. We can perhaps fake the values by passing in all zeros, and if the `score_mod` returns `-inf`, then we consider it to be masked (in fact, we originally did this\!). - -However, there are two issues. 
The first is that this is hacky \- what if the user’s `score_mod` returned `-inf` when the input is 0? Or what if the user’s `score_mod` masked out with a large negative value instead of `-inf`? It seems we’re trying to cram a round peg into a square hole. However, there’s a more important reason to separate out `mask_mod` from `score_mod` \- it’s fundamentally more efficient\!. - -As it turns out, applying masking to every single computed element is actually quite expensive \- our benchmarks see about a 15-20% degradation in performance\! So, although we can get significant speedups by skipping half the computation, we lose a meaningful part of that speedup from needing to mask out every element\! - -Luckily, if we visualize the causal mask, we notice that the vast majority of blocks do not require a “causal mask” at all \- they’re fully computed\! It is only the blocks on the diagonal, partially computed and partially masked, that require masking to be applied. - -![blockdiagonal mask](/assets/images/flexattention/fg13.png){:style="width:100%"} - -The BlockMask previously told us which blocks we need to compute and which blocks we can skip. Now, we further augment this data structure to also tell us which blocks are “fully computed” (i.e. masking can be skipped) vs. “partially computed” (i.e. a mask needs to be applied). Note, however, that although masks can be skipped on “fully computed” blocks, other `score_mod`s like relative positional embeddings still need to be applied. - -Given just a `score_mod`, there’s no sound way for us to tell which parts of it are “masking”. Hence, the user must separate these out themselves into `mask_mod`. - -##### **Q: How much additional memory does the BlockMask need?** - -The BlockMask metadata is of size `[BATCH_SIZE, NUM_HEADS, QUERY_LEN//BLOCK_SIZE, KV_LEN//BLOCK_SIZE].` If the mask is the same across the batch or heads dimension it can be broadcasted over that dimension to save memory. - -At the default `BLOCK_SIZE` of 128, we expect that the memory usage will be fairly negligible for most use cases. For example, for a sequence length of 1 million, the BlockMask would only use 60MB of additional memory. If this is a problem, you can increase the block size: `create_block_mask(..., BLOCK_SIZE=1024).` For example, increasing `BLOCK_SIZE` to 1024 would result in this metadata dropping to under a megabyte. - -##### **Q: How do the numerics compare?** - -Although the results are not bitwise identical, we are confident that FlexAttention is as numerically accurate as FlashAttention. We generate the following distribution of differences comparing FlashAttention versus FlexAttention over a large range of inputs on both causal and non causal attention variants. The errors are nearly identical. - -![distribution chart](/assets/images/flexattention/fg14.png){:style="width:100%"} - -### Performance - -Generally speaking, FlexAttention is nearly as performant as a handwritten Triton kernel, which is unsurprising, as we heavily leverage a handwritten Triton kernel. However, due to its generality, we do incur a small performance penalty. For example, we must incur some additional latency to determine which block to compute next. In some cases, we provide some kernel options that can affect the performance of the kernel while changing its behavior. 
They can be found here: [performance knobs](https://github.com/pytorch/pytorch/blob/ee09d066d35d7e17cf7e9479c0b8bfc70cffc264/torch/_inductor/kernel/flex_attention.py#L146-L155) - -As a case study, let's explore how the knobs affect the performance of causal attention. We will compare performance of the triton kernel versus FlashAttentionv2 on A100. The script can be found [here](https://github.com/pytorch/pytorch/blob/main/benchmarks/transformer/score_mod.py). - -FlexAttention achieves 90% of FlashAttention2's performance in the forward pass and 85% in the backward pass. FlexAttention is currently utilizing a deterministic algorithm that recomputes more intermediates than FAv2, but we have plans to improve FlexAttention’s backward algorithm and hope to close this gap\! - -![flexattention speed chart](/assets/images/flexattention/fg15.png){:style="width:100%"} - -![flexattention speed chart](/assets/images/flexattention/fg16.png){:style="width:100%"} - -## Conclusion - -We hope you have as much fun using FlexAttention as we did developing it\! While working on this, we ended up finding way more applications of this API than we could have expected. We’ve already seen it accelerate torchtune’s [sample packing throughput by 71%](https://github.com/pytorch/torchtune/pull/1193), replace the need for a researcher to spend over a week writing their own custom Triton kernel, and deliver competitive performance with custom handwritten attention variants. - -One final thing that made implementing FlexAttention quite fun is that we were able to leverage a lot of existing PyTorch infra in an interesting way. For example, one of the unique aspects about TorchDynamo (torch.compile’s frontend) is that it does *not* require tensors used in the compiled function to be explicitly passed in as inputs. This allows us to compile mods like document masking, which require accessing *global* variables where the global variables need to change\! - -```py -bias = torch.randn(1024, 1024) -def score_mod(score, b, h, q_idx, kv_idx): - return score + bias[q_idx][kv_idx] # The bias tensor can change! -``` - -Furthermore, the fact that `torch.compile` is a generic graph-capture mechanism also allows it to support more “advanced” transformations, such as the higher order transform that transforms any `mask_mod` into one that works with jagged tensors. - -We also leverage TorchInductor (torch.compile’s backend) infrastructure for Triton templates. Not only did this make it easy to support codegening FlexAttention \- it also automatically gave us support for dynamic shapes as well as epilogue fusion (i.e. fusing an operator onto the end of attention)\! In the future, we plan on extending this support to allow for quantized versions of attention or things like [RadixAttention](https://lmsys.org/blog/2024-01-17-sglang/) as well. - -In addition, we also leveraged higher order ops, PyTorch’s autograd to automatically generate the backwards pass, as well as vmap to automatically apply `score_mod` for creating the BlockMask. - -And, of course, this project wouldn’t have been possible without Triton and TorchInductor’s ability to generate Triton code. - -We look forward to leveraging the approach we used here to more applications in the future\! 
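As a rough illustration of the vmap point above, the sketch below broadcasts a scalar `mask_mod`-style function over index grids with `torch.vmap` to materialize a small dense mask. FlexAttention's actual BlockMask construction is more involved, so treat this purely as a sketch of the idea; the names `causal` and `grid_fn` are illustrative.

```py
import torch

def causal(b, h, q_idx, kv_idx):
    # Same scalar signature as a mask_mod: True if (q_idx, kv_idx) should participate.
    return q_idx >= kv_idx

S = 8
b = torch.tensor(0)
h = torch.tensor(0)
q = torch.arange(S)
kv = torch.arange(S)

# vmap over kv_idx (inner), then over q_idx (outer), evaluating the scalar
# function on the full S x S grid of index pairs.
grid_fn = torch.vmap(torch.vmap(causal, in_dims=(None, None, None, 0)),
                     in_dims=(None, None, 0, None))
dense_mask = grid_fn(b, h, q, kv)  # bool tensor of shape [S, S], lower-triangular
print(dense_mask)
```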
- -### Limitations and Future Work - -- FlexAttention is currently available in PyTorch nightly releases, we plan to release it as a prototype feature in 2.5.0 -- We did not cover how to use FlexAttention for inference here (or how to implement PagedAttention) \- we will cover those in a later post. -- We are working to improve the performance of FlexAttention to match FlashAttention3 on H100 GPUs. -- FlexAttention requires that all sequence lengths be a multiple of 128 \- this will be addressed soon. -- We plan on adding GQA support soon \- for now, you can just replicate the kv heads. - - -### Acknowledgements - -We want to highlight some prior work (and people) that have inspired FlexAttention. - -- Tri Dao's work on FlashAttention -- Francisco Massa and the Xformers team for BlockSparseAttention in Triton -- The Jax team's work on SplashAttention -- Philippe Tillet and Keren Zhou for helping us with Triton -- Ali Hassani for discussions on neighborhood attention -- Everybody who's complained about attention kernels not supporting their favorite attention variant :) diff --git a/_posts/2024-08-20-clipping-in-opacus.md b/_posts/2024-08-20-clipping-in-opacus.md deleted file mode 100644 index 2019c68da3c8..000000000000 --- a/_posts/2024-08-20-clipping-in-opacus.md +++ /dev/null @@ -1,362 +0,0 @@ ---- -layout: blog_detail -title: "Enabling Fast Gradient Clipping and Ghost Clipping in Opacus" -author: Enayat Ullah, Huanyu Zhang, Will Bullock, Ilya Mironov ---- - -## Introduction and Context - -[Differentially Private Stochastic Gradient Descent (DP-SGD)](https://arxiv.org/abs/1607.00133) is the canonical method for training machine learning models with differential privacy. It involves the following two modifications to its non-private counterpart, Stochastic Gradient Descent. - -1. **Per-sample gradient clipping**: Clip gradients with respect to every sample in the mini-batch, ensuring that its norm is at most a pre-specified value, “Clipping Norm”, C, in every iteration. - -2. **Noise addition**: Add Gaussian noise of pre-specified variance, depending on the clipping norm and privacy parameters, to the average clipped gradient, in every iteration. - -The first change, **per-sample gradient clipping**, introduces additional complexities since, in general, it requires instantiating **per-sample** **gradients**. - -[Opacus](http://opacus.ai) is a PyTorch implementation of DP-SGD. Opacus addresses the above task by employing [hook functions](https://medium.com/pytorch/differential-privacy-series-part-2-efficient-per-sample-gradient-computation-in-opacus-5bf4031d9e22), which allows intervening on specific events, such as forward and backward passes. For more details about Opacus, we encourage readers to review the previous blog posts: [DP-SGD Algorithm Explained](https://bit.ly/dp-sgd-algorithm-explained), [Efficient Per-Sample Gradient Computation in Opacus](https://medium.com/pytorch/differential-privacy-series-part-2-efficient-per-sample-gradient-computation-in-opacus-5bf4031d9e22) and [Efficient Per-Sample Gradient Computation for More Layers in Opacus](https://pytorch.medium.com/differential-privacy-series-part-3-efficient-per-sample-gradient-computation-for-more-layers-in-39bd25df237). - -While Opacus provides substantial efficiency gains compared to the naive approaches, the memory cost of instantiating per-sample gradients is significant. In particular, memory usage is proportional to the batch size times the number of trainable parameters. 
Consequently, memory limits Opacus to small batch sizes and/or small models, significantly restricting its range of applications. - -We introduce [Fast Gradient Clipping](https://arxiv.org/abs/2009.03106) and [Ghost Clipping](https://arxiv.org/abs/2110.05679) to Opacus, which enable developers and researchers to perform gradient clipping without instantiating the per-sample gradients. As an example, this allows for fine-tuning 7M parameters of BERT, on a single 16GB GPU, with a batch size of 1024, with memory comparable to using PyTorch (without applying DP-SGD). In contrast, the previous version of Opacus, supported a maximum batch size of roughly 256 for the same setting. We provide a [tutorial](https://github.com/pytorch/opacus/blob/main/tutorials/building\_text\_classifier.ipynb) on how to use Fast Gradient Clipping in Opacus with the aforementioned task as an example. - -## Fast Gradient Clipping and Ghost Clipping - -The key idea behind these techniques is based on the following observation: suppose per-sample gradient norms are known, then gradient clipping can be achieved by backpropagation on a re-weighted loss function $ \bar{L} $. This loss function is defined as $ \bar{L} = \sum_{i} R_{i} L_{i} $, where $ R_i = \min\left(\frac{C}{C_i}, 1\right) $ are the clipping coefficients computed from the per-sample gradient norms $ {C_i} $ and $ {L_i} $ are per-sample losses. - -The above idea may seem circular at first glance, as it appears to require instantiating per-sample gradients in order to calculate per-sample gradient norms. However, for certain widely-used components of neural network architectures, such as fully connected/linear layers, it is indeed possible to obtain per-sample gradient norms in a single backpropagation pass without the need for per-sample gradients. This suggests a workflow that involves two backpropagation passes: the first to compute per-sample gradient norms, and the second to compute the aggregated (not per-sample) clipped gradient. The second backpropagation is simply the standard batched backpropagation. - -![backpropagation diagram](/assets/images/clipping-in-opacus/fg1.jpg){:style="max-width:800px; display:block; margin-left: auto; margin-right: auto; width:100%"} - -![backpropagation diagram](/assets/images/clipping-in-opacus/fg2.png){:style="max-width:400px; display:block; margin-left: auto; margin-right: auto; width:100%"} - -_Figure 1: Comparison between vanilla **Opacus** (top left), **Fast Gradient Clipping** (top right), and **Ghost clipping** (bottom). We marked in red gradient instantiations that become memory bottlenecks. For vanilla Opacus, it has to instantiate the **per-sample gradients**. **Fast Gradient Clipping** instantiates per-sample gradients for each layer to compute its norm, which is immediately released once the backward pass moves on to the next layer. Ghost Clipping works directly from **per-sample activation gradients** and **per-sample activations**, and avoids the need for gradient instantiation._ - -[**Fast Gradient Clipping**](https://arxiv.org/abs/2009.03106) -In Fast Gradient Clipping, the per-sample gradient norm is calculated in three steps: - -1. For each layer, the per-sample gradient is instantiated and its norm is calculated. -2. The per-sample gradient is then immediately discarded. -3. The (squared) per-sample gradient norms of each layer are summed up to obtain the overall (squared) per-sample gradient norm. 
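To make the re-weighted-loss idea concrete, below is a minimal sketch of the second backward pass, assuming the per-sample losses and per-sample gradient norms are already available (in Opacus, the norms come from the first, hook-instrumented backward pass). The helper name `clipped_backward` and the small stabilizing epsilon are illustrative, not part of the Opacus API.

```py
import torch

def clipped_backward(per_sample_losses: torch.Tensor,
                     per_sample_grad_norms: torch.Tensor,
                     max_grad_norm: float) -> None:
    # Clipping coefficients R_i = min(C / C_i, 1), computed from the
    # per-sample gradient norms C_i obtained in the first backward pass.
    coeffs = (max_grad_norm / (per_sample_grad_norms + 1e-6)).clamp(max=1.0)
    # Backpropagating through the re-weighted loss sum_i R_i * L_i yields the
    # sum of clipped per-sample gradients in one standard, batched backward pass.
    (coeffs.detach() * per_sample_losses).sum().backward()
```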
- - -[**Ghost Clipping**](https://arxiv.org/abs/2110.05679) -Extending the approach of Fast Gradient Clipping, Ghost Clipping uses the [fact](https://arxiv.org/abs/1510.01799) that for **linear layers[^1],** per-sample gradient norms can be calculated just from **activation gradients** and **activations**. In particular, let `backprops` and `activations` be per-sample activation gradients and activations, of dimensions `batch_size ✕ output_width` and `batch_size ✕ input_width`, respectively. The per-sample gradient is the outer product of the two, which takes `O(batch_size ✕ input_width ✕ output_width)` time and space. - -The [ghost clipping trick](https://arxiv.org/abs/1510.01799) instead calculates the (squared) norm of `backprops` and `activations`, sample-wise, and takes their product, which gives the (squared) norm of the gradient. This takes `O(batch-size ✕ (input_width + output_width))` time and takes `O(batch-size)` space to store. Since **per-sample activation** and **per-sample activation gradients** are already stored, additional memory is needed only for storing the norms. - -**Relationship between Fast Gradient Clipping and Ghost Clipping** - -1. Fast Gradient Clipping and Ghost Clipping are complementary techniques. Fast Gradient Clipping can be applied to any type of layer, while Ghost Clipping is a strictly better technique for supported layers. -2. Our implementation automatically switches to Fast Gradient Clipping when the layer is not supported by Ghost Clipping. - -### How to use Fast Gradient Clipping in Opacus - -The training loop is identical to that of the standard PyTorch loop. As in Opacus before, we use the `PrivacyEngine()`, which “sanitizes” the model and optimizer. To enable Ghost Clipping, the argument `grad_sample_mode="ghost"` is used. Additionally, `make_private()` takes the loss criterion as an extra input and sanitizes it. This allows us to hide the two backward passes and the loss rescaling in between in `loss.backward()`. - -```py -from opacus import PrivacyEngine -criterion = nn.CrossEntropyLoss() # example loss function - -privacy_engine = PrivacyEngine() -model_gc, optimizer_gc, criterion_gc, train_loader, = privacy_engine.make_private( - module=model, - optimizer=optimizer, - data_loader=train_loader, - noise_multiplier=noise_multiplier - max_grad_norm=max_grad_norm, - criterion=criterion, - grad_sample_mode="ghost", -) - -# The training loop below is identical to that of PyTorch - -for input_data, target_data in train_loader: - output_gc = model_gc(input_data) # Forward pass - optimizer_gc.zero_grad() - loss = criterion_gc(output_gc, target_data) - loss.backward() - optimizer_gc.step() # Add noise and update the model -``` - -Internally, before the first pass, we enable the *hooks*, which allows us to capture layer-wise values corresponding to forward and backward calls. They are used to compute the per-sample gradient norms. We then compute the clipping coefficients, rescale the loss function and disable hooks, which lets us use the standard PyTorch backward pass. - -### Memory Complexity Analysis - - Consider a multi-layer neural network with the following properties: - -**L**: Number of layers -**d**: Maximum layer width -**B**: Batch size -**K**: Number of non-supported/non-linear layers - -The memory overhead of DP-SGD with Ghost Clipping compared to plain (PyTorch) SGD is an additive O(BL), required to store the per-sample gradient norms for all layers. 
Further, if there is a non-supported layer (if K≥1), then there is an additional O(Bd2) memory to instantiate the gradient of that layer. - -### Memory Benchmarking - -We provide results on the memory usage for a variety of settings. - -#### Fine-Tuning BERT - -We consider the problem of [privately fine-tuning](https://github.com/pytorch/opacus/blob/main/tutorials/building\_text\_classifier.ipynb) the last three layers of BERT for a text classification task. The base model has over 100M parameters, of which we fine-tune the last three layers, `BertEncoder,` `BertPooler,` and `Classifier`, comprising roughly 7.6M parameters. The experiments are run on a P100 GPU with 16 GB of memory. - -The following table reports the maximum memory and time taken per iteration for the various methods: - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
| Method | B = 32 | B = 128 | B = 512 | B = 1024 | B = 2048 |
|---|---|---|---|---|---|
| PyTorch SGD | 236 MB / 0.15 s | 1.04 GB / 0.55 s | 5.27 GB / 2.1 s | 12.7 GB / 4.2 s | OOM |
| DP-SGD | 1,142 MB / 0.21 s | 4.55 GB / 0.68 s | OOM | OOM | OOM |
| FGC DP-SGD | 908 MB / 0.21 s | 3.6 GB / 0.75 s | OOM | OOM | OOM |
| GC DP-SGD | 362 MB / 0.21 s | 1.32 GB / 0.67 s | 5.27 GB / 2.5 s | 12.7 GB / 5 s | OOM |

_Each cell reports maximum memory / time per iteration at the given batch size B._
        - - - -In terms of peak memory footprint, DP-SGD \> FGC DP-SGD ≫ GC DP-SGD ≈ PyTorch SGD. Further, the runtimes are similar because most of the parameters are frozen and the forward pass takes up most of the time. - -#### Synthetic Setup: Memory Profiling - -We consider the following setup to profile the memory used by PyTorch SGD, Vanilla DP-SGD and Ghost Clipping, GC DP-SGD. - -* 2-layer fully connected neural network - * Input: 5120 - * Hidden: 2560 - * Output: 1280 - * Total number of model parameters \= 15.6M - * Model size \= 62.5 MB -* Batch size, different values, as seen in the table below. - -The table below summarizes the max memory increase (in MB) broken down by stages of the training loop for each of the methods. - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
| Batch Size | Method | Model to GPU | Forward | First Backward | Second Backward | Optimizer Step |
|---|---|---|---|---|---|---|
| 32 | PyTorch SGD | 62.5 | 0.5 | 62.5 | N/A | 0 |
| 32 | Vanilla DP-SGD | 62.5 | 0.47 | 3,663 | N/A | 162.5 |
| 32 | GC DP-SGD | 62.5 | 0.47 | 63.13 | 50 | 125 |
| 2<sup>17</sup> | PyTorch SGD | 62.5 | 1920 | 1932.5 | N/A | 0 |
| 2<sup>17</sup> | Vanilla DP-SGD | OOM | | | | |
| 2<sup>17</sup> | GC DP-SGD | 62.5 | 1920 | 2625 | 1932.5 | 125 |
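For readers who want to reproduce a stage-by-stage breakdown like the one above, the sketch below shows one way to measure per-stage peak memory with PyTorch's CUDA memory statistics on the two-layer network described earlier. It is not the script used to produce the table, and the exact numbers will vary with hardware and PyTorch version.

```py
import torch
import torch.nn as nn

MB = 2**20

def measure(stage_fn):
    """Return the peak memory increase (in MB) caused by running stage_fn."""
    torch.cuda.synchronize()
    torch.cuda.reset_peak_memory_stats()
    start = torch.cuda.memory_allocated()
    stage_fn()
    torch.cuda.synchronize()
    return (torch.cuda.max_memory_allocated() - start) / MB

# Two fully connected layers with the widths described above (5120 -> 2560 -> 1280).
model = nn.Sequential(nn.Linear(5120, 2560), nn.ReLU(), nn.Linear(2560, 1280)).cuda()
opt = torch.optim.SGD(model.parameters(), lr=0.1)

x = torch.randn(32, 5120, device="cuda")
y = torch.randn(32, 1280, device="cuda")
state = {}

print("Forward (MB):", measure(lambda: state.update(loss=nn.functional.mse_loss(model(x), y))))
print("Backward (MB):", measure(lambda: state["loss"].backward()))
print("Optimizer step (MB):", measure(opt.step))
```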
        - - -#### Industry use case - -We tested Ghost Clipping DP-SGD on an internal Meta use case, consisting of a model of size roughly 100B with 40M trainable parameters. Our initial results show that Ghost Clipping SGD reduces 95% memory of vanilla DP-SGD, and achieves comparable memory usage to PyTorch SGD. - -## Conclusion - -In this post, we describe implementations of Fast Gradient Clipping and Ghost Clipping in Opacus that enable memory-efficient training of machine learning models with differential privacy. Currently, the Ghost Clipping implementation only applies to linear layers, but, as outlined in [part 3 of the series](https://pytorch.medium.com/differential-privacy-series-part-3-efficient-per-sample-gradient-computation-for-more-layers-in-39bd25df237), it can be extended to “generalized” linear layers such as convolutions and multi-head attention. The current techniques require two explicit backpropagation steps, which increases runtime. We will explore developments on top of Ghost Clipping such as the [Book-Keeping algorithm](https://arxiv.org/abs/2210.00038) for mitigation. - -To learn more about Opacus, visit [opacus.ai](https://opacus.ai/) and [github.com/pytorch/opacus](https://github.com/pytorch/opacus). - -## Acknowledgements - -We thank Iden Kalemaj, Darren Liu, Karthik Prasad, Hao Shi, Igor Shilov, Davide Testuggine, Eli Uriegas, Haicheng Wang, and Richard Zou for valuable feedback and suggestions. - -[^1]: There are [ways](https://proceedings.neurips.cc/paper\_files/paper/2023/file/a45d344b28179c8da7646bc38ff50ad8-Paper-Conference.pdf) to extend Ghost Clipping to non-linear layers. diff --git a/_posts/2024-08-29-intel-gpus-pytorch-2-4.md b/_posts/2024-08-29-intel-gpus-pytorch-2-4.md deleted file mode 100644 index 45a4c2c5af91..000000000000 --- a/_posts/2024-08-29-intel-gpus-pytorch-2-4.md +++ /dev/null @@ -1,63 +0,0 @@ ---- -layout: blog_detail -title: "Accelerate Your AI: PyTorch 2.4 Now Supports Intel GPUs for Faster Workloads" -author: the PyTorch Team at Intel ---- - -We have exciting news! PyTorch 2.4 now supports Intel® Data Center GPU Max Series and the SYCL software stack, making it easier to speed up your AI workflows for both training and inference. This update allows for you to have a consistent programming experience with minimal coding effort and extends PyTorch’s device and runtime capabilities, including device, stream, event, generator, allocator, and guard, to seamlessly support streaming devices. This enhancement simplifies deploying PyTorch on ubiquitous hardware, making it easier for you to integrate different hardware back ends. - -Intel GPU support upstreamed into PyTorch provides support for both eager and graph modes, fully running Dynamo Hugging Face benchmarks. Eager mode now includes common Aten operators implemented with SYCL. The most performance-critical graphs and operators are highly optimized by using oneAPI Deep Neural Network Library (oneDNN) and oneAPI Math Kernel Library (oneMKL). Graph mode (torch.compile) now has an enabled Intel GPU back end to implement the optimization for Intel GPUs and to integrate Triton. Furthermore, data types such as FP32, BF16, FP16, and automatic mixed precision (AMP) are supported. The PyTorch Profiler, based on Kineto and oneMKL, is being developed for the upcoming PyTorch 2.5 release. - -Take a look at the current and planned front-end and back-end improvements for Intel GPU upstreamed into PyTorch. 
- -![the current and planned front-end and back-end improvements for Intel GPU upstreamed into PyTorch](/assets/images/intel-gpus-pytorch-2-4.jpg){:style="width:100%"} - -PyTorch 2.4 on Linux supports Intel Data Center GPU Max Series for training and inference while maintaining the same user experience as other hardware. If you’re migrating code from CUDA, you can run your existing application on an Intel GPU with minimal changes—just update the device name from `cuda` to `xpu`. For example: - -``` -# CUDA Code -tensor = torch.tensor([1.0, 2.0]).to("cuda") - -# Code for Intel GPU -tensor = torch.tensor([1.0, 2.0]).to("xpu") -``` - -## Get Started - -Try PyTorch 2.4 on the Intel Data Center GPU Max Series through the [Intel® Tiber™ Developer Cloud](https://cloud.intel.com/). Get a tour of the [environment setup, source build, and examples](https://pytorch.org/docs/main/notes/get_start_xpu.html#examples). To learn how to create a free Standard account, see [Get Started](https://console.cloud.intel.com/docs/guides/get_started.html), then do the following: - -1. Sign in to the [cloud console](https://console.cloud.intel.com/docs/guides/get_started.html). - -2. From the [Training](https://console.cloud.intel.com/training) section, open the **PyTorch 2.4 on Intel GPUs** notebook. - -3. Ensure that the **PyTorch 2.4** kernel is selected for the notebook. - -## Summary - -PyTorch 2.4 introduces initial support for Intel Data Center GPU Max Series to accelerate your AI workloads. With Intel GPU, you’ll get continuous software support, unified distribution, and synchronized release schedules for a smoother development experience. We’re enhancing this functionality to reach Beta quality in PyTorch 2.5. Planned features in 2.5 include: - -* More Aten operators and full Dynamo Torchbench and TIMM support in Eager Mode. - -* Full Dynamo Torchbench and TIMM benchmark support in torch.compile. - -* Intel GPU support in torch.profile. - -* PyPI wheels distribution. - -* Windows and Intel Client GPU Series support. - -We welcome the community to evaluate these new contributions to [Intel GPU support on PyTorch](https://github.com/pytorch/pytorch?tab=readme-ov-file#intel-gpu-support). - -## Resources - -* [PyTorch 2.4: Get Started on an Intel GPU](https://pytorch.org/docs/main/notes/get_start_xpu.html) - -* [PyTorch Release Notes](https://github.com/pytorch/pytorch/releases) - -## Acknowledgments - -We want to thank the PyTorch open source community for their technical discussions and insights: [Nikita Shulga](https://github.com/malfet), [Jason Ansel](https://github.com/jansel), [Andrey Talman](https://github.com/atalman), [Alban Desmaison](https://github.com/alband), and [Bin Bao](https://github.com/desertfire). - -We also thank collaborators from PyTorch for their professional support and guidance.
- -1 To enable GPU support and improve performance, we suggest installing the [Intel® Extension for PyTorch](https://intel.github.io/intel-extension-for-pytorch/xpu/latest/) diff --git a/_posts/2024-09-04-cuda-free-inference-for-llms.md b/_posts/2024-09-04-cuda-free-inference-for-llms.md deleted file mode 100644 index 9d48de44aadc..000000000000 --- a/_posts/2024-09-04-cuda-free-inference-for-llms.md +++ /dev/null @@ -1,180 +0,0 @@ ---- -layout: blog_detail -title: "CUDA-Free Inference for LLMs" -author: Adnan Hoque, Less Wright, Raghu Ganti and Mudhakar Srivatsa ---- - -In this blog, we discuss the methods we used to achieve FP16 inference with popular LLM models such as [Meta’s Llama3-8B](https://huggingface.co/meta-llama/Meta-Llama-3-8B) and [IBM’s Granite-8B Code](https://huggingface.co/ibm-granite/granite-8b-code-base), where **100%** of the computation is performed using [OpenAI’s Triton Language](https://github.com/triton-lang/triton). -For single token generation times using our Triton kernel based models, we were able to approach **0.76-0.78x** performance relative to the CUDA kernel dominant workflows for both Llama and Granite on Nvidia H100 GPUs, and **0.62-0.82x** on Nvidia A100 GPUs. - -Why explore using 100% Triton? Triton provides a path for enabling LLMs to run on different types of GPUs \- NVIDIA, AMD, and in the future Intel and other GPU based accelerators. It also provides a higher layer of abstraction in Python for programming GPUs and has allowed us to write performant kernels faster than authoring them using vendor specific APIs. In the rest of this blog, we will share how we achieve CUDA-free compute, micro-benchmark individual kernels for comparison, and discuss how we can further improve future Triton kernels to close the gaps. - -![](/assets/images/granite_llama_throughput.png){:style="width:100%"} - -**Figure 1\. Inference throughput benchmarks with Triton and CUDA variants of Llama3-8B and Granite-8B, on NVIDIA H100 and A100** -*Settings: batch size \= 2, input sequence length \= 512, output sequence length \= 256* - -**2.0 Composition of a Transformer Block** - -We start with a breakdown of the computations that happen in Transformer-based models. The figure below shows the “kernels” of a typical Transformer block. - -![](/assets/images/transformer_block.png){:style="width:100%"} - **Figure 2\.** Transformer Block by core kernels - -The core operations for a Llama3 architecture are summarized in this list: - -1. RMSNorm -2. Matrix multiplication: Fused QKV -3. RoPE -4. Attention -5. Matrix multiplication: Output Projection -6. RMSNorm -7. Matrix multiplication: Fused Gate \+ Up Projection -8. Activation function: SiLU -9. Element Wise Multiplication -10. Matrix multiplication: Down Projection - - -Each of these operations is computed on the GPU through the execution of one (or multiple) kernels. While the specifics of each of these kernels can vary across different transformer models, the core operations remain the same. For example, IBM’s Granite 8B Code model uses bias in the MLP layer, different from Llama3. Such changes do require modifications to the kernels. A typical model is a stack of these transformer blocks wired together with embedding layers. - -**3.0 Model Inference** - -Typical model architecture code is shared with a python model.py file that is launched by PyTorch. 
In the default PyTorch [eager execution](https://pytorch.org/blog/optimizing-production-pytorch-performance-with-graph-transformations/) mode, these kernels are all executed with CUDA. To achieve 100% Triton for end-to-end Llama3-8B and Granite-8B inference we need to write and integrate handwritten Triton kernels as well as leverage torch.compile (to generate Triton ops). First, we replace smaller ops with compiler generated Triton kernels, and second, we replace more expensive and complex computations (e.g. matrix multiplication and flash attention) with handwritten Triton kernels. - -Torch.compile generates Triton kernels automatically for RMSNorm, RoPE, SiLU and Element Wise Multiplication. Using tools like [Nsight Systems](https://developer.nvidia.com/nsight-systems) we can observe these generated kernels; they appear as tiny dark green kernels in-between the matrix multiplications and attention. - -![](/assets/images/nsys_trace_cuda.png){:style="width:100%"} -**Figure 3\.** Trace of Llama3-8B with torch.compile, showing CUDA kernels being used for matrix multiplications and flash attention - -For the above trace, we note that the two major ops that make up **80%** of the E2E latency in a Llama3-8B style model are matrix multiplication and attention kernels and both remain CUDA kernels. Thus to close the remaining gap, we replace both matmul and attention kernels with handwritten Triton kernels. - -**4.0 Triton SplitK GEMM Kernel** - -For the matrix multiplications in the linear layers, we wrote a custom FP16 Triton GEMM (General Matrix-Matrix Multiply) kernel that leverages a [SplitK work decomposition](https://pytorch.org/blog/accelerating-moe-model/\#30-work-decomposition---splitk). We have previously discussed this parallelization in other blogs as a way to accelerate the decoding portion of LLM inference. - -**5.0 GEMM Kernel Tuning** - -To achieve optimal performance we used the exhaustive search approach to tune our SplitK GEMM kernel. Granite-8B and Llama3-8B have linear layers with the following shapes: - -| Linear Layer | Shape (in\_features, out\_features) | -| :---- | :---- | -| Fused QKV Projection | (4096, 6144\) | -| Output Projection | (4096, 4096\) | -| Fused Gate \+ Up Projection | (4096, 28672\) | -| Down Projection | (14336, 4096\) | - -**Figure 4\.** Granite-8B and Llama3-8B Linear Layer Weight Matrix Shapes - -Each of these linear layers have different weight matrix shapes. Thus, for optimal performance the Triton kernel must be tuned for each of these shape profiles. After tuning for each linear layer we were able to achieve **1.20x** E2E speedup on Llama3-8B and Granite-8B over the untuned Triton kernel. - -**6.0 Flash Attention Kernel** - -We evaluated a suite of existing Triton flash attention kernels with different configurations, namely: - -1. [AMD Flash](https://github.com/ROCm/triton/blob/triton-mlir/python/perf-kernels/flash-attention.py) -2. [OpenAI Flash](https://github.com/triton-lang/triton/blob/main/python/tutorials/06-fused-attention.py) -3. [Dao AI Lab Flash](https://github.com/Dao-AILab/flash-attention/blob/3669b25206d5938e3cc74a5f7860e31c38af8204/flash_attn/flash_attn_triton.py#L812) -4. [XFormers Flash](https://github.com/facebookresearch/xformers/blob/fae0ceb195a41f2ab762d89449c6012fbcf2ffda/xformers/ops/fmha/triton_splitk.py#L96) -5. 
[PyTorch FlexAttention](https://github.com/pytorch/pytorch/blob/e7b870c88bc3b854a95399a96a274d2f1f908172/torch/nn/attention/flex_attention.py#L800) - -We evaluated the text generation quality of each of these kernels, first, in eager mode and then (if we were able to torch.compile the kernel with standard methods) compile mode. For kernels 2-5, we noted the following: - -| Kernel | Text Generation Quality | Torch.compile | Support for Arbitrary Sequence Length | -| ----- | ----- | ----- | ----- | -| AMD Flash | Coherent | Yes | Yes | -| OpenAI Flash | Incoherent | Did not evaluate. WIP to debug precision in eager mode first | No | -| Dao AI Lab Flash | Incoherent | Did not evaluate. WIP to debug precision in eager mode first | Yes | -| Xformers FlashDecoding | Hit a compilation error before we were able to evaluate text quality | WIP | No (This kernel is optimized for decoding) | -| PyTorch FlexAttention | Coherent | WIP | WIP | - -**Figure 5\.** Table of combinations we tried with different Flash Attention Kernels - -The above table summarizes what we observed out-of-the box. With some effort we expect that kernels 2-5 can be modified to meet the above criteria. However, this also shows that having a kernel that works for benchmarking is often only the start of having it usable as an end to end production kernel. -We chose to use the AMD flash attention kernel in our subsequent tests as it can be compiled via torch.compile and produces legible output in both eager and compiled mode. - -To satisfy torch.compile compatibility with the AMD flash attention kernel, we had to define it as a torch custom operator. This process is explained in detail [here](https://pytorch.org/tutorials/advanced/python\_custom\_ops.html). The tutorial link discusses how to wrap a simple image crop operation. However, we note that wrapping a more complex flash attention kernel follows a similar process. The two step approach is as follows: - -1. Wrap the function into a PyTorch Custom Operator - -![](/assets/images/torch_op_warpping_2.png){:style="width:100%"} - - -2. Add a FakeTensor Kernel to the operator, which given the shapes of the input tensors of flash (q, k and v) provides a way to compute the output shape of the flash kernel - -![](/assets/images/torch_op_wrapping_1.png){:style="width:100%"} - -After defining the Triton flash kernel as a custom op, we were able to successfully compile it for our E2E runs. - -![](/assets/images/nsys_trace_triton.png){:style="width:100%"} - -**Figure 6\.** Trace of Llama3-8B with torch.compile, after swapping in Triton matmul and Triton flash attention kernels - -From Figure 5, we note that now, after integrating both the SplitK matrix multiplication kernel, the torch op wrapped flash attention kernel, and then running torch.compile, we are able to achieve a forward pass that uses 100% Triton computation kernels. - -**7.0 End-to-End Benchmarks** - -We performed end-to-end measurements on NVIDIA H100s and A100s (single GPU) with Granite-8B and Llama3-8B models. We performed our benchmarks with two different configurations. - -The Triton kernel configuration uses: - -1. Triton SplitK GEMM -2. AMD Triton Flash Attention - - -The CUDA Kernel configuration uses: - -1. cuBLAS GEMM -2. 
cuDNN Flash Attention \- Scaled Dot-Product Attention (SDPA) - -We found the following throughput and inter-token latencies for both eager and torch compiled modes, with typical inference settings: - -| GPU | Model | Kernel Config | Median Latency (Eager) \[ms/tok\] | Median Latency (Compiled) \[ms/tok\] | -| :---- | :---- | :---- | :---: | :---: | -| H100 | Granite-8B | Triton | 27.42 | 11.59 | -| | | CUDA | 18.84 | 9.50 | -| | Llama3-8B | Triton | 20.36 | 10.61 | -| | | CUDA | 16.59 | 8.59 | -| A100 | Granite-8B | Triton | 53.44 | 16.88 | -| | | CUDA | 37.13 | 14.25 | -| | Llama3-8B | Triton | 44.44 | 17.94 | -| | | CUDA | 32.45 | 12.96 | - -**Figure 7\.** Granite-8B and Llama3-8B Single Token Generation Latency on H100 and A100, -(batch size \= 2, input sequence length \= 512, output sequence length \= 256\) - -To summarize, the Triton models can get up to **78%** of the performance of the CUDA models on the H100 and up to **82%** on the A100. - -The performance gap can be explained by the kernel latencies we observe for matmul and flash attention, which are discussed in the next section. - -**8.0 Microbenchmarks** - -| Kernel | Triton \[us\] | CUDA \[us\] | -| ----- | :---: | :---: | -| QKV Projection Matmul | 25 | 21 | -| Flash Attention | 13 | 8 | -| Output Projection Matmul | 21 | 17 | -| Gate \+ Up Projection Matmul | 84 | 83 | -| Down Projection Matmul | 58 | 42 | - -**Figure 8\.** Triton and CUDA Kernel Latency Comparison (Llama3-8B on NVIDIA H100) -Input was an arbitrary prompt (bs=1, prompt \= 44 seq length), decoding latency time - -From the above, we note the following: - -1. Triton matmul kernels are **1.2-1.4x** slower than CUDA - -2. AMDs Triton Flash Attention kernel is **1.6x** slower than CUDA SDPA - -These results highlight the need to further improve the performance of kernels that are core primitives like GEMM and Flash Attention. We leave this as future research, as recent works (e.g. [FlashAttention-3](https://pytorch.org/blog/flashattention-3/), [FlexAttention](https://pytorch.org/blog/flexattention/)) provide ways to leverage the underlying hardware better as well as Triton pathways that we hope to be able to build on to produce greater speedups. To illustrate this, we compared FlexAttention with SDPA and AMD’s Triton Flash kernel. - -We are working to verify E2E performance with FlexAttention. For now, initial microbenchmarks with Flex show promise for longer context lengths and decoding problem shapes, where the query vector is small: - -![](/assets/images/flash_attention_tflops.png){:style="width:100%"} - -**Figure 9\.** FlexAttention Kernel Benchmarks on NVIDIA H100 SXM5 80GB -(batch=1, num\_heads=32, seq\_len=seq\_len, head\_dim=128) - -**9.0 Future Work** - -For future work we plan to explore ways to further optimize our matmuls that leverage the hardware better, such as this blog we published on [utilizing TMA for H100](https://pytorch.org/blog/hopper-tma-unit/), as well as different work decompositions (persistent kernel techniques like StreamK etc.) to get greater speedups for our Triton-based approach. For flash attention, we plan to explore FlexAttention and FlashAttention-3 as the techniques used in these kernels can be leveraged to help further close the gap between Triton and CUDA. -We also note that our prior work has shown promising results for FP8 Triton GEMM kernel performance versus cuBLAS FP8 GEMM, thus in a future post we will explore E2E FP8 LLM inference. 
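
As a companion to the custom-op wrapping described in Section 6.0, here is a minimal sketch of the two-step pattern, assuming PyTorch 2.4+ where `torch.library.custom_op` and `register_fake` are available. The `demo::` op name and the SDPA fallback body are illustrative stand-ins, not the actual AMD Triton flash attention launch used in our runs.

```python
import torch
import torch.nn.functional as F

# Step 1: wrap the kernel entry point as a PyTorch custom operator so that
# torch.compile treats it as an opaque, schema-checked call.
@torch.library.custom_op("demo::triton_flash_attention", mutates_args=())
def triton_flash_attention(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor) -> torch.Tensor:
    # A real implementation would launch the Triton flash attention kernel here;
    # SDPA is used only to keep this sketch runnable.
    return F.scaled_dot_product_attention(q, k, v)

# Step 2: register a FakeTensor kernel so the compiler can derive the output
# shape and dtype from q, k, and v without executing the kernel.
@triton_flash_attention.register_fake
def _(q, k, v):
    return torch.empty_like(q)

def attention_block(q, k, v):
    return torch.ops.demo.triton_flash_attention(q, k, v)

q = k = v = torch.randn(2, 32, 512, 64)  # (batch, heads, seq_len, head_dim)
out = torch.compile(attention_block)(q, k, v)
print(out.shape)  # torch.Size([2, 32, 512, 64])
```

In the E2E runs above, the op body launches the AMD Triton flash attention kernel rather than SDPA; the custom-op wrapper and the fake kernel are what make that kernel composable with torch.compile.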
diff --git a/_posts/2024-09-08-pytorch-shanghai-notes.md b/_posts/2024-09-08-pytorch-shanghai-notes.md deleted file mode 100644 index 106588db9826..000000000000 --- a/_posts/2024-09-08-pytorch-shanghai-notes.md +++ /dev/null @@ -1,45 +0,0 @@ ---- -layout: blog_detail -title: "PyTorch Shanghai Meetup Notes" -hidden: true ---- - -## Summary - -![group photo](/assets/images/pytorch-shanghai-notes/fg1.jpg){:style="width:100%"} - -We were honored to host the PyTorch Shanghai Meetup on August 15, 2024\. The Meetup received great attention from the industry. We invited senior PyTorch developers from Intel and Huawei as guest speakers, who shared their valuable experience and the latest technical trends. In addition, this event also attracted PyTorch enthusiasts from many technology companies and well-known universities. A total of more than 40 participants gathered together to discuss and exchange the latest applications and technological advances of PyTorch. - -This Meetup not only strengthened the connection between PyTorch community members, but also provided a platform for local AI technology enthusiasts to learn, communicate and grow. We look forward to the next gathering to continue to promote the development of PyTorch technology in the local area. - -## 1\. PyTorch Foundation Updates - -![man instructing students](/assets/images/pytorch-shanghai-notes/fg2.jpg){:style="width:100%"} - -PyTorch Board member Fred Li shared the latest updates in the PyTorch community. He reviewed the development history of the PyTorch community, explained in detail the growth path of community developers, encouraged everyone to delve deeper into technology, and introduced the upcoming PyTorch Conference 2024. - -## 2\. Intel’s Journey with PyTorch: Democratizing AI with Ubiquitous Hardware and Open Software - -PyTorch CPU module maintainer Jiong Gong shared six years of technical contributions from Intel to PyTorch and its ecosystem, and explored the remarkable advancements Intel has made in both software and hardware to democratize AI, ensuring accessibility and optimizing performance across a diverse range of Intel hardware platforms. - -![man instructing students](/assets/images/pytorch-shanghai-notes/fg3.jpg){:style="width:100%"} - -## 3\. Exploring Multi-Backend Support in PyTorch Ecosystem: A Case Study of Ascend - -![man instructing students](/assets/images/pytorch-shanghai-notes/fg4.jpg){:style="width:100%"} - -Fengchun Hua, a PyTorch contributor from Huawei, took Huawei Ascend NPU as an example to demonstrate the latest achievements in multi-backend support for PyTorch applications. He introduced the hardware features of Huawei Ascend NPU and the infrastructure of CANN (Compute Architecture for Neural Networks), and explained the key achievements and innovations in native support work. He also shared the current challenges and the next work plan. - -Yuanhao Ji, another PyTorch contributor from Huawei, then introduced the Autoload Device Extension proposal, explained its implementation details and value in improving the scalability of PyTorch, and introduced the latest work progress of the PyTorch Chinese community. - -## 4\. Intel XPU Backend for Inductor - -![man instructing students](/assets/images/pytorch-shanghai-notes/fg5.jpg){:style="width:100%"} - -Eikan is a PyTorch contributor from Intel who focuses on the torch.compile stack for both Intel CPU and GPU. In this session, Eikan presented Intel's efforts on torch.compile for Intel GPUs.
He provided updates on the current status of Intel GPUs within PyTorch, covering both functionality and performance aspects. Additionally, Eikan used Intel GPU as a case study to demonstrate how to integrate a new backend into the Inductor using Triton. - -## 5\. PyTorch PrivateUse1 Evolution Approaches and Insights - -![man instructing students](/assets/images/pytorch-shanghai-notes/fg6.jpg){:style="width:100%"} - -Jiawei Li, a PyTorch collaborator from Huawei, introduced PyTorch's Dispatch mechanism and emphasized the limitations of DispatchKey. He took Huawei Ascend NPU as an example to share best practices for the PyTorch PrivateUse1 mechanism. He mentioned that while using the PrivateUse1 mechanism, Huawei also submitted many improvements and bug fixes for the mechanism to the PyTorch community. He also mentioned that due to the lack of upstream CI support for out-of-tree devices, changes in upstream code may affect their stability and quality, and this insight was recognized by everyone. diff --git a/_posts/2024-09-12-arm-joins-pytorch.md b/_posts/2024-09-12-arm-joins-pytorch.md deleted file mode 100644 index 25cb767cf3e8..000000000000 --- a/_posts/2024-09-12-arm-joins-pytorch.md +++ /dev/null @@ -1,31 +0,0 @@ ---- -layout: blog_detail -title: "Arm Joins the PyTorch Foundation as a Premier Member" -author: The PyTorch Foundation ---- - -The PyTorch Foundation, a neutral home for the deep learning community to collaborate on the open source PyTorch framework and ecosystem, is announcing today that [Arm](https://www.arm.com/) has joined as a premier member. - -Arm designs a high-performance, power-efficient compute platform with unmatched scalability, supporting a vast ecosystem of developers deploying AI at the edge and in the cloud, ranging from the Arm instances offered by all major cloud service providers to smartphones, laptops, software-defined vehicles and more. - -“Our continued investments in software are accelerating development and AI performance for over 20 million software developers, ensuring they can develop for Arm, on Arm,” said Alex Spinelli, VP Developer Technology at Arm. “PyTorch is a pivotal framework in advancing AI research and development. This membership demonstrates our strong commitment to open source - ensuring PyTorch just works on Arm and can leverage seamless acceleration for the most demanding AI models, now and in the future.” - -"We're thrilled to welcome Arm to the PyTorch Foundation. As we look to the future of AI and machine learning, the role of specialized silicon and edge devices becomes increasingly crucial. Arm's expertise in these areas will be invaluable as we work to make PyTorch more efficient and accessible across a wider range of hardware,” said PyTorch Foundation Executive Director Matt White. “This collaboration underscores our commitment to fostering innovation and expanding PyTorch's capabilities to meet the evolving needs of developers and researchers worldwide." - -Last year at the PyTorch Conference, Arm partnered with Apple, Meta and Qualcomm to release [ExecuTorch](https://pytorch.org/blog/pytorch-edge/), an end-to-end solution for enabling on-device inference capabilities across mobile and edge devices including wearables, embedded devices and microcontrollers. - -As a premier member, Arm is granted one seat on the PyTorch Foundation Governing Board. The Board sets policy through our bylaws, mission and vision statements, describing the overarching scope of foundation initiatives, technical vision, and direction.
- -We’re happy to welcome Alex Spinelli, VP Developer Technology at Arm, to our board. Prior to Arm, Alex was VP of Product for Core Machine Learning at Google, where he led Google’s technology and infrastructure for building, training, and serving machine learning, including the TensorFlow stack. - -To learn more about how you can be a part of the PyTorch Foundation, visit our [website](https://pytorch.org/foundation). - - -## About PyTorch Foundation - -The PyTorch Foundation is a neutral home for the deep learning community to collaborate on the open source PyTorch framework and ecosystem. The PyTorch Foundation is supported by its members and leading contributors to the PyTorch open source project. The Foundation leverages resources provided by members and contributors to enable community discussions and collaboration. - - -## About The Linux Foundation - -The Linux Foundation is the world’s leading home for collaboration on open source software, hardware, standards, and data. Linux Foundation projects are critical to the world’s infrastructure including Linux, Kubernetes, Node.js, ONAP, PyTorch, RISC-V, SPDX, OpenChain, and more. The Linux Foundation focuses on leveraging best practices and addressing the needs of contributors, users, and solution providers to create sustainable models for open collaboration. For more information, please visit us at linuxfoundation.org. The Linux Foundation has registered trademarks and uses trademarks. For a list of trademarks of The Linux Foundation, please see its trademark usage page. Linux is a registered trademark of Linus Torvalds. diff --git a/_posts/2024-09-18-pt-multidevice-integration.md b/_posts/2024-09-18-pt-multidevice-integration.md deleted file mode 100644 index 9cdb20e9d967..000000000000 --- a/_posts/2024-09-18-pt-multidevice-integration.md +++ /dev/null @@ -1,143 +0,0 @@ ---- -layout: blog_detail -title: "Challenges and Efforts in PyTorch Multi-Device Integration: Compatibility, Portability, and Integration Efficiencies" -author: "Zesheng Zong (Huawei), Jiawei Li (Huawei) | Co-authors: Jiong Gong (Intel), Bartosz Sochacki (Intel), Eikan Wang (Intel)" ---- - -## Introduction - -As the demand for diverse hardware accelerators grows, the need for a robust and adaptable deep learning framework becomes increasingly critical. While working through this integration, several challenges have surfaced in the PyTorch ecosystem, potentially affecting various hardware vendors. This blog aims to highlight these issues and propose solutions to enhance PyTorch's adaptability, portability, and resilience across different hardware platforms. - - -## Improve Users’ Code Portability via Accelerator Autoloading - -Currently, users face additional work when running their code on different accelerators. One such task is manually importing modules for out-of-tree devices. This requires users to not only understand the different usage patterns between accelerators but also make their code aware of these differences. If you have projects originally running on GPU/CPU and want to migrate to other accelerators, this can lead to significant work and potential frustration. 
- -Examples of extra imports: - - -``` -# Case 1: Use HPU -import torch -import torchvision.models as models -import habana_frameworks.torch # <-- extra import -model = models.resnet50().eval().to("hpu") -input = torch.rand(128, 3, 224, 224).to("hpu") -output = model(input) - -# Case 2: Use torch_npu -import torch -import torch_npu # <-- extra import -print(torch.ones(1, 2, device='npu')) -``` - - -As a high-level machine learning framework, PyTorch's ability to shield users from device differences is a competitive feature. **Accelerator Autoloading** allows users to continue using the familiar PyTorch device programming model without explicitly loading or importing device-specific extensions. - -**How does it work?** - -Accelerator Autoloading utilizes Python's plugin architecture to enable automatic loading of device extensions via entry points in the PyTorch package. - -Python entry points provide a standardized way for Python packages to expose and discover components or plugins within an application. By defining an entry point in the accelerator package's `setup.py`, PyTorch can automatically initialize the accelerator module when `import torch` is called, which gives users a consistent experience across different backend devices. - -From the device side, the accelerator package only needs to declare the following entry point in its `setup.py` (using `torch_npu` as an example): - - -``` -// setup.py -entry_points={ - 'torch.backends': ['torch_npu = torch_npu:_autoload', ], -} -``` - - -When `import torch` is invoked, the accelerator module will be loaded automatically. This provides users with a consistent programming experience across out-of-tree devices, eliminating the need to be aware of differences between CUDA, HPU, and NPU. - - -``` -# Case 1: Use HPU -import torch -import torchvision.models as models -model = models.resnet50().eval().to("hpu") -input = torch.rand(128, 3, 224, 224).to("hpu") -output = model(input) - -# Case 2: Use torch_npu -import torch -print(torch.ones(1, 2, device='npu')) -``` - - - -## Device Integration Optimization - -**What is PrivateUse1?** - -In PyTorch, the dispatcher is a crucial component of the framework's backend that manages how operations are routed to the appropriate device-specific implementation. Dispatch keys are an integral part of this system, serving as identifiers that represent various execution contexts—such as the device (CPU, CUDA, XPU), layout (dense, sparse), and autograd functionality. These keys ensure that operations are directed to the correct implementation. - -**PrivateUse1** is a customizable device dispatch key (similar to CUDA/CPU/XPU) reserved for out-of-tree devices. It provides developers with a way to extend PyTorch's functionality without modifying the core framework, allowing for the integration of new devices, hardware accelerators, or other specialized computing environments. - -**Why do we need PrivateUse1?** - -Internally, dispatch keys are represented as bit masks, where each bit represents whether a certain key is active. This bit mask representation is efficient for quick lookup and combination of keys, but it inherently limits the number of distinct keys (typically to 64 or fewer). - -The current implementation of BackendComponent dispatch keys in PyTorch has encountered a critical bottleneck, which restricts the addition of new backends and, as a result, limits the expansion of the PyTorch ecosystem.
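
To make the PrivateUse1 path concrete, below is a minimal sketch of how an out-of-tree backend typically claims the key from Python, assuming a recent PyTorch where the `torch.utils` PrivateUse1 helpers are available. The backend name `my_device` and the stub module are purely illustrative; a real integration such as `torch_npu` also registers C++ operator kernels under the PrivateUse1 dispatch key, which this sketch omits.

```python
import types
import torch

# Rename the reserved PrivateUse1 key so the backend appears as "my_device"
# in torch.device(), Tensor.to("my_device"), printing, and serialization.
torch.utils.rename_privateuse1_backend("my_device")

# Register a device module backing torch.my_device; a real backend would
# expose its runtime bindings (device count, streams, ...) here.
stub = types.ModuleType("my_device")
stub.is_available = lambda: True   # illustrative stub only
stub.device_count = lambda: 1      # illustrative stub only
torch._register_device_module("my_device", stub)

# Generate convenience helpers such as Tensor.my_device() and
# Tensor.is_my_device, mirroring Tensor.cuda()/Tensor.is_cuda.
torch.utils.generate_methods_for_privateuse1_backend()

print(torch.device("my_device", 0))  # my_device:0
# Actually allocating tensors on "my_device" additionally requires operator
# kernels registered for the PrivateUse1 dispatch key (e.g. TORCH_LIBRARY_IMPL
# in C++), which is exactly the integration work described in this section.
```

The optimizations described next (AMP, autograd, distributed training, checkpointing, and so on) are what make a backend registered this way behave on par with CUDA and CPU.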
- - -![bit diagram](/assets/images/multidevice-integration/fg1.png){:style="width:100%"} - - -In response to this challenge, a series of optimizations has been applied to the PrivateUse1 mechanism to enhance its capacity. - - - -* **PrivateUse1 integration mechanism** - - Initially reserved as fallback options, **PrivateUse1**, along with **PrivateUse2** and **PrivateUse3**, was designed to be activated only when existing key resources became scarce. - - **PrivateUse1** is now being developed to match the robustness and versatility of established keys like CUDA and CPU. Achieving this required a deep integration across critical PyTorch modules. This integration wasn't just a simple switch—it involved significant updates to core components such as **AMP (Automatic Mixed Precision)**, **Autograd**, **Distributed Training**, **Checkpointing**, **DataLoader**, **Optimization**, and **Quantization**, etc. - - -![flow diagram](/assets/images/multidevice-integration/fg2.png){:style="width:100%"} - -The activation of **PrivateUse1** was a massive collaborative effort, culminating in over 100 pull requests aimed at turning it from a placeholder into a fully operational dispatch key. - -* **PrivateUse1 UT/CI Quality Assurance** - - While unit tests are essential for ensuring quality during the development of the **PrivateUse1** mechanism, they are not sufficient on their own to prevent new pull requests from inadvertently affecting existing functionality or compatibility of out-of-tree devices. - - To mitigate this risk, the community has added the `pytorch_openreg` module to the test suite. This module leverages a CPU backend to simulate interactions with accelerators, creating a controlled environment for rigorous testing. Once implemented, this will enable automatic execution of device-generic test cases whenever relevant code is updated, allowing us to quickly detect and address any potential issues affecting the PrivateUse1 integration mechanism. - -* **Comprehensive Documentation** - - By providing comprehensive and easy-to-understand documentation, we aim to lower the barrier to entry for developers and encourage wider adoption of the PrivateUse1 mechanism in the PyTorch ecosystem. This documentation includes: - * Step-by-step guides for integrating new backends using PrivateUse1 - * Clear explanations of PrivateUse1's functionality and benefits - * Code examples and best practices for efficient implementation - -These enhancements aim to improve the robustness and reliability of the PrivateUse1 mechanism, facilitating better integration of new backends and expanding the capabilities of PyTorch. - - -## Compatibility Between Upstream and Downstream - - -### Device-Generic Unit Tests - -Most unit tests in PyTorch focus on CPU and CUDA devices, which limits participation from users with other hardware. To address this, we plan to modify PyTorch’s unit testing framework to better support non-CUDA devices. This plan includes removing existing device restrictions, implementing dynamic data type loading, and generalizing decorators to accommodate a broader range of devices. Additionally, we aim to enforce the use of universal device code and expand distributed testing to support non-NCCL backends. - -Through these improvements, we hope to significantly increase test coverage and pass rates for non-CUDA devices, integrating them into PyTorch's continuous integration process.
Initial changes have already been implemented, paving the way for new hardware support and creating a reference template for other devices. - - -### Ensuring Robust Device Integration through Automated Testing - -To uphold the high standards of quality assurance in PyTorch, an independent build repository and daily continuous integration (CI) workflows have been established, focusing on smoke and integration testing. - -The `pytorch-integration-tests` repository automates the testing of PyTorch's device-specific functionalities, ensuring that they operate correctly and efficiently across a variety of hardware platforms (NPUs and other specialized devices). In this repository, we are building a fully automated system that continuously validates PyTorch's compatibility with different hardware backends. - - - -* **Automated Integration Tests**: Run automated tests across different devices using GitHub Actions. This automation ensures that every change in the codebase is thoroughly tested against multiple hardware platforms, catching potential issues early in the development process. -* **Reusable Workflows**: Workflows in this repository are modular and reusable, which streamlines the testing process. Developers can easily adapt these workflows to new devices or testing scenarios, making the system both flexible and scalable as PyTorch evolves. -* **Awareness of Out-of-Tree Devices**: The repository displays the existence and behavior of all out-of-tree devices, keeping the community informed. This approach minimizes the risk of accidentally breaking downstream functionalities and provides fast feedback on changes. - -Efforts to enhance multi-device integration are pivotal for PyTorch's adaptability in the evolving deep learning landscape. These initiatives not only benefit current users but also lower entry barriers for new hardware vendors and developers, fostering innovation in AI and machine learning. As PyTorch continues to evolve, its commitment to flexibility, robustness, and inclusivity positions it as a leading framework capable of meeting the diverse needs of the deep learning community. \ No newline at end of file diff --git a/_posts/2024-09-26-pytorch-native-architecture-optimization.md b/_posts/2024-09-26-pytorch-native-architecture-optimization.md deleted file mode 100644 index 0bcbb7c6a7d7..000000000000 --- a/_posts/2024-09-26-pytorch-native-architecture-optimization.md +++ /dev/null @@ -1,142 +0,0 @@ ---- -layout: blog_detail -title: "PyTorch Native Architecture Optimization: torchao" -author: Team PyTorch ---- - - -We’re happy to officially launch torchao, a PyTorch native library that makes models faster and smaller by leveraging low bit dtypes, quantization and sparsity. [torchao](https://github.com/pytorch/ao) is an accessible toolkit of techniques written (mostly) in easy to read PyTorch code spanning both inference and training. This blog will help you pick which techniques matter for your workloads. - -We benchmarked our techniques on popular GenAI models like Llama 3 and Diffusion models and saw minimal drops in accuracy. Unless otherwise noted the baselines are bf16 run on an A100 80GB GPU. - -Our topline metrics for Llama 3 are: -* 97% speedup for Llama 3 8B inference using autoquant with int4 weight only quantization and hqq -* 73% peak VRAM reduction for Llama 3.1 8B inference at 128K context length with a quantized KV cache -* 50% speedup for Llama 3 70B pretraining using float8 training on H100 -* 30% peak VRAM reduction for Llama 3 8B using 4 bit quantized optimizers.
- -Our topline metrics for diffusion model inference -* 53% speedup using float8 dynamic quantization inference with float8 row-wise scaling on flux1.dev onH100 -* 50% reduction in model VRAM for CogVideoX using int8 dynamic quantization - -Below we'll walk through some of the techniques available in torchao you can apply to your models for inference and training. - -## Inference - -[Our inference quantization algorithms](https://github.com/pytorch/ao/tree/main/torchao/quantization) work over arbitrary PyTorch models that contain nn.Linear layers. Weight only and dynamic activation quantization for various dtypes and sparse layouts can be chosen using our top level `quantize_` api - -```python -from torchao.quantization import ( - quantize_, - int4_weight_only, -) -quantize_(model, int4_weight_only()) -``` - -Sometimes quantizing a layer can make it slower because of overhead so if you’d rather we just pick how to quantize each layer in a model for you then you can instead run - -```python -model = torchao.autoquant(torch.compile(model, mode='max-autotune')) -``` - -`quantize_` API has a few different options depending on whether your model is compute bound or memory bound. - -```python -from torchao.quantization import ( - # Memory bound models - int4_weight_only, - int8_weight_only, - - # Compute bound models - int8_dynamic_activation_int8_semi_sparse_weight, - int8_dynamic_activation_int8_weight, - - # Device capability 8.9+ - float8_weight_only, - float8_dynamic_activation_float8_weight, -) -``` - -We also have extensive benchmarks on diffusion models in collaboration with the HuggingFace diffusers team in [diffusers-torchao](https://github.com/sayakpaul/diffusers-torchao) where we demonstrated 53.88% speedup on Flux.1-Dev and 27.33% speedup on CogVideoX-5b - -![](/assets/images/Figure_1.png){:style="width:100%"} - - -Our APIs are composable so we’ve for example composed sparsity and quantization to bring 5% [speedup for ViT-H inference](https://github.com/pytorch/ao/tree/main/torchao/sparsity) - -But also can do things like quantize weights to int4 and the kv cache to int8 to support [Llama 3.1 8B at the full 128K context length running in under 18.9GB of VRAM](https://github.com/pytorch/ao/pull/738). -![](/assets/images/Figure_2.png){:style="width:100%"} - -## QAT - -Post training quantization, especially at less than 4 bit can suffer from serious accuracy degradations. Using [Quantization Aware Training](https://pytorch.org/blog/quantization-aware-training/) (QAT) we’ve managed to recover up to 96% of the accuracy degradation on hellaswag. 
We’ve integrated this as an end to end recipe in torchtune with a minimal [tutorial](https://github.com/pytorch/ao/tree/main/torchao/quantization/prototype/qat) - -![](/assets/images/Figure_3.jpg){:style="width:100%"} - -# Training - -## Low precision compute and communications - -torchao provides easy to use e2e workflows for reducing the precision of training compute and distributed communications, starting with float8 for \`torch.nn.Linear\` layers. Here is a one-liner to convert the compute gemms of your training run to float8: - -```python -from torchao.float8 import convert_to_float8_training -convert_to_float8_training(model) -``` - -For an e2e example of how to speed up LLaMa 3 70B pretraining by up to **1.5x** with float8, see our [README](https://github.com/pytorch/ao/tree/main/torchao/float8), and torchtitan's [blog](https://dev-discuss.pytorch.org/t/enabling-float8-all-gather-in-fsdp2/2359) and [float8 recipe](https://github.com/pytorch/torchtitan/blob/main/docs/float8.md). - -### Performance and accuracy of float8 pretraining of LLaMa 3 70B, vs bfloat16 - -![](/assets/images/Figure_4.png){:style="width:100%"} -(source: [https://dev-discuss.pytorch.org/t/enabling-float8-all-gather-in-fsdp2/2359](https://dev-discuss.pytorch.org/t/enabling-float8-all-gather-in-fsdp2/2359)) - -We are expanding our training workflows to more dtypes and layouts: - -1. [NF4 QLoRA in torchtune](https://pytorch.org/torchtune/main/tutorials/qlora_finetune.html) -2. [Prototype int8 training support](https://github.com/pytorch/ao/pull/748) -3. [Accelerated sparse 2:4 training](https://pytorch.org/blog/accelerating-neural-network-training/) - -## Low bit Optimizers - -Inspired by Bits and Bytes we’ve also added prototype support for 8 and 4 bit optimizers as a drop in replacement for AdamW. - -```python -from torchao.prototype.low_bit_optim import AdamW8bit, AdamW4bit -optim = AdamW8bit(model.parameters()) -``` - -![](/assets/images/Figure_5.png){:style="width:100%"} - -# Integrations - -We’ve been actively working on making sure torchao works well in some of the most important projects in open source. - -1. In Hugging Face transformers as an [inference backend](https://huggingface.co/docs/transformers/main/quantization/torchao) -2. In [diffusers-torchao](https://github.com/sayakpaul/diffusers-torchao) as a reference implementation for accelerating diffusion models -3. In HQQ for [fast 4 bit inference](https://github.com/mobiusml/hqq#faster-inference) -4. In [torchtune](https://github.com/pytorch/torchtune) for PyTorch native QLoRA and QAT recipes -5. In [torchchat](https://github.com/pytorch/torchchat) for post training quantization -6. In SGLang for [int4 and int8 post training quantization](https://github.com/sgl-project/sglang/pull/1341) - -## Conclusion - -If you’re interested in making your models faster and smaller for training or inference, we hope you’ll find torchao useful and easy to integrate. - -pip install torchao - -There are a lot of things we’re excited about next ranging from going lower than 4 bit, performant kernels for high-throughput inference, expanding to more layers, scaling types or granularities, MX hardware support and supporting more hardware backends.
If any of the above sounds exciting you can follow our progress at: [https://github.com/pytorch/ao](https://github.com/pytorch/ao) - -If you’re interested in working on torchao, we’ve created a [contributors guide](https://github.com/pytorch/ao/issues/391), and if you have any questions we hang out on the `#torchao` channel on [discord.gg/gpumode](http://discord.gg/gpumode) - -## Acknowledgements - -We are fortunate to stand on the shoulders of giants and collaborate with some of the best people in open source. Thank you! - -1. Bits and Bytes for pioneering work in low bit optimizers and QLoRA -2. Answer.ai for their engineering work to get FSDP and QLoRA composing -3. Mobius Labs for the lovely back and forths on quantization algorithms and low bit kernels -4. HuggingFace transformers for their help in battle testing and integrating our work -5. HuggingFace diffusers for our collaboration on extensive benchmarks and best practices -6. torch.compile so we could write our algorithms in pure PyTorch -7. GPU MODE for most of our early contributors diff --git a/_posts/2024-10-02-pytorch-conference-2024-recap.md b/_posts/2024-10-02-pytorch-conference-2024-recap.md deleted file mode 100644 index b2a0413d04dc..000000000000 --- a/_posts/2024-10-02-pytorch-conference-2024-recap.md +++ /dev/null @@ -1,162 +0,0 @@ ---- -layout: blog_detail -title: "PyTorch Conference 2024 Recap: On Fire 🔥" -author: Team PyTorch ---- - -![women dancing with fire](/assets/images/pytorch-conference-2024-recap/54018197476_9fce5b234d_k.jpg){:style="width:100%"} - - -The 2024 PyTorch Conference in San Francisco gathered nearly 1,500 AI researchers, developers, and enthusiasts. Over two days, the event featured engaging discussions, insightful keynotes, and hands-on sessions focused on artificial intelligence (AI) and advancements in PyTorch, the leading open-source machine learning framework. Attendees delved into the future of generative AI, Large Language Models (LLMs), and the crucial role open-source technology plays in driving AI innovation. Here’s a recap of the key themes, highlights, and major takeaways from this year’s conference. - - - - -## Key Themes of the PyTorch Conference 2024 - -Three core themes emerged throughout the conference: - - - -1. **Generative AI and LLMs**: Many sessions focused on how PyTorch continues to evolve as a primary framework for Large Language Models and Generative AI applications. From scaling these models to optimizing their performance on various hardware platforms, the conference showcased the ongoing advancements and challenges in LLM architecture. -2. **Democratizing AI Through Open Source**: One of the recurring themes was the importance of open source tools and communities in shaping the future of AI. PyTorch is committed to inclusivity, ease of use, and accessibility to developers of all levels, with a focus on bringing AI to an even larger global audience. -3. **Distributed and Edge Computing**: Distributed computing and edge deployment appeared in many discussions, highlighting how PyTorch is being used to drive AI to the edge. The focus on edge accelerators, scalable training, and inference showcased how PyTorch enables the deployment of powerful models across diverse environments, from the cloud to on-device applications. 
- -![panel of people on a conference stage](/assets/images/pytorch-conference-2024-recap/54017358432_8d9b53a2c8_k.jpg){:style="width:100%"} - - - - -## Watch the Sessions from PyTorch Conference - -The PyTorch Conference featured keynote sessions from top AI leaders and interesting lightning talks. You can view all of the conference sessions on our YouTube channel. - - -
        - -
- - - -## PyTorch Conference Startup Showcase - - -![man speaking at a conference](/assets/images/pytorch-conference-2024-recap/54018500933_4df67cbbd4_k.jpg){:style="width:100%"} - -New this year, the Startup Showcase was an exciting addition to the PyTorch Conference. Featuring early-stage founders pitching their AI startups to a panel of top venture capitalists, this event showcased the next generation of AI-driven innovation. The finalists for the inaugural PyTorch Conference Startup Showcase included Remix Inc., Cartesia, OpenBabylon, Remyx AI, A2 Labs, Inc., QuicSnap, Iso AI, CTGT, and Creao.ai, representing some of the most innovative AI/ML startups in the industry. Attendees got a front-row seat to see cutting-edge AI startups in action, while top VCs from the AI industry evaluated the pitches. - -Congratulations to the PyTorch Conference Startup Showcase winner, CTGT! Deep learning can be opaque and biased, which limits its potential in crucial areas like healthcare and finance. CTGT is changing the game by enhancing data lineage in LLMs and cutting hallucinations. They're empowering companies to create customized models using 500x less compute. - -[View the Startup Showcase](https://youtu.be/xAePG2YVz7c?feature=shared) - - - -## Mini-Summits - - - -The **DL Compiler Mini-Summit** offered attendees a deep dive into the advances in deep learning (DL) compilers that are transforming AI workloads. - -[View the DL Compiler Mini-Summit](https://youtube.com/playlist?list=PL_lsbAsL_o2DyFOVyBzDS5scLfUotrG52&feature=shared) - -![People watching an event](/assets/images/pytorch-conference-2024-recap/54036162068_0afdec2ca6_k.jpg){:style="width:100%"} - - -The **Fine-Tuning Mini-Summit** brought together a thriving community of researchers, developers, practitioners and hobbyists that focuses on topics ranging from memory efficiency, parameter-efficient fine-tuning and quantization to performance at scale and reproducible evaluations. - -[View the Fine-Tuning Mini-Summit](https://youtube.com/playlist?list=PL_lsbAsL_o2D6l1brEg0DuDShep5p33nu&feature=shared) - -## Major Takeaways from the PyTorch Conference 2024 - -![Matt giving his keynote](/assets/images/pytorch-conference-2024-recap/54018555324_daae473637_k.jpg){:style="width:100%"} - - - -1. **LLMs are Here to Stay**: LLMs were a focal point of the event, reaffirming their pivotal role in the future of AI. As these models continue to scale, PyTorch remains the preferred framework for developing, training, and deploying them across various platforms and industries. -2. **Open Source Drives Innovation**: A key takeaway from the conference was that open-source tools like PyTorch are vital for democratizing AI. This community-driven approach accelerates innovation, enabling researchers and developers globally to collaborate and contribute to faster advancements and more accessible AI technologies. -3. **Ethics and Sustainability Matter**: The focus on ethical AI development was a significant takeaway. Talks on the inclusivity of computer vision models, the environmental impacts of AI infrastructure, and the need for transparent, unbiased AI models highlighted the growing importance of ethical considerations in the future of AI. -4. **PyTorch Expands Beyond the Cloud**: With several sessions dedicated to edge AI and distributed computing, the conference showcased how PyTorch is expanding beyond cloud-based applications into edge devices and diverse computing environments.
This shift is crucial as AI advances into areas like autonomous vehicles, mobile applications, and IoT devices. - - - - -## Thank You to Our Sponsors - -![A crowd of people at a conference](/assets/images/pytorch-conference-2024-recap/54006027240_be489d89a3_k.jpg){:style="width:100%"} - - - -![Sponsor logos](/assets/images/pytorch-conference-2024-recap/sponsors.png){:style="width:100%"} - - -We would like to thank each of the sponsors that made the PyTorch Conference 2024 possible. These include: - -### Diamond Sponsors: - -* AMD -* Cloud Native Computing Foundation -* IBM -* Intel – PyTorch -* Lightning.ai -* Meta – PyTorch - -### Platinum Sponsors: - -* Arm -* Google -* Lambda Labs -* Nvidia - -### Silver Sponsors: - -* Anyscale – PyTorch -* Baseten -* Chainguard -* Databricks -* Fal -* FuriosaAi -* HPE -* Jane Street -* Microsoft – PyTorch -* MinIO -* Outerbounds -* Together.AI - -### Bronze Sponsors: - -* d-Matrix -* MemVerge -* Perforated AI -* Quansight -* Rotational Labs -* ScaleGenAI - -### Special Event Sponsors: - -* PyTorch Flare Party: Hugging Face -* Startup Showcase: Mayfield -* Diversity Scholarship: AWS -* Women and Non-Binary in PyTorch Lunch: Google -* Happy Hour Reception: Lightning.AI - -Thank you for your continued support in advancing the PyTorch ecosystem and helping to shape the future of AI! - -## Save the Date - -See you next year for the PyTorch Conference in San Francisco at the Palace of Fine Arts from October 22-23, 2025. \ No newline at end of file diff --git a/_posts/2024-10-08-tac-elects-new-leadership.md b/_posts/2024-10-08-tac-elects-new-leadership.md deleted file mode 100644 index d7281a06b8ca..000000000000 --- a/_posts/2024-10-08-tac-elects-new-leadership.md +++ /dev/null @@ -1,42 +0,0 @@ ---- -layout: blog_detail -title: "PyTorch Foundation Technical Advisory Council Elects New Leadership" -author: Team PyTorch ---- - -We are pleased to announce the first-ever Chair and Vice Chair of the PyTorch Foundation’s Technical Advisory Council (TAC): **Luca Antiga** as the Chair and **Jiong Gong** as Vice Chair. Both leaders bring extensive experience and deep commitment to the PyTorch community, and they are set to guide the TAC in its mission to foster an open, diverse, and innovative PyTorch technical community. - - -## Meet the New Leadership - - - -![Luca Antiga](/assets/images/tac-elects-new-leadership/luca-antiga.jpg){:style="max-width:350px"} - -Luca Antiga is the CTO at Lightning AI since 2022. He is an early contributor to PyTorch core and co-authored “Deep Learning with PyTorch” (published by Manning). He started his journey as a researcher in Bioengineering, and later co-founded Orobix, a company focused on building and deploying AI in production settings. - -“I am looking forward to taking on the role of the chair of the PyTorch TAC,” says Luca. “As the TAC chair, I will ensure effective, timely topic selection and enhance visibility of technical needs from the board members and from the ecosystem at large. I will strive for directional, cohesive messaging throughout the transition of PyTorch from Meta to the Linux Foundation.” - - - -![Jiong Gong](/assets/images/tac-elects-new-leadership/jiong-gong.jpg){:style="max-width:350px; margin-top: 40px"} - -Jiong Gong is a Principal Engineer and SW Architect for PyTorch Optimization from Intel. He serves as one of the PyTorch CPU module maintainers and is an active contributor to the TorchInductor CPU backend. 
- - - -“I plan to further strengthen the collaboration between PyTorch developers and hardware vendors, promoting innovation and performance optimization across various hardware platforms, enhancing PyTorch ecosystem and streamlining the decision-making process,” says Jiong. “I am honored to serve as the vice chair of the TAC.” - - - -## What Does the TAC Do? - -The PyTorch Foundation's TAC provides a forum for technical communication, leadership, and collaboration for the PyTorch Foundation. The committee members are members of the PyTorch Foundation. The committee holds open meetings once a month that anyone in the community can attend. The committee provides thought leadership on technical topics, knowledge sharing, and a forum to discuss issues with other technical experts in the community. - -## New TAC Webpage - -Stay connected with the PyTorch Foundation's Technical Advisory Council (TAC) by visiting our new [TAC webpage](/tac). Here you can find the TAC members, where to view upcoming meeting agendas, access presentations, attend public meetings, watch meeting recordings and participate in discussions on key technical topics. - -Plus stay tuned on our blog for regular updates from the PyTorch Foundation TAC leadership. - - \ No newline at end of file diff --git a/_posts/2024-10-15-performance-boost-windows.md b/_posts/2024-10-15-performance-boost-windows.md deleted file mode 100644 index c73859f8b244..000000000000 --- a/_posts/2024-10-15-performance-boost-windows.md +++ /dev/null @@ -1,390 +0,0 @@ ---- -layout: blog_detail -title: "The Path to Achieve PyTorch Performance Boost on Windows CPU" -author: Intel Corporation ---- - -The challenge of PyTorch’s lower CPU performance on Windows compared to Linux has been a significant issue. There are multiple factors leading to this performance disparity. Through our investigation, we’ve identified several reasons for poor CPU performance on Windows, two primary issues have been pinpointed: the inefficiency of the Windows default malloc memory allocator and the absence of [SIMD](https://en.wikipedia.org/wiki/Single_instruction,_multiple_data) for vectorization optimizations on the Windows platform. In this article, we show how PyTorch CPU performance on Windows has improved from the previous releases and where it stands as of PyTorch 2.4.1. - - -## Memory Allocation Optimization in PyTorch 2.1.2 and later - -In versions prior to PyTorch 2.1.2, PyTorch relied on the operating system’s default malloc function for memory allocation. The default malloc memory allocation on the Windows platform was less efficient compared to the malloc implementation mechanism on the Linux platform, leading to increased memory allocation times and reduced performance. To address this, we have substituted the default Windows malloc with mimalloc, a more efficient memory allocator developed by Microsoft. This update, included with the release of PyTorch 2.1.2 and later, has significantly enhanced the CPU performance of PyTorch on Windows, as shown in Figure 1.1. - - - -![performance comparison chart](/assets/images/performance-boost-windows/fg1.png){:style="width:100%"} - - -*PyTorch CPU Performance Improvement on Windows with Memory Allocation Optimization* - -*Figure 1.1: Relative throughput improvement achieved by upgrading from Windows PyTorch version 2.0.1 to 2.1.2 (higher is better).* - -The graph illustrates that with the release of PyTorch 2.1.2, there has been a notable enhancement in CPU performance on the Windows platform. 
The degree of improvement varies across different models, which can be attributed to the diverse mix of operations they perform and their corresponding memory access patterns. While the BERT model shows a modest performance gain, models like ResNet50 and MobileNet-v3 Large benefit from more pronounced improvements. - -On a high-performance CPU, memory allocation becomes a performance bottleneck. This is also why addressing this issue has led to such significant performance improvements. - -As shown in the graphs below, we see that PyTorch CPU performance on Windows can be significantly improved. However, there is still a noticeable gap when compared to its performance on Linux. The absence of vectorization optimizations in the Windows variant of PyTorch CPU is a key factor in the remaining performance gap. - - -![performance comparison chart](/assets/images/performance-boost-windows/fg2.png){:style="width:100%"} - - -*Windows vs Linux Performance on PyTorch 2.0.1* - -*Figure 1.2: Relative performance of Windows vs Linux with PyTorch version 2.0.1 (higher is better).* - - -![performance comparison chart](/assets/images/performance-boost-windows/fg3.png){:style="width:100%; margin-top: 50px;"} - - -*Windows vs Linux Performance on PyTorch 2.1.2* - -*Figure 1.3: Relative performance of Windows vs Linux with PyTorch version 2.1.2 (higher is better).* - - -## Vectorization Optimization in PyTorch 2.4.1 and later - -Prior to PyTorch 2.4.1, the Windows build of PyTorch lacked [SIMD](https://en.wikipedia.org/wiki/Single_instruction,_multiple_data) for vectorization optimizations, a feature that the Linux build leveraged for improved performance. This discrepancy was due to integration issues on Windows with the [SLEEF](https://github.com/shibatch/sleef) library, a SIMD Library for Evaluating Elementary Functions (vectorized libm and DFT) that is essential for efficient trigonometric calculations. Through a collaborative effort with engineers from ARM and Qualcomm, these challenges were resolved, enabling the integration of SIMD into PyTorch for Windows. The PyTorch 2.4.1 update has thus significantly enhanced PyTorch’s CPU performance on Windows, as shown in Figure 2.1. - - -![performance comparison chart](/assets/images/performance-boost-windows/fg4.png){:style="width:100%"} - - -*PyTorch CPU Performance Improvement on Windows with Vectorization Optimization* - -*Figure 2.1: Relative throughput improvement achieved by upgrading from PyTorch CPU version 2.1.2 to 2.4.1 (higher is better).* - -As shown in the graph below, we see that PyTorch CPU performance on Windows has caught up with its performance on Linux. - - -![performance comparison chart](/assets/images/performance-boost-windows/fg5.png){:style="width:100%"} - - -*Windows vs Linux Performance on PyTorch 2.4.1* - -*Figure 2.2: Relative performance of Windows vs Linux with PyTorch version 2.4.1 (higher is better).* - - -## CONCLUSION - -From PyTorch 2.0.1 to PyTorch 2.4.1, the CPU performance gap between Windows and Linux has been continuously narrowing. We compared the ratio of CPU performance on Windows to CPU performance on Linux across different versions, and the results are shown in the following graph.
![performance comparison chart](/assets/images/performance-boost-windows/fg6.png){:style="width:100%"}

*Windows vs Linux Performance on different versions of PyTorch*

*Figure 3: Performance ratio of Windows to Linux with different versions of PyTorch (higher is better).*

The graph shows that with PyTorch 2.4.1, CPU performance on Windows has nearly converged with that on Linux, and on some models it has even surpassed Linux. For example, in the case of the DistilBERT and RoBERTa models, the CPU performance ratio of Windows to Linux has achieved a remarkable 102%. However, certain models, including MobileNet-v3, still show a performance discrepancy. Intel engineers will continue to collaborate with Meta engineers to reduce the performance gap of PyTorch CPU between Windows and Linux.

## HOW TO TAKE ADVANTAGE OF THE OPTIMIZATIONS

Install PyTorch CPU 2.4.1 or later on Windows from the [official repository](https://pytorch.org/get-started/locally/), and you may automatically experience a performance boost with memory allocation and vectorization; a short verification snippet is included after the configuration table below.

## ACKNOWLEDGMENTS

The results presented in this blog post were achieved through the collaborative effort of the Intel PyTorch team and Meta. We would like to express our sincere gratitude to [Xu Han](https://github.com/xuhancn), [Jiong Gong](https://github.com/jgong5), [Haozhe Zhu](https://github.com/zhuhaozhe), [Mingfei Ma](https://github.com/mingfeima), [Chuanqi Wang](https://github.com/chuanqi129), [Guobing Chen](https://github.com/Guobing-Chen) and [Eikan Wang](https://github.com/EikanWang). Their expertise and dedication have been instrumental in achieving the optimizations and performance improvements discussed here. Thanks to [Jiachen Pu](https://github.com/peterjc123) from the community for his participation in the issue discussion and for suggesting the use of [mimalloc](https://github.com/microsoft/mimalloc). We’d also like to express our gratitude to Microsoft for providing such an easily integrated and performant memory allocation library. Thanks to [Pierre Blanchard](https://github.com/blapie) and [Nathan Sircombe](https://github.com/nSircombe) from ARM and [Alex Reinking](https://github.com/alexreinking) from Adobe for their contributions to overcoming the compatibility issues with the [SLEEF](https://github.com/shibatch/sleef) library integrated into PyTorch for Windows. Finally, we want to thank [Jing Xu](https://github.com/jingxu10), [Weizhuo Zhang](https://github.com/WeizhuoZhang-intel) and [Zhaoqiong Zheng](https://github.com/ZhaoqiongZ) for their contributions to this blog.

### Product and Performance Information

The configurations in the table are collected with [svr-info](https://github.com/intel/svr-info). Tested by Intel on August 30, 2024.
| Specification | Configuration1 | Configuration2 |
| :---- | :---- | :---- |
| Name | ThinkBook 14 G5+ IRH | ThinkBook 14 G5+ IRH |
| Time | Fri Aug 30 02:43:02 PM UTC 2024 | Fri Aug 30 02:43:02 PM UTC 2024 |
| System | LENOVO | LENOVO |
| Baseboard | LENOVO | LENOVO |
| Chassis | LENOVO | LENOVO |
| CPU Model | 13th Gen Intel(R) Core(TM) i7-13700H | 13th Gen Intel(R) Core(TM) i7-13700H |
| Microarchitecture | Unknown Intel | Unknown Intel |
| Sockets | 1 | 1 |
| Cores per Socket | 14 | 14 |
| Hyperthreading | Enabled | Enabled |
| CPUs | 20 | 20 |
| Intel Turbo Boost | Enabled | Enabled |
| Base Frequency | 2.4GHz | 2.4GHz |
| All-core Maximum Frequency | 4.7GHz | 4.7GHz |
| Maximum Frequency | 4.8GHz | 4.8GHz |
| NUMA Nodes | 1 | 1 |
| Prefetchers | L2 HW: Enabled, L2 Adj.: Enabled, DCU HW: Enabled, DCU IP: Enabled | L2 HW: Enabled, L2 Adj.: Enabled, DCU HW: Enabled, DCU IP: Enabled |
| PPINs | | |
| Accelerators | DLB, DSA, IAA, QAT | DLB, DSA, IAA, QAT |
| Installed Memory | 32GB (8x4GB LPDDR4 7400 MT/s [5200 MT/s]) | 32GB (8x4GB LPDDR4 7400 MT/s [5200 MT/s]) |
| Hugepagesize | 2048kb | 2048kb |
| Transparent Huge Pages | madvise | madvise |
| Automatic NUMA Balancing | Disabled | Disabled |
| NIC | 1. Raptor Lake PCH CNVi WiFi 2. Intel Corporation | 1. Raptor Lake PCH CNVi WiFi 2. Intel Corporation |
| Disk | Micron MTFDKBA512TFH 500G | Micron MTFDKBA512TFH 500G |
| BIOS | LBCN22WW | LBCN22WW |
| Microcode | 0x411c | 0x411c |
| OS | Windows 11 Desktop | Ubuntu 23.10 |
| Kernel | OS Build 19045.4412 | 6.5.0-27-generic |
| TDP | 200 watts | 200 watts |
| Power & Perf Policy | Normal Powersave (7) | Normal Powersave (7) |
| Frequency Governor | performance | performance |
| Frequency Driver | intel_pstate | intel_pstate |
| Max C-State | 9 | 9 |
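As mentioned in the "How to take advantage of the optimizations" section above, the improvements apply automatically once an up-to-date build is installed. The following minimal sanity check is our own illustration (not part of the original benchmark setup); it assumes `torch.backends.cpu.get_cpu_capability()` is available in your PyTorch build:

```
import time
import torch

print(torch.__version__)                        # expect 2.4.1 or later on Windows
print(torch.backends.cpu.get_cpu_capability())  # e.g. "AVX2" or "AVX512" on recent builds

# Rough eager-mode throughput check on a CPU-bound op (illustrative only).
x = torch.randn(64, 3, 224, 224)
w = torch.randn(64, 3, 7, 7)
with torch.inference_mode():
    start = time.perf_counter()
    for _ in range(20):
        torch.nn.functional.conv2d(x, w, stride=2, padding=3)
    elapsed = time.perf_counter() - start
print(f"{20 / elapsed:.1f} iterations/s")
```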
## Notices and Disclaimers

Performance varies by use, configuration and other factors. Learn more on the [Performance Index site](https://edc.intel.com/content/www/us/en/products/performance/benchmarks/overview/).

Performance results are based on testing as of dates shown in [configurations](#product-and-performance-information) and may not reflect all publicly available updates. See backup for configuration details. No product or component can be absolutely secure. Your costs and results may vary. Intel technologies may require enabled hardware, software or service activation.

Intel Corporation. Intel, the Intel logo, and other Intel marks are trademarks of Intel Corporation or its subsidiaries. Other names and brands may be claimed as the property of others.
diff --git a/_posts/2024-10-17-pytorch2-5.md b/_posts/2024-10-17-pytorch2-5.md deleted file mode 100644 index f077d613cd30..000000000000 --- a/_posts/2024-10-17-pytorch2-5.md +++ /dev/null @@ -1,155 +0,0 @@
---
layout: blog_detail
title: "PyTorch 2.5 Release Blog"
---

We are excited to announce the release of PyTorch® 2.5 ([release note](https://github.com/pytorch/pytorch/releases/tag/v2.5.0))! This release features a new cuDNN backend for SDPA, enabling speedups by default for users of SDPA on H100s or newer GPUs. As well, regional compilation of torch.compile offers a way to reduce the cold start up time for torch.compile by allowing users to compile a repeated nn.Module (e.g. a transformer layer in LLM) without recompilations. Finally, TorchInductor CPP backend offers solid performance speedup with numerous enhancements like FP16 support, CPP wrapper, AOT-Inductor mode, and max-autotune mode.

This release is composed of 4095 commits from 504 contributors since PyTorch 2.4. We want to sincerely thank our dedicated community for your contributions. As always, we encourage you to try these out and report any issues as we improve 2.5. More information about how to get started with the PyTorch 2-series can be found at our [Getting Started](https://pytorch.org/get-started/pytorch-2.0/) page.

As well, please check out our new ecosystem projects releases with [TorchRec](https://github.com/pytorch/torchrec) and [TorchFix](https://github.com/pytorch-labs/torchfix/releases/tag/v0.6.0).
| Beta | Prototype |
| :---- | :---- |
| cuDNN backend for SDPA | FlexAttention |
| torch.compile regional compilation without recompilations | Compiled Autograd |
| TorchDynamo added support for exception handling & MutableMapping types | Flight Recorder |
| TorchInductor CPU backend optimization | Max-autotune Support on CPU with GEMM Template |
| | TorchInductor on Windows |
| | FP16 support on CPU path for both eager mode and TorchInductor CPP backend |
| | Autoload Device Extension |
| | Enhanced Intel GPU support |
*To see a full list of public feature submissions click [here](https://docs.google.com/spreadsheets/d/1TzGkWuUMF1yTe88adz1dt2mzbIsZLd3PBasy588VWgk/edit?usp=sharing).*

## BETA FEATURES

### [Beta] cuDNN backend for SDPA

The cuDNN "Fused Flash Attention" backend was landed for *torch.nn.functional.scaled_dot_product_attention*. On NVIDIA H100 GPUs this can provide up to 75% speed-up over FlashAttentionV2. This speedup is enabled by default for all users of SDPA on H100 or newer GPUs.

### [Beta] *torch.compile* regional compilation without recompilations

Regional compilation without recompilations is available via *torch._dynamo.config.inline_inbuilt_nn_modules*, which defaults to True in 2.5+. This option allows users to compile a repeated *nn.Module* (e.g. a transformer layer in LLM) without recompilations. Compared to compiling the full model, this option can result in smaller compilation latencies, with 1%-5% performance degradation.

See the [tutorial](https://pytorch.org/tutorials/recipes/regional_compilation.html) for more information.

### [Beta] TorchInductor CPU backend optimization

This feature advances Inductor’s CPU backend optimization, including CPP backend code generation and FX fusions with customized CPU kernels. The Inductor CPU backend supports vectorization of common data types and all Inductor IR operations, along with static and symbolic shapes. It is compatible with both Linux and Windows OS and supports the default Python wrapper, the CPP wrapper, and AOT-Inductor mode.

Additionally, it extends the max-autotune mode of the GEMM template (prototyped in 2.5), offering further performance gains. The backend supports various FX fusions, lowering to customized kernels such as oneDNN for Linear/Conv operations and SDPA. The Inductor CPU backend consistently achieves performance speedups across three benchmark suites—TorchBench, Hugging Face, and TIMM—outperforming eager mode in 97.5% of the 193 models tested.

## PROTOTYPE FEATURES

### [Prototype] FlexAttention

We've introduced a flexible API that enables implementing various attention mechanisms such as Sliding Window, Causal Mask, and PrefixLM with just a few lines of idiomatic PyTorch code. This API leverages torch.compile to generate a fused FlashAttention kernel, which eliminates extra memory allocation and achieves performance comparable to handwritten implementations. Additionally, we automatically generate the backwards pass using PyTorch's autograd machinery. Furthermore, our API can take advantage of sparsity in the attention mask, resulting in significant improvements over standard attention implementations.

For more information and examples, please refer to the [official blog post](https://pytorch.org/blog/flexattention/) and [Attention Gym](https://github.com/pytorch-labs/attention-gym).

### [Prototype] Compiled Autograd

Compiled Autograd is an extension to the PT2 stack allowing the capture of the entire backward pass. Unlike the backward graph traced by the AOT dispatcher, Compiled Autograd tracing is deferred until backward execution time, which makes it impervious to forward pass graph breaks and allows it to record backward hooks into the graph.

Please refer to the [tutorial](https://pytorch.org/tutorials/intermediate/compiled_autograd_tutorial.html) for more information.

### [Prototype] Flight Recorder

Flight recorder is a new debugging tool that helps debug stuck jobs.
The tool works by continuously capturing information about collectives as they run. Upon detecting a stuck job, the information can be used to quickly identify misbehaving ranks/machines along with code stack traces.

For more information please refer to the following [tutorial](https://pytorch.org/tutorials/prototype/flight_recorder_tutorial.html).

### [Prototype] Max-autotune Support on CPU with GEMM Template

Max-autotune mode for the Inductor CPU backend in torch.compile profiles multiple implementations of operations at compile time and selects the best-performing one. This is particularly beneficial for GEMM-related operations, using a C++ template-based GEMM implementation as an alternative to the ATen-based approach with oneDNN and MKL libraries. We support FP32, BF16, FP16, and INT8 with epilogue fusions for x86 CPUs. We’ve seen up to 7% geomean speedup on the dynamo benchmark suites and up to 20% boost in next-token latency for LLM inference.

For more information please refer to the [tutorial](https://pytorch.org/tutorials/prototype/max_autotune_on_CPU_tutorial.html).

### [Prototype] TorchInductor CPU on Windows

The Inductor CPU backend in torch.compile now works on Windows. We currently support MSVC (cl), Clang (clang-cl) and the Intel compiler (icx-cl) for Inductor on Windows.

See the [tutorial](https://pytorch.org/tutorials/prototype/inductor_windows_cpu.html) for more details.

### [Prototype] FP16 support on CPU path for both eager mode and TorchInductor CPP backend

Float16 is a commonly used reduced floating point type for performance improvement in neural network inference/training. As of this release, float16 is supported on the CPU path for both eager mode and TorchInductor.

### [Prototype] Autoload Device Extension

PyTorch now supports autoloading for out-of-tree device extensions, streamlining integration by eliminating the need for manual imports. This feature, enabled through the torch.backends entrypoint, simplifies usage by ensuring seamless extension loading, while allowing users to disable it via an environment variable if needed.

See the [tutorial](https://pytorch.org/tutorials/prototype/python_extension_autoload.html) for more information.

### [Prototype] Enhanced Intel GPU support

Enhanced Intel GPU support is now available for both Intel® Data Center GPU Max Series and Intel® Client GPUs (Intel® Core™ Ultra processors with built-in Intel® Arc™ graphics and Intel® Arc™ Graphics for dGPU parts), making it easier to accelerate your machine learning workflows on Intel GPUs in the PyTorch 2.5 release. We also enabled initial support of PyTorch on Windows for Intel® Client GPUs in this release.

* Expanded the PyTorch hardware backend support matrix to include both Intel Data Center and Client GPUs.
* Implemented SYCL* kernels to enhance coverage and execution of Aten operators on Intel GPUs and boost performance in PyTorch eager mode.
* Enhanced the Intel GPU backend of torch.compile to improve inference and training performance for a wide range of deep learning workloads.

These features are available through PyTorch preview and nightly binary PIP wheels. For more information regarding Intel GPU support, please refer to the [documentation](https://pytorch.org/docs/main/notes/get_start_xpu.html).
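As a quick way to try the Intel GPU path described above, here is a minimal sketch of our own (not from the release notes) that moves a small eager-mode model to the `xpu` device and optionally compiles it; it assumes a PyTorch 2.5 build with Intel GPU support and the required drivers installed:

```
import torch
import torch.nn as nn

# Assumes a PyTorch 2.5 build with Intel GPU (xpu) support and drivers installed.
device = "xpu" if torch.xpu.is_available() else "cpu"

model = nn.Sequential(nn.Linear(512, 512), nn.ReLU(), nn.Linear(512, 10)).to(device)
x = torch.randn(32, 512, device=device)

# Eager mode works as on other backends; torch.compile can also target xpu.
compiled_model = torch.compile(model)
with torch.inference_mode():
    out = compiled_model(x)
print(out.shape, out.device)
```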
diff --git a/_posts/2024-10-23-torchrec-fbgemm-1.md b/_posts/2024-10-23-torchrec-fbgemm-1.md deleted file mode 100644 index bae04fda3f74..000000000000 --- a/_posts/2024-10-23-torchrec-fbgemm-1.md +++ /dev/null @@ -1,142 +0,0 @@ ---- -layout: blog_detail -title: "TorchRec and FBGEMM 1.0 Stable Release" -author: Paul Zhang, Zain Huda, Sarunya Pumma, Shintaro Iwasaki, Supadchaya Puangpontip, Benson Ma ---- - -We are happy to announce the stable release, 1.0, for [TorchRec](https://github.com/pytorch/torchrec) and [FBGEMM](https://github.com/pytorch/FBGEMM). TorchRec is the PyTorch native recommendation systems library, powered by FBGEMM’s (Facebook GEneral Matrix Multiplication) efficient, low-level kernels. - - -## TorchRec - -[Initially open sourced in 2022](https://pytorch.org/blog/introducing-torchrec/), [TorchRec](https://github.com/pytorch/torchrec) provides common primitives for creating state-of-the-art personalization models: - -* Simple, optimized APIs for distributed training across hundreds of GPUs -* Advanced sharding techniques for embeddings -* Modules common in authoring recommendation systems -* Frictionless path to distributed inference with APIs for quantization and sharding of TorchRec models - -Since then, TorchRec has matured significantly, with wide internal adoption across many Meta production recommendation models for training and inference, alongside new features such as: [variable batched embeddings, embedding offloading, zero collision hashing, etc.](https://github.com/pytorch/torchrec/releases?page=1) Furthermore, TorchRec has a presence outside of Meta, such as [in recommendation models at Databricks](https://docs.databricks.com/en/machine-learning/train-recommender-models.html) and in the [Twitter algorithm](https://github.com/twitter/the-algorithm-ml). As a result, standard TorchRec features have been marked as **stable**, with PyTorch style BC guarantees, and can be seen on the [revamped TorchRec documentation](https://pytorch.org/torchrec/). - - -## FBGEMM - -[FBGEMM is a library that provides high-performance kernels for CPUs and GPUs](https://pytorch.org/FBGEMM/). Since 2018, FBGEMM has supported the efficient execution of Meta-internal and external AI/ML workloads by expanding its scope from [performance-critical kernels for inference on CPUs](https://arxiv.org/abs/2101.05615) to more complex sparse operators for both training and inference – and recently for Generative AI – on CPUs and GPUs. - -FBGEMM has been empowering TorchRec through its backend high-performance kernel implementations for recommendation workloads, ranging from embedding bag kernels to jagged tensor operations. Together with TorchRec, we released FBGEMM 1.0, which guarantees the functionality and backward-compatibility of several stable APIs serving its core features with [enhanced documentation](https://pytorch.org/FBGEMM/). - - -## Performance - -[DLRM (Deep Learning Recommendation Model)](https://ai.meta.com/blog/dlrm-an-advanced-open-source-deep-learning-recommendation-model/) is the standard neural network architecture for powering recommendations at Meta, with categorical features being processed through embeddings, while continuous (dense) features are processed with a bottom multilayer perceptron. The following diagram depicts the basic architecture of DLRM, with a second order interaction layer between the dense and sparse features and a top MLP for generating the prediction. 
![flow diagram](/assets/images/torchrec-1.png){:style="width:100%"}

TorchRec provides standardized modules with significant optimizations in fusing embedding lookups. EBC is a traditional PyTorch embedding module implementation, containing a collection of `torch.nn.EmbeddingBag` modules. FusedEBC, powered by FBGEMM for high performance operations on embedding tables with a fused optimizer and UVM caching/management for alleviating memory constraints, is the optimized version present in sharded TorchRec modules for distributed training and inference. The benchmark below demonstrates the vast performance improvements of FusedEBC in comparison to a traditional PyTorch embedding module implementation (EBC) and the ability of FusedEBC to handle much larger embeddings than what fits in GPU memory with UVM caching.

![performance chart](/assets/images/torchrec-2.png){:style="width:100%"}

## TorchRec Data Types

TorchRec provides standard [data types](https://pytorch.org/torchrec/datatypes-api-reference.html) and [modules](https://pytorch.org/torchrec/modules-api-reference.html) for easy handling of distributed embeddings. Here is a simple example setting up a collection of embedding tables through TorchRec:

```
import torch
import torchrec
from torchrec import KeyedJaggedTensor
from torchrec import JaggedTensor

ebc = torchrec.EmbeddingBagCollection(
    device="cpu",
    tables=[
        torchrec.EmbeddingBagConfig(
            name="product_table",
            embedding_dim=64,
            num_embeddings=4096,
            feature_names=["product"],
            pooling=torchrec.PoolingType.SUM,
        ),
        torchrec.EmbeddingBagConfig(
            name="user_table",
            embedding_dim=64,
            num_embeddings=4096,
            feature_names=["user"],
            pooling=torchrec.PoolingType.SUM,
        )
    ]
)

# Jagged (variable-length) sparse features for each key
product_jt = JaggedTensor(
    values=torch.tensor([1, 2, 1, 5]), lengths=torch.tensor([3, 1])
)
user_jt = JaggedTensor(values=torch.tensor([2, 3, 4, 1]), lengths=torch.tensor([2, 2]))

kjt = KeyedJaggedTensor.from_jt_dict({"product": product_jt, "user": user_jt})

print("Call EmbeddingBagCollection Forward: ", ebc(kjt))
```

## Sharding

TorchRec provides a planner class that automatically generates an optimized sharding plan across many GPUs. Here we demonstrate generating a sharding plan across two GPUs:

```
from torchrec.distributed.planner import EmbeddingShardingPlanner, Topology

planner = EmbeddingShardingPlanner(
    topology=Topology(
        world_size=2,
        compute_device="cuda",
    )
)

# `sharder` (an embedding module sharder) and `pg` (a torch.distributed
# process group) are assumed to be set up elsewhere in the training script.
plan = planner.collective_plan(ebc, [sharder], pg)

print(f"Sharding Plan generated: {plan}")
```

## Model Parallel

TorchRec’s main distributed training API is [DistributedModelParallel](https://pytorch.org/docs/stable/generated/torch.nn.Module.html#torch.nn.Module), which calls the planner to generate a sharding plan (demonstrated above) and shards TorchRec modules according to that plan. We demonstrate applying [DistributedModelParallel](https://pytorch.org/docs/stable/generated/torch.nn.Module.html#torch.nn.Module) to our EmbeddingBagCollection to shard embeddings for distributed training:

```
model = torchrec.distributed.DistributedModelParallel(ebc, device=torch.device("cuda"))
```

## Inference

TorchRec provides simple APIs for quantizing and sharding embeddings for a model for distributed inference.
The usage is demonstrated below:

```
from torchrec.inference.modules import (
    quantize_inference_model,
    shard_quant_model,
)

# `ebc` is the EmbeddingBagCollection defined above; `device` is the target
# inference device, e.g. torch.device("cuda").
quant_model = quantize_inference_model(ebc)
sharded_model, _ = shard_quant_model(
    quant_model, compute_device=device, sharding_device=device
)
```

## Conclusion

TorchRec and FBGEMM are now stable, with optimized features for large scale recommendation systems.

For setting up TorchRec and FBGEMM, check out the [getting started guide](https://pytorch.org/torchrec/setup-torchrec.html).

We also recommend the comprehensive, end-to-end [tutorial for introducing the features in TorchRec and FBGEMM](https://pytorch.org/tutorials/intermediate/torchrec_intro_tutorial.html#).
\ No newline at end of file
diff --git a/_posts/2024-10-24-executorch-beta.md b/_posts/2024-10-24-executorch-beta.md deleted file mode 100644 index 463aa9bf392a..000000000000 --- a/_posts/2024-10-24-executorch-beta.md +++ /dev/null @@ -1,52 +0,0 @@
---
layout: blog_detail
title: "ExecuTorch Beta: On-Device AI and LLMs, Stability, and Acceleration with Partners"
author: Team PyTorch
---

* ExecuTorch has achieved Beta status with the release of v0.4, providing stable APIs and runtime, as well as extensive kernel coverage.
* ExecuTorch is the recommended on-device inference engine for Llama 3.2 1B/3B models, offering enhanced performance and memory efficiency for both original and quantized models.
* There has been a significant increase in adoption and ecosystem growth for ExecuTorch, and the focus is now on improving reliability, performance, and coverage for non-CPU backends as the next steps.

**Current On-Device AI Market**

The on-device AI market has been rapidly expanding and is revolutionizing the way we interact with technology. It is unlocking new experiences, enabling personalization, and reducing latency. Traditionally, computer vision and speech recognition have been the primary use-cases for on-device AI, particularly in IoT, industrial applications, and mobile devices. However, the emergence of Large Language Models (LLMs) has made Generative AI the fastest growing sector in AI, subsequently highlighting the importance of on-device Generative AI. IDC [forecasts](https://www.idc.com/getdoc.jsp?containerId=prUS52478124) that by 2028, close to 1 billion GenAI-capable smartphones will be shipped worldwide.

LLMs are not only getting smaller but more powerful. This has led to the creation of a new class of applications that leverage multiple models for intelligent agents and streamlined workflows. The community is rapidly adopting and contributing to these new models, with quantized versions being created within hours of model release. Several leading technology companies are investing heavily in small LLMs, even deploying Low-Rank Adaptation (LoRA) at scale on-device to transform user experiences.

However, this rapid progress comes at a cost. The fragmentation of our on-device AI landscape creates complexity and inefficiency when going from model authoring to edge deployment. This is where PyTorch’s [ExecuTorch](https://github.com/pytorch/executorch) comes in – our Beta announcement marks an important milestone in addressing these challenges and empowering developers to create innovative, AI-powered applications.
- -**What’s New Today** - -It’s been exactly one year since we [first open sourced ExecuTorch](https://pytorch.org/blog/pytorch-edge/), six months since [Alpha release](https://pytorch.org/blog/executorch-alpha/), and today, we’re excited to announce three main developments: - -**1\. Beta**. ExecuTorch has reached Beta status starting from v0.4\! It is now widely adopted and used in production environments across Meta. Through this adoption process we’ve identified and addressed feature gaps, improved stability, and expanded kernel and accelerator coverage. These improvements make us confident to promote ExecuTorch from [Alpha](https://github.com/pytorch/executorch/releases/tag/v0.2.0) to [Beta](https://github.com/pytorch/executorch/releases/tag/v0.4.0) status, and we are happy to welcome the community to adopt it in their own production settings. Here are three concrete enhancements: - - -1. Developers can write application code and include the latest ExecuTorch as a dependency, updating when needed with a clean API contract. This is possible due to our API stabilization efforts, as well as our [explicit API lifecycle](https://pytorch.org/executorch/main/api-life-cycle.html) and backwards [compatibility policy](https://github.com/pytorch/executorch/blob/main/runtime/COMPATIBILITY.md). -2. Running ExecuTorch on CPUs reached the necessary performance, portability and coverage. In particular, we have implemented more than 85% of all [core ATen operators](https://pytorch.org/executorch/main/ir-ops-set-definition.html) as part of our [portable CPU kernels library](https://pytorch.org/executorch/stable/kernel-library-overview.html) to ensure running a model on ExecuTorch just works in most cases and making missing ops an exception rather than the norm. Moreover, we integrated and extensively tested our [XNNPACK](https://pytorch.org/executorch/main/native-delegates-executorch-xnnpack-delegate.html) delegate for high performance on a wide range of CPU architectures. It is used in a number of production cases today. -3. In addition to the low-level ExecuTorch components for greater portability, we built extensions and higher-level abstractions to support more common use-cases such as [developer tooling](https://pytorch.org/executorch/main/devtools-overview.html) to support on-device debugging and profiling, and [Module.h](https://pytorch.org/executorch/main/extension-module.html) extension to simplify deployment for mobile devices. - -**2\. On-Device Large-Language Models (LLMs).** There has been a growing interest in the community to deploy Large Language Models (LLMs) on edge devices, as it offers improved privacy and offline capabilities. However, these models are quite large, pushing the limits of what is possible. Fortunately, ExecuTorch can support these models, and we’ve enhanced the overall framework with numerous optimizations. - -- ExecuTorch is the [recommended framework](https://www.llama.com/docs/model-cards-and-prompt-formats/llama3_2/#-inference-with-lightweight-models-) to run latest Llama models on-device with [excellent performance](https://www.llama.com/docs/model-cards-and-prompt-formats/llama3_2/#-inference-with-lightweight-models-) today. The Llama 3.2 1B/3B models are well-suited for mobile deployment, and it is especially true with the official [quantized 1B/3B model releases](https://ai.meta.com/blog/meta-llama-quantized-lightweight-models/) from Meta, as it provides a great balance between performance, accuracy, and size. 
When deploying Llama 3.2 1B/3B quantized models, decode latency improved by 2.5x and prefill latency improved by 4.2x on average, while model size decreased by 56% and memory usage reduced by 41% on average when benchmarked on Android OnePlus 12 device (we’ve also verified similar relative performance on Samsung S24+ for 1B and 3B, and Samsung S22 for 1B). For Llama 3.2 1B quantized model, for example, ExecuTorch is able to achieve 50.2 tokens/s for decoding and 260 tokens/s for prefill on the OnePlus 12, using the latest CPU kernels from XNNPACK and [Kleidi libraries](https://community.arm.com/arm-community-blogs/b/ai-and-ml-blog/posts/llm-inference-llama-quantized-models-executorch-kleidiai). These quantized models allow developers to integrate LLMs into memory and power-constrained devices while still maintaining quality and safety. -- One of the value propositions of ExecuTorch is being able to use accelerators on mobile devices seamlessly. In fact, ExecuTorch also showcased accelerators to achieve even greater performance running Llama across [Apple MPS backend](https://github.com/pytorch/executorch/blob/main/examples/demo-apps/apple_ios/LLaMA/docs/delegates/mps_README.md), [Qualcomm AI Accelerator](https://github.com/pytorch/executorch/blob/main/examples/demo-apps/android/LlamaDemo/docs/delegates/qualcomm_README.md), and [MediaTek AI Accelerator](https://github.com/pytorch/executorch/blob/main/examples/demo-apps/android/LlamaDemo/docs/delegates/mediatek_README.md). -- There has been growing community and industry interest in multimodal and beyond text-only LLMs, evidenced by Meta’s [Llama 3.2 11B/90B vision models](https://ai.meta.com/blog/llama-3-2-connect-2024-vision-edge-mobile-devices/) and open-source models like Llava. We have so far [enabled Llava 1.5 7B model on phones via ExecuTorch](https://github.com/pytorch/executorch/tree/main/examples/models/llava), making many optimizations, notably reducing runtime memory from 11GB all the way down to 5GB. - - -**3\. Ecosystem and Community Adoption** -Now that ExecuTorch is in Beta, it is mature enough to be used in production. It is being increasingly used at Meta across various product surfaces. For instance, ExecuTorch already powers various ML inference use cases across Meta’s Ray-Ban Meta Smart Glasses and Quest 3 VR headsets as well as Instagram and WhatsApp. - -We also [partnered with Hugging Face](https://huggingface.co/docs/transformers/main/en/main_classes/executorch) to provide native ExecuTorch support for models being exported using torch.export. This collaboration ensures exported artifacts can directly be lowered and run efficiently on various mobile and edge devices. Models like gemma-2b and phi3-mini are already supported and more foundational models support is [in progress](https://github.com/huggingface/transformers/issues/32253). - -With stable APIs and Gen AI support, we’re excited to build and grow ExecuTorch with the community. The on-device AI community is growing rapidly and finding ways to adopt ExecuTorch across various fields. For instance, ExecuTorch is being used in a mobile app built by [Digica](https://digica.com/) to streamline inventory management in hospitals. As another example, Software Mansion developed an app, [EraserAI](https://blog.swmansion.com/eraserai-how-to-create-efficient-app-for-edge-device-04f09aa8072f), to remove unwanted objects from a photo with EfficientSAM running on-device with ExecuTorch via Core ML delegate. 
**Towards General Availability (GA):**
Since the original release of ExecuTorch alpha, we’ve seen a growing interest within the community in using ExecuTorch in various production environments. To that end, we have made great progress towards more stabilized and matured APIs and have made a significant investment in community support, adoption and contribution to ExecuTorch. As we get closer to GA, we are investing our efforts in the following areas:

- **Non-CPU backends:** Bringing non-CPU backends to even greater robustness, coverage and performance is our next goal. From day one of our original launch, we have partnered with Apple (for Core ML and MPS), Arm (for EthosU NPU) and Qualcomm (for Hexagon NPU) on accelerator integration with ExecuTorch, and we’ve since then expanded our partnership to MediaTek (NPU) and Cadence (XTensa DSP). We’re also building [Vulkan GPU](https://pytorch.org/executorch/stable/native-delegates-executorch-vulkan-delegate.html) integration in-house. In terms of feature coverage, we’ve successfully implemented the core functionalities with our partners, ensured seamless integration with our developer tooling, and showcased successful LLM integration with many of the accelerators. Our next big step is to thoroughly validate the performance and reliability of the system in real-world, production use-cases. This stage will help us fine-tune the experience and ensure the stability needed for smooth operations.

- **Benchmarking infra**: As part of our ongoing testing efforts, we’ve developed a benchmarking infrastructure along with a [public dashboard](https://hud.pytorch.org/benchmark/llms?repoName=pytorch%2Fexecutorch) to showcase our progress toward on-device model inference benchmarking. This allows us to transparently track and display model coverage across various backends, giving our community real-time insights into how we’re advancing towards our goals.

We’re excited to share these developments with you and look forward to continued improvements in collaboration with our partners and the community! We welcome community contributions to help us make ExecuTorch the clear choice for deploying AI and LLM models on-device. We invite you to start using ExecuTorch in your on-device projects, or even better, consider [contributing](https://github.com/pytorch/executorch/blob/main/CONTRIBUTING.md) to it. You can also report any issues on our [GitHub](https://github.com/pytorch/executorch/issues) page.
diff --git a/_posts/2024-10-25-intel-gpu-support-pytorch-2-5.md b/_posts/2024-10-25-intel-gpu-support-pytorch-2-5.md deleted file mode 100644 index 6884f3e4f48f..000000000000 --- a/_posts/2024-10-25-intel-gpu-support-pytorch-2-5.md +++ /dev/null @@ -1,145 +0,0 @@
---
layout: blog_detail
title: "Intel GPU Support Now Available in PyTorch 2.5"
author: PyTorch Team at Intel
---

Support for Intel GPUs is now available in PyTorch® 2.5, providing improved functionality and performance for Intel GPUs, which include [Intel® Arc™ discrete graphics](https://www.intel.com/content/www/us/en/products/details/discrete-gpus/arc.html), [Intel® Core™ Ultra processors](https://www.intel.com/content/www/us/en/products/details/processors/core-ultra.html) with built-in Intel® Arc™ graphics and [Intel® Data Center GPU Max Series](https://www.intel.com/content/www/us/en/products/details/discrete-gpus/data-center-gpu/max-series.html).
This integration brings Intel GPUs and the SYCL\* software stack into the official PyTorch stack, ensuring a consistent user experience and enabling more extensive AI application scenarios, particularly in the AI PC domain.

Developers and customers building for and using Intel GPUs will have a better user experience by directly obtaining continuous software support from native PyTorch, unified software distribution, and consistent product release timing.

Furthermore, Intel GPU support provides more choices to users. Now PyTorch provides a consistent GPU programming paradigm on both front ends and back ends. Developers can now run and deploy workloads on Intel GPUs with minimal coding effort.

## **Overview of Intel GPU support**

Intel GPU support in PyTorch provides eager mode and graph mode support in the PyTorch built-in front end. Eager mode now has an implementation of commonly used Aten operators with the SYCL programming language. Graph mode (torch.compile) now has an enabled Intel GPU back end to implement the optimization for Intel GPUs and to integrate Triton.

Essential components of Intel GPU support were added to PyTorch, including runtime, Aten operators, oneDNN, TorchInductor, Triton and Intel GPU tool chain integration. Meanwhile, quantization and distributed support are being actively developed in preparation for the PyTorch 2.6 release.

## **Features**

In addition to providing key features for Intel® Client GPUs and Intel® Data Center GPU Max Series for inference and training, PyTorch keeps the same user experience as with other hardware that PyTorch supports. If you migrate code from CUDA\*, you can run the existing application code on an Intel GPU with minimal code changes for the device name (from cuda to xpu). For example:

```
# CUDA Code
tensor = torch.tensor([1.0, 2.0]).to("cuda")

# Code for Intel GPU
tensor = torch.tensor([1.0, 2.0]).to("xpu")
```

PyTorch 2.5 features with an Intel GPU include:

* Inference and training workflows.
* Enhancements to both torch.compile and eager mode functionalities (more Ops), together with performance improvements, and full runs of the three Dynamo Hugging Face\*, TIMM\* and TorchBench\* benchmarks for eager and compile modes.
* Data types such as FP32, BF16, FP16, and automatic mixed precision (AMP).
* Runs on Intel® Client GPUs and Intel® Data Center GPU Max Series.
* Supports Linux (Ubuntu, SUSE Linux and Red Hat Linux) and Windows 10/11.

## **Get Started**

Get a tour of the environment setup, PIP wheels installation, and examples on Intel® Client GPUs and Intel® Data Center GPU Max Series from the [Getting Started Guide](https://pytorch.org/docs/main/notes/get_start_xpu.html). Support for Intel GPUs can be experienced through PyTorch PIP wheels installation by nightly and preview binary releases.

* Try Intel® Client GPUs through the Intel® Arc™ Graphics family (Codename DG2), Intel® Core™ Ultra processor family with Intel® Graphics (Codename Meteor Lake), and Intel® Core™ Ultra mobile processor family with Intel® Graphics (Codename Lunar Lake).

* Try Intel Data Center GPU Max Series through [Intel® Tiber™ AI Cloud](https://cloud.intel.com/).

  1. To learn how to create a free Standard account, see [Get Started](https://console.cloud.intel.com/docs/guides/get_started.html). Then do the following:

     * Sign in to the [cloud console](https://console.cloud.intel.com/docs/guides/get_started.html).
     * From the [Training](https://console.cloud.intel.com/training) section, open the [PyTorch on Intel® GPUs](https://console.cloud.intel.com/training/detail/7db2a900-e47d-4b70-8968-cefa08432c1d) notebook and click “Launch Jupyter Notebook.”

     * Ensure that the **PyTorch 2.5** kernel is selected for the notebook.

## **Performance**

The performance of Intel GPU on PyTorch was continuously optimized to achieve decent results on the three Dynamo Hugging Face, TIMM and TorchBench benchmarks for eager and compile modes.

The latest performance data, measured on top of the PyTorch Dynamo Benchmarking Suite using a single Intel® Data Center GPU Max Series 1100 card, showcase the significant FP16/BF16 speedup ratio over FP32 in eager mode in Figure 2, and the torch.compile mode speedup ratio over eager mode in Figure 3. Both inference and training achieved similar significant improvements.

![Figure 2: FP16/BF16 Performance Gains Over FP32 Eager](/assets/images/performance-gains-over-fp32-eager-2.png){:style="width:100%"}

Figure 2: FP16/BF16 Performance Gains Over FP32 Eager

![Figure 3: Torch.compile Performance Gains Over Eager Mode](/assets/images/performance-gains-over-fp32-eager.png){:style="width:100%"}

Figure 3: Torch.compile Performance Gains Over Eager Mode

## **Summary**

Intel GPU on PyTorch 2.5 brings Intel® Client GPUs (Intel® Core™ Ultra processors with built-in Intel® Arc™ graphics and Intel® Arc™ Graphics for dGPU parts) and Intel® Data Center GPU Max Series into the PyTorch ecosystem for AI workload acceleration. Notably, Client GPUs are added to the GPU-supported list for AI PC use scenarios on Windows and Linux environments.

We warmly welcome the community to evaluate and provide feedback on these enhancements to [Intel GPU support on PyTorch](https://github.com/pytorch/pytorch?tab=readme-ov-file#intel-gpu-support).

## **Resources**

* [PyTorch Docs: Getting Started on Intel GPU](https://pytorch.org/docs/main/notes/get_start_xpu.html)
* [Intel® Tiber™ AI Cloud](https://cloud.intel.com/)

## **Acknowledgments**

We want to thank the PyTorch open source community for their technical discussions and insights: [Andrey Talman](https://github.com/atalman), [Alban Desmaison](https://github.com/alband), [Nikita Shulga](https://github.com/malfet), [Eli Uriegas](https://github.com/seemethere), [Jason Ansel](https://github.com/jansel), and [Bin Bao](https://github.com/desertfire).

We also thank collaborators from PyTorch for their professional support and guidance.

## **Performance Configuration**

The configurations in the table are collected with [svr-info](https://github.com/intel/svr-info). Tested by Intel on September 12, 2024.
- -## Table 1 - -| Component | Details | -| :---- | :---- | -| **Name** | Intel® Max Series GPU 1100 in Intel® Tiber™ Developer Cloud | -| **Time** | Thu Sep 12 08:21:27 UTC 2024 | -| **System** | Supermicro SYS-521GE-TNRT | -| **Baseboard** | Supermicro X13DEG-OA | -| **Chassis** | Supermicro Other | -| **CPU Model** | Intel(R) Xeon(R) Platinum 8468V | -| **Microarchitecture** | SPR\_XCC | -| **Sockets** | 2 | -| **Cores per Socket** | 48 | -| **Hyperthreading** | Enabled | -| **CPUs** | 192 | -| **Intel Turbo Boost** | Enabled | -| **Base Frequency** | 2.4GHz | -| **All-core Maximum Frequency** | 2.4GHz | -| **Maximum Frequency** | 2.9GHz | -| **NUMA Nodes** | 2 | -| **Prefetchers** | L2 HW: Enabled, L2 Adj.: Enabled, DCU HW: Enabled, DCU IP: Enabled, AMP: Disabled, Homeless: Disabled, LLC: Disabled | -| **PPINs** | 5e3f862ef7ba9d50, 6c85812edfcc84b1 | -| **Accelerators** | DLB 2, DSA 2, IAA 2, QAT (on CPU) 2, QAT (on chipset) 0 | -| **Installed Memory** | 1024GB (16x64GB DDR5 4800 MT/s \[4800 MT/s\]) | -| **Hugepagesize** | 2048 kB | -| **Transparent Huge Pages** | madvise | -| **Automatic NUMA Balancing** | Enabled | -| **NIC** | 2 x Ethernet Controller X710 for 10GBASE-T, 4 x MT2892 Family \[ConnectX-6 Dx\] | -| **Disk** | 1 x 894.3G Micron\_7450\_MTFDKBG960TFR | -| **BIOS** | 1.4a | -| **Microcode** | 0x2b0004b1 | -| **OS** | Ubuntu 22.04.2 LTS | -| **Kernel** | 5.15.0-73-generic | -| **TDP** | 330W | -| **Power & Perf Policy** | Normal (6) | -| **Frequency Governor** | performance | -| **Frequency Driver** | acpi-cpufreq | -| **Max C-State** | 9 | - -## Table 2 - -| Component | Details | -| :---- | :---- | -| **Single Card** | Intel® Max Series GPU 1100 series on 4th Gen Intel® Xeon® processors of Intel Tiber Developer Cloud | -| **Workload & version** | Timm ac34701, TorchBench 03cde49, Torchvision d23a6e1, Torchaudio b3f6f51, Transformers 243e186 | -| **Software Stack** | intel-for-pytorch-gpu-dev 0.5.3, intel-pti-dev 0.9.0, Intel xpu backend for Triton cc981fe | -| **Framework** | Pytorch 4a3dabd67f8ce63f2fc45f278421cca3cc532cfe | -| **GPU driver** | agama-ci-devel-803.61 | -| **GFX FW Version** | PVC2\_1.23374 | - -**Notices & Disclaimers** - -Performance varies by use, configuration and other factors. Learn more on the Performance Index site. Performance results are based on testing as of dates shown in configurations and may not reflect all publicly available updates.  See backup for configuration details.  No product or component can be absolutely secure. Your costs and results may vary. Intel technologies may require enabled hardware, software or service activation. - -Intel Corporation. Intel, the Intel logo, and other Intel marks are trademarks of Intel Corporation or its subsidiaries. Other names and brands may be claimed as the property of others. - -**AI disclaimer:** -AI features may require software purchase, subscription or enablement by a software or platform provider, or may have specific configuration or compatibility requirements. Details at  [www.intel.com/AIPC](http://www.intel.com/AIPC). Results may vary. 
\ No newline at end of file diff --git a/_posts/2024-10-28-pt-executorch-ethos-u85.md b/_posts/2024-10-28-pt-executorch-ethos-u85.md deleted file mode 100644 index 7620e146f41d..000000000000 --- a/_posts/2024-10-28-pt-executorch-ethos-u85.md +++ /dev/null @@ -1,100 +0,0 @@ ---- -layout: blog_detail -title: "Getting started with PyTorch, ExecuTorch, and Ethos-U85 in three easy steps" -author: Robert Elliott, Fredrik Knutsson, and Mark Quartermain ---- - - -## ExecuTorch support for Ethos-U85 - -In the rapidly evolving landscape of machine learning, PyTorch has emerged as a leading framework for model development, given its flexibility and comprehensive ecosystem. Arm has worked with Meta to [introduce support for Arm platforms in ExecuTorch](https://community.arm.com/arm-community-blogs/b/ai-and-ml-blog/posts/executorch-and-tosa-enabling-pytorch-on-arm-platforms), that further simplifies this process, making it seamless to deploy PyTorch models on edge devices. - -The Arm Ethos-U85 NPU is the highest performing Ethos NPU addressing the growing demand for running advanced AI inference workloads at the edge, including transformer-based networks like LLMs. Arm offers reference designs, including the Corstone-320 IoT reference design platform, around the Ethos-U to accelerate and simplify the chip development cycle. The reference design platform includes, among many items, a Fixed Virtual Platform (FVP) that simulates an entire system, enabling cutting edge embedded software development and neural network deployment for the Ethos-U85. - -Today, Arm is extending the support for developers building IoT edge applications, by supporting ExecuTorch beta on Ethos-U85. Leveraging ExecuTorch, developers can now efficiently land their natively developed PyTorch models to enable intelligent and responsive IoT solutions built on Arm. - -With this package now available, thousands of developers looking to create Edge AI applications, can start their model and application development months before the platforms arrive on the market. - - -## Getting started with ExecuTorch on Ethos-U85 - -A full development environment has been provided in the public ExecuTorch GitHub repository. This provides an integrated and tested development flow with all necessary components. - -The three simple steps are: - - - -1. [Set up ExecuTorch](https://pytorch.org/executorch/main/getting-started-setup.html) -2. [Set up the Arm Build environment](https://pytorch.org/executorch/main/executorch-arm-delegate-tutorial.html) -3. [Compile and Run models on the arm_executor_runner](https://pytorch.org/executorch/main/executorch-arm-delegate-tutorial.html#delegated-quantized-workflow) - -You can then build on this flow for compiling and running models, to capture runtime behavior from the Ethos-U85 driver, such as cycle count information. - -To make the process easier for end users, we have also added scripts to the ExecuTorch repository: - - - -1. [Set up ExecuTorch](https://pytorch.org/executorch/main/getting-started-setup.html) -2. [setup.sh](https://github.com/pytorch/executorch/blob/main/examples/arm/setup.sh): Download the necessary software. -3. 
[run.sh](https://github.com/pytorch/executorch/blob/main/examples/arm/run.sh): to compile and run the model on the Corstone-320 FVP - -To build other models, you can use the ahead of time compiler script [aot_arm_compiler.py,](https://github.com/pytorch/executorch/blob/main/examples/arm/aot_arm_compiler.py) which takes a PyTorch program (nn.module) to an ExecuTorch program (.pte flatbuffer file). To write custom applications which use ExecuTorch you can follow the application flow in the example [executor_runner](https://github.com/pytorch/executorch/tree/main/examples/arm/executor_runner) application. - -We support approximately 40 core ATen operators and already support end-to-end deployment of models such as Mobilenetv2. Ongoing efforts to support further operators will enable more PyTorch models every week . - -As more functionality is added, it will be demonstrated through the tutorial materials for Ethos-U on [pytorch.org](https://pytorch.org/executorch/main/index.html) - - -## How this deployment flow works in more detail - -Leveraging the extensibility of ExecuTorch and the expressiveness of Arm’s [Tensor Operator Set Architecture (TOSA)](https://www.mlplatform.org/tosa/), we have enabled Ethos-U support in ExecuTorch. The Ethos-U compiler, [Vela](https://pypi.org/project/ethos-u-vela/), has been enhanced with a TOSA front-end, making it possible to compile models for all products in the Ethos-U family. Combining these components into a cohesive workflow involves the following steps. - - - -1. Converting a PyTorch model into a deployable ExecuTorch program (AOT flow) -2. Compile the ExecuTorch program into an executable, which can be deployed on Corstone-320 (runtime flow) - - -### The ExecuTorch Ahead of time (AOT) flow - -The process begins by converting a PyTorch model into a quantized TOSA representation using the PyTorch dynamo export flow. This allows us to generate an Ethos-U set of machine instructions, known as a command stream, utilizing the Vela compiler TOSA frontend. The command stream is bundled into an ExecuTorch program, represented by a flatbuffer file (.pte). This file contains everything the ExecuTorch runtime needs to perform inference using Ethos-U hardware. - - -![flow diagram](/assets/images/pt-executorch-ethos-u85/fg1.jpg){:style="width:100%"} - - -### The ExecuTorch Runtime flow - -The ExecuTorch runtime, written in C/C++, is designed to support multiple backends. We have extended it to include support for the Ethos-U device driver. Following this flow will produce a self-contained compiled executable. Deploying the executable on the Corstone-320 FVP is straightforward and requires only the appropriate flags when calling the FVP. - -![flow diagram](/assets/images/pt-executorch-ethos-u85/fg2.jpg){:style="width:100%"} - - -## Ethos-U85 and Corstone-320 - -The Ethos-U family of NPUs offers high performance and energy-efficient solutions for edge AI. The Ethos-U55 (also supported by ExecuTorch) is widely deployed in many Cortex-M heterogeneous systems, while the Ethos-U65 extends the applicability of the Ethos-U family to Cortex-A-based systems and increases the performance. - -Ethos-U85 further extends the Ethos-U product line, supporting current and future workloads on the edge using [transformer-based networks](https://newsroom.arm.com/blog/enabling-next-gen-edge-ai-applications-with-transformer-networks). Ethos-U85 delivers a 4x performance uplift and 20% higher energy efficiency compared to its predecessor, with up to 85% utilization on popular networks. 
Notable features of Ethos-U85 include:

* Configurations from 128 to 2048 MACs/cycle, delivering up to 4 TOP/s at 1GHz
* Compatibility with Cortex-A and Cortex-M based systems
* Native support for major neural networks through support for TOSA
* Full hardware acceleration of all major neural networks
* For a full list of features, see the [Ethos-U85 Technical Overview](https://developer.arm.com/documentation/102684/0000)

![A typical compute subsystem design with Ethos-U85](/assets/images/pt-executorch-ethos-u85/fg3.png){:style="width:100%"}

A typical compute subsystem design with Ethos-U85

## What’s next

We are adding new operator support every week, extending ExecuTorch core ATen operator coverage, and enabling a wider range of models to run on Ethos-U. Our ongoing efforts focus on improving performance to ensure models run as optimally as possible on Ethos-U.

The ExecuTorch delegate framework supports fallback to running operators not supported by Ethos-U on the CPU using reference kernel implementations. We will work towards optimal performance on Cortex-M CPUs using CMSIS-NN, providing the best possible support for fallback operators and ensuring optimal performance for devices without Ethos-U capability.

The package above, together with the Corstone-320 FVP, is another step toward simplifying application development, so please go ahead, check out the code and build process, and send us feedback. Meanwhile, we will be busy making weekly releases to enable more features and models and to extract the maximum performance out of the hardware.
diff --git a/_posts/2024-10-28-unleashing-ai-mobile.md b/_posts/2024-10-28-unleashing-ai-mobile.md deleted file mode 100644 index 2e703736337d..000000000000 --- a/_posts/2024-10-28-unleashing-ai-mobile.md +++ /dev/null @@ -1,150 +0,0 @@
---
layout: blog_detail
title: "Unleashing the Power of AI on Mobile: LLM Inference for Llama 3.2 Quantized Models with ExecuTorch and KleidiAI"
author: Gian Marco Iodice, Arm and Digant Desai, Meta
excerpt: "At the recent PyTorch Conference, Arm highlighted the widespread impact of its technology, spanning from cloud to edge, emphasizing its commitment to delivering its advanced AI computing capabilities seamlessly to millions of developers worldwide."
---

## Introduction

At the recent [PyTorch Conference](https://events.linuxfoundation.org/pytorch-conference/), Arm highlighted the widespread impact of its technology, spanning from cloud to edge, emphasizing its commitment to delivering its advanced AI computing capabilities seamlessly to millions of developers worldwide.

![key stats](/assets/images/unleashing-ai-mobile/fg1.png){:style="width:100%"}

During the presentation, it was emphasized that Arm bears the immense responsibility of equipping 20+ million developers and billions of users with advanced AI computing features without friction. Achieving this requires crucial software collaborations across a vast ecosystem of software and hardware partners.

Just a few months ago, Arm launched Arm Kleidi, developer enablement technologies and resources to drive technical collaboration and innovation across the ML stack. This includes the KleidiAI software library providing optimized software routines, which when integrated into key frameworks such as XNNPACK enable automatic AI acceleration for developers on Arm Cortex-A CPUs.
- -Today, we’re excited to announce a new milestone for the AI open-source community that brings Arm even closer to realizing this vision: the integration of KleidiAI into [ExecuTorch](https://github.com/pytorch/executorch) via XNNPACK, boosting AI workload performance on Arm mobile CPUs! - -Thanks to the collaborative efforts of the engineering teams at Arm and Meta, AI developers can now deploy quantized Llama models which run up to 20% faster on Arm Cortex-A v9 CPUs with the i8mm ISA extension. - -And there’s more exciting news - the ExecuTorch team has officially launched the [Beta release](https://pytorch.org/blog/executorch-beta/)! - -This marks an important milestone in our partnership. In this blog, we are eager to share more details about ExecuTorch capabilities, the new Meta Llama 3.2 models, the integer 4-bit with per-block quantization, and the impressive performance recorded on certain Arm CPUs. Notably, we have achieved speeds of over 350 tokens per second on the prefill stage with the quantized Llama 3.2 1B model on Samsung S24+ device, as shown in the following screenshots. - -![mobile app screenshots](/assets/images/unleashing-ai-mobile/fg2.png){:style="width:100%"} - - -Now, let’s dive into the key components that enabled the demo creation presented in the preceding images. First up: new Llama 3.2 models! - - -## Meta Llama 3.2 - -Meta recently [announced](https://ai.meta.com/blog/meta-llama-quantized-lightweight-models/) the first lightweight quantized Llama models, which are designed to run on popular mobile devices. Meta used two techniques for quantizing Llama 3.2 1B and 3B models: Quantization-Aware Training (QAT) with LoRA adaptors (QLoRA), and SpinQuant, a state-of-the-art post-training quantization method. The quantized models were evaluated using PyTorch's ExecuTorch framework as the inference engine, with the Arm CPU as a backend. - -These instruction-tuned models retain the quality and safety of the original 1B and 3B models while achieving a 2-4x speedup and reducing model size by 56% on average and memory footprint by 41% on average compared to the original BF16 format. - -In this blog post, we will demonstrate the performance improvements we observed in our experiments. - - -## ExecuTorch - -[ExecuTorch](https://github.com/pytorch/executorch) is a PyTorch-native framework specifically designed for deploying AI models on-device, enhancing privacy and reducing latency. It supports the deployment of cutting-edge open-source AI models, including the Llama family of models and vision and speech models like [Segment Anything](https://segment-anything.com/) and [Seamless](https://ai.meta.com/research/seamless-communication/). - -This unlocks new possibilities for edge devices such as mobile phones, smart glasses, VR headsets, and smart home cameras. Traditionally, deploying PyTorch-trained AI models to resource-limited edge devices has been challenging and time-consuming, often requiring conversion to other formats which could lead to errors and suboptimal performance. The varied toolchains across the hardware and edge ecosystem have also degraded the developer experience, making a universal solution impractical. - -ExecuTorch addresses these issues by providing composable components that include core runtime, operator library, and delegation interface that allows for portability as well extensibility. 
Models can be exported using torch.export(), producing a graph that is natively compatible with the ExecuTorch runtime, capable of running on most edge devices with CPUs, and extendable to specialized hardware like GPUs and NPUs for enhanced performance. - -Working with Arm, ExecuTorch now leverages the optimized low-bit matrix multiplication kernels from the Arm KleidiAI library to improve on-device Large Language Model (LLM) inference performance via XNNPACK. We also thank the XNNPACK team at Google for supporting this effort. - -In this post, we will focus on this integration available in [ExecuTorch](https://github.com/pytorch/executorch/blob/main/examples/models/llama2/README.md) - - -## Evolving the architecture for AI workloads - -At Arm, we have been deeply committed to investing in open-source projects and advancing new technologies in our processors since the early days of the deep learning wave, focusing on making AI workloads high-performing and more power-efficient. - -For instance, Arm introduced the SDOT instruction, starting with the Armv8.2-A architecture, to accelerate dot product arithmetic between 8-bit integer vectors. This feature, now widely available in mobile devices, significantly speeds up the computation of quantized 8-bit models. After the SDOT instruction, Arm introduced the BF16 data type and the MMLA instruction to further enhance the floating-point and integer matrix multiplication performance on CPUs and, most recently, announced the Scalable Matrix Extension (SME), marking a significant leap forward in machine learning capabilities. - -The following image shows a few examples of Arm CPU's continuous innovations in the AI space over the last decade: - - -![line chart](/assets/images/unleashing-ai-mobile/fg3.jpg){:style="width:100%"} - - -Given the widespread use of Arm CPUs, AI frameworks need to take full advantage of these technologies in key operators to maximize performance. Recognizing this, we saw the need for an open-source library to share these optimized software routines. However, we were mindful of the challenges in integrating a new library into AI frameworks, such as concerns about library size, dependencies, and documentation and the need to avoid adding extra burdens for developers. So, we took extra steps to gather feedback from our partners and ensure a smooth integration process that does not require additional dependencies for AI developers. This effort led to KleidiAI, an open-source library that provides optimized performance-critical routines for artificial intelligence (AI) workloads tailored for Arm CPUs. You can learn more about KleidiAI [here](https://community.arm.com/arm-community-blogs/b/ai-and-ml-blog/posts/kleidiai). - -Working with the ExecuTorch team at Meta, Arm provided the software optimizations for their novel 4-bit with per-block quantization schema, which is used to accelerate the matrix multiplication kernel in the Transformer layer’s torch.nn.linear operator for Llama 3.2 quantized models. This flexible 4-bit quantization schema from ExecuTorch strikes a balance between model accuracy and low-bit matrix multiplication performance targeting on-device LLMs. 
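
Before the next section walks through the scheme in detail, here is a small, purely illustrative sketch of what group-wise (per-block) int4 weight quantization with one BF16 scale per block looks like in plain PyTorch. The group size, the symmetric rounding, and the function names below are assumptions made for illustration only; they are not the actual KleidiAI or ExecuTorch implementation.


```
import torch

def quantize_per_block_int4(weight: torch.Tensor, group_size: int = 32):
    """Illustrative group-wise int4 quantization: one BF16 scale per block of each output row."""
    out_features, in_features = weight.shape
    assert in_features % group_size == 0
    blocks = weight.reshape(out_features, in_features // group_size, group_size)
    # Symmetric quantization: pick a per-block scale so values map into the int4 range [-8, 7].
    scales = blocks.abs().amax(dim=-1, keepdim=True).clamp(min=1e-8) / 7.0
    q = torch.clamp(torch.round(blocks / scales), -8, 7).to(torch.int8)  # int4 values held in int8
    return q, scales.to(torch.bfloat16)

def dequantize_per_block_int4(q: torch.Tensor, scales: torch.Tensor) -> torch.Tensor:
    # Reconstruct an approximation of the original weights block by block.
    out_features = q.shape[0]
    return (q.float() * scales.float()).reshape(out_features, -1)

# Quick round-trip check on a random linear weight.
w = torch.randn(8, 64)
q, s = quantize_per_block_int4(w)
print((w - dequantize_per_block_int4(q, s)).abs().max())
```


The next section describes how the production scheme pairs this kind of per-block weight quantization with 8-bit per-row quantization of the activations.
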
- - -## The integer 4-bit with per-block quantization - -In KleidiAI, we introduced micro-kernels optimized for this new 4-bit integer quantization scheme (**matmul_clamp_f32_qai8dxp_qsi4c32p**) - -As shown in the following image, this 4-bit quantization uses a per-block strategy for weight (RHS matrix) quantization and an 8-bit per-row quantization for activations (LHS matrix): - - -![arch diagram](/assets/images/unleashing-ai-mobile/fg4.png){:style="width:100%"} - - -As you can see in the preceding image, each output feature map (OFM) in the weight matrix is divided into equally sized blocks (group size), with each block having a scale factor stored in BF16 format. BF16 is advantageous because it maintains the dynamic range of 32-bit floating-point (FP32) format with half the bit size, and it’s easy to convert to and from FP32 using a simple shift operation. This makes BF16 ideal for saving model space, preserving accuracy, and ensuring backward compatibility with devices that lack BF16 hardware acceleration. You can learn more about the BF16 format in [this](https://community.arm.com/arm-community-blogs/b/ai-and-ml-blog/posts/bfloat16-processing-for-neural-networks-on-armv8_2d00_a) Arm Community blog post. - -For completeness, this 4-bit quantization scheme and our implementation in KleidiAI allow users to configure group size for the linear weights (RHS), allowing them to trade-off between model size, model accuracy, and model performance if the model is quantized by the user. - -At this point, we are ready to unveil the incredible performance recorded on Arm CPUs with ExecuTorch when running Llama 3.2 1B and Llama 3.2 3B. Let’s first go over metrics we will use to evaluate the performance of LLM inference. - - -### Metrics for LLM Inference - -Typically, performance metrics used to evaluate LLM performance during inference include: - - - -* **Time To First Token (TTFT)**: This measures the time it takes to produce the first output token after a prompt is provided by the user. This latency or response time is important for a good user experience, especially on a phone. TTFT is also a function of the length of the prompt or prompt tokens. To make this metric independent of the prompt length, we use Prefill tokens/second as a proxy here. The relationship between these is inverse: lower TTFT corresponds to higher Prefill tokens/second. -* **Decode Performance**: This is the average number of output tokens generated per second, thus reported in Tokens/Second. It is independent of the total number of tokens generated. For on-device inference, it is important to keep this higher than a user's average reading speed. -* **Peak Runtime Memory**: This metric reflects the amount of RAM, typically reported in MegaBytes (MiB), needed to run the model with expected performance measured using the metrics above. Given the limited amount of RAM available on Android and iOS devices, this is one of the key metrics for on-device LLM deployment. It dictates the type of models that can be deployed on a device. - - -### Results - - -The quantized Llama 3.2 1B models, both SpinQuant and QLoRA, are designed to run efficiently on a wide range of phones with limited RAM. In this section, we demonstrate that the quantized Llama 3.2 1B models can achieve over 350 tokens per second in the prefill phase and over 40 tokens per second in the decode stage. This level of performance is sufficient to enable on-device text summarization with a reasonable user experience using only Arm CPUs. 
To put this into perspective, on average, 50 unread messages contain about 600 tokens. With this performance, the response time (the time it takes for the first generated word to appear on the screen) is approximately two seconds. - - -We present measurements from a Samsung S24+ running vanilla Android. We used Llama 3.2 1B parameter models for these experiments. Although we only demonstrate using 1B models, similar performance gains can be expected for the 3B parameter models. The experiment setup involves doing a single warmup run, sequence length of 128, prompt length of 64, and using 6 out of 8 available CPUs, and measuring [results](https://github.com/pytorch/executorch/tree/main/examples/models/llama#step-5-run-benchmark-on) over adb. - - -Using the ExecuTorch main branch from GitHub, we first generated the ExecuTorch PTE binary files for each model using the published checkpoints. Then, using the same repository, we generated the ExecuTorch runtime binary for Armv8. In the rest of the section, we will compare the performance of different quantized 1B models against the BF16 model using the binary built with KleidiAI. We will also compare the performance gains for quantized models between the binary with KleidiAI and the one without KleidiAI to distill the impact from KleidiAI. - - -#### Quantized Model Performance - - -Llama 3.2 quantized models both SpinQuant and QLoRA perform significantly better on prompt prefill and text generation (decode) compared to the baseline BF16. We observed a >2x improvement in decode and a >5x improvement in prefill performance. - - -Furthermore, the quantized model size, PTE file size in bytes, is less than half that of the BF16 model, 2.3 GiB vs. 1.1 GiB. Although the size of int4 is a quarter of BF16, some layers in the model are quantized with int8, making the PTE file size ratio larger. We observed runtime peak memory footprint reduction of almost 40% from 3.1 GiB for the BF16 model to 1.9 GiB for the SpinQuant model, measured in Resident Set Size (RSS) for a maximum sequence length of 2048. - -With all-around improvements, the new quantized Llama 3.2 models are ideal for on-device deployment targeting Arm CPUs. For more information on accuracy, check out the Meta Llama 3.2 blog. - - -![bar graph](/assets/images/unleashing-ai-mobile/fg5.png){:style="width:100%"} - - - -#### KleidiAI Impact - - -ExecuTorch relies on the Arm KleidiAI library to provide low-bit performant matrix multiplication kernels for the latest Arm CPUs with advanced Armv8/9 ISA features. These kernels are utilized for on-device quantized Llama 3.2 model inference in ExecuTorch. As depicted in the graph below, ExecuTorch achieves an average of >20% better prefill performance on S24+ with KleidiAI compared to non-KleidiAI kernels, while maintaining the same accuracy. This performance advantage is not limited to specific models or devices, and is expected to benefit all ExecuTorch models using low-bit quantized matrix multiplication on Arm CPUs. - - -To assess the impact of Kleidi, we generated two ExecuTorch runtime binaries targeting Arm Cortex-A CPUs and compared their performance. - - - -1. The first ExecuTorch runtime binary built with the Arm KleidiAI library through the XNNPACK library. -2. The second binary was built without the Arm KleidiAI repository, using native kernels from the XNNPACK library. - - -![bar chart](/assets/images/unleashing-ai-mobile/fg6.png){:style="width:100%"} - - - -## Try it yourself! - -Ready to experience the performance improvements firsthand? 
Here's how you can try out ExecuTorch with the optimizations provided by KleidiAI on your projects: Here is a [link to the learning path](https://learn.arm.com/learning-paths/smartphones-and-mobile/build-llama3-chat-android-app-using-executorch-and-xnnpack/) from Arm to start developing your own application using LLMs using ExecuTorch and KleidiAI. - -We look forward to hearing your feedback! \ No newline at end of file diff --git a/_posts/2024-10-30-triton-kernel-compilation-stages.md b/_posts/2024-10-30-triton-kernel-compilation-stages.md deleted file mode 100644 index 10b0e3d88785..000000000000 --- a/_posts/2024-10-30-triton-kernel-compilation-stages.md +++ /dev/null @@ -1,205 +0,0 @@ ---- -layout: blog_detail -title: "Triton Kernel Compilation Stages" -author: Sara Kokkila-Schumacher*, Brian Vaughan*, Raghu Ganti*, and Less Wright+ (*IBM Research, +Meta) ---- - -The Triton open-source programming language and compiler offers a high-level, python-based approach to create efficient GPU code. In this blog, we highlight the underlying details of how a triton program is compiled and the intermediate representations. For an introduction to Triton, we refer readers to this [blog](https://openai.com/index/triton/). - - -## Triton Language and Compilation - -The Triton programming language supports different types of modern GPUs and follows a blocked programming approach. As an example, we will follow the [Triton vector add tutorial](https://github.com/triton-lang/triton/blob/main/python/tutorials/01-vector-add.py) with minor modifications. The vector addition kernel and helper function is defined as: - - -``` -import torch -import triton -import triton.language as tl - -@triton.jit -def add_kernel(x_ptr, # *Pointer* to first input vector. - y_ptr, # *Pointer* to second input vector. - output_ptr, # *Pointer* to output vector. - n_elements, - BLOCK_SIZE: tl.constexpr, - ): - - pid = tl.program_id(axis=0) - block_start = pid * BLOCK_SIZE - offsets = block_start + tl.arange(0, BLOCK_SIZE) - - mask = offsets < n_elements - - x = tl.load(x_ptr + offsets, mask=mask) - y = tl.load(y_ptr + offsets, mask=mask) - output = x + y - tl.store(output_ptr + offsets, output, mask=mask) - -def add(x: torch.Tensor, y: torch.Tensor): - output = torch.empty_like(x) - assert x.is_cuda and y.is_cuda and output.is_cuda - n_elements = output.numel() - - grid = lambda meta: (triton.cdiv(n_elements, meta['BLOCK_SIZE']), ) - triton_kernel=add_kernel[grid](x, y, output, n_elements, BLOCK_SIZE=1024) - torch.cuda.synchronize() - - # Save compilation stages - some of the stages identified here are specific to NVIDIA devices: - with open('triton_IR.txt', 'w') as f: - print(triton_kernel.asm['ttir'], file=f) - with open('triton_TTGIR.txt', 'w') as f: - print(triton_kernel.asm['ttgir'], file=f) - with open('triton_LLVMIR.txt', 'w') as f: - print(triton_kernel.asm['llir'], file=f) - with open('triton_PTX.ptx', 'w') as f: - print(triton_kernel.asm['ptx'], file=f) - with open('triton_cubin.txt', 'w') as f: - print(triton_kernel.asm['cubin'], file=f) - - return output - -torch.manual_seed(0) -size = 98432 -x = torch.rand(size, device='cuda') -y = torch.rand(size, device='cuda') -output_torch = x + y -output_triton = add(x, y) -print(output_torch) -print(output_triton) -print(f'The maximum difference between torch and triton is ' - f'{torch.max(torch.abs(output_torch - output_triton))}') -``` - - -The Triton vector add kernel includes the `@triton.jit` decorator. 
The Triton compiler will compile functions marked by `@triton.jit`, which lowers the function through multiple compilation stages. The helper function `add` allocates the output tensor, computes the appropriate GPU grid size, and additionally saves the intermediate compilation stages. - -Focusing on the compilation process, the Triton kernel is lowered to device specific assembly through a series of stages outlined in the following figure. - - - -![compilation process](/assets/images/triton-kernel-compilation-stages.jpg){:style="width:100%; max-width: 500px; margin-left: auto; margin-right: auto; display: block"} - - - -The kernel is compiled by first walking the abstract syntax tree (AST) of the decorated python function to create the Triton Intermediate Representation (Triton-IR). The Triton-IR is an unoptimized, machine independent intermediate representation. It introduces tile-level programming requirements and is based on the open-source LLVM compiler project. Next the Triton compiler optimizes and converts the Triton-IR into the stages Triton-GPU IR (Triton-TTGIR) and then LLVM-IR. Both the Triton-IR and Triton-GPUIR representations are written as MLIR dialects, where MLIR is a subproject of LLVM that aims to improve compilation for heterogeneous hardware. - -For the Triton vector add tutorial kernel, the example Triton IR snippet is: - - -``` -module { - tt.func public @add_kernel(%arg0: !tt.ptr {tt.divisibility = 16 : i32} loc("/u/saraks/triton_blog/01-vector-add.py":28:0), %arg1: !tt.ptr {tt.divisibility = 16 : i32} loc("/u/saraks/triton_blog/01-vector-add.py":28:0), %arg2: !tt.ptr {tt.divisibility = 16 : i32} loc("/u/saraks/triton_blog/01-vector-add.py":28:0), %arg3: i32 {tt.divisibility = 16 : i32} loc("/u/saraks/triton_blog/01-vector-add.py":28:0)) attributes {noinline = false} { - %c1024_i32 = arith.constant 1024 : i32 loc(#loc1) - %0 = tt.get_program_id x : i32 loc(#loc2) - %1 = arith.muli %0, %c1024_i32 : i32 loc(#loc3) - %2 = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32> loc(#loc4) - %3 = tt.splat %1 : i32 -> tensor<1024xi32> loc(#loc5) - %4 = arith.addi %3, %2 : tensor<1024xi32> loc(#loc5) - %5 = tt.splat %arg3 : i32 -> tensor<1024xi32> loc(#loc6) - %6 = arith.cmpi slt, %4, %5 : tensor<1024xi32> loc(#loc6) - %7 = tt.splat %arg0 : !tt.ptr -> tensor<1024x!tt.ptr> loc(#loc7) - %8 = tt.addptr %7, %4 : tensor<1024x!tt.ptr>, tensor<1024xi32> loc(#loc7) - %9 = tt.load %8, %6 : tensor<1024x!tt.ptr> loc(#loc8) - %10 = tt.splat %arg1 : !tt.ptr -> tensor<1024x!tt.ptr> loc(#loc9) - %11 = tt.addptr %10, %4 : tensor<1024x!tt.ptr>, tensor<1024xi32> loc(#loc9) - %12 = tt.load %11, %6 : tensor<1024x!tt.ptr> loc(#loc10) - %13 = arith.addf %9, %12 : tensor<1024xf32> loc(#loc11) - %14 = tt.splat %arg2 : !tt.ptr -> tensor<1024x!tt.ptr> loc(#loc12) - %15 = tt.addptr %14, %4 : tensor<1024x!tt.ptr>, tensor<1024xi32> loc(#loc12) - tt.store %15, %13, %6 : tensor<1024x!tt.ptr> loc(#loc13) - tt.return loc(#loc14) - } loc(#loc) -} loc(#loc) -``` - - -Notice that the main functions in the Triton kernel are now represented as: - - - - - - - - - - - - - - - - - - - - - - - -
| Triton kernel | Triton IR |
|---|---|
| `x = tl.load(x_ptr + offsets, mask=mask)` | `%9 = tt.load %8, %6 : tensor<1024x!tt.ptr<f32>> loc(#loc8)` |
| `y = tl.load(y_ptr + offsets, mask=mask)` | `%12 = tt.load %11, %6 : tensor<1024x!tt.ptr<f32>> loc(#loc10)` |
| `output = x + y` | `%13 = arith.addf %9, %12 : tensor<1024xf32> loc(#loc11)` |
| `tl.store(output_ptr + offsets, output, mask=mask)` | `tt.store %15, %13, %6 : tensor<1024x!tt.ptr<f32>> loc(#loc13)` |
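
Since the `add` helper above already writes each compilation stage to disk, one quick way to reproduce this mapping yourself is to scan the dumped Triton-IR for the load, add, and store operations. A minimal sketch, assuming the `triton_IR.txt` file produced by the earlier example is in the current directory:


```
# Print the Triton-IR lines that correspond to the kernel's loads, add, and store.
with open("triton_IR.txt") as f:
    for line in f:
        if any(op in line for op in ("tt.load", "arith.addf", "tt.store")):
            print(line.rstrip())
```
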
        - - -At the Triton IR stage, the `%arg0: !tt.ptr<f32>` and the following tensor references show that the intermediate representation is already specialized by the data type. - -We ran this example on a Tesla V100-SXM2-32GB GPU with CUDA Version 12.2, Python version 3.11.9, and PyTorch 2.4.1 with the default version of Triton that is installed with PyTorch. On this device, the simple vector addition has the following Triton GPU IR snippet with lines omitted for clarity: - - -``` -#blocked = #triton_gpu.blocked<{sizePerThread = [4], threadsPerWarp = [32], warpsPerCTA = [4], order = [0]}> -module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, triton_gpu.target = "cuda:70", "triton_gpu.threads-per-warp" = 32 : i32} { - tt.func public @add_kernel(%arg0: !tt.ptr {tt.divisibility = 16 : i32} - ⋮ - %9 = tt.load %8, %6 : tensor<1024x!tt.ptr, #blocked> loc(#loc8) - ⋮ - %12 = tt.load %11, %6 : tensor<1024x!tt.ptr, #blocked> loc(#loc10) - %13 = arith.addf %9, %12 : tensor<1024xf32, #blocked> loc(#loc11) - ⋮ - tt.store %15, %13, %6 : tensor<1024x!tt.ptr, #blocked> loc(#loc13) - ⋮ - } loc(#loc) -} loc(#loc) -``` - - -At this stage, some of the hardware specific information is included. For example, the compute capability is included along with details on how the tensors are distributed to cores and warps or for AMD GPUs on wavefronts. In this example, the tensors are represented as a `#blocked` layout. In this encoding, each warp owns a contiguous portion of the tensor. Currently, other possible memory optimizations include layouts such as `slice` (restructures and distributes a tensor along a dimension), `dot_op`(optimized layout for block matrix product), `shared`(indicates GPU shared memory), `nvidia_mma` (produced by NVIDIA tensor cores), `amd_mfma` (produced by AMD MFMA matrix core), and `amd_wmma` (produced by AMD WMMA matrix core). As announced at the recent Triton conference, this layout representation will transition to a new linear layout to unify layouts within and across backends. The stage from Triton-GPUIR to LLVM-IR converts the Triton-GPUIR to LLVM's representation. At this time, Triton has third-party backend support for NVIDIA and AMD devices, but other device support is under active development by the open-source community. - -A small subset of the LLVM-IR vector add arguments shown below for illustration: - - -``` - %19 = extractvalue { i32, i32, i32, i32 } %18, 0, !dbg !16 - %39 = extractvalue { i32, i32, i32, i32 } %38, 0, !dbg !18 - %23 = bitcast i32 %19 to float, !dbg !16 - %43 = bitcast i32 %39 to float, !dbg !18 - %56 = fadd float %23, %43, !dbg !19 -``` - - -After some pointer arithmetic and an inline assembly call to retrieve the data from global memory, the vector elements are extracted and cast to the correct type. Finally they are added together and later written to global memory through an inline assembly expression. - -The final stages of the Triton compilation process lower the LLVM-IR to a device specific binary. For the example vector add, on an NVIDIA GPU, the next intermediate is PTX (Parallel Thread Execution). The low-level PTX syntax specifies the execution at the thread level of NVIDIA devices, starting with the CUDA 1.0 release. For an in-depth guide on PTX, see [NVIDIA's documentation](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#). 
In the vector add, the kernel parameters are passed from the host to the kernel, addresses are assigned and `mov` instructions facilitate the thread-level data access, ultimately representing the element addition calls with `add.f32` such as the example below: - - -``` - add.f32 %f17, %f1, %f9// add type float32, output register, input register for x, input register for y -``` - - -The Triton compiler orchestrates the final stage with different hardware backends managing how the assembly code is compiled into binary. The Triton kernel is now ready for use. - - -## Summary - -Triton provides a high-level abstraction to program and compile kernels for different types of hardware. In this post, we highlight the different stages of the Triton code representations and Triton compiler. For details on including custom Triton kernels or accelerating different workloads with Triton kernels, check out the [PyTorch Triton tutorial](https://pytorch.org/tutorials/recipes/torch_compile_user_defined_triton_kernel_tutorial.html), the blog posts on [Triton GPTQ kernels](https://pytorch.org/blog/accelerating-triton), [Llama3 FP8 Inference with Triton](https://pytorch.org/blog/accelerating-llama3/), and [CUDA-Free Inference for LLMs](https://pytorch.org/blog/cuda-free-inference-for-llms/), or the [PyTorch 2.2 Section on Triton code generation](https://pytorch.org/assets/pytorch2-2.pdf). \ No newline at end of file diff --git a/_posts/2024-10-31-deploying-llms-torchserve-vllm.md b/_posts/2024-10-31-deploying-llms-torchserve-vllm.md deleted file mode 100644 index 07e184695ac4..000000000000 --- a/_posts/2024-10-31-deploying-llms-torchserve-vllm.md +++ /dev/null @@ -1,210 +0,0 @@ ---- -layout: blog_detail -title: "Deploying LLMs with TorchServe + vLLM" -author: Matthias Reso, Ankith Gunapal, Simon Mo, Li Ning, Hamid Shojanazeri ---- - -The vLLM engine is currently one of the top-performing ways to execute large language models (LLM). It provides the [vllm serve](https://docs.vllm.ai/en/latest/serving/openai_compatible_server.html) command as an easy option to deploy a model on a single machine. While this is convenient, to serve these LLMs in production and at scale some advanced features are necessary. - - -![flow diagram](/assets/images/deploying-llms-torchserve-vllm/fg1.png){:style="width:100%"} - - - -TorchServe offers these essential production features (like custom metrics and model versioning) and through its flexible custom handler design, makes it very easy to integrate features such as retrieval-augmented generation (RAG) or safeguards like [Llama Guard](https://ai.meta.com/research/publications/llama-guard-llm-based-input-output-safeguard-for-human-ai-conversations/). It is therefore natural to pair the vLLM engine with TorchServe to create a full-fledged LLM serving solution for production. - -Before going into the specifics of the integration, we will demonstrate the deployment of a Llama-3.1-70B-Instruct model using TorchServe's vLLM docker image. - - -## Quickly getting started with Llama 3.1 on TorchServe + vLLM - -To get started we need to build the [new TS LLM Docker](https://github.com/pytorch/serve/blob/master/docker/Dockerfile.llm) container image by checking out the [TorchServe repository](https://github.com/pytorch/serve) and execute the following command from the main folder: - -``` -docker build --pull . 
-f docker/Dockerfile.vllm -t ts/vllm -``` - -The container uses our new LLM launcher script `ts.llm_launcher` which takes a Hugging Face model URI or local folder and spins up a local TorchServe instance with the vLLM engine running in the backend. To serve a model locally, you can create an instance of the container with the following command: - -``` -#export token= -docker run --rm -ti --shm-size 10g --gpus all -e HUGGING_FACE_HUB_TOKEN=$token -p -8080:8080 -v data:/data ts/vllm --model_id meta-llama/Meta-Llama-3.1-70B-Instruct --disable_token_auth -``` - -You can test the endpoint locally with this curl command: - -``` -curl -X POST -d '{"model":"meta-llama/Meta-Llama-3.1-70B-Instruct", "prompt":"Hello, my name is", "max_tokens": 200}' --header "Content-Type: application/json" "http://localhost:8080/predictions/model/1.0/v1/completions" -``` - -The docker stores the model weights in the local folder “data” which gets mounted as /data inside the container. To serve your custom local weights simply copy them into data and point the model_id to /data/<your weights>. - -Internally, the container uses our new `ts.llm_launcher` script to launch TorchServe and deploy the model. The launcher simplifies the deployment of an LLM with TorchServe into a single command line and can also be used outside the container as an efficient tool for experimentation and testing. To use the launcher outside the docker, follow the [TorchServe installation steps](https://github.com/pytorch/serve?tab=readme-ov-file#-quick-start-with-torchserve) and then execute the following command to spin up a 8B Llama model: - -``` -# after installing TorchServe and vLLM run -python -m ts.llm_launcher --model_id meta-llama/Meta-Llama-3.1-8B-Instruct --disable_token_auth -``` - -If multiple GPUs are available the launcher will automatically claim all visible devices and apply tensor parallelism (see [CUDA_VISIBLE_DEVICES](https://developer.nvidia.com/blog/cuda-pro-tip-control-gpu-visibility-cuda_visible_devices/) to specify which GPUs to use). - -While this is very convenient, it's important to note that it does not encompass all the functionalities provided by TorchServe. For those looking to leverage more advanced features, a model archive needs to be created. While this process is a bit more involved than issuing a single command, it bears the advantage of custom handlers and versioning. While the former allows to implement RAG inside the preprocessing step, the latter lets you test different versions of a handler and model before deploying on a larger scale. - -Before we provide the detailed steps to create and deploy a model archive, let’s dive into the details of the vLLM engine integration. - - -## TorchServe’s vLLM Engine Integration - -As a state-of-the-art serving framework, vLLM offers a plethora of advanced features, including PagedAttention, continuous batching, rapid model execution through CUDA graphs, and support for various quantization methods such as GPTQ, AWQ, INT4, INT8, and FP8. It also provides integration for important parameter-efficient adapter methods like LoRA and access to a wide range of model architectures including Llama and Mistral. vLLM is maintained by the vLLM team and a thriving open-source community. - -To facilitate quick deployment, it offers a serving mode based on FastAPI to serve LLMs over HTTP. 
For a tighter, more flexible integration the project also provides the [vllm.LLMEngine](https://docs.vllm.ai/en/latest/dev/engine/llm_engine.html) which offers interfaces to process requests on a continuous basis. We leveraged the [asynchronous variant](https://docs.vllm.ai/en/latest/dev/engine/async_llm_engine.html) for the integration into TorchServe. - -[TorchServe](https://pytorch.org/serve/) is an easy-to-use, open-source solution for serving PyTorch models in production. As a production-tested serving solution, TorchServe offers numerous benefits and features beneficial for deploying PyTorch models at scale. By combining it with the inference performance of the vLLM engine these benefits can now also be used to deploy LLMs at scale. - - -![Torchserve highlights and integrations](/assets/images/deploying-llms-torchserve-vllm/fg2.png){:style="width:100%"} - - -To maximize hardware utilization it is generally a good practice to batch requests from multiple users together. Historically, TorchServe only offered a synchronized mode to collect requests from various users. In this mode, TorchServe waits for a predefined amount of time (e.g., batch_delay=200ms) or until enough requests (e.g., batch_size=8) have arrived. When one of these events is triggered, the batched data gets forwarded to the backend where the model is applied to the batch, and the model output is returned to the users through the frontend. This works especially well for traditional vision models where outputs for each request usually finish at the same time. - -For generative use cases, particularly text generation, the assumption that requests are ready simultaneously is no longer valid, as responses will have varying lengths. Although TorchServe supports continuous batching (the ability to add and remove requests dynamically), this mode only accommodates a static maximum batch size. With the introduction of PagedAttention, even this assumption of a maximum batch size becomes more flexible, as vLLM can combine requests of different lengths in a highly adaptable manner to optimize memory utilization. - -To achieve optimal memory utilization, i.e., to fill unused gaps in memory (think Tetris), vLLM requires complete control over the decision of which requests to process at any given time. To provide this flexibility, we had to reevaluate how TorchServe handles user requests. Instead of the previous synchronous processing mode, we introduced an [asynchronous mode](https://github.com/pytorch/serve/blob/ba8c268fe09cb9396749a9ae5d480ba252764d71/examples/large_models/vllm/llama3/model-config.yaml#L7) (see diagram below) where incoming requests are directly forwarded to the backend, making them available for vLLM. The backend feeds the vllm.AsyncEngine, which can now select from all available requests. If streaming mode is enabled and the first token of a request is available, the backend will send out the result immediately and continue sending tokens until the final token is generated. - -![flow diagram](/assets/images/deploying-llms-torchserve-vllm/fg3.png){:style="width:100%"} - - -[Our implementation of the VLLMHandler](https://github.com/pytorch/serve/blob/master/ts/torch_handler/vllm_handler.py) enables users to quickly deploy any model compatible with vLLM using a configuration file, while still offering the same level of flexibility and customizability through a custom handler. Users are free to add e.g. 
custom [preprocessing](https://github.com/pytorch/serve/blob/ba8c268fe09cb9396749a9ae5d480ba252764d71/ts/torch_handler/vllm_handler.py#L108) or [post-processing](https://github.com/pytorch/serve/blob/ba8c268fe09cb9396749a9ae5d480ba252764d71/ts/torch_handler/vllm_handler.py#L160) steps by inheriting from VLLMHandler and overriding the respective class methods. - -We also support single-node, multi-GPU [distributed inference](https://github.com/pytorch/serve/blob/master/examples/large_models/vllm/Readme.md#distributed-inference), where we configure vLLM to use tensor parallel sharding of the model to either increase capacity for smaller models or enable larger models that do not fit on a single GPU, such as the 70B Llama variants. Previously, TorchServe only supported distributed inference using torchrun, where multiple backend worker processes were spun up to shard the model. vLLM manages the creation of these processes internally, so we [introduced the new “custom” parallelType to TorchServe](https://github.com/pytorch/serve/blob/master/examples/large_models/vllm/Readme.md#distributed-inference) which launches a single backend worker process and provides the list of assigned GPUs. The backend process can then launch its own subprocesses if necessary. - -To facilitate integration of TorchServe + vLLM into docker-based deployments, we provide a separate [Dockerfile](https://github.com/pytorch/serve?tab=readme-ov-file#-quick-start-llm-deployment-with-docker) based on [TorchServe’s GPU docker image](https://hub.docker.com/r/pytorch/torchserve), with vLLM added as a dependency. We chose to keep the two separate to avoid increasing the docker image size for non-LLM deployments. - -Next, we will demonstrate the steps required to deploy a Llama 3.1 70B model using TorchServe + vLLM on a machine with four GPUs. - - -## Step-by-Step Guide - -For this step-by-step guide we assume the [installation of TorchServe](https://github.com/pytorch/serve/tree/master?tab=readme-ov-file#-quick-start-with-torchserve) has finished successfully. Currently, vLLM is not a hard-dependency for TorchServe so let’s install the package using pip: - -``` -$ pip install -U vllm==0.6.1.post2 -``` - -In the following steps, we will (optionally) download the model weights, explain the configuration, create a model archive, deploy and test it: - - - -### 1. (Optional) Download Model Weights - -This step is optional, as vLLM can also handle downloading the weights when the model server is started. However, pre-downloading the model weights and sharing the cached files between TorchServe instances can be beneficial in terms of storage usage and startup time of the model worker. If you choose to download the weights, use the huggingface-cli and execute: - -``` -# make sure you have logged into huggingface with huggingface-cli login before -# and have your access request for the Llama 3.1 model weights approved - -huggingface-cli download meta-llama/Meta-Llama-3.1-70B-Instruct --exclude original/* -``` - -This will download the files under $HF_HOME, and you can alter the variable if you want to place the files elsewhere. Please ensure that you update the variable wherever you run TorchServe and make sure it has access to that folder. - - - -### 2. Configure the Model - -Next, we create a YAML configuration file that contains all the necessary parameters for our model deployment. The first part of the config file specifies how the frontend should launch the backend worker, which will ultimately run the model in a handler. 
The second part includes parameters for the backend handler, such as the model to load, followed by various parameters for vLLM itself. For more information on possible configurations for the vLLM engine, please refer to this [link](https://docs.vllm.ai/en/latest/models/engine_args.html#engine-args). - -``` -echo ' -# TorchServe frontend parameters -minWorkers: 1 -maxWorkers: 1 # Set the number of workers to create a single model instance -startupTimeout: 1200 # (in seconds) Give the worker time to load the model weights -deviceType: "gpu" -asyncCommunication: true # This ensures we can communicate asynchronously with the worker -parallelType: "custom" # This lets TS create a single backend process that is assigned 4 GPUs -parallelLevel: 4 - -# Handler parameters -handler: - # model_path can be a model identifier for Hugging Face hub or a local path - model_path: "meta-llama/Meta-Llama-3.1-70B-Instruct" - vllm_engine_config: # vLLM configuration which gets fed into AsyncVLLMEngine - max_num_seqs: 16 - max_model_len: 512 - tensor_parallel_size: 4 - served_model_name: - - "meta-llama/Meta-Llama-3.1-70B-Instruct" - - "llama3" -'> model_config.yaml -``` - -### 3. Create the Model Folder - -After creating the model configuration file (model_config.yaml), we will now create a model archive that includes the configuration and additional metadata, such as versioning information. Since the model weights are large, we will not include them inside the archive. Instead, the handler will access the weights by following the model_path specified in the model configuration. Note that in this example, we have chosen to use the "no-archive" format, which creates a model folder containing all necessary files. This allows us to easily modify the config files for experimentation without any friction. Later, we can also select the mar or tgz format to create a more easily transportable artifact. - -``` -mkdir model_store -torch-model-archiver --model-name vllm --version 1.0 --handler vllm_handler --config-file model_config.yaml --archive-format no-archive --export-path model_store/ -``` - -### 4. Deploy the Model - -The next step is to start a TorchServe instance and load the model. Please note that we have disabled token authentication for local testing purposes. It is highly recommended to implement some form of authentication when publicly deploying any model. - -To start the TorchServe instance and load the model, run the following command: - -``` -torchserve --start --ncs --model-store model_store --models vllm --disable-token-auth -``` - -You can monitor the progress of the model loading through the log statements. Once the model has finished loading, you can proceed to test the deployment. - - - -### 5. Test the Deployment - -The vLLM integration uses an OpenAI API-compatible format, so we can either use a specialized tool for this purpose or curl. The JSON data we are using here includes the model identifier as well as the prompt text. Other options and their default values can be found in the [vLLMEngine](https://docs.vllm.ai/en/latest/serving/openai_compatible_server.html) docs.
- -``` -echo '{ - "model": "llama3", - "prompt": "A robot may not injure a human being", - "stream": 0 -}' | curl --header "Content-Type: application/json" --request POST --data-binary @- http://localhost:8080/predictions/vllm/1.0/v1/completions -``` - -The output of the request looks like this: - -``` -{ - "id": "cmpl-cd29f1d8aa0b48aebcbff4b559a0c783", - "object": "text_completion", - "created": 1727211972, - "model": "meta-llama/Meta-Llama-3.1-70B-Instruct", - "choices": [ - { - "index": 0, - "text": " or, through inaction, allow a human being to come to harm.\nA", - "logprobs": null, - "finish_reason": "length", - "stop_reason": null, - "prompt_logprobs": null - } - ], - "usage": { - "prompt_tokens": 10, - "total_tokens": 26, - "completion_tokens": 16 - } -``` - -When streaming is False TorchServe will collect the full answer and send it in one go after the last token was created. If we flip the stream parameter we will receive piecewise data containing a single token in each message. - - -## Conclusion - -In this blog post, we explored the new, native integration of the vLLM inference engine into TorchServe. We demonstrated how to locally deploy a Llama 3.1 70B model using the ts.llm_launcher script and how to create a model archive for deployment on any TorchServe instance. Additionally, we discussed how to build and run the solution in a Docker container for deployment on Kubernetes or EKS. In future works, we plan to enable multi-node inference with vLLM and TorchServe, as well as offer a pre-built Docker image to simplify the deployment process. - -We would like to express our gratitude to Mark Saroufim and the vLLM team for their invaluable support in the lead-up to this blog post. \ No newline at end of file diff --git a/_posts/2024-11-01-cutlass-ping-pong-gemm-kernel.md b/_posts/2024-11-01-cutlass-ping-pong-gemm-kernel.md deleted file mode 100644 index 796ed4ac81e1..000000000000 --- a/_posts/2024-11-01-cutlass-ping-pong-gemm-kernel.md +++ /dev/null @@ -1,201 +0,0 @@ ---- -layout: blog_detail -title: "Deep Dive on CUTLASS Ping-Pong GEMM Kernel" -author: Less Wright, Adnan Hoque -excerpt: "In this post, we provide an overview, with relevant FP8 inference kernel benchmarking, of the CUTLASS Ping-Pong GEMM kernel." ---- - -![Figure 1. FP8 GEMM Throughput Comparison CUTLASS vs Triton](/assets/images/cutlass-ping-pong-gemm-kernel/fg1.png){:style="width:100%"} - -**Figure 1. FP8 GEMM Throughput Comparison CUTLASS vs Triton** - -## Summary - -In this post, we provide an overview, with relevant FP8 inference kernel benchmarking, of the CUTLASS Ping-Pong GEMM kernel. - -Ping-Pong is one of the fastest matmul (GEMM) kernel architectures available for the Hopper GPU architecture. Ping-Pong is a member of the Warp Group Specialized Persistent Kernels family, which includes both Cooperative and Ping-Pong variants. Relative to previous GPUs, Hopper’s substantial tensor core compute capability requires deep asynchronous software pipelining in order to achieve peak performance. - -The Ping-Pong and Cooperative kernels exemplify this paradigm, as the key design patterns are persistent kernels to amortize launch and prologue overhead, and ‘async everything’ with specialized warp groups with two consumers and one producer, to create a highly overlapped processing pipeline that is able to continuously supply data to the tensor cores. - -When the H100 (Hopper) GPU was launched, Nvidia billed it as the first truly asynchronous GPU. 
That statement highlights the need for H100 specific kernel architectures to also be asynchronous in order to fully maximize computational/GEMM throughput. - -The pingpong GEMM, introduced in CUTLASS 3.x, exemplifies this by moving all aspects of the kernel to a ‘fully asynchronous’ processing paradigm. In this blog, we’ll showcase the core features of the ping-pong kernel design as well as showcase its performance on inference workloads vs cublas and triton split-k kernels. - -## Ping-Pong Kernel Design - -Ping-Pong (or technically ‘sm90_gemm_tma_warpspecialized_pingpong’) operates with an asynchronous pipeline, leveraging warp specialization. Instead of the more classical homogeneous kernels, “warp groups” take on specialized roles. Note that a warp group consists of 4 warps of 32 threads each, or 128 total threads. - -On earlier architectures, latency was usually hidden by running multiple thread blocks per SM. However, with Hopper, the Tensor Core throughput is so high that it necessitates moving to deeper pipelines. These deeper pipelines then hinder running multiple thread blocks per SM. Thus, persistent thread blocks now issue collective main loops across multiple tiles and multiple warp groups. Thread block clusters are allocated based on the total SM count. - -For Ping-Pong, each warp group takes on a specialized role of either Data producer or Data consumer. - -The producer warp group focuses on producing data movement to fill the shared memory buffers (via TMA). Two other warp groups are dedicated consumers that process the math (MMA) portion with tensor cores, and then do any follow up work and write their results back to global memory (epilogue). - -Producer warp groups work with TMA (Tensor Memory Accelerator), and are deliberately kept as lightweight as possible. In fact, in Ping-Pong, they deliberately reduce their register resources to improve occupancy. Producers will reduce their max register counts by 40, vs consumers will increase their max register count by 232, an effect we can see in the CUTLASS source and corresponding SASS: - - -![source code](/assets/images/cutlass-ping-pong-gemm-kernel/fg2.png){:style="width:100%"} - - - -Unique to Ping-Pong, each consumer works on separate C output tiles. (For reference, the cooperative kernel is largely equivalent to Ping-Pong, but both consumer groups work on the same C output tile). Further, the two consumer warp groups then split their work between the main loop MMA and epilogue. - -This is shown in the below image: - - -![Figure 2: An overview of the Ping-Pong Kernel pipeline. Time moves left to right.](/assets/images/cutlass-ping-pong-gemm-kernel/fg3.png){:style="width:100%"} - - - -**Figure 2: An overview of the Ping-Pong Kernel pipeline. Time moves left to right.** - -By having two consumers, it means that one can be using the tensor cores for MMA while the other performs the epilogue, and then vice-versa. This maximizes the ‘continuous usage’ of the tensor cores on each SM, and is a key part of the reason for the max throughput. The tensor cores can be continuously fed data to realize their (near) maximum compute capability. (See the bottom section of the Fig 2 illustration above). - -Similar to how Producer threads stay focused only on data movements, MMA threads only issue MMA instructions in order to achieve peak issue rate. MMA threads must issue multiple MMA instructions and keep these in flight against TMA wait barriers. 
- -An excerpt of the kernel code is shown below to cement the specialization aspects: - - -``` -// Two types of warp group 'roles' -enum class WarpGroupRole { - Producer = 0, - Consumer0 = 1, - Consumer1 = 2 - }; - -//warp group role assignment -auto warp_group_role = WarpGroupRole(canonical_warp_group_idx()); -``` - - -## Data Movement with Producers and Tensor Memory Accelerator - -The producer warps focus exclusively on data movement - specifically they are kept as lightweight as possible and in fact give up some of their register space to the consumer warps (keeping only 40 registers, while consumers will get 232). Their main task is issuing TMA (tensor memory accelerator) commands to move data from Global memory to shared memory as soon as a shared memory buffer is signaled as being empty. - -To expand on TMA, or Tensor Memory Accelerator, TMA is a hardware component introduced with H100’s that asynchronously handles the transfer of memory from HBM (global memory) to shared memory. By having a dedicated hardware unit for memory movement, worker threads are freed to engage in other work rather than computing and managing data movement. TMA not only handles the movement of the data itself, but also calculates the required destination memory addresses, can apply any transforms (reductions, etc.) to the data and can handle layout transformations to deliver data to shared memory in a ‘swizzled’ pattern so that it’s ready for use without any bank conflicts. Finally, it can also multicast the same data if needed to other SM’s that are members of the same thread cluster. Once the data has been delivered, TMA will then signal the consumer of interest that the data is ready. - -## CUTLASS Asynchronous Pipeline Class - -This signaling between producers and consumers is coordinated via the new Asynchronous Pipeline Class which CUTLASS describes as follows: - -“Implementing a persistent GEMM algorithm calls for managing dozens of different kinds of asynchronously executing operations that synchronize using multiple barriers organized as a circular list. - -This complexity is too much for human programmers to manage by hand. - -As a result, we have developed [[CUTLASS Pipeline Async Class](https://l.workplace.com/l.php?u=https%3A%2F%2Fgithub.com%2FNVIDIA%2Fcutlass%2Fblob%2Fmain%2Finclude%2Fcutlass%2Fpipeline%2Fsm90_pipeline.hpp&h=AT0Qy69t9mn_9VGkJlf1TkC_yCVPAQbYzHtS9it0ZVxTxVasGZfb6u-VHKReULm29NsLhp3DtuRfN4BHnzczniArsCFe8Uzj7izIx646Otyl4lEwl9jUHDhTcUq87KfS919MkadFMjq5i4qtkbe7QbgZEMbhFi0ARgvz3-u7_X0Hf3kHwQ&__tn__=-UK-R&c[0]=AT2Wep-mQJcJ7w2cBPcqoNcO9gLYx7_Qg9TGIcfKPSoo8kGdDtl70vKog1VICaOX45DhNP-Eu6pUbUl9TxGeGLQHgzyXWuxAgDQrdlOhhiOC3QRDMckh2vCi8RADkSCainRbZ5JoF7CERyij7CrhsSskOfVqQ_fvN-lKG6W2_TkvMFLe8UbKNPkzSqjzfdo)]…” - -## Barriers and synchronization within the Ping-Pong async pipeline - -Producers must ‘acquire’ a given smem buffer via 'producer_acquire'. At the start, a pipeline is empty meaning that producer threads can immediately acquire the barrier and begin moving data. - - -``` -PipelineState mainloop_pipe_producer_state = cutlass::make_producer_start_state(); -``` - - -Once the data movement is complete, producers issue the ‘producer_commit’ method to signal the consumer threads that data is ready. \ -However, for Ping-Pong, this is actually a noop instruction since TMA based producer's barriers are automatically updated by the TMA when writes are completed. - -consumer_wait - wait for data from producer threads (blocking). 
- -consumer_release - signal waiting producer threads that they are finished consuming data from a given smem buffer. In other words, allow producers to go to work refilling this with new data. - -From there, synchronization will begin in earnest where the producers will wait via the blocking producer acquire until they can acquire a lock, at which point their data movement work will repeat. This continues until the work is finished. - -To provide a pseudo-code overview: - - -``` -//producer -While (work_tile_info.is_valid_tile) { - - collective_mainloop.dma() // fetch data with TMA - scheduler.advance_to_next_work() - Work_tile_info = scheduler.get_current_work() - -} - -// Consumer 1, Consumer 2 -While (work_tile_info.is_valid_tile()) { - - collective_mainloop.mma() - scheduler.advance_to_next_work() - Work_tile_info = scheduler.get_current_work() - -} -``` - - -And a visual birds-eye view putting it all together with the underlying hardware: - - - -![Figure 3: An overview of the full async pipeline for Ping-Pong](/assets/images/cutlass-ping-pong-gemm-kernel/fg4.png){:style="width:100%"} - - -**Figure 3: An overview of the full async pipeline for Ping-Pong** - -## Step-by-Step Breakdown of Ping-Pong Computation Loop - -Finally, a more detailed logical breakout of the Ping-Pong processing loop: - -A - Producer (DMA) warp group acquires a lock on a shared memory buffer. - -B - this allows it to kick off a tma cp_async.bulk request to the tma chip (via a single thread). - -C - TMA computes the actual shared memory addressing required, and moves the data to shared memory. As part of this, swizzling is performed in order to layout the data in smem for the fastest (no bank conflict) access. - -C1 - potentially, data can also be multicast to other SMs and/or it may need to wait for data from other tma multicast to complete the loading. (threadblock clusters now share shared memory across multiple SMs!) - -D - At this point, the barrier is updated to signal the arrival of the data to smem. - -E - The relevant consumer warpgroup now gets to work by issuing multiple wgmma.mma_async commands, which then read the data from smem to Tensor cores as part of it’s wgmma.mma_async matmul operation. - -F - the MMA accumulator values are written to register memory as the tiles are completed. - -G - the consumer warp group releases the barrier on the shared memory. - -H - the producer warp groups go to work issuing the next tma instruction to refill the now free smem buffer. - -I - The consumer warp group simultaneously applies any epilogue actions to the accumulator, and then move data from register to a different smem buffer. - -J - The consumer warp issues a cp_async command to move data from smem to global memory. - -The cycle repeats until the work is completed. Hopefully this provides you with a working understanding of the core concepts that power Ping-Pong’s impressive performance. - -## Microbenchmarks - -To showcase some of Ping-Pong’s performance, below are some comparison charts related to our work on designing fast inference kernels. 
- -First a general benchmarking of the three fastest kernels so far (lower is better): \ - -![Figure 4, above: Benchmark timings of FP8 GEMMs, lower is better (faster)](/assets/images/cutlass-ping-pong-gemm-kernel/fg5.png){:style="width:100%"} - -**Figure 4, above: Benchmark timings of FP8 GEMMs, lower is better (faster)** - -And translating that into a relative speedup chart of Ping-Pong vs cuBLAS and Triton: - - - -![Figure 5, above: Relative speedup of Ping-Pong vs the two closest kernels.](/assets/images/cutlass-ping-pong-gemm-kernel/fg6.png){:style="width:100%"} - -**Figure 5, above: Relative speedup of Ping-Pong vs the two closest kernels.** - -The full source code for the Ping-Pong kernel is here (619 lines of deeply templated CUTLASS code, or to paraphrase the famous turtle meme - "it's templates...all the way down! ): - -- [https://github.com/NVIDIA/cutlass/blob/main/include/cutlass/gemm/kernel/sm90_gemm_tma_warpspecialized_pingpong.hpp](https://github.com/NVIDIA/cutlass/blob/main/include/cutlass/gemm/kernel/sm90_gemm_tma_warpspecialized_pingpong.hpp) - -In addition, we have implemented PingPong as a CPP extension to make it easy to integrate into use with PyTorch here (along with a simple test script showing it’s usage): - -- [https://github.com/pytorch-labs/applied-ai/tree/main/kernels/cuda/cutlass_gemm](https://github.com/pytorch-labs/applied-ai/tree/main/kernels/cuda/cutlass_gemm) - -Finally, for continued learning, Nvidia has two GTC videos that dive into kernel design with CUTLASS: - -- [Developing Optimal CUDA Kernels on Hopper Tensor Cores \| GTC Digital Spring 2023 \| NVIDIA On-Demand](https://www.nvidia.com/en-us/on-demand/session/gtcspring23-s51413/) -- [CUTLASS: A Performant, Flexible, and Portable Way to Target Hopper Tensor Cores \| GTC 24 2024 \| NVIDIA On-Demand](https://www.nvidia.com/en-us/on-demand/session/gtc24-s61198/) - -## Future Work - -Data movement is usually the biggest impediment to top performance for any kernel, and thus having an optimal strategy understanding of TMA (Tensor Memory Accelerator) on Hopper is vital. We previously published work on [TMA usage in Triton](https://pytorch.org/blog/hopper-tma-unit/). Once features like warp specialization are enabled in Triton, we plan to do another deep dive on how Triton kernels like FP8 GEMM and FlashAttention can leverage kernel designs like Ping-Pong for acceleration on Hopper GPUs. \ No newline at end of file diff --git a/_posts/2024-11-18-llama-into-torchtune.md b/_posts/2024-11-18-llama-into-torchtune.md deleted file mode 100644 index abe9f290987e..000000000000 --- a/_posts/2024-11-18-llama-into-torchtune.md +++ /dev/null @@ -1,818 +0,0 @@ ---- -layout: blog_detail -title: "Distilling Llama3.1 8B into 1B in torchtune" -author: Linda Wang, Evan Smothers, Kartikay Khandelwal ---- - -In this blog, we present a case study on distilling a Llama 3.1 8B model into Llama 3.2 1B using torchtune’s knowledge distillation recipe. We demonstrate how knowledge distillation (KD) can be used in post-training to improve instruction-following task performance and showcase how users can leverage the recipe. - - -## What is Knowledge Distillation? - -[Knowledge Distillation](https://arxiv.org/pdf/1503.02531) is a widely used compression technique that transfers knowledge from a larger (teacher) model to a smaller (student) model. Larger models have more parameters and capacity for knowledge, however, this larger capacity is also more computationally expensive to deploy. 
Knowledge distillation can be used to compress the knowledge of a larger model into a smaller model. The idea is that performance of smaller models can be improved by learning from larger model’s outputs. - - -## How does Knowledge Distillation work? - -Knowledge is transferred from the teacher to student model by training on a transfer set where the student is trained to imitate the token-level probability distributions of the teacher. The assumption is that the teacher model distribution is similar to the transfer dataset. The diagram below is a simplified representation of how KD works. - -![Figure 1: Simplified representation of knowledge transfer from teacher to student model](/assets/images/llama-into-torchtune/fg1.png){:style="width:100%"} - - -**Figure 1: Simplified representation of knowledge transfer from teacher to student model** - -As knowledge distillation for LLMs is an active area of research, there are papers, such as [MiniLLM](https://arxiv.org/pdf/2306.08543), [DistiLLM](https://arxiv.org/pdf/2402.03898), [AKL](https://arxiv.org/pdf/2404.02657), and [Generalized KD](https://arxiv.org/pdf/2306.13649), investigating different loss approaches. In this case study, we focus on the standard cross-entropy (CE) loss with the forward [Kullback-Leibler (KL) divergence](https://en.wikipedia.org/wiki/Kullback%E2%80%93Leibler_divergence) loss as the baseline. Forward KL divergence aims to minimize the difference by forcing the student’s distribution to align with all of the teacher’s distributions. - - -## Why is Knowledge Distillation useful? - -The idea of knowledge distillation is that a smaller model can achieve better performance using a teacher model’s outputs as an additional signal than it could training from scratch or with supervised fine-tuning. For instance, [Llama 3.2 lightweight 1B and 3B text models](https://ai.meta.com/blog/llama-3-2-connect-2024-vision-edge-mobile-devices/) incorporated logits from Llama 3.1 8B and 70B to recover performance after pruning. In addition, for fine-tuning on instruction-following tasks, research in LLM distillation demonstrates that knowledge distillation methods can outperform supervised fine-tuning (SFT) alone. - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
| Model | Method | DollyEval (GPT-4 Eval) | Self-Inst (GPT-4 Eval) | S-NI (Rouge-L) |
|---|---|---|---|---|
| Llama 7B | SFT | 73.0 | 69.2 | 32.4 |
| Llama 7B | KD | 73.7 | 70.5 | 33.7 |
| Llama 7B | MiniLLM | 76.4 | 73.1 | 35.5 |
| Llama 1.1B | SFT | 22.1 | - | 27.8 |
| Llama 1.1B | KD | 22.2 | - | 28.1 |
| Llama 1.1B | AKL | 24.4 | - | 31.4 |
| OpenLlama 3B | SFT | 47.3 | 41.7 | 29.3 |
| OpenLlama 3B | KD | 44.9 | 42.1 | 27.9 |
| OpenLlama 3B | SeqKD | 48.1 | 46.0 | 29.1 |
| OpenLlama 3B | DistiLLM | 59.9 | 53.3 | 37.6 |
        - - -**Table 1: Comparison of knowledge distillation approaches to supervised fine-tuning** - -Below is a simplified example of how knowledge distillation differs from supervised fine-tuning. - - - - - - - - - - - -
**Supervised fine-tuning**

```
model = llama3_2_1b()
ce_loss = CrossEntropyLoss()
kd_loss = ForwardKLLoss()

tokens, labels = batch["tokens"], batch["labels"]
logits = model(tokens, ...)

loss = ce_loss(logits, labels)
loss.backward()
```

**Knowledge distillation**

```
model = llama3_2_1b()
teacher_model = llama3_1_8b()
ce_loss = CrossEntropyLoss()
kd_loss = ForwardKLLoss()

tokens, labels = batch["tokens"], batch["labels"]
logits = model(tokens, ...)
teacher_logits = teacher_model(tokens, ...)
loss = ce_loss(logits, labels) + kd_loss(logits, teacher_logits, labels)
loss.backward()
```
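For readers who want to see what the `kd_loss` term above expands to, below is a small self-contained sketch of a forward KL divergence loss combined with cross entropy through a `kd_ratio` weight (the ratio is studied in the ablations later in this post). This is an illustrative re-implementation written for this comparison, not torchtune's actual `ForwardKLLoss`, which may differ in details such as masking and chunking:

```
import torch
import torch.nn.functional as F

def forward_kl_loss(student_logits, teacher_logits, labels, ignore_index=-100):
    # Forward KL per token position: sum_v p_teacher(v) * (log p_teacher(v) - log p_student(v)),
    # averaged over non-ignored positions.
    teacher_log_probs = F.log_softmax(teacher_logits.float(), dim=-1)
    student_log_probs = F.log_softmax(student_logits.float(), dim=-1)
    kl = (teacher_log_probs.exp() * (teacher_log_probs - student_log_probs)).sum(-1)
    mask = (labels != ignore_index).float()
    return (kl * mask).sum() / mask.sum().clamp(min=1.0)

# Toy shapes: batch of 2 sequences, 8 tokens each, vocab of 128.
student_logits = torch.randn(2, 8, 128)
teacher_logits = torch.randn(2, 8, 128)
labels = torch.randint(0, 128, (2, 8))

ce = F.cross_entropy(student_logits.flatten(0, 1), labels.flatten())
kd = forward_kl_loss(student_logits, teacher_logits, labels)

kd_ratio = 0.5  # even weighting of class loss and KD loss, as in the recipe default
loss = (1 - kd_ratio) * ce + kd_ratio * kd
```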
        - - - -## KD recipe in torchtune - -With torchtune, we can easily apply knowledge distillation to Llama3, as well as other LLM model families, using torchtune’s [KD recipe](https://github.com/pytorch/torchtune/blob/4234b78b914af23384ce0348f564e2119d107a96/recipes/knowledge_distillation_single_device.py). The objective for this recipe is to fine-tune Llama3.2-1B on the Alpaca instruction-following dataset by distilling from Llama3.1-8B. This recipe focuses on post-training and assumes the teacher and student models have already been pre-trained. - -First, we have to download the model weights. To be consistent with other torchtune fine-tuning configs, we will use the instruction tuned models of Llama3.1-8B as teacher and Llama3.2-1B as student. - - -``` -tune download meta-llama/Meta-Llama-3.1-8B-Instruct --output-dir /tmp/Meta-Llama-3.1-8B-Instruct --ignore-patterns "original/consolidated.00.pth" --hf_token - -tune download meta-llama/Llama-3.2-1B-Instruct --output-dir /tmp/Llama-3.2-1B-Instruct --ignore-patterns "original/consolidated.00.pth" --hf_token -``` - - -In order for the teacher model distribution to be similar to the Alpaca dataset, we will fine-tune the teacher model using LoRA. Based on our experiments, shown in the next section, we’ve found that KD performs better when the teacher model is already fine-tuned on the target dataset. - - -``` -tune run lora_finetune_single_device --config llama3_1/8B_lora_single_device -``` - - -Finally, we can run the following command to distill the fine-tuned 8B model into the 1B model on a single GPU. For this case study, we used a single A100 80GB GPU. We also have a [distributed recipe](https://github.com/pytorch/torchtune/blob/09c2619f713e771b4159f7b83bac8971c7053bd3/recipes/knowledge_distillation_distributed.py) for running on multiple devices. - - -``` -tune run knowledge_distillation_single_device --config llama3_2/knowledge_distillation_single_device -``` - - - -## Ablation studies - -In this section, we demonstrate how changing configurations and hyperparameters can affect performance. By default, our configuration uses the LoRA fine-tuned 8B teacher model, downloaded 1B student model, learning rate of 3e-4 and KD loss ratio of 0.5. For this case study, we fine-tuned on the [alpaca_cleaned_dataset](https://pytorch.org/torchtune/main/generated/torchtune.datasets.alpaca_cleaned_dataset.html#torchtune.datasets.alpaca_cleaned_dataset) and evaluated the models on [truthfulqa_mc2](https://github.com/EleutherAI/lm-evaluation-harness/tree/feff1b55c57993c4d42c8f913a22eeec395cd690/lm_eval/tasks/truthfulqa), [hellaswag](https://github.com/EleutherAI/lm-evaluation-harness/tree/517aadc/lm_eval/tasks/hellaswagd) and [commonsense_qa](https://github.com/EleutherAI/lm-evaluation-harness/tree/b62b9bd/lm_eval/tasks/commonsense_qa) tasks through the EleutherAI [LM evaluation harness](https://github.com/EleutherAI/lm-evaluation-harness/tree/main). Let’s take a look at the effects of: - - - -1. Using a fine-tuned teacher model -2. Using a fine-tuned student model -3. Hyperparameter tuning of KD loss ratio and learning rate - - -### Using a fine-tuned teacher model - -The default settings in the config uses the fine-tuned teacher model. Now, let’s take a look at the effects of not fine-tuning the teacher model first. - -Taking a loss at the losses, using the baseline 8B as teacher results in a higher loss than using the fine-tuned teacher model. 
The KD loss also remains relatively constant, suggesting that the teacher model should have the same distributions as the transfer dataset. - -![Figure 2: (left to right) KD loss from forward KL divergence, class loss from cross entropy, total loss: even combination of KD and class loss.](/assets/images/llama-into-torchtune/fg2.png){:style="width:100%"} - - -**Figure 2: (left to right) KD loss from forward KL divergence, class loss from cross entropy, total loss: even combination of KD and class loss.** - -In our benchmarks, we can see that supervised fine-tuning of the 1B model achieves better accuracy than the baseline 1B model. By using the fine-tuned 8B teacher model, we see comparable results for truthfulqa and improvement for hellaswag and commonsense. When using the baseline 8B as a teacher, we see improvement across all metrics, but lower than the other configurations. - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
| Model | TruthfulQA (mc2) | hellaswag (acc) | hellaswag (acc_norm) | commonsense (acc) |
|---|---|---|---|---|
| Baseline Llama 3.1 8B | 0.5401 | 0.5911 | 0.7915 | 0.7707 |
| Fine-tuned Llama 3.1 8B using LoRA | 0.5475 | 0.6031 | 0.7951 | 0.7789 |
| Baseline Llama 3.2 1B | 0.4384 | 0.4517 | 0.6064 | 0.5536 |
| Fine-tuned Llama 3.2 1B using LoRA | 0.4492 | 0.4595 | 0.6132 | 0.5528 |
| KD using baseline 8B as teacher | 0.444 | 0.4576 | 0.6123 | 0.5561 |
| KD using fine-tuned 8B as teacher | 0.4481 | 0.4603 | 0.6157 | 0.5569 |
        - - -**Table 2: Comparison between using baseline and fine-tuned 8B as teacher model** - - -### Using a fine-tuned student model - -For these experiments, we look at the effects of KD when the student model is already fine-tuned. We analyze the effects using different combinations of baseline and fine-tuned 8B and 1B models. - -Based on the loss graphs, using a fine-tuned teacher model results in a lower loss irrespective of whether the student model is fine-tuned or not. It’s also interesting to note that the class loss starts to increase when using a fine-tuned student model. - -![Figure 3: Comparing losses of different teacher and student model initializations](/assets/images/llama-into-torchtune/fg3.png){:style="width:100%"} - - - -**Figure 3: Comparing losses of different teacher and student model initializations** - -Using the fine-tuned student model boosts accuracy even further for truthfulqa, but the accuracy drops for hellaswag and commonsense. Using a fine-tuned teacher model and baseline student model achieved the best results on hellaswag and commonsense dataset. Based on these findings, the best configuration will change depending on which evaluation dataset and metric you are optimizing for. - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
| Model | TruthfulQA (mc2) | hellaswag (acc) | hellaswag (acc_norm) | commonsense (acc) |
|---|---|---|---|---|
| Baseline Llama 3.1 8B | 0.5401 | 0.5911 | 0.7915 | 0.7707 |
| Fine-tuned Llama 3.1 8B using LoRA | 0.5475 | 0.6031 | 0.7951 | 0.7789 |
| Baseline Llama 3.2 1B | 0.4384 | 0.4517 | 0.6064 | 0.5536 |
| Fine-tuned Llama 3.2 1B using LoRA | 0.4492 | 0.4595 | 0.6132 | 0.5528 |
| KD using baseline 8B and baseline 1B | 0.444 | 0.4576 | 0.6123 | 0.5561 |
| KD using baseline 8B and fine-tuned 1B | 0.4508 | 0.448 | 0.6004 | 0.5274 |
| KD using fine-tuned 8B and baseline 1B | 0.4481 | 0.4603 | 0.6157 | 0.5569 |
| KD using fine-tuned 8B and fine-tuned 1B | 0.4713 | 0.4512 | 0.599 | 0.5233 |
        - - -**Table 3: Comparison using baseline and fine-tuned teacher and student models** - - -### Hyperparameter tuning: learning rate - -By default, the recipe has a learning rate of 3e-4. For these experiments, we changed the learning rate from as high as 1e-3 to as low as 1e-5. - -Based on the loss graphs, all learning rates result in similar losses except for 1e-5, which has a higher KD and class loss. - - -![Figure 4: Comparing losses of different learning rates](/assets/images/llama-into-torchtune/fg4.png){:style="width:100%"} - - - -**Figure 4: Comparing losses of different learning rates** - -Based on our benchmarks, the optimal learning rate changes depending on which metric and tasks you are optimizing for. - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
| Model | learning rate | TruthfulQA (mc2) | hellaswag (acc) | hellaswag (acc_norm) | commonsense (acc) |
|---|---|---|---|---|---|
| Baseline Llama 3.1 8B | - | 0.5401 | 0.5911 | 0.7915 | 0.7707 |
| Fine-tuned Llama 3.1 8B using LoRA | - | 0.5475 | 0.6031 | 0.7951 | 0.7789 |
| Baseline Llama 3.2 1B | - | 0.4384 | 0.4517 | 0.6064 | 0.5536 |
| Fine-tuned Llama 3.2 1B using LoRA | - | 0.4492 | 0.4595 | 0.6132 | 0.5528 |
| KD using fine-tuned 8B and baseline 1B | 3e-4 | 0.4481 | 0.4603 | 0.6157 | 0.5569 |
| KD using fine-tuned 8B and baseline 1B | 1e-3 | 0.4453 | 0.4535 | 0.6071 | 0.5258 |
| KD using fine-tuned 8B and baseline 1B | 1e-4 | 0.4489 | 0.4606 | 0.6156 | 0.5586 |
| KD using fine-tuned 8B and baseline 1B | 1e-5 | 0.4547 | 0.4548 | 0.6114 | 0.5487 |
        - - -**Table 4: Effects of tuning learning rate** - - -### Hyperparameter tuning: KD ratio - -By default, the KD ratio is set to 0.5, which gives even weighting to both the class and KD loss. In these experiments, we look at the effects of different KD ratios, where 0 only uses the class loss and 1 only uses the KD loss. - -Overall, the benchmark results show that for these tasks and metrics, higher KD ratios perform slightly better. - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
| Model | kd_ratio (lr=3e-4) | TruthfulQA (mc2) | hellaswag (acc) | hellaswag (acc_norm) | commonsense (acc) |
|---|---|---|---|---|---|
| Baseline Llama 3.1 8B | - | 0.5401 | 0.5911 | 0.7915 | 0.7707 |
| Fine-tuned Llama 3.1 8B using LoRA | - | 0.5475 | 0.6031 | 0.7951 | 0.7789 |
| Baseline Llama 3.2 1B | - | 0.4384 | 0.4517 | 0.6064 | 0.5536 |
| Fine-tuned Llama 3.2 1B using LoRA | - | 0.4492 | 0.4595 | 0.6132 | 0.5528 |
| KD using fine-tuned 8B and baseline 1B | 0.25 | 0.4485 | 0.4595 | 0.6155 | 0.5602 |
| KD using fine-tuned 8B and baseline 1B | 0.5 | 0.4481 | 0.4603 | 0.6157 | 0.5569 |
| KD using fine-tuned 8B and baseline 1B | 0.75 | 0.4543 | 0.463 | 0.6189 | 0.5643 |
| KD using fine-tuned 8B and baseline 1B | 1.0 | 0.4537 | 0.4641 | 0.6177 | 0.5717 |
        - - -**Table 5: Effects of tuning KD ratio** - - -## Looking Ahead - -In this blog, we presented a study on how to distill LLMs through torchtune using the forward KL divergence loss on Llama 3.1 8B and Llama 3.2 1B logits. There are many directions for future exploration to further improve performance and offer more flexibility in distillation methods. - - - -* **Expand KD loss offerings**. The KD recipe uses the forward KL divergence loss. However, aligning the student distribution to the whole teacher distribution may not be effective, as mentioned above. There are multiple papers, such as [MiniLLM](https://arxiv.org/pdf/2306.08543), [DistiLLM](https://arxiv.org/pdf/2402.03898), and [Generalized KD](https://arxiv.org/pdf/2306.13649), that introduce new KD losses and policies to address the limitation and have shown to outperform the standard use of cross entropy with forward KL divergence loss. For instance, MiniLLM uses reverse KL divergence to prevent the student from over-estimating low-probability regions of the teacher. DistiLLM introduces a skewed KL loss and an adaptive training policy. -* **Enable cross-tokenizer distillation**. The current recipe requires the teacher and student model to use the same tokenizer, which limits the ability to distill across different LLM families. There has been research on cross-tokenizer approaches (e.g. [Universal Logit Distillation](https://arxiv.org/pdf/2402.12030)) that we could explore. -* **Expand distillation to multimodal LLMs and encoder models**. A natural extension of the KD recipe is to expand to multimodal LLMs. Similar to deploying more efficient LLMs, there’s also a need to deploy smaller and more efficient multimodal LLMs. In addition, there has been work in demonstrating LLMs as encoder models (e.g. [LLM2Vec](https://arxiv.org/pdf/2404.05961)). Distillation from LLMs as encoders to smaller encoder models may also be a promising direction to explore. \ No newline at end of file diff --git a/_posts/2024-11-21-rebellions.md b/_posts/2024-11-21-rebellions.md deleted file mode 100644 index b8941a003d83..000000000000 --- a/_posts/2024-11-21-rebellions.md +++ /dev/null @@ -1,37 +0,0 @@ ---- -layout: blog_detail -title: "Rebellions Joins the PyTorch Foundation as a General Member" -excerpt: "The PyTorch Foundation, a neutral home for the deep learning community to collaborate on the open source PyTorch framework and ecosystem, is announcing today that Rebellions has joined as a general member." ---- - -![Rebellions logo](/assets/images/rebellions-logo.svg){:style="max-width:350px;width:100%;float:right;margin: 20px;"} - -The PyTorch Foundation, a neutral home for the deep learning community to collaborate on the open source PyTorch framework and ecosystem, is announcing today that Rebellions has joined as a general member. - -Rebellions is a South Korea-based semiconductor company specializing in the design and development of AI chips for data centers and edge devices. Their innovative hardware and software solutions aim to accelerate generative AI and machine learning workloads, focusing on high energy efficiency and performance. The company successfully launched and deployed its AI chip ‘ATOM’ targeting data centers in 2023 and is developing its next-generation AI accelerator ‘REBEL’. - -"We’re thrilled to welcome Rebellions as a new general member of the PyTorch Foundation,” said Matt White, Executive Director of the PyTorch Foundation. 
“Rebellions brings a unique perspective to the PyTorch ecosystem with their focus on advancing the integration of NPU architectures for AI acceleration with PyTorch. Their expertise will play a vital role in ensuring PyTorch continues to evolve as a versatile framework, accommodating the diverse needs of modern AI workloads. We look forward to collaborating with Rebellions to drive innovation and strengthen the PyTorch ecosystem for developers worldwide.” - -Rebellions has introduced native support for PyTorch 2.0 in their RBLN SDK. This integration includes compatibility with torch.compile, a pivotal feature of PyTorch 2.0 that enhances model performance. Through this development, Rebellions has empowered developers to seamlessly harness the full potential of their AI accelerator lineup within the environment. - -Rebellions is also deeply committed to advancing the PyTorch ecosystem through collaborative innovation starting in Korea. The company has established a Special Interest Group (SIG) focusing on Pytorch Core within the PyTorch Korea community and is actively working with volunteers recruited through MODULABS, an open research institute, to integrate native support for the deep learning framework into their Neural Processing Unit (NPU). - -In addition, Rebellions is collaborating with academic institutions, such as Yonsei University, Hanyang University, University of Science & Technology (UST) and national agencies, such as the Electronics and Telecommunications Research Institute (ETRI), to offer undergraduate and graduate courses on PyTorch and enable them to leverage Pytorch as their research platform. - -These initiatives highlight Rebellions' dedication to optimizing the PyTorch experience for developers and researchers alike, while also fostering education and innovation in the field. - -“By integrating our hardware innovations with PyTorch, we’re building Native NPU support to accelerate diverse AI workloads.” said Hong-seok Kim, the Chief Software Architect at Rebellions. “We're excited to contribute to the PyTorch community by community-driven initiatives and partnerships, advancing NPU architecture support for next-generation AI solutions. Together with the PyTorch community, we aim to pioneer new possibilities in AI acceleration and empower developers worldwide with efficient computing solutions.” - -To learn more about how your organization can be a part of the PyTorch Foundation, visit our [website](https://pytorch.org/join). - -## About Rebellions - -Rebellions is a South Korea-based semiconductor company specializing in the design and development of AI chips for data centers and edge devices. Their innovative hardware and software solutions aim to accelerate generative AI and machine learning workloads, focusing on high energy efficiency and performance. The company successfully launched and deployed its AI chip ‘ATOM’ targeting data centers in 2023 and is developing its next-generation AI accelerator ‘REBEL’ incorporating a scalable chiplet architecture and high-bandwidth memory. - -## About PyTorch Foundation - -The PyTorch Foundation is a neutral home for the deep learning community to collaborate on the open source PyTorch framework and ecosystem. The PyTorch Foundation is supported by its members and leading contributors to the PyTorch open source project. The Foundation leverages resources provided by members and contributors to enable community discussions and collaboration. 
- -## About The Linux Foundation - -The Linux Foundation is the world’s leading home for collaboration on open source software, hardware, standards, and data. Linux Foundation projects are critical to the world’s infrastructure including Linux, Kubernetes, Node.js, ONAP, PyTorch, RISC-V, SPDX, OpenChain, and more. The Linux Foundation focuses on leveraging best practices and addressing the needs of contributors, users, and solution providers to create sustainable models for open collaboration. For more information, please visit us at linuxfoundation.org. \ No newline at end of file diff --git a/_posts/2024-11-25-training-using-float8-fsdp2.md b/_posts/2024-11-25-training-using-float8-fsdp2.md deleted file mode 100644 index ea81f892f0c7..000000000000 --- a/_posts/2024-11-25-training-using-float8-fsdp2.md +++ /dev/null @@ -1,243 +0,0 @@ ---- -layout: blog_detail -title: "Supercharging Training using float8 and FSDP2" -author: "IBM and Meta" -excerpt: "In this blog, we will demonstrate how we achieve up to 50% throughput speedup while achieving loss and evaluation benchmark parity in training over FSDP1 bf16 training" ---- - -**IBM**: Tuan Hoang Trong, Alexei Karve, Yan Koyfman, Linsong Chu, Divya Kumari, Shweta Salaria, Robert Walkup, Praneet Adusumilli, Nirmit Desai, Raghu Ganti, Seetharami Seelam -**Meta**: Less Wright, Wei Feng, Vasiliy Kuznetsov, Driss Guesseous - -In this blog, we will demonstrate how we achieve up to **50% throughput speedup** while achieving loss and evaluation benchmark parity in training over [FSDP1 bf16 training](https://pytorch.org/blog/maximizing-training-throughput/). We achieve this speedup by leveraging FSDP2, DTensor, and torch.compile with torchao’s float8 via linear layer updates (compute), and float8 all_gathers for weight communication. We showcase these improvements across a spectrum of Meta LLaMa model architecture sizes, ranging from small 1.8B model size all the way to 405B model size, making training faster than ever. - -We demonstrate these improvements using the Meta Llama3 architecture, and then perform model quality studies at two scales: 100B tokens at 8B model size, and 50B tokens at 70B model size, which provide an exact comparison of float8 and bf16 training loss curves. We demonstrate that the loss curves result in identical loss convergence across these model training runs compared to the `bf16` counterpart. Further, we train a 3B model to 1T tokens using the FineWeb-edu dataset and run standard evaluation benchmarks to ensure that the model quality is intact and comparable to a `bf16` run. - -At IBM Research, we plan to adopt these capabilities for our data ablations to improve the number of experiments we can perform in a given GPU budget. Longer term, we will follow up with a larger scale model run to demonstrate the end-to-end feasibility of `float8` training. - - -## What is Float8? - -The `float8` format for training models was introduced by NVIDIA, ARM, and Intel in a [2022 paper](https://arxiv.org/abs/2209.05433) which demonstrated the feasibility of training using lower precision float8, without sacrificing model quality. With the introduction of newer GPUs like the NVIDIA Hopper series, FP8 training became feasible with the potential of more than 2x improvement in training throughput due to native float8 tensor core support. 
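To make the per-tensor (tensorwise) scaling granularity used in this post concrete, the sketch below casts a BF16 tensor to float8 (e4m3) with a single scale. It is a hand-written illustration that assumes a recent PyTorch with float8 dtypes, not the torchao implementation used in our runs:

```
import torch

x_bf16 = torch.randn(4096, 4096, dtype=torch.bfloat16)

# One scale for the whole tensor, chosen so that the largest magnitude maps
# close to the float8 e4m3 maximum representable value (448).
amax = x_bf16.abs().max().to(torch.float32)
scale = torch.finfo(torch.float8_e4m3fn).max / torch.clamp(amax, min=1e-12)

# Quantize: apply the scale in float32, then cast down to float8.
x_fp8 = (x_bf16.to(torch.float32) * scale).to(torch.float8_e4m3fn)

# The scale travels with the tensor; after a float8 matmul, the result is
# rescaled by the input scales back to higher precision.
```

The same idea applies to the weights, whose float8 representation and scale are what FSDP2 all-gathers across GPUs for weight communication.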
There are a few challenges to realize this promise: \ -(i) Enable the core model operations like `matmul` and `attention` in `float8`, \ -(ii) Enable `float8` training in a distributed framework, and \ -(iii) Enable weight communication between GPUs in `float8`. \ -While the `float8` `matmul` was enabled by NVIDIA libraries, the latter two were provided in recent updates to `FSDP2` and `torchao`. - -In this blog, we are using [torchtitan](https://github.com/pytorch/torchtitan) as the entry point for training, IBM’s deterministic data loader, the float8 linear layer implementation from [torchao](https://www.google.com/url?q=https://github.com/pytorch/ao/tree/main/torchao/float8&sa=D&source=docs&ust=1730743084184771&usg=AOvVaw21FdkNG452P-nDIO-hIwcW), and the float8 all gather from the latest PyTorch nightlies in conjunction with FSDP2. For this training, we are using the float8 per tensor (tensorwise) scaling granularity rather than rowwise. We leverage torch.compile to ensure that we get maximum performance gains. We are computing attention in bf16 using SDPA and are currently working on moving this to float8 as well. - - -## Experiments - -We perform various experiments to demonstrate the benefits of float8 training. The first is to ensure that model quality is not sacrificed. To verify this, we train an 8B model and 70B model for a few thousand steps and compare the loss curves between both the float8 and bf16 training run. Our experiments are performed on three different H100 clusters with 128, 256, and 512 H100 GPU configurations in very different environments to demonstrate reproducibility. The first cluster is customized on [Grand Teton](https://engineering.fb.com/2024/03/12/data-center-engineering/building-metas-genai-infrastructure/) in Meta with 400Gbps custom interconnect, the second is an IBM research cluster with 3.2Tbps Infiniband interconnect, and the third is an IBM Cloud cluster with 3.2Tbps RoCE interconnect for GPU-to-GPU communication. - - -First, we plot the loss curve comparisons for both these models in the below figures to demonstrate loss parity for a few thousand steps. - - -![Figure 1: (a) 8B model loss parity for 2k steps, (b) 70B loss parity for 1k steps](/assets/images/training-using-float8-fsdp2/fg1.png){:style="width:100%"} - - - -![Figure 1: (a) 8B model loss parity for 2k steps, (b) 70B loss parity for 1k steps](/assets/images/training-using-float8-fsdp2/fg2.png){:style="width:100%"} - - -*Figure 1: (a) 8B model loss parity for 2k steps, (b) 70B loss parity for 1k steps* - -We observe that across these different models and in different environments, we obtain loss parity for the small scale of tokens. Next, we characterize the throughput gains for four different model sizes ranging from 1.8B to 405B. We explored the best batch size and activation checkpointing schemes for both the float8 and bf16 training runs to determine the tokens/sec/GPU (wps) metric and report the performance gain. For the 405B model, we leveraged `DTensor` for tensor parallel training with FSDP2. We use a sequence length of 8K for all our measurements. - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
| Model size | wps (bf16) | wps (float8) | Percent gain |
|---|---|---|---|
| 1.8B | 29K | 35K | 18% |
| 8B | 8K | 10K | 28% |
| 70B | 956 | 1430 | 50% |
| 405B (TP4) | 149 | 227 | 52% |
        - - -*Table 1: Performance gains over bf16 (both bf16 and float8 use torch.compile)* - -We observe from Table 1 that the gains for larger models (70B and 405B) reach up to 50%, the smaller models see gains between roughly 20 and 30%. In further experiments, we observed that the addition of `float8` `all_gather` enables a boost of ~5% beyond the compute itself in `float8`, which is inline with the observations in this [blog](https://aws.amazon.com/blogs/machine-learning/efficient-pre-training-of-llama-3-like-model-architectures-using-torchtitan-on-amazon-sagemaker/). - -Second, to demonstrate the effectiveness of an FP8 model, we trained a 3B model following the Llama3 architecture for 1T tokens using the FineWeb-edu dataset from Hugging Face. We performed evaluations using the `lm-eval-harness` framework and present a small portion of these results in the below table. We observe that the `bf16` performance is marginally better than the `float8` scores (about one percent). While some scores are significantly better with `bf16` (e.g., MMLU is 3 pts higher), we expect these gaps to vanish when the right hyper parameters are chosen and across larger scale training runs (e.g., the `bf16` run had half the batch size and it is well known that smaller batch size runs can improve evaluation scores). - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
| Benchmark | Score (float8) | Score (bf16) |
|---|---|---|
| MMLU (5-shot) | 0.26 | 0.29 |
| ARC-e | 0.73 | 0.73 |
| ARC-c | 0.43 | 0.46 |
| Hellaswag | 0.65 | 0.67 |
| sciq | 0.89 | 0.88 |
| OpenBook QA | 0.43 | 0.43 |
| PIQA | 0.76 | 0.76 |
| Winogrande | 0.60 | 0.65 |
| Average | 0.59 | 0.60 |
        - - -*Table 2: Benchmark scores for float8 trained model running in FP16 for eval (at 1T tokens of FineWeb pre-training).* - -Finally, we scale our experiments to 512 H100 GPUs on the IBM Cloud cluster. We were able to recreate the results and speedups that we observed even at 512 GPU scale. We summarize these results only for the large models in the below table (70B and 405B). - - - - - - - - - - - - - - - - - - - - - -
| Model size | wps (bf16) | wps (float8) | Percent gain |
|---|---|---|---|
| 70B | 960 | 1448 | 51% |
| 405B (TP4) | 152 | 217 | 43% |
        - - -*Table 3: Performance gains over bf16 (both bf16 and float8 use torch.compile) for 512 GPU scale* - - -## Future work - -We are also working on evaluating other forms of parallelism such as Context Parallelism. We plan to evaluate all of these features to demonstrate the composability and ability to make choices for training large scale models. - - -## Acknowledgements - -We thank Davis Wertheimer from IBM Research for enabling the data loader for torchtitan runs enabling us to replay data in the same order across multiple runs. We also thank IBM Cloud for enabling us with early test access to the H100 cluster. \ No newline at end of file diff --git a/_posts/2024-12-02-hadacore.md b/_posts/2024-12-02-hadacore.md deleted file mode 100644 index 71e50803c09f..000000000000 --- a/_posts/2024-12-02-hadacore.md +++ /dev/null @@ -1,207 +0,0 @@ ---- -layout: blog_detail -title: "HadaCore: Tensor Core Accelerated Hadamard Transform Kernel" -author: "IBM and Meta" -excerpt: "Quantization is a method for improving model inference speeds by compressing model weights and performing (faster) computation in lower precision data types. However, quantization can result in accuracy loss due to the presence of outliers." ---- - -**IBM**: Krish Agarwal, Rishi Astra, Adnan Hoque, Mudhakar Srivatsa, Raghu Ganti -**Meta**: Less Wright, Sijia Chen - -Quantization is a method for improving model inference speeds by compressing model weights and performing (faster) computation in lower precision data types. However, quantization can result in accuracy loss due to the presence of outliers. Recent works like [QuaRot](https://arxiv.org/abs/2404.00456), [SpinQuant](https://arxiv.org/abs/2405.16406), and [FlashAttention-3](https://arxiv.org/pdf/2407.08608) introduce methods to increase the numerical accuracy of INT4, INT8 and FP8 quantization in LLMs. These methods rely on [Hadamard Transforms](https://en.wikipedia.org/wiki/Hadamard_transform). In this blog, we present HadaCore, a Hadamard Transform CUDA kernel that achieves state-of-the-art performance on NVIDIA A100 and H100 GPUs. Our kernel achieves speedups of **1.1–1.4x** and **1.0–1.3x**, with a peak gain of **3.5x** and **3.6x** respectively, over Dao AI Lab’s [Fast Hadamard Transform Kernel](https://github.com/Dao-AILab/fast-hadamard-transform). We leverage a hardware-aware work decomposition that benefits from Tensor Core acceleration while maintaining quantization error reduction. - - - -![Figure 1: Speedup of HadaCore vs Dao AI Hadamard CUDA kernel. A peak gain of 3.46x on the A100 is achieved using 128 rotation by 8.4M elements.](/assets/images/hadacore/fg1.png){:style="width:100%"} - -*Figure 1: Speedup of HadaCore vs Dao AI Hadamard CUDA kernel. A peak gain of 3.46x on the A100 is achieved using 128 rotation by 8.4M elements.* - -The [HadaCore Kernel is publicly available](https://github.com/pytorch-labs/applied-ai/tree/main/kernels/cuda/inference/hadamard_transform). - -## Background - -[QuaRot](https://arxiv.org/abs/2404.00456) and [SpinQuant](https://arxiv.org/abs/2405.16406) both propose methods to increase the numerical accuracy of INT4 and INT8 quantization in LLMs. Both methods rotate model activations since rotations are statistically likely to reduce the magnitude of outliers, as it “distributes” extreme values among other (less extreme) dimensions, and rotation is also an easily invertible operation using the inverse of the rotation matrix. 
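To see both properties on a toy example, the snippet below builds an orthonormal Walsh-Hadamard matrix, spreads a single outlier across all dimensions, and inverts the rotation exactly with the transpose. This is plain PyTorch for illustration only, not the HadaCore kernel:

```
import math
import torch

def hadamard(n: int) -> torch.Tensor:
    # Sylvester construction; n must be a power of two (e.g. head_dim = 128).
    H = torch.ones(1, 1)
    while H.shape[0] < n:
        H = torch.cat([torch.cat([H, H], 1), torch.cat([H, -H], 1)], 0)
    return H / math.sqrt(n)  # orthonormal: H @ H.T == identity

H = hadamard(128)

x = torch.zeros(128)
x[0] = 100.0                                       # a single large outlier
x_rot = x @ H                                      # outlier energy spread across all 128 dims
print(x_rot.abs().max())                           # ~8.8 instead of 100
print(torch.allclose(x_rot @ H.T, x, atol=1e-4))   # the rotation is exactly invertible
```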
These methods can also improve FP8 inference accuracy, such as in [FlashAttention-3](https://arxiv.org/pdf/2407.08608). - - -![Figure 2. Transformer block showing online (red) and offline rotations (blue) in QuaRot](/assets/images/hadacore/fg2.png){:style="width:100%"} - - -*Figure 2. Transformer block showing online (red) and offline rotations (blue) in QuaRot* - -Applying these rotation matrices introduces model runtime overhead due to the online operations shown in Figure 2. These rotations can be applied through matrix multiplication, but the added overhead would diminish the benefits from quantization. Therefore, QuaRot and SpinQuant opt to use Walsh-Hadamard matrices, a special type of rotation matrix that can be applied faster than matrix multiplication using the [Fast Walsh-Hadamard Transform](https://en.wikipedia.org/wiki/Fast_Walsh%E2%80%93Hadamard_transform) algorithm. HadaCore is an optimized implementation of this algorithm for NVIDIA GPUs that support Tensor Cores. - -## Tensor Core Accelerated Hadamard Transform - -HadaCore leverages [NVIDIA Tensor Cores](https://www.nvidia.com/en-us/data-center/tensor-cores/), which are specialized compute units on NVIDIA GPUs optimized for matrix multiplication. To achieve this, our kernel performs a hardware-aware work decomposition of the Fast Walsh-Hadamard algorithm. This work decomposition ensures that we can utilize the [MMA PTX instructions](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html?highlight=mma#multiply-and-accumulate-instruction-mma) that execute on the Tensor Core chip. HadaCore applies a 16×16 Hadamard transform to chunks of the input data. The computation can then be offloaded to the FP16 Tensor Core with usage of the [mma.m16n8k16](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html?highlight=mma#matrix-fragments-for-mma-m16n8k16-with-floating-point-type) instruction. The warp-level parallelism for HadaCore is shown below. - - -![Figure 3: HadaCore Parallelization, 1x256 vectors (rows) being rotated by a size 256 Hadamard.](/assets/images/hadacore/fg3.png){:style="width:100%"} - - -*Figure 3: HadaCore Parallelization, 1x256 vectors (rows) being rotated by a size 256 Hadamard.* - -We process fragments of 256 elements in parallel using warp-level Tensor Core operations to achieve up to a 256-size Hadamard transform. For further sizes, we shuffle data between warps and repeat. - -## Microbenchmarks - -We benchmark HadaCore against the[ Dao AI Lab Hadamard Kernel](https://github.com/Dao-AILab) on both NVIDIA H100 and A100 GPUs across varying Hadamard and input tensor sizes. 
- -![Figure 4: HadaCore Kernel Speedup on NVIDIA A100 over Dao AI Lab Fast Hadamard Kernel](/assets/images/hadacore/fg4.png){:style="width:100%"} - - - -*Figure 4: HadaCore Kernel Speedup on NVIDIA A100 over Dao AI Lab Fast Hadamard Kernel* - - -![Color coded Speedup Table for NVIDIA A100, Green = Speedup over Baseline](/assets/images/hadacore/fg5.png){:style="width:100%; margin-top: 35px;"} - - -*Color coded Speedup Table for NVIDIA A100, Green = Speedup over Baseline* - - -![Figure 5: HadaCore Kernel Speedup on NVIDIA H100 over Dao AI Lab Fast Hadamard Kernel](/assets/images/hadacore/fg6.png){:style="width:100%; margin-top: 35px;"} - - -*Figure 5: HadaCore Kernel Speedup on NVIDIA H100 over Dao AI Lab Fast Hadamard Kernel* - - -![Color coded Speedup Table for NVIDIA H100, Green = Speedup over Baseline](/assets/images/hadacore/fg7.png){:style="width:100%; margin-top: 35px;"} - - -*Color coded Speedup Table for NVIDIA H100, Green = Speedup over Baseline* - -We showcase our speedup as the input tensor size (labeled element count) in our charts increase. Element count is the number of elements in the target matrix we are rotating. For example, in multi-head attention: - - -The queries (Q), keys (K) and values (V) tensors are 4D tensors of size: - -`(batch_size, seq_len, n_heads, head_dim)` - -A Hadamard matrix of size `head_dim` is applied to these activation tensors, so we refer to this as using a Hadamard size of `head_dim` with an element count of: - -`batch_size*seq_len*n_heads*head_dim.` - -Common element counts for query rotations in an attention block: - - - - - - - - - - - - - - - - - - -
| Model \ Tokens | Prefill | Decoding |
|---|---|---|
| Llama-2 70b | 33,554,432 elements, 128 Hadamard size (1 batch * 64 heads * 4096 tokens * 128 dimensional embeddings per head per token) | 8,192 elements, 128 Hadamard size (1 batch * 64 heads * 1 token * 128 dimensional embeddings per head per token) |
| Llama-3 8b | 33,554,432 elements, 128 Hadamard size (1 batch * 32 heads * 8192 tokens * 128 dimensional embeddings per head per token) | 4,096 elements, 128 Hadamard size (1 batch * 32 heads * 1 token * 128 dimensional embeddings per head per token) |
        - - -HadaCore achieves **1.1–1.4x** speedup on A100 and **1.0–1.3x** speedup on H100 over Dao AI Lab’s Fast Hadamard kernel, with a peak gain of **3.5x and 3.6x**, respectively. For smaller sizes on H100, HadaCore’s gain decreases. For future work, we plan to incorporate usage of Hopper specific features like TMA and WGMMA for improved H100 performance. - -## MMLU Benchmarks - -We evaluated MMLU scores on a [Llama 3.1-8B](https://huggingface.co/meta-llama/Llama-3.1-8B) inference workload where the FlashAttention computation was performed in FP8. Newer generation [NVIDIA Hopper GPUs ](https://www.nvidia.com/en-us/data-center/technologies/hopper-architecture/)come equipped with FP8 Tensor Cores that deliver substantial compute gain over FP16. - -Our results show the benefit of using HadaCore for accuracy preservation when combined with optimizations such as FP8 FlashAttention. - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
| Format | Method | Llama3.1-8B Avg. 5-Shot MMLU Accuracy |
|---|---|---|
| Q, K, V: FP16; FlashAttention: FP16 | N/A | 65.38 |
| Q, K, V: FP16; FlashAttention: FP8 | No Hadamard | 64.40 |
| Q, K, V: FP8; FlashAttention: FP8 | HadaCore | 65.09 |
| Q, K, V: FP8; FlashAttention: FP8 | Dao AI Fast Hadamard Kernel | 65.45 |
        - - -*Table 1: MMLU scores for Llama3.1 8B with FP16 baseline and FP8 attention using Hadamard transforms, comparing an implementation with explicit Hadamard matrix multiplications vs. HadaCore (**higher is better**)* - -From the above MMLU scores, we note that for Llama3.1-8B inference with FP8 attention, HadaCore improves the quantization error introduced from computing attention in a lower precision. - -## Conclusion - -We showcased our speedups achieved by moving the Fast-Walsh Hadamard algorithm into a CUDA kernel that leverages Tensor Core acceleration and achieves a peak speedup of **3.5x** and **3.6x** over the Dao AI Fast-Hadamard kernel on NVIDIA A100 and H100, respectively. - -Further, we showed on the MMLU benchmark that rotating with HadaCore maintains similar quantization error reduction to the Fast-Hadamard kernel, while providing computational acceleration. - -## Future Work - -We plan to implement a Triton version of our kernel and experiment with more advanced techniques such as kernel fusion to support fused Hadamard transform and quantization. Further, we plan to extend our kernel to support BF16 Tensor Core compute. \ No newline at end of file diff --git a/_posts/2024-12-06-accelerating-gemms-triton.md b/_posts/2024-12-06-accelerating-gemms-triton.md deleted file mode 100644 index fc3f725f6e4c..000000000000 --- a/_posts/2024-12-06-accelerating-gemms-triton.md +++ /dev/null @@ -1,128 +0,0 @@ ---- -layout: blog_detail -title: "Accelerating 2D Dynamic Block Quantized Float8 GEMMs in Triton" -author: "Meta: Less Wright, IBM: Adnan Hoque" ---- - -2D block quantization for Float8 (FP8) holds the promise of improving the accuracy of Float8 quantization while also accelerating GEMM’s for both inference and training. In this blog, we showcase advances using Triton for the two main phases involved in doing block quantized Float8 GEMMs. - -For the incoming quantization of A and B tensors from high precision (BFloat16) to Float8, we showcase GridQuant which leverages a mini-grid stride loop style of processing with nearly **2x** speedups (99.31%) over a current 2D block quantization kernel. - -For the Float8 GEMM, we showcase 3 new developments for Triton - Warp Specialization, TMA and a persistent kernel to effectively create a cooperative style kernel (an alternative to the [Ping-Pong schedule](https://pytorch.org/blog/cutlass-ping-pong-gemm-kernel/)). As a result, we achieve ~**1.2x** speedup over our best-performing SplitK kernel from last year. - - -![Figure 1: A comparison of the 2D quantization speedup over a current baseline, across a range of sizes.](/assets/images/accelerating-gemms-triton/fg1.png){:style="width:100%"} - - -**Figure 1:** A comparison of the 2D quantization speedup over a current baseline, across a range of sizes. ***(lower-is-better)*** - -## Why 2D Blockwise Quantization for FP8? - -Generally speaking, the accuracy of fp8 quantization improves as we move from tensor-wise scaling, to row-wise scaling, to 2D block-wise, and then finally to column-wise scaling. This is because features for a given token are stored in each column, and thus each column in that tensor is more similarly scaled. - -To minimize the number of outliers of a given numerical set, we want to find commonality so that numbers are being scaled in a similar fashion. For transformers, this means column based quantization could be optimal…however, columnar memory access is massively inefficient due to the data being laid out in memory in a rowwise contiguous manner. 
Thus columnwise loading would require memory access involving large strides in memory to pull isolated values, contrary to the core tenets of efficient memory access. - -However, 2D is the next best option as it includes some aspects of columnar while being more memory efficient to pull since we can vectorize these loads with 2D vectorization. Therefore, we want to find ways to improve the speed for 2D block quantization which is why we developed the GridQuant kernel. - -For the quantization process, we need to 2D block quantize both the higher precision BF16 incoming tensors (A = input activations, B = weights) and then proceed to do the Float8 matmul using the quantized tensors and their 2D block scaling values, and return an output C tensor in BF16. - -## How does GridQuant improve 2D block quantization efficiency? - -The GridQuant kernel has several improvements over the initial baseline quantization implementation which was a standard tile based implementation. The GridQuant kernel has two full passes through the entire input tensor and works as follows: - - -## Phase 1 - Determine the max abs value for each 256x256 sub block from the incoming high precision tensor. - -1 - We divide the BF16 tensor into 256 x 256 sub blocks. This quantization size is configurable, but 256x256 is the default as it provides a blend of quantization precision and processing efficiency. - -2 - Each 256x256 sub-block is subdivided into 64 sub-blocks arranged in an 8x8 pattern, with each sub-block processing a 32x32 element block. A single warp (32 threads) handles the computation for all elements within its assigned 32x32 block. - -3 - We declare a 32x32 max_vals array in shared memory. This will store the current max val for each position i,j as the 2d vector block moves across the entire 256x256 sub_block. - -This is an important improvement because it means we can do vectorized, rather than scalar, updates to the max vals scoring system and allows for much more efficient updates. - - -![Figure 2: The Fractionalized layout of an incoming tensor - a grid of 256x256 is created across the tensor, and within each 256x256 block, it is further refined into 32x32 sub blocks. A 32x32 max_vals is created for each 256x256 block.](/assets/images/accelerating-gemms-triton/fg2.png){:style="width:100%"} - - -**Figure 2:** The Fractionalized layout of an incoming tensor - a grid of 256x256 is created across the tensor, and within each 256x256 block, it is further refined into 32x32 sub blocks. A 32x32 max_vals is created for each 256x256 block. - -4 - Each warp processes a 32x32 chunk and because we are using 4 warps, we ensure the Triton compiler can pipeline the memory loads for the next 32x32 chunk with the actual processing of absmax calculations for the current chunk. This ensures that the warp scheduler is able to toggle warps loading data with those processing and keep the SM continuously busy. - -5 - The 32x32 2D vector block processing is moved across and through the entire 256x256 subblock in a grid stride looping fashion, with each warp updating the shared memory 32x32 max_vals against its current 32x32 sub-block. Thus max_vals[i,j] holds the latest max value as each sub block is processed. - -After completing the 256x256 block grid stride loop, the maxvals matrix is then itself reduced to find the absolute single max value for that entire 256 block. - -This gives us our final scaling factor value for this 2D 256 x 256 block. 
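Setting aside the warp-level tiling and the shared-memory max_vals bookkeeping, the scaling factors produced by Phase 1 are functionally what this short PyTorch reference computes. It is an illustration for clarity, not the Triton kernel, and assumes a recent PyTorch with float8 dtypes:

```
import torch

BLOCK = 256
x = torch.randn(4096, 8192, dtype=torch.bfloat16)  # e.g. a BF16 activation tensor

# View the tensor as a grid of 256x256 blocks and reduce each block to its abs-max.
rows, cols = x.shape
blocks = x.view(rows // BLOCK, BLOCK, cols // BLOCK, BLOCK)
block_absmax = blocks.abs().amax(dim=(1, 3)).float()   # shape: (rows/256, cols/256)

# One scaling factor per 256x256 block, mapping its abs-max onto the float8 e4m3 range.
scales = torch.finfo(torch.float8_e4m3fn).max / block_absmax.clamp(min=1e-12)
```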
- -## Phase 2 - Quantize the 256x256 block values to Float8, by using the single max value scaling factor found during Phase 1. - -Next, we make a second pass through the entire 256x256 block to rescale all the numbers using this max value found in phase 1 to convert them to the float 8 format. - -Because we know we need to do 2 complete passes, for the loads during the phase 1 portion we instruct the triton compiler to keep these values in cache at higher priority (evict policy = last). - -This means that during the second pass, we can get a high hit rate from the L2 cache which provides much faster memory access than going all the way to HBM. - -With the 2D block quantization processing complete when all 256 x256 blocks are processed, we can return the new Float8 quantized tensor along with it’s scaling factor matrix, which we’ll use in the next phase of the GEMM processing. This input quantization is repeated for the second input tensor as well, meaning we end up with A_Float 8, A_scaling_matrix, and B_Float8 and B_scaling matrix. - - -## GridQuant - GEMM Kernel - -The GridQuant-GEMM kernel takes in the four outputs from the quantization above for processing. Our high-performance GEMM kernel features several new Triton developments to achieve SOTA performance for matrix shape profiles relevant in LLM inference during the decoding phase. - -These new features are commonly found in Hopper optimized kernels like [FlashAttention-3](https://arxiv.org/abs/2407.08608) and [Machete](https://neuralmagic.com/blog/introducing-machete-a-mixed-input-gemm-kernel-optimized-for-nvidia-hopper-gpus/), built using [CUTLASS 3.x](https://github.com/NVIDIA/cutlass). Here, we discuss these methods and showcase the performance benefits that can be achieved leveraging them in Triton. - -## Tensor Memory Accelerator (TMA) - -The TMA unit on NVIDIA Hopper GPUs, is a dedicated hardware unit for load/store operations that act on multidimensional tensors commonly found in AI workloads. This has several important benefits. - -Transferring data from global and shared memory can occur without involving other resources on GPU SMs, freeing up registers and CUDA Cores. Further, when used in warp-specialized kernels, light-weight TMA operations can be assigned to a producer warp allowing for a high degree of overlap of memory transfers and computation. - -For more details on how TMA is used in Triton see our [previous blog](https://pytorch.org/blog/hopper-tma-unit/). - -## Warp-Specialization (Cooperative Persistent Kernel Design) - - -Warp Specialization is a technique to leverage pipeline parallelism on GPUs. This experimental feature enables the expression of specialized threads through a [tl.async_task API](https://github.com/facebookexperimental/triton/tree/ws), allowing the user to specify how operations in a Triton program should be “split” amongst warps. The cooperative Triton kernel performs different types of computation and loads that each take place on their own dedicated hardware. Having dedicated hardware for each of these specialized tasks makes it possible to realize parallelism efficiently for operations that have no data dependency. - - -![Figure 3. 
Logical view of dedicated HW units in NVIDIA H100 SM](/assets/images/accelerating-gemms-triton/fg3.png){:style="width:100%; max-width:400px; display: block; margin-left:auto; margin-right:auto;"} - - - -**Figure 3.** Logical view of dedicated HW units in NVIDIA H100 SM - -The operations in our kernel that create the pipeline are: - -A - Load per-block scale from GMEM into SMEM (cp.async engine) - -B - Load activation (A) and Weight (B) tiles from GMEM into SMEM (TMA) - -C - Matrix-Multiplication of A tile and B tile = C tile (Tensor Core) - -D - Scale C tile with per-block scale from A and per-block scale from B (CUDA core) - -These steps can be assigned to “tasks” which are carried out by specialized warp groups in a threadblock. The cooperative strategy has three warp groups. A producer warp group that is responsible for feeding the compute units and 2 consumer warp groups that perform the computation. The two consumer warp groups each work on half of the same output tile. - -![Figure 4. Warp-Specialized Persistent Cooperative kernel](/assets/images/accelerating-gemms-triton/fg4.png){:style="width:100%"} - - -**Figure 4.** Warp-Specialized Persistent Cooperative kernel (source: [NVIDIA](https://drive.google.com/file/d/18sthk6IUOKbdtFphpm_jZNXoJenbWR8m/view)) - -This is different from the ping-pong schedule we discussed in our [previous blog](https://pytorch.org/blog/cutlass-ping-pong-gemm-kernel/), where each consumer warp group works on *different* output tiles. We note that the Tensor Core ops are not overlapped with the epilogue computation. Decreased utilization of the Tensor Core pipeline during the epilogue phase of the computation will reduce register pressure for the consumer warp group compared to ping-pong which always keeps the Tensor Core busy, thus allowing for larger tile sizes. - -Lastly, our kernel is designed to be persistent when the grid size exceeds the number of available compute units on H100 GPUs (132). Persistent kernels remain active on the GPU for an extended period and compute multiple output tiles during its lifetime. Our kernel leverages TMA async shared to global memory stores, while continuing to do work on the next output tile as opposed to incurring the cost of scheduling multiple threadblocks. - -## Microbenchmarks - -![Figure 5: Latency comparison (us) of Gridquant-GEMM vs our best performing SplitK kernel for small batch regime and Llama3 8192 N,K sizing.](/assets/images/accelerating-gemms-triton/fg5.png){:style="width:100%"} - - - -**Figure 5:** Latency comparison (us) of Gridquant-GEMM vs our best performing SplitK kernel for small batch regime and Llama3 8192 N,K sizing. ***(lower-is-better)*** - -The Warp-Specialized Triton kernel achieves SOTA performance at the above small-M and square matrix shapes, achieving a nearly **1.2x** speedup over the SplitK Triton kernel, which was the previous best performing strategy for Triton GEMMs in this low arithmetic intensity regime. For future work, we plan to tune our kernel performance for the medium-to-large M regime and non-square matrices. - -## Conclusion and Future Work - -Future work includes benchmarking gridquant on end to end workflows. In addition, we plan to run more extensive benchmarks on non-square (rectangular) matrices as well as medium-to-large M sizes. Finally, we plan to explore ping-pong style warp-specialization in Triton versus the current cooperative implementation. 
\ No newline at end of file diff --git a/_posts/2024-12-09-vllm-joins-pytorch.md b/_posts/2024-12-09-vllm-joins-pytorch.md deleted file mode 100644 index f92eec987081..000000000000 --- a/_posts/2024-12-09-vllm-joins-pytorch.md +++ /dev/null @@ -1,77 +0,0 @@ ---- -layout: blog_detail -title: "vLLM Joins PyTorch Ecosystem: Easy, Fast, and Cheap LLM Serving for Everyone" -author: vLLM Team -hidden: true ---- - -![vllm logo](/assets/images/vllm.png){:style="width:100%;display: block;max-width:400px; margin-left:auto; margin-right:auto;"} - -We’re thrilled to announce that the [vLLM project](https://github.com/vllm-project/vllm) has become a PyTorch ecosystem project, and joined the PyTorch ecosystem family! - -For more information on what it means to be a PyTorch ecosystem project, see the [PyTorch Ecosystem Tools page](https://pytorch.org/ecosystem/). - -Running large language models (LLMs) is both resource-intensive and complex, especially as these models scale to hundreds of billions of parameters. That’s where vLLM comes in — a high-throughput, memory-efficient inference and serving engine designed for LLMs. - -Originally built around the innovative [PagedAttention algorithm](https://arxiv.org/abs/2309.06180), vLLM has grown into a comprehensive, state-of-the-art inference engine. A thriving community is also continuously adding new features and optimizations to vLLM, including pipeline parallelism, chunked prefill, speculative decoding, and disaggregated serving. - -Since its release, vLLM has garnered significant attention, achieving over 31,000 GitHub stars—a testament to its popularity and thriving community. This milestone marks an exciting chapter for vLLM as we continue to empower developers and researchers with cutting-edge tools for efficient and scalable AI deployment. Welcome to the next era of LLM inference! - -vLLM has always had a strong connection with the PyTorch project. It is deeply integrated into PyTorch, leveraging it as a unified interface to support a wide array of hardware backends. These include NVIDIA GPUs, AMD GPUs, Google Cloud TPUs, Intel GPUs, Intel CPUs, Intel Gaudi HPUs, and AWS Neuron, among others. This tight coupling with PyTorch ensures seamless compatibility and performance optimization across diverse hardware platforms. - -Do you know you can experience the power of vLLM right from your phone? During this year’s Amazon Prime Day, vLLM played a crucial role in [delivering lightning-fast responses to millions of users](https://aws.amazon.com/cn/blogs/machine-learning/scaling-rufus-the-amazon-generative-ai-powered-conversational-shopping-assistant-with-over-80000-aws-inferentia-and-aws-trainium-chips-for-prime-day/). Across three regions, over 80,000 Trainium and Inferentia chips powered an average of 3 million tokens per minute, all while maintaining a P99 latency of less than 1 second for the first response. That means when customers opened the Amazon app and chatted with Rufus, they were seamlessly interacting with vLLM in action! - -vLLM also collaborates tightly with leading model vendors to ensure support for popular models. This includes tight integration with Meta LLAMA, Mistral, QWen, and DeepSeek models, plus many others. One particularly memorable milestone was the [release of LLAMA 3.1 (405B)](https://ai.meta.com/blog/meta-llama-3-1/). As the launching partner, vLLM was the first to enable running this very large model, showcasing vLLM’s capability to handle the most complex and resource-intensive language models. 
- -To install vLLM, simply run: - - -``` -pip install vllm -``` - - -vLLM is designed for both researchers and production-grade serving. - -To run vLLM as an OpenAI API compatible server, just use the Huggingface model ID: - - -``` -vllm serve meta-llama/Llama-3.1-8B -``` - - -To run vLLM as a simple function: - - -``` -from vllm import LLM, SamplingParams - -# Sample prompts. -prompts = [ - "Hello, my name is", - "The president of the United States is", - "The capital of France is", - "The future of AI is", -] -# Create a sampling params object. -sampling_params = SamplingParams(temperature=0.8, top_p=0.95) - -# Create an LLM. -llm = LLM(model="meta-llama/Llama-3.1-8B") -# Generate texts from the prompts. The output is a list of RequestOutput objects -# that contain the prompt, generated text, and other information. -outputs = llm.generate(prompts, sampling_params) -# Print the outputs. -for output in outputs: - prompt = output.prompt - generated_text = output.outputs[0].text - print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") -``` - - -Open-source innovation is part of the vLLM’s DNA. Born out of a Berkeley academic project, it follows the legacy of other pioneering open-source initiatives such as BSD, which revolutionized operating systems in the 1980s. Other innovations from the same organization include [Apache Spark](https://github.com/apache/spark) and [Ray](https://github.com/ray-project/ray), now the standard for big data and AI systems. In the Gen AI era, vLLM serves as a platform dedicated to democratizing AI inference. - -The vLLM team remains steadfast in its mission to keep the project “of the community, by the community, and for the community.” Collaboration and inclusivity lie at the heart of everything we do. - -If you have collaboration requests or inquiries, feel free to reach out at [vllm-questions@lists.berkeley.edu](mailto:vllm-questions@lists.berkeley.edu). To join the active and growing vLLM community, explore our [GitHub repository](https://github.com/vllm-project/vllm) or connect with us on the [vLLM Slack](https://slack.vllm.ai). Together, we can push the boundaries of AI innovation and make it accessible to all. \ No newline at end of file diff --git a/_posts/2024-12-11-torchcodec.md b/_posts/2024-12-11-torchcodec.md deleted file mode 100644 index 8bdfb3813603..000000000000 --- a/_posts/2024-12-11-torchcodec.md +++ /dev/null @@ -1,115 +0,0 @@ ---- -layout: blog_detail -title: "torchcodec: Easy and Efficient Video Decoding for PyTorch" ---- - -We are pleased to officially announce [torchcodec](https://github.com/pytorch/torchcodec), a library for decoding videos into PyTorch tensors. It is fast, accurate, and easy to use. When running PyTorch models on videos, torchcodec is our recommended way to turn those videos into data your model can use. - -Highlights of torchcodec include: - - - -* An intuitive decoding API that treats a video file as a Python sequence of frames. We support both index-based and presentation-time-based frame retrieval. -* An emphasis on accuracy: we ensure you get the frames you requested, even if your video has variable frame rates. -* A rich sampling API that makes it easy and efficient to retrieve batches of frames. -* Best-in-class CPU decoding performance. -* CUDA accelerated decoding that enables high throughput when decoding many videos at once. -* Support for all codecs available in your installed version of FFmpeg. -* Simple binary installs for Linux and Mac. 
- - -## Easy to Use - -A simple, intuitive API was one of our main design principles. We start with simple decoding and extracting specific frames of a video: - -``` -from torchcodec.decoders import VideoDecoder -from torch import Tensor - -decoder = VideoDecoder("my_video.mp4") - -# Index based frame retrieval. -first_ten_frames: Tensor = decoder[10:] -last_ten_frames: Tensor = decoder[-10:] - -# Multi-frame retrieval, index and time based. -frames = decoder.get_frames_at(indices=[10, 0, 15]) -frames = decoder.get_frames_played_at(seconds=[0.2, 3, 4.5]) -``` - -All decoded frames are already PyTorch tensors, ready to be fed into models for training. - -Of course, more common in ML training pipelines is sampling multiple clips from videos. A clip is just a sequence of frames in presentation order—but the frames are often *not* consecutive. Our sampling API makes this easy: - -``` -from torchcodec.samplers import clips_at_regular_timestamps - -clips = clips_at_regular_timestamps( - decoder, - seconds_between_clip_starts=10, - num_frames_per_clip=5, - seconds_between_frames=0.2, -) -``` - -The above call yields a batch of clips where each clip starts 10 seconds apart, each clip has 5 frames, and those frames are 0.2 seconds apart. See our tutorials on [decoding](https://pytorch.org/torchcodec/0.1.0/generated_examples/basic_example.html) and [sampling](https://pytorch.org/torchcodec/0.1.0/generated_examples/sampling.html) for more! - - -## Fast Performance - -Performance was our other main design principle. Decoding videos for ML training has different performance requirements than decoding videos for playback. A typical ML video training pipeline will process many different videos (sometimes in the millions!), but only sample a small number of frames (dozens to hundreds) from each video. - -For this reason, we’ve paid particular attention to our decoder’s performance when seeking multiple times in a video, decoding a small number of frames after each seek. We present experiments with the following four scenarios: - - - -1. Decoding and transforming frames from multiple videos at once, inspired by what we have seen in data loading for large-scale training pipelines: - - a. Ten threads decode batches of 50 videos in parallel. - b. For each video, decode 10 frames at evenly spaced times. - c. For each frame, resize it to a 256x256 resolution. - -2. Decoding 10 frames at random locations in a single video. -3. Decoding 10 frames at evenly spaced times of a single video. -4. Decoding the first 100 frames of a single video. - -We compare the following video decoders: - - - -* [Torchaudio](https://pytorch.org/audio/stable/index.html), CPU decoding only. -* [Torchvision](https://pytorch.org/vision/stable/index.html), using the [video_reader](https://pytorch.org/vision/stable/index.html#torchvision.set_video_backend) backend which is CPU decoding only. -* Torchcodec, GPU decoding with CUDA. -* Torchcodec, CPU decoding only. - -Using the following three videos: - - - -1. A synthetically generated video using FFmpeg’s [mandelbrot](https://ffmpeg.org/ffmpeg-filters.html#mandelbrot) generation pattern. The video is 10 seconds long, 60 frames per second and 1920x1080. -2. Same as above, except the video is 120 seconds long. -3. A [promotional video from NASA](https://download.pytorch.org/torchaudio/tutorial-assets/stream-api/NASAs_Most_Scientifically_Complex_Space_Observatory_Requires_Precision-MP4_small.mp4) that is 206 seconds long, 29.7 frames per second and 960x540. 
- -The [experimental script](https://github.com/pytorch/torchcodec/blob/b0de66677bac322e628f04ec90ddeeb0304c6abb/benchmarks/decoders/generate_readme_data.py) is in our repo. Our experiments run on a Linux system with an Intel processor that has 22 available cores and an NVIDIA GPU. For CPU decoding, all libraries were instructed to automatically determine the best number of threads to use. - - -![Benchmark chart](/assets/images/benchmark_readme_chart.png){:style="width:100%"} - -From our experiments, we draw several conclusions: - - - -* Torchcodec is consistently the best-performing library for the primary use case we designed it for: decoding many videos at once as a part of a training data loading pipeline. In particular, high-resolution videos see great gains with CUDA where decoding and transforms both happen on the GPU. -* Torchcodec is competitive on the CPU with seek-heavy use cases such as random and uniform sampling. Currently, torchcodec’s performance is better with shorter videos that have a smaller file size. This performance is due to torchcodec’s emphasis on seek-accuracy, which involves an initial linear scan. -* Torchcodec is not as competitive when there is no seeking; that is, opening a video file and decoding from the beginning. This is again due to our emphasis on seek-accuracy and the initial linear scan. - -Implementing an [approximate seeking mode](https://github.com/pytorch/torchcodec/issues/427) in torchcodec should resolve these performance gaps, and it’s our highest priority feature for video decoding. - - -## What’s Next? - -As the name implies, the long-term future for torchcodec is more than just video decoding. Our next big feature is audio support—both decoding audio streams from video, and from audio-only media. In the long term, we want torchcodec to be the media decoding library for PyTorch. That means as we implement functionality in torchcodec, we will deprecate and eventually remove complementary features from torchaudio and torchvision. - -We also have video decoding improvements lined up, such as the previously mentioned approximate seeking mode for those who are willing to sacrifice accuracy for performance. - -Most importantly, we’re looking for feedback from the community! We’re most interested in working on features that the community finds valuable. Come [share your needs](https://github.com/pytorch/torchcodec/issues) and influence our future direction! \ No newline at end of file diff --git a/_posts/2024-12-18-doctr-joins-pytorch-ecosystem.md b/_posts/2024-12-18-doctr-joins-pytorch-ecosystem.md deleted file mode 100644 index af3bfd1efab9..000000000000 --- a/_posts/2024-12-18-doctr-joins-pytorch-ecosystem.md +++ /dev/null @@ -1,170 +0,0 @@ ---- -layout: blog_detail -title: "docTR joins PyTorch Ecosystem: From Pixels to Data, Building a Recognition Pipeline with PyTorch and docTR" -author: Olivier Dulcy & Sebastian Olivera, Mindee -hidden: true ---- - -![docTR logo](/assets/images/doctr-joins-pytorch-ecosystem/fg1.png){:style="width:100%;display: block;max-width:400px; margin-left:auto; margin-right:auto;"} - -We’re thrilled to announce that the docTR project has been integrated into the PyTorch ecosystem! This integration ensures that docTR aligns with PyTorch’s standards and practices, giving developers a reliable, community-backed solution for powerful OCR workflows. 
- -**For more information on what it means to be a PyTorch ecosystem project, see the [PyTorch Ecosystem Tools page](https://pytorch.org/ecosystem/).** - - -## About docTR - -docTR is an Apache 2.0 project developed and distributed by [Mindee](https://www.mindee.com/) to help developers integrate OCR capabilities into applications with no prior knowledge required. - -To quickly and efficiently extract text information, docTR uses a two-stage approach: - - - -* First, it performs text **detection** to localize words. -* Then, it conducts text **recognition** to identify all characters in a word. - -**Detection** and **recognition** are performed by state-of-the-art models written in PyTorch. To learn more about this approach, you can refer [to the docTR documentation](https://mindee.github.io/doctr/using_doctr/using_models.html). - -docTR enhances the user experience in PyTorch projects by providing high-performance OCR capabilities right out of the box. Its specially designed models require minimal to no fine-tuning for common use cases, allowing developers to quickly integrate advanced document analysis features. - - -## Local installation - -docTR requires Python >= 3.10 and supports Windows, Mac and Linux. Please refer to our [README](https://github.com/mindee/doctr?tab=readme-ov-file#installation) for necessary dependencies for MacBook with the M1 chip. - -``` -pip3 install -U pip -pip3 install "python-doctr[torch,viz]" -``` - -This will install docTR along with the latest version of PyTorch. - - -``` -Note: docTR also provides docker images for an easy deployment, such as a part of Kubernetes cluster. -``` - - - -## Text recognition - -Now, let’s try docTR’s OCR recognition on this sample: - - -![OCR sample](/assets/images/doctr-joins-pytorch-ecosystem/fg2.jpg){:style="width:100%;display: block;max-width:300px; margin-left:auto; margin-right:auto;"} - - -The OCR recognition model expects an image with only one word on it and will output the predicted word with a confidence score. You can use the following snippet to test OCR capabilities from docTR: - -``` -python -from doctr.io import DocumentFile -from doctr.models import recognition_predictor - -doc = DocumentFile.from_images("/path/to/image") - -# Load the OCR model -# This will download pre-trained models hosted by Mindee -model = recognition_predictor(pretrained=True) - -result = model(doc) -print(result) -``` - -Here, the most important line of code is `model = recognition_predictor(pretrained=True)`. This will load a default text recognition model, `crnn_vgg16_bn`, but you can select other models through the `arch` parameter. You can check out the [available architectures](https://mindee.github.io/doctr/using_doctr/using_models.html). - -When run on the sample, the recognition predictor retrieves the following data: `[('MAGAZINE', 0.9872216582298279)]` - - -``` -Note: using the DocumentFile object docTR provides an easy way to manipulate PDF or Images. -``` - - - -## Text detection - -The last example was a crop on a single word. Now, what about an image with several words on it, like this one? - - -![photo of magazines](/assets/images/doctr-joins-pytorch-ecosystem/fg3.jpg){:style="width:100%;display: block;max-width:300px; margin-left:auto; margin-right:auto;"} - - -A text detection model is used before the text recognition to output a segmentation map representing the location of the text. Following that, the text recognition is applied on every detected patch. 
- -Below is a snippet to run only the detection part: - -``` -from doctr.io import DocumentFile -from doctr.models import detection_predictor -from matplotlib import pyplot as plt -from doctr.utils.geometry import detach_scores -from doctr.utils.visualization import draw_boxes - -doc = DocumentFile.from_images("path/to/my/file") -model = detection_predictor(pretrained=True) - -result = model(doc) - -draw_boxes(detach_scores([result[0]["words"]])[0][0], doc[0]) -plt.axis('off') -plt.show() -``` - -Running it on the full sample yields the following: - - -![photo of magazines](/assets/images/doctr-joins-pytorch-ecosystem/fg4.png){:style="width:100%;display: block;max-width:300px; margin-left:auto; margin-right:auto;"} - - -Similarly to the text recognition, `detection_predictor` will load a default model (`fast_base` here). You can also load another one by providing it through the `arch` parameter. - - -## The full implementation - -Now, let’s plug both components into the same pipeline. - -Conveniently, docTR provides a wrapper that does exactly that for us: - -``` -from doctr.io import DocumentFile -from doctr.models import ocr_predictor - -doc = DocumentFile.from_images("/path/to/image") - -model = ocr_predictor(pretrained=True, assume_straight_pages=False) - -result = model(doc) -result.show() -``` - -![photo of magazines](/assets/images/doctr-joins-pytorch-ecosystem/fg5.png){:style="width:100%;display: block;max-width:300px; margin-left:auto; margin-right:auto;"} - -The last line should display a matplotlib window which shows the detected patches. Hovering the mouse over them will display their contents. - -You can also do more with this output, such as reconstituting a synthetic document like so: - -``` -import matplotlib.pyplot as plt - -synthetic_pages = result.synthesize() -plt.imshow(synthetic_pages[0]) -plt.axis('off') -plt.show() -``` - -![black text on white](/assets/images/doctr-joins-pytorch-ecosystem/fg6.png){:style="width:100%;display: block;max-width:300px; margin-left:auto; margin-right:auto;"} - - -The pipeline is highly customizable, where you can modify the detection or recognition model behaviors by passing arguments to the `ocr_predictor`. Please refer to the [documentation](https://mindee.github.io/doctr/using_doctr/using_models.html) to learn more about it. - - -## Conclusion - -We’re excited to welcome docTR into the PyTorch Ecosystem, where it seamlessly integrates with PyTorch pipelines to deliver state-of-the-art OCR capabilities right out of the box. - -By empowering developers to quickly extract text from images or PDFs using familiar tooling, docTR simplifies complex document analysis tasks and enhances the overall PyTorch experience. - -We invite you to explore the [docTR GitHub repository](https://github.com/mindee/doctr), join the [docTR community on Slack](https://slack.mindee.com/), and reach out at contact@mindee.com for inquiries or collaboration opportunities. - -Together, we can continue to push the boundaries of document understanding and develop even more powerful, accessible tools for everyone in the PyTorch community. 
\ No newline at end of file diff --git a/_posts/2024-12-20-improve-rag-performance.md b/_posts/2024-12-20-improve-rag-performance.md deleted file mode 100644 index 2ed3cb1ee5e5..000000000000 --- a/_posts/2024-12-20-improve-rag-performance.md +++ /dev/null @@ -1,456 +0,0 @@ ---- -layout: blog_detail -title: "Improve RAG performance with torch.compile on AWS Graviton Processors" -author: Sunita Nadampalli(AWS), Ankith Gunapal(Meta), Hamid Shojanazeri(Meta) ---- - -Large Language Models (LLMs) are trained on vast volumes of data and use billions of parameters to support tasks like answering questions, translating languages, and completing sentences. There are a few challenges when working with LLMs such as domain knowledge gaps, factuality issues, and hallucination, which affect their reliability especially for the fields that require high levels of accuracy, such as healthcare, law, or engineering. Retrieval Augmented Generation (RAG) provides a solution to mitigate some of these issues by augmenting LLMs with a specific domain or an organization's internal knowledge base, without the need to retrain the model. - -The RAG knowledge source is generally business specific databases which are typically deployed on general-purpose CPU infrastructure. So, deploying RAG on general-purpose CPU infrastructure alongside related business services is both efficient and cost-effective. With this motivation, we evaluated RAG deployment on [AWS Graviton](https://aws.amazon.com/ec2/graviton/) based Amazon EC2 instances which have been delivering up to [40% price-performance advantage](https://aws.amazon.com/ec2/graviton/getting-started/) compared to comparable instances for the majority of the workloads including databases, in-memory caches, big data analytics, media codecs, gaming servers, and machine learning inference. - -In the past we published a few blog posts on how PyTorch was optimized for AWS Graviton processors to accelerate ML Inference performance for both eager mode ([blog](https://pytorch.org/blog/optimized-pytorch-w-graviton/)) and `torch.compile` mode ([blog](https://pytorch.org/blog/accelerated-pytorch-inference/)). In this blog we cover how to deploy a typical RAG workload using PyTorch and `torch.compile`, how we improved its performance up to **1.7x** for embedding model and **1.3x** for RAG query on AWS Graviton3-based m7g.xlarge instance compared to the default PyTorch “eager mode”, and finally a few recommendations that you can apply for your RAG use cases. - - -## How to Optimize RAG? - -Without RAG, the LLM takes the user input and creates a response based on information it was trained on (what it already knows). With RAG, an information retrieval component is introduced that utilizes the user input to first pull information from a new data source. The user query and the relevant information are both given to the LLM. The LLM uses the new knowledge and its training data to create better responses. The following diagram shows the conceptual flow of using RAG with LLMs. - - - -![Image 1: Conceptual flow of using RAG with LLMs](/assets/images/improve-rag-performance.png){:style="width:100%"} - - -**Image 1**: Conceptual flow of using RAG with LLMs - -Source:[ https://aws.amazon.com/what-is/retrieval-augmented-generation/](https://aws.amazon.com/what-is/retrieval-augmented-generation/) - - -### Embedding model - -At the core of RAG is an embedding model that takes the text data and converts into a vector representation. These vectors are then stored in a vector db. 
When a user makes a query, the query is first converted to a vector and the RAG does a similarity search on the vector db. Hence, the first step in optimizing RAG performance is optimizing an embedding model’s inference performance. We used the AWS Graviton3-based m7g.xlarge instance and the HuggingFace sentence-transformer embedding model for the optimization work. Here is a sample script for profiling the HuggingFace sentence-transformer embedding model inference with PyTorch Eager mode. - - -``` -import torch -from torch.profiler import profile, ProfilerActivity, record_function -from transformers import AutoModel, AutoTokenizer - -model_name = "sentence-transformers/all-mpnet-base-v2" -input_text = ["This is an example sentence", "Each sentence is converted"] - -model = AutoModel.from_pretrained(model_name) -tokenizer = AutoTokenizer.from_pretrained(model_name) - -encoded_input = tokenizer( - input_text, padding=True, truncation=True, return_tensors="pt" -) - -warmup, actual = 100, 100 -model.eval() - -with torch.no_grad(): - # warmup - for i in range(warmup): - embeddings = model(**encoded_input) - - with profile(activities=[ProfilerActivity.CPU]) as prof: - with record_function("model_inference"): - for i in range(actual): - embeddings = model(**encoded_input) - print(prof.key_averages().table(sort_by="self_cpu_time_total")) -``` - - - -#### Eager mode - -Since PyTorch eager mode was already optimized on AWS Graviton processors with the following runtime environment settings, we included them in the baseline and measured the following performance. Please refer to [Optimized PyTorch 2.0 Inference with AWS Graviton processors](https://pytorch.org/blog/optimized-pytorch-w-graviton/) for more details on how we optimized the PyTorch eager mode on AWS Graviton processors. - - -``` -# Enable the fast math GEMM kernels, to accelerate fp32 inference with bfloat16 gemm -export DNNL_DEFAULT_FPMATH_MODE=BF16 - -# Enable Linux Transparent Huge Page (THP) allocations, -# to reduce the tensor memory allocation latency -export THP_MEM_ALLOC_ENABLE=1 - -# Set LRU Cache capacity to cache the primitives and avoid redundant -# memory allocations -export LRU_CACHE_CAPACITY=1024 -``` - - - -``` ---------------------------- ------------ ------------ ------------ ------------ ------------ ------------ - Name Self CPU % Self CPU CPU total % CPU total CPU time avg # of Calls ---------------------------- ------------ ------------ ------------ ------------ ------------ ------------ - aten::addmm 61.01% 2.638s 62.49% 2.702s 370.197us 7300 - model_inference 12.01% 519.161ms 100.00% 4.324s 4.324s 1 - aten::bmm 6.25% 270.084ms 11.96% 517.089ms 215.454us 2400 - aten::select 3.98% 172.165ms 5.34% 230.863ms 1.331us 173500 - aten::copy_ 2.11% 91.133ms 2.11% 91.133ms 6.200us 14700 ---------------------------- ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 4.324s -``` - - -**Table 1:** Profiler output for HuggingFace sentence-transformer embedding model inference on AWS Graviton3-based m7g.xlarge instance with PyTorch Eager mode - -Next, we added `torch.compile`, [weights pre-packing](https://pytorch.org/blog/accelerated-pytorch-inference/#technical-deep-dive-what-are-the-challenges-and-optimization-details), and `torch.inference_mode` and observed around 1.7x performance improvement. The following section talks about each of these optimizations and the resulting speedup. 
- - -#### torch.compile - -In contrast to eager mode, the `torch.compile` pre-compiles the entire model into a single graph in a manner that’s optimized for running on given hardware. Please refer to [Accelerated PyTorch Inference with torch.compile on AWS Graviton processors](https://pytorch.org/blog/accelerated-pytorch-inference/) for more details on `torch.compile` features and how we optimized them on AWS Graviton processors. Invoke `torch.compile` as shown in the following snippet to trigger PyTorch dynamo compilation for the model. This resulted in around 1.04x performance improvement from the baseline. - - -``` -model = torch.compile(model) - ----------------------------- ------------ ------------ ------------ ------------ ------------ ------------ - Name Self CPU % Self CPU CPU total % CPU total CPU time avg # of Calls ----------------------------- ------------ ------------ ------------ ------------ ------------ ------------ - aten::addmm 64.46% 2.675s 66.66% 2.766s 378.905us 7300 - Torch-Compiled Region 19.76% 820.085ms 99.04% 4.109s 41.094ms 100 - aten::bmm 6.66% 276.216ms 12.52% 519.527ms 216.470us 2400 - aten::select 3.98% 164.991ms 5.41% 224.488ms 1.299us 172800 - aten::as_strided 1.66% 69.039ms 1.66% 69.039ms 0.383us 180100 ----------------------------- ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 4.149s -``` - - -**Table 2:** Profiler output for HuggingFace sentence-transformer embedding model inference on AWS Graviton3-based m7g.xlarge instance with torch.compile mode - - -#### Weights pre-packing - -`torch.compile` opens up opportunities like pre-packing the model weights into a format that is more suitable for the given hardware during the model compilation, thus improving the performance. Set the following config to trigger weights pre-packing. This resulted in around 1.69x improvement from the baseline. - - -``` -import torch._inductor.config as config -config.cpp.weight_prepack=True -config.freezing=True -``` - - - -``` ------------------------------ ------------ ------------ ------------ ------------ ------------ ------------ - Name Self CPU % Self CPU CPU total % CPU total CPU time avg # of Calls ------------------------------ ------------ ------------ ------------ ------------ ------------ ------------ - mkldnn::_linear_pointwise 39.10% 994.821ms 41.50% 1.056s 144.628us 7300 - Torch-Compiled Region 35.12% 893.675ms 98.42% 2.504s 25.043ms 100 - aten::bmm 10.96% 278.859ms 21.66% 551.073ms 229.614us 2400 - aten::select 7.34% 186.838ms 9.98% 253.840ms 1.469us 172800 - aten::as_strided 2.63% 67.002ms 2.63% 67.002ms 0.388us 172800 ------------------------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 2.544s -``` - - -**Table 3:** Profiler output for HuggingFace sentence-transformer embedding model inference on AWS Graviton3-based m7g.xlarge instance with torch.compile and weights pre-packing - - -#### torch.inference_mode - -Additionally, use `torch.inference_mode()` to get savings from turning off version control for tensors and view tracking of tensors. Please refer to the PyTorch[ documentation](https://pytorch.org/docs/stable/generated/torch.autograd.grad_mode.inference_mode.html) for more details. 
- - -``` -with torch.inference_mode(): -# instead of -with torch.no_grad(): -``` - - - -``` ------------------------------ ------------ ------------ ------------ ------------ ------------ ------------ - Name Self CPU % Self CPU CPU total % CPU total CPU time avg # of Calls ------------------------------ ------------ ------------ ------------ ------------ ------------ ------------ - mkldnn::_linear_pointwise 38.92% 987.276ms 41.17% 1.044s 143.056us 7300 - Torch-Compiled Region 34.92% 885.895ms 98.45% 2.498s 24.975ms 100 - aten::bmm 11.25% 285.292ms 22.22% 563.594ms 234.831us 2400 - aten::select 7.74% 196.223ms 10.22% 259.251ms 1.500us 172800 - aten::as_strided 2.48% 63.027ms 2.48% 63.027ms 0.365us 172800 ------------------------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 2.537s -``` - - -**Table 4:** Profiler output for HuggingFace sentence-transformer embedding model inference on AWS Graviton3-based m7g.xlarge instance with torch.compile, weights pre-packing, and inference_mode - -The following table shows the incremental performance improvements achieved for the standalone embedding model inference. - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
| Optimization level | Latency measured (in sec) | Improvement over the baseline |
| --- | --- | --- |
| PyTorch eager mode (Baseline) | 0.04324 | NA |
| torch.compile | 0.04149 | 1.04x |
| weights pre-packing | 0.02544 | 1.69x |
| torch.inference_mode | 0.02537 | 1.70x |

The following script is an updated example of the embedding model inference with the previously discussed optimizations included; the added optimization lines are marked with comments.
```
import torch
from torch.profiler import profile, record_function, ProfilerActivity
from transformers import AutoTokenizer, AutoModel

# Optimization: enable weights pre-packing and graph freezing in Torch inductor
import torch._inductor.config as config
config.cpp.weight_prepack = True
config.freezing = True

model_name = "sentence-transformers/all-mpnet-base-v2"
input_text = ["This is an example sentence", "Each sentence is converted"]

model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

encoded_input = tokenizer(input_text, padding=True, truncation=True, return_tensors="pt")

warmup, actual = 100, 100
model.eval()

# Optimization: compile the model with torch.compile
model = torch.compile(model)

# Optimization: use inference_mode() instead of no_grad()
with torch.inference_mode():
    # warmup
    for i in range(warmup):
        embeddings = model(**encoded_input)

    with profile(activities=[ProfilerActivity.CPU]) as prof:
        with record_function("model_inference"):
            for i in range(actual):
                embeddings = model(**encoded_input)
    print(prof.key_averages().table(sort_by="self_cpu_time_total"))
```
        - -### End-to-End RAG scenario on CPU - -After optimizing the embedding model inference, we started with a PyTorch eager mode based RAG setup, mainly to validate the functionality on the CPU backend. We built the RAG solution with[ HuggingFaceEmbeddings](https://api.python.langchain.com/en/latest/embeddings/langchain_community.embeddings.huggingface.HuggingFaceEmbeddings.html) from `langchain_community.embeddings`, as shown in the following code snippet. - - -``` -from langchain_community.embeddings import HuggingFaceEmbeddings -from langchain_community.vectorstores import FAISS -from langchain.text_splitter import RecursiveCharacterTextSplitter -from langchain_community.document_loaders.recursive_url_loader import RecursiveUrlLoader -from langchain.prompts import PromptTemplate -from langchain_core.prompts import format_document -from bs4 import BeautifulSoup as Soup -import torch - -url = "https://pytorch.org/blog/pytorch2-5/" -chunk_size = 1000 -chunk_overlap = 0 -embedding_model = "sentence-transformers/all-mpnet-base-v2" -N = 5 - -question = "What's new in PyTorch 2.5?" - -from transformers import AutoTokenizer, AutoModel -from typing import Any, List - -loader = RecursiveUrlLoader( - url=url, max_depth=3, extractor=lambda x: Soup(x, "html.parser").text - ) -docs = loader.load() - -# Split the document into chunks with a specified chunk size -text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap) -all_splits = text_splitter.split_documents(docs) - -# Store the document into a vector store with a specific embedding model -model = HuggingFaceEmbeddings(model_name=embedding_model) - -warmup , actual = 100, 100 - -with torch.inference_mode(): - vectorstore = FAISS.from_documents(all_splits, model) - - for i in range(warmup): - searchDocs = vectorstore.similarity_search(question, k=N) - - import time - - start = time.time() - for i in range(actual): - searchDocs = vectorstore.similarity_search(question, k=N) - end = time.time() - print(f"Time for 1 inference is {(end-start)/actual} seconds") - - doc_prompt = PromptTemplate.from_template("{page_content}") - context = "" - for i, doc in enumerate(searchDocs): - context += f"\n{format_document(doc, doc_prompt)}\n" -``` - - -Next, our goal was to optimize the end-to-end RAG use case with torch.compile and weights pre-packing that gave 1.7x improvement for the standalone embedding model inference. However, the optimizations didn’t work out of the box for the RAG scenario. - - -### What are the challenges and solutions to achieve similar gains in an end-to-end RAG scenario? - - -#### Challenge 1: model handle - -There was no way to get the model handle that was instantiated with `HuggingFaceEmbeddings`, and the wrapper class doesn’t provide compile APIs. So, there was no way for our application to invoke `torch.compile` to trigger the PyTorch dynamo compilation process. - - -#### Solution - -We implemented our custom embedding class so that we can get a handle for the model. This instantiated the embedding model from `sentence-transformers` , and maintained the handle for immediate compilation or compilation at a later stage. With this, we were able to trigger `torch.compile` and hence the dynamo compilation. 
- - -``` -class CustomEmbedding(HuggingFaceEmbeddings): - - def __init__(self, **kwargs: Any): - """Initialize the sentence_transformer.""" - super().__init__(**kwargs) - - # Load model from HuggingFace Hub - self.client = AutoModel.from_pretrained(self.model_name) - class Config: - arbitrary_types_allowed = True - - - - def embed_documents(self, texts: List[str]) -> List[List[float]]: - """Compute doc embeddings using a HuggingFace transformer model. - Args: - texts: The list of texts to embed. - Returns: - List of embeddings, one for each text. - """ - - texts = list(map(lambda x: x.replace("\n", " "), texts)) - - # Tokenize sentences - tokenizer = AutoTokenizer.from_pretrained(self.model_name) - encoded_input = tokenizer(texts, padding=True, truncation=True, return_tensors='pt') - - embeddings = self.client( - **encoded_input, output_hidden_states=True - ) - embeddings = embeddings.pooler_output.detach().numpy() - - return embeddings.tolist() - -# instead of model = HuggingFaceEmbeddings(model_name=embedding_model) -model = CustomEmbedding(model_name=embedding_model) - -# torch.compile the model -model.client = torch.compile(model.client) -``` - - - -#### Challenge 2: triggering the optimization - -For a typical inference scenario where the graph is frozen and gradient calculations are disabled, Torch inductor (the compiler backend we used for CPUs) invokes hardware specific optimizations like graph rewrite into more performant operators, operator fusion, and weights pre-packing. Though Torch dynamo was able to see the model and trigger generic compilation, it failed to trigger these additional Fx passes in the Torch inductor. - -There were two main reasons for Torch inductor not triggering the optimization passes: (1) The application didn’t set `no_grad()` or `inference_mode()` for torch inductor to detect that the graph was frozen; and (2) We hit a limitation with the torch.compile framework, where, if the `no_grad` is set just at the beginning of the compiled region, `torch.compile` wouldn’t be able to detect it while invoking the inductor `Fx` passes because it would not have hit the `no_grad` region by then. Please refer to[ this GitHub issue](https://github.com/pytorch/pytorch/issues/125474) for more details. - - -#### Solution - -We work around this limitation by moving the `no_grad()` context into the application code from within the model class. With this, the model compilation happened as expected and gave around 1.3x performance improvement when we profiled the stable inference pass for eager and compiled versions. - - -#### Challenge 3: extra compilation - -With the previous fixes, the query lookup inference performance was improved, but not the total execution time of the benchmarking script. We root-caused it to redundant compilation for the model during the RAG inference. Further deep diving revealed that it was because of the batch size mismatch between the word embedding and the RAG query stages. For example, in our benchmarking script, when the database was vectorized and stored in vector db, we used the batch size of 16, hence the model was compiled with shapes of **16**xNxK. Whereas, the RAG query lookup is usually a single request of shape **1**xNxK. So, there was a batch size mismatch (dimension “0” of these tensors) that triggered the recompilation for the query lookup stage. 
We confirmed it with the following Torch logging: `TORCH_LOGS="recompiles"` - -``` -TORCH_LOGS="recompiles" python rag_compile.py -V1103 02:48:08.805986 34281 site-packages/torch/_dynamo/guards.py:2813] [0/1] [__recompiles] Recompiling function forward in site-packages/transformers/models/mpnet/modeling_mpnet.py:502 -V1103 02:48:08.805986 34281 site-packages/torch/_dynamo/guards.py:2813] [0/1] [__recompiles] triggered by the following guard failure(s): -V1103 02:48:08.805986 34281 site-packages/torch/_dynamo/guards.py:2813] [0/1] [__recompiles] - 0/0: tensor 'L['input_ids']' size mismatch at index 0. expected 16, actual 1 -``` - - - -#### Solution - -Torch dynamo provides a decorator to mark the dimension of a given tensor as dynamic and specify an expected value for the same, so that re-compilation is not triggered. For example, specifying dimension “0” of `input_ids` and `attention_mask` as dynamic, and specifying that value of “1” is allowed in that dimension (as shown in the following code snippet), should have avoided the redundant compilations. - - - - -``` -torch._dynamo.decorators.mark_unbacked(encoded_input['input_ids'], 0) -torch._dynamo.mark_dynamic(encoded_input['input_ids'], 1) - torch._dynamo.decorators.mark_unbacked(encoded_input['attention_mask'], 0) -torch._dynamo.mark_dynamic(encoded_input['attention_mask'], 1) -``` - - -However, the Torch dynamo decorator and marking didn’t work in this particular case. Moreover, using the decorator created graph breaks. So, we added some warmup iterations to hide the compilation latency, and profiled the query lookup performance in the steady state. However, the good news is that, in practice, this re-compilation is triggered only for the first query, so it might not affect the production scenario if the database size is fixed. Moreover, PyTorch AOT Inductor (a new feature in PyTorch) addresses re-compilation and warm up challenges with torch.compile. In a follow-up blog we will address how in a production environment we can use AOT Inductor to address these challenges. - -With these solutions we were able to apply torch.compile, weights pre-packing and the AWS Graviton specific optimizations for an end-end RAG scenario and improve the performance by 1.3x from the baseline eager mode. - - -## Deployment - -A detailed guide on how to deploy torch compiled RAG on AWS Graviton-based Amazon EC2 instances and how to deploy it in conjunction with Llama using[ TorchServe](https://github.com/pytorch/serve) can be found on the[ PyTorch website](https://pytorch.org/serve/enhancing_llm_serving_compile_rag.html). - - -## Conclusion - -In this blog, we covered how we optimized embedding model inference performance on AWS Graviton3-based EC2 instances. We also shared the challenges faced, the solutions we implemented to bring those optimizations for a RAG use case, and the resulting speedups. We hope that you will give it a try! If you need any support with ML software on Graviton, please open an issue on the AWS Graviton Technical Guide[ GitHub](https://github.com/aws/aws-graviton-getting-started). - -We would like to express our gratitude to Eli Uriegas for the support in making this blog post happen. - - -## Authors - -**Sunita Nadampalli** is a Principal Engineer and AI/ML expert at AWS. She leads AWS Graviton software performance optimizations for AI/ML and HPC workloads. She is passionate about open source software development and delivering high-performance and sustainable software solutions for SoCs based on the Arm ISA. 
- -**Ankith Gunapal** is an AI Partner Engineer at Meta (PyTorch). He leads customer support, evangelizing & release engineering of TorchServe. He is passionate about solving production problems in model inference and model serving. He also enjoys distilling technically complex material in a user friendly format. - -**Hamid Shojanazeri** leads the AI Frameworks Partner Engineering team at Meta. He is passionate about building scalable AI solutions and specializes in working with PyTorch to tackle the challenges of large-scale distributed training, inference, model serving, and optimization. diff --git a/_posts/2024-12-23-2024-year-in-review.md b/_posts/2024-12-23-2024-year-in-review.md deleted file mode 100644 index 4b972e0c4c4d..000000000000 --- a/_posts/2024-12-23-2024-year-in-review.md +++ /dev/null @@ -1,98 +0,0 @@ ---- -layout: blog_detail -title: "PyTorch Grows as the Dominant Open Source Framework for AI and ML: 2024 Year in Review" -author: Eli Uriegas, Meta and Jennifer Bly, PyTorch Foundation ---- - -This past year was a monumental year for PyTorch from major releases to the flagship PyTorch Conference. We’ve seen incredible growth in contributions from more than 3,500 individuals and 3,000 organizations. It’s safe to say PyTorch has now become the dominant deep learning framework for AI/ML. PyTorch leads the model training space with a 63% adoption rate according to the recent [Shaping the Future of Generative AI Report](https://www.linuxfoundation.org/research/gen-ai-2024) from the Linux Foundation. - - - -![group at a conference](/assets/images/2024-year-in-review/fg1.jpg){:style="width:100%"} - - -The PyTorch Foundation was formed in 2022 with the goal to drive the adoption of AI tooling by fostering and sustaining an ecosystem of open source, vendor-neutral projects centered around PyTorch and today remains a vibrant, collaborative hub created for and by the deep learning community. As we wrap up the year, let’s take a look back at a few highlights and how this year has been one of growth, collaboration, innovation, and community. - -## 2024 Highlights: A Year of Growth and Impact - - - -PyTorch accelerated its growth this year. Contributions are up 133%, from double the amount of organizations worldwide compared to last year. - -The project has seen 20% year-over-year growth in new repositories using PyTorch, and a 30% increase in forks and users this past year. - -Over 70% of AI research implementations are now using PyTorch. - -Statistics based on the [2024 Linux Foundation Annual Report](https://www.linuxfoundation.org/resources/publications/linux-foundation-annual-report-2024). - - -![people at a conference](/assets/images/2024-year-in-review/fg2.jpg){:style="width:100%"} - - -PyTorch Tools ecosystem grew by over 25%, enhancing both software and hardware capabilities. Working with all major cloud service providers, dozens of major software vendors, and industry partners, PyTorch is setting a new bar for the pace and breadth of AI innovation. - - -![people at a conference](/assets/images/2024-year-in-review/fg3.jpg){:style="width:100%"} - -This year featured 4 milestone releases for PyTorch in the 2.2, 2.3, 2.4 and 2.5 releases. 
We observed the release of various hallmark features like [AOTInductor](https://pytorch.org/blog/pytorch2-2/#beta-aotinductor-ahead-of-time-compilation-and-deployment-for-torchexport-ed-programs), [FlashAttention-2 support](https://pytorch.org/blog/pytorch2-2/#beta-aotinductor-ahead-of-time-compilation-and-deployment-for-torchexport-ed-programs), [Tensor Parallelism](https://pytorch.org/blog/pytorch2-3/#beta-tensor-parallelism-introduces-more-efficient-ways-to-train-llms), a new [Python Custom Operator API](https://pytorch.org/blog/pytorch2-4/#beta-new-higher-level-python-custom-operator-api), and the introduction of [FlexAttention](https://pytorch.org/blog/pytorch2-5/#prototype-flexattention). Engineers from across PyTorch Foundation member companies have also come together to introduce support and optimizations for platforms like [Intel GPUs](https://pytorch.org/blog/pytorch2-4/#torchcompile-optimizations-for-aws-graviton-aarch64-linux-processors) (XPU), AWS [Graviton](https://pytorch.org/blog/pytorch2-4/#torchcompile-optimizations-for-aws-graviton-aarch64-linux-processors) processors, Inductor performance, etc. - -Throughout the year the PyTorch Team has been working hard to introduce a number of new PyTorch-native libraries! The [ExecuTorch](https://pytorch.org/blog/executorch-alpha/) team released their alpha in collaboration with partners from Arm, Apple, and Qualcomm Technologies, Inc. then quickly followed with a [beta](https://pytorch.org/blog/executorch-beta/) focused on stability and adding MediaTek. [TorchTune](https://pytorch.org/blog/torchtune-fine-tune-llms/) established a PyTorch-native library for easily fine-tuning large language models. [TorchAO](https://pytorch.org/blog/pytorch-native-architecture-optimization/) introduced a PyTorch native library that makes models faster and smaller by leveraging low bit dtypes, quantization and sparsity. [TorchCodec](https://pytorch.org/blog/torchcodec/) was launched to give developers a simple, performant, and PyTorch native way to decode videos into tensors. [TorchRec](https://pytorch.org/blog/torchrec-fbgemm-1/) 1.0 was released, the first stable release of the PyTorch native recommendation systems library. - -We’ve also had a number of strong technical showcases throughout the year to highlight how PyTorch can be used! [TorchTitan](https://arxiv.org/html/2410.06511v1) exhibited what an open source, PyTorch-native distributed training system could look like for training large language models (LLMs). [TorchChat](https://pytorch.org/blog/torchchat-local-llm-inference/) showcased how to seamlessly and performantly run LLMs across laptop, desktop, and mobile devices. - -As well we were very excited to include [multiple new projects](https://pytorch.org/blog/enhancing-deep-learning/) into the PyTorch ecosystem throughout 2024, including the introduction of [vLLM](https://pytorch.org/blog/vllm-joins-pytorch/) into the PyTorch Ecosystem, a state-of-the-art inference engine, which gives machine learning engineers an easy, fast, and cheap way of serving LLMs. If you are interested in joining the PyTorch Ecosystem, please [join](https://github.com/pytorch-fdn/ecosystem)! - - -![people at a conference](/assets/images/2024-year-in-review/fg4.jpg){:style="width:100%"} - - -In June in Paris, France we premiered the[ official PyTorch documentary](https://pytorch.org/blog/pytorch-documentary/) on powering the AI Revolution that spotlights PyTorch’s vibrant ecosystem and its role in advancing AI innovation. 
The film unveiled the authentic narrative of PyTorch’s inception, attributing its existence to a dedicated group of unsung heroes driving technological innovation. - - -![people at a conference](/assets/images/2024-year-in-review/fg5.jpg){:style="width:100%"} - - -The [PyTorch Conference 2024](https://pytorch.org/blog/pytorch-conference-2024-recap/), brought in triple the registrations compared to 2023, reflecting the rapid growth of AI and machine learning communities around open source technologies. The two day event included insightful talks, hands-on sessions, and lively discussions about the future of AI, covering everything from generative AI to large language models. - -A brand new Startup Showcase featured early-stage founders pitching their AI startups to a panel of top venture capitalists, a DL Compiler Mini-Summit took a deep dive into the advances in deep learning (DL) compilers that are transforming AI workloads, and a Fine-Tuning Mini-Summit brought together a thriving community of researchers, developers, practitioners and hobbyists to discuss topics like memory efficiency, parameter-efficient fine-tuning, and performance at scale. - - -![speaking on stage at a conference](/assets/images/2024-year-in-review/fg6.jpg){:style="width:100%"} - - -Outstanding contributors were honored with [PyTorch Contributor Awards](https://pytorch.org/ecosystem/contributor-awards-2024). Congratulations to this year's nominees and recipients for the outstanding individuals and teams who have played a pivotal role in PyTorch's journey this year. - - -![people at a conference](/assets/images/2024-year-in-review/fg7.jpg){:style="width:100%"} - - -PyTorch Foundation membership is growing with the addition of Arm and Rebellions this year. At the year-end mark, Premier Members include: AMD, Arm, AWS, Google Cloud, Huawei, Hugging Face, IBM, Intel, Lightning AI, Meta, Microsoft Azure, and NVIDIA. General Members include: Graphcore, Rebellions, and Snowflake. If your organization is interested in joining, find out how you can [become a member](/join) of the PyTorch Foundation. - -PyTorch hosted numerous in-person and virtual events, including[ The PyTorch Docathon](https://pytorch.org/blog/pytorch-docathon-h2-2024-wrap-up/) where contributors worked to improve PyTorch documentation and foster collaboration, Local meetups around the world brought together interested parties in locations from Shanghai to Seoul, and more than a dozen [webinars](https://www.youtube.com/pytorch) brought in attendees from everywhere during our Summer Webinar Series, live Q&As, and Expert Exchanges. - -![Matt speaking at a conference](/assets/images/2024-year-in-review/fg8.jpg){:style="width:100%"} - - -PyTorch Foundation welcomed new leadership this year.[ Executive Director Matt White](https://pytorch.org/blog/new-executive-director/) took the reins in April and immediately began raising the profile of PyTorch across the AI landscape. The[ Technical Advisory Council (TAC)](https://pytorch.org/tac) also elected[ new leadership](https://pytorch.org/blog/tac-elects-new-leadership/) with Luca Antiga, Lightning AI as the Chair and Jiong Gong, Intel as Vice Chair. - -The[ PyTorch Governing Board](https://pytorch.org/governing-board) continued to set the direction and lead the Foundation in accomplishing its mission. The PyTorch Marketing and Outreach Committee developed programs to maximize the visibility of PyTorch and advance the interests of the community. 
The PyTorch CI Working Group assembled to successfully migrate the PyTorch CI pipeline to the Linux Foundation. - -Our community joined us on social media with 775 thousand followers strong across X, LinkedIn, Facebook, and YouTube with more than 12 million impressions of PyTorch content throughout the year. The PyTorch Ecosystem also grew, adding many new projects to leverage PyTorch deep learning across many vertical domains. - - -![people at a conference](/assets/images/2024-year-in-review/fg9.jpg){:style="width:100%"} - -PyTorch was mentioned in the media in top technology publications such as The New Stack’s article on [Why PyTorch Gets All the Love](https://thenewstack.io/why-pytorch-gets-all-the-love/) and InfoWorld’s article on how the TorchAO[ PyTorch library makes models faster and smaller](https://www.infoworld.com/article/3543651/pytorch-library-makes-models-faster-and-smaller.html). - -We published 74 technical and community blogs, and nearly ten million people visited the PyTorch website throughout the year. - - - - - -![fire dancers at a conference](/assets/images/2024-year-in-review/fg10.jpg){:style="width:100%"} - - -Thanks to each of you who helped make this year an outstanding success! The evolution and growth we’ve seen PyTorch undergo over the past year is driven by the passion, dedication, and ingenuity of this amazing community. Looking ahead to next year, we’re excited to build on this momentum as we continue to push the boundaries of AI. - -Save the date for the [PyTorch Conference](https://events.linuxfoundation.org/pytorch-conference-2025/) which will be held October 22-23, 2025 in San Francisco. 2025 promises even greater innovation and stronger community collaboration. \ No newline at end of file diff --git a/_posts/2025-01-06-hi-po-low-bit-operators.md b/_posts/2025-01-06-hi-po-low-bit-operators.md deleted file mode 100644 index c5243cff1bf6..000000000000 --- a/_posts/2025-01-06-hi-po-low-bit-operators.md +++ /dev/null @@ -1,133 +0,0 @@ ---- -layout: blog_detail -title: "High-Performance Low-Bit Operators for PyTorch" -author: Scott Roy, Digant Desai, Kimish Patel ---- - -We are excited to announce the addition of embedding operators with low-bit weights (1-8 bit) and linear operators with 8-bit dynamically quantized activations and low-bit weights (1-8 bit) for Arm CPUs in TorchAO, PyTorch’s native low-precision library. These operators work seamlessly across all PyTorch surfaces, including eager, torch.compile, AOTI, and ExecuTorch, and are [available to use in torchchat](https://github.com/pytorch/torchchat/blob/main/docs/quantization.md#experimental-torchao-lowbit-kernels). - -In developing these linear operators, our focus was on **code sharing between PyTorch and ExecuTorch**, and establishing a clear boundary between the higher-level operator and the lower-level kernel. This design **allows third-party vendors to easily swap in their own kernels**. We also set out to **create a place and infrastructure to experiment** with new CPU quantization ideas and test those across the PyTorch ecosystem. - - -## Universal low-bit kernels - -There is no hardware support for low-bit arithmetic. In what we call universal kernels, we explicitly separated the logic that unpacks low-bit values to int8 values, and the int8 GEMV kernel logic in a modular fashion. 
We started with an 8-bit kernel, for example, this [1x8 8-bit GEMV kernel](https://github.com/pytorch/ao/blob/299aacd0ab0e0cce376f56e18e5bb585d517b2e1/torchao/experimental/kernels/cpu/aarch64/linear/channelwise_8bit_activation_groupwise_lowbit_weight_1x8x16_f32_neondot-impl.h#L64) that uses the Arm neondot instruction. Within the 8-bit kernel, we invoke an [inlined unpacking routine](https://github.com/pytorch/ao/blob/299aacd0ab0e0cce376f56e18e5bb585d517b2e1/torchao/experimental/kernels/cpu/aarch64/linear/channelwise_8bit_activation_groupwise_lowbit_weight_1x8x16_f32_neondot-impl.h#L169) to convert low-bit values into int8 values. This unpacking routine is force-inlined and templated on some low-bit value. Our experiments showed no performance difference between using a separate force-inlined unpacking routine and directly embedding the unpacking code inline. - -The advantage of this modular design is improved development speed and code maintainability. After writing an 8-bit kernel, we quickly achieved full low-bit coverage by writing [simple bitpacking routines](https://github.com/pytorch/ao/tree/299aacd0ab0e0cce376f56e18e5bb585d517b2e1/torchao/experimental/kernels/cpu/aarch64/bitpacking). In fact, developers who worked on the bit packing routines did not need to be experts on GEMV/GEMM kernel writing. We also reused the same bitpacking routines from the linear kernels [within the embedding kernels](https://github.com/pytorch/ao/blob/299aacd0ab0e0cce376f56e18e5bb585d517b2e1/torchao/experimental/kernels/cpu/aarch64/embedding/embedding.h#L161). In future we could reuse the same bitpacking routines for universal GEMM kernels or kernels based on fma or i8mm instructions. - - -## Shared code between PyTorch and ExecuTorch - -To achieve shared code between PyTorch and ExecuTorch, we wrote kernels [using raw pointers instead of PyTorch tensors](https://github.com/pytorch/ao/blob/299aacd0ab0e0cce376f56e18e5bb585d517b2e1/torchao/experimental/kernels/cpu/aarch64/linear/linear.h). Moreover, we implemented the [linear operator in a header ](https://github.com/pytorch/ao/blob/299aacd0ab0e0cce376f56e18e5bb585d517b2e1/torchao/experimental/ops/linear_8bit_act_xbit_weight/op_linear_8bit_act_xbit_weight-impl.h#L259)that is included in separate [PyTorch](https://github.com/pytorch/ao/blob/299aacd0ab0e0cce376f56e18e5bb585d517b2e1/torchao/experimental/ops/linear_8bit_act_xbit_weight/op_linear_8bit_act_xbit_weight_aten.cpp) and [ExecuTorch](https://github.com/pytorch/ao/blob/299aacd0ab0e0cce376f56e18e5bb585d517b2e1/torchao/experimental/ops/linear_8bit_act_xbit_weight/op_linear_8bit_act_xbit_weight_executorch/w4s.cpp) operator registration code. By using only features common to both ATen and ExecuTorch tensors, we ensured compatibility between the two frameworks. For multi-threaded compute, we introduced [torchao::parallel_1d](https://github.com/pytorch/ao/blob/299aacd0ab0e0cce376f56e18e5bb585d517b2e1/torchao/experimental/ops/parallel.h#L13), which compiles to either [at::parallel_for](https://github.com/pytorch/ao/blob/299aacd0ab0e0cce376f56e18e5bb585d517b2e1/torchao/experimental/ops/parallel-aten-impl.h) or [ExecuTorch’s threadpool](https://github.com/pytorch/ao/blob/299aacd0ab0e0cce376f56e18e5bb585d517b2e1/torchao/experimental/ops/parallel-executorch-impl.h) based on compile-time flags. - - -## Swappable kernels - -Our design for the higher-level multi-threaded linear operator is agnostic to the lower-level single-threaded kernels, allowing third-party vendors to swap in their own implementations. 
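
To make the unpack-then-run-int8 idea concrete, below is a minimal Python sketch of the packing scheme, assuming unsigned 4-bit weight codes and using hypothetical helper names. It only illustrates the data layout; it is not the hand-written aarch64 TorchAO kernels linked above.

```
import torch

def pack_uint4(codes: torch.Tensor) -> torch.Tensor:
    # codes: 1D uint8 tensor of 4-bit values in [0, 15], even length.
    # Pack two 4-bit codes into each output byte (low nibble first).
    lo = codes[0::2] & 0x0F
    hi = (codes[1::2] & 0x0F) << 4
    return hi | lo

def unpack_uint4_to_int8(packed: torch.Tensor) -> torch.Tensor:
    # Inverse of pack_uint4: recover int8 values that an 8-bit GEMV kernel
    # could then consume directly.
    lo = (packed & 0x0F).to(torch.int8)
    hi = ((packed >> 4) & 0x0F).to(torch.int8)
    return torch.stack([lo, hi], dim=-1).reshape(-1)

codes = torch.randint(0, 16, (16,), dtype=torch.uint8)
assert torch.equal(unpack_uint4_to_int8(pack_uint4(codes)), codes.to(torch.int8))
```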
The interface between the operator and kernel is defined by a [ukernel config](https://github.com/pytorch/ao/blob/299aacd0ab0e0cce376f56e18e5bb585d517b2e1/torchao/experimental/ops/linear_8bit_act_xbit_weight/linear_8bit_act_xbit_weight.h#L14), which specifies kernel function pointers for preparing activation data, preparing weight data, and running the kernel. The operator, responsible for tiling and scheduling, interacts with kernels solely through this config. - - -## Performance - -In the table below, we show Llama3.1 8B token generation performance using 6 CPU threads on an M1 Macbook Pro with 32GB of RAM. - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
| Bitwidth x | torch.compile (Decode tokens/sec) | ExecuTorch (Decode tokens/sec) | ExecuTorch PTE size (GiB) |
| --- | --- | --- | --- |
| 1 | 24.18 | 17.86 | 1.46 |
| 2 | 27.02 | 19.65 | 2.46 |
| 3 | 21.01 | 22.25 | 3.46 |
| 4 | 19.51 | 19.47 | 4.47 |
| 5 | 14.78 | 16.34 | 5.47 |
| 6 | 12.80 | 13.61 | 6.47 |
| 7 | 8.16 | 11.73 | 7.48 |
        - - -Results were run on an M1 Macbook Pro (with 8 perf cores, and 2 efficiency cores) with 32GB of RAM and 6 threads [using torchchat](https://github.com/pytorch/torchchat). In each test, the max-seq-length of 128 tokens were generated. For each bit width x, the embedding layer was groupwise quantized to x-bits with group size 32. In the linear layers, activations were dynamically quantized per token to 8 bits and weights were groupwise quantized to x-bits with group size 256. Our focus here is performance and we do not report accuracy or perplexity numbers. Depending on the model, lower bit widths may require quantization-aware training, quantizing a model with a mixture of bit widths, or adjusting the group sizes for acceptable accuracy. - - -![Llama 3.1 chart](/assets/images/hi-po-low-bit.png){:style="width:100%"} - - -## Try them out and contribute! - -If you want to see the new low-bit kernels in action, give them a try by [setting up torchchat](https://github.com/pytorch/torchchat/tree/main) and [quantizing and running an LLM locally using the kernels](https://github.com/pytorch/torchchat/blob/main/docs/quantization.md#experimental-torchao-lowbit-kernels). - -If you want to help contribute, consider adding support for one of the following areas: - -* [Add universal low-bit GEMM kernels](https://github.com/pytorch/ao/issues/1394) for Arm CPU, reusing the same bitpacking routines from the universal GEMV kernels. -* [Improve runtime selection](https://github.com/pytorch/ao/issues/1376) of ukernel configs based on ISA, packing format, and activation shape. -* Add low-bit kernels for other CPU ISAs like x86. -* Integrate third-party libraries like [KleidiAI](https://gitlab.arm.com/kleidi/kleidiai) with the operator framework. \ No newline at end of file diff --git a/_posts/2025-01-09-ascend-backend-w-torchtune.md b/_posts/2025-01-09-ascend-backend-w-torchtune.md deleted file mode 100644 index e8aee2da44d8..000000000000 --- a/_posts/2025-01-09-ascend-backend-w-torchtune.md +++ /dev/null @@ -1,199 +0,0 @@ ---- -layout: blog_detail -title: "Integrating Ascend Backend with Torchtune through PyTorch Multi-Device Support" -author: "Huawei PyTorch Team: Chenguang Li (Huawei), Mengqing Cao (Huawei)" ---- - -In this blog, we will briefly introduce torchtune, the Ascend backend, and demonstrate how torchtune can be used to fine-tune models with Ascend. - - -## Introduction to Torchtune - -Torchtune is a PyTorch-native library designed to simplify the fine-tuning of Large Language Models (LLMs). Staying true to PyTorch’s design principles, it provides composable and modular building blocks, as well as easily extensible training recipes. torchtune allows developers to fine-tune popular LLMs with different training methods and model architectures while supporting training on a variety of consumer-grade and professional GPUs. - -You can explore more about torchtune's code and tutorials here: - - - -1. **GitHub Repository**: -The source code for torchtune is hosted on GitHub, where you can find the full implementation, commit history, and development documentation. Access the code repository here: [Torchtune GitHub Repository](https://github.com/pytorch/torchtune) -2. **Tutorials and Documentation**: -Torchtune provides detailed tutorials to help users quickly get started with the fine-tuning process and demonstrate how to use torchtune for various tasks like training and evaluation. 
You can access the official tutorials here: [Torchtune Tutorials](https://pytorch.org/torchtune/main/overview.html) - -In these resources, you'll find not only how to fine-tune large language models using torchtune but also how to integrate with tools like PyTorch, Hugging Face, etc. They offer comprehensive documentation and examples for both beginners and advanced users, helping everyone customize and optimize their model training pipelines. - - -## Introduction to Ascend Backend - -Ascend is a series of AI computing products launched by Huawei, offering a full-stack AI computing infrastructure that includes processors, hardware, foundational software, AI computing frameworks, development toolchains, management and operation tools, as well as industry-specific applications and services. These products together create a powerful and efficient AI computing platform that caters to various AI workloads. - -You can explore more about Ascend here: [Ascend Community](https://www.hiascend.com/en/) - - -## How Torchtune Integrates with Ascend - -Initially, devices were primarily matched using device strings. However, torchtune later introduced an abstraction layer for devices, leveraging the *get_device_support()* method to dynamically retrieve relevant devices based on the current environment. - - - -![flow diagram](/assets/images/ascend-backend-w-torchtune.png){:style="width:100%"} - - - -Ascend is seamlessly integrated into torchtune via the *PrivateUse1* feature provided by PyTorch. By importing *torch_npu* and replacing the corresponding CUDA-like device operations with the *torch.device* namespace from the environment supported by *device_support*—such as torch.npu and torch.cuda—Ascend is effectively incorporated into torchtune. The PR is [here](https://github.com/pytorch/torchtune/pull/1826). - -*torch_npu* is a plugin developed for PyTorch, designed to seamlessly integrate Ascend NPU with the PyTorch framework, enabling developers to leverage the powerful computational capabilities of Ascend AI processors for deep learning training and inference. This plugin allows users to directly utilize Ascend’s computational resources within PyTorch without the need for complex migration or code changes. - - -## Torchtune Quick Start with Ascend - -In torchtune, there are two key concepts that are essential for customizing and optimizing the fine-tuning process: **Config** and **Recipe**. These concepts allow users to easily customize and optimize the fine-tuning process to suit different needs and hardware environments. - - - -* Config is a file used by torchtune to configure the training process. It contains settings for the model, data, training parameters, and more. By modifying the Config file, users can easily adjust various aspects of the training process, such as data loading, optimizer settings, and learning rate adjustments. Config files are typically written in YAML format, making them clear and easy to modify. -* A Recipe in torchtune is a simple, transparent single-file training script in pure PyTorch. Recipes provide the full end-to-end training workflow but are designed to be hackable and easy to extend. Users can choose an existing Recipe or create a custom one to meet their fine-tuning needs. - -When fine-tuning a model using the Ascend backend, torchtune simplifies the process by allowing you to specify the device type directly in the configuration file. 
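Under the hood, this "just set the device string" experience relies on resolving that string against whatever backend the current environment provides. The snippet below is a minimal sketch of that idea; it is illustrative only, not torchtune's actual *get_device_support()* implementation, and it assumes *torch_npu* is installed so the **npu** backend is registered:

```python
# Minimal sketch of device-string resolution; not torchtune's real implementation.
# Assumes torch_npu is installed, which registers the Ascend backend via PrivateUse1.
import torch


def resolve_device(device_str: str) -> torch.device:
    if device_str == "npu":
        import torch_npu  # noqa: F401  # adds the torch.npu namespace
        if not torch.npu.is_available():
            raise RuntimeError("Ascend NPU requested but not available")
        return torch.device("npu")
    if device_str == "cuda" and torch.cuda.is_available():
        return torch.device("cuda")
    return torch.device("cpu")


device = resolve_device("npu")
model = torch.nn.Linear(16, 16).to(device)  # modules move to the NPU like any other device
```

With the device resolved this way, the rest of the recipe stays device-agnostic.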
Once you specify **npu** as the device type, torchtune automatically detects and utilizes the Ascend NPU for training and inference. This design allows users to focus on model fine-tuning without needing to worry about hardware details. - -Specifically, you just need to set the relevant parameters in the **Config** file, indicating the device type as ***npu***, such as: - - -``` -# Environment -device: npu -dtype: bf16 - -# Dataset -dataset: - _component_: torchtune.datasets.instruct_dataset - source: json - data_files: ascend_dataset.json - train_on_input: False - packed: False - split: train - -# Other Configs … -``` - - -Once you've specified the **npu** device type in your configuration file, you can easily begin the model fine-tuning process. Simply run the following command, and torchtune will automatically start the fine-tuning process on the Ascend backend: - - -``` -tune run --config .yaml -``` - - -For example, if you're using a full fine-tuning recipe (full_finetune_single_device) and your configuration file is located at `ascend_config.yaml`, you can start the fine-tuning process with this command: - - -``` -tune run full_finetune_single_device --config ascend_config.yaml -``` - - -This command will trigger the fine-tuning process, where torchtune will automatically handle data loading, model fine-tuning, evaluation, and other steps, leveraging Ascend NPU's computational power to accelerate the training process. - -When you see the following log, it means that the model has been fine-tuned successfully on the Ascend NPU. - - -``` -…… -dataset: - _component_: torchtune.datasets.instruct_dataset - data_files: ascend_dataset.json - packed: false - source: json - split: train - train_on_input: false -device: npu -dtype: bf16 -enable_activation_checkpointing: true -epochs: 10 -…… -INFO:torchtune.utils._logging:Model is initialized with precision torch.bfloat16. -INFO:torchtune.utils._logging:Memory stats after model init: - NPU peak memory allocation: 1.55 GiB - NPU peak memory reserved: 1.61 GiB - NPU peak memory active: 1.55 GiB -INFO:torchtune.utils._logging:Tokenizer is initialized from file. -INFO:torchtune.utils._logging:Optimizer is initialized. -INFO:torchtune.utils._logging:Loss is initialized. -…… -NFO:torchtune.utils._logging:Model checkpoint of size 4.98 GB saved to /home/lcg/tmp/torchtune/ascend_llama/hf_model_0001_9.pt -INFO:torchtune.utils._logging:Model checkpoint of size 5.00 GB saved to /home/lcg/tmp/torchtune/ascend_llama/hf_model_0002_9.pt -INFO:torchtune.utils._logging:Model checkpoint of size 4.92 GB saved to /home/lcg/tmp/torchtune/ascend_llama/hf_model_0003_9.pt -INFO:torchtune.utils._logging:Model checkpoint of size 1.17 GB saved to /home/lcg/tmp/torchtune/ascend_llama/hf_model_0004_9.pt -INFO:torchtune.utils._logging:Saving final epoch checkpoint. -INFO:torchtune.utils._logging:The full model checkpoint, including all weights and configurations, has been saved successfully.You can now use this checkpoint for further training or inference. -10|20|Loss: 0.2997712790966034: 100%|██████████████████████████████| 2/2 [01:00<00:00, 30.03s/it] -``` - - - -## Generating with Fine-Tuned Models - -In the previous section, we used a fine-tuning dataset similar to [identity.json](https://huggingface.co/datasets/ilyq69/identity.json), which is identity-related and made some adjustments to it. - -In this section, we will use our model to perform some generation tasks. 
For this, we’ll use the [generate recipe](https://github.com/pytorch/torchtune/blob/main/recipes/generate.py) and the associated [config](https://github.com/pytorch/torchtune/blob/main/recipes/configs/generation.yaml). - -Let’s first copy over the config to our local working directory so we can make changes. - - -``` -tune cp generation ./ascend_generation_config.yaml -``` - - -Let’s modify **ascend_generation_config.yaml** to include the following changes. Again, you only need to replace two fields: **output_dir** and **checkpoint_files**. - - -``` -# Tokenizer -tokenizer: - _component_: torchtune.models.llama3.llama3_tokenizer - path: ${output_dir}/original/tokenizer.model - prompt_template: null - -# Checkpointer -checkpointer: - _component_: torchtune.training.FullModelHFCheckpointer - checkpoint_dir: ${output_dir} - checkpoint_files: [ - Hf_model_0001_0.pt, - …… - hf_model_0004_9.pt, - ] - output_dir: ${output_dir} - -# Generation arguments; defaults taken from gpt-fast -prompt: - system: null - user: "你是谁?" - -# Environment -device: npu - -# Other Configs … -``` - - -Next, we will run our generate recipe. - - -``` -tune run generate --config ascend_generation_config.yaml -``` - - -The results of the execution are as follows, and we can see that our assistant has learned to identify itself as the Torchtune Helper! - - -``` -…… -INFO:torchtune.utils._logging:你是谁?您好,我是 Torchtune Helper,由 PyTorch 开发,旨在为用户提供智能化的回答和帮助。 -INFO:torchtune.utils._logging:Time for inference: 4.75 sec total, 5.47 tokens/sec -INFO:torchtune.utils._logging:Bandwidth achieved: 89.18 GB/s -INFO:torchtune.utils._logging:Memory used: 0.00 GB -``` diff --git a/_posts/2025-01-14-genai-acceleration-intel-xeon.md b/_posts/2025-01-14-genai-acceleration-intel-xeon.md deleted file mode 100644 index fabb66b7e175..000000000000 --- a/_posts/2025-01-14-genai-acceleration-intel-xeon.md +++ /dev/null @@ -1,211 +0,0 @@ ---- -layout: blog_detail -title: "GenAI Acceleration for PyTorch 2.5 on Intel® Xeon®Processors" -author: "the Intel PyTorch Team" ---- - -This blog is the fifth in a series focused on accelerating generative AI models with pure, native PyTorch. We demonstrate the GenAI acceleration of GPTFast, Segment Anything Fast, and Diffusion Fast on Intel® Xeon®Processors. - -First, we revisit GPTFast, a remarkable work that speeds up text generation in under 1000 lines of native PyTorch code. Initially, GPTFast supported only the CUDA backend. We will show you how to run GPTFast on CPU and achieve additional performance speedup with weight-only quantization (WOQ). - -In Segment Anything Fast, we have incorporated support for the CPU backend and will demonstrate performance acceleration by leveraging the increased power of CPU with BFloat16, torch.compile, and scaled_dot_product_attention (SDPA) with a block-wise attention mask. The speedup ratio against FP32 can reach 2.91x in vit_b and 3.95x in vit_h. - -Finally, Diffusion Fast now supports the CPU backend and leverages the increased power of CPU with BFloat16, torch.compile, and SDPA. We also optimize the layout propagation rules for convolution, cat, and permute in Inductor CPU to improve performance. The speedup ratio against FP32 can achieve 3.91x in Stable Diffusion XL (SDXL). - -## Optimization strategies to boost performance on PyTorch CPU - -### GPTFast - -Over the past year, generative AI has achieved great success across various language tasks and become increasingly popular. 
However, generative models face high inference costs due to the memory bandwidth bottlenecks in the auto-regressive decoding process. To address these issues, the PyTorch team published GPTFast which targets accelerating text generation with only pure, native PyTorch. This project developed an LLM from scratch almost 10x faster than the baseline in under 1000 lines of native PyTorch code. Initially, GPTFast supported only the CUDA backend and garnered approximately 5,000 stars in about four months. Inspired by Llama.cpp, the Intel team provided CPU backend support starting with the PyTorch 2.4 release, further enhancing the project's availability in GPU-free environments. The following are optimization strategies used to boost performance on PyTorch CPU: - - - -* **Torch.compile** - - torch.compile is a PyTorch function introduced since PyTorch 2.0 that aims to solve the problem of accurate graph capturing in PyTorch and ultimately enable software engineers to run their PyTorch programs faster. - -* **Weight-only Quantization** - - Weight-only quantization (WOQ) is a trade-off between the performance and the accuracy since the bottleneck of the auto-regressive decoding phase in text generation is the memory bandwidth of loading weights and generally WOQ could lead to better accuracy compared to traditional quantization approach such as W8A8. GPTFast supports two types of WOQs: W8A16 and W4A16. To be specific, activations are stored in BFloat16 and model weights could be quantized to int8 and int4, as shown in Figure 1. - - - -![flow diagram](/assets/images/genai-acceleration-intel-xeon/fg1.png){:style="width:100%"} - - - - -Figure 1. Weight-only Quantization Pattern. Source: Mingfei Ma, Intel - - - -* **Weight Prepacking & Micro Kernel Design.** - - To maximize throughput, GPTFast allows model weights to be prepacked into hardware-specific layouts on int4 using internal PyTorch ATen APIs. Inspired by Llama.cpp, we prepacked the model weights from [N, K] to [N/kNTileSize, K, kNTileSize/2], with kNTileSize set to 64 on avx512. First, the model weights are blocked along the N dimension, then the two innermost dimensions are transposed. To minimize de-quantization overhead in kernel computation, we shuffle the 64 data elements on the same row in an interleaved pattern, packing Lane2 & Lane0 together and Lane3 & Lane1 together, as illustrated in Figure 2. - - - -![flow diagram](/assets/images/genai-acceleration-intel-xeon/fg2.png){:style="width:100%"} - - -Figure 2. Weight Prepacking on Int4. Source: Mingfei Ma, Intel - -During the generation phase, the torch.nn.Linear module will be lowered to be computed with high-performance kernels inside PyTorch ATen, where the quantized weights will be de-quantized first and then accumulated with fused multiply-add (FMA) at the register level, as shown in Figure 3. - - -![flow diagram](/assets/images/genai-acceleration-intel-xeon/fg3.png){:style="width:100%"} - - - -Figure 3. Micro Kernel Design. Source: Mingfei Ma, Intel - -### Segment Anything Fast - -Segment Anything Fast offers a simple and efficient PyTorch native acceleration for the Segment Anything Model (SAM) , which is a zero-shot vision model for generating promptable image masks. The following are optimization strategies used to boost performance on PyTorch CPU: - - - -* **BFloat16** - - Bfloat16 is a commonly used half-precision type. Through less precision per parameter and activations, we can save significant time and memory in computation. 
- -* **Torch.compile** - - torch.compile is a PyTorch function introduced since PyTorch 2.0 that aims to solve the problem of accurate graph capturing in PyTorch and ultimately enable developers to run their PyTorch programs faster. - -* **Scaled Dot Product Attention (SDPA)** - - Scaled Dot-Product Attention (SDPA) is a crucial mechanism in transformer models. PyTorch offers a fused implementation that significantly outperforms a naive approaches. For Segment Anything Fast, we convert the attention mask from bfloat16 to float32 in a block-wise manner. This method not only reduces peak memory usage, making it ideal for systems with limited memory resources, but also enhances performance. - - -### Diffusion Fast - -Diffusion Fast offers a simple and efficient PyTorch native acceleration for text-to-image diffusion models. The following are optimization strategies used to boost performance on PyTorch CPU: - - - -* **BFloat16** - - Bfloat16 is a commonly used half-precision type. Through less precision per parameter and activations, we can save significant time and memory in computation. - -* **Torch.compile** - - torch.compile is a PyTorch function introduced since PyTorch 2.0 that aims to solve the problem of accurate graph capturing in PyTorch and ultimately enable software engineers to run their PyTorch programs faster. - -* **Scaled Dot Product Attention (SDPA)** - - SDPA is a key mechanism used in transformer models, PyTorch provides a fused implementation to show large performance benefits over a naive implementation. - - -## Model Usage on Native PyTorch CPU - - -### [GPTFast](https://github.com/pytorch-labs/gpt-fast) - -To launch WOQ in GPTFast, first quantize the model weights. For example, to quantize with int4 and group size of 32: - -``` -python quantize.py --checkpoint_path checkpoints/$MODEL_REPO/model.pth --mode int4 –group size 32 -``` - -Then run generation by passing the int4 checkpoint to generate.py - -``` -python generate.py --checkpoint_path checkpoints/$MODEL_REPO/model_int4.g32.pth --compile --device $DEVICE -``` - -To use CPU backend in GPTFast, simply switch DEVICE variable from cuda to CPU. - -### [Segment Anything Fast](https://github.com/pytorch-labs/segment-anything-fast) - -``` -cd experiments - -export SEGMENT_ANYTHING_FAST_USE_FLASH_4=0 - -python run_experiments.py 16 vit_b <pytorch_github> <segment-anything_github> <path_to_experiments_data> --run-experiments --num-workers 32 --device cpu - -python run_experiments.py 16 vit_h <pytorch_github> <segment-anything_github> <path_to_experiments_data> --run-experiments --num-workers 32 --device cpu -``` - -### [Diffusion Fast](https://github.com/huggingface/diffusion-fast) - -``` -python run_benchmark.py --compile_unet --compile_vae --device=cpu -``` - -## Performance Evaluation - -### GPTFast - -We ran llama-2-7b-chat model based on [test branch](https://github.com/yanbing-j/gpt-fast/tree/yanbing/int4pack_mm) and the above hardware configuration on PyTorch. After applying the following steps, we saw a 3.8x boost compared to the baseline in eager mode: - - - -* Use `torch.compile` to automatically fuse elementwise operators. -* Reduce memory footprint with WOQ-int8. -* Further reduce memory footprint with WOQ-int4. -* Use AVX512 which enables faster de-quant in micro kernels. - - -![bar chart](/assets/images/genai-acceleration-intel-xeon/fg4.png){:style="width:100%"} - - -Figure 4. 
GPTFast Performance speedup in Llama2-7b-chat - -### Segment Anything Fast - -We ran Segment Anything Fast on the above hardware configuration on PyTorch and achieved a performance speedup of BFloat16 with torch.compile and SDPA compared with FP32 as shown in Figure 5. The speedup ratio against FP32 can achieve 2.91x in vit_b, and 3.95x in vit_h. - - -![bar chart](/assets/images/genai-acceleration-intel-xeon/fg5.png){:style="width:100%"} - - - -Figure 5. Segment Anything Fast Performance speedup in vit_b/vit_h - -### Diffusion Fast - -We ran Diffusion Fast on the above hardware configuration on PyTorch and achieved a performance speedup of BFloat16 with torch.compile and SDPA compared with FP32 as shown in Figure 6. The speedup ratio against FP32 can achieve 3.91x in Stable Diffusion XL (SDXL). - -![bar chart](/assets/images/genai-acceleration-intel-xeon/fg6.png){:style="width:100%"} - - - -Figure 6. Diffusion Fast Performance speedup in Stable Diffusion XL - -## Conclusion and Future Work - -In this blog, we introduced software optimizations for weight-only quantization, torch.compile, and SDPA, demonstrating how we can accelerate text generation with native PyTorch on CPU. Further improvements are expected with the support of the AMX-BF16 instruction set and the optimization of dynamic int8 quantization using torchao on CPU. We will continue to extend our software optimization efforts to a broader scope. - -## Acknowledgments - -The results presented in this blog are a joint effort between Meta and the Intel PyTorch Team. Special thanks to Michael Gschwind from Meta who spent precious time providing substantial assistance. Together we took one more step on the path to improve the PyTorch CPU ecosystem. - -## Related Blogs - -Part 1: How to accelerate [Segment Anything over 8x](https://pytorch.org/blog/accelerating-generative-ai/) with Segment Anything Fast. - -Part 2: How to accelerate [Llama-7B by almost 10x](https://pytorch.org/blog/accelerating-generative-ai-2/) with help of GPTFast. - -Part 3: How to accelerate [text-to-image diffusion models up to 3x](https://pytorch.org/blog/accelerating-generative-ai-3/) with Diffusion Fast. - -Part 4: How to speed up FAIR’s [Seamless M4T-v2 model by 2.7x](https://pytorch.org/blog/accelerating-generative-ai-4/). - -## Product and Performance Information - -Figure 4: Intel Xeon Scalable Processors: Measurement on 4th Gen Intel Xeon Scalable processor using: 2x Intel(R) Xeon(R) Platinum 8480+, 56cores, HT On, Turbo On, NUMA 2, Integrated Accelerators Available [used]: DLB 2 [0], DSA 2 [0], IAA 2 [0], QAT 2 [0], Total Memory 512GB (16x32GB DDR5 4800 MT/s [4800 MT/s]), BIOS 3B07.TEL2P1, microcode 0x2b000590, Samsung SSD 970 EVO Plus 2TB, CentOS Stream 9, 5.14.0-437.el9.x86_64, run single socket (1 instances in total with: 56 cores per instance, Batch Size 1 per instance), Models run with PyTorch 2.5 wheel. Test by Intel on 10/15/24. - -Figure 5: Intel Xeon Scalable Processors: Measurement on 4th Gen Intel Xeon Scalable processor using: 2x Intel(R) Xeon(R) Platinum 8480+, 56cores, HT On, Turbo On, NUMA 2, Integrated Accelerators Available [used]: DLB 2 [0], DSA 2 [0], IAA 2 [0], QAT 2 [0], Total Memory 512GB (16x32GB DDR5 4800 MT/s [4800 MT/s]), BIOS 3B07.TEL2P1, microcode 0x2b000590, Samsung SSD 970 EVO Plus 2TB, CentOS Stream 9, 5.14.0-437.el9.x86_64, run single socket (1 instances in total with: 56 cores per instance, Batch Size 16 per instance), Models run with PyTorch 2.5 wheel. Test by Intel on 10/15/24. 
- -Figure 6: Intel Xeon Scalable Processors: Measurement on 4th Gen Intel Xeon Scalable processor using: 2x Intel(R) Xeon(R) Platinum 8480+, 56cores, HT On, Turbo On, NUMA 2, Integrated Accelerators Available [used]: DLB 2 [0], DSA 2 [0], IAA 2 [0], QAT 2 [0], Total Memory 512GB (16x32GB DDR5 4800 MT/s [4800 MT/s]), BIOS 3B07.TEL2P1, microcode 0x2b000590, Samsung SSD 970 EVO Plus 2TB, CentOS Stream 9, 5.14.0-437.el9.x86_64, run single socket (1 instances in total with: 56 cores per instance, Batch Size 1 per instance), Models run with PyTorch 2.5 wheel. Test by Intel on 10/15/24. - -## Notices and Disclaimers - -Performance varies by use, configuration and other factors. Learn more on the Performance Index site. Performance results are based on testing as of dates shown in configurations and may not reflect all publicly available updates.  See backup for configuration details.  No product or component can be absolutely secure. Your costs and results may vary. Intel technologies may require enabled hardware, software or service activation. - -Intel Corporation. Intel, the Intel logo, and other Intel marks are trademarks of Intel Corporation or its subsidiaries. Other names and brands may be claimed as the property of others. - -## AI disclaimer: - -AI features may require software purchase, subscription or enablement by a software or platform provider, or may have specific configuration or compatibility requirements. Details at [www.intel.com/AIPC](https://www.intel.com/AIPC). Results may vary. \ No newline at end of file diff --git a/_posts/2025-01-15-mlops-workflow.md b/_posts/2025-01-15-mlops-workflow.md deleted file mode 100644 index cc04fbbdc5c0..000000000000 --- a/_posts/2025-01-15-mlops-workflow.md +++ /dev/null @@ -1,84 +0,0 @@ ---- -layout: blog_detail -title: "MLOps Workflow Simplified for PyTorch with Arm and GitHub Collaboration" -author: Eric Sondhi, Arm -hidden: true ---- - -PyTorch is one of the most widely used and most powerful deep learning frameworks for training and deploying complex neural networks. It has never been easier to train and deploy AI applications, and low-cost, high-performance, energy-efficient hardware, tools, and technology for creating optimized workflows are more accessible than ever. But data science, machine learning, and devops can be deep topics unto themselves, and it can be overwhelming for developers with one specialty to see how they all come together in the real world, or even to know where to get started. - -To that end, we at Arm have collaborated with our friends at GitHub to decompose the basic elements of real world MLOps pipelines that use PyTorch models and create a simplified workflow and MLOps tutorial that anyone with a GitHub and a Docker Hub account can leverage. - -## MLOps Overview - -The software development lifecycle for machine learning applications typically starts from training data, which is used to train sophisticated neural networks (NNs) that are optimized, integrated into software images, and then deployed onto compute clusters and even fleets of devices in the field. These devices are typically continuously collecting data and are managed by cloud services, which actively monitor performance of the ML algorithm(s) and feedback data for retraining in the next iteration of the lifecycle – enabling continuous improvement of the algorithms, as well as supporting deployment of new AI features. 
- -![process flow chart](/assets/images/mlops-workflow/fg1.png){:style="width:100%"} - -**Example of a typical ML software development lifecycle.** - -Scott Arbeit from GitHub recently published an [excellent blog](https://github.blog/enterprise-software/ci-cd/streamlining-your-mlops-pipeline-with-github-actions-and-arm64-runners/) that highlights the importance of MLOps in machine learning and describes automation via simplified GitHub actions for several key tasks including: - - - -* **Data preprocessing**: cleaning and preparation of data for training. -* **Model training and validation**: automatic execution of training scripts when new data is pushed or when changes are made to the model code. -* **Deployment**: automatic packaging and deployment of models to production environments upon successful training and validation. -* **Monitoring and alerts:** workflows to monitor model performance and send alerts if certain thresholds are breached. - -The article also describes a conceptual efficient MLOps pipeline that takes advantage of new, low-cost Arm Runners natively integrated into GitHub Actions to train and validate PyTorch models. It also uses containerization for consistent deployment across different environments. - -Our team at Arm put GitHub’s ideas and conceptual workflow into practice and created a tutorial to help you get started today. - -## Optimizing Your PyTorch MLOps Workflow - -A new [Arm Learning Path](https://learn.arm.com/) unpacks each of the key phases described in Scott’s blog, and demonstrates each key task in detail, providing prescriptive instructions and code examples to leverage several aspects of the PyTorch framework to implement each phase. - - -![process flow chart](/assets/images/mlops-workflow/fg2.png){:style="width:100%"} - -**Key ML tasks to setup and automate with GitHub Actions.** - -With this learning path you will be able to take advantage of the following strategies with a real-world object detection use case to make your own streamlined MLOps workflow: - - - -* **Containerization:** Package your PyTorch model and its dependencies into a Docker container to help ensure consistent performance across different environments. -* **Efficient Data Loading:** Optimize data loading pipelines to help minimize I/O bottlenecks and maximize GPU utilization. -* **Model Optimization:** Explore techniques like model quantization, pruning, and knowledge distillation to help reduce model size and improve inference speed. -* **Leverage PyTorch's Ecosystem:** Utilize libraries like TorchVision to help streamline common deep learning tasks. -* **Monitor and Profile:** Monitor resource utilization and identify potential bottlenecks to further optimize your workflow. - -## An End-to-End MLOps Workflow - -The best part of this learning path is not just that it takes you through each task in detail, but it brings it all together into a unified automated workflow. - -With GitHub Actions, you can build an end-to-end custom MLOPs workflow that combines and automates the individual workflows for each ML task. To demonstrate this, the repository contains a workflow in a boilerplate .yml file that automates the individual steps. - -You can run an MLOps workflow using GitHub Actions natively for managing all the steps in your ML application’s lifecycle. - - -![process flow chart](/assets/images/mlops-workflow/fg3.png){:style="width:100%"} - - -**A successful run of this MLOps workflow in GitHub Actions.** - -## Try It Yourself! 
- -Our Arm team has battle-tested this tutorial in the field and delivered the tutorial as a workshop at GitHub Universe 2024 earlier this year. Now it’s time for you to take it for a spin and get hands-on with PyTorch and MLOps. - -Try the Arm Learning Path [Here](https://learn.arm.com/learning-paths/servers-and-cloud-computing/gh-runners/)! - -By the end of this tutorial, you can: - - - -* Set up a new GitHub Arm-runner to natively build an arm64 image to take advantage of the lowest-cost, most power efficient compute available. -* Train and test a PyTorch ML model with the German Traffic Sign Recognition Benchmark (GTSRB) dataset. -* Compare the performance of two trained PyTorch ML models; one model compiled with OpenBLAS (Open Basic Linear Algebra Subprograms Library) and oneDNN (Deep Neural Network Library), and the other model compiled with Arm Compute Library (ACL). -* Containerize a ML model and push the container to DockerHub. -* Automate each task into a single MLOps pipeline Using GitHub Actions. - -Combining the power of PyTorch with the simplicity of GitHub Actions and the efficiency of native Arm Runners significantly helps you accelerate your deep learning development and deployment processes. Following the best practices outlined in this blog post helps you achieve optimal performance and cost-effectiveness for your PyTorch projects. - -We’d love to see what you create based on this example. If you have created your own Arm Learning Path, you are invited to [share it here](https://learn.arm.com/learning-paths/cross-platform/_example-learning-path/). \ No newline at end of file diff --git a/_posts/2025-01-21-accelerating-llm-inference.md b/_posts/2025-01-21-accelerating-llm-inference.md deleted file mode 100644 index e35c661eb071..000000000000 --- a/_posts/2025-01-21-accelerating-llm-inference.md +++ /dev/null @@ -1,285 +0,0 @@ ---- -layout: blog_detail -title: "Accelerating LLM Inference with GemLite, TorchAO and SGLang" -author: "Teams at PyTorch, Mobius Labs and SGLang" ---- - -Large Language Models (LLMs) are typically very resource-intensive, requiring significant amounts of memory, compute and power to operate effectively. Quantization provides a solution by reducing weights and activations from 16 bit floats to lower bitrates (e.g., 8 bit, 4 bit, 2 bit), achieving significant speedup and memory savings and also enables support for larger batch sizes. 
- -Existing solutions for low precision inference work well for small batch sizes, but suffer from following issues: - -* Performance drops when we increase the batch size -* Restrictions on types of quantization, for example, some kernels only support symmetric quantization that could have implications on accuracy of the model at lower bits -* Interplay between quantization, serialization, and tensor parallelism (TP) makes it difficult to load quantized models and requires changes to user models - -To address these challenges, we created an end-to-end, performant, modular and extensible low-precision inference solution integrating the following libraries: - -* [GemLite](https://github.com/mobiusml/gemlite), a Triton kernel library, tackles the performance limitations of large batch sizes and restrictions on the types of quantization -* [TorchAO](https://github.com/pytorch/ao), a PyTorch-native library, provides a streamlined experience for quantization, sparsity, and tensor parallelism (with DTensor) -* [SGLang](https://github.com/sgl-project/sglang), a fast, efficient and hackable serving framework for Large Language Model (LLM) and Vision Language Models (VLM) with extensive model support - -If you’re interested in trying this out in SGLang, please follow these [repro instructions](#repro-instructions). For the rest of the blog, we’ll walk through relevant details for GemLite, TorchAO and SGlang both in terms of the design of the library itself and integration in addressing the problems we mentioned above, in the end we’ll present the benchmarking results on Llama 3.1-8B model across different batch sizes and tensor parallel sizes. - -## 1. Teaser of Results - -Following is a summary of the results in 8xH100 machine on Llama 3.1-8B for decode. For all experiments, the baseline is bfloat16 torch.compiled model: - - - - - - - - - - - - - - - - - - - - - - - - - - - -
| | bfloat16 w/ torch.compile | int4 weight only quantization, group size 64 | float8 per row dynamic quantization |
| --- | --- | --- | --- |
| Batch size 1, TP size 1 | 131 tokens/sec | 255 tokens/sec (1.95x speedup) | 166 tokens/sec (1.27x speedup) |
| Batch size 32, TP size 1 | 2799 tokens/sec | 3241 tokens/sec (1.16x speedup) | 3586 tokens/sec (1.28x speedup) |
| Batch size 32, TP size 4 | 5575 tokens/sec | 6334 tokens/sec (1.14x speedup) | 6159 tokens/sec (1.10x speedup) |
        - - -Our solution supports NVIDIA GPUs, including H100 and A100, and achieves speedup over the compiled bfloat16 baseline across batch sizes and TP sizes for both int4 weight only (from 1.14x to 1.95x) and float8 dynamic quantization (from 1.10x to 1.28x). Note that quantization may have a small impact on accuracy, which is outside the scope of this blogpost. Our int4 weight-only quantization is compatible with accuracy preserving techniques like HQQ. Please refer to [TorchAO's README](https://github.com/pytorch/ao/blob/main/torchao/quantization/README.md#cuda-backend-1), [this benchmark](https://huggingface.co/mobiuslabsgmbh/Llama-3.1-8b-instruct_4bitgs64_hqq_calib), and [this blog](https://neuralmagic.com/blog/we-ran-over-half-a-million-evaluations-on-quantized-llms-heres-what-we-found/) for more information. - - -## 2. GemLite: Kernel Development - -The kernels were developed as part of GemLite, a project dedicated to optimizing low-bit matrix multiplication kernels. Developed using Triton, GemLite provides highly flexible and performant solutions across various activations, bitrates and hardware. In a nutshell, the kernels offer: - - - -* Support for various activation data types: fp16, int8 and fp8 -* Compatibility: works seamlessly with non-packed (e.g., int8, fp8) and packed formats (e.g., uint4, uint2, uint1) -* Performance Optimization: includes optimized kernels and autotuning tools to achieve high performance across different hardware and batch sizes -* Integration: Compatible with torch.compile and CUDA graphs, ensuring support for advanced features like tensor parallelism - -### Kernel Selection - -Optimizing kernel selection for large language model (LLM) generation requires addressing the distinct needs of different batch sizes. LLM workloads involve a mix of compute-bound and memory-bound iterations: smaller batch sizes are memory-bound, while larger batch sizes become compute-bound. GemLite kernels are designed to adapt to these varying demands, ensuring optimal execution for each scenario. - -In memory-bound scenarios, where data transfer is the limiting factor, the processor often waits for data to be fetched, leading to underutilized computational resources. For batch size = 1, a GEMV kernel performs best, whereas for larger batch sizes, GEMM kernels are more efficient. For batch sizes between 2 and 64, when matrices are "skinny," a GEMM-SPLITK kernel is used to enable better GPU utilization ([arXiv](https://arxiv.org/abs/2402.00025)). - -GemLite includes the following kernels optimized for each of these scenarios: - -### Single Sample Inference - -For single-sample inferences, we use GEMV kernels. However, asymmetric quantization methods require additional metadata, such as scales and zero points, to be loaded for each block. This can lead to increased memory transfer, so careful handling is essential. - -Specifically, for packed data, our experiments indicate that loading scales and zero points only once per two consecutive blocks minimizes redundant operations. Since these blocks share the same metadata, this approach results in: - -* 5–8% end-to-end inference speedup compared to the default GEMV kernel -* 30–40% improvement over the traditional Split-K method - -This new kernel/algorithm, GEMV_REVSPLITK, is available [here](https://github.com/mobiusml/gemlite/blob/master/gemlite/triton_kernels/gemv_revsplitK_A16fWnO16f_int32packing.py). 
- -For non-packed data, the [GEMV_SPLITK](https://github.com/mobiusml/gemlite/blob/master/gemlite/triton_kernels/gemv_splitK_A16fWnO16f_int32packing.py) algorithm is employed. This algorithm iterates over the k-dimension to compute the dot product without relying on Triton's tl.dot. - -### Batched Inference - -For moderate batch sizes, we use the GEMM-based Split-K method ([arXiv](https://arxiv.org/abs/2402.00025)) which splits the k-dimension (weight rows) into multiple jobs. The optimal-split SPLIT_K parameter is found by autotuning values ranging from 1 to 16. Setting SPLIT_K=1 enables a fallback implementation to a GEMM kernel, allowing the same kernel code to be used for compute-bound batch sizes starting from 32 and 64, depending on the matrix shape and the device. - -### Maximizing High Performance: Key Implementation Insights - -Various implementation details must be carefully addressed to achieve high performance. Following are some of the key aspects we focused on to ensure high performance: - -1. Autotuning for Performance - - - [Autotuning](https://triton-lang.org/main/python-api/generated/triton.autotune.html) is critical for achieving optimal kernel performance. Since this process can be time-intensive, GemLite provides tools to automatically save and load autotuning results for all kernels. This ensures that the autotuning process is performed only once per GPU device, minimizing runtime, reducing repetitive overhead, and maintaining consistent performance across runs. - -2. Ensuring Kernel Correctness - - - Ensuring kernel correctness across different quantization and configuration settings is essential. Triton’s [early configuration pruning](https://triton-lang.org/main/python-api/generated/triton.autotune.html) plays a key role in this process. For example, during Split-K tuning, configurations are selected only if K is divisible by BLOCK_SIZE_K × SPLIT_K,, and BLOCKS_SIZE_K is further pruned based on the group-size value. This approach ensures both efficiency and correctness in kernel operation. - -3. Overcoming Bit-Unpacking Bottlenecks - - - When deploying on data center-grade GPUs like NVIDIA’s A100 and H100, performance bottlenecks related to bit-unpacking were observed. To mitigate these, various bit-packing configurations were explored, including packing along columns versus rows and experimenting with different bit-packing widths (e.g., 8-bit vs. 32-bit). Notably, transitioning from 32-bit to 8-bit packing delivered performance improvements of up to 18% on the A100 and 6% on the H100 - -4. torch.compile compatibility - - - To ensure seamless compatibility with PyTorch’s torch.compile, kernel calls are wrapped in a [custom_op](https://pytorch.org/tutorials/advanced/python_custom_ops.html). This integration allows advanced features such as pre-hooks and early configuration pruning to function correctly, delivering accurate results without sacrificing performance. While some of these [features](https://github.com/pytorch/pytorch/issues/139059) are not yet fully supported in PyTorch, the custom_op implementation effectively bridges the gap, ensuring smooth integration and high performance. - - -## 3. TorchAO - -TorchAO is a PyTorch native quantization and sparsity library for both training and inference, featuring simple user APIs to train, quantize and deploy low precision models, and composability with other PyTorch features like distributed inference and torch.compile. - -PyTorch does not support low precision dtypes or different packing formats by default. 
With Tensor Subclass, we extend PyTorch native Tensor abstractions and model quantization as dtype conversion, while different packing formats for custom kernels are handled through layouts. For example, we support quantized linear operations with int4 weights, packed in a Tensor Core friendly layout, with tinygemm or GemLite kernel implementations. More details can be found [here](https://pytorch.org/ao/stable/contributor_guide.html). - - -![flow diagram](/assets/images/accelerating-llm-inference/fg1.png){:style="width:100%"} - - -Apart from more PyTorch native abstractions for developers, we want to highlight two benefits of this design for modeling users. - -1. [Serialization](https://pytorch.org/ao/stable/serialization.html): Save and load quantized weights into a state_dict just like a floating point model, eliminating the need to transform floating point model to quantized model before the quantized weights are loaded. This reduces friction of distributing and deploying quantized models. - -2. [Composability](#torch-tensor-parallel): Seamless integration with downstream features like tensor parallel, allowing users to focus on modeling without worrying about compatibility with tensor parallel, torch.compile, and other PyTorch features. Since these features are implemented with Tensor level abstraction, users can quantize and do distributed inference with no model changes most of the time. - - -### GemLite Kernel Integration - -To achieve the aforementioned benefits for the GemLite kernel, we integrated GemLite into TorchAO. This integration takes advantage of GemLite’s wide support and flexibility to allow for weight only quantization at 4 and 8 bits, under asymmetric and symmetric quantization schemes, 32 and 8 bit packing sizes, as well as grouped and ungrouped quantization. We enable this integration via the `quantize_` api which can be used alongside the GemLite constructor as follows - - -``` -quantize_(model, gemlite_uintx_weight_only(group_size, bit_width, packing_bitwidth)) -``` - - -The primary difficulty in creating this integration was making sure that the TorchAO composability guarantees were satisfied for the entire breadth of GemLite quantization kernel options. While the primary integration was relatively straight forward, making sure every different quantization type and their associated kernels worked well with tensor parallel was non-trivial. - - -### Torch Tensor Parallel {#torch-tensor-parallel} - -Tensor Parallelism is an effective way to speed up LLM inference. TP shards large matrices of linear or embedding modules onto multiple devices, typically in column-wise or row-wise styles. As the weight matrix gets distributed, computation is decomposed too. For example, the column-wise pattern below enables simultaneous matrix-vector multiply on four devices: - -![equation](/assets/images/accelerating-llm-inference/fg5.jpg){:style="max-width:300px; width:100%; display: block; margin-left: auto; margin-right: auto"} - - -PyTorch implements TP by converting a regular tensor (e.g. matrix *A*) into a *DTensor*: - -``` -dtensor = _shard_tensor(mA, device_mesh, (Shard(0),)) -``` - -Since DTensor stores meta information about the sharding, it knows how to reconstruct the full result when needed. Take Transformers’ feedforward module for example, as the down projection and up projection use column-wise and row-wise sharding respectively, DTensor will automatically perform an all-reduce on the ranks’ results as they move into the next operation. 
Such automation allows model authors to focus on computation without worrying about the communication needed for distributed execution. - -**Tensor Parallel and Quantization Order** - -Since both DTensor and quantization are tensor-level transformations, the application order matters in ensuring a workflow can generally work on different setups. We have two observations: (i) checkpoints are typically saved in quantized formats, to save the quantization overhead before each run; and (ii) TP may run on a different number of devices, depending on resource constraints or service agreements. As such, we first apply quantization to the original tensor, save it to disk depending on whether a reuse is desired. At service launch time, we load the quantized checkpoint and shard the tensors into DTensors on-the-fly as we load them into the model. - -**Tensor Parallel Support in TorchAO** - -Since we quantize the model first then distribute the Tensor, we’ll have `DTensor(QuantizedTensor(weight))`, where `DTensor` means a distributed Tensor class and `QuantizedTensor` means a quantized tensor class in TorchAO. `QuantizedTensor` should support the operators called when constructing a `DTensor`, including slice and view ops. To make sure the overall execution is efficient, the packed weight that’s sliced in the dimension 0 and 1 should match the result of first slice the unpacked weight then pack (pack and slice operation should commute), otherwise the packing format is not compatible with tensor parallelism. - - -## 4. SGLang - -SGLang is a fast serving framework for large language models and vision language models. It is known for its almost [zero-overhead batch scheduler](https://lmsys.org/blog/2024-12-04-sglang-v0-4/) and fast [constrained decoding](https://lmsys.org/blog/2024-02-05-compressed-fsm/). It is mainly implemented in Python, lightweight, and easy to hack. It is also one of the first frameworks to integrate torch.compile. - -**TorchAO integration in SGLang** - -We integrated `quantize_` API for applying a specific type of quantization to model into SGLang that supports int4 weight only quantization (both tinygemm and GemLite version), float8 dynamic quantization and a few other types of quantization so far. Users can enable quantization by adding `--torchao-config` argument to the benchmarking script. The currently enabled options also support tensor parallelism through composition with DTensor that is enabled with `--tp-size` option. - -**Torch Native Tensor Parallel Support in SGLang** - -Existing model definitions in SGLang use special linear modules that are coupled with tensor parallelism style, for example: `MergedColumnParallelLinear`, `QKVParallelLinear` and `RowParallelLinear`. To decouple the model definition and tensor parallelization style, we defined a [pytorch native model](https://github.com/sgl-project/sglang/blob/main/python/sglang/srt/models/torch_native_llama.py) that uses plain `nn.Linear` module from PyTorch and rely on PyTorch tensor parallelism APIs for parallelization and torch.compile for speedup. At related module hierarchies, we add a dictionary describing how a submodule should be parallelized. For example, in `class LlamaAttention`, we define: - -``` -_tp_plan = { - "qkv_proj": "Colwise_Sharded", - "o_proj": "Rowwise", -} -``` - -where `"qkv_proj" `and `"o_proj" `are the FQNs of the `wqkv` and `wo` projections, and the values are their TP styles. - -We then define a TP engine in `model_parallel.py`. 
It searches for `_tp_plan `recursively within the model, and applies the indicated TP styles to the submodules using PyTorch’s [parallelize_module](https://pytorch.org/docs/stable/distributed.tensor.parallel.html#torch.distributed.tensor.parallel.parallelize_module) API. - - -## 5. Results - -The evaluation focused on two popular quantization techniques for H100 machines: int4 weight-only quantization and float8 dynamic quantization. These methods were chosen due to their widespread use in optimizing memory efficiency and computational performance on H100 machines, making them ideal candidates for benchmarking against various workloads. - - - -* **int4 Weight-Only Quantization**: This method significantly reduces memory footprint and accelerates decode for memory-bound workloads, with minimal impact on performance in compute-intensive scenarios like prefill or larger batch sizes. We present results for bf16, GemLite, and tinygemm kernels below, across various batch sizes and tensor parallel configurations -* **float8 Dynamic Quantization**: While offering less memory savings, this method often provides higher accuracy and balanced speedups for both memory-bound and compute-bound tasks. With Hopper-grade hardware and native fp8 support, the efficient cutlass/cuBLAS kernels used by AO contribute to a significant speedup - -The graphs below show the decode tokens/sec for different tp sizes, each graph shows the results across different batch sizes and for different types of quantization: - - - -* BF16 is our bfloat16, torch.compile’d baseline -* tinygemm-4-64 is using `int4_weight_only` quantization in TorchAO, it’s a 4 bit groupwise quantization with group size of 64, using tinygemm kernel -* gemlite-4-64 is using `gemlite_uintx_weight_only `quantization in TorchAO, 4 means 4 bit, and 64 is also the group size, using GemLite kernel -* fp8dq-per_row is using `float8_dynamic_activation_float8_weight` quantization in TorchAO, both activation and weights are quantized with per row scales - -![bar chart](/assets/images/accelerating-llm-inference/fg2.png){:style="width:100%"} - -![bar chart](/assets/images/accelerating-llm-inference/fg3.png){:style="width:100%"} - -![bar chart](/assets/images/accelerating-llm-inference/fg4.png){:style="width:100%"} - - -For int4 weight-only quantization, at batch size 1, the tinygemm kernel achieved the best performance. However, its efficiency declined with increasing batch sizes. Conversely, GemLite effectively bridged this gap, delivering superior performance at larger batch sizes. GemLite also achieved a 9–10x speedup during the prefill phase compared to tinygemm, despite ongoing performance optimizations constrained by Triton. - -Float8 dynamic quantization showed 1.3x speedup over bfloat16 consistently with tensor parallel size 1 across different batch sizes and 1.1x to 1.2x speedup in larger tensor parallel sizes. As the tensor parallel size increases, the overall speedup decreases, which is expected due to the reduction in matmul size. Note that we do expect to get speedup for prefill as well, but since we rely on torch.compile for speedup and prefill compile is not enabled in SGLang yet, we will leave this for future work. - - -### Repro Instructions {#repro-instructions} - -We conducted benchmarks on an 8xH100 machine using GemLite 0.4.1, SGLang built from commit feb2b76, TorchAO nightly 0.8.0.dev20241223+cu124, and PyTorch 2.5.1. The Llama-3.1 Instruct models were chosen as the architecture for evaluation. 
- -``` -BATCH_SIZE=16 -# Note: gemlite is only compatible with float16 -# while int4wo-64 (tinygemm-4-64 as shown in the graph) and fp8dq-per_row should use bfloat16 -DTYPE=float16 -# int4wo-64, fp8dq-per_tensor -TORCHAO_CONFIG=gemlite-4-64 -TP_SIZE=2 -# Decode performance -python3 -m sglang.bench_offline_throughput --model-path meta-llama/Llama-3.1-8B-Instruct --json-model-override-args '{"architectures": ["TorchNativeLlamaForCausalLM"]}' --dataset-name random --random-input 1024 --random-output 512 --random-range 1 --num-prompts $BATCH_SIZE --enable-torch-compile --dtype $DTYPE --torchao-config $TORCHAO_CONFIG --tp-size $TP_SIZE - -# Example output -# Benchmark... -# [2024-12-20 12:42:16 TP0] Prefill batch. #new-seq: 2, #new-token: 2046, #cached-token: 4, cache hit rate: \0.06%, token usage: 0.00, #running-req: 0, #queue-req: 0 -# ... -# [2024-12-20 12:45:35 TP0] Decode batch. #running-req: 16, #token: 16763, token usage: 0.01, gen throughput\ (token/s): 2.20, #queue-req: 0 -# [2024-12-20 12:45:38 TP0] Decode batch. #running-req: 16, #token: 24443, token usage: 0.02, gen throughput\ (token/s): 2739.89, #queue-req: 0 - -# We reported the last throughput (token/s) as the performance for decode -``` - -## Conclusion - -With performant and extensible kernels from [GemLite](https://github.com/mobiusml/gemlite), PyTorch native architecture optimization library [TorchAO](https://github.com/pytorch/ao) and high performance inference framework [SGLang](https://github.com/sgl-project/sglang), we showcased fast end-to-end quantized inference for both int4 and float8 across different batch sizes and tensor parallel sizes with simple and composable user APIs to reduce the resource requirement for LLMs. This integration is our first step towards meeting the needs of fast inference across different models, workloads, precisions and hardwares and we are looking forward to continuing advancing the state of the art for end to end mixed and low precision LLM inference. - -Our immediate future work focuses on the following: - - - -* Exploring diverse combinations of weight and activation quantization to strike the best balance between speed and accuracy -* Extending support to additional GPU architectures to broaden accessibility -* Enhancing compatibility with MoE models to address growing demands in scalable inference -* Allow for easy integration of fast custom kernels in TorchAO so that they can be easily leveraged by SGLang and other inference frameworks -* While we didn’t measure accuracy impact in this blogpost, we can develop auto quantization tool in TorchAO to allow users to trade off between performance and accuracy -* Better integration with tensor parallelism in SGLang to support running larger models -* Enable torch.compile for prefill phase in SGLang - -We also invite the community to actively test, provide feedback, and contribute to shaping the future of fast and efficient LLM inference. \ No newline at end of file diff --git a/_posts/2025-01-22-bringing-the-pytorch-community-together.md b/_posts/2025-01-22-bringing-the-pytorch-community-together.md deleted file mode 100644 index 41b8fe2a8562..000000000000 --- a/_posts/2025-01-22-bringing-the-pytorch-community-together.md +++ /dev/null @@ -1,134 +0,0 @@ ---- -layout: blog_detail -title: "Bringing the PyTorch Community Together" -author: "Team PyTorch" -hidden: true ---- - -As we step into a new year, it’s a great moment to reflect on the incredible community events that made 2024 a memorable year for the PyTorch Foundation. 
Global meetups, events, and conferences brought the community together to learn, connect, and grow. Here’s a quick recap of the year’s highlights and what to expect in 2025\. - -![PyTorch Seattle Meetup (May 23)](/assets/images/community-events-recap/fg5.jpg){:style="width:100%"} - -**PyTorch Seattle Meetup (May 23\)** - -We hosted a PyTorch Meetup in Seattle in May at the Meta Bellevue Office where Meta, Microsoft, and Google gave technical talks and about 60 attendees participated in discussion and networking. - -**PyTorch Docathon 2024 (June 4-20)** - -The PyTorch Docathon returned for its third edition, spanning over two weeks in June. This unique event focused on improving PyTorch’s documentation with contributions from community members worldwide. Documentation is the backbone of any successful open source project, and PyTorch’s Docathon fostered inclusivity and collaboration, making it easier for new users to adopt the framework and for experienced developers to maximize its potential. The 2024 Docathon resulted in more than 50 merged pull requests and was a testament to the collaborative spirit of the PyTorch community and its commitment to enhancing accessibility and usability. Watch the [PyTorch Docathon Kickoff](https://youtu.be/2D0aej50umA?feature=shared) on YouTube. - -![PyTorch Shanghai Meetup (August 15)](/assets/images/community-events-recap/fg3.png){:style="width:100%"} - -#### **PyTorch Shanghai Meetup (August 15\)** - -In August, the [PyTorch Shanghai Meetup](https://pytorch.org/blog/pytorch-shanghai-notes/) brought together developers, researchers, and enthusiasts in Shanghai, China. This event served as a platform for knowledge sharing, with engaging talks and networking opportunities. Highlights from the agenda included insights into PyTorch’s latest developments, community-led presentations showcasing innovative use cases, and networking sessions fostering collaboration among attendees. - -![PyTorch Conference 2024 (September 18-19)](/assets/images/community-events-recap/fg1.jpg){:style="width:100%"} - -#### **PyTorch Conference 2024 (September 18-19)** - -The PyTorch Conference in San Francisco was undoubtedly one of the year’s most significant events. This two-day gathering brought together top-tier researchers, developers, and academic communities, fostering collaboration and innovation in machine learning. - -![What Made It Special](/assets/images/community-events-recap/fg6.jpeg){:style="width:100%"} - -#### **What Made It Special:** - -* Keynote speeches from industry leaders and PyTorch maintainers. -* In-depth sessions covering PyTorch’s end-to-end machine learning capabilities. -* Hands-on workshops and breakout sessions. -* A vibrant expo area showcasing cutting-edge tools and applications. -* Startup Showcase where early-stage founders pitched their AI startups to a panel of top venture capitalists. -* DL Compiler Mini-Summit that took a deep dive into the advances in deep learning (DL) compilers that are transforming AI workloads. -* Fine-Tuning Mini-Summit that covered everything from memory efficiency, parameter-efficient fine-tuning and quantization to performance at scale and reproducible evaluations. -* Poster Session showcasing innovations in PyTorch, including model optimization, hardware integration, generative AI, quantization, and tools for enhanced performance and usability, with contributions from industry leaders. - -The conference’s focus on fostering collaboration underscored PyTorch’s role as a driving force in the open source ML community. 
Missed out? You can watch the [PyTorch Conference 2024 Playlist](https://youtube.com/playlist?list=PL_lsbAsL_o2B_znuvm-pDtV_cRhpqZb8l&si=mdoSkqMJYKRlzxlg) to catch any sessions you might have missed. - -![GPU MODE IRL Hackathon (September 21)](/assets/images/community-events-recap/fg4.jpg){:style="width:100%"} - -#### **GPU MODE IRL Hackathon (September 21\)** - -PyTorch sponsored this meetup in person in San Francisco where attendees made friends, watched keynotes, hacked all day, took breaks with afternoon talks, and then hacked all night. We heard about torchao, our new quantization and sparsity library, vLLM which deploys PyTorch models in production, llm.c, and more. Key takeaways included: GPU Mode IRL Hackathon 1st place winner was inspired by PyTorch FlexAttention to improve CUTLASS, NCCL in Triton would help us do distributed programming with a minimal NCCL reimplementation in pure Python, No libtorch pytorch binaries dramatically reduces binary sizes for on device deployments. - -![Consumer AI Edge Hackathon (November 22-23)](/assets/images/community-events-recap/fg8.png){:style="width:100%"} - -#### **Consumer AI Edge Hackathon (November 22-23)** - -The PyTorch team served as mentors and coaches in a Hackathon in Paris, co-sponsored by Hugging Face, Scaleway, and Entrepreneur First, challenging teams to create innovative consumer (B2C) applications leveraging Hugging Face, PyTorch and other open source on-device tools and models. 120+ people across 22 teams hacked for 2 days (and nights\!) building the future of AI-powered on-device solutions based on open source models and tools. Participants created innovative applications, powered by PyTorch, [ExecuTorch](https://github.com/pytorch/executorch/tree/main) and Hugging Face resources, such as an on-device yoga coach, a magical storytelling companion and a Kinect-like experience to mobile phones. The PyTorch team is planning similar events in other geographies in 2025 around innovative on-device AI applications. - -![PyTorch Korea User Group Meetup (November 30)](/assets/images/community-events-recap/fg9.png){:style="width:100%"} - -#### **PyTorch Korea User Group Meetup (November 30\)** - -The PyTorch Korea User Group, founded in 2018, is a community dedicated to introducing PyTorch to Korean-speaking users and growing together. The group began by translating PyTorch 0.3 tutorials into Korean and has since supported PyTorch's growth in Korea. The group focuses on three primary activities: - -1. Sharing knowledge for PyTorch learning and application, -2. Sharing insights and experiences in the field of artificial intelligence, and -3. Fostering growth through online and offline networking. - -The PyTorch Korea User Group reaches tens of thousands of Korean AI developers every month. If you're interested in their activities, check out these links: - -* [PyTorch Korea User Group](https://pytorch.kr) -* [PyTorch Korean Tutorials](https://tutorials.pytorch.kr) -* [PyTorch Korean Community](https://discuss.pytorch.kr) -* [GitHub Repository](https://github.com/PyTorchKorea) -* [YouTube Channel](https://youtube.com/@pytorchkr) - -![PyTorch Korea User Group 2025 Events Overview](/assets/images/community-events-recap/fg2.jpeg){:style="width:100%"} - -The PyTorch Korea User Group has planned three major activities for the year: - -1. **PyTorch CoreSIG** - Since December 2024, this weekly online event has been held every Wednesday afternoon. 
Led by Kim Hong-Seok, CSO of Rebellions (a PyTorch member company), it provides in-depth knowledge and experience regarding PyTorch internals. Approximately 150 Korean developers participate weekly, reflecting growing interest in PyTorch Core development in Korea. -2. **Offline Meetup** - These meetups provide opportunities to share insights and experiences in PyTorch and artificial intelligence, along with networking. Around 3–4 sessions are planned for this year, focusing on key topics in PyTorch and AI. -3. **Online Community Engagement** - This activity involves sharing and discussing various projects and papers in the AI field. For more information, visit: [https://discuss.pytorch.kr](https://discuss.pytorch.kr). - -#### **Open Source AI Night at NeurIPS 2024 (December 10\)** - -The PyTorch Foundation co-hosted a social event at NeurIPS along with The Fin AI and Open Finance Foundation that featured engaging discussions on open source AI and applications in finance. - -![PyTorch Webinars](/assets/images/community-events-recap/fg7.jpeg){:style="width:100%"} - -**PyTorch Webinars** - -Throughout 2024, PyTorch hosted the following virtual webinars: - -Expert Exchanges: - -* [How does batching work on modern CPUs?](https://www.youtube.com/live/HTcnp9NEHGY?feature=shared) -* [DistServe: disaggregating prefill and decoding for goodput-optimized LLM inference](https://www.youtube.com/live/Bh-jlh5vlF0?feature=shared) -* [Efficient Streaming Language Models with Attention Sinks](https://www.youtube.com/live/RnM84Sv9WpA?feature=shared) -* [Adapting open source models with Open-Instruct and Tulu](https://www.youtube.com/live/e1qUJFAo10s?feature=shared) -* [Efficient Generative Models: From Sparse to Distributed Inference](https://www.youtube.com/live/Eqg0VIiWrgM?feature=shared) - -Summer Series: - -* [Using PyTorch for Monocular Depth Estimation Webinar](https://youtu.be/xf2QgioY370?feature=shared) -* [Accelerating LLM family of models on Arm Neoverse based Graviton AWS processors with KleidiAI](https://youtu.be/NeHIhQWewug?feature=shared) -* [torch.compile: The Missing Manual](https://www.youtube.com/live/rew5CSUaIXg?feature=shared) - -Release Live Q&As: - -* [PyTorch 2.4: Live Q&A](https://www.youtube.com/live/ry_QgUIYX1E?feature=shared) -* [PyTorch 2.5 Live Q&A](https://www.youtube.com/live/B3IgXpl4xt4?feature=shared) - -Live Webinars: - -* [PyTorch Documentary Virtual Premiere](https://www.youtube.com/watch?v=EjgTv6aSeqk) -* [Using PyTorch to Help Predict Wildfires](https://www.youtube.com/watch?v=gSC_IHyx0IM) -* [Seismic Data to Subsurface Models with OpenFWI: Training an AI Model with PyTorch](https://www.youtube.com/watch?v=zvk3Rr-OjU0) -* [Dinosaur Bone Hunting with Intel AI](https://www.youtube.com/watch?v=w4JmPkqnD0E) - -Each of these events underscored the importance of collaboration and community engagement in advancing AI research and applications. Thank you to everyone who participated, organized, and supported these events—your contributions make all the difference\! - ---- - -### **Looking Ahead** - -2024 was packed with opportunities to connect, learn, and contribute, and there will be even more ways to connect with the PyTorch community in 2025\. - -Mark your calendar\! The [PyTorch Conference](https://events.linuxfoundation.org/pytorch-conference-2025/) is returning to San Francisco on October 22-23, 2025\. Get ready for an exciting event filled with technical deep dives, exciting announcements, insightful sessions, and enhanced opportunities for community collaboration. 
- -Stay tuned for more upcoming events and opportunities to get involved by [subscribing to our newsletter](https://pytorch.org/newsletter). \ No newline at end of file diff --git a/_posts/2025-01-24-how-intel-uses-pytorch-to-empower-generative-ai-through-intel-arc-gpus.md b/_posts/2025-01-24-how-intel-uses-pytorch-to-empower-generative-ai-through-intel-arc-gpus.md deleted file mode 100644 index 00241593ecf9..000000000000 --- a/_posts/2025-01-24-how-intel-uses-pytorch-to-empower-generative-ai-through-intel-arc-gpus.md +++ /dev/null @@ -1,43 +0,0 @@ ---- -layout: blog_detail -title: "How Intel Uses PyTorch to Empower Generative AI through Intel Arc GPUs" -author: "Team PyTorch" ---- - -Intel has long been at the forefront of technological innovation, and its recent venture into Generative AI (GenAI) solutions is no exception. With the rise of AI-powered gaming experiences, Intel sought to deliver an accessible and intuitive GenAI inferencing solution tailored for AI PCs powered by Intel’s latest GPUs. By leveraging PyTorch as the backbone for development efforts, Intel successfully launched AI Playground, an open source application that showcases advanced GenAI workloads. - -**The Business Challenge** - -Our goal was to deliver an accessible and intuitive GenAI inferencing solution tailored for AI PCs powered by Intel. We recognized the need to showcase the capabilities of the latest GenAI workloads on our newest line of client GPUs. To address this, we developed a starter application, [AI Playground](https://github.com/intel/ai-playground), which is open source and includes a comprehensive developer reference sample available on GitHub using PyTorch. This application seamlessly integrates image generation, image enhancement, and chatbot functionalities, using retrieval-augmented generation (RAG) features, all within a single, user-friendly installation package. This initiative not only demonstrates the functionality of these AI workloads but also serves as an educational resource for the ecosystem, guiding developers on effectively leveraging the [Intel® Arc™ GPU](https://www.intel.com/content/www/us/en/products/details/discrete-gpus/arc.html) product line for advanced AI applications. This solution leverages Intel® Arc™ Xe Cores and [Xe Matrix Extensions (XMX)](https://www.intel.com/content/www/us/en/support/articles/000091112/graphics.html) for accelerating inferencing. - -![AI Playground](/assets/images/intel-case-study/fg1.png){:style="width:100%"} - -**How Intel Used PyTorch** - -PyTorch is the core AI framework for AI Playground. We extensively leverage PyTorch's eager mode, which aligns perfectly with the dynamic and iterative nature of our generative models. This approach not only enhances our development workflow but also enables us to rapidly prototype and iterate on advanced AI features. By harnessing PyTorch’s powerful capabilities, we have created a robust reference sample that showcases the potential of GenAI on Intel GPUs in one cohesive application. - -**Solving AI Challenges with PyTorch** - -PyTorch has been instrumental in addressing our AI challenges by providing a robust training and inference framework optimized for discrete and integrated Intel Arc GPU product lines. Choosing PyTorch over alternative frameworks or APIs was crucial. Other options would have necessitated additional custom development or one-off solutions, which could have significantly slowed our time to market and limited our feature set. 
With PyTorch, we leveraged its flexibility and ease of use, allowing our team to focus on innovation through experimentation, rather than infrastructure. The integration of [Intel® Extension for PyTorch](https://www.intel.com/content/www/us/en/developer/tools/oneapi/optimization-for-pytorch.html#gs.j6azz7) further enhanced performance by optimizing computational efficiency and enabling seamless scaling on Intel hardware, ensuring that our application ran faster and more efficiently. - -**A Word from Intel** - -*With PyTorch as the backbone of our AI Playground project, we achieved rapid development cycles that significantly accelerated our time to market. This flexibility enabled us to iteratively enhance features and effectively align with the commitments of our hardware launches in 2024\.* - -*\-Bob Duffy, AI Playground Product Manager* - -![PyTorch Case Stidu](/assets/images/intel-case-study/fg2.png){:style="width:100%"} - -**The Benefits of Using PyTorch** - -The biggest benefit of using PyTorch for us is the large PyTorch ecosystem, which connects us with an active and cooperative community of developers. This collaboration has facilitated the seamless deployment of key features from existing open source projects, allowing us to integrate the latest GenAI capabilities into AI Playground. Remarkably, we accomplished this with minimal re-coding, ensuring that these advanced features are readily accessible on Intel Arc GPUs. - -**Learn More** - -For more information about Intel’s AI Playground and collaboration with PyTorch, visit the following links: - -* [PyTorch Optimizations from Intel](https://www.intel.com/content/www/us/en/developer/tools/oneapi/optimization-for-pytorch.html#gs.j8h6mc) -* [AI Playground GitHub](https://github.com/intel/ai-playground) -* [AI Playground](https://intel.com/ai-playground) -* [AI Playground Deep Dive Video](https://youtu.be/cYPZye1MC6U) -* [Intel GPU Support Now Available in PyTorch 2.5](https://pytorch.org/blog/intel-gpu-support-pytorch-2-5/) \ No newline at end of file diff --git a/_posts/2025-01-28-2025-priorities-for-tac.md b/_posts/2025-01-28-2025-priorities-for-tac.md deleted file mode 100644 index 8e55be0b3338..000000000000 --- a/_posts/2025-01-28-2025-priorities-for-tac.md +++ /dev/null @@ -1,25 +0,0 @@ ---- -layout: blog_detail -title: "2025 Priorities for the PyTorch Technical Advisory Council (TAC)" -author: "Luca Antiga, PyTorch TAC Chair" ---- - -![social share](/assets/images/1738166706211.jpg){:style="max-width:600px; width:100%; display: block; margin-left: auto; margin-right: auto"} - - -[2024 has been a year of incredible growth for PyTorch](https://pytorch.org/blog/2024-year-in-review/). As that continues in 2025, the PyTorch Foundation has made important steps towards evolving the governance of the project under the Linux Foundation’s vendor-neutral umbrella. - -An important piece of governance for PyTorch is represented by the Technical Advisory Council (TAC). The TAC acts as a bridge between the industry, including but not limited to the PyTorch Foundation members, the community, and the PyTorch core development team. - -Operating with transparency and inclusivity, the TAC gathers input, facilitates collaboration, and drives initiatives that enhance the experience for everyone who relies on PyTorch. - -In 2025, the TAC will focus on four key areas: - -1. **Build Open, Multi-Cloud Continuous Integration (CI):** Building on the groundwork from 2024, the TAC will oversee the transition to an open, community-driven CI infrastructure. 
In addition to ensuring the extremely high bar for correctness that PyTorch has, PyTorch’s CI is complex with a high-quality bar including many automated functional and performance daily test runs. In 2025, PyTorch’s CI infrastructure will be fully open sourced and extended to support multiple compute providers, enabling broader contribution and participation to the effort from organizations benefitting from PyTorch. -2. **Support more Accelerators:** The TAC is committed to creating a level playing field for the growing landscape of AI accelerators. By gathering industry players and PyTorch developers, the TAC will facilitate efforts towards third-party device support and provide levels of integration of external CI systems with the main PyTorch CI. This will make it easier for emerging hardware to gain adoption within the PyTorch ecosystem, and for users to experiment with diverse compute options for training and inference. -3. **Create a High-Quality, User-Centric Ecosystem:** A big focus for the TAC in early 2025 is on improving the experience and discoverability of the PyTorch ecosystem. With many projects growing organically, users often face challenges navigating projects of different scope and quality within the rapidly changing AI landscape. To solve this, a newly curated ecosystem landscape tool will be launched soon on the PyTorch website. We will also introduce lightweight, open processes to improve projects and ensure users a predictable, high-quality experience. In many ways, the experience with PyTorch is as good as its ecosystem. -4. **Gather Feedback from Industry and the Community:** PyTorch has widespread adoption across research labs, startups, and enterprises. Striking the right balance between expressiveness and performance across the board is a very challenging task, so the TAC set out to be one of the several ways the Core development team receives signals. During our monthly TAC meetings, we provide the opportunity to PyTorch Foundation members from industry and academia, as well as non-member organizations to present their use case, their challenges and discuss them directly with appropriate members of the Core team. This feedback loop helps prioritize improvements, ensuring the framework stays relevant in a fast-evolving AI landscape. - -By focusing on these priorities, the TAC aims to maintain PyTorch’s position as the leading deep learning framework, while ensuring it remains open, accessible, and responsive to the needs of its diverse community. - -As members of the TAC, we’re extremely excited to contribute to the success of PyTorch and to the impact it’s having in the real world. If you are a PyTorch user or developer, consider [participating in our monthly calls](https://zoom-lfx.platform.linuxfoundation.org/meetings/pytorch?__hstc=132719121.a26416c161ac91bef494ffc19f91a62e.1723036593114.1738082449904.1738088158683.375&__hssc=132719121.1.1738088158683&__hsfp=810579359) (they are open to everyone, and the recordings are available [here](https://lists.pytorch.org/g/tac)). Also, if you develop or maintain a project based on PyTorch, consider contributing it to the new PyTorch ecosystem ([instructions](https://github.com/pytorch-fdn/ecosystem)). 
\ No newline at end of file diff --git a/_posts/2025-01-29-pytorch2-6.md b/_posts/2025-01-29-pytorch2-6.md deleted file mode 100644 index 6ccac080294b..000000000000 --- a/_posts/2025-01-29-pytorch2-6.md +++ /dev/null @@ -1,146 +0,0 @@
---
layout: blog_detail
title: "PyTorch 2.6 Release Blog"
---

We are excited to announce the release of PyTorch® 2.6 ([release notes](https://github.com/pytorch/pytorch/releases/tag/v2.6.0))! This release features multiple improvements for PT2: `torch.compile` can now be used with Python 3.13; a new performance-related knob, `torch.compiler.set_stance`; and several AOTInductor enhancements. Besides the PT2 improvements, another highlight is FP16 support on X86 CPUs.

NOTE: Starting with this release we are no longer publishing on Conda; please see [[Announcement] Deprecating PyTorch’s official Anaconda channel](https://github.com/pytorch/pytorch/issues/138506) for the details.

For this release the experimental Linux binaries shipped with CUDA 12.6.3 (as well as Linux Aarch64, Linux ROCm 6.2.4, and Linux XPU binaries) are built with CXX11_ABI=1 and are [using the Manylinux 2.28 build platform](https://dev-discuss.pytorch.org/t/pytorch-linux-wheels-switching-to-new-wheel-build-platform-manylinux-2-28-on-november-12-2024/2581). If you build PyTorch extensions with custom C++ or CUDA code, please update those builds to use CXX11_ABI=1 as well and report any issues you are seeing. For the next PyTorch 2.7 release we plan to switch all Linux builds to Manylinux 2.28 and CXX11_ABI=1; please see [[RFC] PyTorch next wheel build platform: manylinux-2.28](https://github.com/pytorch/pytorch/issues/123649) for the details and discussion.

Also in this release, as an important security improvement, we have changed the default value of the `weights_only` parameter of `torch.load`. This is a backward compatibility-breaking change; please see [this forum post](https://dev-discuss.pytorch.org/t/bc-breaking-change-torch-load-is-being-flipped-to-use-weights-only-true-by-default-in-the-nightlies-after-137602/2573) for more details.

This release is composed of 3892 commits from 520 contributors since PyTorch 2.5. We want to sincerely thank our dedicated community for your contributions. As always, we encourage you to try these out and report any issues as we improve PyTorch. More information about how to get started with the PyTorch 2-series can be found at our [Getting Started](https://pytorch.org/get-started/pytorch-2.0/) page.
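To make the `torch.load` default change above concrete, here is a minimal sketch (the checkpoint filename is hypothetical):

```python
import torch

# PyTorch 2.6: weights_only=True is now the default, restricting deserialization
# to tensors/state dicts rather than arbitrary pickled Python objects.
state_dict = torch.load("model_checkpoint.pt")

# The old behavior must be requested explicitly, and only for checkpoints you trust:
full_object = torch.load("model_checkpoint.pt", weights_only=False)
```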
| Beta | Prototype |
|------|-----------|
| torch.compiler.set_stance | Improved PyTorch user experience on Intel GPUs |
| torch.library.triton_op | FlexAttention support on X86 CPU for LLMs |
| torch.compile support for Python 3.13 | Dim.AUTO |
| New packaging APIs for AOTInductor | CUTLASS and CK GEMM/CONV Backends for AOTInductor |
| AOTInductor: minifier | |
| AOTInductor: ABI-compatible mode code generation | |
| FP16 support for X86 CPUs | |
        - - -*To see a full list of public feature submissions click [here](https://docs.google.com/spreadsheets/d/1TzGkWuUMF1yTe88adz1dt2mzbIsZLd3PBasy588VWgk/edit?usp=sharing). - - -## BETA FEATURES - - -### [Beta] torch.compiler.set_stance - -This feature enables the user to specify different behaviors (“stances”) that `torch.compile` can take between different invocations of compiled functions. One of the stances, for example, is - -“eager_on_recompile”, that instructs PyTorch to code eagerly when a recompile is necessary, reusing cached compiled code when possible. - -For more information please refer to the [set_stance documentation](https://pytorch.org/docs/2.6/generated/torch.compiler.set_stance.html#torch.compiler.set_stance) and the [Dynamic Compilation Control with torch.compiler.set_stance](https://pytorch.org/tutorials/recipes/torch_compiler_set_stance_tutorial.html) tutorial. - -### [Beta] torch.library.triton_op - -`torch.library.triton_op` offers a standard way of creating custom operators that are backed by user-defined triton kernels. - -When users turn user-defined triton kernels into custom operators, `torch.library.triton_op` allows `torch.compile` to peek into the implementation, enabling `torch.compile` to optimize the triton kernel inside it. - -For more information please refer to the [triton_op documentation](https://pytorch.org/docs/2.6/library.html#torch.library.triton_op) and the[ Using User-Defined Triton Kernels with torch.compile](https://pytorch.org/tutorials/recipes/torch_compile_user_defined_triton_kernel_tutorial.html) tutorial. - -### [Beta] torch.compile support for Python 3.13 - -`torch.compile` previously only supported Python up to version 3.12. Users can now optimize models with `torch.compile` in Python 3.13. - -### [Beta] New packaging APIs for AOTInductor - -A new package format, “[PT2 archive](https://docs.google.com/document/d/1RQ4cmywilnFUT1VE-4oTGxwXdc8vowCSZsrRgo3wFA8/edit?usp=sharing)”, has been introduced. This essentially contains a zipfile of all the files that need to be used by AOTInductor, and allows users to send everything needed to other environments. There is also functionality to package multiple models into one artifact, and to store additional metadata inside of the package. - -For more details please see the updated [torch.export AOTInductor Tutorial for Python runtime](https://pytorch.org/tutorials/recipes/torch_export_aoti_python.html). - -### [Beta] AOTInductor: minifier - -If a user encounters an error while using AOTInductor APIs, AOTInductor Minifier allows creation of a minimal nn.Module that reproduces the error. - -For more information please see the [AOTInductor Minifier documentation](https://pytorch.org/docs/2.6/torch.compiler_aot_inductor_minifier.html). - -### [Beta] AOTInductor: ABI-compatible mode code generation - -AOTInductor-generated model code has dependency on Pytorch cpp libraries. As Pytorch evolves quickly, it’s important to make sure previously AOTInductor compiled models can continue to run on newer Pytorch versions, i.e. AOTInductor is backward compatible. - -In order to guarantee application binary interface (ABI) backward compatibility, we have carefully defined a set of stable C interfaces in libtorch and make sure AOTInductor generates code that only refers to the specific set of APIs and nothing else in libtorch. We will keep the set of C APIs stable across Pytorch versions and thus provide backward compatibility guarantees for AOTInductor-compiled models. 
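As a small illustration of the Beta compiler features above, here is a hedged sketch of how the `torch.compiler.set_stance` stances described earlier might be used; the function and tensor shapes are illustrative only:

```python
import torch

@torch.compile
def f(x):
    return torch.relu(x) + 1

f(torch.randn(8))  # first call compiles and caches the artifact

# "eager_on_recompile": reuse cached compiled code where possible, but run
# eagerly instead of paying for a recompile when an input would trigger one.
with torch.compiler.set_stance("eager_on_recompile"):
    f(torch.randn(8))   # served by the cached compiled code
    f(torch.randn(16))  # would normally trigger a recompile; runs eagerly instead
```

On exiting the block, the previous stance applies again.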
- -### [Beta] FP16 support for X86 CPUs (both eager and Inductor modes) - -Float16 datatype is commonly used for reduced memory usage and faster computation in AI inference and training. CPUs like the recently launched [Intel® Xeon® 6 with P-Cores](https://www.intel.com/content/www/us/en/products/details/processors/xeon/xeon6-p-cores.html) support Float16 datatype with native accelerator [AMX](https://www.intel.com/content/www/us/en/products/docs/accelerator-engines/advanced-matrix-extensions/overview.html). Float16 support on X86 CPUs was introduced in PyTorch 2.5 as a prototype feature, and now it has been further improved for both eager mode and Torch.compile + Inductor mode, making it Beta level feature with both functionality and performance verified with a broad scope of workloads. - - -## PROTOTYPE FEATURES - -### [Prototype] Improved PyTorch user experience on Intel GPUs - -PyTorch user experience on Intel GPUs is further improved with simplified installation steps, Windows release binary distribution and expanded coverage of supported GPU models including the latest Intel® Arc™ B-Series discrete graphics. Application developers and researchers seeking to fine-tune, inference and develop with PyTorch models on [Intel® Core™ Ultra AI PCs ](https://www.intel.com/content/www/us/en/products/docs/processors/core-ultra/ai-pc.html)and [Intel® Arc™ discrete graphics](https://www.intel.com/content/www/us/en/products/details/discrete-gpus/arc.html) will now be able to directly install PyTorch with binary releases for Windows, Linux and Windows Subsystem for Linux 2. - - - -* Simplified Intel GPU software stack setup to enable one-click installation of the torch-xpu PIP wheels to run deep learning workloads in an out of the box fashion, eliminating the complexity of installing and activating Intel GPU development software bundles. -* Windows binary releases for torch core, torchvision and torchaudio have been made available for Intel GPUs, and the supported GPU models have been expanded from Intel® Core™ Ultra Processors with Intel® Arc™ Graphics, [Intel® Core™ Ultra Series 2 with Intel® Arc™ Graphics](https://www.intel.com/content/www/us/en/products/details/processors/core-ultra.html) and [Intel® Arc™ A-Series Graphics](https://www.intel.com/content/www/us/en/products/docs/discrete-gpus/arc/desktop/a-series/overview.html) to the latest GPU hardware [Intel® Arc™ B-Series graphics](https://www.intel.com/content/www/us/en/products/docs/discrete-gpus/arc/desktop/b-series/overview.html). -* Further enhanced coverage of Aten operators on Intel GPUs with SYCL* kernels for smooth eager mode execution, as well as bug fixes and performance optimizations for torch.compile on Intel GPUs. - -For more information regarding Intel GPU support, please refer to [Getting Started Guide](https://pytorch.org/docs/main/notes/get_start_xpu.html). - -### [Prototype] FlexAttention support on X86 CPU for LLMs - -FlexAttention was initially introduced in PyTorch 2.5 to provide optimized implementations for Attention variants with a flexible API. In PyTorch 2.6, X86 CPU support for FlexAttention was added through TorchInductor CPP backend. This new feature leverages and extends current CPP template abilities to support broad attention variants (e.x.: PageAttention, which is critical for LLMs inference) based on the existing FlexAttention API, and brings optimized performance on x86 CPUs. With this feature, it’s easy to use FlexAttention API to compose Attention solutions on CPU platforms and achieve good performance. 
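As a rough illustration of the CPU path described above, the sketch below compiles the FlexAttention API with a causal score modification on plain CPU tensors; the shapes and the `score_mod` are illustrative, and a PyTorch 2.6 build is assumed:

```python
import torch
from torch.nn.attention.flex_attention import flex_attention

def causal(score, b, h, q_idx, kv_idx):
    # Keep scores for positions the query may attend to; mask out the rest.
    return torch.where(q_idx >= kv_idx, score, -float("inf"))

# (batch, heads, seq_len, head_dim) tensors living on the CPU
q, k, v = (torch.randn(1, 4, 128, 64) for _ in range(3))

# torch.compile routes this through the TorchInductor CPP backend on x86 CPUs.
compiled_attention = torch.compile(flex_attention)
out = compiled_attention(q, k, v, score_mod=causal)
print(out.shape)  # torch.Size([1, 4, 128, 64])
```

Swapping in other score modifications (or a block mask) is how attention variants are composed on top of the same API, without writing a new kernel per variant.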
- -### [Prototype] Dim.AUTO - -`Dim.AUTO` allows usage of automatic dynamic shapes with `torch.export`. Users can export with `Dim.AUTO `and “discover” the dynamic behavior of their models, with min/max ranges, relations between dimensions, and static/dynamic behavior being automatically inferred. - -This is a more user-friendly experience compared to the existing named-Dims approach for specifying dynamic shapes, which requires the user to fully understand the dynamic behavior of their models at export time. `Dim.AUTO` allows users to write generic code that isn’t model-dependent, increasing ease-of-use for exporting with dynamic shapes. - -Please see [torch.export tutorial](https://pytorch.org/tutorials/intermediate/torch_export_tutorial.html#constraints-dynamic-shapes) for more information. - -### [Prototype] CUTLASS and CK GEMM/CONV Backends for AOTInductor - -The CUTLASS and CK backend adds kernel choices for GEMM autotuning in Inductor. This is now also available in AOTInductor which can run in C++ runtime environments. A major improvement to the two backends is improved compile-time speed by eliminating redundant kernel binary compilations and dynamic shapes support. \ No newline at end of file diff --git a/_posts/2025-02-05-warp-specialization.md b/_posts/2025-02-05-warp-specialization.md deleted file mode 100644 index 098e1f6261fe..000000000000 --- a/_posts/2025-02-05-warp-specialization.md +++ /dev/null @@ -1,112 +0,0 @@ ---- -layout: blog_detail -title: "Enabling advanced GPU features in PyTorch - Warp Specialization" -author: "Meta and NVIDIA" ---- - -**Meta**: Hongtao Yu, Manman Ren, Bert Maher, Shane Nay -**NVIDIA**: Gustav Zhu, Shuhao Jiang - -Over the past few months, we have been working on enabling advanced GPU features for PyTorch and Triton users through the Triton compiler. One of our key goals has been to introduce warp specialization support on NVIDIA Hopper GPUs. Today, we are thrilled to announce that our efforts have resulted in the rollout of fully automated Triton warp specialization, now available to users in the upcoming release of Triton [3.2](https://github.com/triton-lang/triton/tree/release/3.2.x), which will ship with PyTorch 2.6. PyTorch users can leverage this feature by [implementing user-defined Triton kernels](https://pytorch.org/tutorials/recipes/torch_compile_user_defined_triton_kernel_tutorial.html). This work leveraged an initial implementation of warp specialization in Triton by NVIDIA and we look forward to further development with the community in the future. - -Warp specialization (WS) is a GPU programming technique where warps (a group of 32 threads on NVIDIA GPUs) within a threadblock are assigned distinct roles or tasks. This approach optimizes performance by enabling efficient execution of workloads that require task differentiation or cooperative processing. It enhances kernel performance by leveraging an asynchronous execution model, where different parts of the kernel are managed by separate hardware units. Data communication between these units, facilitated via shared memory on the NVIDIA H100, is highly efficient. Compared to a uniform warp approach, warp specialization allows the hardware multitasking warp scheduler to operate more effectively, maximizing resource utilization and overall performance. - -Using GEMM as an example, a typical uniform warp approach on the H100 GPU involves 8 warps per thread block collectively computing a tile of the output tensor. 
These 8 warps are divided into two warp groups (WG), with each group cooperatively computing half of the tile using efficient warp-group-level MMA (WGMMA) instructions, as illustrated in Figure 1. - - -![Figure 1. GEMM K-loop Body with Uniform Warps](/assets/images/warp-specialization/fg1.jpg){:style="width:100%"} - -Figure 1. GEMM K-loop Body with Uniform Warps - -The implementation is clean, easy to understand, and generally performs well, thanks to an elegant software pipeliner. The pipeliner's purpose is to enhance instruction-level parallelism by executing non-dependent operations on different hardware units. For instance, load operations from the next loop iteration can be executed simultaneously with WGMMA operations in the current iteration. However, this approach relies heavily on the compiler to craft an instruction sequence that ensures load and WGMMA instructions are issued at precisely the right time. While this is relatively straightforward for GEMM, which involves a limited number of operations, it becomes significantly more challenging for more complex kernels, such as flash attention. - -On the other hand, warp specialization addresses programming challenges by separating operations intended to run simultaneously on different hardware units into distinct warps, synchronizing them efficiently using low-cost barriers in shared memory. This allows each warp to have its own instruction sequence, enabling instructions to be issued and executed continuously without being interrupted by other operations, thanks to the multi-way warp scheduler. An illustration of a warp-specialized GEMM can be seen in Figure 2. - - -![Figure 2. GEMM K-loop Body with Specialized Warps](/assets/images/warp-specialization/fg2.jpg){:style="width:100%"} - -Figure 2. GEMM K-loop Body with Specialized Warps - - -## How to enable WS - -To enable warp specialization, users simply need to specify two autotune flags: num_consumer_groups and num_buffers_warp_spec. For example, a warp-specialized GEMM implementation might look as shown below. Users can enable warp specialization by setting a non-zero value for num_consumer_groups, which defines the number of consumer warp groups. There is no corresponding flag to set the number of producer warp groups, as currently only one producer is supported. The num_buffers_warp_spec flag specifies the number of buffers the producer warp group will use to communicate with the consumer warp groups. A working example of a warp-specialized kernel is provided in the persistent GEMM [tutorial](https://github.com/triton-lang/triton/blob/6771065cb3137f7e64454cc047b9b74d577cbf7f/python/tutorials/09-persistent-matmul.py#L620). 
```
import triton
import triton.language as tl


@triton.autotune(
    configs=[
        triton.Config(
            {
                # Key names must match the tl.constexpr parameters of the kernel below.
                "BLOCK_M": 128,
                "BLOCK_N": 256,
                "BLOCK_K": 64,
                "GROUP_SIZE_M": 8,
            },
            num_stages=2,
            num_warps=4,
            # Warp-specialization flags (require the warp-specialization-enabled Triton build):
            num_consumer_groups=2,
            num_buffers_warp_spec=3,
        ),
    ],
    key=["M", "N", "K"],
)
@triton.jit
def matmul_persistent_ws_kernel(
    a_ptr, b_ptr, c_ptr, M, N, K,
    stride_am, stride_ak, stride_bk, stride_bn, stride_cm, stride_cn,
    BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, BLOCK_K: tl.constexpr,
    GROUP_SIZE_M: tl.constexpr,  # supplied by the autotune config; unused in this simplified tile mapping
):
    pid = tl.program_id(axis=0)
    num_pid_m = tl.cdiv(M, BLOCK_M)
    num_pid_n = tl.cdiv(N, BLOCK_N)
    # Row-major mapping of the 1D program id onto the 2D grid of output tiles.
    pid_m = pid // num_pid_n
    pid_n = pid % num_pid_n
    offs_m = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
    offs_n = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
    offs_k = tl.arange(0, BLOCK_K)
    a_ptrs = a_ptr + (offs_m[:, None] * stride_am + offs_k[None, :] * stride_ak)
    b_ptrs = b_ptr + (offs_k[:, None] * stride_bk + offs_n[None, :] * stride_bn)
    acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32)
    # Assumes M, N, K are multiples of the block sizes (no boundary masking).
    for k in range(0, tl.cdiv(K, BLOCK_K)):
        a = tl.load(a_ptrs)
        b = tl.load(b_ptrs)
        acc += tl.dot(a, b)
        a_ptrs += BLOCK_K * stride_ak
        b_ptrs += BLOCK_K * stride_bk
    c = acc.to(tl.float16)
    c_ptrs = c_ptr + stride_cm * offs_m[:, None] + stride_cn * offs_n[None, :]
    tl.store(c_ptrs, c)
```


## Under the Hood

Warp specialization uses a set of hierarchical compiler transformations and IR changes to translate a user’s non-warp-specialized kernel into warp-specialized machine code. These include:

* **Task Partitioning**: The entire kernel is automatically divided into asynchronous tasks based on predefined heuristics. The compiler determines how to utilize one producer warp group and a user-specified number of consumer warp groups to execute the kernel. It assigns task IDs to specific anchor operations, which then influence the task assignments for remaining operations through asynchronous task ID propagation and dependency analysis. Since shared memory is the most efficient method for data transfer between warp groups across all supported platforms, the compiler optimizes task partitions to minimize register spills to shared memory, ensuring efficient execution.
* **Data Partitioning for Multiple Consumer Groups**: Efficiently partitioning data among multiple consumer groups is key to optimizing workload distribution. On the H100 GPU, the compiler, by default, attempts to partition the input tensor `A` along the `M` dimension, allowing each consumer group to compute half of the output tensor independently. This strategy, known as [cooperative partitioning](https://github.com/NVIDIA/cutlass/blob/main/media/docs/efficient_gemm.md#warp-specialization), maximizes efficiency under most conditions. However, if this split leads to inefficiencies—such as producing a workload smaller than the native WGMMA instruction size—the compiler dynamically adjusts and partitions along the `N` dimension instead.
* **Dataflow Pipelining**: The compiler creates cyclic shared memory buffers to pipeline dataflows across multi-dimensional loops. Warp-specialized pipelining supports complex control flow. For example, our warp-specialized persistent GEMM kernel uses a doubly-nested loop, allowing the producer to begin fetching data for the next output tile while the consumer is finishing the compute for the prior tile.
-* **Communication Operations**`: `We introduced four high-level Triton GPU IR (TTGIR) communication operations`—ProducerAcquireOp, ProducerCommitOp, ConsumerWaitOp, `and` ConsumerReleaseOp—`to manage pipelined dataflows. These support both TMA and non-TMA memory operations. -* **Code Partitioning**: Each async task is outlined into its own standalone code region, guarded by warp group ID checks. Control dependencies are duplicated as needed. -* **TTGIR to LLVM/PTX Materialization**: TTGIR communication operations are materialized into corresponding LLVM/PTX barrier operations. - - -## Performance - -The [warp specialization release](https://github.com/triton-lang/triton/pull/5622) introduces a range of Triton compiler transformations that collectively convert user code into warp-specialized kernels. This feature has been applied to several key kernels, including Flash Attention and FP8 row-wise GEMM, resulting in significant performance gains of 10% to 15%. Below, we highlight the latest performance metrics for these high-impact kernels. - - -![bar chart](/assets/images/warp-specialization/fg3.png){:style="width:100%"} - - - - -![bar chart](/assets/images/warp-specialization/fg4.png){:style="width:100%"} - - - -## Future Work - -Looking ahead, we plan to further enhance Triton's warp specialization support by introducing new features such as Ping-Pong scheduling, expanded buffer sharing support, improved transparent handling for TMA, refined partitioning heuristics for upcoming NVIDIA hardware. \ No newline at end of file diff --git a/_posts/2025-02-11-unlocking-pt-2-6-intel.md b/_posts/2025-02-11-unlocking-pt-2-6-intel.md deleted file mode 100644 index 2a0cb363e10f..000000000000 --- a/_posts/2025-02-11-unlocking-pt-2-6-intel.md +++ /dev/null @@ -1,75 +0,0 @@ ---- -layout: blog_detail -title: "Unlocking the Latest Features in PyTorch 2.6 for Intel Platforms" -author: "the Intel PyTorch Team" ---- - -[PyTorch* 2.6](https://pytorch.org/blog/pytorch2-6/) has just been released with a set of exciting new features including torch.compile compatibility with Python 3.13, new security and performance enhancements, and a change in the default parameter for torch.load. PyTorch also announced the deprecation of its official Anaconda channel. - -Among the performance features are three that enhance developer productivity on Intel platforms: - -1. Improved Intel GPU availability -2. FlexAttention optimization on x86 CPU for LLM -3. FP16 on x86 CPU support for eager and Inductor modes - -## Improved Intel GPU Availability - -To provide developers working in artificial intelligence (AI) with better support for Intel GPUs, the PyTorch user experience on these GPUs has been enhanced. This improvement includes simplified installation steps, a Windows* release binary distribution, and expanded coverage of supported GPU models, including the latest Intel® Arc™ B-Series discrete graphics. - -These new features help promote accelerated machine learning workflows within the PyTorch ecosystem, providing a consistent developer experience and support. Application developers and researchers seeking to fine-tune, perform inference, and develop with PyTorch models on [Intel® Core™ Ultra AI PCs ](https://www.intel.com/content/www/us/en/products/details/discrete-gpus/arc.html) and [Intel® Arc™ discrete graphics](https://www.intel.com/content/www/us/en/products/details/discrete-gpus/arc.html) will now be able to install PyTorch directly with binary releases for Windows, Linux*, and Windows Subsystem for Linux 2. 
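Before walking through the individual items listed below, a quick way to sanity-check that one of these Windows or Linux binary installs can actually see an Intel GPU is to query the XPU backend. This is a minimal sketch; the reported device count and matmul sizes are illustrative and depend on your hardware and driver setup:

```python
import torch

if torch.xpu.is_available():
    print(f"XPU devices found: {torch.xpu.device_count()}")
    x = torch.randn(1024, 1024, device="xpu")
    y = torch.randn(1024, 1024, device="xpu")
    print((x @ y).to("cpu").shape)  # runs the matmul on the Intel GPU
else:
    print("No XPU device detected; check the driver and torch-xpu wheel install.")
```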
- -The new features include: - -* Simplified Intel GPU software stack setup to enable one-click installation of the torch-xpu PIP wheels to run deep learning workloads in a ready-to-use fashion, thus eliminating the complexity of installing and activating Intel GPU development software bundles.  -* Windows binary releases for torch core, torchvision and torchaudio have been made available for Intel GPUs, expanding from [Intel® Core™ Ultra Series 2](https://www.intel.com/content/www/us/en/products/details/processors/core-ultra.html) with Intel® Arc™ Graphics and [Intel® Arc™ A-Series graphics ](https://www.intel.com/content/www/us/en/products/docs/discrete-gpus/arc/desktop/a-series/overview.html)to the latest GPU hardware [Intel® Arc™ B-Series graphics](https://www.intel.com/content/www/us/en/products/docs/discrete-gpus/arc/desktop/b-series/overview.html) support.  -* Further enhanced coverage of Aten operators on Intel GPUs with SYCL* kernels for smooth eager mode execution, as well as bug fixes and performance optimizations for torch.compile on Intel GPUs.  - -Get a tour of new environment setup, PIP wheels installation, and examples on Intel® Client GPUs and Intel® Data Center GPU Max Series in the [Getting Started Guide](https://pytorch.org/docs/main/notes/get_start_xpu.html). - -## FlexAttention Optimization on X86 CPU for LLM - -FlexAttention was first introduced in [PyTorch 2.5](https://pytorch.org/blog/pytorch2-5/), to address the need to support various Attentions or even combinations of them. This PyTorch API leverages torch.compile to generate a fused FlashAttention kernel, which eliminates extra memory allocation and achieves performance comparable to handwritten implementations. - -Previously, FlexAttention was implemented for CUDA* devices based on the Triton backend. Since PyTorch 2.6, X86 CPU support of FlexAttention was added through TorchInductor CPP backend. This new feature leverages and extends current CPP template abilities to support broad attention variants (e.g., PageAttention, which is critical for LLMs inference) based on the existing FlexAttention API, and brings optimized performance on x86 CPUs. With this feature, user can easily use FlexAttention API to compose their Attention solutions on CPU platforms and achieve good performance. - -Typically, FlexAttention is utilized by popular LLM ecosystem projects, such as Hugging Face transformers and vLLM in their LLM related modeling (e.g., PagedAttention) to achieve better out-of-the-box performance. Before the official adoption happens, [this enabling PR](https://github.com/huggingface/transformers/pull/35419) in Hugging Face can help us the performance benefits that FlexAttention can bring on x86 CPU platforms. - -The graph below shows the performance comparison of PyTorch 2.6 (with this feature) and PyTorch 2.5 (without this feature) on typical Llama models. For real-time mode (Batch Size = 1), there is about 1.13x-1.42x performance improvement for next token across different input token lengths. As for best throughput under a typical SLA (P99 token latency <=50ms), PyTorch 2.6 achieves more than 7.83x performance than PyTorch 2.5 as PyTorch 2.6 can run at 8 inputs (Batch Size = 8) together and still keep SLA while PyTorch 2.5 can only run 1 input, because FlexAttention based PagedAttention in PyTorch 2.6 provides more efficiency during multiple batch size scenarios. - - -![Figure 1. 
Performance comparison of PyTorch 2.6 and PyTorch 2.5 on Typical Llama Models](/assets/images/unlocking-pt-2-6-intel.png){:style="width:100%"} - - -**Figure 1. Performance comparison of PyTorch 2.6 and PyTorch 2.5 on Typical Llama Models** - -## FP16 on X86 CPU Support for Eager and Inductor Modes - -Float16 is a commonly used reduced floating-point type that improves performance in neural network inference and training. CPUs like recently launched [Intel® Xeon® 6 with P-Cores](https://www.intel.com/content/www/us/en/products/details/processors/xeon/xeon6-p-cores.html) support Float16 datatype with native accelerator [AMX](https://www.intel.com/content/www/us/en/products/docs/accelerator-engines/advanced-matrix-extensions/overview.html), which highly improves the Float16 performance. Float16 support on x86 CPU was first introduced in PyTorch 2.5 as a prototype feature. Now it has been further improved for both eager mode and Torch.compile + Inductor mode, which is pushed to Beta level for broader adoption. This helps the deployment on the CPU side without the need to modify the model weights when the model is pre-trained with mixed precision of Float16/Float32. On platforms that support AMX Float16 (i.e., the Intel® Xeon® 6 processors with P-cores), Float16 has the same pass rate as Bfloat16 across the typical PyTorch benchmark suites: TorchBench, Hugging Face, and Timms. It also shows good performance comparable to 16 bit datatype Bfloat16. - -## Summary - -In this blog, we discussed three features to enhance developer productivity on Intel platforms in PyTorch 2.6. These three features are designed to improve Intel GPU availability, optimize FlexAttention for x86 CPUs tailored for large language models (LLMs), and support FP16 on x86 CPUs in both eager and Inductor modes. Get [PyTorch 2.6](https://pytorch.org/) and try them for yourself or you can access PyTorch 2.6 on the [Intel® Tiber™ AI Cloud](https://ai.cloud.intel.com/) to take advantage of hosted notebooks that are optimized for Intel hardware and software. - -## Acknowledgements - -The release of PyTorch 2.6 is an exciting milestone for Intel platforms, and it would not have been possible without the deep collaboration and contributions from the community. We extend our heartfelt thanks to [Alban](https://github.com/albanD), [Andrey](https://github.com/atalman), [Bin](https://github.com/desertfire), [Jason](https://github.com/jansel), [Jerry](https://github.com/jerryzh168) and [Nikita](https://github.com/malfet) for sharing their invaluable ideas, meticulously reviewing PRs, and providing insightful feedback on RFCs. Their dedication has driven continuous improvements and pushed the ecosystem forward for Intel platforms. - -## References - -* [FlexAttention in PyTorch](https://pytorch.org/blog/flexattention/) -* [PagedAttention Optimization](https://arxiv.org/abs/2309.06180) -* [Intel® Xeon® 6 with P-Cores](•%09https:/www.intel.com/content/www/us/en/products/details/processors/xeon/xeon6-p-cores.html) - -## Product and Performance Information - -Measurement on AWS EC2 m7i.metal-48xl using: 2x Intel® Xeon® Platinum 8488C, HT On, Turbo On, NUMA 2, Integrated Accelerators Available [used]: DLB [8], DSA [8], IAA[8], QAT[on CPU, 8], Total Memory 512GB (16x32GB DDR5 4800 MT/s [4400 MT/s]), BIOS Amazon EC2 1.0, microcode 0x2b000603, 1x Elastic Network Adapter (ENA) 1x Amazon Elastic Block Store 800G, Ubuntu 24.04.1 LTS 6.8.0-1018-aws Test by Intel on Jan 15th 2025. 
- -## Notices and Disclaimers - -Performance varies by use, configuration and other factors. Learn more on the Performance Index site. Performance results are based on testing as of dates shown in configurations and may not reflect all publicly available updates.  See backup for configuration details.  No product or component can be absolutely secure. Your costs and results may vary. Intel technologies may require enabled hardware, software or service activation. - -Intel Corporation. Intel, the Intel logo, and other Intel marks are trademarks of Intel Corporation or its subsidiaries. Other names and brands may be claimed as the property of others. - -## AI disclaimer: - -AI features may require software purchase, subscription or enablement by a software or platform provider, or may have specific configuration or compatibility requirements. Details at [www.intel.com/AIPC](http://www.intel.com/AIPC). Results may vary. \ No newline at end of file diff --git a/_posts/2025-02-12-datathon-2025.md b/_posts/2025-02-12-datathon-2025.md deleted file mode 100644 index 0f69fd074382..000000000000 --- a/_posts/2025-02-12-datathon-2025.md +++ /dev/null @@ -1,34 +0,0 @@ ---- -layout: blog_detail -title: "Solve Real-Word AI Challenges with PyTorch at Datathon 2025: DataOrbit" -author: "Aakash Senthilnathan" -hidden: true ---- - -**We’re excited to have PyTorch sponsor [Datathon 2025: DataOrbit](https://dataorbit-2025.devpost.com/)**, a place where students can collaborate with a team to solve problems using real-world datasets! This event, hosted by Data Science UCSB in collaboration with Gaucho Sports Analytics and ACM@UCSB, will take place on **February 22–23rd, 2025 at UC Santa Barbara**, with the incredible opportunity to present your project to a panel of corporate and faculty judges – **including the executive director of Pytorch!** – for a chance to win prizes up to $3000. - - -![logo](/assets/images/datathon-2025.png){:style="max-width:700px; width:100%; display: block; margin-left: auto; margin-right: auto"} - -PyTorch’s versatility and power have made it an essential tool for tackling complex data problems in domains ranging from computer vision and natural language processing to time series analysis. At Datathon 2025: DataOrbit, participants will have the chance to leverage PyTorch’s dynamic framework, ease of use, and robust ecosystem to build innovative solutions. Whether you’re building machine learning models, experimenting with deep learning architectures, or applying PyTorch to solve real-world challenges, workshops and mentors will be available to help you dive deeper into its capabilities and accelerate your project’s success. - -**Register Here:** [tinyurl.com/dataorbit25-reg](http://tinyurl.com/dataorbit25-reg) (Open until February 21st or until capacity is reached) - -Additional information regarding the timeline of events can be found on the registration form. - -About the Datathon - - - -* Open only to undergraduate students in the United States -* In-person events over 36 hours -* Teams sizes of 2-5 people -* 10 different prize tracks -* Workshops and office hours teaching essential data science tools and techniques -* Professional development workshops + networking opportunities with our sponsors -* All meals provided -* A fun time! - -*If you have a group you would like to work with, we require that every member register separately. If you do not have a group, we will have an opportunity at the beginning of the event to participate in an activity to form groups. 
Unfortunately, at this time we do not provide travel accommodations or lodging for participants.* - -*If you are interested in mentoring students virtually during the course of our datathon, or have any other questions contact us at datascience.ucsb@gmail.com.* \ No newline at end of file diff --git a/_posts/2025-02-19-optimize-llms.md b/_posts/2025-02-19-optimize-llms.md deleted file mode 100644 index b2dfec99bd0b..000000000000 --- a/_posts/2025-02-19-optimize-llms.md +++ /dev/null @@ -1,176 +0,0 @@ ---- -layout: blog_detail -title: "Optimize LLMs for Efficiency & Sustainability" -hidden: true -author: "Zach Lasiuk, Arm" ---- - -The rapid growth of large language model (LLM) applications is linked to rapid growth in energy demand. According to the International Energy Agency (IEA), data center electricity consumption is projected to roughly double by 2026 primarily driven by AI. This is due to the energy-intensive training requirements for massive LLMs – however, the increase in AI Inferencing workloads also plays a role. For example, compared with traditional search queries, a single AI inference can consume about [10x more energy](https://www.weforum.org/stories/2024/07/generative-ai-energy-emissions/). - -As developers, we directly affect how energy-intensive our AI solution is. There are technical decisions we can take to help make our AI solution more environmentally sustainable. Minimizing compute to deliver LLM solutions is not the only requirement for creating sustainable AI use. For example, systemic changes, such as policy interventions may be needed, but utilizing energy efficient solutions is an important factor and is an impactful intervention we can adopt right away. - -With that said, minimizing your LLM inference cloud compute requirements also leads to reducing your cloud bill and makes your app more energy efficient, creating a win-win situation. In this blog, we will take you through the steps to creating an LLM chatbot by optimizing and deploying a Llama 3.1 model on PyTorch, quantifying the computational efficiency benefits of specific architecture decisions. - - -## What will we evaluate? - -For this blog, our goal is to create an immersive fantasy storytelling app where users enter a fantasy world by chatting with a Generative AI. The first location is the land of Wicked, allowing people to role-play walking around the Emerald City and observe the sights and scenes in real-time. We’ll implement this via a chatbot and a custom system prompt. - -We will be evaluating LLM performance on CPUs. You can see the advantages of[ CPU vs GPU inference here](https://www.arm.com/resources/ebook/cpu-inference). In general, leveraging CPUs in the cloud for LLM inference is a great choice for models around 10B parameters or less like the Llama series. - -We will also be using Arm-based CPUs, specifically the AWS Graviton series. Based on studies,[ the Arm-based Graviton3 server can provide 67.6 percent lower workload carbon intensity built in](https://newsroom.arm.com/blog/aws-graviton-decarbonize-compute). While this study was based on a simulation, it is an excellent start to showing the possibilities for minimizing our app’s energy requirements. - -First, you’ll see how to run a simple LLM chatbot on PyTorch, then explore three techniques to optimize your application for computational efficiency: - -1. Model optimization: Utilizing 4-bit quantization and added KleidiAI kernels. -2. Shortcut optimization: Implementing a vector database to handle common queries. -3. 
Architecture optimization: Adopting a serverless architecture. - -Let’s get started. - - -## Run Llama-3.1 via PyTorch on AWS Graviton4 - -To maximize energy efficiency, we will only use the minimum server resources needed to support this LLM chatbot. For this [Llama-3.1 8-billion parameter model](https://huggingface.co/meta-llama/Llama-3.1-8B), 16 cores, 64GB RAM, and disk space of 50GB is required. We will use the r8g.4xlarge Graviton4 instance running Ubuntu 24.04, as it meets these specifications. - -Spin up this EC2 instance, connect to it, and start installing the requirements: - - -``` - sudo apt-get update - sudo apt install gcc g++ build-essential python3-pip python3-venv google-perftools -y -``` - - -Then install Torchchat, the library developed by the PyTorch team that enables running LLMs across devices: - - -``` - git clone https://github.com/pytorch/torchchat.git - cd torchchat - python3 -m venv .venv - source .venv/bin/activate - ./install/install_requirements.sh -``` - - -Next, install the Llama-3.1-8b model from Hugging Face through the CLI. You will first need to make a Hugging Face access token on your HF account. This will download the 16GB model to your instance, which may take a few minutes: - - -``` - pip install -U "huggingface_hub[cli]" - huggingface-cli login - - python torchchat.py export llama3.1 --output-dso-path exportedModels/llama3.1.so --device cpu --max-seq-length 1024 -``` - - -Now you are ready to run the LLM model, adding a system prompt to be a guiding storyteller in the land of Wicked: - - -``` - LD_PRELOAD=/usr/lib/aarch64-linux-gnu/libtcmalloc.so.4 TORCHINDUCTOR_CPP_WRAPPER=1 TORCHINDUCTOR_FREEZING=1 OMP_NUM_THREADS=16 python torchchat.py generate llama3.1 --device cpu --chat -``` - - -Type ‘y’ to enter a system prompt and enter the following prompt: - - -*You are the guiding storyteller for a fantasy adventure application. Immerse users in the enchanting world of Wicked, guiding them through interactive, real-time experiences in the Emerald City. Describe vivid sights, dynamic scenes, and engage users in storytelling that feels alive and responsive. Allow users to make choices that shape their journey while maintaining the magical tone of the Wicked universe.* - -Then enter your user query: - - -*I walk through the Emerald City gates and look up* - -The output will show on the screen, taking about 7 seconds to generate the first token with less than 1 token per second. - - -![terminal](/assets/images/optimize-llms.png){:style="width:100%"} - - -This example took 245 seconds, or 4 minutes, to generate its complete reply—not very fast. The first optimization we’ll look at will speed up the LLM generation, reducing its computational footprint. - - -### Optimization 1: KleidiAI and Quantization - -Several optimizations are possible from the basic implementation above. The simplest and quickest one t to do is to quantize the model from FP16 to INT4. This approach trades-off some accuracy while cutting the model size from 16Gb to about 4Gb, increasing the inference speed in the process. - -Another common optimization comes in leveraging TorchAO (Torch Architecture Optimization), the PyTorch library that works seamlessly with TorchChat to enhance model performance through various quantization and sparsity methods. - -Lastly, we’ll use Arm KleidiAI optimizations. These are micro-kernels written in assembly that lead to significant performance improvements for LLM inference on Arm CPUs. 
You can read more about [how KleidiAI kernels work if interested](https://learn.arm.com/learning-paths/cross-platform/kleidiai-explainer/). - -To implement these optimizations, spin up a fresh EC2 instance and follow the instructions [on how to run a Large Language Model (LLM) chatbot with PyTorch](https://learn.arm.com/learning-paths/servers-and-cloud-computing/pytorch-llama/). When ready, run the model and enter the same system prompt and user query as above. You’ll get results that significantly speed up the inference: Less than 1 second to first token, and about 25 tokens per second. - -This cuts the inference time from 245 seconds to about 10 seconds. This results in less power-draw from your server, as it is spending more time idle vs running a power-hungry inference. All else being equal, this is a more carbon-friendly solution than the non-optimized app. The next two approaches go beyond model inference optimization, modifying the solution architectural to further reduce computational load. - - -### Optimization 2: FAISS to match database for common questions - -As stated in the introduction, model inferences are typically more computationally expensive than other search techniques. What if you could automatically respond to common user queries without performing an LLM inference? Using a query/response database is an option to bypass LLM inference and respond efficiently. For this interactive storytelling app, you can imagine common questions about specific characters, the world itself, and rules about what the chatbot is/is not capable of that can have pre-generated answers. - -However, a traditional exact-match database isn’t sufficient as users can phrase the same query in many ways. Asking about the chatbot’s capabilities could all invite the same answer but be phrased differently: - - - -* “What are you capable of?” -* “Tell me what you can do.” -* “How can I interact with you?” - -Implementing semantic search solves this issue by matching a user’s query to the most relevant pre-generated answer by understanding the user’s intent. The [FAISS library](https://github.com/facebookresearch/faiss) is a great option to implement semantic search. - -The computational savings of this approach depends on three factors: - - - -1. Percentage of user queries that can be serviced by semantic search instead of LLM. -2. Computational cost of running the LLM inference. -3. Computational cost of running the semantic search. - -With the savings equation being: - - -``` - Computational_savings = (% of queries) * (LLM_cost – search_cost). -``` - - -This type of architecture makes sense in a few situations. One is if your system has common queries with many repeat questions. Another is large-scale systems with hundreds of thousands of incoming queries, where small percentage savings add up to meaningful changes. Lastly, if your LLM inference is very computationally expensive compared to the search cost, particularly with larger parameter models. - -The final optimization approach is transitioning from server to serverless. - - -### Optimization 3: Serverless approach - -Using serverless architectures are popular for many reasons, one being only paying for active compute time, and eliminating costs with idle servers. Idling servers require a non-trivial amount of power to keep on, wasting energy while waiting. - -This cost efficiency translates into being an inherently more environmentally friendly architecture, as it reduces wasteful energy consumption. 
Further, multiple applications share underlying physical infrastructure, improving resource efficiency. - -To set up your own serverless chatbot, you need to first containerize the quantized Llama-3.1-8b with TorchChat, TorchAO, and Arm KleidiAI optimizations with a python script containing a Lambda entry function `lambda_handler`. One deployment option is to upload your container to AWS ECR and attach the container to your Lambda function. Then set up an API Gateway WebSocket or similar to interact with your Lambda through an API. - -There are two notable limitations to using a serverless architecture to host your LLM, the first being token generation speed. Recall that the server-based approach delivered about 25 tokens/second with KleidiAI optimizations. The serverless approach delivers an order of magnitude slower, which we measured at around about 2.5 tokens/second. This limitation mainly results from Lambda functions deploying onto Graviton2 servers. When deployment moves to CPUs with more SIMD channels, like Graviton3 and Graviton4, the tokens/second should increase over time. Learn more about architecture optimizations introduced in Graviton3 via the [Arm Neoverse-V1 CPU here](https://developer.arm.com/Processors/Neoverse%20V1). - -This slower speed restricts the viable use cases for serverless LLM architectures, but there are certain cases where this can be seen as an advantage. In our use cases of interactive storytelling, slowly revealing information creates a sense of immersion, building anticipation and mimicking real-time narration. Other use cases include: - - - -* Guided meditation apps with slow, relaxing word delivery -* Virtual friend engaging in thoughtful conversation, or a therapeutic conversation. -* Poetry generation or interactive art to slow delivery creating a contemplative aesthetic. - -Users may have a better experience with slower token generation in the right applications. When prioritizing a more sustainable solution, restrictions end up becoming strengths. As an analogy, a common critique of modern movies today is that their overreliance on visual effects leads to fewer compelling storylines vs older movies. The cost restrictions of VFX meant older movies had to craft captivating dialog, leveraging skillful camera angles and character positioning to fully engage viewers. Similarly, focusing on sustainable AI architectures can lead to more engaging, immersive experiences when done thoughtfully. - -The second serverless limitation on LLM inferences is the cold-start time of about 50 seconds. If implemented poorly, a user waiting 50 seconds with no alternative will likely leave the app. You can turn this limitation into a feature in our Wicked-based experience with several design tricks: - - - -* Create a “prologue experience” where you guide users through hard-coded questions and answers, priming them for where they will land in Emerald City and collecting input to shape their upcoming experience. -* Make the waiting period a countdown timer, revealing hard-coded text snippets of the story or world-building. A character, like the wizard, could communicate with the user with fragmented lines to build suspense and prime the user into the right mindset. -* Create an audio intro with music from the movie or musical, along with rotating visuals to draw users into the atmosphere of the Wicked world. - - -### Thinking outside the box - -Implementing a sustainability-minded solution architecture includes and goes beyond optimizing your AI inferences. 
Understand how users will interact with your system, and right-size your implementation accordingly. Always optimizing for fast tokens per second or time to first token will hide opportunities for engaging features. - -With that said, you should be leveraging straightforward optimizations when possible. Using TorchAO and Arm KleidiAI micro-kernels are great ways to speed up your LLM chatbot. By combining creative solution architectures and optimizing where possible, you can build more sustainable LLM-based applications. Happy coding! \ No newline at end of file diff --git a/_posts/2025-02-26-accelerating-generative-ai-segment-anything-2.md b/_posts/2025-02-26-accelerating-generative-ai-segment-anything-2.md deleted file mode 100644 index 87751067df7b..000000000000 --- a/_posts/2025-02-26-accelerating-generative-ai-segment-anything-2.md +++ /dev/null @@ -1,1342 +0,0 @@ ---- -layout: blog_detail -title: "Accelerating Generative AI with PyTorch: Segment Anything 2 - Fast and furious inference with low latency and fast cold starts" ---- - -This post is a follow-up to our [first entry in the multi-series blog focused on how to accelerate generative AI models](https://pytorch.org/blog/accelerating-generative-ai/) with pure, native PyTorch and a focus on latency and elastic scalability. We use torch.compile and torch.export to create highly optimized low latency versions of SAM2 that can be quickly scaled up on new instances. - -By utilizing AOTInductor's (AOTI) ahead-of-time compilation via torch.export, reduced precision, batched prompts and GPU preprocessing we observe up to **13x improvement in p90 execution latency** and **queue times compared to regular eager mode PyTorch**. - -We calculate our final results and demonstrate the improvement in a realistic deployment on auto-scaling cloud infrastructure from [Modal](https://modal.com). - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
**Execution latency (ms / improvement)**

| | p50 eager float32 | p50 AOTI float16 | p90 eager float32 | p90 AOTI float16 |
|---|---|---|---|---|
| AMG | 741 | 112 (6.6x) | 1140 | 176 (6.5x) |
| SPS | 98 | 20 (4.9x) | 130 | 28 (4.6x) |
| MPS | 269 | 38 (7.1x) | 714 | 52 (13.7x) |

**Queue time (ms / improvement)**

| | p50 eager float32 | p50 AOTI float16 | p90 eager float32 | p90 AOTI float16 |
|---|---|---|---|---|
| AMG | 201 | 41 (4.9x) | 815 | 327 (2.6x) |
| SPS | 31 | 33 (0.9x) | 441 | 49 (9.0x) |
| MPS | 40 | 37 (1.1x) | 942 | 75 (12.6x) |
        - - - -## The Tasks - -The first post focused on processing a small number of varying prompts (points of interest) per image. These points represented the center points of the ground truth masks. For this post, we'll now focus on a broader set of tasks. Single prompt segmentation (SPS), multi prompt segmentation (MPS), automatic mask generation (AMG) which generates the full set of masks for the input image without a given set of prompts. The first post focused on MPS only. - -![comparison of 3 images](/assets/images/accelerating-generative-ai-2.jpg){:style="width:100%"} - - - -The little star in the image represents a user prompt. For AMG there are no prompts and masks are filtered down heuristically from a dense grid of initial candidate prompts (guesses). For SPS and MPS user prompts are derived from the center points of AMG masks. For SPS we choose the mask with the largest area. - -**Note that SAM2 uses a different backbone than SAM1. In particular, we only consider the largest and most accurate sam2.1_hiera_large backbone for this blog.** - -We aggregate the scripts needed to reproduce the results in [torchao's example folder](https://github.com/pytorch/ao/tree/main/examples/sam2_amg_server) and incrementally upstream the more stable parts of the [changes to the SAM2 model in torchao](https://github.com/pytorch/ao/tree/main/torchao/_models/sam2) to the main [SAM2](https://github.com/facebookresearch/sam2) repository. So if you are interested in taking a look at the cutting-edge variant or would like to contribute experimental features, please don't hesitate to reach out to the torchao repository and team. For the more stable and latest model version, please head on over to SAM2 directly. - - -## Overview - -We categorize the changes presented here into two. **Fast** changes constrain themselves to techniques that are not meant to affect model accuracy. **Furious** changes sacrifice some numerical accuracy for additional speed by making use of approximations such as low-precision data types. - -Approximations may slightly lower precision metrics in favor of significantly improved performance while still passing an end-to-end check based on mean intersection over union (mIoU). - -To measure the performance improvements we processed 1000 images, which were selected at random from the SAM2 validation dataset. We look at the p50 and p90 latency per image. To measure accuracy we consider the mIoU. Most notably for the AMG task we also define a fail count metric. We consider a comparison failed if the **number of masks** differs. This turns out to be a fairly unstable quantity and we can see that the other tasks are not as sensitive to small numeric changes as AMG. - - -## The Setup - -We are running the offline experiments on a regular H100 devserver, which is a fairly beefy and performant machine. - -However, we try to look at these tasks with realistic constraints. In particular, we would like to emulate a server-side inference environment. That means we don't use DataLoader to hide the latency of image preprocessing or decoding routines. - -For the latency calculations we include decoding, segmentation and conversion of masks to a dictionary of run-length encoded masks. Or put differently, we exclude loading the images into in-memory host bytearrays and storing the resulting dictionaries as json files on disk. This is meant to emulate a more realistic setting. - -More concretely, consider the code below for the routines we include in our measurements. 
For any task `gen_masks` produces a batched bool Tensor bitmask that represents the corresponding object masks. We then compress this bitmask into a run length encoded (rle) format that can be used to transfer back the results from a remote server much more efficiently. - - -``` -image_tensors = decode_img_bytes(...) -masks = gen_masks(image_tensors, ...) -rle_dicts = [rle_dict_from_masks(m) for m in masks] -``` - - - -## Optimizations - - -### ao: eager code optimizations - -The most effective tool for this work is the PyTorch autograd profiler combined with `record_function`. To build this software, we've used the profiler repeatedly to observe the program and confirm the effectiveness of any changes. It's also important to keep in mind that the profiler itself has overhead. The more data you collect, such as stack traces, the more overhead you introduce, which might skew the collected trace. But it is excellent to find synchronization points, space between kernels and GPU kernels that take a long time. - -GPU traces help you understand bottlenecks that are not necessarily easily addressed by compile. We found that AutomaticMaskGeneration in particular is dominated by the data structure used to store the masks and by the routine used to convert the masks to a run-length encoded compressed format. We also found a large part of AMG performance is dominated by the large number of masks created as a single batch. Sometimes candidate masks can be filtered down to fewer candidates earlier in the postprocessing stage by reordering operations. This in turn significantly speeds up the later operations. - -In order to confirm the accuracy of our implementation we first compare without any changes in settings and using float32 precision. We see that mIoU is unchanged and the masks match perfectly when using the exact same settings. This means that these eager mode changes did not affect the accuracy of these tasks. - -AMG - - - - - - - - - - - - - - - - - - - - - - - - -
| | p50 latency (ms) | p90 latency (ms) | memory (MiB) | mIoU / fail count |
|---|---|---|---|---|
| Baseline | 864 | 1144 | 4350 | reference |
| AO | 693 | 786 | 4010 | 1 / 0 |
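As an aside, the profiling workflow described above is easy to reproduce. The sketch below is a minimal illustration (not the exact harness used for these measurements) that wraps the `gen_masks` and RLE-conversion steps from the setup pseudo-code in `record_function` ranges:

```
from torch.profiler import profile, record_function, ProfilerActivity

# gen_masks, rle_dict_from_masks and image_tensors are the placeholders from
# the setup section above; substitute your own callables and inputs.
with profile(
    activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
    record_shapes=True,
) as prof:
    with record_function("gen_masks"):
        masks = gen_masks(image_tensors)
    with record_function("rle_encode"):
        rle_dicts = [rle_dict_from_masks(m) for m in masks]

# Sorting by GPU time highlights long-running kernels, gaps between kernels
# and synchronization points.
print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=20))
```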
        - - - -### ao: batching prompts - -Another lossless performance optimization that we were able to apply is batching the user input prompt calculations. When optimizing for latency at batch size 1 on a server-grade GPU such as an H100 we are often left with a lot of spare memory. We can easily trade off that memory for more performance by processing more points of interest (also called user prompts) at once. Remember that SAM2 is split into two parts: First the backbone (image encoder), second the prediction and decoding of masks based on a set of user prompts / points of interest. It is the second part where we may expect a larger or even varying number of inputs and it is this second part where we apply batching. - -This causes a large increase in memory, but also much better latency. The baseline generates one mask per prompt in a loop. For AMG the baseline processes 64 prompts at once and all that is needed is to change it to 1024, which is the number of candidate prompts generated. For SPS we process one prompt at a time, but it's still included below for completeness. - -AMG - - - - - - - - - - - - - - - - - - - - - - - - -
| | p50 latency (ms) | p90 latency (ms) | memory (MiB) | mIoU / fail count |
|---|---|---|---|---|
| Baseline | 864 | 1144 | 4350 | reference |
| AO + batching | 613 | 706 | 33786 | 0.9999995 / 0 |
        - - -SPS - - - - - - - - - - - - - - - - - - - - - - - - -
| | p50 latency (ms) | p90 latency (ms) | memory (MiB) | mIoU |
|---|---|---|---|---|
| Baseline | 116 | 181 | 1337 | reference |
| AO | 110 | 170 | 1339 | 1 |
        - - -MPS - - - - - - - - - - - - - - - - - - - - - - - - -
| | p50 latency (ms) | p90 latency (ms) | memory (MiB) | mIoU |
|---|---|---|---|---|
| Baseline | 276 | 681 | 1337 | reference |
| AO + batching | 126 | 225 | 8021 | 0.9999992 |
        - - -As a technical side note: Most notably to enable batching for MPS, and to avoid a significant manual rewrite of the code base to support multiple prompts at the same time, we used a Tensor subclass we call MapTensor. A MapTensor allows us to pass a batch of N prompts, but have it advertise a batch size of 1. Any operation is then automatically broadcast to the wrapped Tensor and propagated throughout the prediction part of the model. This works because individual prompt predictions are independent of one another. This is very similar to torch.vmap. - - -``` -center_points_torch = to_map_tensor(center_points_torch) -center_points_label_torch = to_map_tensor(center_points_label_torch) -masks, scores, _ = mask_generator.predictor.predict( - point_coords=center_points_torch, - point_labels=center_points_label_torch, - multimask_output=True, - return_logits=False, - return_type="torch", -) -# Unwrapping MapTensor -masks = masks.elems -scores = scores.elems -``` - - - -### fast: fullgraph compilation - -Just as with our first post, we first remove GPU syncs and graph breaks to make use of fullgraph compiled model code with max-autotune kernels where appropriate. After some rewriting, we are able to compile the image encoder and the prediction of masks. - -We run the experiments twice to get a sense of the overhead due to compilation. We run it once in an environment with an empty TORCHINDUCTOR_CACHE_DIR and then again while ingesting the artifacts from the previous run. In particular, auto-tuning can take a long time and happens on the first call in a pristine environment. We call the second run "warm". The first iteration is typically expected to be slow due to various other related initialization processes, but compile increases it significantly, even if an existing cache is used and the same exact shapes are fed again. Having said that, an overhead of a few seconds in a warm environment is often still stomachable on the very first call. - -Most of these drawbacks can be mitigated and compiling causes a significant improvement in latency and reduction in memory. - -AMG - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
| | p50 latency (ms) | p90 latency (ms) | memory (MiB) | mIoU / fail count | first iteration (ms) |
|---|---|---|---|---|---|
| AO + batching | 613 | 706 | 33786 | 0.9999995 / 0 | 1125 |
| + compile (cold) | 423 | 513 | 29349 | skipped | 404866 |
| + compile (warm) | 439 | 530 | 29349 | 0.994 / 190 | 8544 |
The number of masks produced per image can vary slightly when using automatic mask generation. There is ambiguity in the number of masks per object the model may produce. For example, a car may be subdivided into frames, windows and doors, or treated as a whole. When a modification causes the number of masks to change, we consider the comparison failed, and we only calculate the mIoU on masks with an exact match. This does not apply to the other tasks. We found that the number of masks generated is very sensitive to small numerical changes. The other tasks use the same code, and MPS in particular can help us further verify correctness.

SPS
| | p50 latency (ms) | p90 latency (ms) | memory (MiB) | mIoU | first iteration (ms) |
|---|---|---|---|---|---|
| AO | 110 | 170 | 1339 | 1 | 562 |
| + compile (cold) | 102 | 158 | 1343 | skipped | 319954 |
| + compile (warm) | 100 | 160 | 1302 | 0.9999 | 8947 |
        - - -MPS - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
| | p50 latency (ms) | p90 latency (ms) | memory (MiB) | mIoU | first iteration (ms) |
|---|---|---|---|---|---|
| AO + batching | 126 | 225 | 8021 | 0.9999992 | 504 |
| + compile (cold) | 129 | 215 | 8021 | skipped | 333308 |
| + compile (warm) | 113 | 213 | 8021 | 0.998 | 8617 |
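For orientation, the compile step itself boils down to a couple of `torch.compile` calls along the lines of the sketch below. This is an illustration rather than the exact integration: the attribute names on `predictor` are placeholders, and the real rewrites that remove graph breaks live in the torchao SAM2 scripts.

```
import torch

# fullgraph=True errors out on graph breaks instead of silently falling back
# to eager, and mode="max-autotune" enables the more aggressive kernel search
# responsible for the long cold-start times in the tables above.
predictor.model.image_encoder = torch.compile(
    predictor.model.image_encoder,
    mode="max-autotune",
    fullgraph=True,
    dynamic=False,
)
predictor.predict_masks = torch.compile(
    predictor.predict_masks,
    mode="max-autotune",
    fullgraph=True,
    dynamic=False,
)
```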
        - - - -### furious: TF32, float16 and GPU preprocessing - -We found that using float16 is the right level of precision for a few significant subcomponents of the model. In particular, the image encoder and mask decoder weights can be converted entirely to float16. We can also use TensorFloat32 precision for the remaining float32 matrix operations. It should be possible to further reduce the precision and we may address this in a future post. We also move image preprocessing such as image normalization onto the GPU with the furious mode. We can't use GPU decoding (nvJPEG) routines, because the differences are too significant and the model suffers from significant degradation in mIoU, so image decoding still happens on the CPU. - -AMG - - - - - - - - - - - - - - - - - - - - - - - - -
| | p50 latency (ms) | p90 latency (ms) | memory (MiB) | mIoU / fail count |
|---|---|---|---|---|
| AO + batching + compile (warm) | 439 | 530 | 29349 | 0.994 / 190 |
| + furious | 165 | 240 | 28335 | 0.978 / 306 |
        - - -This causes a significant degradation in mIoU for the AMG task, but doesn't affect the other tasks. After an in-depth investigation, we still chalk this up to numerical instability and reordering of operations. More work is needed to further investigate this and it may not be interesting to run the AMG task in lower precision. The other tasks, however, benefit drastically in latency with minimal changes in mIoU. - -SPS - - - - - - - - - - - - - - - - - - - - - - - - -
| | p50 latency (ms) | p90 latency (ms) | memory (MiB) | mIoU |
|---|---|---|---|---|
| AO + compile (warm) | 100 | 160 | 1302 | 0.9999 |
| + furious | 32 | 63 | 861 | 0.9997 |
        - - -MPS - - - - - - - - - - - - - - - - - - - - - - - - -
| | p50 latency (ms) | p90 latency (ms) | memory (MiB) | mIoU |
|---|---|---|---|---|
| AO + batching + compile (warm) | 113 | 213 | 8021 | 0.998 |
| + furious | 36 | 64 | 4222 | 0.997 |
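To make the furious settings concrete, the core of it is a handful of global and per-module switches, sketched below. The submodule names on `predictor` are placeholders and the normalization constants are the standard ImageNet statistics, shown purely for illustration; the actual wiring lives behind a flag in the torchao scripts.

```
import torch

# Allow TensorFloat32 for the remaining float32 matmuls and convolutions.
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True

# Convert the heavy submodules to float16 (placeholder attribute names).
predictor.model.image_encoder = predictor.model.image_encoder.half()
predictor.model.sam_mask_decoder = predictor.model.sam_mask_decoder.half()

# Move normalization onto the GPU; image decoding itself stays on the CPU.
# image_tensor is assumed to be a decoded CHW image tensor on the host.
mean = torch.tensor([0.485, 0.456, 0.406], device="cuda").view(3, 1, 1).half()
std = torch.tensor([0.229, 0.224, 0.225], device="cuda").view(3, 1, 1).half()
image_tensor = (image_tensor.to("cuda").half() - mean) / std
```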
        - - - -### AOTInductor's (AOTI) ahead-of-time compilation via torch.export - -When scaling elastically it often is not possible to accommodate long startup times. That means the first iteration cannot be slow, but we must quickly deliver results. This is when torch.compile's current compilation overhead can get in the way. To address this we can use AOTInductor's (AOTI) ahead-of-time compilation via torch.export. AOTI lets us compile the model on a representative input and store the resulting code in a binary that is quick to load and run. - -AOTI via torch.export is a new feature and we currently can't export everything that is compilable. We've been able to export the image encoder for all tasks but have only been able to export the mask prediction for the AMG and SPS tasks due to varying prompts. torch.export also supports dynamic shapes, but we need to invest a bit more time to prepare the code for it. - -AMG: AO + batching + furious - - - - - - - - - - - - - - - - - - - - - - - - - - - -
| | p50 latency (ms) | p90 latency (ms) | memory (MiB) | mIoU / fail count | first iteration (ms) |
|---|---|---|---|---|---|
| + compile (warm) | 165 | 240 | 28335 | 0.978 / 306 | 10341 |
| + load export (cold) | 162 | 233 | 27927 | 0.974 / 308 | 906 |
        - - -SPS: AO + furious - - - - - - - - - - - - - - - - - - - - - - - - - - - -
| | p50 latency (ms) | p90 latency (ms) | memory (MiB) | mIoU | first iteration (ms) |
|---|---|---|---|---|---|
| + compile (warm) | 32 | 63 | 861 | 0.9997 | 7989 |
| + load export (cold) | 35 | 66 | 1686 | 0.9997 | 763 |
        - - -Note that loading the exported model significantly increases memory. It likely only increases peak memory utilization, because initialization really needs to be delayed before loading up an exported model to avoid having twice the weights in memory at once. This is something we could address, but the memory consumption is nowhere near the limit. We don't see an increase in the other tasks, because AMG and MPS peak memory is dominated by processing batches of masks. One way to reduce that could be to operate on masks in the rle format (or some other sparse format) earlier on, but for now, there is no reason for this given the current memory consumption and focus on latency. - -MPS: AO + batching + furious - - - - - - - - - - - - - - - - - - - - - - - - - - - -
| | p50 latency (ms) | p90 latency (ms) | memory (MiB) | mIoU | first iteration (ms) |
|---|---|---|---|---|---|
| + compile (warm) | 36 | 64 | 4222 | 0.997 | 9626 |
| + load export (cold) | 43 | 72 | 3813 | 0.997 | 747 |
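As a rough sketch, the export-then-load workflow looks like the following. The exact entry points have been evolving across recent PyTorch releases and `predictor` is again a placeholder, so treat this as illustrative and refer to the torchao export scripts for the version actually used here.

```
import torch

# Export the image encoder on a representative input and package the compiled
# artifact ahead of time (on a build machine, not at serving time).
example_input = (torch.randn(1, 3, 1024, 1024, dtype=torch.float16, device="cuda"),)
exported = torch.export.export(predictor.model.image_encoder, example_input)
torch._inductor.aoti_compile_and_package(exported, package_path="image_encoder.pt2")

# On a fresh instance, load the precompiled package instead of recompiling.
compiled_encoder = torch._inductor.aoti_load_package("image_encoder.pt2")
features = compiled_encoder(*example_input)
```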
        - - -Using export by itself doesn't seem to benefit from extensive warmup and can be run in a pristine new inductor cache directory. But again, we do not evict the CUDA cache or other caches. In the section on Modal, we are running some of these experiments in a pristine environment. - -When only processing 1000 images in a new process, using export can really be worth it to save out on compile and other cold start overhead. - - -### bonus: More GPU preprocessing - -At this point, the latency is fairly low. In particular, for the SPS and MPS tasks we are processing at around 30ms to 40ms. Let's bring back the pseudo-code from the setup section again. - - -``` -image_tensors = decode_img_bytes(...) -masks = gen_masks(image_tensors, ...) -rle_dicts = [rle_dict_from_masks(m) for m in masks] -``` - - -Further profiling showed that at this point `decode_img_bytes` takes about 10ms. In particular, it uses torchvision's ToTensor transform to convert from a numpy Tensor to a scaled, float32 torch.Tensor. The bytes passed to ToTensor have already been decoded and converted to an numpy ndarray. By slightly rewriting ToTensor, using torchvision's v2 API and moving the uint8 decoded smaller integer Tensor to GPU first before scaling, we can gain another 10ms in latency. Without including `decode_img_bytes` in our analysis we would have missed this opportunity that has real-world impact on server-side inference. - - -``` -image_tensor = torch.from_numpy(image_tensor) -image_tensor = image_tensor.permute((2, 0, 1)) -image_tensor = image_tensor.cuda() -image_tensor = v2.ToDtype(torch.float32, scale=True)( image_tensor) -``` - - -Note in particular that using pinned memory to perform asynchronous data transfers doesn't apply, since the time it takes to move the Tensor into pinned memory isn't worth the gain in asynchronicity for this data movement. For future work, we might want to explore further improvements here by using more advanced direct memory transfer techniques. - -AMG: AO + batching + furious - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
| | p50 latency (ms) | p90 latency (ms) | memory (MiB) | mIoU / fail count | first iteration (ms) |
|---|---|---|---|---|---|
| + load export (cold) | 162 | 233 | 27927 | 0.974 / 308 | 906 |
| + load export (warm) | 157 | 230 | 27927 | 0.974 / 308 | 799 |
| + load export (warm) + preproc | 136 | 208 | 27950 | 0.977 / 311 | 908 |
        - - -SPS: AO + furious - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
| | p50 latency (ms) | p90 latency (ms) | memory (MiB) | mIoU | first iteration (ms) |
|---|---|---|---|---|---|
| + load export (cold) | 35 | 66 | 1686 | 0.9997 | 763 |
| + load export (warm) | 31 | 63 | 1686 | 0.9997 | 683 |
| + load export (warm) + preproc | 19 | 25 | 1711 | 0.9997 | 658 |
        - - -MPS: AO + batching + furious - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
| | p50 latency (ms) | p90 latency (ms) | memory (MiB) | mIoU | first iteration (ms) |
|---|---|---|---|---|---|
| + load export (cold) | 43 | 72 | 3813 | 0.997 | 747 |
| + load export (warm) | 53 | 81 | 3813 | 0.997 | 807 |
| + load export (warm) + preproc | 31 | 41 | 3837 | 0.997 | 671 |
        - - -This small change has a significant impact on the SPS and MPS task. - - -## Deploying on Modal - -Finally, we deployed our optimized inference onto [Modal](https://modal.com), a serverless infrastructure provider, to demonstrate that the benefits of these optimizations can be realized in a more realistic deployment setting. - -In particular, compilation and AOTI via torch.export requires extra work. In a naïve deployment that work might be added to every single inference execution, adding latency that dwarfs any improvements from a faster model. This is particularly challenging with elastic or autoscaling infrastructure, where replicas of our inference service need to be regularly and automatically created and destroyed. - -We share a deployment script in the torchao repository ([cli_on_modal.py](https://github.com/pytorch/ao/tree/main/examples/sam2_amg_server)) to demonstrate one pattern for an elastic deployment. We build the exported models ahead of time and then upload them to [distributed storage](https://modal.com/docs/guide/volumes). Relative to eager execution, this adds a bit of extra work when replicas spin up since they need to read this data over a network, but this is far less costly than compilation or export. - -We benchmarked this deployment with a large batch inference workload: sending 1000 images for concurrent processing. The deployment scales up to ten replicas on ten GPUs at peak and scales down to zero GPUs when inactive. - -First, let’s look at the execution latencies. - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
**Execution latency (ms / improvement)**

| | p50 eager float32 | p50 AOTI float16 (Modal) | p50 AOTI float16 (Offline) | p90 eager float32 | p90 AOTI float16 (Modal) | p90 AOTI float16 (Offline) |
|---|---|---|---|---|---|---|
| AMG | 741 | 112 (6.6x) | 136 (5.4x) | 1140 | 176 (6.5x) | 208 (5.5x) |
| SPS | 98 | 20 (4.9x) | 19 (5.2x) | 130 | 28 (4.6x) | 25 (5.2x) |
| MPS | 269 | 38 (7.1x) | 31 (8.7x) | 714 | 52 (13.7x) | 41 (17.4x) |
        - - -We notice that execution latencies on Modal and Offline are fairly close, especially relative to the baseline, indicating that optimizing the deployment offline was a reasonable proxy for optimizing the deployment directly. - -In addition to execution latency, our batch workload has queueing time, since there are fewer replicas than there are inputs, and so some inputs have to wait in line. - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
**Queue time (ms / improvement)**

| | p50 eager float32 | p50 AOTI float16 | p90 eager float32 | p90 AOTI float16 |
|---|---|---|---|---|
| AMG | 201 | 41 (4.9x) | 815 | 327 (2.6x) |
| SPS | 31 | 33 (0.9x) | 441 | 49 (9.0x) |
| MPS | 40 | 37 (1.1x) | 942 | 75 (12.6x) |
        - - -Even though the queueing system provided by the infrastructure is unchanged, the queue latencies also decrease when we use our optimized model – in the p90 case by a factor of 2 to 12. That’s because when we finish previous inputs faster (from reduced execution latency) we can pull our next inputs sooner (reducing their queueing time). - -If you’re interested in optimizing SAM2 inference or deployments further, don’t hesitate to reach out to us at the [torchao repository](https://github.com/pytorch/ao)! - - -## Conclusions - -We rewrote Meta's original SAM2 in pure PyTorch with little loss of accuracy and a strong focus on latency. We deployed our optimized inference onto [Modal](https://modal.com), a serverless infrastructure provider, to demonstrate that the benefits of these optimizations can be realized in a more realistic deployment setting. - -By utilizing AOTInductor's (AOTI) ahead-of-time compilation via torch.export, reduced precision, batched prompts and GPU preprocessing we observe up to 13x improvement in p90 execution latency and queue times compared to regular eager mode PyTorch. - -With elastic or autoscaling infrastructure, where replicas of our inference service need to be regularly and automatically created and destroyed, a naïve deployment of torch.compile can add work to inference execution that dwarfs any improvements from a faster model. By utilizing AOTInductor's (AOTI) ahead-of-time compilation via torch.export, we are able to upload exported models ahead of time and read this data over a network, which enables us to get the benefits of compilation without significantly increased work. - -For more details on how to reproduce the data in this blog post, [check out the experiments folder of torchao](https://github.com/pytorch/ao/tree/main/examples/sam2_amg_server). Please don't hesitate to contact us or [open an issue](https://github.com/pytorch/ao/issues/new) if you run into any technical issues. \ No newline at end of file diff --git a/_posts/2025-03-04-submit-to-speak.md b/_posts/2025-03-04-submit-to-speak.md deleted file mode 100644 index 89d9907b682d..000000000000 --- a/_posts/2025-03-04-submit-to-speak.md +++ /dev/null @@ -1,79 +0,0 @@ ---- -layout: blog_detail -title: "📣 Submit to Speak at PyTorch Conference + Save on Registration" ---- - -Step into the Future of AI at PyTorch Conference 2025. - - -![banner ad for conference](/assets/images/submit-to-speak/fg1.png){:style="width:100%"} - - -The Call for Proposals for **PyTorch Conference 2025** is officially open! - -**Join us in San Francisco from October 22–23, 2025,** to showcase your expertise and innovations with PyTorch—the industry-leading, open-source machine learning framework powering innovations from bare-metal infrastructure to sophisticated application and agent layers. This is your opportunity to share insights, breakthroughs, and case studies with a global audience of AI and Generative AI practitioners, researchers, and developers. - -![people watching presentation at conference](/assets/images/submit-to-speak/fg2.jpg){:style="width:100%"} - - -Submit your proposals and prepare to engage, learn, and network alongside some of the brightest minds in the AI/ML community. 
We’re seeking sessions, Birds of a Feather discussions, lightning talks, and poster sessions on the following topics: - -* Core PyTorch Framework -* PyTorch on Accelerator Hardware -* PyTorch Ecosystem and Tools -* AI Applications and Use Cases -* AI in Research and Academia -* AI in Industry and Enterprise Applications -* AI Infrastructure and Scalability -* Ethical AI, Governance, and Regulation -* Training, Fine-Tuning, and Alignment -* Inference, Deployment, and Serving -* Performance Measurement and Benchmarking -* Data Engineering and Management for AI -* Generative AI and Large Language Models (LLMs) -* Model Optimization and Efficiency -* Open Source Collaboration, Education and Community Building -* Edge AI and On-Device -* DL Compilers and Kernel Authoring - - -
**Learn more and submit your talk by Sunday, June 1, at 11:59 PDT!**

**SUBMIT TO SPEAK**
        - - ---- - -![people arriving at conference](/assets/images/submit-to-speak/fg3.jpg){:style="max-width:300px; display: block; float: right;"} - -**Save up to USD$500 with Super Early Bird Pricing!** - -* Reserve your pass by **11:59 PM PDT on March 21** and score Super Early Bird pricing for just **USD$499**. That’s a savings of up to USD$500! -* Student or faculty? Learn more about our **[discounted academic rate](https://events.linuxfoundation.org/pytorch-conference/register/#registration-rates)**. -* Need help covering travel costs? We offer discretionary travel funding for those community members who would otherwise not be able to attend. **[Learn more](https://events.linuxfoundation.org/pytorch-conference/register/#additional-information)**. - - - ---- - - -**Become a Sponsor at PyTorch Conference 2025!** - -Seize your opportunity to influence the future of Generative AI and Machine Learning by sponsoring PyTorch Conference 2025. PyTorch is at the forefront of innovation—empowering rapid experimentation, flexible model development, and efficient deployment into production environments with its powerful, versatile ecosystem of tools and thriving community of dedicated users. - -As a sponsor, you'll gain more than visibility; you'll strategically position your organization at the heart of a vibrant, global AI/ML ecosystem. Connect directly with **3,000+** expert attendees, researchers, engineers, and decision-makers, and actively shape the conversations driving the next generation of AI advancements. - - - -For more details on CFP submissions, registration, and sponsorship, visit **the** [PyTorch Conference Website](https://events.linuxfoundation.org/pytorch-conference/). \ No newline at end of file diff --git a/_posts/2025-03-05-activation-checkpointing-techniques.md b/_posts/2025-03-05-activation-checkpointing-techniques.md deleted file mode 100644 index 782722e96681..000000000000 --- a/_posts/2025-03-05-activation-checkpointing-techniques.md +++ /dev/null @@ -1,233 +0,0 @@ ---- -layout: blog_detail -title: "Current and New Activation Checkpointing Techniques in PyTorch" ---- - -As models scale in depth, batch size, and sequence length, etc, activation memory becomes an increasingly significant contributor to the overall memory usage. To help address this, PyTorch provides utilities for [activation checkpointing](https://pytorch.org/docs/stable/checkpoint.html), which reduce the number of saved tensors by recomputing them when needed, trading off memory usage for additional compute. - -In this post, we’ll walk through the basics of what activation memory is, the high-level ideas behind existing activation checkpointing techniques, and also introduce some newer techniques that aim to improve flexibility and provide more optimization/automation out of the box. - -As we look at these techniques, we'll compare how these methods fit into a speed vs. memory trade-off diagram and hopefully provide some insight on how to choose the right strategy for your use case. - -*(If you prefer to jump straight to the new APIs, please skip ahead to the “Selective Activation Checkpoint” and “Memory Budget API” sections below.)* - -![flow diagram](/assets/images/activation-checkpointing-techniques/fg1.png){:style="width:100%"} - - ---- - - -## Activation Memory Basics - -By default, in eager mode (rather than using `torch.compile`), PyTorch’s autograd preserves intermediate activations for backward computation. 
For example, if you call `sin` on a tensor `x` during the forward pass, autograd must remember `x` to compute `cos(x)` during backward. - - -![flow diagram](/assets/images/activation-checkpointing-techniques/fg2.png){:style="max-width:400px; display: block; margin-left: auto; margin-right: auto"} - - -If this tensor `x` is saved at the beginning of the forward pass, it remains in memory throughout both the forward and backward phases. It can only be cleared after it is used to compute the gradient, which happens at the end of the backward pass (due to the reverse order of execution). - -Thus, as you proceed through the forward pass and perform more and more operations, you accumulate more and more activations, resulting in more and more activation memory until it (typically) reaches its peak at the start of backward (at which point activations can start to get cleared). - -![flow diagram](/assets/images/activation-checkpointing-techniques/fg3.png){:style="width:100%"} - - -*In the diagram above, the orange boxes represent operations, black arrows represent their tensor inputs and outputs. The black arrows that cross over the right represent tensors that autograd saves for backward.* - -A useful way to visually organize this default saving behavior in eager as well as the techniques we're about to introduce is based on how they trade off speed versus memory. - - -![flow diagram](/assets/images/activation-checkpointing-techniques/fg4.png){:style="width:100%"} - - -The ideal place to be on this diagram is the top-left, where you have "high" speed but also low memory usage. - -We begin by putting the default saving behavior on the **top-right** (for reasons we'll explain in more detail as we introduce more points for other techniques). - - ---- - - -## Activation Checkpointing (AC) - -**[Activation checkpointing (AC)](https://pytorch.org/docs/stable/checkpoint.html)** is a popular technique to reduce memory usage in PyTorch. - -During forward, any operations performed inside the AC'd region do not save tensors for backward. (Only the inputs to the function are saved.) During backward, the intermediate activations needed for gradient computation are rematerialized by running the function a second time. - -![flow diagram](/assets/images/activation-checkpointing-techniques/fg5.png){:style="width:100%"} - - -*In the diagram (right), the black box shows where activation checkpointing is applied. Compared to the default eager approach (left), this setup results in fewer tensors being saved (1 versus 3).* - -Applying AC on the right parts of the model has the effect of reducing peak memory, because the intermediate activations are no longer materialized in memory when the memory usage typically peaks (at the beginning of backward). - -On the speed-versus-memory tradeoff diagram, AC is plotted on the **bottom-left.** Relative to eager mode, it reduces the amount of memory saved for backward but comes with an added cost in compute due to recomputation. - -![flow diagram](/assets/images/activation-checkpointing-techniques/fg6.png){:style="width:100%"} - - -Note that AC’s speed–memory tradeoff /can/ be adjusted by selecting which parts of the forward pass to checkpoint and by defining how many checkpoint regions to use. However, implementing these changes may require modifying your model’s structure and can be cumbersome depending on how your code is organized. 
For the purposes of this diagram, we assume only one region is checkpointed; under this assumption, AC appears as a single point on the tradeoff diagram. - -Also note that “memory” here does not refer to peak memory usage; rather, it indicates the how much memory is saved for backward for a fixed region. - - ---- - - -## torch.compile and min-cut partitioner - -Another notable approach to keep in mind is **torch.compile** (introduced in PyTorch 2.0). Like activation checkpointing, `torch.compile` can also perform some level of recomputation under the hood. Specifically, it traces the forward and backward computations into a single joint graph, which is then processed by a [“min-cut” partitioner](https://dev-discuss.pytorch.org/t/min-cut-optimal-recomputation-i-e-activation-checkpointing-with-aotautograd/467). This partitioner uses a min-cut/max-flow algorithm to split the graph such that it minimizes the number of tensors that need to be saved for backward. - -At first glance, this might sound a lot like what we want for activation memory reduction. However, the reality is more nuanced. By default, the partitioner’s primary goal is to reduce runtime. As a result, it only recomputes certain types of operations—primarily simpler, fusible, and non-compute-intensive ops (like pointwise ops). - -Placing "compile" on the speed-versus-memory tradeoff diagram... - -![flow diagram](/assets/images/activation-checkpointing-techniques/fg7.png){:style="width:100%"} - - -It is to the top-left of the eager non-AC point, as we expect `torch.compile` to improve on both speed and memory. - -On the other hand, relative to activation checkpointing, torch.compile is more conservative about what it recomputes, placing it closer to the top-left on the speed-versus-memory diagram. - - ---- - - -## Selective Activation Checkpoint [NEW!] - -While normal checkpointing recomputes every op in a chosen region, [selective activation checkpointing (SAC)](https://pytorch.org/docs/main/checkpoint.html#torch.utils.checkpoint.create_selective_checkpoint_contexts) is an additional setting on top of activation checkpointing that you can apply to have a more granular control over which operations to recompute. - -This can be useful if you have certain more expensive operations like matmuls which you prefer to avoid recomputing, but still generally want to recompute cheaper operations like pointwise. - -![flow diagram](/assets/images/activation-checkpointing-techniques/fg8.png){:style="width:100%"} - - -*Where plain AC (left) would save a single tensor and then recompute the entire AC'd region, with SAC (right) you can selectively save specific operations (marked red) in the region, so you can avoid recomputing them.* - -To specify what to selectively save, you can specify a policy_fn. To illustrate the additional trade offs you can make with this, we present two simple policy functions. 
- - -### Policy 1: Not recomputing matmuls: - - -``` -aten = torch.ops.aten -compute_intensive_ops = [ - aten.mm, - aten.bmm, - aten.addmm, -] -def policy_fn(ctx, op, *args, **kwargs): - if op in compute_intensive_ops: - return CheckpointPolicy.MUST_SAVE - else: - return CheckpointPolicy.PREFER_RECOMPUTE -``` - -![flow diagram](/assets/images/activation-checkpointing-techniques/fg9.png){:style="width:100%"} - - -### Policy 2: More aggressively save anything compute intensive - - -``` -# torch/_functorch/partitioners.py -aten = torch.ops.aten -compute_intensive_ops = [ - aten.mm, - aten.convolution, - aten.convolution_backward, - aten.bmm, - aten.addmm, - aten._scaled_dot_product_flash_attention, - aten._scaled_dot_product_efficient_attention, - aten._flash_attention_forward, - aten._efficient_attention_forward, - aten.upsample_bilinear2d, - aten._scaled_mm -] -def policy_fn(ctx, op, *args, **kwargs): - if op in compute_intensive_ops: - return CheckpointPolicy.MUST_SAVE - else: - return CheckpointPolicy.PREFER_RECOMPUTE -``` - -![flow diagram](/assets/images/activation-checkpointing-techniques/fg10.png){:style="width:100%"} - - -On the speed-versus-memory diagram, SAC is plotted as a range of points from closer to AC to closer to Eager, depending on your chosen policy. - - -![flow diagram](/assets/images/activation-checkpointing-techniques/fg11.png){:style="width:100%"} - - -**Try it out!** (Available in 2.5 as a prototype feature; see [docs](https://pytorch.org/docs/main/checkpoint.html#torch.utils.checkpoint.create_selective_checkpoint_contexts) for more info + copy-pastable example) - - -``` -from torch.utils.checkpoint import checkpoint, create_selective_checkpoint_contexts - -# Create a policy function that returns a CheckpointPolicy -def policy_fn(ctx, op, *args, **kwargs): - if op in ops_to_save: - return CheckpointPolicy.MUST_SAVE - else: - return CheckpointPolicy.PREFER_RECOMPUTE - -# Use the context_fn= arg of the existing checkpoint API -out = checkpoint( - fn, *args, - use_reentrant=False, - # Fill in SAC context_fn's policy_fn with functools.partial - context_fn=partial(create_selective_checkpoint_contexts, policy_fn), -) - -``` ---- - - - -## (compile-only) Memory Budget API [NEW!] - -As mentioned previously, any given SAC policy can be represented as a point on a speed-memory tradeoff diagram. Not all policies are created equal, however. The "optimal" policies are the ones that fall on a pareto curve, e.g. for all policies that incur the same memory overhead, this policy is the one that minimizes the amount of required compute. - -For users who are using torch.compile, we offer a **memory budget API** that automatically applies SAC over your compiled region with a pareto-optimal policy given a user-specified "memory budget" between 0 and 1, where a budget of 0 behaves like plain-AC and a budget of 1 behaves like default torch.compile. - - -![flow diagram](/assets/images/activation-checkpointing-techniques/fg12.png){:style="width:100%"} - - -Below are some real results on a transformer model: - -![flow diagram](/assets/images/activation-checkpointing-techniques/fg13.png){:style="width:100%"} - - -We observe a 50% memory reduction by recomputing only pointwise ops, with a steady drop-off as you recompute more and more of your matmuls. Attention is the most expensive, so you tend to want to recompute those last. 
- -**Try it out!** (Available in 2.4 as an experimental feature; see this [comment block](https://github.com/pytorch/pytorch/blob/68a363548409a3ff17965770304ee5e12fe718d9/torch/_functorch/config.py#L110-L122) for more info) - - -``` -torch._dynamo.config.activation_memory_budget = 0.5 - -out = torch.compile(fn)(inp) -``` - ---- - - - - -## Conclusion - - -![flow diagram](/assets/images/activation-checkpointing-techniques/fg14.png){:style="width:100%"} - - -In summary, activation checkpointing techniques in PyTorch offer a variety of ways to balance memory and compute demands, from simple region-based checkpointing to more selective and automated methods. By choosing the option that best matches your model’s structure and resource constraints, you can achieve significant memory savings with an acceptable trade-off in compute. - - -## Acknowledgements - -We would like to thank Meta's [xformers](https://github.com/facebookresearch/xformers) team including [Francisco Massa](https://github.com/fmassa) for working on the original version of Selective Activation Checkpoint. \ No newline at end of file diff --git a/_posts/2025-03-06-peak-performance-minimized-memory.md b/_posts/2025-03-06-peak-performance-minimized-memory.md deleted file mode 100644 index 6271d6412aff..000000000000 --- a/_posts/2025-03-06-peak-performance-minimized-memory.md +++ /dev/null @@ -1,152 +0,0 @@ ---- -layout: blog_detail -title: "Peak Performance, Minimized Memory: Optimizing torchtune’s performance with torch.compile & Liger Kernel" -author: LinkedIn and Meta ---- - -**LinkedIn**: Shivam Sahni, Byron Hsu, Yanning Chen -**Meta**: Ankith Gunapal, Evan Smothers - -This blog explores the integration of a custom triton kernel, Liger Kernel with `torch.compile` to enhance the performance of fine-tuning large language models (LLMs) using torchtune. torchtune, a PyTorch-native library, offers modular building blocks and customizable finetuning recipes which include `torch.compile` support for various LLMs, while Liger Kernel provides optimized Triton kernels to improve training efficiency and reduce memory usage. The integration involves modifying the `TransformerDecoder` module in torchtune to bypass the linear layer computation, allowing the Liger Fused Linear Cross Entropy Loss to handle the forward projection weights. Experiments conducted on an NVIDIA A100 instance demonstrate that `torch.compile` outperforms PyTorch Eager in throughput and memory efficiency, with Liger Kernel further reducing peak memory allocation and enabling larger batch sizes. The results show a 47% reduction in peak memory at batch size 256 and a marginal increase in throughput with `meta-llama/Llama-3.2-1B` , confirming the effectiveness of the integration without affecting the loss curves. - - -## Introduction to torchtune - -torchtune is a PyTorch-native library which has been designed for finetuning LLMs. torchtune provides composable and modular building blocks along with finetuning recipes that can be easily customized for your use case, as will be shown in this blog. 
\ -torchtune provides: - - - -* PyTorch implementations of popular LLM model architectures from Llama, Gemma, Mistral, Phi, and Qwen model families -* Hackable training recipes for full finetuning, LoRA, QLoRA, DPO, PPO, QAT, knowledge distillation, and more -* Out-of-the-box memory efficiency, performance improvements, and scaling with the latest PyTorch APIs, including `torch.compile` -* YAML configs for easily configuring training, evaluation, quantization or inference recipes -* Built-in support for many popular dataset formats and prompt templates - - -## Introduction to Liger Kernel - -Liger Kernel is an open source library of optimized Triton kernels designed to enhance the efficiency and scalability of training Large Language Models (LLMs). It focuses on kernel-level optimizations such as operation fusing and input chunking, achieving significant improvements in training throughput and GPU memory usage compared to existing implementations like those from HuggingFace. By using a single line of code, Liger Kernel can improve [training throughput by 20% and reduce memory usage by 60%](https://www.linkedin.com/blog/engineering/open-source/liger-kernel-open-source-ecosystem-for-efficient-llm-training). - - -![Fused Linear Cross Entropy](/assets/images/peak-performance-minimized-memory/fg1.png){:style="width:100%"} - - - - -The bulk of LIger Kernel’s performance improvement comes from the Fused Linear Cross Entropy (FLCE) Loss, whose core idea is as follows: - -In LLMs, the vocabulary size has increased significantly, leading to a large logit tensor during cross-entropy (CE) loss computation. This logit tensor consumes excessive memory, causing a bottleneck in training. For example, when training with a batch size of 8 and sequence length of 4096, the 256k vocabulary size results in a 16.8 GB logit tensor. The FLCE kernel breaks down the computation into smaller chunks, reducing memory consumption. - -Here's how it works: - - - -1. Flattens the 3D hidden states into a 2D matrix by collapsing the batch size and sequence length dimensions. -2. Applies the linear projection head sequentially on the chunked hidden states. -3. Computes the partial loss and returns the chunked logits gradient using the Liger CE kernel. -4. Derives the chunked hidden states gradients and accumulates the projection head gradients. - -Torchtune’s recipes provide `torch.compile` support out of the box. It has been shown that utilizing `torch.compile` with FLCE makes [FLCE 2x faster](https://github.com/linkedin/Liger-Kernel/issues/227). - - -## Integrating Liger Kernel with torch.compile & torchtune - -We demonstrate integration of Liger Kernel with `torch.compile` & torchtune by running a full fine-tuning recipe with `meta-llama/Llama-3.2-1B`. To make this integration happen, we have defined a custom full finetuning recipe, the details of the changes are mentioned below. - - -``` -CUDA_VISIBLE_DEVICES=0,1,2,3 tune run --nproc_per_node 4 recipes/full_finetune_distributed.py --config llama3_2/1B_full optimizer=torch.optim.AdamW optimizer.fused=True optimizer_in_bwd=False gradient_accumulation_steps=1 dataset.packed=True compile=True enable_activation_checkpointing=True tokenizer.max_seq_len=512 batch_size=128 -``` - - -One of the inputs to the LCE Kernel is the forward projection weights. torchtune is designed as a modular library with composable blocks. 
There is a `TransformerDecoder` [block](https://github.com/pytorch/torchtune/blob/main/torchtune/modules/transformer.py#L322) where at the end of the block, we pass the final hidden state through a linear layer to get the final output. Since the linear layer is combined with the CE loss in LCE Kernel, we write a custom `forward` function for `TransformerDecoder` where we skip the computation through the linear layer. - -In the full finetuning recipe, we override the model's forward method with this custom method - - -``` -import types -from liger_kernel.torchtune.modules.transformers import decoder_forward -self._model.forward = types.MethodType(decoder_forward, self._model) -``` - - -We then pass the model's forward projection weights to calculate the loss with LCE Kernel - - -``` -from liger_kernel.transformers.fused_linear_cross_entropy import ( - LigerFusedLinearCrossEntropyLoss, -) - -# Use LCE loss instead of CE loss -self._loss_fn = LigerFusedLinearCrossEntropyLoss() - -# call torch.compile on the loss function -if self._compile: - training.compile_loss(self._loss_fn, verbose=self._is_rank_zero) - -# pass the model's forward projection weights for loss computation -current_loss = ( - self._loss_fn( - self._model.output.tied_module.weight, - logits, - labels, - ) - * current_num_tokens - ) -``` - - -The complete code and instructions can be found in the [GitHub repo](https://github.com/pytorch-labs/applied-ai/tree/liger_kernel/third_party). - - -## Experiments & Benchmarking Results - -We conduct 3 types of experiments to demonstrate how Liger Kernel integration with `torch.compile` enhances the performance of torchtune. We set up our experiments on an instance running NVIDIA A100. We fine-tune a small LLM `meta-llama/Llama-3.2-1B `with differing batch sizes. We record the throughput in terms of tokens/second and measure the peak memory allocated during finetuning. Since it's a small model, we only use 4 A100 GPUs for the benchmarking. The following are the experiments we conducted: - - - -1. Increase batch_size in powers of 2 with PyTorch eager -2. Increase batch_size in powers of 2 with torch.compile -3. Increase batch_size in powers of 2 with torch.compile & Liger integration - -We notice that with PyTorch Eager, throughput increases with increasing batch_size till we hit OOM at batch_size 256. With `torch.compile`, the throughput is higher than PyTorch Eager for each batch_size. We see that the peak memory allocation reduces drastically with increasing batch_size and more than 50% reduction in peak memory at batch_size 128. This results in `torch.compile` being able to support batch_size 256 and hence, the overall throughput with `torch.compile` being 36% greater than PyTorch Eager. Integrating Liger Kernel with `torch.compile` doesn’t drop the throughput at lower batch_size but with increasing batch_size, we notice that torchtune is consuming less memory compared to torch.compile. At batch_size 256, we see a 47% reduction in peak memory allocation with the Liger kernel. This allows us to use batch_size 512 with `torch.compile` & Liger. We notice that there is a marginal 1-2% increase in throughput compared to `torch.compile` without custom triton kernels. - - -![Plot of tokens/sec per rank vs batch_size](/assets/images/peak-performance-minimized-memory/fg2.png){:style="width:100%"} - -
*Figure 2: Plot of tokens/sec per rank vs batch_size*


![Peak memory allocated vs batch_size](/assets/images/peak-performance-minimized-memory/fg3.png){:style="width:100%;margin-top: 60px;"}

*Figure 3: Peak memory allocated vs batch_size*

To rule out any potential functional issues with our integration of Liger Kernel with torchtune, we plot the loss curve against training steps with & without Liger. We see that there is no visible difference in the loss curves.


![Plot of loss vs training steps for batch_size=128](/assets/images/peak-performance-minimized-memory/fg4.png){:style="width:100%"}

*Figure 4: Plot of loss vs training steps for batch_size=128*

## Next Steps

* Enable Liger kernels for [DPO loss](https://github.com/linkedin/Liger-Kernel/blob/main/src/liger_kernel/chunked_loss/dpo_loss.py#L7) and [distillation loss](https://github.com/linkedin/Liger-Kernel/blob/main/src/liger_kernel/chunked_loss/fused_linear_distillation.py#L9) in torchtune’s recipes for [DPO](https://pytorch.org/torchtune/main/recipes/dpo.html) and [knowledge distillation](https://pytorch.org/blog/llama-into-torchtune/), respectively.
* Support Liger integration in torchtune with [tensor parallel training](https://github.com/pytorch/torchtune/pull/2330).


## Acknowledgments

We thank Hamid Shojanazeri (Meta), Less Wright (Meta), Horace He (Meta) & Gregory Chanan (Meta) for their feedback and support in making this blog post happen.
diff --git a/_posts/2025-03-07-pt-fedora-os-communities.md b/_posts/2025-03-07-pt-fedora-os-communities.md
deleted file mode 100644
index 77081b55ea04..000000000000
--- a/_posts/2025-03-07-pt-fedora-os-communities.md
+++ /dev/null
@@ -1,52 +0,0 @@
---
layout: blog_detail
title: "Powering AI with PyTorch, Fedora, and Open Source Communities"
author: Sudhir Dharanendraiah
hidden: true
---


![man speaking at a conference](/assets/images/pt-fedora-os-communities/fg1.jpg){:style="width:100%"}


At [DevConf.IN 2025](https://www.devconf.info/in/) in Pune, I had the opportunity to host a **[PyTorch Meetup](https://pretalx.devconf.info/devconf-in-2025/talk/W3YURM/)** on February 28th. The session, titled "**Powering AI with PyTorch, Fedora, and Open Source Communities**", was aimed at introducing PyTorch to students and professionals and explaining why **PyTorch+Fedora** form an ideal AI development platform. The other key aspect I covered was collaboration between open source communities.


## Introduction to PyTorch


## The Power of Deep Learning made simple


With the explosion of GPTs, there is a renewed interest in the field of AI and ML. The myth that developing AI/ML technologies and their applications is rocket science and far-fetched needs correction, and only open source has the power to demystify it and further evolve the technology to make it versatile and developer friendly. Since its inception, PyTorch has evolved and has been a driving force in making AI/ML development extremely simple. I covered PyTorch's key components, its features, and why PyTorch is the best choice as a deep learning framework.


![man speaking at a conference](/assets/images/pt-fedora-os-communities/fg2.jpg){:style="width:100%"}


The code walkthrough was designed to showcase how easy and simple it is to utilise the power of GPUs, create a simple neural network, and train the model. The code walkthrough was very well received, and it was great to hear back from the attendees that they never knew how powerful PyTorch is for deep learning. The real-world examples showed how this powerful framework can be used beyond the common GPTs and has the power to influence a broad spectrum of applications.


## Fedora+PyTorch the Ideal AI/ML Development Platform

![man speaking at a conference](/assets/images/pt-fedora-os-communities/fg3.jpg){:style="width:100%"}

![man speaking at a conference](/assets/images/pt-fedora-os-communities/fg4.jpg){:style="width:100%"}


One of the highlights of the event was the discussion on Fedora’s role as an AI platform.
Fedora’s reliability, flexibility, and strong community support make it an ideal partner for PyTorch, allowing developers to focus on model-building without worrying about infrastructure. The students were intrigued by the idea of contributing to Fedora’s AI/ML ecosystem while building their own projects. Sumantro Mukherjee spoke about the AI policy in Fedora and how one can start contributing to AI/ML using Fedora as a platform. He highlighted how Fedora is evolving to meet the needs of AI practitioners. The idea that an open-source operating system could provide the perfect foundation for AI research sparked an engaging conversation.


## Innovation in Open Source When Communities Come Together

![charts](/assets/images/pt-fedora-os-communities/fg5.jpg){:style="width:100%"}

It is important that we learn from history and repeat the good things! When open source communities come together, they can create seismic shifts in the industry. To drive this home, I took the audience on a journey through history, revisiting a pivotal moment when Apache and Linux came together, solving common problems and fundamentally reshaping enterprise computing. That moment was not just about technology; it was about collaboration. It was about two powerful communities recognizing that they were stronger together. Today, we stand at the cusp of another such moment: PyTorch and Linux, particularly Fedora, are coming together to shape the future of AI/ML. This is not just an opportunity but a responsibility for contributors, developers, and AI/ML enthusiasts to be part of this movement.


## Looking Ahead

![man speaking at a conference](/assets/images/pt-fedora-os-communities/fg6.jpg){:style="width:100%"}

One of the best parts of the event was the enthusiasm it generated. The audience was diverse, including students, AI enthusiasts, and industry professionals. Notably, Vincent Caldeira (CTO, APAC, Red Hat) and Chris Butler (Senior Principal Chief Architect, Red Hat) were present, reinforcing the growing interest in open-source AI/ML. Many students were eager to explore PyTorch and Fedora, contribute to open-source AI projects, and start their own AI experiments. Industry experts saw the potential for scalable, community-driven AI innovation. The session sparked curiosity and conversations that continued long after the event ended.
\ No newline at end of file
diff --git a/_posts/2025-03-11-scaling-recommendation-2d-sparse-parallelism.md b/_posts/2025-03-11-scaling-recommendation-2d-sparse-parallelism.md
deleted file mode 100644
index 230b3d0337bb..000000000000
--- a/_posts/2025-03-11-scaling-recommendation-2d-sparse-parallelism.md
+++ /dev/null
@@ -1,219 +0,0 @@
---
layout: blog_detail
title: "Scaling Recommendation Systems Training to Thousands of GPUs with 2D Sparse Parallelism"
author: "PyTorch Team at Meta: Chunzhi Yang, Rich Zhu, Zain Huda, Liangbei Xu, Xin Zhang, Jiyan Yang, Dennis van der Staay, Wang Zhou, Jin Fang, Jade Nie, Yuxi Hu"
---

At Meta, recommendation systems are the cornerstone of delivering relevant and personalized ads to billions of users globally. Through technologies like PyTorch's TorchRec, we've successfully developed solutions that enable model training across hundreds of GPUs. While these systems have served us well, recent research on scaling laws has revealed a compelling opportunity: we can achieve significantly better model performance by training dramatically larger neural networks.

However, this insight presents us with a new challenge.
Our current training infrastructure, though highly optimized for hundreds of GPUs, cannot efficiently scale to the thousands of GPUs needed to train these larger models. The leap from hundreds to thousands of GPUs introduces complex technical challenges, particularly around handling sparse operations in recommendation models. These challenges require fundamentally new approaches to distributed training, which we address with a novel parallelization strategy.

**To address these issues, we introduced 2D embedding parallel, a novel parallelism strategy that overcomes the sparse scaling challenges inherent in training large recommendation models across thousands of GPUs. This is available today in TorchRec through the DMPCollection API.** This approach combines two complementary parallelization techniques: data parallelism for the sparse components of the model, and model parallelism for the embedding tables, leveraging TorchRec's robust sharding capabilities. By strategically integrating these techniques, we've created a solution that scales to thousands of GPUs and now powers Meta's largest recommendation model training runs.

**What are the sparse scaling challenges?**

We identified three key challenges that prevented us from naively scaling our model to thousands of GPUs:

* **Imbalance and straggler issues:** With more GPUs, it's harder to achieve balanced sharding; some ranks can have a much heavier workload for embedding computations, which can slow down the entire training run.
* **Communication across nodes:** As training jobs utilize an increased number of GPUs, the all-to-all communication bandwidth can drop under certain network topologies, which can increase communication latency significantly.
* **Memory overhead:** The memory used by input features is often negligible; however, as we use thousands of GPUs, we can introduce larger input features, and the memory requirements can become significant.

With 2D embedding parallel, we can describe our new parallelism scheme like this; in this example, we have 2 model replicas (Replica 1: GPU1/GPU3, Replica 2: GPU2/GPU4):


![Flow diagram](/assets/images/scaling-recommendation-2d-sparse-parallelism/fg1.png){:style="width:100%"}

***Figure 1: Layout illustration of 2D Sparse Parallelism***

With 2D sparse parallelism we address these challenges. Instead of sharding tables across all ranks, we first evenly divide all ranks into several parallel groups:

1. Within each group, we use model parallel for the embedding tables, such as column-wise/row-wise sharding. At scale, for our largest tables, we have also developed grid sharding, which shards embedding tables on both the row and column dimensions.
2. Across groups, we do data parallel, such that each rank in a group has its corresponding replica rank in the other groups (replica rank means storing the same embedding table shards).
    1. After each group has completed its own backward pass, we all reduce the embedding table weights across the replicas to keep them synchronized (a minimal sketch of this step is shown below).

## Our production solution

TorchRec is our library for building the sparse part of recommendation models in native PyTorch, with DistributedModelParallel as the traditional API that applies model parallel to the embedding tables. We introduce a new API alongside it, known as DMPCollection, which serves as the main entry point for enabling 2D parallel on TorchRec models. We designed it to be as easy a change as applying FSDP/DDP.

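To make the replica synchronization step concrete, here is a minimal sketch of all reducing embedding table weights across a replica process group after the backward pass. Everything here is illustrative: the helper name is ours, `replica_pg` is assumed to be the replica process group described later, and averaging is just one common choice; DMPCollection handles this internally.

```python
import torch.distributed as dist

def sync_embedding_weights(local_shards, replica_pg):
    """Average each locally-held embedding shard's weights across its replica ranks."""
    num_replicas = dist.get_world_size(group=replica_pg)
    for shard in local_shards:  # assumes each local shard exposes a .weight tensor
        dist.all_reduce(shard.weight.data, op=dist.ReduceOp.SUM, group=replica_pg)
        shard.weight.data.div_(num_replicas)  # sum -> average across replicas
```
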
To understand what DMPCollection does, we have to understand what DistributedModelParallel (DMP) does first:

1. Create embedding tables, known as EmbeddingBagCollection and EmbeddingCollections.
2. Generate a sharding plan with respect to GPU topology, embedding tables, memory available, input data, and more.
3. Wrap the model with DMP and the associated sharding plan passed in.
4. DMP initializes and shards the embedding tables in accordance with the sharding plan.
5. On a train step, DMP takes an input batch, communicates it to the appropriate GPUs containing the embedding table shard of interest, looks up the value, and returns it back to the GPU that requested it. This is all done on the global process group, with some exceptions for special sharding (such as table-row-wise sharding).

DistributedModelParallel was built for model parallel, and many of its parts work under the assumption that sharding happens across the global world size. We need to change these parts in a way that lets us introduce additional dimensions of parallelism without losing the optimizations and feature set of TorchRec.

DMPCollection changes a few key parts to enable 2D parallel in an extensible way:

* Generate the sharding plan for the smaller sharding group once; once it is passed in, we communicate it to the appropriate ranks across the global group and remap the ranks to fit the new sharding group ranks.
* Create two new NCCL process groups, known as the sharding and replica process groups. The sharding process group is passed into the sharding and train step components of TorchRec. The replica process group is used for the weight and optimizer state synchronization; the all reduce call happens over this process group.
    * The sub NCCL process groups allow us to efficiently communicate only between the ranks that are relevant for a particular comm. Each rank will have two associated process groups.

To the user the change is very simple, while all the complexity of applying the parallelism strategies to the model is taken care of.

## How do we create these sharding and replication groups?

These process groups are one of the keys to DMPCollection’s performant implementation. Our earlier diagram showed a simple 2x2 GPU setup; however, at scale, how do we assign which ranks are part of a given sharding group, and what are their replica ranks across the sharding groups?

Consider the following setup with 2 nodes, each with 4 GPUs. The sharding and replication groups under 2D parallel will be:

| Sharding Group | Sharding Ranks |
|----------------|----------------|
| 0 | 0, 2, 4, 6 |
| 1 | 1, 3, 5, 7 |

| Replication Group | Replication Ranks |
|-------------------|-------------------|
| 0 | 0, 1 |
| 1 | 2, 3 |
| 2 | 4, 5 |
| 3 | 6, 7 |

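To make the grouping concrete, the short sketch below reproduces the two tables above for this 2-node, 8-GPU example; it follows the general formulation described next. The function name is ours and purely illustrative.

```python
def build_groups(total_ranks: int, sharding_group_size: int):
    """Return (sharding_groups, replication_groups) for 2D parallel.

    total_ranks:         T, the total number of trainers (8 in this example)
    sharding_group_size: L, the number of trainers per sharding group (4 here)
    """
    num_groups = total_ranks // sharding_group_size  # G = T / L
    # Sharding group i holds the non-contiguous ranks [i, G+i, 2G+i, ..., (L-1)G+i]
    sharding_groups = [
        [num_groups * r + i for r in range(sharding_group_size)]
        for i in range(num_groups)
    ]
    # Replication groups are every G contiguous ranks; each set holds duplicate shards
    replication_groups = [
        list(range(start, start + num_groups))
        for start in range(0, total_ranks, num_groups)
    ]
    return sharding_groups, replication_groups

print(build_groups(8, 4))
# ([[0, 2, 4, 6], [1, 3, 5, 7]], [[0, 1], [2, 3], [4, 5], [6, 7]])
```
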
We use the following formulation:

1. Divide all trainers into G sharding groups, each with L trainers.
    1. The number of groups G is determined by G = T / L, where T is the total number of trainers.
2. For each group i, we assign non-contiguous trainer ranks following [i, G+i, 2G+i, ..., (L-1)G+i], where i = 0 to G-1.
3. From the groups, we can create the replication groups, which are every G contiguous ranks (0 to G-1, G to 2G-1, and so on); each contiguous set stores the duplicate embedding table shards.

This means our sharding groups are of size L, which is the number of ranks we apply model parallel across. This, in turn, gives us replica groups, each of size G, which are the ranks we data parallel across.

In DMPCollection, we’re able to create these process groups efficiently with the use of DeviceMesh. We create the entire GPU topology as a 2D matrix, with each row representing a group of sharding ranks and each column representing the corresponding replica ranks:

```
create peer matrix
num_groups = global_world_size // sharding_group_size
for each group_rank in num_groups:
    peers = [num_groups * rank + group_rank for rank in range(sharding_group_size)]
    add peers to peer matrix

initialize DeviceMesh with two dimensions (shard, replicate)
slice DeviceMesh on shard for sharding process group
slice DeviceMesh on replicate for replica process group
```

With our DeviceMesh approach, should we want to change the topology or provide further flexibility in the future, we can easily extend our creation logic to other forms of topologies, and even extend it to further dimensions of parallelism if needed.

## Performance of 2D parallel

Our rank partitioning strategy optimizes communication patterns by strategically placing model replica ranks for each shard within the same compute node. This architecture provides significant performance benefits for the weight synchronization operation. After the backward pass, we perform all-reduce operations to synchronize model weights, which is an expensive process given the large parameter counts we have to communicate and sync. By placing replicas on the same node, we leverage high intra-node bandwidth instead of relying on slower inter-node bandwidth.

This design choice also generally improves the latencies of the other communication collectives. The improvement stems from two factors:

1. By sharding the embedding tables over a reduced number of ranks and conducting communications for the model within the smaller group, we achieve a lower all-to-all latency.
2. With the replication in 2D parallel, the embedding lookup latency on a rank is reduced, since we can reduce the local batch size to 1/Nth of the equivalent global batch size, where N is the number of model replicas.

A production model trace exemplifies these two factors; here we run the 2D parallel job on 1024 GPUs, with a sharding group size of 256 GPUs.

![State diagram](/assets/images/scaling-recommendation-2d-sparse-parallelism/fg2.png){:style="width:100%"}

***Figure 2: Comparing latencies between non 2D parallel and 2D parallel workloads***

There are two key levers users have to tune to maximize performance for their workloads:

1. The size of the model sharding group relative to the global world size. The global world size divided by the sharding group size represents the number of model replicas we will have.
    1. To maximize performance, users can scale up their model by up to 8x; this scaling factor maintains the intra-host all reduce.
    2. For further scaling, the all reduce would have to happen over inter-host links. From our experiments, we did not see an obvious performance regression, and in fact we note advantages of an inter-host all reduce. We can change our sharding and replica topology to an inter-host all reduce, which can help us introduce fault tolerance strategies should a particular host go down.
2. The frequency of all reduce synchronization. DMPCollection comes with a sync() call, which can be tuned to be called every N training steps, performing a form of local SGD training. With scale, reducing the frequency of synchronization can bring significant gains to performance.

## Future Work

Readers should note that 2D sparse parallel training differs from non-parallelized training because we synchronize the embedding table weights rather than the gradients. This approach is made possible by TorchRec's use of FBGEMM, which provides optimized kernels under the hood. One of FBGEMM's key optimizations is the fusion of the optimizer in the backward pass. Instead of fully materializing the embedding table gradients—which would consume significant memory—they are passed directly to the optimizer update. Attempting to materialize and synchronize these gradients would create substantial overhead, making that approach impractical.

Our exploration revealed that to achieve training results comparable to the baseline, we synchronize optimizer states on a delayed schedule, with the timing dependent on the number of sharding/replica groups (i.e., for Adagrad we update the momentum behind by one sync step). This approach also enables users to implement local SGD or semi-synchronized training strategies, which can achieve convergence and potentially produce better loss curves than the baseline.

We thank you for reading our post! This is an exciting direction we have come across that we hope to develop further to maximize the performance of recommendation systems and push the state of the art.
\ No newline at end of file
diff --git a/_posts/2025-03-13-pytorch-landscape.md b/_posts/2025-03-13-pytorch-landscape.md
deleted file mode 100644
index 4cc3687be952..000000000000
--- a/_posts/2025-03-13-pytorch-landscape.md
+++ /dev/null
@@ -1,44 +0,0 @@
---
layout: blog_detail
title: "Introducing the New PyTorch Landscape: Your Guide to the PyTorch Ecosystem"
---

We’re excited to reveal our brand new PyTorch Landscape. The [PyTorch Landscape](https://landscape.pytorch.org/) helps researchers, developers, and organizations easily locate useful, curated, community-built tools that augment the PyTorch core framework.


landscape banner

## What the Landscape Offers

The Landscape visually organizes projects into three categories—Modeling, Training, and Optimizations—making finding relevant frameworks, libraries, and projects easy. Users can quickly locate curated, valuable tools for a variety of use cases that complement the PyTorch framework. Each tool that is part of the Landscape has been reviewed and vetted by PyTorch project experts. The projects in the Landscape are considered to be mature and healthy and provide valuable capabilities that complement the PyTorch framework in their respective domains.


## Explore the AI Landscape

The **Explore** page presents platforms, tools, and libraries, each with a logo, description, and links to GitHub and further details.
This categorized, visual approach simplifies discovery and provides quick access to essential technologies. - - -## Guide Page: A Closer Look - -For deeper insights, the **Guide** page expands on each project, highlighting methodologies and trends shaping AI development, from adversarial robustness to self-supervised learning. There are also project statistics provided for each project, including metrics such as number of stars, contributors, commit history, languages used, license, and other valuable metrics that provide an in-depth understanding of the project and how it may be used. - - -## Tracking AI’s Growth: The Stats Page - -The **Stats** page provides insights into AI development trends, tracking repository activity, programming languages, and industry funding data. - -* Repositories: 117 repositories, 20.5k contributors, and 797.2k stars across 815MB of source code. -* Development Trends: Weekly commit activity over the last year. -* Licensing Breakdown: Repositories are categorized by license type. -* Funding & Acquisitions: Insights into investment trends, including funding rounds and acquisitions. - - -## Why Use the PyTorch Landscape? - -Finding useful and high quality open source projects that complement the PyTorch core system can be overwhelming. The PyTorch Landscape offers a clear, accessible way to explore the ecosystem of community-built tools, whether you're researching, building models, or making strategic decisions. - -Stay ahead with the [PyTorch Landscape](https://landscape.pytorch.org/) — your guide to the PyTorch Ecosystem. - -## Want to Contribute a Project to the PyTorch Landscape? - -Have you built a useful open source tool that you would like to share with the PyTorch community? Then help us grow the Ecosystem by contributing your tool! You can find the [instructions to apply here](https://github.com/pytorch-fdn/ecosystem). We welcome all contributions from the community! \ No newline at end of file diff --git a/_posts/2025-03-16-pytorch-at-gtc.md b/_posts/2025-03-16-pytorch-at-gtc.md deleted file mode 100644 index 94be8a113f5f..000000000000 --- a/_posts/2025-03-16-pytorch-at-gtc.md +++ /dev/null @@ -1,109 +0,0 @@ ---- -layout: blog_detail -title: "PyTorch at GTC 2025" -author: "Team PyTorch at NVIDIA" -hidden: true ---- - -[GTC](https://www.nvidia.com/gtc/) is coming back to San Jose on March 17–21, 2025. Join PyTorch Foundation members Arm, AWS, Google Cloud, IBM, Lightning AI, Meta, Microsoft Azure, Snowflake, and thousands of developers as we celebrate PyTorch. Together learn how AI & accelerated computing are helping humanity solve our most complex challenges. - -Join in person with [discounted GTC registration](https://www.nvidia.com/gtc/?ncid=GTC-NVI0K8HVX) for PyTorch Foundation or [watch online](https://register.nvidia.com/flow/nvidia/gtcs25/registration/) with free registration. 
- - -![book cover](/assets/images/pytorch-at-gtc.jpg){:style="max-width:500px; display: block; margin-left: auto; margin-right: auto"} - - -### [Scaling Open Source AI: From Foundation Models to Ecosystem Success](https://www.nvidia.com/gtc/session-catalog/?regcode=no-ncid&ncid=no-ncid&tab.catalogallsessionstab=16566177511100015Kus&search=pytorch#/session/1738966749087001K1dG) - -Hear from PyTorch Foundation’s Executive Director Matt White & panelists from UC Berkeley, Meta, NVIDIA, & Sequoia Capital how open source is transforming AI development, bringing together experts from industry, academia, and venture capital to discuss the technical and business aspects of collaborative open source AI development They’ll examine how open source projects like PyTorch, vLLM, Ray, and NVIDIA's NeMo are accelerating AI innovation while creating new opportunities for businesses and researchers. They'll share real-world experiences from PyTorch's development, Berkeley's research initiatives, and successful AI startups. Take away valuable insights into the technical and business aspects of open source AI. – Monday, Mar 17 10:00 AM - 11:00 AM PDT - - -## PyTorch @ GTC - -[The Performance of CUDA with the Flexibility of PyTorch ](https://www.nvidia.com/gtc/session-catalog/?regcode=no-ncid&ncid=no-ncid&tab.catalogallsessionstab=16566177511100015Kus&search=pytorch#/session/1726155993061001WWZM) -Mark Saroufim, Software Engineer, Meta Platforms - -This talk explores how PyTorch users are also becoming CUDA developers. We'll start with motivating examples from eager, the launch of torch.compile and the more recent trend of kernel zoos. We will share details on how we went about integrating low bit matmuls in torchao and the torch.compile CUTLASS backend. We'll also discuss details on how you can define, build and package your own custom ops in PyTorch so you get the raw performance of CUDA while maintaining the flexibility of PyTorch. - -[Make My PyTorch Model Fast, and Show Me How You Did It](https://www.nvidia.com/gtc/session-catalog/?regcode=no-ncid&ncid=no-ncid&tab.catalogallsessionstab=16566177511100015Kus&search=pytorch#/session/1727978036338001UVLu) -Thomas Viehmann, Principal Research Engineer, Lightning AI -Luca Antiga, CTO, Lightning AI - -PyTorch is popular in deep learning and LLMs for richness and ease of expressions. To make the most of compute resources, PyTorch models benefit from nontrivial optimizations, but this means losing some of their ease and understandability. Learn how with Thunder, a PyTorch-to-Python compiler focused on usability, understandability, and extensibility, you can optimize and transform (i.e., distribute across many machines) models while • leaving the PyTorch code unchanged • targeting a variety of models without needing to adapt to each of them • understanding each transformation step because the results are presented as simple Python code • accessing powerful extension code for your own optimizations with just one or a few lines of code We'll show how the combination of Thunder transforms and the NVIDIA stack (NVFuser, cuDNN, Apex) delivers optimized performance in training and inference on a variety of models. 
- -[FlexAttention: The Flexibility of PyTorch With the Performance of FlashAttention](https://www.nvidia.com/gtc/session-catalog/?regcode=no-ncid&ncid=no-ncid&tab.catalogallsessionstab=16566177511100015Kus&search=pytorch#/session/1726184633014001Jh5G) -Driss Guessous, Machine Learning Engineer, Meta Platforms - -Introducing FlexAttention: a novel PyTorch API that enables custom, user-defined attention mechanisms with performance comparable to state-of-the-art solutions. By leveraging the PyTorch compiler stack, FlexAttention supports dynamic modifications to attention scores within SDPA, achieving both runtime and memory efficiency through kernel fusion with the FlashAttention algorithm. Our benchmarks on A100 GPUs show FlexAttention achieves 90% of FlashAttention2's performance in forward passes and 85% in backward passes. On H100 GPUs, FlexAttention's forward performance averages 85% of FlashAttention3 and is ~25% faster than FlashAttention2, while backward performance averages 76% of FlashAttention3 and is ~3% faster than FlashAttention2. Explore how FlexAttention balances near-state-of-the-art performance with unparalleled flexibility, empowering researchers to rapidly iterate on attention mechanisms without sacrificing efficiency. - -[Keep Your GPUs Going Brrr : Crushing Whitespace in Model Training](https://www.nvidia.com/gtc/session-catalog/?regcode=no-ncid&ncid=no-ncid&tab.catalogallsessionstab=16566177511100015Kus&search=pytorch#/session/1731693095418001cruA) -Syed Ahmed, Senior Software Engineer, NVIDIA -Alban Desmaison, Research Engineer, Meta -Aidyn Aitzhan, Senior Software Engineer, NVIDIA - -Substantial progress has recently been made on the compute-intensive portions of model training, such as high-performing attention variants. While invaluable, this progress exposes previously hidden bottlenecks in model training, such as redundant copies during collectives and data loading time. We'll present recent improvements in PyTorch achieved through Meta/NVIDIA collaboration to tackle these newly exposed bottlenecks and how practitioners can leverage them. - -[Accelerated Python: The Community and Ecosystem](https://www.nvidia.com/gtc/session-catalog/?regcode=no-ncid&ncid=no-ncid&tab.catalogallsessionstab=16566177511100015Kus&search=pytorch#/session/1727176757800001qp7T) -Andy Terrel, CUDA Python Product Lead, NVIDIA -Jeremy Tanner, Open Source Programs, NVIDIA -Anshuman Bhat, CUDA Product Management, NVIDIA - -Python is everywhere. Simulation, data science, and Gen AI all depend on it. Unfortunately, the dizzying array of tools leaves a newcomer baffled at where to start. We'll take you on a guided tour of the vibrant community and ecosystem surrounding accelerated Python programming. Explore a variety of tools, libraries, and frameworks that enable efficient computation and performance optimization in Python, including CUDA Python, RAPIDS, Warp, and Legate. We'll also discuss integration points with PyData, PyTorch, and JAX communities. Learn about collaborative efforts within the community, including open source projects and contributions that drive innovation in accelerated computing. We'll discuss best practices for leveraging these frameworks to enhance productivity in developing AI-driven applications and conducting large-scale data analyses. 
- -[Supercharge large scale AI with Google Cloud AI hypercomputer (Presented by Google Cloud)](https://www.nvidia.com/gtc/session-catalog/?regcode=no-ncid&ncid=no-ncid&tab.catalogallsessionstab=16566177511100015Kus&search=pytorch#/session/1734571562315001xMKM) -Deepak Patil, Product Manager, Google Cloud -Rajesh Anantharaman, Product Management Lead, ML Software, Google Cloud - -Unlock the potential of your large-scale AI workloads with Google Cloud AI Hypercomputer – a supercomputing architecture designed for maximum performance and efficiency. In this session, we will deep dive into PyTorch and JAX stacks on Google Cloud on NVIDIA GPUs, and showcase capabilities for high performance foundation model building on Google Cloud. - -[Peering Into the Future: What AI and Graph Networks Can Mean for the Future of Financial Analysis](https://www.nvidia.com/gtc/session-catalog/?regcode=no-ncid&ncid=no-ncid&tab.catalogallsessionstab=16566177511100015Kus&search=pytorch#/session/1739906058885001OxEF) -Siddharth Samsi, Sr. Solutions Architect, NVIDIA -Sudeep Kesh, Chief Innovation Officer, S&P Global - -Artificial Intelligence, agentic systems, and graph neural networks (GNNs) are providing the new frontier to assess, monitor, and estimate opportunities and risks across work portfolios within financial services. Although many of these technologies are still developing, organizations are eager to understand their potential. See how S&P Global and NVIDIA are working together to find practical ways to learn and integrate such capabilities, ranging from forecasting corporate debt issuance to understanding capital markets at a deeper level. We'll show a graph representation of market data using the PyTorch-Geometric library and a dataset of issuances spanning three decades and across financial and non-financial industries. Technical developments include generation of a bipartite graph and link-prediction GNN forecasting. We'll address data preprocessing, pipelines, model training, and how these technologies can broaden capabilities in an increasingly complex world. - -[Unlock Deep Learning Performance on Blackwell With cuDNN](https://www.nvidia.com/gtc/session-catalog/?regcode=no-ncid&ncid=no-ncid&tab.catalogallsessionstab=16566177511100015Kus&search=pytorch#/session/1727984645671001Y9eq) -Yang Xu (Enterprise Products), DL Software Engineering Manager, NVIDIA - -Since its launch, cuDNN, a library for GPU-accelerating deep learning (DL) primitives, has been powering many AI applications in domains such as conversational AI, recommender systems, and speech recognition, among others. CuDNN remains a core library for DL primitives in popular frameworks such as PyTorch, JAX, Tensorflow, and many more while covering training, fine-tuning, and inference use cases. Even in the rapidly evolving space of Gen AI — be it Llama, Gemma, or mixture-of-experts variants requiring complex DL primitives such as flash attention variants — cuDNN is powering them all. Learn about new/updated APIs of cuDNN pertaining to Blackwell’s microscaling format, and how to program against those APIs. We'll deep dive into leveraging its graph APIs to build some fusion patterns, such as matmul fusion patterns and fused flash attention from state-of-the-art models. Understand how new CUDA graph support in cuDNN, not to be mistaken with the cuDNN graph API, could be exploited to avoid rebuilding CUDA graphs, offering an alternative to CUDA graph capture with real-world framework usage. 
- -[Train and Serve AI Systems Fast With the Lightning AI Open-Source Stack (Presented by Lightning AI)](https://www.nvidia.com/gtc/session-catalog/?regcode=no-ncid&ncid=no-ncid&tab.catalogallsessionstab=16566177511100015Kus&search=pytorch#/session/1736347047099001au7y) -Luca Antiga, CTO, Lightning AI - -See how the Lightning stack can cover the full life cycle, from data preparation to deployment, with practical examples and particular focus on distributed training and high-performance inference. We'll show examples that focus on new features like support for multi-dimensional parallelism through DTensors, as well as quantization through torchao. - - -## Connect With Experts (Interactive Sessions) - -[Meet the Experts From Deep Learning Framework Teams ](https://www.nvidia.com/gtc/session-catalog/?regcode=no-ncid&ncid=no-ncid&tab.catalogallsessionstab=16566177511100015Kus&search=pytorch#/session/1728516848639001tO7H) -Eddie Yan, Technical Lead of PyTorch, NVIDIA -Masaki Kozuki, Senior Software Engineer in PyTorch, NVIDIA -Patrick Wang (Enterprise Products), Software Engineer in PyTorch, NVIDIA -Mike Ruberry, Distinguished Engineer in Deep Learning Frameworks, NVIDIA -Rishi Puri, Sr. Deep Learning Engineer and Lead for PyTorch Geometric, NVIDIA - - -## Training Labs - -[Kernel Optimization for AI and Beyond: Unlocking the Power of Nsight Compute ](https://www.nvidia.com/gtc/session-catalog/?regcode=no-ncid&ncid=no-ncid&tab.catalogallsessionstab=16566177511100015Kus&search=pytorch#/session/1726073884811001C0za) -Felix Schmitt, Sr. System Software Engineer, NVIDIA -Peter Labus, Senior System Software Engineer, NVIDIA - -Learn how to unlock the full potential of NVIDIA GPUs with the powerful profiling and analysis capabilities of Nsight Compute. AI workloads are rapidly increasing the demand for GPU computing, and ensuring that they efficiently utilize all available GPU resources is essential. Nsight Compute is the most powerful tool for understanding kernel execution behavior and performance. Learn how to configure and launch profiles customized for your needs, including advice on profiling accelerated Python applications, AI frameworks like PyTorch, and optimizing Tensor Core utilization essential to modern AI performance. Learn how to debug your kernel and use the expert system built into Nsight Compute, known as “Guided Analysis,” that automatically detects common issues and directs you to the most relevant performance data all the way down to the source code level. - -[Make Retrieval Better: Fine-Tuning an Embedding Model for Domain-Specific RAG](https://www.nvidia.com/gtc/session-catalog/?regcode=no-ncid&ncid=no-ncid&tab.catalogallsessionstab=16566177511100015Kus&search=pytorch#/session/1725042189130001cmoW) -Gabriel Moreira, Sr. Research Scientist, NVIDIA -Ronay Ak, Sr. Data Scientist, NVIDIA - -LLMs power AI applications like conversational chatbots and content generators, but are constrained by their training data. This might lead to hallucinations in content generation, which requires up-to-date or domain-specific information. Retrieval augmented generation (RAG) addresses this issue by enabling LLMs to access external context without modifying model parameters. Embedding or dense retrieval models are a key component of a RAG pipeline for retrieving relevant context to the LLM. However, an embedding model’s effectiveness to capture the unique characteristics of the custom data hinges on the quality and domain relevance of its training data. 
Fine-tuning embedding models is gaining interest to provide more accurate and relevant responses tailored to users’ specific domain. - -In this lab, you'll learn to generate a synthetic dataset with question-context pairs from a domain-specific corpus, and process the data for fine-tuning. Then, fine-tune a text embedding model using synthetic data and evaluate it. - - -## Poster Presentations - -[Single-View X-Ray 3D Reconstruction Using Neural Back Projection and Frustum Resampling](https://www.nvidia.com/gtc/session-catalog/?regcode=no-ncid&ncid=no-ncid&tab.catalogallsessionstab=16566177511100015Kus&search=pytorch#/session/1729781473379001KiPD) -Tran Minh Quan, Developer Technologist, NVIDIA - -[Enable Novel Applications in the New AI Area in Medicine: Accelerated Feature Computation for Pathology Slides](https://www.nvidia.com/gtc/session-catalog/?regcode=no-ncid&ncid=no-ncid&tab.catalogallsessionstab=16566177511100015Kus&search=pytorch#/session/1729757102989001KDG4) -Nils Bruenggel, Principal Software Engineer, Roche Diagnostics Int. AG \ No newline at end of file diff --git a/_posts/2025-03-19-pt-day-china-2025-cfp.md b/_posts/2025-03-19-pt-day-china-2025-cfp.md deleted file mode 100644 index 44f98dfd7ee1..000000000000 --- a/_posts/2025-03-19-pt-day-china-2025-cfp.md +++ /dev/null @@ -1,60 +0,0 @@ ---- -layout: blog_detail -title: "PyTorch Day China 2025 Call for Proposals Open" ---- - -We’re excited to announce the **first-ever [PyTorch Day China](https://www.lfasiallc.com/pytorch-day-china/)**! This new event, hosted by the PyTorch Foundation, will take place on **June 7 in Beijing, China**, bringing together AI practitioners, researchers, and industry professionals to explore the latest advancements in open source AI and machine learning. Co-located with the **BAAI Conference**, PyTorch Day China is a chance to connect with the community, share knowledge, and help shape the future of deep learning. - - -![PyTorch Day China 2025 Call for Proposals Open](/assets/images/pt-day-china-2025-cfp.jpg){:style="max-width:500px; display: block; margin-left: auto; margin-right: auto"} - - -## Why Submit a Proposal? - -PyTorch Day China offers a platform for AI practitioners and researchers to showcase their work, exchange ideas, and connect with others in the community. If you're working on innovative applications, tools, or research in the PyTorch ecosystem, we encourage you to share your expertise. - - -## Topics for Submission: - - - -* AI Applications and Use Cases -* Core PyTorch Framework -* DL Compilers and Kernel Authoring -* Edge AI and On-Device -* Ethical AI, Governance, and Regulation -* Generative AI and Large Language Models (LLMs) with PyTorch -* Open Source Collaboration, Education, and Community Building -* Optimization for Training and Inference -* PyTorch on Accelerator Hardware -* PyTorch Ecosystem and Tools -* PyTorch in Research and Academia -* Performance Measurement and Benchmarking -* Scaling Training and Inference - -**The submission deadline is April 13. Submit and learn more here:** [https://www.lfasiallc.com/pytorch-day-china/call-for-proposals-cfp/](https://www.lfasiallc.com/pytorch-day-china/call-for-proposals-cfp/) - - -## Why Attend? - -PyTorch Day China will feature **technical talks, discussions, and poster sessions** that highlight real-world applications and developments in AI and machine learning. Attendees will have the opportunity to learn from experts, contribute to the open source community, and engage with fellow PyTorch users. 
Registration information will be available in April. - - -## Event Details - -* **Date:** June 7, 2025 -* **Location:** Zhongguancun Exhibition Center, Beijing, China -* **Address:** 索家坟, Hai Dian Qu, Bei Jing Shi, China, 100080 -* **Co-located with:** BAAI Conference - - -## Travel Information - -The venue, **Zhongguancun Exhibition Center**, is approximately **39 km from Beijing International Airport**. More details on travel and accommodation will be available on the **BAAI Conference website** and updated here as they become available. - - -## Have Questions? - -For inquiries, please contact pytorchevents@linuxfoundation.org. - -Submit your proposal by **April 13** and join the conversation shaping the future of PyTorch. \ No newline at end of file diff --git a/_posts/2025-03-19-sglang-joins-pytorch.md b/_posts/2025-03-19-sglang-joins-pytorch.md deleted file mode 100644 index 1334a6b6a52c..000000000000 --- a/_posts/2025-03-19-sglang-joins-pytorch.md +++ /dev/null @@ -1,105 +0,0 @@ ---- -layout: blog_detail -title: "SGLang Joins PyTorch Ecosystem: Efficient LLM Serving Engine" -author: "SGLang Team" -hidden: true ---- - - -![sglang logo](/assets/images/sglang-join-pytorch/fg1.png){:style="max-width:400px; display: block; margin-left: auto; margin-right: auto"} - - -We’re thrilled to announce that the SGLang project has been integrated into the PyTorch ecosystem! This integration ensures that SGLang aligns with PyTorch’s standards and practices, providing developers with a reliable and community-supported framework for fast and flexible serving of LLMs. - -To view the PyTorch Ecosystem, see the [PyTorch Landscape](https://landscape.pytorch.org/) and learn more about how projects can [join the PyTorch Ecosystem](https://github.com/pytorch-fdn/ecosystem). - - -## About SGLang - -SGLang is a fast-serving engine for large language models and vision language models. It makes the interaction with models faster and more controllable by co-designing the backend runtime and frontend language. - -The core features include: - -* Fast Backend Runtime: Provides efficient serving with RadixAttention for prefix caching, zero-overhead CPU scheduler, continuous batching, token attention (paged attention), speculative decoding, tensor parallelism, chunked prefill, structured outputs, and quantization (FP8/INT4/AWQ/GPTQ). -* Flexible Frontend Language: Offers an intuitive interface for programming LLM applications, including chained generation calls, advanced prompting, control flow, multi-modal inputs, parallelism, and external interactions. -* Extensive Model Support: Supports a wide range of generative models (Llama, Gemma, Mistral, Qwen, DeepSeek, LLaVA, etc.), embedding models (e5-mistral, gte, mcdse) and reward models (Skywork), with easy extensibility for integrating new models. -* Active Community: SGLang is open source and backed by an active community with industry adoption. - -SGLang is famous for its fast speed. It can often significantly outperform other state-of-the-art frameworks in terms of serving throughput and latency. You can learn more about the underlying techniques from the past release blog posts: [v0.2 blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/), [v0.3 blog](https://lmsys.org/blog/2024-09-04-sglang-v0-3/), [v0.4 blog](https://lmsys.org/blog/2024-12-04-sglang-v0-4/). - -SGLang has been widely adopted by leading industry companies and frontier research labs. 
For example, xAI uses SGLang to serve its flagship model, [Grok 3](https://grok.com/), which is currently the best model according to the Chatbot Arena leaderboard. Microsoft Azure uses SGLang to serve [DeepSeek R1](https://techcommunity.microsoft.com/blog/azurehighperformancecomputingblog/running-deepseek-r1-on-a-single-ndv5-mi300x-vm/4372726) on AMD GPUs, which is currently the best open source model. - - -## Serving DeepSeek Models - -You can easily launch a Docker container to serve a DeepSeek model with the following command: - -``` -# Pull the latest image -docker pull lmsysorg/sglang:latest - -# Launch a server -docker run --gpus all --shm-size 32g -p 30000:30000 -v ~/.cache/huggingface:/root/.cache/huggingface --ipc=host --network=host --privileged lmsysorg/sglang:latest \ - python3 -m sglang.launch_server --model deepseek-ai/DeepSeek-V3 --tp 8 --trust-remote-code --port 30000 -``` - -Then you can query the server with the OpenAI-compatible API - -``` -import openai -client = openai.Client(base_url=f"http://127.0.0.1:30000/v1", api_key="None") - -response = client.chat.completions.create( - model="deepseek-ai/DeepSeek-V3", - messages=[ - {"role": "user", "content": "List 3 countries and their capitals."}, - ], - temperature=0, - max_tokens=64, -) -``` - -The server launch command above works for 8xH200. You can find detailed instructions for other hardware (MI300X, H100, A100, H20, L40S) at https://docs.sglang.ai/references/deepseek.html. - -SGLang integrates DeepSeek-specific optimizations, such as MLA throughput optimizations, MLA-optimized kernels, data-parallel attention, multi-token prediction, and DeepGemm, making it the top choice for serving DeepSeek models by dozens of [companies](https://x.com/lmsysorg/status/1887262321636221412), including AMD, NVIDIA, and many cloud providers. The team is actively working on integrating more optimizations following the 2025 H1 roadmap below. - - -## Serving Llama Models - -Similarly, you can launch the server for a Llama 3.1 text model with: - -``` -python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-8B-Instruct -``` - -Or a Llama 3.2 multimodal model with: - -``` -python3 -m sglang.launch_server --model-path meta-llama/Llama-3.2-11B-Vision-Instruct --chat-template=llama_3_vision -``` - - -## Roadmap - -This year, the SGLang team will continue to push the boundaries of system efficiency. You can find the roadmap of 2025H1 [here](https://github.com/sgl-project/sglang/issues/4042). The focus is - -- Throughput-oriented large-scale deployment similar to the DeepSeek inference system -- Long context optimizations -- Low latency speculative decoding -- Reinforcement learning training framework integration -- Kernel optimizations - -## Community - -SGLang has been deployed to large-scale production, generating trillions of tokens every day. It has an active community with over three hundred contributors on GitHub. It is supported by the following institutions: AMD, Atlas Cloud, Baseten, Cursor, DataCrunch, Etched, Hyperbolic, iFlytek, Jam & Tea Studios, LinkedIn, LMSYS, Meituan, Nebius, Novita AI, NVIDIA, RunPod, Stanford, UC Berkeley, UCLA, xAI, and 01.AI. - - -![logos](/assets/images/sglang-join-pytorch/fg2.png){:style="width:100%;"} - - - -## Conclusion - -We’re excited to welcome SGLang to the PyTorch ecosystem. SGLang accelerates the serving of large language and vision language models. It’s widely adopted by industry, powering the large-scale online serving of frontier models like Grok and DeepSeek. 
- -We invite you to explore the [SGLang GitHub repo](https://github.com/sgl-project/sglang/tree/main), join the [community on Slack](https://slack.mindee.com/), and reach out to [contact@sglang.ai](mailto:contact@sglang.ai) for inquiries or collaboration opportunities. Together, we can make powerful AI models accessible to everyone. \ No newline at end of file diff --git a/_posts/2025-04-03-pt-day-france-cfp.md b/_posts/2025-04-03-pt-day-france-cfp.md deleted file mode 100644 index 9ed63b302833..000000000000 --- a/_posts/2025-04-03-pt-day-france-cfp.md +++ /dev/null @@ -1,58 +0,0 @@ ---- -layout: blog_detail -title: "PyTorch Day France 2025: Call For Proposals Open" ---- - -We’re pleased to announce **[PyTorch Day France 2025](https://events.linuxfoundation.org/pytorch-day-france/)**, a dedicated gathering of the PyTorch community held **7 May 2025** in **Paris, France**. Proudly hosted by the **PyTorch Foundation** and co-located with **[GOSIM AI Paris 2025](https://paris2025.gosim.org/)**, this event will bring together developers, researchers, and practitioners driving innovation in open source AI and machine learning. - -Whether you're building cutting-edge models or contributing to the ecosystem, PyTorch Day France is your opportunity to connect, collaborate, and help shape the future of deep learning. - - - -![PT Day CFP](/assets/images/pt-day-cfp.png){:style="max-width:600px; display: block; margin-left: auto; margin-right: auto"} - - -## Why Attend? - -Set in the vibrant atmosphere of STATION F, the world’s largest startup campus, PyTorch Day France will offer a full day of: - -* Insightful Technical Talks -* Interactive Discussions -* Engaging Poster Sessions - -The event is designed to foster open exchange across the PyTorch ecosystem, providing a space to learn from peers, share practical insights, and explore the latest research and applications in AI. - - -## Submit a Proposal - -We are currently accepting proposals for talks. If you have a project, idea, or research story you'd like to share with the PyTorch community, we want to hear from you. - -📩 Email your **talk title and abstract** to [pytorchevents@linuxfoundation.org](mailto:pytorchevents@linuxfoundation.org) for consideration. - - -## Registration - -To register for PyTorch Day France, please visit the **GOSIM AI Paris website**, and use the code PYTORCHFRIEND to receive 25% off. - -👉 [https://paris2025.gosim.org/](https://paris2025.gosim.org/) - -We encourage early registration to secure your spot and ensure access to both PyTorch Day France and the broader GOSIM AI Paris programming. - - -## Venue - -STATION F -5 Parv. Alan Turing, 75013 Paris, France -A landmark of innovation and entrepreneurship in the heart of Paris. - - -## Travel and Accommodations - -Participants are responsible for their own travel and lodging. For those arriving internationally, Paris Charles de Gaulle Airport is approximately 38.4 km from STATION F. Additional information about accommodations and transportation may be available on the [GOSIM AI Paris website](https://paris2025.gosim.org/). - - -## Questions? - -For any inquiries, please contact us at [pytorchevents@linuxfoundation.org](mailto:pytorchevents@linuxfoundation.org). - -We look forward to welcoming the PyTorch community to Paris this May for a day of collaboration, learning, and open source AI innovation. 
\ No newline at end of file diff --git a/_posts/2025-04-08-accelerating-whisper-arm-w-transformers.md b/_posts/2025-04-08-accelerating-whisper-arm-w-transformers.md deleted file mode 100644 index 10db0cabc270..000000000000 --- a/_posts/2025-04-08-accelerating-whisper-arm-w-transformers.md +++ /dev/null @@ -1,39 +0,0 @@ ---- -layout: blog_detail -title: "Accelerating Whisper on Arm with PyTorch and Hugging Face Transformers" -author: Pareena Verma, Arm ---- - -Automatic speech recognition (ASR) has revolutionized how we interact with technology, clearing the way for applications like real-time audio transcription, voice assistants, and accessibility tools. OpenAI Whisper is a powerful model for ASR, capable of multilingual speech recognition and translation. - -A new Arm Learning Path is now available that explains how to accelerate Whisper on Arm-based cloud instances using PyTorch and Hugging Face transformers. - -**Why Run Whisper on Arm?** - -Arm processors are popular in cloud infrastructure for their efficiency, performance, and cost-effectiveness. With major cloud providers such as AWS, Azure, and Google Cloud offering Arm-based instances, running machine learning workloads on this architecture is becoming increasingly attractive. - -**What You’ll Learn** - -The [Arm Learning Path](https://learn.arm.com/learning-paths/servers-and-cloud-computing/whisper/) provides a structured approach to setting up and accelerating Whisper on Arm-based cloud instances. Here’s what you cover: - -**1. Set Up Your Environment** - -Before running Whisper, you must set up your development environment. The learning path walks you through setting up an Arm-based cloud instance and installing all dependencies, such as PyTorch, Transformers, and ffmpeg. - -**2. Run Whisper with PyTorch and Hugging Face Transformers** - -Once the environment is ready, you will use the Hugging Face transformer library with PyTorch to load and execute Whisper for speech-to-text conversion. The tutorial provides a step-by-step approach for processing audio files and generating audio transcripts. - -**3. Measure and Evaluate Performance** - -To ensure efficient execution, you learn how to measure transcription speeds and compare different optimization techniques. The guide provides insights into interpreting performance metrics and making informed decisions on your deployment. - -**Try it Yourself** - -Upon completion of this tutorial, you know how to: - -* Deploy Whisper on an Arm-based cloud instance. -* Implement performance optimizations for efficient execution. -* Evaluate transcription speeds and optimize further based on results. - -**Try the live demo today** and see audio transcription in action on Arm: [Whisper on Arm Demo](https://learn.arm.com/learning-paths/servers-and-cloud-computing/whisper/_demo/). \ No newline at end of file diff --git a/_posts/2025-04-23-pytorch-2-7.md b/_posts/2025-04-23-pytorch-2-7.md deleted file mode 100644 index 1f31b9f2e6c3..000000000000 --- a/_posts/2025-04-23-pytorch-2-7.md +++ /dev/null @@ -1,161 +0,0 @@ ---- -layout: blog_detail -title: "PyTorch 2.7 Release" ---- - -We are excited to announce the release of PyTorch® 2.7 ([release notes](https://github.com/pytorch/pytorch/releases/tag/v2.7.0))! 
This release features:

* support for the [NVIDIA Blackwell GPU architecture](https://www.nvidia.com/en-us/data-center/technologies/blackwell-architecture/) and pre-built wheels for [CUDA 12.8](https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html) across Linux x86 and arm64 architectures;
* *torch.compile* support for Torch Function Modes, which enables users to override any *torch.** operation to implement custom user-defined behavior;
* Mega Cache, which allows users to have end-to-end portable caching for torch;
* new features for FlexAttention: LLM first token processing, LLM throughput mode optimization, and Flex Attention for Inference.

This release is composed of 3262 commits from 457 contributors since PyTorch 2.6. We want to sincerely thank our dedicated community for your contributions. As always, we encourage you to try these out and report any issues as we improve 2.7. More information about how to get started with the PyTorch 2-series can be found at our [Getting Started](https://pytorch.org/get-started/pytorch-2.0/) page.

| Beta | Prototype |
|------|-----------|
| Torch.Compile support for Torch Function Modes | NVIDIA Blackwell Architecture Support |
| Mega Cache | PyTorch Native Context Parallel |
|  | Enhancing Intel GPU Acceleration |
|  | FlexAttention LLM first token processing on x86 CPUs |
|  | FlexAttention LLM throughput mode optimization on x86 CPUs |
|  | Foreach Map |
|  | Flex Attention for Inference |
|  | Prologue Fusion Support in Inductor |

        - - -*To see a full list of public feature submissions click [here](https://docs.google.com/spreadsheets/d/1TzGkWuUMF1yTe88adz1dt2mzbIsZLd3PBasy588VWgk/edit?usp=sharing). - - -## BETA FEATURES - - -### [Beta] Torch.Compile support for Torch Function Modes - -This feature enables users to override any *torch.** operation to implement custom user-defined behavior. For example, ops can be rewritten to accommodate a specific backend. This is used in FlexAttention to re-write indexing ops. - -See the [tutorial](https://pytorch.org/tutorials/recipes/torch_compile_torch_function_modes.html) for more information. - - -### [Beta] Mega Cache - -Mega Cache allows users to have end-to-end portable caching for torch. The intended use case is after compiling and executing a model, the user calls *torch.compiler.save_cache_artifacts()* which will return the compiler artifacts in a portable form. Later, potentially on a different machine, the user may call *torch.compiler.load_cache_artifacts()* with these artifacts to pre-populate the torch.compile caches in order to jump-start their cache. - -See the [tutorial](https://pytorch.org/tutorials/recipes/torch_compile_caching_tutorial.html#torch-compile-end-to-end-caching-mega-cache) for more information. - - -## PROTOTYPE FEATURES - - -### [Prototype] NVIDIA Blackwell Architecture Support - -PyTorch 2.7 introduces support for NVIDIA's new Blackwell GPU architecture and ships pre-built wheels for CUDA 12.8. For more details on CUDA 12.8 see [CUDA Toolkit Release](https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html). - - - -* Core components and libraries including cuDNN, NCCL, and CUTLASS have been upgraded to ensure compatibility with Blackwell platforms. -* PyTorch 2.7 includes Triton 3.3, which adds support for the Blackwell architecture with torch.compile compatibility. -* To utilize these new features, install PyTorch with CUDA 12.8 using: *pip install torch==2.7.0 --index-url https://download.pytorch.org/whl/cu128* - -More context can also be found [here](https://github.com/pytorch/pytorch/issues/145949). - - -### [Prototype] PyTorch Native Context Parallel - -PyTorch Context Parallel API allows users to create a Python context so that every *torch.nn.functional.scaled_dot_product_attention() *call within will run with context parallelism. Currently, PyTorch Context Parallel supports 3 attention backends: 1. Flash attention; 2. Efficient attention; and 3. cuDNN attention. - -As an example, this is [used within TorchTitan as the Context Parallel solution for LLM training](https://discuss.pytorch.org/t/distributed-w-torchtitan-breaking-barriers-training-long-context-llms-with-1m-sequence-length-in-pytorch-using-context-parallel/215082). - -See [tutorial](https://pytorch.org/tutorials/prototype/context_parallel.html) here. - - -### [Prototype] Enhancing Intel GPU Acceleration - -This latest release introduces enhanced performance optimizations for Intel GPU architectures. These improvements accelerate workloads across various Intel GPUs through the following key enhancements: - - - -* Enable torch.compile on Windows 11 for Intel GPUs, delivering the performance advantages over eager mode as on Linux. -* Optimize the performance of PyTorch 2 Export Post Training Quantization (PT2E) on Intel GPU to provide a full graph mode quantization pipelines with enhanced computational efficiency. -* Improve Scaled Dot-Product Attention (SDPA) inference performance with bfloat16 and float16 to accelerate attention-based models on Intel GPUs. 
-* Enable AOTInuctor and torch.export on Linux to simplify deployment workflows. -* Implement more Aten operators to enhance the continuity of operators execution on Intel GPU and increase the performance on Intel GPU in eager mode. -* Enable profiler on both Windows and Linux to facilitate model performance analysis. -* Expand the Intel GPUs support to [Intel® Core™ Ultra Series 2 with Intel® Arc™ Graphics](https://www.intel.com/content/www/us/en/products/details/processors/core-ultra.html), and [Intel® Arc™ B-Series graphics](https://www.intel.com/content/www/us/en/products/docs/discrete-gpus/arc/desktop/b-series/overview.html) on both Windows and Linux. - -For more information regarding Intel GPU support, please refer to [Getting Started Guide](https://pytorch.org/docs/main/notes/get_start_xpu.html). - -See also the tutorials [here](https://pytorch.org/tutorials/prototype/inductor_windows.html) and [here](https://pytorch.org/tutorials/prototype/pt2e_quant_xpu_inductor.html). - - -### [Prototype] FlexAttention LLM first token processing on x86 CPUs - -FlexAttention x86 CPU support was first introduced in PyTorch 2.6, offering optimized implementations — such as PageAttention, which is critical for LLM inference—via the TorchInductor C++ backend. In PyTorch 2.7, more attention variants for first token processing of LLMs are supported. With this feature, users can have a smoother experience running FlexAttention on x86 CPUs, replacing specific *scaled_dot_product_attention* operators with a unified FlexAttention API, and benefiting from general support and good performance when using torch.compile. - - -### [Prototype] FlexAttention LLM throughput mode optimization - -The performance of FlexAttention on x86 CPUs for LLM inference throughput scenarios has been further improved by adopting the new C++ micro-GEMM template ability. This addresses the performance bottlenecks for large batch size scenarios present in PyTorch 2.6. With this enhancement, users can transparently benefit from better performance and a smoother experience when using FlexAttention APIs and torch.compile for LLM throughput serving on x86 CPUs. - - -### [Prototype] Foreach Map - -This feature uses torch.compile to allow users to apply any pointwise or user-defined function (e.g. torch.add) to lists of tensors, akin to the existing *torch._foreach_** ops. The main advantage over the existing *torch._foreach_** ops is that any mix of scalars or lists of tensors can be supplied as arguments, and even user-defined python functions can be lifted to apply to lists of tensors. Torch.compile will automatically generate a horizontally fused kernel for optimal performance. - -See [tutorial](https://pytorch.org/tutorials/recipes/foreach_map.html) here. - - -### [Prototype] Flex Attention for Inference - -In release 2.5.0, [FlexAttention](https://pytorch.org/blog/flexattention/)* torch.nn.attention.flex_attention* was introduced for ML researchers who’d like to customize their attention kernels without writing kernel code. This update introduces a decoding backend optimized for inference, supporting GQA and PagedAttention, along with feature updates including nested jagged tensor support, performance tuning guides and trainable biases support. - -### [Prototype] Prologue Fusion Support in Inductor - -Prologue fusion optimizes matrix multiplication (matmul) operations by fusing operations that come before the matmul into the matmul kernel itself, improving performance by reducing global memory bandwidth. 
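To make the idea concrete, here is a minimal sketch (our illustration, not an excerpt from the release notes) of the kind of pattern prologue fusion targets: a pointwise op, such as an on-the-fly dtype cast of the weight, feeding directly into a matmul. Compiled with `mode="max-autotune"`, Inductor may fold that cast into the generated matmul kernel instead of first materializing a casted copy of the weight in global memory; whether the fusion actually happens depends on Inductor's heuristics and autotuning.

```
import torch

def linear_bf16(x: torch.Tensor, w: torch.Tensor) -> torch.Tensor:
    # The cast is a "prologue" to the matmul: with prologue fusion, Inductor
    # can fold it into the matmul kernel rather than writing a bf16 copy of
    # `w` out to global memory first.
    return x @ w.to(torch.bfloat16)

compiled = torch.compile(linear_bf16, mode="max-autotune")

x = torch.randn(4096, 4096, device="cuda", dtype=torch.bfloat16)
w = torch.randn(4096, 4096, device="cuda")  # fp32 weight, cast on the fly
out = compiled(x, w)
```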
diff --git a/_posts/2025-04-25-pytorch-2-7-intel-gpus.md b/_posts/2025-04-25-pytorch-2-7-intel-gpus.md deleted file mode 100644 index 7643d20ae51b..000000000000 --- a/_posts/2025-04-25-pytorch-2-7-intel-gpus.md +++ /dev/null @@ -1,92 +0,0 @@ ---- -layout: blog_detail -title: "Accelerate PyTorch 2.7 on Intel® GPUs" -author: the Intel PyTorch Team ---- - -[PyTorch 2.7](https://pytorch.org/blog/pytorch-2-7/) continues to deliver significant functionality and performance enhancements on Intel® GPU architectures to streamline AI workflows. Application developers and researchers seeking to fine-tune, inference and develop PyTorch models on Intel GPUs will now have a consistent user experience across various operating systems, including Windows, Linux and Windows Subsystem for Linux (WSL2). This is made possible through improved installation, eager mode script debugging, a performance profiler, and graph model (torch.compile) deployment. As a result, developers have greater options with a unified GPU programming paradigm for both front-end and back-end development. - -## Incremental improvements of Intel GPU support in PyTorch - -Since PyTorch 2.4, we've made steady improvements to Intel GPU support with each release. With PyTorch 2.7, we are excited to share that we have established a solid foundation to have Intel GPU work in both graph mode (torch.compile) and eager mode on Windows and Linux. This includes a wide range of Intel GPU products, many of which you may already access. We hope these enhancements will unlock more ubiquitous hardware for your AI research and development. - -* Over time, we have expanded Intel GPU Support across Windows and Linux, including these products: - * [Intel® Arc™ A-Series Graphics](https://www.intel.com/content/www/us/en/products/docs/discrete-gpus/arc/desktop/a-series/overview.html) - * [Intel® Arc™ B-Series Graphics](https://www.intel.com/content/www/us/en/products/docs/discrete-gpus/arc/desktop/b-series/overview.html) - * [Intel® Core™ Ultra Processors with Intel Arc Graphics](https://www.intel.com/content/www/us/en/support/articles/000097599/processors.html) - * [Intel® Core™ Ultra Mobile Processors (Series 2) with Intel Arc Graphics](https://www.intel.com/content/www/us/en/products/docs/processors/core-ultra/core-ultra-series-2-mobile-product-brief.html) - * [Intel® Core™ Ultra Desktop Processors (Series 2) with Intel Arc Graphics](https://www.intel.com/content/www/us/en/products/docs/processors/core-ultra/core-ultra-desktop-processors-series-2-brief.html) - * [Intel® Data Center GPU Max Series](https://www.intel.com/content/www/us/en/products/details/discrete-gpus/data-center-gpu/max-series.html) -* [Simpler installation](https://pytorch.org/docs/2.7/notes/get_start_xpu.html) of torch-xpu PIP wheels and an effortless setup experience. -* High ATen operation coverage with SYCL and oneDNN for smooth eager mode support with functionality and performance. -* Notable speedups with torch.compile through default TorchInductor and Triton backend, proved by measurable performance gains with Hugging Face, TIMM, and TorchBench benchmarks. - -Check out the detailed advancements in these related release blogs:[ PyTorch 2.4](https://pytorch.org/blog/intel-gpus-pytorch-2-4/),[ PyTorch 2.5](https://pytorch.org/blog/intel-gpu-support-pytorch-2-5/), and[ PyTorch 2.6](https://pytorch.org/blog/unlocking-pt-2-6-intel/). - - -## What's New in PyTorch 2.7 - -These are the features in PyTorch 2.7 that were added to help accelerate performance on Intel GPUs. 
- - - -* Improve scaled dot-product attention (SDPA) inference performance with bfloat16 and float16 to accelerate attention-based models on Intel GPUs. -With the new SDPA optimization for Intel GPUs on PyTorch 2.7, Stable Diffusion float16 inference achieved up to 3x gain over PyTorch 2.6 release on Intel® Arc™ B580 Graphics and Intel® Core™ Ultra 7 Processor 258V with Intel® Arc™ Graphics 140V on eager mode. See Figure 1 below. - - -![chart](/assets/images/pytorch-2-7-intel-gpus/fg1.png){:style="width:100%"} - -**Figure 1. PyTorch 2.7 Stable Diffusion Performance Gains Over PyTorch 2.6** - -* Enable torch.compile on Windows 11 for Intel GPUs, delivering the performance advantages over eager mode as on Linux. With this, Intel GPUs became the first accelerator to support torch.compile on Windows. Refer to[ Windows tutorial](https://pytorch.org/tutorials/prototype/inductor_windows.html) for details. -Graph model (torch.compile) is enabled in Windows 11 for the first time across Intel GPUs, delivering the performance advantages over eager mode as on Linux by PyTorch 2.7. The latest performance data was measured on top of PyTorch Dynamo Benchmarking Suite using Intel® Arc™ B580 Graphics on Windows showcase torch.compile speedup ratio over eager mode as shown in Figure 2. Both training and inference achieved similar significant improvements. - - -![chart](/assets/images/pytorch-2-7-intel-gpus/fg2.png){:style="width:100%"} - -**Figure 2. Torch.compile Performance Gains Over Eager Mode on Windows** - - - -* Optimize the performance of PyTorch 2 Export Post Training Quantization (PT2E) on Intel GPU to provide full graph mode quantization pipelines with enhanced computational efficiency. Refer to [PT2E tutorial](https://pytorch.org/tutorials/prototype/pt2e_quant_xpu_inductor.html) for details. -* Enable AOTInductor and torch.export on Linux to simplify deployment workflows. Refer to[ AOTInductor tutorial](https://pytorch.org/docs/main/torch.compiler_aot_inductor.html) for details. -* Enable profiler on both Windows and Linux to facilitate model performance analysis. Refer to the[ PyTorch profiler tutorial](https://pytorch.org/tutorials/recipes/recipes/profiler_recipe.html#pytorch-profiler) for details. - -Review the [Getting Started on Intel GPU Guide](https://pytorch.org/docs/2.7/notes/get_start_xpu.html) for a tour of the environment setup and a quick start on Intel GPUs. - - -## Future Work - -Looking ahead, we will continue the Intel GPU upstream efforts in future PyTorch releases to: - -* Attain state-of-the-art PyTorch-native performance to showcase competitive GEMM computational efficiency for torch.compile, and enhance performance for LLM models through FlexAttention and lower precision data types. -* Broaden feature compatibility by delivering distributed XCCL backend support for Intel® Data Center GPU Max Series. -* Expand accelerator support across core PyTorch ecosystem components including torchao, torchtune, and torchtitan. - -Follow along in the [PyTorch Dev Discussion](https://dev-discuss.pytorch.org/t/intel-gpu-cpu-enabling-status-and-feature-plan-2025-h1-update/2913) to learn more about Intel GPU & CPU enabling status and features. As we get further along, we will create tickets on GitHub to document our progress. - - -## Summary - -In this blog, we reviewed the Intel GPU upstream progress starting in PyTorch 2.4 and highlighted the new features of PyTorch 2.7 that accelerate AI workload performance across various Intel GPUs. 
These new features, especially SDPA on Windows, achieved up to 3x inference (Stable Diffusion, float16) gain over PyTorch 2.6 release on Intel Arc B580 Graphics and Intel Core Ultra 7 Processor 258V with Intel Arc Graphics 140V. Also, torch.compile on Windows delivers similar performance advantages over eager mode on Dynamo benchmarks as on Linux. - - -## Acknowledgments - -We want to thank the following PyTorch maintainers for their technical discussions and insights: [Nikita Shulga](https://github.com/malfet), [Jason Ansel](https://github.com/jansel), [Andrey Talman](https://github.com/atalman), [Alban Desmaison](https://github.com/alband), and [Bin Bao](https://github.com/desertfire). - -We also thank collaborators from PyTorch for their professional support and guidance. - -## Product and Performance Information - -Measurement on Intel Core Ultra 7 258V: 2200 MHz, 8 Core(s), 8 Logical Processor(s) with Intel Arc 140V GPU (16GB), GPU memory 18.0 GB, using Intel Graphics Driver 32.0.101.6647 (WHQL Certified), Windows 11 Pro - 24H2. And Intel Core Ultra 5 245KF: 4200 MHz, 14 Core(s), 14 Logical Processor(s), Intel Arc B580 Graphics, dedicated GPU memory 12.0 GB, shared GPU memory 15.8 GB, using Intel Graphics Driver 32.0.101.6647 (WHQL Certified), Windows 11 Enterprise LTSC - 24H2. Test by Intel on Apr 8th, 2025. - -## Notices and Disclaimers - -Performance varies by use, configuration and other factors. Learn more on the Performance Index site. Performance results are based on testing as of dates shown in configurations and may not reflect all publicly available updates.  See backup for configuration details.  No product or component can be absolutely secure. Your costs and results may vary. Intel technologies may require enabled hardware, software or service activation. - -Intel Corporation. Intel, the Intel logo, and other Intel marks are trademarks of Intel Corporation or its subsidiaries. Other names and brands may be claimed as the property of others. - -## AI Disclaimer - -AI features may require software purchase, subscription or enablement by a software or platform provider, or may have specific configuration or compatibility requirements. Details at [www.intel.com/AIPC](http://www.intel.com/AIPC). Results may vary. \ No newline at end of file diff --git a/_posts/2025-04-28-accelerating-training-float8-rowwise-crusoe.md b/_posts/2025-04-28-accelerating-training-float8-rowwise-crusoe.md deleted file mode 100644 index 245688c07605..000000000000 --- a/_posts/2025-04-28-accelerating-training-float8-rowwise-crusoe.md +++ /dev/null @@ -1,195 +0,0 @@ ---- -layout: blog_detail -title: "Accelerating Large Scale Training and Convergence with PyTorch Float8 Rowwise on Crusoe 2K H200s" -author: Meta and Crusoe ---- - -**Meta**: Less Wright, Hamid Shojanazeri, Vasiliy Kuznetsov, Daniel Vega-Myhre, Gokul Nadathur, Will Constable, Tianyu Liu, Tristan Rice, Driss Guessous, Josh Fromm, Luca Wehrstedt, Jiecao Yu -**Crusoe**: Ethan Petersen, Martin Cala, Chip Smith - -Working with [Crusoe.AI](http://Crusoe.AI) we were provided access to one of their new 2K H200 clusters in Iceland, which enabled us to showcase training accelerations of 34 - 43% at scale by leveraging TorchTitan’s HSDP2 and TorchAO’s new float8 rowwise, with comparable convergence and stability vs BF16. 
- - -![bar chart](/assets/images/accelerating-training-float8-rowwise-crusoe/fg1.png){:style="width:100%;"} - - -In this post we detail the synergy of H200’s with PyTorch’s new Float8 rowwise training with TorchTitan’s FSDP2/HSDP2 and CP at scale. - -## Background - what is an H200? - -H200’s are an ‘enhanced’ H100, offering the exact same compute as an H100, but with two additional improvements. - -* Larger global memory, 141GiB HBM3e vs the standard 80GiB HBM3 -* Memory bandwidth is ~43% faster with 4.8TB/s vs 3.35 TB/s. The faster memory transfer has an outsized effect on training speed, especially for PyTorch’s AsyncTP. - -## What is PyTorch Float8 rowwise? - -Float 8 Rowwise is a finer grained resolution for Float8 vs the previous ‘tensor wise’ Float8. It is designed to ensure finer grained accuracy to support larger workloads that tend to become more sensitive to quantization at scale and as training progresses. - -There are two key improvements with Float8 rowwise: - -* Each row now maintains its own scaling factor versus a single scaling factor for the entire tensor, thus improving quantization precision. Finer grained scaling per row helps reduce the effect of outliers (extreme values that force the quantization scaling factor to stretch and degrade the precision of the normally distributed values) and thus ensures better precision. -* The scaling factor itself is now implemented by rounding down to the nearest power of 2. This has been shown to help reduce quantization errors when multiplying/dividing by the scaling factor as well as ensuring large values remain scaled to the same value in both the forward and backward passes. - -Note that other large scale models have been trained using Float8 at 2K scale with a combination of 1x128 groupwise and 128x128 blockwise, with power of 2 scaling factors. They had the same goal of improving Float8’s precision for supporting large scale training. - -Thus, Float8 rowwise offers a similar promise to enable Float8 for very large scale training, but we wanted to provide proof of stability and convergence at scale, which training on the Crusoe H200 2k cluster provided initial verification thereof. - -## Showcasing Float8 Rowwise Loss convergence vs BF16 at 1600 and 1920 GPU Scale: - -In order to verify comparable loss convergence, we ran two separate runs at both 1920 and then 1600 (1.6k) gpu scale using TorchTitan and Lllama3 70B. The 1.6K GPU runs were set for 2.5k iterations, using TorchTitans’ HSDP2 and Context Parallel to enable 2D parallelism. - -The loss convergence tests were run using Titan’s deterministic mode - this mode effectively freezes most potential sources of variation from run to run, and thus helps ensure that the only substantial change is what we want to test, namely the loss convergence and loss curves of BF16 vs Float8 Rowwise. - -Note that deterministic mode also slows down training speed because various kernels will not be autotuned to maximize throughput (otherwise we risk using different kernels between runs and introducing variance). - -Two runs were completed, one with BF16 and the other with Float8 Rowwise. - -Both runs completed their assigned 2.5k iters without issue, showcasing the Crusoe cluster stability, with FP8 completing at exactly 24 hours and BF16 finishing after 31 hours, 19 minutes. - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
| DType | Time / Iters | Loss |
|----------------|------------------------------|---------|
| BF16 | 24 hours | 3.15453 |
| Float8 Rowwise | 24 hours | 2.86386 |
| BF16 | 31 hours, 19 minutes / 2.5K | 2.88109 |
| Float8 Rowwise | 24 hours / 2.5K | 2.86386 |
        - - -At the 24 hour mark, Float8 completed 2.5K iterations showcasing the comparative speed up (even in deterministic mode) of float8 training. At the 24 hour mark, Float8 enabled a **+9.21%** relative improvement in loss compared to BF16 for the same 24 hours of large scale training time. - - -After 31 hours, 19 minutes, the BF16 run finally completed its 2.5k iters. - - -The final loss numbers: -BF16 = **2.88109** -Float8 = **2.86386** - -From the loss curves we observed very similar curves at the first and last ⅓ and then a turbulent zone in the middle where both showed similar spikes, but with a slight skew to the relative timing of the spikes. - - -![line chart](/assets/images/accelerating-training-float8-rowwise-crusoe/fg2.png){:style="width:100%;"} - - -As a result of this, we can see that PyTorch’s Float8 rowwise offers similar convergence but over 33% speedup for the same amount of training time. - -## Long Term Training stability with Float8 Rowwise - -Beyond showcasing comparable convergence, we also wanted to show longer term training stability with Float8 and thus we launched a 4 day, 15K run at 256 scale. - -![line chart](/assets/images/accelerating-training-float8-rowwise-crusoe/fg3.png){:style="width:100%;"} - - -As shown above, Float8 training ran for over 100 hours with no issues, highlighting the long term stability of Float8 Rowwise. - -## Determinism in TorchTitan - -To verify determinism and to see if the spikiness in the longer runs was from scale, we also ran a smaller run comprising of 2 runs of BF16, and 1 run of Float8 at 256 scale, and with HSDP2 only (i.e. without 2D Context parallel). - -In this case both BF16 runs had identical curves and final loss, and we saw a similar spikiness zone for all three runs. - -At the 2K iteration mark, both Float8 and BF16 ending at nearly identical points: -BF16 *2 = **3.28538** -Float8 rowwise = **3.28203** - -![line chart](/assets/images/accelerating-training-float8-rowwise-crusoe/fg4.png){:style="width:100%;"} - - -The above result confirms that neither CP nor scale (2k) are responsible for spikiness in the loss as we saw similar effect at 256 scale as well. The most likely explanation for the loss spikes could be content distribution in the dataset. - -For the sake of determinism, the experiments were run with a serialized C4 dataset (not shuffled), meaning the spikes could be from encountering new content within the dataset. - -## Net speedups at various Scales with Float8 rowwise: - -We performed shorter runs at various GPU scales to understand how Float8 Rowwise would scale in terms of training acceleration as cluster sizes expanded. Doubling in scale from 960 to 1920, Float8 continued to deliver impressive training speedups, with a range of over 34-43% gains compared to BF16. We also want to note that scaling from 1k to 2k GPUs communication overhead likely kicked in and we observed a 4% hit on throughput with BF16. - -![bar chart](/assets/images/accelerating-training-float8-rowwise-crusoe/fg5.png){:style="width:100%;"} - - -As shown in the longer training runs at scale above, Float8 rowwise delivered substantial speedups with equal or even slightly improved loss endpoints while delivering 34% speedups at 1920 (DeepSeek) scale. - -## How can I use Float8 Rowwise in my training? - -Float8 Rowwise is available now for you to use in your large scale training. 
It is packaged in [TorchAO’s](https://github.com/pytorch/ao) latest builds (0.9 and higher) and integrated into [TorchTitan](https://github.com/pytorch/torchtitan) natively if you want to get up and running quickly. - -To activate Float8 Rowwise in TorchTitan: - -First enable the model converter to hotswap the nn.linears into float8 linear layers in your models .toml file - see line 29: - - -![code](/assets/images/accelerating-training-float8-rowwise-crusoe/fg6.png){:style="max-width:600px; display: block; margin-left: auto; margin-right: auto"} - -Secondly, specify the ‘rowwise’ float8 recipe - see line 72: - - -![code](/assets/images/accelerating-training-float8-rowwise-crusoe/fg7.png){:style="max-width:600px; display: block; margin-left: auto; margin-right: auto"} - - -Note that you have three choices for the ‘recipe_name’: - -* rowwise which is the recommended default, -* tensorwise (the older style float8) and -* rowwise_with_gw_hp. - -The gw_hp rowwise option keeps the gradients to the weights in BF16 precision during the backwards pass, and this can further enhance float8 precision for extremely sensitive workloads. But, it can ironically be a bit more performant than generic rowwise if the majority of the matmul sizes in your model are smaller (with an estimated tipping point at roughly 13-16K dimensions on H100). - -Thus while we recommend rowwise as the default, it may be worth comparing with gw_hp on your model to verify which provides the best performance, with an upside of even greater precision. - -By toggling the model converter on and off with a #, you can directly compare training acceleration between BF16 and Float8 Rowwise to understand the potential speedups for your own training. - -## Future Updates: - -We’ll have an additional update coming showcasing multiple improvements for Pipeline Parallel and Async Distributed Checkpointing so please stay tuned. \ No newline at end of file diff --git a/_posts/2025-04-29-pt-foundation-expands.md b/_posts/2025-04-29-pt-foundation-expands.md deleted file mode 100644 index a0b0454ae588..000000000000 --- a/_posts/2025-04-29-pt-foundation-expands.md +++ /dev/null @@ -1,50 +0,0 @@ ---- -layout: blog_detail -title: "PyTorch Foundation Expands to an Umbrella Foundation to Accelerate AI Innovation" -author: Matt White, Executive Director, PyTorch Foundation ---- - -Today, I am thrilled to announce a significant milestone for the PyTorch Foundation: we are expanding our scope to become an umbrella foundation, allowing us to host additional projects. This expansion positions the PyTorch Foundation to foster a broader ecosystem of high-value, trusted, and innovative AI projects that cater to all stages of the AI lifecycle—from training and inference to industry-specific applications. - -## Why Expand? - -Since its inception at the Linux Foundation two and a half years ago, the PyTorch Foundation has rapidly grown, now encompassing over 30 member organizations and 120 vibrant ecosystem projects. PyTorch itself has become the framework of choice for AI researchers, practitioners, and industry leaders worldwide. Our flagship PyTorch Conference has seen attendance multiply sixfold over just two years, reflecting the community’s tremendous enthusiasm and engagement. 
- -With new initiatives such as PyTorch Day events, global community meetups, the PyTorch Ambassador Program, Open Source Program Office (OSPO) outreach, the Speaker’s Bureau, and our upcoming training and certification programs, we have significantly deepened our community’s expertise and collaboration capabilities. To sustain and accelerate this momentum, the logical next step was to expand the PyTorch Foundation into an umbrella organization. - -## What Does an Umbrella Foundation Mean? - -By transitioning into an umbrella foundation, PyTorch will now host a range of diverse, high-quality AI and ML projects beyond PyTorch Core. These include foundation-hosted projects in two categories: - - -* **Platform Projects**: Domain-agnostic solutions essential across various stages of the AI lifecycle, such as training, inference, model optimization, and deployment as well as agentic systems. -* **Vertical Projects**: Domain-specific projects tailored to particular industries or applications, such as biomedical imaging, protein folding, and geospatial analysis. - -Projects under our umbrella gain immediate access to vendor-neutral governance, enhanced visibility, increased funding opportunities, and robust community engagement and support. - -## Foundation-Hosted vs. Ecosystem Projects - -As we expand, it’s important to clarify the distinction between foundation-hosted and ecosystem projects: - -* **Foundation-Hosted Projects** are projects that fall under the umbrella, they are officially governed and administered under the PyTorch Foundation’s neutral and transparent governance model. Project maintainers continue to oversee their project, and they transfer assets to the Linux Foundation for independent stewardship and adopt an open governance model significantly reducing vendor bias and encouraging broader community contributions and adoption. These projects have greater stability and longevity and integrate with the larger PyTorch community. -* **Ecosystem Projects** remain independently managed but receive recognition and increased visibility by aligning themselves closely with the PyTorch Foundation community standards. These projects meet specific quality and maturity criteria but retain full independence in governance and asset management. - -## How to Join the PyTorch Ecosystem or Become a Foundation-Hosted Project - -We have clearly defined pathways for projects looking to become part of the PyTorch community: - -1. **[Ecosystem Project Status](https://github.com/pytorch-fdn/ecosystem)**: Projects must meet defined criteria, such as active development, comprehensive documentation, CI/CD infrastructure, clear governance, and community engagement. Approved ecosystem projects benefit from increased exposure and official recognition on the [PyTorch Landscape](https://landscape.pytorch.org/). -2. **[Candidate Project Status](https://github.com/pytorch-fdn/foundation-hosted)**: Ecosystem projects aspiring to foundation-hosted status can become candidates by securing sponsorship from a PyTorch Foundation [Technical Advisory Council (TAC)](/tac) voting member. Candidates receive guidance on meeting all necessary governance, technical, and strategic criteria. -3. **[Foundation-Hosted Project Status](https://github.com/pytorch-fdn/foundation-hosted)**: Candidate projects demonstrating high maturity, stability, multi-platform support, security best practices, and strategic value to the PyTorch community can be approved by the TAC. 
These projects gain extensive benefits, including neutral trademark hosting, foundation support, marketing and events resources, governance guidance, and strategic funding opportunities. - -## Ensuring Long-Term Success and Innovation - -By expanding our scope to become an umbrella foundation, the PyTorch Foundation is uniquely positioned to enhance collaboration, innovation, and sustained growth across the entire AI community. Our mission is clear: create a vendor-neutral, open source environment where the best AI and ML tools can thrive, benefiting users, contributors, and industry stakeholders worldwide. - -*“PyTorch is absolutely the foundation of the innovation happening in AI today and with projects like Llama, ChatGPT, and hundreds of thousands of open projects built on PyTorch, it has cemented itself as a critical ingredient to the world of AI. This move to create an umbrella foundation enables PyTorch to significantly expand its ecosystem both horizontally and vertically in this new era of agentic systems. I am very excited about this opportunity to take the PyTorch community to the next level!” - Joe Spisak, Product Director for PyTorch at Meta.* - -*"PyTorch sits at the very core of AI today. Meanwhile, the depth of the AI stack has grown dramatically—evolving from enabling accelerated compute to powering fully autonomous systems. Broadening the PyTorch Foundation is a key step in keeping the AI revolution open and accessible to all, across the stack and aligned with the principles PyTorch was built on." - Luca Antiga, CTO at Lightning AI.* - -We are incredibly optimistic about the opportunities ahead and excited to welcome new projects into our growing family. The PyTorch Foundation remains deeply committed to driving AI innovation forward, and together, we will continue to build the future of open source artificial intelligence. - -Stay tuned for more updates, announcements, and opportunities to participate! \ No newline at end of file diff --git a/_posts/2025-04-30-6x-faster-async-checkpointing.md b/_posts/2025-04-30-6x-faster-async-checkpointing.md deleted file mode 100644 index 12a2f9e1b1de..000000000000 --- a/_posts/2025-04-30-6x-faster-async-checkpointing.md +++ /dev/null @@ -1,108 +0,0 @@ ---- -layout: blog_detail -title: "6x faster Async Checkpointing in PyTorch, using Cached Plans, no GIL contention" -author: Meta and Crusoe ---- - -**Meta**: Less Wright, Meet Vadakkanchery, Saurabh Mishra, Ela Krepska, Hamid Shojanazeri, Pradeep Fernando -**Crusoe**: Ethan Petersen, Martin Cala, Chip Smith - -PyTorch DCP (Distributed Checkpointing) has recently enabled new optimizations in asynchronous checkpointing to reduce GPU utilization drop by minimizing collective overhead and improving overall checkpointing efficiency. - -Using Crusoe’s 2K H200 cluster, with TorchTitan and training a Llama3-70B, we were able to verify these new features deliver substantial speedups at 1856 GPU scale, reducing the background processing time for async DCP checkpoints from ~436 seconds to ~67 seconds. - -This is roughly a 6.5x reduction in background checkpoint processing time, enabling even more total training time to proceed at full training throughput. - -![chart](/assets/images/6x-faster-async-checkpointing/fg1.png){:style="width:100%"} - - -*Fig 1: 1856 training run with high frequency checkpointing. 
The first checkpoint (drop down in tps) does not have a cached save plan, and the background processing takes far longer than the rest where the cached plan is used.* - - -## Background: What is Asynchronous Checkpointing? - -In a standard checkpointing workflow, GPUs are blocked while the checkpointing data is offloaded from GPU to CPU and then written to storage. After the save to physical media is complete, training can resume. - -Asynchronous checkpointing greatly reduces this downtime by enabling the actual saving to storage to be done via CPU threads, allowing GPU-based training to continue while the checkpoint data is being persisted in parallel. It is used primarily for intermediate/fault tolerant checkpoints as it unblocks the GPUs much faster compared to the synchronous checkpoints. \ -For example, in our large-scale experiment, GPU training was blocked for less than a second (.78 seconds at 1856 scale) while checkpoint data was moved from GPU to CPU (staging). At that point, GPU training immediately continues, which is a substantial training time improvement over traditional checkpointing. For reference, Async Checkpointing is covered in more detail [here](https://pytorch.org/blog/reducing-checkpointing-times/). - - -## Challenges with Asynchronous Checkpointing - -However, the background processing inherent in Asynchronous Checkpointing has additional challenges that result in a temporary reduction of training throughput while the storage phase is being completed. These are highlighted below. - - -### GPU utilization drop from GIL contention: - -The Global Interpreter Lock (GIL) in Python is a mechanism that prevents multiple native threads from executing Python bytecode at the same time. This lock is necessary mainly because CPython's memory management is not thread-safe. - -DCP currently uses background threads for metadata collectives and uploading to storage. Although these expensive steps are done asynchronously, it leads to contention for the GIL with the trainer threads. This causes the GPU utilization (QPS) to suffer significantly and also increases the e2e upload latency. For large-scale checkpoints, the overhead of the CPU parallel processing has a suppressive effect on net GPU training speed since CPUs also drive the training process via GPU kernel launches. - -Please refer to the following figure from our experiments: - -![chart](/assets/images/6x-faster-async-checkpointing/fg2.png){:style="width:100%"} - - -*Fig 2: One can see a sustained drop in training QPS even after staging (i.e. blocking operation to trainer) is complete.* - -The first dip in Figure 2 (marked by the purple line) indicates that staging is complete, and training can continue. However, a second drop is evident (marked by the area between the purple and yellow lines) which is due to trainer thread and checkpointing threads contending for the Python GIL, leading to degraded training QPS until the checkpoint thread completes execution. - - -### Collective communications cost: - -DCP performs multiple collectives today for various reasons: dedupe, global metadata for the checkpoint, resharding, and distributed exception handling. Collectives are costly as these require network I/O and pickling/unpickling of the large metadata being sent across the GPU network. These collectives become extremely expensive as the job scale grows, leading to significantly higher e2e latency and potential for collective timeouts. 
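Both of these overheads come from the background work behind a single asynchronous DCP save. For reference, a minimal sketch of how such a save is typically kicked off is shown below (our illustration; it assumes an initialized process group, and the helper name, paths, and training-loop variables are placeholders):

```
import torch.distributed.checkpoint as dcp

# Assumes torch.distributed is already initialized and that `model`,
# `optimizer`, and `step` come from the surrounding training loop.
def save_async(model, optimizer, step, checkpoint_dir="/tmp/checkpoints"):
    state_dict = {
        "model": model.state_dict(),
        "optim": optimizer.state_dict(),
    }
    # Staging (GPU -> CPU copy) briefly blocks the trainer; the write to
    # storage then proceeds in the background while training continues.
    return dcp.async_save(
        state_dict, checkpoint_id=f"{checkpoint_dir}/step_{step}"
    )

# future = save_async(model, optimizer, step)
# ... keep training ...
# future.result()  # optionally wait before issuing the next save
```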
- - -## Solutions - - -### Process based async checkpointing - -DCP now supports async checkpoint save via a background process. This helps avoid the training QPS drop by eliminating the python GIL contention with the trainer threads. Please see Fig 2 for checkpointing via threads and Fig 3 for checkpointing via background process. - - -### Caching of the save plans - -DCP has a clear boundary between the planning and storage I/O steps. SavePlanner in DCP is a stateful component which acts as an access proxy to the state_dict. Planner manages save plans prepared by individual ranks, which carry metadata information necessary to do the write I/O. The planning step involves a collective operation to gather a comprehensive view of the checkpoint on the coordinator rank. The coordinator rank is responsible for de-duplicating parameters/weights to eliminate redundancies, validating the global plan to ensure accuracy and consistency, and creating the global metadata structs. This is followed by a scatter collective where the coordinator rank assigns I/O tasks to each rank. Any transformations done on the plans affect how the storage components finally write the data. - -During the course of a training job, multiple checkpoints are saved. In the majority of these cases, only the checkpoint data changes between different save instances, and thus, the plan remains the same. This presented an opportunity for us to cache the plans, pay the planning cost only on the first save, and then amortize that cost across all the subsequent attempts. Only the updated plans (plans which changed in the next attempt) are sent via collective, thus reducing the collective overhead significantly. - - -## Experiment Results - -**Set up:** 1856 H200 GPUs, Llama3-70B, HSDP2 with TorchTitan - -After deploying both the solutions above, the following are the key results: - -* TPS drop has significantly narrowed, with a peak dip to 372 vs 315 tps, and for a greatly reduced time window (~67 seconds vs ~437 seconds). This time window is now mostly attributed to the blocking for CPU processing. -* Subsequent checkpoint save attempts also continue to be much faster due to very low overhead at the planning stage. E2E latency is thus improved by over 6.5x. This will allow our partners to increase the checkpointing frequency and reduce the lost training progress (i.e. wasted training time). - -If you look at the very first downspike in Figure 1, this drawdown in GPU processing time takes training throughput from 700 down to 320 tps, and suppresses it for roughly 7 minutes (467 seconds). Once the CPUs have finished processing, training continues again at full speed. - -Previously, this ~7 minute suppression would be repeated at *every* checkpoint. However, with the new process-based checkpointing feature, only the first checkpoint has the full drawdown time (mainly due to overhead from daemon process initialization), as all future checkpoints are executed via the background process, mitigating GIL contention with the trainer threads. - -This is visually shown in all the subsequent checkpoints where the average MFU suppression time drops to just over a minute, reflected by the sharp spikes that almost immediately revert to full MFU throughput. 
- - -![chart](/assets/images/6x-faster-async-checkpointing/fg3.png){:style="width:100%"} - - -*Fig 3: The red box shows the non-cached plan checkpoint, which also includes Checkpoint Background Init process overhead, while the purple box highlights the first checkpoint to run with the cached plan.* - -This means that even large-scale checkpointing, such as shown in Fig 2 at 1856 GPU scale, can be done with ~6x reduced training throughput impact. This enables Asynchronous DCP checkpointing to be run more frequently (thus better rollback protection) while enhancing total training throughput relative to previous Async Checkpointing overhead. - -**Using DCP’s cached checkpointing:** - -This feature is already available as part of the PyTorch nightly builds, and you can test out PyTorch’s Asynchronous DCP checkpointing directly in TorchTitan. Following are the instructions to enable these features: - -* Process-based asynchronous checkpointing: - * Set the **async_checkpointer_type** to AsyncCheckpointerType.PROCESS in the [async_save](https://github.com/pytorch/pytorch/blob/main/torch/distributed/checkpoint/state_dict_saver.py#L193) API. (*file*: pytorch/torch/distributed/checkpoint/state_dict_saver.py) -* Save plan caching: - * Set the **enable_plan_caching** flag to true in the [DefaultSavePlanner](https://github.com/pytorch/pytorch/blob/main/torch/distributed/checkpoint/default_planner.py#L78C9-L78C28). (*file*: pytorch/torch/distributed/checkpoint/default_planner.py) - - -## Future work - -DCP will be rolling out additional optimizations to further improve the checkpointing cost. Currently even though the save plans are cached, coordinator rank still prepares the metadata. For larger jobs and models with many tensors, this overhead is non-trivial. In the next iteration, DCP will eliminate the metadata overhead and improve the e2e latency further. DCP will also introduce additional optimizations, such as zero-overhead checkpointing, to enable efficient checkpointing in large-scale jobs. - -Stay tuned! diff --git a/_posts/2025-04-30-flexattention-for-inference.md b/_posts/2025-04-30-flexattention-for-inference.md deleted file mode 100644 index 587aedf2158a..000000000000 --- a/_posts/2025-04-30-flexattention-for-inference.md +++ /dev/null @@ -1,380 +0,0 @@ ---- -layout: blog_detail -title: "FlexAttention Part II: FlexAttention for Inference" -author: Joy Dong, Boyuan Feng, Driss Guessous, Joel Schlosser, Yanbo Liang, Horace He ---- - -## Overview - -In PyTorch 2.5.0 release, we introduced [FlexAttention](https://pytorch.org/blog/flexattention/) `torch.nn.attention.flex_attention` for ML researchers who’d like to customize their attention kernels without writing kernel code. This blog introduces our decoding backend optimized for inference, supporting GQA and PagedAttention, along with feature updates including nested jagged tensor support, performance tuning guides and trainable biases support. - -If you’re looking for an easy way to play around with FlexAttention in your post-training / inference pipeline, PyTorch native post-training library [torchtune](https://github.com/pytorch/torchtune) and inference codebase [gpt-fast](https://github.com/pytorch-labs/gpt-fast) already have FlexAttention integrated. Try it out! - -We are excited to share that our paper on FlexAttention has been accepted for presentation at the MLSys2025 Conference held from May 12-15th in Santa Clara, California. 
- -Title: **FlexAttention: A Programming Model for Generating Optimized Attention Kernels.** [Poster](https://mlsys.org/virtual/2025/poster/3007) - - -## FlexAttention for Inference - -TL;DR: `torch.compile` lowers `flex_attention` to a fused [FlashDecoding](https://pytorch.org/blog/flash-decoding/) kernel when it runs on a very short query. - -One fused attention kernel does not suit all – especially in long-context LLM inference. - -The decoding phase of LLM inference is an iterative process: tokens are generated one at a time, requiring `N` forward passes to generate an `N`-token sentence. Fortunately, each iteration doesn’t need to recompute self-attention over the full sentence — previously calculated tokens are cached, therefore we only need to attend the newly generated token to the cached context. - - -![chart](/assets/images/flexattention-for-inference/fg1.png){:style="width:100%"} - - -This results in a unique attention pattern where a short query sequence (1 token) attends to a long key-value cache (context length up to 128k). Traditional optimizations for square attention kernels (`q_len ≈ kv_len`) don’t directly apply here. This pattern poses new challenges for GPU memory utilization and occupancy. We build a dedicated FlexDecoding backend optimized for long-context LLM inference incorporating decoding-specific techniques from [FlashDecoding](https://pytorch.org/blog/flash-decoding/). - -FlexDecoding is implemented as an alternative backend for the `torch.nn.attention.flex_attention `operator. `flex_attention` automatically switches to the FlexDecoding backend for its JIT compilation when given a short query and a long KV cache. If the input shape changes significantly, for example transitioning from the prefill phase to decoding, JIT recompilation generates a separate kernel for each scenario. - -``` -flex_attention = torch.compile(flex_attention) - -k_cache = torch.random(B, H, 16384, D) -v_cache = torch.random(B, H, 16384, D) - -... - -# Prefill Phase: query shape = [B, H, 8000, D] -flex_attention(q_prefill, k_cache, v_cache, ...) # Uses FlexAttention backend optimized for prefill & training - -# Decoding Phase: q_last_token shape = [B, H, 1, D] -flex_attention(q_last_token , k_cache, v_cache, ...) # Recompiles with the FlexDecoding backend - -# decode 2 tokens at the same time: q_last_2_tokens shape = [B, H, 2, D] -flex_attention(q_last_2_tokens, k_cache, v_cache, ...) # No recompilation needed! Runs the decoding kernel again. -``` - - -## Working with KV Cache - -One of the key optimizations for efficient inference is maintaining a preallocated KV cache that updates **in place** as new tokens are generated. Instead of enforcing a specific KV cache policy with a dedicated API, FlexDecoding allows users to define and manage the KV cache themselves. - -Similar to FlexAttention, FlexDecoding takes user-defined `mask_mod` and `score_mod` functions. These functions modify attention scores before the softmax operation. - -![chart](/assets/images/flexattention-for-inference/fg2.png){:style="width:100%"} - -``` -score_mod(score, b, h, q_idx, kv_idx) -> tensor # return updated score -``` - -Score is a scalar pytorch tensor that represents the dot product of a query token and a key token. 
The rest of the arguments specify which score is being computed: - - - -* `b` batch index -* `h` attention head index -* `q_idx` token position in query tensor -* `kv_idx` token position in key/value tensor - -In the decoding phase, previously calculated tokens are cached, and only the latest generated token (i-th) is used as the query. A naive causal mask on this one token query looks like this: - -``` -def causal(score, b, h, q_idx, kv_idx): - return torch.where(q_idx >= kv_idx, score, -float("inf")) -``` - - -![chart](/assets/images/flexattention-for-inference/fg3.png){:style="width:100%"} - - -This is problematic: the new token “*saw*” should attend to all previously generated tokens i.e. “*The cat sat on the mat and saw*”, not just the first entry in the kv cache. To correct this, the `score_mod` needs to **offset q_idx** **by i **for accurate decoding. - - -![chart](/assets/images/flexattention-for-inference/fg4.png){:style="width:100%"} - - -Creating a new `score_mod` for each token to accommodate the offset is slow since it means FlexAttention needs to be recompiled every iteration for a different `score_mod`. Instead, - -We define this `offset` as a tensor and increment its value at each iteration: - -``` -offset = torch.tensor(i, "cuda") -def causal_w_offset(score, b, h, q_idx, kv_idx): - return torch.where(q_idx + offset >= kv_idx, score, -float("inf")) - -# Attend the i-th token -flex_attention(..., score_mod=causal_w_offset ) # Compiles the kernel here -... -# Attend the i+1-th token -offset = offset + 1 # Increment offset -flex_attention(..., score_mod=causal_w_offset ) # Doesn't need to recompile! -``` - -Notably, here `offset` becomes a captured tensor and it does not need to recompile if `offset` changes values. - -Manually rewriting your `score_mod` and `mask_mod` for offset handling isn't necessary. We can automate this process with a generic rewriter: - -``` -offset = torch.tensor(i, "cuda") - -def get_score_mod_w_offset(score_mod: _score_mod_signature, _offset: tensor): - def _score_mod(score, b, h, q, kv): - return score_mod(score, b, h, q + _offset, kv) - return _score_mod - -def get_mask_mod_w_offset(mask_mod: _mask_mod_signature, _offset: tensor): - def _mask_mod(b, h, q, kv): - return mask_mod(b, h, q + _offset, kv) - return _mask_mod - -causal_w_offset = get_score_mod_w_offset(causal, offset) -``` - -## BlockMask for Inference - -We can also use BlockMask with inference to leverage mask sparsity. The idea is to precompute the BlockMask once during model setup and use slices of it during decoding - - -### Precomputing BlockMask - -During setup, we create a squared BlockMask for `MAX_SEQ_LEN x MAX_SEQ_LEN`: - -``` -from torch.nn.attention.flex_attention import create_block_mask - -def causal_mask(b, h, q_idx, kv_idx): - return q_idx >= kv_idx - -block_mask = create_block_mask(causal_mask, B=None, H=None, Q_LEN=MAX_SEQ_LEN,KV_LEN=MAX_SEQ_LEN) -``` - -![chart](/assets/images/flexattention-for-inference/fg5.png){:style="width:100%"} - - -### Using BlockMask During Decoding - -For the i-th token, we use a slice of the mask: - -``` -block_offset = i // block_mask.BLOCK_SIZE[0] -block_mask_slice = block_mask[:, :, block_offset] - -# don't forget to use the mask_mod with offset! 
-block_mask_slice.mask_mod = get_mask_mod_w_offset(causal_mask) -``` - -![chart](/assets/images/flexattention-for-inference/fg6.png){:style="width:100%"} - - -## Performance - - -![chart](/assets/images/flexattention-for-inference/fg7.png){:style="width:100%"} - -FlexDecoding kernel performs on par with FlashDecoding (FAKV) and significantly outperforms pytorch scaled_dot_product_attention ([code](https://github.com/pytorch/pytorch/blob/main/benchmarks/transformer/score_mod.py)). - - -![chart](/assets/images/flexattention-for-inference/fg8.png){:style="width:100%"} - -FlexDecoding boosts LLaMa3.1-8B serving performance by 1.22x-2.04x, and LLaMa3.1-70B performance by 0.99x - 1.66x compared to SDPA in gpt-fast. ([code](https://github.com/pytorch-labs/gpt-fast)) - - -## Paged Attention - -[vLLM](https://blog.vllm.ai/2023/06/20/vllm.html) is one of the popular LLM serving engines, powered by the efficient memory management from PagedAttention. Existing [PagedAttention](https://github.com/vllm-project/vllm/blob/main/csrc/attention/paged_attention_v2.cu) implementation requires dedicated CUDA kernels and shows limited flexibility on supporting emerging attention variants. In this section, we present a PT2-native PagedAttention implementation that is enabled by flex attention and torch.compile. - -PagedAttention scatters KV cache to reduce memory fragmentation and support higher batch sizes. Without PagedAttention, KV cache from the same request are stored in a contiguous memory, requiring 2 tensor of shape *B x H x KV LEN x D*. We call it a logical KV cache. Here, KV_LEN is the maximum sequence length over all requests in a batch. Considering the Figure 1(a), KV_LEN is 9 thus all requests must be padded to 9 tokens, leading to large memory waste. With PagedAttention, we can chunk each request into multiple pages of the same size page_size and scatter these pages into a physical KV cache of shape *1 x H x max seq len x D*, where max_seq_len=n_pages x page_size. This avoids padding requests to the same length and saves memory. Specifically, we provide an `assign` API to update KV cache via index computations: - -``` -def assign( - batch_idx: torch.Tensor, - input_pos: torch.Tensor, - k_val: torch.Tensor, - v_val: torch.Tensor, - k_cache: torch.Tensor, - v_cache: torch.Tensor, -) -> None -``` - -Behind this `assign` API is a page table, a tensor mapping logical KV cache to physical KV cache: - -[batch_idx, logical_page_idx] -> physical_page_idx - -`assign` takes `k_val` and `v_val` and scatters to physical KV cache guided by the mapping from the page table. - - -![chart](/assets/images/flexattention-for-inference/fg9.png){:style="width:100%"} - - -**Paged Attention with Page Table** - -A natural question is, how to integrate PagedAttention with flex attention to support diverse attention variants? A naive idea is to materialize the logical KV cache before computing with flex attention. But this leads to redundant memory copy and bad performance. Another idea is to build a dedicated CUDA or Triton kernel for paged attention, similar to [existing PagedAttention implementation](https://github.com/vllm-project/vllm/blob/main/csrc/attention/paged_attention_v2.cu). However, this adds much manual effort and code complexity. - -Instead, we design a fused indirect memory access by converting a logical block mask according to the page table. In FlexAttention, we exploit BlockMask to identify logical blocks and skip redundant computation. 
While Paged Attention adds an extra layer of indirect memory access, we can further convert the logical block mask to the physical block mask corresponding to the page table, as illustrated in Figure 2. Our PagedAttention implementation provides a `convert_logical_block_mask` via torch.gather calls: - -``` -def convert_logical_block_mask( - block_mask: BlockMask, - batch_idx: Optional[torch.Tensor] = None, -) -> BlockMask -``` - -![chart](/assets/images/flexattention-for-inference/fg10.png){:style="width:100%"} - - - -**Paged Attention via Block Mask Conversion** - -One remaining question is how to rewrite user-specified `mask_mod` and `score_mod` for PagedAttention. When users specify these modifications, they write with logical indices without the knowledge of the page table maintained at runtime. The following code shows an automated conversion at runtime which is necessary to rewrite user-specified modifications with physical kv indices. The `new_mask_mod` would take the physical_kv_idx and convert it back to the logical_kv_idx and apply user-specified `mask_mod` on the logical_kv_idx for the correct mask. For efficiency, we maintain physical_to_logical as a mapping from physical_kv_block to logical_kv_block to facilitate the conversion. For correctness, we mask out-of-boundary blocks as False with a `torch.where` call. After batching logical KV caches from multiple requests into the same physical KV cache, there are much more physical blocks than the number of logical blocks for each request. Thus, a physical block may not have a corresponding logical block for a specific request during block mask conversion. By masking as False with `torch.where`, we can ensure the correctness that data from different requests do not interfere with each other. Similarly, we can convert the [score_mod](https://github.com/pytorch/pytorch/blob/main/torch/nn/attention/experimental/_paged_attention.py#L308-L338) automatically. - -``` -def get_mask_mod(mask_mod: Optional[_mask_mod_signature]) -> _mask_mod_signature: - if mask_mod is None: - mask_mod = noop_mask - - def new_mask_mod( - b: torch.Tensor, - h: torch.Tensor, - q_idx: torch.Tensor, - physical_kv_idx: torch.Tensor, - ): - physical_kv_block = physical_kv_idx // page_size - physical_kv_offset = physical_kv_idx % page_size - logical_block_idx = physical_to_logical[b, physical_kv_block] - logical_kv_idx = logical_block_idx * page_size + physical_kv_offset - return torch.where( - logical_block_idx >= 0, mask_mod(b, h, q_idx, logical_kv_idx), False - ) - - return new_mask_mod -``` - -Figure 3 demonstrates the latency from Paged Attention ([code](https://github.com/pytorch-labs/attention-gym/blob/main/attn_gym/paged_attention/latency.py)). Overall, there is less than 5% overhead from Flex Attention with Paged Attention, compared with Flex Attention only. We also observe an on-par performance with Flash Attention v2. A [minimal serving example](https://github.com/pytorch-labs/attention-gym/blob/main/attn_gym/paged_attention/throughput.py) further shows that PagedAttention can support 76x higher batch size when evaluating on [OpenOrca dataset](https://huggingface.co/datasets/Open-Orca/OpenOrca) which includes 1M GPT-4 completions and 3.2M GPT-3.5 completions. 
- - -![chart](/assets/images/flexattention-for-inference/fg11.png){:style="width:100%"} - - -**Paged Attention: Latency under diverse sequence length** - - -## Ragged input sequences with Nested Jagged Tensors (NJTs) - -FlexAttention now supports ragged-sized input sequences through the use of Nested Jagged Tensors (NJTs). NJTs represent ragged-sized sequences by packing sequences into a single “stacked sequence” and maintaining a set of offsets delimiting sequence boundaries for each batch item. - -A block mask can be created for input NJTs through the new `create_nested_block_mask()` API. The returned block mask is compatible with the ragged structure of the given NJT, treating it as a single “stacked sequence” with inter-sequence attention automatically masked out. The mask_mod or score_mod function can be written as usual. - -``` -from torch.nn.attention.flex_attention import create_nested_block_mask, flex_attention - -BATCH = 8 -NUM_HEADS = 8 -D = 16 -device = "cuda" - -# Input NJTs of shape (BATCH, SEQ_LEN*, D) with ragged SEQ_LEN -sequence_lengths = [torch.randint(5, 30, ()).item() for _ in range(BATCH)] -query = torch.nested.nested_tensor([ - torch.randn(seq_len, NUM_HEADS * D, device=device) - for seq_len in sequence_lengths -], layout=torch.jagged) -key = torch.randn_like(query) -value = torch.randn_like(query) - -# View as shape (BATCH, NUM_HEADS, SEQ_LEN*, HEAD_DIM) -query = query.unflatten(-1, [NUM_HEADS, D]).transpose(1, 2) -key = key.unflatten(-1, [NUM_HEADS, D]).transpose(1, 2) -value = value.unflatten(-1, [NUM_HEADS, D]).transpose(1, 2) - -# Simple causal mask -def my_mask_mod(b, h, q_idx, kv_idx): - return q_idx >= kv_idx - -# Construct a block mask using the ragged structure of the -# specified query NJT. Ragged-sized sequences are treated as a single -# "stacked sequence" with inter-sequence attention masked out. -block_mask = create_nested_block_mask(my_mask_mod, 1, 1, query) - -# For cross attention, create_nested_block_mask() also supports a -# rectangular block mask using the ragged structures of both query / key. -#block_mask = create_nested_block_mask(my_mask_mod, 1, 1, query, key) - -output = flex_attention(query, key, value, block_mask=block_mask) -``` - -## Trainable Biases - -FlexAttention now supports trainable parameters in `score_mod functions.` This feature enables users to reference tensors that require gradients within their `score_mod` implementations, with gradients automatically backpropagating through these parameters during training. - - -### Memory-Efficient Gradient Accumulation - -Instead of materializing the full attention scores matrix, FlexAttention uses atomic additions (`tl.atomic_add`) to accumulate gradients. This approach significantly reduces memory usage at the cost of introducing some non-determinism in gradient calculations. - - -### Handling Broadcasted Operations - -Broadcasting operations in the forward pass (e.g., `score + bias[h]`) require special consideration in the backward pass. When broadcasting a tensor across multiple attention scores within a head or other dimensions, we need to reduce these gradients back to the original tensor shape. Rather than materializing the full attention score matrix to perform this reduction, we use atomic operations. While this incurs some runtime overhead, it allows us to maintain memory efficiency by avoiding the materialization of large intermediate tensors. 
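As a concrete illustration of the broadcast case above, the following sketch (ours, assuming the prototype trainable-bias support) adds one trainable bias value per head. The bias is broadcast across every (q_idx, kv_idx) pair in the forward pass, and its gradient is accumulated back to shape `(H,)` in the backward pass via the atomic additions described earlier.

```
import torch
from torch.nn.attention.flex_attention import flex_attention

B, H, S, D = 2, 8, 256, 64
device = "cuda"

query = torch.randn(B, H, S, D, device=device, requires_grad=True)
key = torch.randn(B, H, S, D, device=device, requires_grad=True)
value = torch.randn(B, H, S, D, device=device, requires_grad=True)

# One trainable bias value per head, broadcast across all attention scores.
bias = torch.randn(H, device=device, requires_grad=True)

def score_mod(score, b, h, q_idx, kv_idx):
    return score + bias[h]

flex_compiled = torch.compile(flex_attention)
out = flex_compiled(query, key, value, score_mod=score_mod)
out.sum().backward()

# Gradients are reduced back to the bias shape; note they are accumulated
# with atomic adds, so values may differ very slightly between runs.
print(bias.grad.shape)  # torch.Size([8])
```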
- - -### Current Limitations - -The implementation currently allows only a single read from each input tensor in the `score_mod` function. For example, `bias[q_idx] + bias[kv_idx]` would not be supported, as it reads from the same tensor twice. We hope to remove this restriction in the future. - - -### Simple Example - -``` -bias = torch.randn(num_heads, requires_grad=True) -def score_mod(score, b, h, q_idx, kv_idx): - return score + bias[h] -``` - -## Performance Tuning for FlexAttention - - -### TL;DR - -For optimal performance, compile FlexAttention using `max-autotune`, especially when dealing with complex `score_mods` and `mask_mods`: - -`flex_attention = torch.compile(flex_attention, dynamic=True, mode='max-autotune')` - - -### What is `max-autotune`? - -`max-autotune` is a `torch.compile` mode in which TorchInductor sweeps many kernel parameters (e.g., tile size, `num_stages`) and selects the best-performing configuration. During this sweep, failing configurations are discarded safely, so the search settles on the best viable configuration. - -While compilation takes longer with `max-autotune`, the optimal configuration is cached and reused for future kernel executions. - -Here’s an example of the autotuning output for a FlexAttention kernel compiled with `max-autotune`: - -``` -triton_flex_attention_backward_7 0.2528 ms 100.0% BLOCKS_ARE_CONTIGUOUS=False, BLOCK_M1=32, BLOCK_M2=32, BLOCK_N1=32, BLOCK_N2=32, FLOAT32_PRECISION="'ieee'", GQA_SHARED_HEADS=7, HAS_FULL_BLOCKS=False, IS_DIVISIBLE=False, OUTPUT_LOGSUMEXP=True, PRESCALE_QK=False, QK_HEAD_DIM=128, ROWS_GUARANTEED_SAFE=False, SM_SCALE=0.08838834764831843, SPARSE_KV_BLOCK_SIZE=1073741824, SPARSE_Q_BLOCK_SIZE=1073741824, V_HEAD_DIM=128, num_stages=4, num_warps=4 -``` - -### Why Use `max-autotune` for FlexAttention? - -The amount of shared memory used by FlexAttention depends on the `score_mod` and `mask_mod` functions. This variability means that the preconfigured default kernel parameters may lead to performance cliffs or even out-of-shared-memory errors on certain hardware for some masks/mods. - -For instance, with document masks, default configurations can halve GPU occupancy, reducing performance to ~75% of its potential on some GPUs. To avoid such issues, we strongly recommend enabling `max-autotune`. - - -## Updates and Enhancements - -* Now available as a prototype feature in PyTorch 2.5.0 -* Fixed critical correctness issues, including a bug affecting multiple calls to FlexAttention within the same call to `torch.compile` - - -## Expanded Architecture Support - -* Arbitrary sequence length support - no longer requires multiples of 128 -* Added native grouped-query attention (GQA) support via `is_gqa=True` -* Enhanced dimension flexibility: - * Different QK and V head dimensions - * Non-power-of-two head dimensions -* Trainable attention biases (prototype) - - -## Under the Hood - -* New fused CPU backend -* Improved TF32 handling for float32 inputs -* Resolved various dynamic shape issues -* Output layout matching query strides - -These updates make FlexAttention more robust and flexible while maintaining its core promise of combining PyTorch's ease of use with FlashAttention's performance benefits. 
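Putting the pieces together, here is a closing sketch that combines a user-defined sliding-window causal mask with the `max-autotune` compilation recommended above. It assumes a CUDA device and a recent PyTorch build with FlexAttention; the shapes and window size are illustrative choices.

```
# End-to-end sketch: custom mask_mod + block mask + max-autotune compilation.
# Assumes a CUDA device; shapes and WINDOW are illustrative choices.
import torch
from torch.nn.attention.flex_attention import create_block_mask, flex_attention

B, H, S, D = 4, 16, 1024, 64
WINDOW = 128
device = "cuda"

def sliding_window_causal(b, h, q_idx, kv_idx):
    # Causal attention restricted to the most recent WINDOW tokens.
    return (q_idx >= kv_idx) & (q_idx - kv_idx <= WINDOW)

# B=None, H=None broadcasts the block mask over batch and heads.
block_mask = create_block_mask(sliding_window_causal, None, None, S, S, device=device)

flex_compiled = torch.compile(flex_attention, dynamic=True, mode="max-autotune")

q = torch.randn(B, H, S, D, device=device, dtype=torch.bfloat16)
k = torch.randn_like(q)
v = torch.randn_like(q)

out = flex_compiled(q, k, v, block_mask=block_mask)
print(out.shape)  # torch.Size([4, 16, 1024, 64])
```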
\ No newline at end of file diff --git a/_posts/2025-05-01-docathon-2025.md b/_posts/2025-05-01-docathon-2025.md deleted file mode 100644 index 1ad33370e775..000000000000 --- a/_posts/2025-05-01-docathon-2025.md +++ /dev/null @@ -1,54 +0,0 @@ ---- -layout: blog_detail -title: 'Announcing the PyTorch Docathon 2025' ---- - -![PyTorch Docathon 2025](/assets/images/docathon-2025.png){:style="max-width:600px; display: block; margin-left: auto; margin-right: auto"} - - -We're thrilled to announce the [2025 PyTorch Docathon](https://community.linuxfoundation.org/events/details/lfhq-pytorch-foundation-presents-pytorch-docathon-june-3rd-18th-2025/)! This is a hackathon-style event aimed at enhancing PyTorch documentation with the support of the community. Documentation is a vital component of any technology, and by refining it, we can simplify the onboarding process for new users, help them effectively utilize PyTorch's features, and ultimately speed up the transition from research to production in machine learning. - - -## WHY PARTICIPATE - - -### Low Barrier to Entry - -Unlike many open-source projects that require deep knowledge of the codebase and previous contributions to join hackathon events, the Docathon is tailored for newcomers. While we expect participants to be familiar with Python, and have basic knowledge of PyTorch and machine learning, there are tasks related to website issues that don't even require that level of expertise. - - -### Tangible Results - -A major advantage of the Docathon is witnessing the immediate impact of your contributions. Enhancing documentation significantly boosts a project's usability and accessibility, and you'll be able to observe these improvements directly. Seeing tangible outcomes can also be a strong motivator to continue contributing. - - -### Collaborative Environment - -The Docathon fosters a collaborative atmosphere, offering you the chance to work alongside other contributors and PyTorch maintainers to improve the documentation. This is a fantastic opportunity to learn from peers, exchange ideas, and build connections. - - -### Learning Opportunities - -Even if you're not a PyTorch expert, the Docathon offers a valuable learning experience. You'll have the chance to delve into PyTorch modules, test tutorials on your machine, and explore them in the CI environment. - - -## WHO SHOULD PARTICIPATE - -Whether you’re a seasoned documentation expert or just starting out, we invite everyone to join in the PyTorch docathon to contribute and develop your skills and knowledge to help improve the documentation for everyone! We will have issues labelled by skill level, and the PyTorch Discord will be available for collaboration and help. - - -## EVENT DETAILS - - - -* June 3: Kick-off 10 AM PT -* June 4 - June 15: Submissions and Feedback -* June 16 - June 17: Final Reviews -* June 18: Winner Announcements - -Make sure to [RSVP](https://community.linuxfoundation.org/events/details/lfhq-pytorch-foundation-presents-pytorch-docathon-june-3rd-18th-2025/) to the event so you receive all the notifications and instructions on how to participate. - -Further details about the Docathon will be shared during the Kick-off call on June 3. 
- - -**Don’t forget to register for this year’s event: [RSVP now](https://community.linuxfoundation.org/events/details/lfhq-pytorch-foundation-presents-pytorch-docathon-june-3rd-18th-2025/)** \ No newline at end of file diff --git a/_posts/2025-05-01-how-ibm-uses-pt-terratorch.md b/_posts/2025-05-01-how-ibm-uses-pt-terratorch.md deleted file mode 100644 index db6955023bc0..000000000000 --- a/_posts/2025-05-01-how-ibm-uses-pt-terratorch.md +++ /dev/null @@ -1,90 +0,0 @@ ---- -layout: blog_detail -title: 'How IBM Research Uses PyTorch and TerraTorch to Make Geospatial Computer Vision Accessible for Everyone' -hidden: true ---- - -Earth Observation-based analytics are becoming essential for understanding our planet — from monitoring deforestation to tracking urban development and analyzing the impacts of climate change. However, the coding and deep learning skills required to apply AI models to satellite imagery and earth observation data have traditionally been a major barrier for many practitioners. - -With IBM Research’s launch of TerraTorch 1.0, a PyTorch domain library for fine-tuning Geospatial Computer Vision Foundation Models, we are making geospatial AI not only more accessible but also more practical for the wider PyTorch community. Our goal: simplify the process so that any data scientist, researcher, or enthusiast can build powerful geospatial models with ease and low GPU and data processing requirements. - -![globes](/assets/images/how-ibm-uses-pt-terratorch/fg1.png){:style="width:100%"} - - -**The power of foundation models: even with 75-95% of the input data removed, the models do a fantastic job of reconstructing it, thereby learning the underlying physics of our planet in a deep, latent space** - -## The Business Challenge - -Our goal was to remove the technical barriers that prevent people from working with satellite imagery, weather and climate data at scale. Together with NASA, we’ve developed the Prithvi family of foundation models. The clean APIs that PyTorch provides have made it straightforward to integrate the latest innovations from AI research. - -We wanted to create a framework that anyone can use to go from raw data to inference-ready models in just a few steps. - - -![globes](/assets/images/how-ibm-uses-pt-terratorch/fg2.png){:style="width:100%"} - - -**How a weather and climate foundation model created and fine-tuned on PyTorch is used for weather forecasts** - -## How IBM Research Used PyTorch - -We’ve built TerraTorch on top of PyTorch, leveraging its dynamic ecosystem to integrate: - - - -* PyTorch Lightning for clean, scalable training loops -* TorchGeo for geospatial data handling and transformations (PyTorch transforms) -* For foundation models like the leading generative multimodal foundation model ['Terramind'](https://research.ibm.com/blog/terramind-esa-earth-observation-model), co-developed by IBM and ESA, and [the ‘Prithvi’ family](https://huggingface.co/ibm-nasa-geospatial), co-developed by IBM and NASA, TerraTorch has been used to fine-tune all of the downstream geospatial models for satellite imagery, weather and climate data. It includes the family of fine-tuned models that IBM has released as part of [Granite](https://huggingface.co/collections/ibm-granite/granite-geospatial-models-667dacfed21bdcf60a8bc982). In addition, other interesting foundation models and ecosystem components like Clay, SatMAE, Satlas, DeCur and DOFA are included in TerraTorch. 
-* Powerful and state-of-the-art vision transformers to experiment with modern neural network architectures -* TerraTorch-Iterate, built on top of PyTorch, Optuna, MLFlow and Ray Tune, for Hyperparameter Optimization (HPO), Neural Architecture Search (NAS) and Foundation Model Benchmarking (GeoBench), where TerraTorch became the reference implementation - - -![flow diagram](/assets/images/how-ibm-uses-pt-terratorch/fg5.png){:style="width:100%"} - -**The fine-tuning and inference process is completely described in a single YAML config file. There, the architectural building blocks of the model (backbone, neck, decoder, head) are defined. The Model Factory assembles the model using the built-in and custom registries. In addition, the Optimizer and Data Modules are created as defined in the config. Finally, everything is passed to the Lightning Trainer, which executes the task.** - - -With PyTorch’s flexibility, we were able to prototype quickly, iterate on model architectures, and deploy pipelines for a range of geospatial applications — from flood and biomass detection to increasing the resolution of climate data, where some of our work became part of the [IBM Granite Geospatial Model Family](https://huggingface.co/collections/ibm-granite/granite-geospatial-models-667dacfed21bdcf60a8bc982). - - -![flow diagram](/assets/images/how-ibm-uses-pt-terratorch/fg3.png){:style="width:100%"} - - -**Architecture of the Prithvi-EO-2.0-600M foundation model which IBM Research developed together with NASA** - -## Solving AI Challenges with PyTorch - -PyTorch helped us to tackle three major challenges: - -* Ease of experimentation: Dynamic computation graphs, automatic differentiation, full abstraction of CUDA and rich visualization tools made it simple to test different models and training strategies. -* Scalability: With DDP, FSDP, PyTorch Lightning and TorchGeo, we could train models on large-scale datasets without worrying about infrastructure. -* Community support: PyTorch, the de facto standard in AI research, with its active community and excellent documentation, made it easy to overcome hurdles and stay up to date with the latest advancements in AI research. - -## A Word from IBM Research - -*"PyTorch gave me the power to turn complex linear algebra and optimization problems into accessible, shareable solutions for the community. 
It feels empowering that we’re building and fine-tuning models for anyone curious about understanding our planet through AI."* - -— Romeo Kienzler, AI Research Engineer at IBM Research Zurich, Rueschlikon - - -![quote](/assets/images/how-ibm-uses-pt-terratorch/fg4.png){:style="width:100%"} - - -## The Benefits of Using PyTorch - -Using PyTorch allowed us to: - - - -* Build a reproducible, open-source framework for fine-tuning geospatial foundation models -* Share our work with the community through easy-to-follow notebooks, TerraTorch configuration files, tutorials and model checkpoints on HuggingFace -* Rapidly iterate over foundation model architectures and deploy fine-tuned models for inference, from research to real-world client products - -## Learn More - -For more information about this project and to explore the code, visit: - -* [GitHub Repository](https://github.com/IBM/terratorch) -* [IBM Research: Simplifying Geospatial AI with TerraTorch 1.0](https://research.ibm.com/blog/simplifying-geospatial-ai-with-terra-torch-1-0) -* [TerraTorch PrithviEOv2 example notebooks](https://github.com/IBM/terratorch/tree/main/examples/tutorials/PrithviEOv2) -* [TerraMind example notebooks](https://github.com/IBM/terramind/tree/main/notebooks) -* [Run TerraMind using TerraTorch on Colab](https://colab.research.google.com/github/IBM/terramind/blob/main/notebooks/terramind_v1_base_sen1floods11.ipynb) diff --git a/_posts/2025-05-02-pt-day-france-featured-sessions.md b/_posts/2025-05-02-pt-day-france-featured-sessions.md deleted file mode 100644 index 36bd9bacd37b..000000000000 --- a/_posts/2025-05-02-pt-day-france-featured-sessions.md +++ /dev/null @@ -1,49 +0,0 @@ ---- -layout: blog_detail -title: 'PyTorch Day France Featured Sessions: A Defining Moment for Open Source AI' ---- - -[PyTorch Day France](https://events.linuxfoundation.org/pytorch-day-france/) offers a front-row seat to the future of open source AI. Taking place **7 May at Station F in Paris** and co-located with **[GOSIM AI Paris](https://paris2025.gosim.org/)**, this one-day event will bring together developers, researchers, and industry leaders for a day of technical sessions, real-world insights, and community exchange. - - -## 🌍 A Major Milestone for the PyTorch Foundation - -This event marks the very first **PyTorch Day**, launching a new international series hosted annually in different regions to convene AI researchers, developers, engineers, and enthusiasts. PyTorch Days are designed to spotlight open source AI advancements, foster community collaboration, and provide a forum to learn about active, high-impact AI projects built using PyTorch. - -PyTorch Day France also represents a pivotal moment in the PyTorch Foundation’s journey. With its recent [expansion into an umbrella foundation]( https://pytorch.org/blog/pt-foundation-expands/), PyTorch is now positioned to support a broader ecosystem of trusted, community-driven AI projects across the full AI lifecycle. - -At PyTorch Day France, you’ll hear directly from PyTorch Foundation **Executive Director, Matt White,** about this transition—and get a first look at some exciting announcements. - - -## 🎟️ Registration Details - -[Register now](https://www.eventbrite.com/e/gosim-ai-paris-tickets-1265928669729?aff=oddtdtcreator) with code **PYTORCH** for **free access** to the full day of **PyTorch Day France** sessions, **plus** **GOSIM AI Paris**. - -🔗Two events, one registration—double the sessions, double the innovation. 
\ -[Register here](https://www.eventbrite.com/e/gosim-ai-paris-tickets-1265928669729?aff=oddtdtcreator) - - -## 📅 Featured Sessions - -The day’s agenda includes deep technical dives and applied AI use cases from across the community, including the following talks: - - - -* [Luca Antiga (Lightning AI)](https://sched.co/21nz4) - *Lightning Thunder: Supercharged PyTorch for Modern Hardware* -* [Erwan Gallen & Eldar Kurtic (Red Hat)](https://sched.co/21nyd) - *Scaling LLM Inference with vLLM: Multi‑Accelerator Serving and Quantized LLMs* -* [Pierre Rouanet (Pollen Robotics)](https://sched.co/21nyX) - *Real-World Robotics as the Next Frontier for AI?* -* [Pablo Montalvo (Hugging Face)](https://sched.co/21nzG) - *PyTorch x Transformers: Pythonicity, Autodiff, and Modularity Defining Modern AI* -* [Pedro Ortis (Common Crawl)](https://sched.co/21nym) - *Harnessing Common Crawl for AI and ML Applications* -* [Meriem Bendris (NVIDIA)](https://sched.co/21nys) - *Teaching Mistral to Reason: Post-Training with PyTorch and NVIDIA* -* [Olatunji Ruwase (Snowflake)](https://sched.co/21nyy) - *DeepSpeed – Efficient Training Scalability for Deep Learning Models* - -[View the full schedule](https://pytorchdayfrance2025.sched.com/). - -Whether you’re a contributor, practitioner, or simply curious about what’s ahead, PyTorch Day France is an opportunity to connect with the community and shape what’s next for our ecosystem. diff --git a/_posts/2025-05-02-pt-korea-user-group-recap.md b/_posts/2025-05-02-pt-korea-user-group-recap.md deleted file mode 100644 index b5a2126271b8..000000000000 --- a/_posts/2025-05-02-pt-korea-user-group-recap.md +++ /dev/null @@ -1,87 +0,0 @@ ---- -layout: blog_detail -title: 'Recap of the PyTorch Korea User Group Meetup: A Technical Conference with a PyTorch Core Maintainer' -author: 'Jiho Kim, PyTorch Korea User Group' ---- - -At the end of March, the PyTorch Korea User Group hosted a special meetup that brought together prominent speakers for deep discussions on the PyTorch core and its broader ecosystem. With the event more than doubling in size compared to past gatherings, we were able to connect with even more developers and share insights. Huge thanks to [goorm](https://goorm.co/) for sponsoring the fantastic venue! 😄 - - -![people at a conference](/assets/images/pt-korea-user-group-recap/fg1.jpg){:style="width:100%"} - - - -This recap is for those who couldn’t attend in person, as well as for participants who want to revisit the energy and insights of the day. The event featured experts in core PyTorch, AI accelerators, inference optimization, and large language model development. Below is a quick overview of the key sessions that anchored the conference. - - - -## 1️⃣ Jerry Lee | PyTorch Foundation - -Representing the PyTorch Foundation, part of the Linux Foundation, Jaeung provided an overview of how PyTorch is driving core open source technologies forward. He shared PyTorch's growth story, the many global projects currently in motion, and the ecosystem’s impressive 20%+ annual growth. The session also covered how the foundation operates, how member organizations are involved, and upcoming plans that are particularly useful for practitioners. - - -![people at a conference](/assets/images/pt-korea-user-group-recap/fg2.jpg){:style="width:100%"} - - -## 2️⃣ Alban Desmaison | PyTorch Roadmap - -Alban shared the design philosophy behind PyTorch and Meta’s official contribution roadmap ([link](https://dev-discuss.pytorch.org/t/meta-pytorch-team-2025-h1-roadmaps/2794)). 
He provided a deep technical dive into the differences between Eager and Compiled modes, especially breaking down the backend architecture of device Eager execution. Practical tools and improvements were also introduced—such as memory profilers, enhanced custom operator support, and pinned memory optimizations. - - -![people at a conference](/assets/images/pt-korea-user-group-recap/fg3.jpg){:style="width:100%"} - - - - -## 3️⃣ Hongseok Kim | PyTorch on Rebellions AI Accelerators: Status - -Rebellions is building runtime integration for their proprietary NPU architecture, fully aligned with the structural changes in PyTorch 2.0. This talk introduced the performance and scalability of their upcoming chip, their integration strategy with the PyTorch runtime, and challenges in supporting Eager Mode. Hongseok also previewed their roadmap toward releasing these features within the year. - -![people at a conference](/assets/images/pt-korea-user-group-recap/fg4.jpg){:style="width:100%"} - - - -## 4️⃣ Kyujin Cho | Backend.AI: A Unified Platform for All AI Accelerators - -Backend.AI abstracts and integrates various AI accelerators into a unified workflow. As the diversity of accelerator architectures grows, the need for portability and infrastructure unification becomes even more important. This session showcased features across development and operations—from NPU scheduling and resource allocation to monitoring. Backend.AI currently supports accelerators from NVIDIA, Intel, Tenstorrent, Rebellions, and more. - -![people at a conference](/assets/images/pt-korea-user-group-recap/fg5.jpg){:style="width:100%"} - - - -## 5️⃣ Taeho Kim | Optimizing & Deploying Models Across Multiple Chipsets Using NetsPresso - -This talk focused on the challenges of inference in real-world industrial applications of AI models. As new state-of-the-art models emerge rapidly, there’s a growing need for environments that can quickly validate device compatibility—ideally with one-click ease. NetsPresso is actively working on a static graph representation compatible with PyTorch, offering efficient support for model development, optimization, and testing. - - -![people at a conference](/assets/images/pt-korea-user-group-recap/fg6.jpg){:style="width:100%"} - - -## 6️⃣ Jungyeop Lee | The Journey to Reproduce Deepseek-R1 - -Jungyeop took us through his journey of reproducing Deepseek, a large language model—an effort that involved 201 experiments. He shared real-world lessons from training with Korean data, tokenizer modifications, and fine-tuning strategies. His practical insights and next steps were especially valuable for those building or re-implementing large models from scratch. - - -![people at a conference](/assets/images/pt-korea-user-group-recap/fg7.jpg){:style="width:100%"} - - -## 7️⃣ Sol Kim | A journey from TCP architecture to production-level LLMs - -Sol presented an integrated optimization approach to deploying large models using the TCP(Tensor Contraction Processor) architecture, which supports tensor contraction at the hardware level. The talk highlighted optimization techniques built on hardware abstraction layers (HALs) and bottom-up integration strategies with PyTorch—offering a hybrid hardware-software perspective. - - -![people at a conference](/assets/images/pt-korea-user-group-recap/fg8.jpg){:style="width:100%"} - -## 💡 Panel Talk & Q&A 💡 - -The event wrapped up with an engaging panel discussion. Attendees asked sharp questions, and the speakers offered insightful answers. 
It was a powerful moment that captured the community’s enthusiasm for PyTorch and their hunger for deeper technical understanding. - - -![people at a conference](/assets/images/pt-korea-user-group-recap/fg9.jpg){:style="width:100%"} - - -## Final Thoughts - -Since our first offline meetup in October 2022, the PyTorch Korea User Group has held five major technical conferences. Each event deepens our appreciation for the scale and depth of the PyTorch ecosystem. With perspectives from users, contributors, and ecosystem builders, the stories we share are only growing—and we’re committed to continuing this journey together. - -See you at the next conference—with even more exciting talks to come! 🙌 \ No newline at end of file diff --git a/_resources/cn-docs.md b/_resources/cn-docs.md deleted file mode 100644 index 4575636110a6..000000000000 --- a/_resources/cn-docs.md +++ /dev/null @@ -1,10 +0,0 @@ ---- -title: 中文文档 -summary: Docs and tutorials in Chinese, translated by the community. -class: pytorch-resource -link: https://pytorch.apachecn.org/ -order: 3 -featured-home: true -summary-home: Docs and tutorials in Chinese, translated by the community. - ---- diff --git a/_resources/contribution-guide.md b/_resources/contribution-guide.md deleted file mode 100644 index 6f729b384ef6..000000000000 --- a/_resources/contribution-guide.md +++ /dev/null @@ -1,9 +0,0 @@ ---- -title: Contribution Guide -summary-home: 'Learn how you can contribute to PyTorch code and documentation.' -summary: 'Learn how you can contribute to PyTorch code and documentation.' -class: pytorch-resource -link: https://pytorch.org/docs/master/community/contribution_guide.html -order: 8 -featured-home: true ---- diff --git a/_resources/contributor.md b/_resources/contributor.md deleted file mode 100644 index 842e4209b6d1..000000000000 --- a/_resources/contributor.md +++ /dev/null @@ -1,9 +0,0 @@ ---- -title: Newsletter -summary-home: 'Stay up-to-date with the latest updates.' -summary: 'Stay up-to-date with the latest updates.' -link: /newsletter -class: pytorch-resource -order: 13 -featured-home: true ---- diff --git a/_resources/design-philosophy.md b/_resources/design-philosophy.md deleted file mode 100644 index 724b7aba10b4..000000000000 --- a/_resources/design-philosophy.md +++ /dev/null @@ -1,9 +0,0 @@ ---- -title: Design Philosophy -summary-home: 'PyTorch design principles for contributors and maintainers.' -summary: 'PyTorch design principles for contributors and maintainers.' -class: pytorch-resource -link: https://pytorch.org/docs/master/community/design.html -order: 9 -featured-home: true ---- diff --git a/_resources/dive-into-deep-learning.md b/_resources/dive-into-deep-learning.md deleted file mode 100644 index 4cd4dd383cfe..000000000000 --- a/_resources/dive-into-deep-learning.md +++ /dev/null @@ -1,9 +0,0 @@ ---- -title: Dive into Deep Learning -summary-home: An interactive deep learning book. -summary: An interactive deep learning book. -link: https://d2l.ai/ -order: 11 -featured-home: false -show-pytorch-logo: true ---- diff --git a/_resources/docs.md b/_resources/docs.md deleted file mode 100644 index 0f47871ab552..000000000000 --- a/_resources/docs.md +++ /dev/null @@ -1,8 +0,0 @@ ---- -title: Docs -summary: Access comprehensive developer documentation. 
-class: pytorch-resource -link: https://pytorch.org/docs/ -order: 1 - ---- diff --git a/_resources/example-projects.md b/_resources/example-projects.md deleted file mode 100644 index ecc8ac378afb..000000000000 --- a/_resources/example-projects.md +++ /dev/null @@ -1,7 +0,0 @@ ---- -title: Examples -summary: View example projects for vision, text, RL, and more. -class: pytorch-resource -link: https://github.com/pytorch/examples -order: 6 ---- diff --git a/_resources/fast-ai.md b/_resources/fast-ai.md deleted file mode 100644 index 1f9b6eaa8951..000000000000 --- a/_resources/fast-ai.md +++ /dev/null @@ -1,7 +0,0 @@ ---- -title: fast.ai -summary: Get up and running on PyTorch quickly with free learning courses. -class: pytorch-resource -link: https://www.fast.ai/ -order: 9 ---- diff --git a/_resources/github.md b/_resources/github.md deleted file mode 100644 index 3c6b703e939d..000000000000 --- a/_resources/github.md +++ /dev/null @@ -1,9 +0,0 @@ ---- -title: GitHub -summary: Report bugs, request features, discuss issues, and more. -summary-home: Report bugs, request features, discuss issues, and more. -class: github -link: https://github.com/pytorch/pytorch -order: 3 -featured-home: false ---- diff --git a/_resources/governance.md b/_resources/governance.md deleted file mode 100644 index 1c3da287de49..000000000000 --- a/_resources/governance.md +++ /dev/null @@ -1,9 +0,0 @@ ---- -title: Governance -summary-home: 'Learn about the PyTorch governance hierarchy.' -summary: 'Learn about the PyTorch governance hierarchy.' -class: pytorch-resource -link: https://pytorch.org/docs/master/community/governance.html -order: 10 -featured-home: true ---- diff --git a/_resources/jp-tutorials.md b/_resources/jp-tutorials.md deleted file mode 100644 index 25730ad83c59..000000000000 --- a/_resources/jp-tutorials.md +++ /dev/null @@ -1,10 +0,0 @@ ---- -title: 日本語 (PyTorch) -summary: Tutorials in Japanese, translated by the community. -class: pytorch-resource -link: https://yutaroogawa.github.io/pytorch_tutorials_jp/ -order: 5 -featured-home: true -summary-home: Tutorials in Japanese, translated by the community. - ---- diff --git a/_resources/korean_tutorials.md b/_resources/korean_tutorials.md deleted file mode 100644 index b0ccc9c5b09b..000000000000 --- a/_resources/korean_tutorials.md +++ /dev/null @@ -1,10 +0,0 @@ ---- -title: 파이토치 (PyTorch) 튜토리얼 -summary: Tutorials in Korean, translated by the community. -class: pytorch-resource -link: https://tutorials.pytorch.kr/ -order: 4 -featured-home: true -summary-home: Tutorials in Korean, translated by the community. - ---- diff --git a/_resources/maintainers.md b/_resources/maintainers.md deleted file mode 100644 index b5c74998be0f..000000000000 --- a/_resources/maintainers.md +++ /dev/null @@ -1,9 +0,0 @@ ---- -title: Maintainers -summary-home: 'Learn about the PyTorch core and module maintainers.' -summary: 'Learn about the PyTorch core and module maintainers.' -class: pytorch-resource -link: https://pytorch.org/docs/master/community/persons_of_interest.html -order: 7 -featured-home: true ---- diff --git a/_resources/mobile-demo.md b/_resources/mobile-demo.md deleted file mode 100644 index 42f17b0a90a2..000000000000 --- a/_resources/mobile-demo.md +++ /dev/null @@ -1,9 +0,0 @@ ---- -title: Mobile Demo -summary-home: Check out the PyTorch Mobile demo app for iOS and Android. -summary: Check out the PyTorch Mobile demo app for iOS and Android. 
-class: pytorch-resource -link: https://github.com/pytorch/android-demo-app -order: 10 -featured-home: false ---- diff --git a/_resources/pytorch-discuss.md b/_resources/pytorch-discuss.md deleted file mode 100644 index 1c88d271f169..000000000000 --- a/_resources/pytorch-discuss.md +++ /dev/null @@ -1,9 +0,0 @@ ---- -title: PyTorch Discuss -summary-home: Browse and join discussions on deep learning with PyTorch. -summary: Browse and join discussions on deep learning with PyTorch. -class: pytorch-resource -link: https://discuss.pytorch.org -order: 1 -featured-home: true ---- diff --git a/_resources/slack.md b/_resources/slack.md deleted file mode 100644 index cb5cac584618..000000000000 --- a/_resources/slack.md +++ /dev/null @@ -1,9 +0,0 @@ ---- -title: Slack -summary-home: 'Discuss advanced topics.' -summary: 'Discuss advanced topics.' -class: slack -link: https://join.slack.com/t/pytorch/shared_invite/zt-2j2la612p-miUinTTaxXczKOJw48poHA -order: 2 -featured-home: true ---- diff --git a/_resources/training-cert.md b/_resources/training-cert.md deleted file mode 100644 index 0456d1ad5810..000000000000 --- a/_resources/training-cert.md +++ /dev/null @@ -1,9 +0,0 @@ ---- -title: PyTorch Training & Certification -summary-home: Further your education and career goals. -summary: Further your education and career goals. -class: pytorch-resource -link: https://training.linuxfoundation.org/full-catalog/?_sf_s=PyTorch -order: 14 -featured-home: true ---- diff --git a/_resources/tutorials.md b/_resources/tutorials.md deleted file mode 100644 index 619ec3755b22..000000000000 --- a/_resources/tutorials.md +++ /dev/null @@ -1,8 +0,0 @@ ---- -title: Tutorials -summary: Get in-depth tutorials for beginners and advanced developers. -class: pytorch-resource -link: https://pytorch.org/tutorials -order: 2 - ---- diff --git a/_sass/_variables.scss b/_sass/_variables.scss deleted file mode 100644 index 25c95f460053..000000000000 --- a/_sass/_variables.scss +++ /dev/null @@ -1,123 +0,0 @@ -$custom-font-size: 16px; -$black: #000000; -$white: #ffffff; -$dark_grey: #6c6c6d; -$light_grey: #f3f4f7; -$orange: #ee4c2c; -$medium_grey: #f3f4f7; -$not_quite_black: #262626; -$slate: #262626; -$very_light_grey: #f3f4f7; -$very_dark_grey: #CCCDD1; -$content_text_color: #6c6c6d; -$code_background_color: #f3f4f7; -$dark_blue: #3d5a97; -$quick_start_grey: #6c6c6d; -$command_block_black: #6c6c6d; -$smoky_grey: #CCCDD1; -$medium_smoky_grey: #CCCDD1; -$code_link_color: #4974D1; -$purple: #812CE5; -$light_white: #e2e2e2; -$mid_gray: #797676; - -$desktop_header_height: 90px; -$mobile_header_height: 68px; -$desktop_footer_height: 620px; -$site_horizontal_padding: 30px; - -@import "../node_modules/bootstrap/scss/variables"; - -@mixin desktop { - @media screen and (min-width: 768px) { @content; } -} - -@mixin full-nav-menu-desktop { - @media screen and (min-width: 1200px) { @content; } -} - -@mixin max-width-desktop { - @media screen and (min-width: 1240px) { @content; } -} - -@mixin small-desktop { - @media (min-width: 768px) and (max-width: 1239px) { @content; } -} - -@function rem($px) { - @return ($px / 16px) * 1rem; -} - -@mixin code_font_family { - font-family: IBMPlexMono,SFMono-Regular,Menlo,Monaco,Consolas,"Liberation Mono","Courier New",monospace; -} - -@mixin clearfix { - &:before, - &:after { - content: ""; - display: table; - } - &:after { - clear: both; - } - & { - *zoom: 1; - } -} - -@mixin default_link_styles { - a:link, - a:visited, - a:hover { - color: $orange; - text-decoration: none; - } - - @include desktop { - 
a:hover { - text-decoration: underline; - } - - a.social-icon:hover { - text-decoration: none; - } - } -} - -@mixin animated_border_hover_state { - @include desktop { - &:after { - content: ""; - display: block; - width: 0; - height: 1px; - position: absolute; - bottom: 0; - left: 0; - background-color: $orange; - transition: width .250s ease-in-out; - } - - &:hover:after { - width: 100%; - } - &:hover { - color: $not_quite_black; - } - } -} - -@mixin external_link_icon { - &:after { - content: url($baseurl + "/assets/images/external-link-icon.svg"); - margin-left: 15px; - } -} - -@mixin blog_date_and_feature { - font-size: rem(18px); - letter-spacing: 0; - line-height: rem(24px); - margin-bottom: rem(10px); -} diff --git a/_sass/announcement.scss b/_sass/announcement.scss deleted file mode 100644 index 650d57ba0f08..000000000000 --- a/_sass/announcement.scss +++ /dev/null @@ -1,406 +0,0 @@ -.announcement { - .hero-content { - top: $mobile_header_height + 80px; - height: 250px; - position: relative; - margin-bottom: 120px; - justify-content: center; - - @include desktop { - top: $mobile_header_height + 110px; - height: 350px; - } - - h1 { - font-size: rem(60px); - text-transform: uppercase; - font-weight: lighter; - letter-spacing: 1.08px; - margin-bottom: rem(10px); - line-height: 1.05; - color: $white; - - @include desktop { - font-size: rem(72px); - } - - } - - h1.small { - font-size: 40px; - @include desktop { - font-size: 58px; - } - } - - .lead { - margin-bottom: rem(25px); - padding-top: rem(30px); - color: $white; - width: 100%; - } - } - - - .row { - justify-content: center; - } - - .main-content { - margin-bottom: 5rem; - padding-bottom: 0; - } - - .main-background { - height: 370px; - @include desktop { - height: 450px; - } - - } - - .card-container { - display: grid; - grid-template-columns: repeat(2, 1fr); - gap: 20px; - padding-top: 3rem; - .card { - border: none; - display: block; - a { - color: $black; - } - .card-body { - display: flex; - flex-direction: column; - height: 100%; - justify-content: space-between; - padding: 0; - - img { - width: 100%; - height: 207px; - object-fit: contain; - padding: 20px; - @media screen and (min-width: 1000px) { - padding: 30px; - } - } - } - } - @media screen and (min-width: 1000px) { - grid-template-columns: repeat(3, 1fr); - gap: 36px; - } - } - - .contact-us-section { - background-color: $code_background_color; - padding: 50px 0; - .row { - justify-content: center; - .lead { - padding-top: rem(24px); - } - .hbspt-form { - padding: 30px 0; - - .hs-button { - background-image: url($baseurl + "/assets/images/chevron-right-orange.svg"); - background-size: 6px 13px; - background-position: top 16px right 11px; - background-repeat: no-repeat; - border-radius: 0; - border: none; - background-color: $white; - color: $quick_start_grey; - font-weight: 400; - position: relative; - letter-spacing: 0.25px; - padding: rem(12px) rem(32px) rem(12px) rem(12px); - margin: 10px 0; - - @include animated_border_hover_state; - - @include desktop { - background-position: top 19px right 11px; - } - - } - - fieldset.form-columns-2, fieldset.form-columns-1 { - max-width: 100%; - .hs-form-field { - max-width: 100%; - padding: 10px 0; - width: 100%; - input { - border: none; - width: 100%; - } - textarea { - border: none; - width: 100%; - } - } - } - - li.hs-form-radio { - input[type=radio] { - width: auto !important; - } - - span { - margin-left: 5px; - } - } - - ul { - list-style-type: none; - } - } - } - } - - .light-background-section { - background-color: 
$white; - .content { - padding: 40px 0; - } - - ul li { - font-size: 1.25rem; - font-weight: 300; - } - } - - .darker-background-section { - background-color: #f3f4f7; - .content { - padding: 40px 0; - } - } - - .grey-background-section { - background-color: #f3f4f7; - padding: 60px 0; - img { - height: 100px; - } - p { - font-size: 14px; - line-height: 170%; - } - } - - .color-background-section { - background-image: url("/assets/images/pytorch_bg_purple.jpg"); - background-size: 100% 100%; - background-repeat: no-repeat; - padding: 60px 0; - h2 { - color: white; - } - } - - .body-side-text { - .lead { - margin-bottom: rem(25px); - padding-top: rem(24px); - } - } - - img { - width: 100%; - } - - h2.upper { - font-size: 25px; - line-height: 130%; - text-align: center; - letter-spacing: 1.75px; - text-transform: uppercase; - margin-bottom: 30px; - } - - h3.upper { - font-size: 19px; - text-transform: uppercase; - letter-spacing: 1.75px; - line-height: 130%; - margin: 25px 0; - } - - table.benefits { - background-color: white; - font-size: 14px; - text-align: center; - td.benefit { - border-left: none; - min-width: 300px; - text-align: left; - @include desktop { - min-width: 520px; - } - } - tbody { - td { - border-left: 1px solid #812CE5; - vertical-align: middle; - } - td.benefit { - font-weight: 600; - } - } - thead, tfoot { - background-color: #812CE5; - color: white; - font-size: 16px; - font-weight: 700; - @include desktop { - font-size: 20px; - } - td { - border-left: 1px solid #000; - vertical-align: middle; - border-top: none; - } - a { - text-decoration: underline; - color: white; - } - td.price { - font-size: 14px; - line-height: 1.2; - @include desktop { - font-size: 16px; - } - } - } - img { - width: 15px; - } - } - .modal-header { - border-bottom: none; - padding-bottom: 0; - } - - .consolidated-employees { - tbody td { - font-weight: 600; - } - td.no-border { - border-left: none; - } -} - - .member-boxes { - gap: 20px; - margin: 0; - div.col-sm { - background-color: white; - } - } -} - -.board-member { - margin: 35px 0; - img { - margin-bottom: 15px; - } - a svg { - margin-top: 5px; - height: 25px; - max-width: 30px; - fill: #000; - color: #000; - } - a:hover svg { - fill: $orange; - color: $orange; - } -} - - -.announcement .cloud-credits-table { - font-size: 1.1rem; - margin-top: 40px; - ul { - padding-left: 20px; - li { - margin-top: 10px; - font-size: 1.1rem; - } - } - - .col-md { - border-radius: 5px; - margin-bottom: 40px; - } - - .card { - border-radius: 6px; - } - - .thead { - border-top-left-radius: 5px; - border-top-right-radius: 5px; - color: #fff; - padding: 14px 20px; - text-align: center; - } - .col-md:first-child .thead { - background: conic-gradient(from 53deg at 37% 100%, #828282 0, hsla(0, 0%, 51%, .95) 100%); - } - .col-md:nth-child(2) .thead { - background: conic-gradient(from 53deg at 37% 100%, #ab9344 0, rgba(171, 147, 68, .95) 100%); - } - .col-md:nth-child(3) .thead { - background: conic-gradient(from 53deg at 37% 100%, #293850 0, rgba(41, 56, 80, .95) 100%); - } - - .tbody { - border-bottom: 1px solid #d0d0d0; - border-left: 1px solid #d0d0d0; - border-right: 1px solid #d0d0d0; - height: 100%; - padding: 26px 20px; - } - - .tfoot { - background-color: #000; - border-bottom-left-radius: 5px; - border-bottom-right-radius: 5px; - color: #fff; - padding: 20px; - text-align: center; - } -} - -.announcement .steps-columns { - background-color: transparent; - - .col-md { - margin-bottom: 20px; - padding: 20px; - } - - h3 { - margin-bottom: 20px; - } - - .step 
{ - font-size: 1.5rem; - margin-bottom: 5px; - margin-top: 20px; - } - - ul { - padding-left: 20px; - li { - margin-top: 10px; - } - } - -} \ No newline at end of file diff --git a/_sass/article.scss b/_sass/article.scss deleted file mode 100644 index 8b7aa931d584..000000000000 --- a/_sass/article.scss +++ /dev/null @@ -1,159 +0,0 @@ -article.pytorch-article { - max-width: 920px; - margin: 0 auto; - padding-bottom: 90px; - - h2, - h3, - h4, - h5, - h6 { - margin-top: rem(30px); - margin-bottom: rem(24px); - color: $not_quite_black; - } - - h2 { - font-size: rem(24px); - letter-spacing: 1.33px; - line-height: rem(32px); - margin-top: rem(50px); - text-transform: uppercase; - } - - h3 { - font-size: rem(24px); - letter-spacing: -0.25px; - line-height: rem(30px); - text-transform: none; - } - - h4, - h5, - h6 { - font-size: rem(18px); - letter-spacing: -0.19px; - line-height: rem(30px); - } - - p { - margin-bottom: rem(18px); - } - - p, - ul li, - ol li, - dl dt, - dl dd, - blockquote { - font-size: rem(18px); - line-height: rem(30px); - color: $content_text_color; - } - - table { - margin-bottom: rem(40px); - width: 100%; - } - - table thead { - border-bottom: 1px solid #cacaca; - } - - table th, - table tr, - table td { - color: $content_text_color; - font-size: rem(16px); - letter-spacing: -0.17px; - } - - table th { - padding: rem(10px); - color: $not_quite_black; - } - - - table td { - padding: rem(5px); - } - - ul, - ol{ - margin: rem(24px) 0 rem(50px) 0; - - @include desktop { - padding-left: rem(100px); - } - - li { - margin-bottom: rem(10px); - } - } - - dl { - margin-bottom: rem(40px); - } - - dl dt { - margin-bottom: rem(12px); - font-weight: 400; - } - - pre { - margin-bottom: rem(40px); - } - - hr { - margin-top: rem(75px); - margin-bottom: rem(75px); - } - - blockquote { - font-size: rem(12px); - font-style: italic; - padding: 15px 15px 5px 15px; - width: 100%; - background-color: rgba(211, 211, 211, 0.3); - border-left: 2px solid #000000; - } - - h3.no_toc { - margin: 0px; - } - - nav { - float: right; - display: block; - overflow-y: auto; - background-color: white; - margin-left: 20px; - border-left: 1px #717171; - } - - nav li { - font-size: 12px; - line-height: 20px; - padding-top: 0px; - list-style: none; - } - - nav a { - color: #717171; - font-weight: bold; - } - - ul#markdown-toc { - padding-left: 1em; - margin: 0px; - } - - ul#markdown-toc ul { - margin: 0px; - padding-left: 1em; - } - - ul#markdown-toc li { - margin: 0px; - } -} diff --git a/_sass/base_styles.scss b/_sass/base_styles.scss deleted file mode 100644 index 419e7be1d655..000000000000 --- a/_sass/base_styles.scss +++ /dev/null @@ -1,707 +0,0 @@ -* { - font-family: FreightSans, Helvetica Neue, Helvetica, Arial, sans-serif; - font-weight: 400; /* normal - https://developer.mozilla.org/en-US/docs/Web/CSS/font-weight#Common_weight_name_mapping */ -} - -h1, h2, h3, h4, h5, h6 { - font-family: FreightSans; -} - -p { - margin-bottom: 1.25rem; -} - -a, em, i, b, strong, u, span { - font-size: inherit; -} - -a:link, -a:visited, -a:hover { - text-decoration: none; - color: $orange; -} - -p { - @include default_link_styles; -} - -.btn, -a.btn { - border-radius: 0; - border: none; - background-color: $light_grey; - color: $quick_start_grey; - font-weight: 400; - position: relative; - letter-spacing: 0.25px; - - &.btn-lg { - font-size: 1.125rem; - padding-top: rem(8px); - } - - &.btn-white { - background-color: $white; - } - - &.btn-orange { - background-color: $orange; - } - - &.btn-demo { - color: $white; - } - - 
@include animated_border_hover_state; -} - -.navbar { - padding-left: 0; - padding-right: 0; -} - -html { - position: relative; - min-height: 100%; - font-size: 12px; - - @include desktop { - font-size: 16px; - } -} - -body { - @include desktop { - margin: 0 0 $desktop_footer_height; - } - - &.no-scroll { - height: 100%; - overflow: hidden; - } -} - -a, .btn { - &.with-right-arrow { - padding-right: rem(32px); - position: relative; - background-image: url($baseurl + "/assets/images/chevron-right-orange.svg"); - background-size: 6px 13px; - background-position: top 10px right 11px; - background-repeat: no-repeat; - @include desktop { - background-size: 8px 14px; - background-position: top 15px right 12px; - padding-right: rem(32px); - } - } - &.with-left-arrow { - padding-left: rem(32px); - position: relative; - background-image: url($baseurl + "/assets/images/chevron-left-grey.svg"); - background-size: 6px 13px; - background-position: top 10px left 11px; - background-repeat: no-repeat; - @include desktop { - background-size: 8px 14px; - background-position: top 16px left 12px; - padding-left: rem(32px); - } - } -} - -.main-background { - position: absolute; - top: 0; - left: 0; - width: 100%; - height: 350px; - background-size: 100% 100%; - background-repeat: no-repeat; - background-image: url($baseurl + "/assets/images/pytorch_bg_purple.jpg"); - - @include desktop { - height: 640px; - } - - &.home-page-background { - z-index: -1; - height: 350px; - - @include desktop { - height: 570px; - } - } - &.hub-background { - height: 380px; - @include desktop { - height: 495px; - } - } - &.ecosystem-background { - @include desktop { - height: 472px; - } - } - &.events-background { - @include desktop { - height: 472px; - } - } - &.ecosystem-join-background { - @include desktop { - height: 435px; - } - } - &.ecosystem-detail-background { - } - &.resources-background { - height: 380px; - @include desktop { - height: 472px; - } - } - &.get-started-background { - height: 275px; - @include desktop { - height: 380px; - } - } - &.comm-stories-background { - height: 275px; - @include desktop { - height: 380px; - } - } - &.style-guide { - } - &.announcement-background { - } - &.features-background { - height: 335px; - @include desktop { - height: 300px; - } - } - &.blog-background { - } - &.mobile-background { - } - &.deep-learning-background { - } -} - -.bg-light-grey { - background-color: $light_grey; -} - -.text-dark-grey { - color: $dark_grey; -} - -.sidebar-links .top-section { - color: $black; -} - -.sidebar-links ul { - list-style-type: none; - padding-left: 0; - li { - color: $dark_grey; - margin-left: 20px; - a { - color: inherit; - } - } -} - -.sidebar-links .with-sub-sections { - &.top-section:before { - content: "+ "; - font-family: "Courier New", Courier, monospace; - width: 50px; - } - - &.top-section.open:before { - content: "- "; - font-family: "Courier New", Courier, monospace; - width: 50px; - } -} - -.bg-very-light-grey { - background-color: $very_light_grey; -} - -.email-subscribe-form { - input.email { - color: $orange; - border: none; - border-bottom: 1px solid #939393; - width: 100%; - background-color: transparent; - outline: none; - font-size: 1.125rem; - letter-spacing: 0.25px; - line-height: 2.25rem; - } - - ::-webkit-input-placeholder { /* Chrome/Opera/Safari */ - color: $orange; - } - ::-moz-placeholder { /* Firefox 19+ */ - color: $orange; - } - :-ms-input-placeholder { /* IE 10+ */ - color: $orange; - } - :-moz-placeholder { /* Firefox 18- */ - color: $orange; - } - - 
input[type="submit"] { - position: absolute; - right: 0; - top: 10px; - height: 15px; - width: 15px; - background-image: url($baseurl + "/assets/images/arrow-right-with-tail.svg"); - background-color: transparent; - background-repeat: no-repeat; - background-size: 15px 15px; - background-position: center center; - -webkit-appearance: none; - -moz-appearance: none; - appearance: none; - border: 0; - } -} - -.email-subscribe-form-fields-wrapper { - position: relative; -} - -.bg-slate { - background-color: $slate; -} - -.tweets-wrapper { - width: 100%; - - p { - font-size: rem(16px); - line-height: rem(24px); - letter-spacing: 0.22px; - } - - ol { - padding-left: 0; - } - - a { - color: $orange; - } - - img, - .timeline-Tweet-actions, - .timeline-Tweet-media, - .MediaCard { - display: none !important; - } -} - -.tweet { - margin-bottom: 2.2rem; - word-wrap: break-word; - - a { - color: $orange; - display: inline; - span { - color: inherit; - } - } - - p, span { - font-size: 1rem; - line-height: 1.5rem; - letter-spacing: 0.22px; - color: #A0A0A1; - } - - p { - @include max-width-desktop { - padding-right: 40px; - } - } - - span.retweeted, - span.in-reply-to { - font-size: rem(13px); - } - - p.tweet-header { - margin-bottom: rem(5px); - line-height: rem(12px); - } - - .tweet-bird { - &:before { - content: ""; - position: relative; - left: 0; - background-image: url($baseurl + "/assets/images/logo-twitter-grey.svg"); - background-size: 20px 16px; - display: inline-block; - width: 20px; - height: 16px; - - @include desktop { - margin-bottom: rem(10px); - } - } - } -} - -.anchorjs-link { - color: $quick_start_grey !important; - @include desktop { - &:hover { - color: inherit; - text-decoration: none !important; - } - } -} - -.article-page-module { - background-color: $light_grey; - padding-top: rem(30px); - padding-bottom: rem(30px); - - @include desktop { - padding-top: rem(60px); - padding-bottom: rem(60px); - } - - @include max-width-desktop { - .col-md-3 { - padding-left: 20px; - padding-right: 20px; - } - } - - .module-link-col { - .btn { - padding-left: 0; - } - @include desktop { - text-align: right; - .btn { - padding-left: inherit; - } - } - } - - .module-content-wrapper { - margin-top: rem(20px); - margin-bottom: rem(20px); - @include desktop { - margin-top: 0; - margin-bottom: 0; - } - } - - img { - margin-bottom: rem(30px); - width: 100%; - } - - h3 { - font-size: rem(24px); - letter-spacing: 1.33px; - line-height: rem(32px); - text-transform: uppercase; - margin-bottom: rem(20px); - @include desktop { - margin-bottom: rem(60px); - } - } - - h5, p { - font-size: rem(16px); - line-height: rem(24px); - } - - h5 { - color: $not_quite_black; - } - - p { - color: $very_dark_grey; - letter-spacing: 0.25px; - } -} - -.article-page-module .module-header { - position: relative; -} - -.article-page-module .module-button { - padding-left: 0; - @include desktop { - position: absolute; - right: 15px; - top: 0; - padding-top: 0; - padding-bottom: rem(2px); - background-position: center right; - padding-right: 16px; - } -} - -article.pytorch-article .note-card { - border-radius: 0; - border: none; - background-color: $orange; - color: white; - padding: 30px; - margin-bottom: 50px; - - h4 { - font-size: 1.5rem; - letter-spacing: 1.33px; - line-height: 2rem; - text-transform: uppercase; - color: white; - margin-top: 0; - margin-bottom: rem(18px); - } - - p { - font-size: rem(18px); - line-height: 1.5em; - margin-bottom: 0; - color: white; - a { - color: white; - font-weight: 700; - } - } - -} - 
-.ecosystem-card, -.resource-card, -.hub-card { - border-radius: 0; - border: none; - height: 110px; - margin-bottom: rem(20px); - margin-bottom: rem(30px); - overflow: scroll; - - @include max-width-desktop { - height: 150px; - overflow: inherit; - } - - @include small-desktop { - height: 170px; - overflow: inherit; - } - - p.card-summary { - font-size: rem(18px); - line-height: rem(24px); - margin-bottom: 0; - color: $dark_grey; - } - - h4 { - color: $slate; - margin-bottom: rem(18px); - overflow: hidden; - white-space: nowrap; - text-overflow: ellipsis; - } - - a { - height: 100%; - - @include desktop { - min-height: 190px; - } - - @include small-desktop { - min-height: 234px; - } - } - - @include animated_border_hover_state; - &:hover { - p.card-summary { - color: $not_quite_black; - } - } -} - -.ecosystem-card .card-body { - background-position: top rem(20px) right rem(20px); - background-repeat: no-repeat; - padding: rem(25px) rem(30px); - - &.reasoning { - background-image: url($baseurl + "/assets/images/logo-elf.svg"); - background-size: 29px 25px; - } - - &.tool { - background-image: url($baseurl + "/assets/images/logo-wav2letter.svg"); - background-size: 29px 25px; - } - - &.language { - background-image: url($baseurl + "/assets/images/logo-parlai.svg"); - background-size: 29px 25px; - } - - &.vision { - background-image: url($baseurl + "/assets/images/logo-detectron.svg"); - background-size: 29px 25px; - } - -} - -.resource-card { - border: 1px solid #d6d7d8; - background-color: transparent; - margin-bottom: rem(20px); - - @include desktop { - margin-bottom: 0; - } - - @include small-desktop { - height: 225px; - } - - .pytorch-image { - position: relative; - height: rem(20px); - width: rem(20px); - top: rem(50px); - } - - a { - letter-spacing: 0.25px; - color: $not_quite_black; - } - - .card-body { - display: block; - padding: 0 15px 0 0; - position: relative; - top: 20px; - margin-left: 60px; - - @include small-desktop { - top: 18px; - } - - @include max-width-desktop { - top: 30px; - margin-left: 80px; - padding-right: 30px; - } - } - - &.slack, - &.github, - &.pytorch-resource { - &:before { - content: ""; - background-size: 32px 32px; - background-repeat: no-repeat; - display: block; - position: absolute; - height: 32px; - width: 32px; - top: 15px; - left: 15px; - - @include max-width-desktop { - left: 30px; - top: 30px; - } - } - } - - &.slack { - &:before { - background-image: url($baseurl + "/assets/images/logo-slack.svg"); - } - } - - &.github { - &:before { - background-image: url($baseurl + "/assets/images/logo-github.svg"); - } - } - - &.pytorch-resource { - &:before { - background-image: url($baseurl + "/assets/images/logo-icon.svg"); - } - } - - .pytorch-discuss { - .discuss { - color: $orange; - font-weight: 400; - } - } - - @include animated_border_hover_state; -} - -.article-page-module.similar-projects { - .ecosystem-card p.card-summary { - font-size: rem(16px); - height: 36px; - @include desktop { - height: 50px; - } - } -} - -#twitter-widget iframe { - display: none !important; -} - -body.general .main-content-wrapper { - margin-top: 80px; - - @include desktop { - margin-top: 100px; - } -} - -.domain-card { - background-color: $light_grey; - padding: 40px 20px; - margin: 20px 0; - h4 { - color: $black; - } - p { - color: $dark_grey; - margin-bottom: 0; - } - &:hover { - h4 { - color: $orange; - } - } -} - diff --git a/_sass/blog.scss b/_sass/blog.scss deleted file mode 100644 index b23dd57108b1..000000000000 --- a/_sass/blog.scss +++ /dev/null @@ -1,364 +0,0 
@@ -.blog { - .navbar-nav .nav-link { - color: $black; - } - - .main-content { - padding-bottom: 1.5rem; - @include desktop { - padding-top: 1.70rem; - padding-bottom: 3.5rem; - } - } - - .main-background { - height: 290px; - @include desktop { - height: 485px; - } - } - - .blog-detail-background { - height: 300px; - @include desktop { - height: 312px; - } - } - - .main-content-menu { - .navbar-nav .nav-link { - text-transform: capitalize; - - &.selected { - color: $orange !important; - text-decoration: underline; - text-decoration-color: $orange; - opacity: 0.75 !important; - } - } - - .nav-item:last-of-type { - @include desktop { - position: absolute; - right: 0; - a { - margin-right: 0; - } - } - } - } - - .zoom-in { - cursor: zoom-in; - } - - .zoomed { - cursor: zoom-out; - img { - margin: auto !important; - position: absolute; - top: 0; - left:0; - right:0; - bottom: 0; - max-width: 98%; - } - } - - .nav-logo { - background-image: url($baseurl + "/assets/images/logo-dark.svg"); - } - - .main-content-wrapper { - margin-top: 275px; - .row.blog-index { - margin-top: 30px; - p { - color: $dark_grey; - } - } - .row.blog-vertical { - display: block; - max-width: 100%; - margin: auto; - .col-md-4 { - display: initial; - } - .btn { - float: left; - } - } - .vertical-blog-container { - border-bottom: 1px solid #E2E2E2; - padding-bottom: 3rem; - &:last-of-type { - margin-bottom: 2rem; - } - } - @include desktop { - margin-top: 380px + $desktop_header_height; - .row.blog-index - [class*="col-"]:not(:first-child):not(:last-child):not(:nth-child(3n)) { - padding-right: rem(35px); - padding-left: rem(35px); - } - - .row.blog-index [class*="col-"]:nth-child(3n) { - padding-left: rem(35px); - } - - .row.blog-index [class*="col-"]:nth-child(3n + 1) { - padding-right: rem(35px); - } - - .col-md-4 { - margin-bottom: rem(23px); - } - } - - h4 { - a { - font-family: FreightSans; - font-size: rem(24px); - color: $black; - letter-spacing: 0; - line-height: rem(32px); - font-weight: 400; - } - } - - .author { - color: $orange; - font-size: rem(20px); - letter-spacing: 0.25px; - line-height: rem(30px); - margin-bottom: rem(30px); - } - - .author-icon { - position: relative; - top: rem(26px); - height: rem(17px); - width: rem(19px); - } - } - - .blog-detail-content { - padding-bottom: 2.8rem; - } - - .blog-detail-wrapper { - @include desktop { - margin-top: 234px + $desktop_header_height; - } - } - - .jumbotron { - top: rem(105px); - @include desktop { - height: rem(405px); - } - - .container { - @include desktop { - padding-bottom: rem(45px); - } - } - - .blog-index-title { - overflow: hidden; - margin-top: 1.5rem; - white-space: nowrap; - text-overflow: ellipsis; - color: white; - @include desktop { - overflow: unset; - white-space: unset; - text-overflow: unset; - } - } - - h1 { - letter-spacing: -1.65px; - font-size: rem(52px); - line-height: rem(56px); - text-transform: none; - color: $white; - a { - color: $white; - word-wrap: break-word; - } - } - - h2 { - color: $white; - } - - .blog-title { - display: inline-flex; - &:hover { - color: $white; - } - } - - .blog-detail-container { - padding-top: 4rem; - @include desktop { - padding-top: rem(174px); - } - } - - p { - font-size: rem(20px); - letter-spacing: 0; - line-height: rem(30px); - color: $white; - } - - .btn { - margin-top: rem(12px); - padding-top: rem(9px); - } - - .blog-page-container { - p.blog-date { - padding-top: rem(10px); - } - .btn { - margin-bottom: rem(10px); - } - } - } - - .blog-detail-jumbotron { - top: 45px; - @include desktop { - 
height: 107px; - top: 75px; - } - } - - p.blog-date { - @include blog_date_and_feature; - color: $dark_grey; - } - - p.featured-post { - @include blog_date_and_feature; - color: $white; - } - - p.featured-blog-preview { - margin-bottom: rem(12px); - } - - #blogPostFilter { - .nav-link { - opacity: 0.53; - font-size: rem(20px); - color: $black; - letter-spacing: 0; - line-height: rem(34px); - } - } - - .page-link { - font-size: rem(20px); - letter-spacing: 0; - line-height: rem(34px); - color: $orange; - width: rem(120px); - text-align: center; - } - - .blog-modal { - max-width: 75%; - top: 5rem; - &:hover { - cursor: zoom-out; - } - @media (max-width: 575px) { - max-width: 100%; - top: 10rem; - } - } - - .blog-image { - cursor: zoom-in; - } - - @media (max-width: 1067px) { - .jumbotron { - h1 { - margin-right: 0; - margin-top: 1.5rem; - a { - font-size: rem(45px); - line-height: rem(40px); - } - } - } - - .main-content-wrapper { - .col-md-4 { - margin-bottom: rem(75px); - } - } - - .similar-posts { - margin-bottom: rem(50px); - } - } - - @media (max-width: 1050px) { - .main-content-wrapper { - .author-icon { - left: rem(-30px); - } - } - } - - table { - tr { - th { - font-weight: 600; - } - } - } - - .pytorch-article { - .enterprise-azure-logo-container { - padding-left: 0; - img { - margin-bottom: 0; - } - } - } -} - -.blog .pytorch-article img { - margin-bottom: rem(18px); -} - -twitterwidget { - margin: 0 auto; - margin-top: rem(18px) !important; - margin-bottom: rem(18px) !important; -} - -.pytorch-article .outlined-code-block { - border: 1px solid black; - padding: 1rem; - margin-bottom: 1rem; - pre { - margin: 0; - padding: 0; - background-color: white; - } -} - -.pytorch-article .reference-list { - li { - overflow-wrap: anywhere; - } -} diff --git a/_sass/bootstrap-overrides.scss b/_sass/bootstrap-overrides.scss deleted file mode 100644 index 4593f29e05cc..000000000000 --- a/_sass/bootstrap-overrides.scss +++ /dev/null @@ -1,15 +0,0 @@ -.container { - padding-left: $site_horizontal_padding; - padding-right: $site_horizontal_padding; - max-width: 1240px; - - @mixin max-width-desktop { - padding-left: 0; - padding-right: 0; - } -} - -.container-fluid { - padding-left: 0; - padding-right: 0; -} diff --git a/_sass/code.scss b/_sass/code.scss deleted file mode 100644 index 21a7e83a274d..000000000000 --- a/_sass/code.scss +++ /dev/null @@ -1,50 +0,0 @@ -code, kbd, pre, samp, code b { - @include code_font_family; - span { - @include code_font_family; - } -} - -pre { - padding: rem(18px); - background-color: $code_background_color; - - code { - font-size: rem(14px); - } - - &.highlight { - background-color: $light_grey; - line-height: rem(21px); - } -} - -code.highlighter-rouge { - color: $content_text_color; - background-color: $light_grey; - padding: 2px 6px; -} - -a:link, -a:visited, -a:hover { - code.highlighter-rouge { - color: $code_link_color; - } - - &.has-code { - color: $code_link_color; - } -} - -p, -h1, -h2, -h3, -h4, -h5, -h6 { - code { - font-size: 78.5%; - } -} diff --git a/_sass/community-stories.scss b/_sass/community-stories.scss deleted file mode 100644 index 7d2c0d0c9067..000000000000 --- a/_sass/community-stories.scss +++ /dev/null @@ -1,187 +0,0 @@ -.comm-stories { - .community-stories-wrapper { - background-color: white; - } - .community-stories { - padding-top: 0; - .production-info-container, - .research-info-container { - display: flex; - flex-flow: column; - } - .sticky-top { - top: 15%; - } - } - .production-container, - .research-container { - display: flex; - 
padding-left: 0; - @media (max-width: 767px) { - flex-flow: wrap; - } - } - .production-section, .research-section { - max-width: 920px; - margin: 0 auto 0 auto; - padding: 0 30px 43px 30px; - width: 90%; - .production-item, .research-item { - padding-bottom: 2rem; - padding-top: 2rem; - border-bottom: 1px solid #d6d7d8; - h2 { - padding-bottom: 1rem; - } - } - } - .production-side-nav-container, - .research-side-nav-container { - #research-sidebar-list, - #production-sidebar-list{ - padding-left: 0; - .active { - color: $orange; - } - ul { - padding-left: 3rem; - list-style: none; - li { - line-height: 36px; - a { - color: #8c8c8c; - } - } - } - } - } - - .production-section, .research-section { - p { - font-size: 18px; - margin-top: 2rem; - } - @include small-desktop { - width: 100%; - padding-left: 5px; - padding-right: 5px; - } - @media (max-width: 767px) { - width: 100%; - padding-left: 5px; - padding-right: 5px; - } - } - - .main-content-wrapper { - margin-top: 275px; - @include desktop { - margin-top: 380px; - } - } - - .jumbotron { - color: $white; - height: 190px; - @include desktop { - height: 260px; - } - } -} -.ecosystem .community-stories.main-content { - padding-top: 0; -} - -.community-stories-container-fluid { - height: 5rem; - width: 100%; - padding-bottom: 7rem; - @media screen and (max-width: 767px) { - margin-top: 2rem; - } - @include full-nav-menu-desktop { - margin-left: 0; - } -} - - - -.comm-stories .community-stories.main-content .navbar { - padding-left: 0; - padding-bottom: 0; - padding-top: 0; - .nav-item { - cursor: pointer; - &:last-of-type { - position: relative; - } - } - @media (min-width: 992px) { - .nav-item { - padding: 2rem; - cursor: pointer; - } - - .nav-link { - position: relative; - top: 10%; - transform: translateY(-50%); - } - } - - .nav-select { - background-color: $white; - .nav-link { - color: $orange; - font-weight: 500; - } - } - - .nav-link { - font-size: rem(18px); - color: #8c8c8c; - @include desktop { - margin-left: rem(30px); - } - &:hover { - color: $orange; - } - } - - .community-stories-nav-link { - padding-left: rem(20px); - padding-right: rem(20px); - - @include desktop { - padding-left: rem(30px); - padding-right: rem(30px); - } - } - - .community-stories-nav { - flex-direction: row; - } - - .nav-item { - padding-top: rem(15px); - padding-bottom: rem(15px); - @include desktop { - padding-bottom: 0; - padding-top: 2rem; - } - @include small-desktop { - padding-bottom: 0; - padding-top: 2rem; - } - @media (max-width: 990px) { - padding-bottom: rem(10px); - padding-top: 1rem; - } - } - - .navbar-toggler { - margin-left: rem(40px); - } -} - - diff --git a/_sass/compact.scss b/_sass/compact.scss deleted file mode 100644 index 12578e5cbae5..000000000000 --- a/_sass/compact.scss +++ /dev/null @@ -1,82 +0,0 @@ -.compact-cards { - width: 100%; - a { - color: #6C6C6D; - &:hover { - color: $orange; - } - } -} - -.compact-hub-card-wrapper { - padding: 0; -} - -.compact-card-container { - display: flex; - align-items: center; -} - -.compact-card-body { - padding-top: 8px; - &:hover { - border-bottom: 1px solid $orange; - color: $orange; - .compact-item-title { - color: $orange - } - } - .compact-hub-card-title-container { - width: 75%; - display: flex; - } -} - -.compact-model-card { - height: auto; - border-bottom: 1px solid #E2E2E2; -} - -.compact-item-title { - padding-left: 0; - color: #000; -} - -.compact-card-summary { - white-space: nowrap; - overflow: hidden; - text-overflow: ellipsis; - top: 5px; -} - -.compact-hub-divider { - padding: 
0; - width: 100%; -} - -.hub-select-container { - position: absolute; - right: 0; - height: 2rem; -} - -.compact-hub-index-cards { - padding-bottom: 2rem; -} - -.full-hub-icon { - &:hover { - cursor: pointer; - height: 3rem; - } -} - -.compact-hub-icon { - margin-left: 0.5rem; - margin-right: rem(50px); - &:hover { - cursor: pointer; - } -} - - diff --git a/_sass/contributors.scss b/_sass/contributors.scss deleted file mode 100644 index cc3507edb3d4..000000000000 --- a/_sass/contributors.scss +++ /dev/null @@ -1,339 +0,0 @@ -.ecosystem .contributor-jumbotron { - @include desktop { - height: 262px; - } - width: 90%; - - .container { - max-width: 920px; - } - - h1 { - padding-top: 0; - span { - font-weight: 300; - color: $purple; - } - } -} - -.ecosystem .contributor-jumbotron .contributor-jumbo-text { - h1 { - color: white; - } - h2 { - color: white; - padding-top: 0; - } -} - -.hidden { - display: none; -} - -.contributor-container-fluid { - height: 4rem; - width: 100%; - @media screen and (max-width: 767px) { - margin-top: 2rem; - } - @include full-nav-menu-desktop { - margin-left: 0; - } -} - -.ecosystem .contributor.main-content { - padding-top: 0; -} - -.ecosystem .contributor.main-content .navbar { - padding-left: 0; - padding-bottom: 0; - padding-top: 0; - .nav-item { - cursor: pointer; - &:last-of-type { - position: relative; - } - } - @media (min-width: 992px) { - .nav-item { - padding: 2rem; - cursor: pointer; - } - - .nav-link { - position: relative; - top: 10%; - transform: translateY(-50%); - } - } - - .nav-select { - background-color: $white; - .nav-link { - color: $orange; - font-weight: 500; - } - } - - .nav-link { - font-size: rem(18px); - color: #8c8c8c; - @include desktop { - margin-left: rem(30px); - } - &:hover { - color: $orange; - } - } - - .contributor-nav-link { - padding-left: rem(20px); - padding-right: rem(20px); - - @include desktop { - padding-left: rem(30px); - padding-right: rem(30px); - } - } - - .contributor-nav { - flex-direction: row; - } - - .nav-item { - padding-top: rem(15px); - padding-bottom: rem(15px); - @include desktop { - padding-bottom: 0; - padding-top: 2rem; - } - @include small-desktop { - padding-bottom: 0; - padding-top: 2rem; - } - @media (max-width: 990px) { - padding-bottom: rem(10px); - padding-top: 1rem; - } - } - - .navbar-toggler { - margin-left: rem(40px); - } -} - -.past-issue-container { - display: flex; - @media (max-width: 767px) { - display: block; - } -} - -.past-issue-container .get-started-cloud-sidebar{ - .sticky-top { - position: sticky; - top: 15%; - @media (max-width: 767px) { - position: relative; - top: 0; - margin-left: 0; - } - } - - .pytorch-article { - li { - list-style: initial; - } - } - - li { - list-style-type: none; - line-height: 36px; - color: #8c8c8c; - } - span { - white-space: nowrap; - } -} - -#past-issues { - max-width: 920px; - margin: auto; - margin-top: 0; - margin-bottom: 0; -} - -.contributor-container { - max-width: 920px; - left: 0; - right: 0; - margin-left: auto; - margin-right: auto; - padding-left: 30px; - padding-right: 30px; - width: 90%; -} - -.past-issue-container.container { - padding-left: 5px; - padding-top: 45px; -} - -.nav-background { - width: 100%; - background-color: $very_light_grey; -} - -#get-started-contributor-sidebar-list { - padding-left: 0; - .active { - color: $orange; - } - li { - a { - color: #8c8c8c; - } - } -} - -.two-column-row { - max-width: 920px; - margin: 0 auto 0 auto; - padding: 0 30px 43px 30px; - width: 90%; - - @include desktop { - display: flex; - } - - h2 
{ - text-transform: uppercase; - font-weight: 100; - margin-bottom: 30px; - } - - p { - margin-bottom: 40px; - } - - .content-left { - flex: 60%; - padding-top: 76px; - - @include desktop { - margin-right: 62px; - } - - h2 { - color: $orange; - } - - .contributor-consent-check { - max-width: 400px; - } - - .email-consent { - color: $mid_gray; - font-size: 14px; - } - - .please-accept-terms { - display: none; - color: $orange; - font-size: 14px; - } - } - - .content-right { - flex: 40%; - padding-top: 76px; - - h2 { - color: $purple; - } - } - - .contributor-form { - margin: -8px 0 47px 0; - - .form-success, - .form-fail { - color: $orange; - display: none; - flex: none; - margin: 8px 0 12px 0; - } - - form { - width: 100%; - - .contributor-form-ui { - display: flex; - max-width: 390px; - flex-wrap: wrap; - - input[type="text"] { - border: 1px solid darken($color: #f3f3f3, $amount: 5); - border-radius: 4px; - flex: 1 70%; - padding: 5px 8px 5px 8px; - margin-right: 10px; - - &::placeholder { - color: darken($color: #f3f3f3, $amount: 20); - } - - &:focus { - border: 1px solid $orange; - } - } - - input[type="submit"] { - background: darken($color: #f3f3f3, $amount: 5); - border: none; - border-radius: 4px; - color: #6d6d6d; - - &:hover { - background: darken($color: #f3f3f3, $amount: 20); - color: darken($color: #6d6d6d, $amount: 20); - } - } - } - } - - input[type="checkbox"] { - margin: 1px 6px 0 0; - } - - .contributor-consent-check { - color: $mid_gray; - margin-top: 1rem; - } - } - - .contributors-button { - background-image: url($baseurl + "/assets/images/chevron-right-orange.svg"); - background-color: $white; - background-size: 6px 13px; - background-position: center right 10px; - background-repeat: no-repeat; - border: 2px solid $light_grey; - color: $dark_grey; - cursor: pointer; - font-size: 1.125rem; - outline: none; - letter-spacing: -0.25px; - line-height: rem(28px); - margin-bottom: 0.125rem; - padding: rem(10px) rem(30px) rem(10px) rem(20px); - - a { - color: $dark_grey; - } - - @include animated_border_hover_state; - } -} diff --git a/_sass/cookie-banner.scss b/_sass/cookie-banner.scss deleted file mode 100644 index 86885c83dad1..000000000000 --- a/_sass/cookie-banner.scss +++ /dev/null @@ -1,53 +0,0 @@ -.cookie-banner-wrapper { - display: none; - - &.is-visible { - display: block; - position: fixed; - bottom: 0; - background-color: $light_grey; - min-height: 100px; - width: 100%; - z-index: 401; - border-top: 3px solid #ededee; - } - - .gdpr-notice { - color: $dark_grey; - margin-top: rem(25px); - text-align: left; - max-width: 1440px; - @include desktop { - width: 77%; - } - @include small-desktop { - width: inherit; - } - - .cookie-policy-link { - color: #343434; - } - } - - .close-button { - appearance: none; - background: transparent; - border: 1px solid $light_grey; - height: rem(21px); - position: absolute; - bottom: 42px; - right: 0; - top: 0; - cursor: pointer; - outline: none; - @include desktop { - right: 20%; - top: inherit; - } - - @include small-desktop { - right: 0; - top: 0; - } - } -} diff --git a/_sass/deep-learning.scss b/_sass/deep-learning.scss deleted file mode 100644 index 4399b4b775d0..000000000000 --- a/_sass/deep-learning.scss +++ /dev/null @@ -1,179 +0,0 @@ -.deep-learning { - .header-container { - @include max-width-desktop { - margin-bottom: 1rem; - } - } - - .jumbotron { - height: 180px; - @include desktop { - height: 250px; - } - - .thank-you-page-container { - margin-top: 0; - @include small-desktop { - margin-top: 250px; - } - } - - 
.deep-learning-jumbotron-text { - @include desktop { - margin-top: 55px; - - h1 { - padding-top: 30px; - } - } - @include small-desktop { - max-width: 95%; - flex-basis: 100%; - } - } - - .deep-learning-thank-you-text { - width: 80%; - .download-book-link { - display: inline-block; - } - } - - .deep-learning-landing-text { - width: 100%; - @include desktop { - width: 85% - } - } - - .deep-learning-book-container { - display: none; - @include desktop { - display: block - } - @include small-desktop { - display: none; - } - } - - .thank-you-book-container { - display: none; - @include small-desktop { - display: block; - } - @include desktop { - display: block; - } - } - } - - .deep-learning-col { - @include desktop { - max-width: 80%; - } - } - - .deep-learning-background { - @include desktop { - height: 440px; - } - } - - .header-holder { - @include desktop { - height: 90px; - } - } -} - -.deep-learning { - .main-content-wrapper { - margin-top: 250px; - @include desktop { - margin-top: 480px; - } - } - - .deep-learning-content { - @include desktop { - padding-top: 0; - } - } - - .main-background { - height: 250px; - @include desktop { - height: 440px - } - } - - .thank-you-wrapper { - margin-top: 400px; - @include desktop { - margin-top: 275px; - } - } - - .thank-you-background { - height: 438px; - @include desktop { - height: 680px; - } - } -} - -.deep-learning-container { - display: flex; - align-items: center; -} - -.deep-learning-logo { - background-image: url($baseurl + "/assets/images/pytorch-logo.png"); -} - -.deep-learning-row { - display: flex; - align-items: center; - .lead { - margin-top: 1rem; - margin-bottom: 2rem; - } - h1 { - @include small-desktop { - font-size: 3rem; - } - @include desktop { - margin-top: 2rem; - } - } -} - -.deep-learning-book { - max-width: 100%; - height: 400px; -} - -.deep-learning-form { - margin-left: -1rem; - @include desktop { - margin-left: 0; - margin-top: 1rem; - } -} - -#deep-learning-button { - margin-top: 2rem; -} - -.deep-learning-form { - .email-subscribe-form { - .deep-learning-input { - padding-left: .5rem; - background-color: #f3f4f7; - } - } - - #mce-error-response { - color: $orange; - } -} diff --git a/_sass/ecosystem.scss b/_sass/ecosystem.scss deleted file mode 100644 index 3c5289a12cdd..000000000000 --- a/_sass/ecosystem.scss +++ /dev/null @@ -1,450 +0,0 @@ -.ecosystem .jumbotron { - height: 170px; - @include desktop { - height: 300px; - } - - h1 { - padding-top: rem(135px); - color: $white; - } - - p.lead { - margin-bottom: rem(25px); - padding-top: rem(20px); - color: $white; - } - - .ecosystem-join { - margin-bottom: rem(48px); - } - - svg { - margin-bottom: rem(20px); - } -} - -.ecosystem .main-content { - @include desktop { - padding-top: 3.25rem; - } -} - -.ecosystem .main-content-wrapper { - background-color: $light_grey; - - margin-top: 340px; - @include desktop { - margin-top: 435px; - } -} - -.ecosystem.ecosystem-detail .main-content-wrapper { - background-color: $white; -} - -.ecosystem-cards-wrapper { - margin-bottom: rem(18px); - padding-top: rem(20px); - .col-md-6 { - @media (min-width: 768px) { - flex: 0 0 100%; - max-width: 100%; - } - - @include max-width-desktop { - flex: 0 0 50%; - max-width: 50%; - } - } -} - -.ecosystem .main-content-menu { - .navbar-nav .nav-link { - font-size: rem(18px); - color: $very_dark_grey; - padding-right: 0; - margin-right: rem(30px); - - &.selected { - color: $orange; - border-bottom: 1px solid $orange; - } - } - - .nav-item:last-of-type { - @include desktop { - position: absolute; - 
right: 0; - a { - margin-right: 0; - } - } - } -} - -.ecosystem.ecosystem-detail .main-content { - padding-bottom: 0; -} - -.ecosystem article.pytorch-article { - counter-reset: article-list; - - > ol { - padding-left: 0; - list-style-type: none; - } - - > ol > li { - @include max-width-desktop { - position: relative; - - &:before { - counter-increment: article-list; - content: counter(article-list, decimal-leading-zero); - color: #B932CC; - line-height: rem(40px); - letter-spacing: -0.34px; - font-size: rem(32px); - font-weight: 300; - position: absolute; - left: -60px; - top: -16px; - padding: rem(10px) 0; - background-color: $white; - z-index: 10; - } - - &:after { - content: ""; - width: 2px; - position: absolute; - left: -42px; - top: 0; - height: 100%; - background-color: #f3f3f3; - z-index: 9; - } - } - - > h4 { - color: $slate; - } - - ul li { - list-style-type: disc; - } - } -} - -.ecosystem .quick-starts { - background: #ecedf1; - - .title-block, - #command, - .option, - .cloud-option { - border-color: #ecedf1; - } -} - -.ecosystem { - .join-link { - color: inherit; - text-decoration: underline; - } - - .join-notice { - text-align: center; - padding-top: rem(20px); - padding-bottom: rem(40px); - - p { - color: $dark_grey; - margin-bottom: 0; - line-height: rem(30px); - } - } - - .join-jumbotron { - @include desktop { - height: 262px; - } - width: 90%; - - .container { - max-width: 920px; - } - - h1 { - padding-top: rem(5px); - color: $white; - span { - font-weight: 300; - } - } - } - - .join-wrapper { - background-color: $light_grey; - - .main-content { - @include desktop { - padding-top: 1.5rem; - } - } - - .container { - max-width: 920px; - } - - #success-response { - color: $dark_grey; - } - } - - .join-intro { - color: $dark_grey; - line-height: 28px; - } - - .requirements { - - span { - color: $black; - font-weight: bold; - } - - .join-number { - color: $purple; - display: flex; - align-items: center; - @include desktop { - padding-left: rem(10px); - } - } - - p { - margin-bottom: 0; - margin-top: rem(-7px); - @include desktop { - padding-left: rem(24px); - } - } - - .col-md-11 { - @include desktop { - border-left: 2px solid $light_grey; - } - } - } - - .row.requirements { - padding-bottom: rem(40px); - } -} - -.ecosystem .experimental { - .ecosystem-card-title-container { - display: inline-flex; - .experimental-badge { - text-transform: uppercase; - margin-left: 15px; - background-color: #e4e4e4; - color: $not_quite_black; - opacity: 0.75; - font-size: rem(10px); - letter-spacing: 1px; - line-height: rem(22px); - height: rem(20px); - width: rem(96px); - text-align: center; - margin-top: rem(4px); - } - } -} - -.ecosystem { - .ecosystem-card-title-container { - .card-title { - padding-left: 0; - font-size: 1.5rem; - color: $slate; - } - } - - .star-list { - list-style: none; - padding-left: 0; - li { - display: inline; - } - li.github-stars-count-whole-number { - display: none; - } - } - - - .icon-count-container { - display: inline-block; - vertical-align: text-bottom; - margin-left: rem(8px); - } - - .github-logo { - height: 15px; - width: 13px; - margin-left: 10px; - } - - .github-stars-count { - color: $mid_gray; - position: relative; - top: rem(4px); - font-size: 14px; - margin-left: 0.125rem; - @include desktop { - top: rem(3px); - font-size: initial; - } - } -} - -.ecosystem-divider { - position: relative; - margin-bottom: 4rem; - margin-top: 1.5rem; - top: 3rem; -} - -.ecosystem{ - #dropdownSort, #dropdownSortLeft { - margin-left: 0; - } -} - -.ecosystem{ - 
#dropdownSortLeft { - font-size: 19px; - top: inherit; - right: inherit; - } -} - -.ecosystem-filter-menu { - ul { - list-style-type: none; - padding-left: rem(20px); - li { - padding-right: rem(20px); - word-break: break-all; - - a { - color: $mid_gray; - &:hover { - color: $orange; - } - } - } - } -} - -.ecosystem .ecosystem-filter { - cursor: pointer; - ul { - list-style-type: none; - } -} - -.ecosystem #dropdownFilter, #dropdownSort, #dropdownSortLeft { - color: $mid_gray; - cursor: pointer; - z-index: 1; - position: absolute; -} - -.ecosystem .pagination { - .page { - border: 1px solid #dee2e6; - padding: 0.5rem 0.75rem; - } - - .active .page { - background-color: #dee2e6; - } -} - -.ecosystem-form { - .hbspt-form { - padding-bottom: rem(48px); - - .hs-form-field { - width: 100%; - } - - .hs-form-field .input input { - width: 100%; - border: none; - border-bottom: 2px solid $purple; - height: rem(44px); - outline: none; - padding-left: rem(15px); - margin-bottom: rem(30px); - } - - .hs-richtext h3 { - text-transform: uppercase; - padding-top: rem(25px); - padding-bottom: rem(30px); - } - - label { - color: $dark_grey; - } - - textarea { - width: 100%; - border: none; - border-bottom: 2px solid $purple; - outline: none; - padding-left: rem(15px); - margin-bottom: rem(30px); - height: rem(90px); - padding-top: rem(10px); - } - - ::placeholder { - color: $dark_grey; - opacity: 0.5; - ; - } - - .actions { - display: flex; - width: 100%; - justify-content: center; - } - - .hs-button { - @include desktop { - padding-left: rem(18px); - background-origin: content-box; - background-size: 30px 15px; - } - padding-left: rem(12px); - margin-top: rem(40px); - background-color: $orange; - color: $white; - cursor: pointer; - border: none; - width: 30%; - height: rem(45px); - text-align: left; - background-repeat: no-repeat; - background-image: url(/assets/images/arrow-right-with-tail-white.svg); - background-size: 30px 12px; - background-position: right; - } - - } -} \ No newline at end of file diff --git a/_sass/enterprise.scss b/_sass/enterprise.scss deleted file mode 100644 index 5b9f79fac1e3..000000000000 --- a/_sass/enterprise.scss +++ /dev/null @@ -1,35 +0,0 @@ -.mobile .enterprise-jumbotron { - height: 210px; - @include desktop { - height: 280px; - } -} -.enterprise { - padding-bottom: 0; - p, li { - color: $content_text_color; - font-size: 18px; - } - h2 { - padding-bottom: 1.5rem; - } - .container { - padding: 48px 30px 48px 30px; - } - .enterprise-gray-container { - background-color: $light_grey; - } - .pyt-enterprise-logo { - background-image: url($baseurl + "/assets/images/PTE_lockup_PRIMARY.svg"); - background-repeat: no-repeat; - height: 60px; - } - .container { - max-width: 940px; - } - .enterprise-landing-azure-logo-container { - float: left; - padding: 0; - } -} - diff --git a/_sass/events.scss b/_sass/events.scss deleted file mode 100644 index 18e89c238ca2..000000000000 --- a/_sass/events.scss +++ /dev/null @@ -1,356 +0,0 @@ -.ecosystem { - .events-wrapper { - background-color: white; - @include desktop { - margin-top: 472px; - } - } - .events { - padding-top: 0; - .event-info-container { - display: flex; - flex-flow: column; - } - .sticky-top { - top: 15%; - } - .event-label { - margin-bottom: 2rem; - } - } - .live-event-container { - display: flex; - @media (max-width: 767px) { - flex-flow: wrap; - } - } - .events-section { - max-width: 920px; - margin: 0 auto 0 auto; - padding: 0 30px 43px 30px; - width: 90%; - .event-item { - padding-bottom: 3rem; - border-bottom: 1px solid 
#D6D7D8; - h2 { - padding-bottom: 1rem; - } - } - } - .community-event { - margin: 0; - padding: 3px 10px; - border: 1px solid #8c8c8c; - border-radius: 3px; - text-transform: uppercase; - font-size: 14px; - font-weight: 700; - color: #8c8c8c; - } - .event-side-nav-container { - padding-left: 3rem; - ul { - list-style: none; - } - } - .live-events-section { - p { - font-size: 18px; - margin-top: 2rem; - } - @include small-desktop { - width: 100%; - padding-left: 5px; - padding-right: 5px; - } - @media (max-width: 767px) { - width: 100%; - padding-left: 5px; - padding-right: 5px; - } - } -} -.ecosystem .events.main-content { - padding-top: 0; -} - -.events-container-fluid { - height: 5rem; - width: 100%; - padding-bottom: 7rem; - @media screen and (max-width: 767px) { - margin-top: 2rem; - } - @include full-nav-menu-desktop { - margin-left: 0; - } -} - -.events-container { - max-width: 920px; - left: 0; - right: 0; - margin-left: auto; - margin-right: auto; - padding-left: 0px; - padding-right: 0px; - width: 90%; -} - - - -.ecosystem .events.main-content .navbar { - padding-left: 0; - padding-bottom: 0; - padding-top: 0; - .nav-item { - cursor: pointer; - &:last-of-type { - position: relative; - } - } - @media (min-width: 992px) { - .nav-item { - padding: .5rem; - cursor: pointer; - } - - .nav-link { - position: relative; - top: 10%; - transform: translateY(-50%); - } - } - - .nav-select { - background-color: $white; - .nav-link { - color: $orange; - font-weight: 500; - } - } - - .nav-link { - font-size: rem(18px); - color: #8c8c8c; - @include desktop { - margin-left: rem(30px); - } - &:hover { - color: $orange; - } - } - - .events-nav-link { - padding-left: rem(15px); - padding-right: rem(5px); - - @include desktop { - padding-left: rem(20px); - padding-right: rem(20px); - } - } - - .events-nav { - flex-direction: row; - } - - .nav-item { - padding-top: rem(15px); - padding-bottom: rem(15px); - @include desktop { - padding-bottom: 0; - padding-top: 2rem; - } - @include small-desktop { - padding-bottom: 0; - padding-top: 2rem; - } - @media (max-width: 990px) { - padding-bottom: rem(10px); - padding-top: 1rem; - } - } - - .navbar-toggler { - margin-left: rem(40px); - } -} - -.events-video-wrapper { - width: 100%; - border: 1px solid $mid_gray; - background-color: $light_grey; - height: 21rem; - margin-top: 2.5rem; - .video-container { - display: flex; - top: 12%; - } - .video-tabs { - display: flex; - } - .events-video-nav { - flex-direction: row; - padding-right: 0; - margin-bottom: 1rem; - .nav-item { - border-right: 1px solid $mid_gray; - border-bottom: 1px solid $mid_gray; - } - .nav-select { - background-color: $white; - border-bottom: none; - .nav-link { - color: $orange; - } - } - } - .events-nav-link { - text-align: center; - } - .video { - position: relative; - height: 0; - padding-bottom: 30%; - place-self: center; - } - .video-info { - margin-left: 3rem; - max-width: 45%; - } - iframe { - height: 100%; - width: 100%; - position: absolute; - } -} -.video-links-container { - border: 1px solid $mid_gray; - .video-links { - display: flex; - .video-link-item { - padding-left: 1rem; - list-style: none; - } - } -} -.episode-header-text { - font-size: 26px; - margin-bottom: 2rem; -} -.episode-card-row { - display: block; - @media screen and (min-width: 908px) { - display: flex; - flex-wrap: wrap; - margin-bottom: 2rem; - } - .episode-card.resource-card { - height: 14rem; - margin-right: 1rem; - margin-bottom: 1rem; - background-color: $light_grey; - border: none; - max-width: 31%; - flex: 
auto; - ul { - list-style: none; - } - a{ - color: inherit; - } - .episode-body { - display: block; - position: relative; - top: 30px; - margin-left: 20px; - } - - .episode-title { - margin-left: 3.2rem; - margin-bottom: .5rem; - font-size: rem(24px); - @include desktop { - margin-left: 2.5rem; - } - } - - .guest-name { - font-weight: 500; - font-size: rem(20px); - overflow: hidden; - white-space: nowrap; - text-overflow: ellipsis; - } - - .episode-info { - display: flex; - justify-content: space-between; - span { - padding-left: 5px; - padding-right: 5px; - } - } - .info-divide { - display: block; - border-bottom: 1px solid #D6D7D8; - margin-top: .5rem; - margin-bottom: .5rem; - } - .episode-poster { - color: $orange; - } - .episode-date-time { - display: flex; - padding-left: 0; - span { - padding-left: 5px; - padding-right: 5px; - } - } - @media screen and (max-width: 907px) { - max-width: 100%; - margin-bottom: 1.25rem; - } - } - .episode-card.resource-card.pytorch-resource:before { - content: ""; - background-size: 32px 32px; - background-repeat: no-repeat; - display: block; - position: absolute; - height: 32px; - width: 32px; - top: 30px; - left: 15px; - @include desktop { - left: 30px; - top: 30px; - } - } -} - -.podcast-container { - padding-left: 0; - @include desktop { - display: flex; - .podcast-card:not(:first-of-type) { - margin-left: 1rem; - } - } - .podcast-card { - display: flex; - align-items: center; - justify-content: center; - margin-top: 2rem; - border: 1px solid #D6D7D8; - height: rem(140px); - @include animated_border_hover_state; - } - .podcast-title { - font-size: 24px; - font-weight: 400; - } - -} diff --git a/_sass/features.scss b/_sass/features.scss deleted file mode 100644 index 3d77df1e3326..000000000000 --- a/_sass/features.scss +++ /dev/null @@ -1,188 +0,0 @@ -.features { - .main-content { - padding-bottom: 0; - } - .navbar-nav .nav-link { - color: $black; - } - - .nav-logo { - background-image: url($baseurl + "/assets/images/logo-dark.svg"); - } - - .main-background { - @include desktop { - height: 575px - } - } -} - -.features { - .main-content-wrapper { - margin-top: 350px; - @include desktop { - margin-top: 540px; - } - } -} - -.features-row { - padding-bottom: rem(60px); - align-items: center; - - &:first-of-type { - margin-top: rem(20px); - } - - &:last-of-type { - padding-bottom: rem(72px); - } - - @include desktop { - padding-bottom: rem(96px); - &:first-of-type { - margin-top: 4.05rem; - } - } - - h3 { - font-size: rem(32px); - letter-spacing: 1.78px; - line-height: rem(36px); - font-weight: 400; - text-transform: uppercase; - margin-bottom: rem(20px); - font-weight: 300; - - @include small-desktop { - width: 80%; - } - - @include max-width-desktop { - width: 590px; - } - } - - p { - font-size: rem(18px); - letter-spacing: 0.25px; - line-height: rem(28px); - color: $dark_grey; - padding-right: rem(30px); - - @include small-desktop { - width: 80%; - } - - @include max-width-desktop { - width: 590px; - } - } - - .feature-content-holder { - width: 100%; - - @include max-width-desktop { - width: 495px; - } - - pre.highlight { - margin-bottom: 0; - } - } - - &:nth-child(odd) { - .col-md-6:nth-child(1n) { - order: 2; - } - - .col-md-6:nth-child(2n) { - order: 1; - } - - @include desktop { - .col-md-6:nth-child(1n) { - order: 1; - } - - .col-md-6:nth-child(2n) { - order: 2; - } - } - } - - &:nth-child(1n) { - h3 { - color: #B73BC9; - } - - .feature-content-holder { - border-bottom: 2px solid #B73BC9; - } - } - - &:nth-child(2n) { - h3 { - color: #D92F4C; - 
} - - .feature-content-holder { border-bottom: 2px solid #D92F4C; - } - } - - &:nth-child(3n) { - h3 { - color: #8038E0; - } - - .feature-content-holder { - border-bottom: 2px solid #8038E0; - } - } - - .col-md-6 { - @include max-width-desktop { - padding-left: 0; - padding-right: 0; - } - - @include desktop { - &:nth-of-type(2) { - .feature-content { - width: 100%; - - h3, p, .feature-content-holder { - float: right; - } - } - } - } - } -} - -.features .jumbotron { - height: 200px; - @include desktop { - height: 195px; - } - @media (max-width: 320px) { - height: 250px; - } - h1 { - padding-top: rem(30px); - } - @include desktop { - height: 468px; - h1 { - padding-top: 0; - } - } - h1, p { - color: $white; - } - .btn { - @include desktop { - margin-top: rem(6px); - } - } -} diff --git a/_sass/fonts.scss b/_sass/fonts.scss deleted file mode 100644 index 61c8cd48e86f..000000000000 --- a/_sass/fonts.scss +++ /dev/null @@ -1,111 +0,0 @@ -@font-face { - font-family: FreightSans; - font-weight: 700; - font-style: normal; - src: url($baseurl + "/assets/fonts/FreightSans/freight-sans-bold.woff2") format("woff2"), - url($baseurl + "/assets/fonts/FreightSans/freight-sans-bold.woff") format("woff"); -} - -@font-face { - font-family: FreightSans; - font-weight: 700; - font-style: italic; - src: url($baseurl + "/assets/fonts/FreightSans/freight-sans-bold-italic.woff2") format("woff2"), - url($baseurl + "/assets/fonts/FreightSans/freight-sans-bold-italic.woff") format("woff"); -} - -@font-face { - font-family: FreightSans; - font-weight: 500; - font-style: normal; - src: url($baseurl + "/assets/fonts/FreightSans/freight-sans-medium.woff2") format("woff2"), - url($baseurl + "/assets/fonts/FreightSans/freight-sans-medium.woff") format("woff"); -} - -@font-face { - font-family: FreightSans; - font-weight: 500; - font-style: italic; - src: url($baseurl + "/assets/fonts/FreightSans/freight-sans-medium-italic.woff2") format("woff2"), - url($baseurl + "/assets/fonts/FreightSans/freight-sans-medium-italic.woff") format("woff"); -} - -@font-face { - font-family: FreightSans; - font-weight: 100; - font-style: normal; - src: url($baseurl + "/assets/fonts/FreightSans/freight-sans-light.woff2") format("woff2"), - url($baseurl + "/assets/fonts/FreightSans/freight-sans-light.woff") format("woff"); -} - -@font-face { - font-family: FreightSans; - font-weight: 100; - font-style: italic; - src: url($baseurl + "/assets/fonts/FreightSans/freight-sans-light-italic.woff2") format("woff2"), - url($baseurl + "/assets/fonts/FreightSans/freight-sans-light-italic.woff") format("woff"); -} - -@font-face { - font-family: FreightSans; - font-weight: 400; - font-style: italic; - src: url($baseurl + "/assets/fonts/FreightSans/freight-sans-book-italic.woff2") format("woff2"), - url($baseurl + "/assets/fonts/FreightSans/freight-sans-book-italic.woff") format("woff"); -} - -@font-face { - font-family: FreightSans; - font-weight: 400; - font-style: normal; - src: url($baseurl + "/assets/fonts/FreightSans/freight-sans-book.woff2") format("woff2"), - url($baseurl + "/assets/fonts/FreightSans/freight-sans-book.woff") format("woff"); -} - -@font-face { - font-family: IBMPlexMono; - font-weight: 600; - font-style: normal; - unicode-range: u+0020-007f; - src: local("IBMPlexMono-SemiBold"), - url($baseurl + "/assets/fonts/IBMPlexMono/IBMPlexMono-SemiBold.woff2") - format("woff2"), - url($baseurl + "/assets/fonts/IBMPlexMono/IBMPlexMono-SemiBold.woff") - format("woff"); -} - -@font-face { - font-family: IBMPlexMono; - font-weight: 500; - 
font-style: normal; - unicode-range: u+0020-007f; - src: local("IBMPlexMono-Medium"), - url($baseurl + "/assets/fonts/IBMPlexMono/IBMPlexMono-Medium.woff2") - format("woff2"), - url($baseurl + "/assets/fonts/IBMPlexMono/IBMPlexMono-Medium.woff") - format("woff"); -} - -@font-face { - font-family: IBMPlexMono; - font-weight: 400; - font-style: normal; - unicode-range: u+0020-007f; - src: local("IBMPlexMono-Regular"), - url($baseurl + "/assets/fonts/IBMPlexMono/IBMPlexMono-Regular.woff2") - format("woff2"), - url($baseurl + "/assets/fonts/IBMPlexMono/IBMPlexMono-Regular.woff") - format("woff"); -} - -@font-face { - font-family: IBMPlexMono; - font-weight: 300; - font-style: normal; - unicode-range: u+0020-007f; - src: local("IBMPlexMono-Light"), - url($baseurl + "/assets/fonts/IBMPlexMono/IBMPlexMono-Light.woff2") - format("woff2"), - url($baseurl + "/assets/fonts/IBMPlexMono/IBMPlexMono-Light.woff") - format("woff"); -} diff --git a/_sass/footer.scss b/_sass/footer.scss deleted file mode 100644 index 0433b1a0e446..000000000000 --- a/_sass/footer.scss +++ /dev/null @@ -1,511 +0,0 @@ -.site-footer { - padding: rem(60px) 0; - width: 100%; - background: $black; - background-size: 100%; - margin-left: 0; - margin-right: 0; - - @include desktop { - position: absolute; - left: 0; - bottom: 0; - height: $desktop_footer_height; - } - - p { - color: $white; - } - - ul { - list-style-type: none; - padding-left: 0; - margin-bottom: 0; - } - - ul li { - font-size: rem(18px); - line-height: rem(32px); - color: #A0A0A1; - padding-bottom: rem(6px); - - &.list-title { - padding-bottom: rem(12px); - color: $white; - p { - margin-bottom: 0; - } - } - } - - a:link, - a:visited { - color: inherit; - } - - @include desktop { - a:hover { - color: $orange; - } - } - - .privacy-policy { - background: #000000; - border-top: 1px solid #fff; - display: flex; - flex-direction: column; - margin-top: 40px; - ul { - border-bottom: 1px solid white; - .privacy-policy-links { - padding-bottom: 1rem; - padding-top: 1rem; - padding-right: 1rem; - display: inline-flex; - color: white; - } - } - .copyright { - padding-top: 1rem; - p { - color: #dfdfdf; - font-size: 14px; - } - a { - color: #dfdfdf; - font-weight: 600; - &:hover { - color: #dfdfdf; - font-weight: 600; - } - } - } - } -} - -.docs-tutorials-resources { - background-color: $slate; - color: $white; - padding-top: rem(40px); - padding-bottom: rem(40px); - - @include desktop { - padding-top: rem(66px); - padding-bottom: 4.09rem; - } - - h2 { - font-size: rem(24px); - letter-spacing: -0.25px; - text-transform: none; - margin-bottom: 0.25rem; - - @include desktop { - margin-bottom: rem(20px); - } - } - - .col-md-4 { - margin-bottom: rem(32px); - @include desktop { - margin-bottom: 0; - } - } - - .with-right-arrow { - margin-left: 12px; - background-position: top 3px right 11px; - - @include desktop { - background-position: top 6px right 11px; - } - - &:hover { - background-image: url($baseurl + "/assets/images/chevron-right-white.svg"); - } - } - - p { - font-size: rem(16px); - line-height: rem(24px); - letter-spacing: 0.22px; - color: #A0A0A1; - margin-bottom: rem(8px); - - @include desktop { - margin-bottom: rem(20px); - } - } - - a { - font-size: rem(18px); - color: $orange; - &:hover { - color: $white; - } - } -} - -.footer-container { - position: relative; -} - -.footer-logo-wrapper { - display: none; - @include desktop { - display: flex; - grid-column: span 6; - } - .footer-logo { - img { - width: 40px; - } - } -} - - -.footer-links-wrapper { - display: flex; - 
flex-wrap: wrap; - padding-bottom: 1rem; - border-bottom: 1px solid white; - - @include desktop { - flex-wrap: initial; - justify-content: flex-end; - } -} - -.footer-links-col { - margin-bottom: rem(60px); - width: 50%; - - @include desktop { - margin-bottom: 0; - width: 14%; - margin-right: 23px; - - &.follow-us-col { - width: 18%; - margin-right: 0; - } - } - - @include small-desktop { - width: 18%; - margin-right: 30px; - } -} - -.footer-social-icons { - margin: rem(137px) 0 rem(40px) 0; - - a { - height: 32px; - width: 32px; - display: inline-block; - background-color: $very_dark_grey; - border-radius: 50%; - margin-right: 5px; - - &.facebook { - background-image: url($baseurl + "/assets/images/logo-facebook-dark.svg"); - background-position: center center; - background-size: 9px 18px; - background-repeat: no-repeat; - } - - &.twitter { - background-image: url($baseurl + "/assets/images/logo-twitter-dark.svg"); - background-position: center center; - background-size: 17px 17px; - background-repeat: no-repeat; - } - - &.youtube { - background-image: url($baseurl + "/assets/images/logo-youtube-dark.svg"); - background-position: center center; - background-repeat: no-repeat; - } - } -} - -.site-footer { - .mc-field-group { - margin-top: -2px; - } -} - -.site-footer { - .email-subscribe-form input[type="submit"]{ - top: 9px; - @include desktop { - top: 13px; - } - } -} - -.social-links { - grid-column: span 12; - - @media (min-width: 600px) { - grid-column: span 8; - } - - @include desktop { - grid-column: span 6; - align-self: end; - } - - @media (max-width: 999px) { - margin-left: 10px; - margin-right: 10px; - } - - display: grid; - grid-column-gap: 3%; - grid-row-gap: 30px; - grid-template-columns: repeat(6, minmax(0, 1fr)); - - li { - text-align: center; - } -} - -.social-links { - svg { - height: 25px; - max-width: 30px; - fill: #fff; - color: #fff; - &:hover { - fill: #ee4c2c; - color: #ee4c2c; - } - } -} - -.lf-grid { - grid-column-gap: 3%; - grid-row-gap: 30px; - display: grid; - grid-template-columns: repeat(12,1fr); -} - -// removes captcha image from flow. -.hs-recaptcha { - display: none; -} - -.newsletter { - line-height: 140%; - margin-bottom: 80px; - - &__title { - line-height: 140%; - font-size: 24px; - @media (min-width: 1000px) { - font-size: 40px; - } - } - - .legal-consent-container { - display: none; - } - - p.newsletter__privacy { - max-width: 860px; - margin-top: 30px; - line-height: 21px; - font-size: 14px; - color: #dfdfdf; - a { - color: #dfdfdf; - font-weight: 600; - &:hover { - color: #dfdfdf; - font-weight: 600; - } - } -} - - // form container. - .hbspt-form { - min-height: 300px; - @media (min-width: 500px) { - min-height: 100px; - } - @media (min-width: 1000px) { - min-height: 20px; - } - - // Displays if required field not filled. - .hs-error-msg { - display: block; - margin-right: 8px; - color: $orange; - font-size: 14px; - line-height: 1.1em; - width: 95%; - padding-top: 15px; - } - - // form inputs wrapper. 
- .hs-form { - display: grid; - grid-template-columns: 1fr; - grid-gap: 30px; - - @media (min-width: 500px) { - grid-template-columns: minmax(0, 1fr) minmax(0, 1fr); - } - - @media (min-width: 700px) { - grid-template-columns: repeat(3, minmax(0, 1fr)); - } - - @media (min-width: 950px) { - grid-template-columns: 1fr 1fr 1fr 1fr 1fr; - grid-row-gap: 1.5rem; - grid-column-gap: 1.5rem; - } - - input[type='text'], - input[type='email'] { - height: 50px; - @media (min-width: 500px) { - height: 42px; - } - width: 100%; - background: transparent; - border: none; - border-bottom: 2px solid $white; - border-radius: 0; - transition: all 0.25s ease; - color: $white; - font-size: 16px; - @media (min-width: 500px) { - font-size: 20px; - } - line-height: 105%; - &::placeholder { - color: $white; - font-size: 16px; - @media (min-width: 500px) { - font-size: 20px; - } - line-height: 105%; - } - &:focus { - outline: 0; - border-bottom: 2px solid $orange; - transition: color 0.25s ease; - &::placeholder { - transition: color 0.25s ease; - color: transparent; - } - } - } - - // Controls autocomplete styles. - input, - textarea, - select { - &:-webkit-autofill, - &:-webkit-autofill:hover, - &:-webkit-autofill:focus { - -webkit-text-fill-color: $white; - } - } - - select { - appearance: none; - background: transparent; - border: 0px solid transparent; - border-bottom: 2px solid $white; - border-radius: 0; - box-shadow: 0 1px 0 1px rgba(0, 0, 0, 0); - display: block; - height: 50px; - @media (min-width: 500px) { - height: 42px; - } - margin: 0; - max-width: 100%; - padding: 0.25em 0 calc(0.25em + 1px) 5px; - transition: all 0.25s ease; - width: 100%; - color: $white; - font-size: 16px; - @media (min-width: 500px) { - font-size: 20px; - } - line-height: 105%; - - &::-ms-expand { - display: none; - } - - &:focus { - outline: 0; - border-bottom: 2px solid $orange; - &::placeholder { - transition: color 0.4s ease; - color: transparent; - } - } - - option { - font-weight: normal; - color: black; - } - } - - .hs-button { - border-radius: 5px; - margin-top: 20px; - border: none; - background-color: $orange; - color: $white; - font-weight: 400; - padding: 11px 40px; - font-size: 16px; - font-weight: 700; - text-decoration: none; - } - - // underline errors. - .hs-input.invalid { - border-bottom: 2px dashed red !important; - } - - // hide general error message. - .hs_error_rollup { - display: none; - } - } - } - - // success message for newsletter footer only. 
- .submitted-message { - display: flex; - align-content: center; - align-items: center; - justify-content: center; - border: 2px solid $white; - min-height: 280px; - font-size: 18px; - padding: 20px 20px 0; - line-height: 1.1em; - @media (min-width: 500px) { - min-height: 80px; - } - @media (min-width: 1000px) { - min-height: unset; - } - } - - .submitted-message p { - max-width: none; - } -} diff --git a/_sass/get-started.scss b/_sass/get-started.scss deleted file mode 100644 index ca3c335c41fe..000000000000 --- a/_sass/get-started.scss +++ /dev/null @@ -1,320 +0,0 @@ -.get-started article { - margin-bottom: rem(80px); -} - -.get-started .quick-start-guides { - ul { - margin-bottom: 0; - padding-left: 0; - } -} - -.get-started .main-content-wrapper { - margin-top: 275px; - @include desktop { - margin-top: 260px + $desktop_header_height; - } -} - -.get-started .jumbotron { - height: 190px; - @include desktop { - height: 260px; - } -} - -.get-started .main-content .navbar { - background-color: #f3f4f7; - - padding-left: 0; - padding-bottom: 0; - padding-top: 0; - - @media (min-width: 992px) { - li:first-of-type { - padding-left: rem(55px); - } - - .nav-item { - padding: 1rem; - cursor: pointer; - } - - .nav-link { - position: relative; - top: 10%; - transform: translateY(-50%); - } - } - - .nav-select { - background-color: $white; - .nav-link { - color: $orange; - font-weight: 500; - } - } - - .nav-link { - font-size: rem(18px); - color: #8c8c8c; - - &:hover { - color: $orange; - } - } - - .get-started-nav-link { - padding-left: rem(20px); - padding-right: rem(20px); - - @include desktop { - padding-left: rem(30px); - padding-right: rem(30px); - } - } - - .nav-item { - padding-top: rem(15px); - padding-bottom: rem(15px); - @include desktop { - padding-bottom: 0; - padding-top: 2rem; - } - @include small-desktop { - padding-bottom: 0; - padding-top: 2rem; - } - @media (max-width: 990px) { - padding-bottom: rem(10px); - padding-top: 1rem; - } - } - - .navbar-toggler { - margin-left: rem(40px); - } -} - -.get-started .main-content { - padding-top: 0; - @include desktop { - padding-top: 1.9rem; - } -} - -.get-started .quick-start-module { - padding-bottom: 0; - padding-top: 0; - background-color: $white; - - .option, - #command { - border: 2px solid $white; - background: $light_grey; - } - - .title-block { - border: 2px solid $white; - } - - .selected { - background-color: $orange; - } - - h1 { - font-size: rem(32px); - letter-spacing: 1.78px; - line-height: rem(40px); - text-transform: uppercase; - margin-bottom: rem(24px); - } -} - -.get-started .nav-menu-wrapper { - background-color: $light_grey; - .container { - padding-left: 0; - padding-right: 0; - @include desktop { - padding-left: 30px; - padding-right: 30px; - } - } -} - -.get-started .navbar-nav { - flex-direction: row; -} - -#installation .os { - display: none; -} - -#installation .selected { - display: block; -} - -#cloud .platform { - display: none; -} - -#cloud .selected { - display: block; -} - -.screencast { - iframe { - width: 100% !important; - } - display: none; -} - -.get-started { - .quick-starts { - .row.ptbuild, - .row.os, - .row.package, - .row.language, - .row.cuda { - margin-bottom: rem(20px); - @include desktop { - margin-bottom: 0; - } - } - - @include small-desktop { - flex: 0 0 100%; - max-width: 100%; - } - - @include desktop { - margin-bottom: rem(40px); - - .row { - margin-bottom: 0; - } - } - - @include max-width-desktop { - margin-bottom: 0; - } - } -} - -.get-started .get-started-locally-sidebar { - 
padding-top: rem(40px); - padding-bottom: rem(40px); - top: 15%; - z-index: 385; - - @include desktop { - padding-top: 0; - max-height: 100vh; - overflow: auto; - } - - ul { - padding-left: 0; - } - - li { - list-style-type: none; - line-height: 36px; - - a { - color: #8c8c8c; - &.active, - &:hover { - color: $orange; - } - } - - .subitem { - padding-left: rem(20px); - } - } - - li.subitem { - padding-left: rem(20px); - } -} - -.cloud-nav { - display: none; -} - -.get-started .get-started-cloud-sidebar { - padding-top: rem(50px); - padding-bottom: rem(40px); - top: 15%; - - ul { - padding-left: 0; - } - - li { - list-style-type: none; - line-height: 36px; - - a { - color: #8c8c8c; - &.active, - &:hover { - color: $orange; - } - } - - .subitem { - padding-left: rem(20px); - } - } - - li.subitem { - padding-left: rem(20px); - } -} - -.pytorch-2 .article-wrapper article.pytorch-article table tr td:first-of-type { - padding-left: 10px; -} - -.pytorch-2 .article-wrapper article.pytorch-article { - table,td{ - border: 1px solid #A0A0A1; - padding: 10px; - } - - b, em, h3, h2, p, a, strong, td, tr { - font-family: Verdana; - } - - ul, ol { - margin: 1.5rem 0 1.5rem 0; - - li { - font-family: Verdana; - } - } - - code { - font-family: IBMPlexMono,SFMono-Regular,Menlo,Monaco,Consolas,"Liberation Mono","Courier New",monospace; - padding: 2px; - color: inherit; - background-color: #f1f1f1; - } - - p, a { - font-family: Verdana; - word-break: break-word; - strong { - font-family: Verdana; - } - } - - .QnATable { - @media screen and (max-width: 418px) { - max-width: 95vw; - } - } -} diff --git a/_sass/homepage.scss b/_sass/homepage.scss deleted file mode 100644 index 8ec34e9ca4d9..000000000000 --- a/_sass/homepage.scss +++ /dev/null @@ -1,393 +0,0 @@ -.homepage { - - .main-content-wrapper { - margin-top: 315px; - @include desktop { - margin-top: 472px; - } - } - h2 { - margin-bottom: rem(25px); - text-transform: uppercase; - letter-spacing: 1.78px; - line-height: rem(40px); - - @include desktop { - margin-bottom: rem(33px); - } - } - - h3 { - font-size: rem(24px); - letter-spacing: 1.33px; - line-height: rem(32px); - text-transform: uppercase; - margin-bottom: rem(20px); - } - - h5 { - margin-bottom: rem(8px); - @include desktop { - margin-bottom: rem(15px); - } - } - - .jumbotron { - height: 195px; - @include desktop { - height: 395px; - } - .btn { - margin-top: rem(6px); - } - } - - .ecosystem-row { - .card { - background-color: $light_grey; - } - } - - .homepage-header { - background-color: rgba(0, 0, 0, 0.165); - } -} - -.homepage-feature-module { - padding-top: rem(40px); - padding-bottom: rem(40px); - - @include desktop { - padding-top: rem(62px); - padding-bottom: rem(72px); - - .module-button { - position: absolute; - right: 15px; - top: 0; - } - } - - p { - color: $dark_grey; - font-size: 1.125em; - } - - .title { - color: $black; - font-weight: 300; - font-size: rem(24px); - - @include small-desktop { - font-size: rem(20px); - } - } - - .pytorch-title { - font-size: rem(24px); - letter-spacing: 0.33px; - line-height: rem(36px); - } - - .subtext { - font-size: rem(18px); - color: #8c8c8c; - letter-spacing: 0; - line-height: rem(24px); - - @include small-desktop { - font-size: rem(15px); - } - } -} - -.key-features-module { - padding-bottom: 0; - - @include desktop { - padding-bottom: 1.55rem; - } - - .key-features-boxes { - margin-top: rem(32px); - @include desktop { - margin-top: 0; - } - } - - .key-feature-box { - margin-bottom: rem(32px); - - p { - margin-bottom: 0; - letter-spacing: 
0.25px; - } - - @include desktop { - margin-bottom: rem(40px); - } - } -} - -.community-heading { - margin-top: rem(32px); -} - -.community-module { - background-color: $white; - - .ecosystem-card { - height: auto; - - @include small-desktop { - padding: rem(10px); - } - } - - h2 { - margin-bottom: 0; - } - - h5 { - text-transform: uppercase; - color: #c6000a; - margin-bottom: rem(20px); - } - - .h2-subheadline { - margin-top: rem(20px); - margin-bottom: 2.6rem; - - @include desktop { - margin-top: 0; - } - } - - .card-body { - @include small-desktop { - padding: rem(10px); - } - } - - .module-button { - background-color: $light_grey; - } - - p { - margin-bottom: rem(40px); - letter-spacing: 0.25px; - } - - .module-subtext { - margin-right: rem(250px); - } - - .email-subscribe-form input.email { - border-bottom: 1px solid #d6d7d8; - font-size: rem(20px); - line-height: 0; - padding-bottom: rem(12px); - } - - .email-subscribe-form input[type="submit"] { - top: 6px; - @include desktop { - top: 10px; - } - } -} - -.pytorch-users-module, -.homepage-bottom-wrapper { - background-color: $light_grey; -} - -.pytorch-users-module { - @include desktop { - padding-bottom: 1.9rem; - } -} - -.community-avatar { - height: 60px; - width: 60px; -} - -.community-logo-bottom { - height: 200px; - background-color: $light_grey; -} - -.university-testimonials h2 { - margin-bottom: 2.2rem; -} - -.university-testimonials-content { - margin-top: rem(40px); - margin-bottom: 2rem; - - @include desktop { - margin-top: 0; - } - - .col-md-4 { - margin-bottom: rem(40px); - } - - .case-study-title { - font-size: rem(24px); - margin-bottom: rem(20px); - } - - p { - color: $dark_grey; - font-size: 1.125rem; - letter-spacing: 0.25px; - } - - .btn { - background-color: $white; - } -} - -.follow-us-on-twitter h2 { - margin-bottom: rem(20px); - @include desktop { - margin-bottom: rem(40px); - } -} - -.homepage-feature-module .tweets-wrapper p { - font-size: rem(16px); -} - -.quick-starts { - p { - font-size: rem(18px); - line-height: rem(28px); - } -} - -.quick-start-guides { - font-size: rem(24px); - letter-spacing: 0.25px; - line-height: rem(36px); - color: #a5a5a5; - - .step-counter { - margin-bottom: rem(3px); - } - - ul { - list-style-type: none; - padding-left: 0; - li { - margin-bottom: 0; - font-size: rem(18px); - - @include desktop { - margin-bottom: rem(12px); - &:last-of-type { - margin-bottom: 0; - } - } - - &.selected { - color: $orange; - &:before { - content: "\2022"; - position: absolute; - left: 0; - @include desktop { - left: -5px; - } - } - } - } - } - - .select-instructions { - color: $slate; - border-bottom: 2px solid #a5a5a5; - margin-bottom: rem(16px); - font-size: rem(18px); - display: inline-block; - @include desktop { - margin-bottom: 0; - } - } -} - -.homepage .news-banner-container { - background: $black; - color: $white; - text-align: center; - padding: 20px; - width: 90%; - - .right-arrow, .left-arrow { - height: 15px; - bottom: -3px; - position: relative; - @include desktop { - bottom: -8px; - } - &:hover { - cursor: pointer; - } - } - .right-arrow { - float: right; - } - .left-arrow { - float: left; - } -} - -.homepage #news-items { - .pagination { - display: none !important; - } -} - -.banner-info { - display: inline-block; - overflow: hidden; - text-overflow: ellipsis; - white-space: nowrap; - margin: auto; - width: 80%; - font-size: rem(18px); - @include desktop { - padding-top: 3px; - } - &:hover { - cursor: pointer; - color: $orange; - } -} - -.news-banner-text { - a { - color: white; - 
&:hover { - color: $orange; - } - } -} - -.no-banner { - padding-bottom: 2rem; -} - -.homepage-box-module { - div.col-md { - background: #F3F4F7; - margin: 10px; - padding: 30px; - - @include desktop { - margin: 20px; - } - } -} \ No newline at end of file diff --git a/_sass/hub-search.scss b/_sass/hub-search.scss deleted file mode 100644 index 6abaf474501e..000000000000 --- a/_sass/hub-search.scss +++ /dev/null @@ -1,106 +0,0 @@ -.hub .hub-search-wrapper { - @include desktop { - top: 8px; - } - .algolia-autocomplete .ds-dropdown-menu { - min-width: 100%; - max-width: 100% !important; - } - .algolia-autocomplete { - width: 100%; - } - &.active { - width: 100%; - } - span { - font-size: 1.125rem; - text-align: center; - } -} - -.hub #hub-search-icon { - @media (max-width: 480px) { - margin-top: 1rem; - } -} - -#hub-search-icon { - background-image: url($baseurl + "/assets/images/search-icon.svg"); - color: transparent; - opacity: 0.4; - width: 25px; - height: 25px; - margin-left: 3rem; - background-size: 15px 20px; - background-repeat: no-repeat; - right: 10px; - position: absolute; - z-index: 1; - cursor: pointer; - &:hover { - background-image: url($baseurl + "/assets/images/search-icon-orange.svg"); - opacity: 1; - } -} - -#hub-search-input { - background-color: $very_dark_grey; - border: none; - color: $black; - font-size: rem(18px); - font-weight: 300; - line-height: 20px; - outline: none; - position: relative; - display: none; - width: 100%; - border-radius: 5px; - padding: rem(14px) 0 rem(14px) rem(5px); -} - -#hub-close-search { - display: none; - margin-left: 20px; - opacity: 0.4; - right: 10px; - position: absolute; - z-index: 1; - cursor: pointer; - font-size: rem(18px); - @include desktop { - top: rem(18px); - } - &:hover { - color: $orange; - opacity: 1; - } -} - -.hub .hub-divider { - margin-bottom: 2.2rem; - margin-top: 1.5rem; -} - -.hub .active-hub-divider{ - border-color: $orange; -} - -.hub .hub-search-border { - display: flex; - align-items: center; - flex-direction: row; - border: none; - background-color: transparent; - border-radius: 20px; - width: 100%; -} - -.hub .hub-cards-wrapper { - z-index: 1000; -} - -.hub .nav-container { - display: flex; - width: 100%; - position: absolute; -} diff --git a/_sass/hub.scss b/_sass/hub.scss deleted file mode 100644 index cf3133e3f012..000000000000 --- a/_sass/hub.scss +++ /dev/null @@ -1,632 +0,0 @@ -.hub .jumbotron { - height: 300px; - @include desktop { - height: 420px; - } - - h1 { - color: $white; - #hub-header, #hub-sub-header { - font-weight: lighter; - } - #hub-sub-header { - } - } - - p.lead, p.hub-release-message { - margin-bottom: rem(25px); - padding-top: rem(25px); - color: $white; - - @include desktop { - width: 77%; - } - } - - p.hub-release-message { - padding-top: 0; - font-style: italic; - } - - svg { - margin-bottom: rem(20px); - } - - p.detail-lead { - padding-top: rem(50px); - color: $mid_gray; - width: 100%; - margin-bottom: 0px; - } - - p.lead-summary { - color: $dark_grey; - } -} - -.hub.hub-index .jumbotron { - height: 280px; - @include desktop { - height: 325px; - } -} - -.hub .detail-github-link { - background: $orange; - color: $white; -} - -.hub .detail-colab-link { - background: $yellow; - color: $black; -} - -.hub .detail-web-demo-link { - background: #4a9fb5; - color: $white; -} - -.hub { - .detail-colab-link, .detail-github-link, .detail-web-demo-link { - margin-top: 1rem; - } -} - -.hub { - .detail-button-container { - margin-top: rem(45px); - @include small-desktop { - margin-top: rem(20px); - } 
- @media (max-width: 320px) { - margin-top: rem(20px); - } - @media (max-width: 360px) { - margin-top: rem(20px); - } - } -} - -.hub a { - .detail-colab-link, .detail-github-link { - padding-right: rem(50px); - } -} - -.hub .detail-arrow { - color: $orange; - @include desktop { - font-size: 4.5rem; - } - font-size: 2.5rem; -} - -.hub .with-right-white-arrow { - padding-right: rem(32px); - position: relative; - background-image: url($baseurl + "/assets/images/chevron-right-white.svg"); - background-size: 6px 13px; - background-position: top 10px right 11px; - background-repeat: no-repeat; - @include desktop { - background-size: 8px 14px; - background-position: top 15px right 12px; - padding-right: rem(32px); - } -} - -.hub .main-content { - padding-top: rem(140px); - @include desktop { - padding-top: rem(135px); - } - @media (max-width: 320px) { - padding-top: rem(160px); - } -} - -.hub.hub-detail .main-content { - padding-top: rem(200px); - @include desktop { - padding-top: rem(150px); - } -} - -.hub.hub-detail .jumbotron { - height: 350px; - @include desktop { - height: 400px; - } -} - -.hub .main-content-wrapper { - background-color: $light_grey; - - @include desktop { - margin-top: 305px + $desktop_header_height; - } - margin-top: 300px; -} - -.hub-feedback-button { - border: 2px solid #e2e2e2; - color: #A0A0A1; - padding-left: 0; - padding-right: 5rem; - font-size: 1rem; - width: 13rem; - &:after { - bottom: -1px; - } -} - -.hub-flag { - background-image: url($baseurl + "/assets/images/feedback-flag.svg"); - background-size: 15px 20px; - background-position: center right 10px; - background-repeat: no-repeat; -} - -#hub-icons { - height: 2rem; - @media (max-width: 480px) { - position: initial; - padding-left: 0; - padding-top: 1rem; - } -} - -.hub.hub-detail .main-content-wrapper { - @include desktop { - margin-top: 300px + $desktop_header_height; - } - @include small-desktop { - margin-top: 400px + $desktop_header_height; - } - @media (max-width: 320px) { - margin-top: 330px; - } - margin-top: 305px; -} - -.hub .hub-cards-wrapper, .hub-cards-wrapper-right { - margin-bottom: rem(18px); - padding-top: rem(20px); - - .card-body { - .card-summary { - width: 75%; - } - .hub-image { - position: absolute; - top: 0px; - right: 0px; - height: 100%; - width: 25%; - img { - height: 100%; - width: 100%; - } - &:before { - content: ''; - position: absolute; - top: 0; - left: 0; - bottom: 0; - right: 0; - z-index: 1; - background: #000000; - opacity: .075; - } - } - } -} - -.hub .github-stars-count { - color: $mid_gray; - position: relative; - top: rem(4px); - font-size: 14px; - @include desktop { - top: rem(3px); - font-size: initial; - } -} - -.hub .github-stars-count-whole-number { - display: none; -} - -.hub .github-logo { - height: 15px; - width: 13px; -} - -.hub .icon-count-container { - display: inline-block; - vertical-align: text-bottom; - margin-left: rem(8px); -} - -.hub .detail-count { - font-size: rem(20px); -} - -.hub .main-stars-container { - display: flex; -} - -.hub .detail-stars-container { - display: inline-flex; - .github-stars-image { - margin-left: 0; - } -} - -.hub .card-body { - .hub-card-title-container { - width: 75%; - display: inline-flex; - max-width: rem(300px); - .experimental-badge { - text-transform: uppercase; - margin-left: rem(15px); - background-color: #e4e4e4; - color: $not_quite_black; - opacity: 0.75; - font-size: rem(10px); - letter-spacing: 1px; - line-height: rem(22px); - height: rem(20px); - width: rem(96px); - text-align: center; - margin-top: rem(4px); - 
} - .card-title { - padding-left: 0; - font-size: 1.5rem; - color: #262626; - } - .star-list { - list-style: none; - padding-left: 0; - li { - display: inline; - } - li.github-stars-count-whole-number { - display: none; - } - } - } -} - -.hub .hub-filter-menu { - ul { - list-style-type: none; - padding-left: rem(20px); - li { - padding-right: rem(20px); - word-break: break-all; - - a { - color: $mid_gray; - &:hover { - color: $orange; - } - } - } - } -} - -.hub .hub-filter { - cursor: pointer; -} - -.hub-index { - #dropdownSortLeft { - color: $mid_gray; - cursor: pointer; - z-index: 1; - position: absolute; - top: inherit; - left: 23%; - max-width: 4rem; - @media(min-width: 480px) and (max-width: 590px) { - left: 40%; - } - } -} - -.hub #dropdownFilter, #dropdownSort, #dropdownSortLeft { - color: $mid_gray; - cursor: pointer; - z-index: 1; - position: absolute; - top: 11rem; - right: 1rem; - left: inherit; - @media(min-width: 480px) and (max-width: 590px) { - top: 7rem; - } - @media(min-width: 590px) { - top: 5rem; - } - @include desktop { - top: 5rem; - } -} - -.hub .sort-menu { - left: inherit; - right: 1rem; - top: 12.5rem; - max-width: 12rem; - @media(min-width: 480px) and (max-width: 590px) { - top: 8.5rem; - } - @media(min-width: 590px) and (max-width: 900px) { - top: 6.5rem; - } - @media(min-width: 900px) and (max-width: 1239px) { - top: 6.5rem; - } - @include max-width-desktop { - right: 0; - top: 6.5rem; - } -} - -.hub-index .sort-menu { - left: 23%; - top: inherit; - max-width: 12rem; -} - -.hub .research-hub-title, -.research-hub-sub-title { - text-transform: uppercase; - letter-spacing: 1.78px; - line-height: rem(32px); -} - -.research-hub-sub-title { - padding-bottom: rem(20px); -} - -.hub .research-hub-title { - color: $orange; -} - -.hub .all-models-button, .full-docs-button { - font-size: 1.125rem; - position: relative; - cursor: pointer; - outline: none; - padding: rem(10px) rem(30px) rem(10px) rem(20px); - background-color: $white; - margin-bottom: 0.125rem; - border: 2px solid $light_grey; - letter-spacing: -0.25px; - line-height: rem(28px); - color: $dark_grey; - background-image: url($baseurl + "/assets/images/chevron-right-orange.svg"); - background-size: 6px 13px; - background-position: center right 10px; - background-repeat: no-repeat; - a { - color: $dark_grey; - } - - @include animated_border_hover_state; -} - -.hub .hub-column { - padding-bottom: rem(75px); -} - -.hub.hub-index .hub-column { - padding-bottom: 0; -} - -.hub .how-it-works { - padding-top: rem(50px); - padding-bottom: rem(45px); - .how-it-works-text { - color: $dark_grey; - font-size: rem(20px); - letter-spacing: 0; - line-height: rem(30px); - } - .how-it-works-title-col { - padding-bottom: rem(55px); - } - .full-docs-button { - margin-top: rem(30px); - } -} - -.hub .hub-code-text { - font-size: 80%; - color: $not_quite_black; - background-color: $light_white; - padding: 2px; -} - -.hub .hub-code-block { - display: block; - border-left: 3px solid $orange; - padding: rem(20px) rem(25px) rem(20px) rem(25px); - margin-bottom: rem(60px); -} - -.hub pre.highlight { - background-color: $light_white; - border-left: 2px solid $orange; -} - -.hub code.highlighter-rouge { - background-color: $light_white; -} - -.hub article { - padding-top: rem(20px); - @include desktop { - padding-top: 0; - } - p { - color: $slate; - } -} - -.hub .hub-detail-background { - @include desktop { - height: 515px; - } -} - -.hub .dropdown-menu { - border-radius: 0; - padding-bottom: 0; -} - -.hub .card { - &:hover { - 
.hub-image:before { - bottom: 100%; - } - } -} - -.hub.hub.hub-detail { - .github-stars-image { - img { - @include desktop { - height: 10px - } - height: 9px - } - } -} - -.hub #development-models-hide, #research-models-hide { - display: none; -} - -.hub .col-md-6.hub-column { - @media (min-width: 768px) { - flex: 0 0 100%; - max-width: 100%; - } - - @include max-width-desktop { - flex: 0 0 50%; - max-width: 50%; - } -} - -.hub .col-md-12.hub-column { - .col-md-6 { - @media (min-width: 768px) { - flex: 0 0 100%; - max-width: 100%; - } - - @include max-width-desktop { - flex: 0 0 100%; - max-width: 50%; - } - } -} - -.hub .featured-image { - padding-bottom: rem(20px); -} - -.hub .coming-soon { - font-weight: 300; - font-style: italic; -} - -.hub.hub-index .jumbotron { - @include desktop { - height: 325px; - } - h1 { - @include desktop { - padding-top: rem(55px); - } - padding-top: 0; - } - p.lead { - padding-top: rem(55px); - } -} - -.hub.hub-index .main-content-wrapper { - @include desktop { - margin-top: 190px + $desktop_header_height; - } - margin-top: 210px; -} - -.hub .page-link { - font-size: rem(20px); - letter-spacing: 0; - line-height: rem(34px); - color: $orange; - width: rem(120px); - text-align: center; -} - -.hub .filter-btn { - color: $mid_gray; - border: 1px solid $mid_gray; - display: inline-block; - text-align: center; - white-space: nowrap; - vertical-align: middle; - padding: 0.375rem 0.75rem; - font-size: 1rem; - line-height: 1.5; - margin-bottom: 5px; - &:hover { - border: 1px solid $orange; - color: $orange; - } -} - -.hub .selected { - border: 1px solid $orange; - background-color: $orange; - color: $white; - &:hover { - color: $white; - } -} - -.hub .all-tag-selected { - background-color: $mid_gray; - color: $white; - &:hover { - border-color: $mid_gray; - color: $white; - } -} - -.hub .pagination { - .page { - border: 1px solid #dee2e6; - padding: 0.5rem 0.75rem; - } - - .active .page { - background-color: #dee2e6; - } -} - -.hub .hub-tags-container { - width: 60%; - &.active { - width: 0; - } -} diff --git a/_sass/jumbotron.scss b/_sass/jumbotron.scss deleted file mode 100644 index 60817bd8fba3..000000000000 --- a/_sass/jumbotron.scss +++ /dev/null @@ -1,73 +0,0 @@ -.jumbotron { - background-color: transparent; - position: absolute; - left: 0; - right: 0; - margin-right: auto; - margin-left: auto; - padding: 0; - margin-bottom: 0; - display: flex; - align-items: center; - top: $mobile_header_height; - - @include desktop { - height: 550px; - top: $desktop_header_height; - } - - .jumbotron-content { - display: flex; - align-items: center; - } - - .lead { - font-weight: 400; - letter-spacing: 0.25px; - font-size: 20px; - line-height: 1.2; - @include desktop { - font-size: 29px; - } - } - - h1 { - font-size: rem(32px); - text-transform: uppercase; - font-weight: lighter; - letter-spacing: 1.08px; - margin-bottom: rem(10px); - line-height: 1.05; - margin-top: 4rem; - - @include desktop { - font-size: rem(62px); - margin-top: 0; - } - - img { - margin-bottom: 1rem; - } - } - - p { - font-size: rem(18px); - margin-bottom: rem(20px); - @include full-nav-menu-desktop { - width: 50%; - } - } - - &.on-dark-background { - h1, p { - color: $white; - } - } - - .btn { - padding-top: rem(9px); - @include desktop { - margin-top: rem(10px); - } - } -} diff --git a/_sass/main-content.scss b/_sass/main-content.scss deleted file mode 100644 index 42e4f6ae8e0e..000000000000 --- a/_sass/main-content.scss +++ /dev/null @@ -1,36 +0,0 @@ -.main-content-wrapper { - margin-top: 300px; - - 
@include desktop { - margin-top: 450px + $desktop_header_height; - min-height: 400px; - } -} - -.main-content { - padding-top: rem(24px); - padding-bottom: rem(24px); - - @include desktop { - padding-top: 2.625rem; - } -} - -.main-content-menu { - margin-bottom: rem(20px); - @include desktop { - margin-bottom: rem(80px); - } - - .navbar-nav .nav-link { - color: $slate; - padding-left: rem(30px); - padding-right: rem(30px); - - @include desktop { - &:first-of-type { - padding-left: 0; - } - } - } -} diff --git a/_sass/mobile.scss b/_sass/mobile.scss deleted file mode 100644 index 58b659399681..000000000000 --- a/_sass/mobile.scss +++ /dev/null @@ -1,133 +0,0 @@ -.mobile article { - margin-bottom: rem(80px); -} - -.mobile .main-background { - height: 275px; - - @include desktop { - height: 380px; - } -} - -.mobile .main-content-wrapper { - margin-top: 275px; - @include desktop { - margin-top: 260px + $desktop_header_height; - } -} - -.mobile .jumbotron { - height: 190px; - @include desktop { - height: 260px; - } -} - -.mobile .main-content .navbar { - background-color: $light_grey; - - padding-left: 0; - padding-bottom: 0; - padding-top: 0; - - @media (min-width: 992px) { - li:first-of-type { - padding-left: rem(55px); - } - - .nav-item { - padding: 2rem; - cursor: pointer; - } - - .nav-link { - position: relative; - top: 10%; - transform: translateY(-50%); - } - } - - .nav-select { - background-color: $white; - .nav-link { - color: $orange; - font-weight: 500; - } - } - - .nav-link { - font-size: rem(18px); - color: #8c8c8c; - @include desktop { - margin-left: rem(30px); - } - &:hover { - color: $orange; - } - } - - .nav-item { - padding-top: rem(15px); - padding-bottom: rem(15px); - @include desktop { - padding-bottom: 0; - padding-top: 2rem; - } - @include small-desktop { - padding-bottom: 0; - padding-top: 2rem; - } - @media (max-width: 990px) { - padding-bottom: rem(10px); - padding-top: 1rem; - } - } - - .navbar-toggler { - margin-left: rem(40px); - } -} - -.mobile .main-content { - padding-top: 0; - @include desktop { - padding-top: 1.9rem; - } -} - -.mobile .nav-menu-wrapper { - background-color: $light_grey; -} - -.mobile .navbar-nav { - flex-direction: row; -} - -.mobile .mobile-page-sidebar { - padding-top: rem(40px); - padding-bottom: rem(40px); - top: 15%; - - @include desktop { - padding-top: 0; - } - - ul { - padding-left: 0; - } - - li { - list-style-type: none; - line-height: 23px; - margin-bottom: 15px; - - a { - color: #8c8c8c; - &.active, - &:hover { - color: $orange; - } - } - } -} diff --git a/_sass/navigation.scss b/_sass/navigation.scss deleted file mode 100644 index 420978c613c1..000000000000 --- a/_sass/navigation.scss +++ /dev/null @@ -1,443 +0,0 @@ -.header-holder { - height: $mobile_header_height; - - @include full-nav-menu-desktop { - height: $desktop_header_height - 20px; - } - - align-items: center; - display: flex; - left: 0; - margin-left: auto; - margin-right: auto; - position: fixed; - right: 0; - top: 0; - @include full-nav-menu-desktop { - top: 32px; - } - width: 100%; - z-index: 9999; - - &.blog-header, - &.blog-detail-header, - &.resources-header, - &.get-started-header, - &.features-header, - &.comm-stories-header, - &.ecosystem-header, - &.announcement-header, - &.hub-header, - &.mobile-header { - background-color: $white; - border-bottom: 1px solid #e2e2e2; - } -} - -.hello-bar { - display: none; - @include full-nav-menu-desktop { - background-color: #CC2F90; - color: $white; - display: flex; - letter-spacing: .34px; - justify-content: center; - 
padding: 4px 0; - position: fixed; - top: 0; - text-align: center; - z-index: 9999; - margin-left: auto; - margin-right: auto; - width: 100%; - a { - color: $white; - text-decoration: underline; - } - } -} - -.header-container { - position: relative; - display: flex; - align-items: center; - @include clearfix; - - @include full-nav-menu-desktop { - display: block; - } -} - -.header-logo { - height: 23px; - width: 93px; - background-image: url($baseurl + "/assets/images/logo.svg"); - background-repeat: no-repeat; - background-size: 93px 23px; - display: block; - float: left; - - @include full-nav-menu-desktop { - background-size: 108px 27px; - position: absolute; - height: 27px; - width: 108px; - top: 4px; - float: none; - } -} - -.main-menu-open-button { - background-image: url($baseurl + "/assets/images/icon-menu-dots.svg"); - background-position: center center; - background-size: 25px 7px; - background-repeat: no-repeat; - width: 25px; - height: 7px; - position: absolute; - right: 0; - top: 4px; - @include full-nav-menu-desktop { - display: none; - } -} - -.header-holder .main-menu { - display: none; - - @include full-nav-menu-desktop { - display: flex; - align-items: center; - justify-content: flex-end; - } - - ul { - display: flex; - align-items: center; - margin: 0; - } - - ul li { - display: inline-block; - margin-right: 34px; - position: relative; - - &.active { - &:after { - content: "•"; - bottom: -24px; - color: $orange; - font-size: rem(22px); - left: 0; - position: absolute; - right: 0; - text-align: center; - } - - a { - color: $orange; - } - - .with-down-arrow { - background-image: url($baseurl + "/assets/images/chevron-down-orange.svg"); - } - } - - &.resources-active:after { - left: -27px; - } - - &:last-of-type { - margin-right: 0; - } - } - - ul li a { - color: $white; - font-size: 1.2rem; - letter-spacing: 0; - line-height: rem(34px); - text-align: center; - text-decoration: none; - padding-bottom: 10px; - @include full-nav-menu-desktop { - &:hover { - color: #ffffff; - border-bottom: 2px solid #ffffff; - } - } - - &.with-down-arrow { - cursor: default; - padding-right: rem(32px); - position: relative; - background-image: url($baseurl + "/assets/images/chevron-down-white.svg"); - background-size: 14px 18px; - background-position: top 7px right 10px; - background-repeat: no-repeat; - padding-bottom: 20px; - &:hover { - border-bottom: none; - } - - .dropdown-menu { - border-radius: 0; - padding: 0; - - .dropdown-item { - color: $dark_grey; - border-bottom: 1px solid #e2e2e2; - &:last-of-type { - border-bottom-color: transparent; - } - - &:hover { - background-color: $orange; - } - - p { - font-size: rem(16px); - color: #757575; - } - } - - a.dropdown-item { - &:hover { - color: $white; - p { - color: $white; - } - } - } - } - } - } -} - -.mobile-main-menu { - display: none; - &.open { - background-color: $slate; - display: block; - height: 100%; - left: 0; - margin-left: auto; - margin-right: auto; - min-height: 100%; - position: fixed; - right: 0; - top: 0; - width: 100%; - z-index: 99999; - } -} - -.mobile-main-menu .container-fluid { - background-color: inherit; - align-items: center; - display: flex; - height: $mobile_header_height; - position: relative; - @include clearfix; - z-index: 1; -} - -.mobile-main-menu.open { - ul { - list-style-type: none; - padding: 0; - } - - ul li a, .resources-mobile-menu-title { - font-size: rem(32px); - color: $white; - letter-spacing: 0; - line-height: rem(64px); - } - - ul li.active a { - color: $orange; - } -} - 
-.main-menu-close-button { - background-image: url($baseurl + "/assets/images/icon-close.svg"); - background-position: center center; - background-repeat: no-repeat; - background-size: 24px 24px; - height: 24px; - position: absolute; - right: 0; - width: 24px; - top: -4px; -} - -.mobile-main-menu-header-container { - position: relative; -} - -.mobile-main-menu-links-container { - display: flex; - padding-left: rem(45px); - height: 100%; - min-height: 100%; - margin-top: 20px; - overflow-y: scroll; - @media only screen and (max-width: 320px) { - .main-menu { - padding-top: 5rem; - } - } - - .navSearchWrapper { - @media only screen and (max-width: 320px) { - width: 75%; - } - } -} - -#topnav-gh-icon { - background-image: url(/assets/social/github-white.svg); - color: white; - width: 33px; - height: 33px; - background-size: 23px 23px; - background-repeat: no-repeat; - background-position: 5px 4px; - border-radius: 25px; - &:hover { - background-color:#88888833; - } -} - -.blog-header, -.blog-detail-header, -.resources-header, -.get-started-header, -.features-header, -.ecosystem-header, -.announcement-header, -.comm-stories-header, -.hub-header, -.mobile-header { - .header-logo { - background-image: url($baseurl + "/assets/images/logo-dark.svg"); - } - - .main-menu ul li a { - color: $not_quite_black; - @include full-nav-menu-desktop { - &:hover { - color: $not_quite_black; - border-bottom: 2px solid $not_quite_black; - } - } - &.with-down-arrow { - background-image: url($baseurl + "/assets/images/chevron-down-black.svg"); - } - } - - .main-menu-open-button { - background-image: url($baseurl + "/assets/images/icon-menu-dots-dark.svg"); - } - - #topnav-gh-icon { - background-image: url(/assets/social/github-black.svg); - } -} - -.ecosystem-dropdown-menu, .resources-dropdown-menu { - left: -25px; - width: 300px; - display: none; - position: absolute; - z-index: 1000; - display: none; - top: 45px; - float: left; - min-width: 10rem; - padding: 0.5rem 0; - font-size: 1rem; - color: #212529; - text-align: left; - list-style: none; - background-color: $white; - background-clip: padding-box; - border: 1px solid rgba(0, 0, 0, 0.15); - border-radius: 0.25rem; -} - -.ecosystem-dropdown:hover, .resources-dropdown:hover, .resources-active:hover { - .ecosystem-dropdown-menu, .resources-dropdown-menu { - display: block; - } -} - -.main-menu ul li { - .ecosystem-dropdown-menu, .resources-dropdown-menu { - border-radius: 0; - padding: 0; - } -} - -.main-menu ul li { - .ecosystem-dropdown-menu, .resources-dropdown-menu { - .dropdown-item { - color: #6c6c6d; - border-bottom: 1px solid #e2e2e2; - } - } -} - -.header-holder .main-menu ul li a.nav-dropdown-item { - display: block; - font-size: rem(16px); - line-height: rem(21px); - width: 100%; - padding: 0.25rem 1.5rem; - clear: both; - font-weight: 400; - color: #757575; - text-align: left; - background-color: transparent; - border-bottom: 1px solid #e2e2e2; - p { - margin-bottom: .5rem; - } - &:last-of-type { - border-bottom-color: transparent; - } - &:hover { - background-color: $orange; - color: white; - } - .dropdown-title { - font-size: rem(18px); - color: #212529; - letter-spacing: 0; - line-height: 34px; - } - .docs-title { - display: block; - padding-top: 0.5rem; - } -} - -.header-holder .main-menu ul li a.nav-dropdown-item:hover .dropdown-title { - background-color: $orange; - color: white; -} - -.mobile-main-menu-links-container { - ul.resources-mobile-menu-items { - li { - padding-left: 15px; - a { - font-size: rem(24px); - line-height: rem(48px); - } 
- } - } -} - - diff --git a/_sass/quick-start-module.scss b/_sass/quick-start-module.scss deleted file mode 100644 index 884df6705cbd..000000000000 --- a/_sass/quick-start-module.scss +++ /dev/null @@ -1,435 +0,0 @@ -.quick-starts { - background: $light_grey; - - .col-md-2-4 { - position: relative; - width: 100%; - min-height: 1px; - padding-right: 15px; - padding-left: 15px; - } - - @media (min-width: 768px) { - .col-md-2-4 { - -webkit-box-flex: 0; - -ms-flex: 0 0 20%; - flex: 0 0 20%; - max-width: 20%; - } - } - - .start-locally-col { - margin-bottom: rem(20px); - .row.ptbuild, - .row.os, - .row.package, - .row.language, - .row.cuda { - margin-bottom: rem(20px); - @include desktop { - margin-bottom: 0; - } - } - - @include small-desktop { - flex: 0 0 100%; - max-width: 100%; - } - - @include desktop { - margin-bottom: rem(40px); - - .row { - margin-bottom: 0; - } - } - - @include max-width-desktop { - margin-bottom: 0; - } - - pre { - font-size: 80% !important; - background-color: #ffffff !important; - } - - .prev-versions-btn { - margin-top: 30px; - } - } - - .cloud-options-col { - @include small-desktop { - flex: 0 0 100%; - max-width: 100%; - margin-left: 0; - margin-top: rem(20px); - } - } - - p { - font-size: rem(18px); - line-height: rem(28px); - } - - .card-body { - flex: 1 1 auto; - } - - .cloud-option-image { - margin-left: rem(15px); - margin-right: rem(25px); - margin-bottom: rem(5px); - } - - .cloud-option-row { - margin-left: 0; - cursor: pointer; - } - - .option { - border: 2px solid $light_grey; - font-size: rem(16px); - color: $quick_start_grey; - letter-spacing: -0.22px; - line-height: rem(20px); - background: $white; - cursor: pointer; - } - - .option:hover { - background-color: $orange; - color: $white; - } - - .selected { - background-color: $orange; - color: $white; - } - - .block { - margin-bottom: rem(1px); - height: rem(40px); - display: flex; - align-items: center; - } - - .title-block { - margin: rem(1px); - height: rem(40px); - border: 2px solid $light_grey; - font-size: rem(16px); - color: $quick_start_grey; - line-height: rem(20px); - display: flex; - align-items: center; - } - - .title-block:before { - display: block; - content: "."; - color: transparent; - border-left: 2px solid $smoky_grey; - height: 100%; - position: absolute; - left: 0; - } - - #command { - color: #4a4a4a; - background-color: $white; - padding: rem(15px); - border: 2px solid $light_grey; - word-wrap: break-word; - display: table-cell; - vertical-align: middle; - - a { - font-size: 125%; - - @include desktop { - &:hover { - color: $orange; - } - } - } - - pre { - word-break: break-all; - white-space: normal; - } - } - - .command-container { - display: table; - width: 100%; - @include desktop { - min-height: rem(84px); - } - pre { - margin-bottom: 0px; - padding: 0px; - font-size: 75%; - background-color: #f3f4f7; - } - } - - .command-block { - height: rem(84px); - word-wrap: break-word; - color: $command_block_black; - } - - .command-block:before { - border-left: 2px solid $black; - } - - .quick-start-link { - color: $quick_start_grey; - } - - .mobile-heading { - @include desktop { - display: none; - } - display: flex; - align-items: center; - font-weight: 400; - } - - .command-mobile-heading { - @include desktop { - display: none; - } - display: flex; - align-items: center; - font-weight: 400; - color: $black; - } - - .headings { - display: none; - @include desktop { - display: block; - } - } - - .cloud-options-col { - margin-top: rem(20px); - @include desktop { - margin-top: 0; - } - } 
- - @media (max-width: 978px) { - .os-text { - margin-top: 0; - } - } -} - -.quick-start-guides { - font-size: rem(18px); - letter-spacing: 0.25px; - line-height: rem(36px); - color: $medium_smoky_grey; - - .select-instructions { - color: $slate; - border-bottom: 2px solid $medium_smoky_grey; - margin-bottom: rem(16px); - display: inline-block; - @include desktop { - margin-bottom: 0; - } - } -} - -.quick-start-module { - .option-module { - float: right; - } - - padding-top: rem(40px); - padding-bottom: rem(40px); - - @include desktop { - padding-top: rem(64px); - padding-bottom: rem(66px); - } - - p { - color: $dark_grey; - font-size: 1.125em; - letter-spacing: 0.25px; - padding-bottom: rem(15px); - margin-bottom: 1.4rem; - } - - h3 { - font-size: rem(24px); - letter-spacing: 1.33px; - line-height: rem(32px); - text-transform: uppercase; - margin-bottom: 2.1rem; - } -} - -.quick-starts .cloud-option-body { - display: flex; - align-items: center; - height: 64px; - padding: 0 0 0 rem(80px); - - @include animated_border_hover_state; - - @include desktop { - padding-right: rem(32px); - } - - @include small-desktop { - padding-right: rem(20px); - } - - position: relative; - background-image: url($baseurl + "/assets/images/chevron-right-orange.svg"); - background-size: 6px 13px; - background-position: center right 15px; - background-repeat: no-repeat; - - @include desktop { - background-size: 8px 14px; - } - - &:before { - opacity: 0.5; - position: absolute; - left: rem(30px); - top: 21px; - } - - &.aws:before { - content: url($baseurl + "/assets/images/aws-logo.svg"); - } - - &.microsoft-azure:before { - content: url($baseurl + "/assets/images/microsoft-azure-logo.svg"); - } - - &.lightning-studios:before { - content: url($baseurl + "/assets/images/lightning-studios-logo.svg"); - } - - &.google-cloud:before { - content: url($baseurl + "/assets/images/google-cloud-logo.svg"); - } - - &.colab:before { - content: url($baseurl + "/assets/images/colab-logo.svg"); - } - - @include desktop { - &:hover:before { - opacity: 1; - } - } -} - -.quick-starts .cloud-option { - background-color: $white; - margin-bottom: rem(2px); - border: 2px solid $light_grey; - font-size: rem(18px); - letter-spacing: -0.25px; - line-height: rem(30px); - color: $not_quite_black; - - #microsoft-azure { - p{ - color: $not_quite_black; - margin: 0; - padding: 0; - font-size: inherit; - line-height: 1.3rem; - } - span { - margin-bottom: 0; - padding-bottom: 0; - color: $orange; - padding: 0px 35px 0px 8px; - font-style: italic; - line-height: 1.3rem; - } - } - - @include small-desktop { - font-size: rem(16px); - } - - ul { - display: none; - width: 100%; - margin: 0 0 rem(20px) 0; - padding: 0; - - li { - margin-top: 0; - position:relative; - padding-left: rem(80px); - - @include small-desktop { - font-size: rem(16px); - } - - a { - color: $quick_start_grey; - letter-spacing: -0.25px; - line-height: 30px; - - @include desktop { - &:hover { - color: $orange; - } - } - } - - @include desktop { - &:hover:before { - content: "\2022"; - color: $orange; - position: absolute; - left: 36px; - } - } - - &:first-of-type { - margin-top: rem(20px); - } - } - } - - &.open { - .cloud-option-body { - background-image: url($baseurl + "/assets/images/chevron-down-orange.svg"); - background-size: 14px 14px; - border-bottom: 1px solid $orange; - color: $not_quite_black; - - @include desktop { - border-bottom: none; - } - - &:after { - width: 100%; - } - - &:before { - opacity: 1; - } - } - - ul { - display: block; - } - } -} diff --git 
a/_sass/resources.scss b/_sass/resources.scss deleted file mode 100644 index 2cd925bab3b6..000000000000 --- a/_sass/resources.scss +++ /dev/null @@ -1,29 +0,0 @@ -.resources .jumbotron { - align-items: flex-end; - color: $white; - height: 220px; - @include desktop { - height: 300px; - } - h1 { - padding-top: rem(135px); - } - p.lead { - margin-bottom: rem(25px); - padding-top: rem(20px); - } -} - -.resources .main-content-wrapper { - margin-top: 385px; - margin-bottom: 0.75rem; - @include desktop { - margin-top: 475px; - } -} - -.resources .resource-card { - @include desktop { - margin-bottom: rem(36px); - } -} diff --git a/_sass/search.scss b/_sass/search.scss deleted file mode 100644 index fa9a119aad42..000000000000 --- a/_sass/search.scss +++ /dev/null @@ -1,365 +0,0 @@ -/* Search */ -input[type='search'] { - -moz-appearance: none; - -webkit-appearance: none; -} - -.navSearchWrapper { - align-items: center; - align-self: center; - display: flex; - justify-content: center; - position: relative; - right: 10px; - top: 15px; - margin-left: 0; - padding-bottom: 20px; - @include desktop { - position: absolute; - margin-left: 30px; - display: block; - padding-left: 3px; - padding-bottom: 0; - } -} - -.tabletSearchWrapper { - top: 0px; - - @include small-desktop { - padding-bottom: 20px; - position: relative; - margin-left: 0; - } -} - -.navSearchWrapper .aa-dropdown-menu { - background: #f9f9f9; - border: 3px solid rgba(57, 57, 57, 0.25); - color: #393939; - font-size: rem(14px); - left: auto !important; - line-height: 1.2em; - right: 0 !important; -} - -.navSearchWrapper - .aa-dropdown-menu - .algolia-docsearch-suggestion--category-header { - background: $black; - color: white; - font-size: rem(14px); - font-weight: 400; -} - -.navSearchWrapper - .aa-dropdown-menu - .algolia-docsearch-suggestion--category-header - .algolia-docsearch-suggestion--highlight { - background-color: $black; - color: #fff; -} - -.navSearchWrapper - .aa-dropdown-menu - .algolia-docsearch-suggestion--title - .algolia-docsearch-suggestion--highlight, -.navSearchWrapper - .aa-dropdown-menu - .algolia-docsearch-suggestion--subcategory-column - .algolia-docsearch-suggestion--highlight { - color: $black; -} - -.navSearchWrapper .aa-dropdown-menu .algolia-docsearch-suggestion__secondary, -.navSearchWrapper - .aa-dropdown-menu - .algolia-docsearch-suggestion--subcategory-column { - border-color: rgba(57, 57, 57, 0.3); -} - -.navSearchWrapper .algolia-autocomplete .algolia-docsearch-suggestion--subcategory-column { - @include desktop { - word-wrap: normal; - } -} - -input#search-input { - background-color: inherit; - border: none; - border-radius: 20px; - color: $black; - font-size: rem(18px); - font-weight: 300; - line-height: 20px; - outline: none; - padding-left: 25px; - position: relative; - -webkit-transition: 0.5s width ease; - -moz-transition: 0.5s width ease; - -o-transition: 0.5s width ease; - transition: 0.5s width ease; - display: none; - width: 220px; - background-image: url($baseurl + "/assets/images/search-icon.svg"); - background-size: 12px 15px; - background-repeat: no-repeat; - background-position: 8px 5px; - &:hover { - background-image: url($baseurl + "/assets/images/search-icon-orange.svg"); - } -} - -input#mobile-search-input { - font-size: 2rem; - background-color: transparent; - color: $white; - border: none; - outline: none; - padding-left: 25px; - position: relative; - border-top-left-radius: 20px; - border-bottom-left-radius: 20px; - width: 300px; - display: block; -} - -input#search-input:focus, 
-input#search-input:active { - color: $black; -} -.navigationSlider .slidingNav .navSearchWrapper .algolia-docsearch-footer a { - height: auto; -} -@media only screen and (max-width: 735px) { - .navSearchWrapper { - width: 100%; - } -} - -input::-webkit-input-placeholder { - color: #e5e5e5; -} - -input::-moz-placeholder { - color: #e5e5e5; -} - -input::placeholder { - color: #e5e5e5; -} - -.hljs { - padding: 1.25rem 1.5rem; -} - -@media only screen and (max-width: 1024px) { - .reactNavSearchWrapper input#search-input { - background-color: rgba(242, 196, 178, 0.25); - border: none; - border-radius: 20px; - box-sizing: border-box; - color: #393939; - font-size: rem(14px); - line-height: 20px; - outline: none; - padding-left: 25px; - position: relative; - transition: background-color 0.2s cubic-bezier(0.68, -0.55, 0.265, 1.55), - width 0.2s cubic-bezier(0.68, -0.55, 0.265, 1.55), color 0.2s ease; - width: 100%; - } - - .reactNavSearchWrapper input#search-input:focus, - .reactNavSearchWrapper input#search-input:active { - background-color: $black; - color: #fff; - } - - .reactNavSearchWrapper .algolia-docsearch-suggestion--subcategory-inline { - display: none; - } - - .reactNavSearchWrapper > span { - width: 100%; - } - - .reactNavSearchWrapper .aa-dropdown-menu { - font-size: rem(12px); - line-height: 2em; - padding: 0; - border-width: 1px; - min-width: 500px; - } - .reactNavSearchWrapper .algolia-docsearch-suggestion__secondary { - border-top: none; - } - .aa-suggestions { - min-height: 140px; - max-height: 60vh; - -webkit-overflow-scrolling: touch; - overflow-y: scroll; - } -} - -@media only screen and (min-width: 1024px) { - .navSearchWrapper { - padding-left: 10px; - position: relative; - right: auto; - top: auto; - @include desktop { - padding-left: 3px; - right: 10px; - margin-left: 0; - } - } - - .navSearchWrapper .algolia-autocomplete { - display: block; - } - - .tabletSearchWrapper { - right: 10px; - } -} - -@media only screen and (max-width: 735px) { - .reactNavSearchWrapper .aa-dropdown-menu { - min-width: 400px; - } -} -@media only screen and (max-width: 475px) { - .reactNavSearchWrapper .aa-dropdown-menu { - min-width: 300px; - } -} - -.search-border { - display: none; - flex-direction: row; - border: none; - background-color: transparent; - border-radius: 20px; - width: 100%; - float: right; - @include desktop { - display: flex; - } -} - -.mobile-search-border { - flex-direction: row; - border: none; - background-color: rgba(256, 256, 256, 0.1); - border-radius: 20px; - width: 100%; - float: right; - display: flex; - @include small-desktop { - border-radius: 25px; - } -} - -#close-search { - color: $orange; - padding-right: 10px; - font-size: .99em; - display: none; - cursor: pointer; -} - -.active-header { - margin-top: -1px; -} - -.active-search-icon { - background-image: url($baseurl + "/assets/images/search-icon-orange.svg") !important; - display: inline-block !important; -} - -.active-background { - background-color: $light_grey; - width: 50%; - padding: 4px; -} - -.homepage-header { - input#search-input { - background-image: url($baseurl + "/assets/images/search-icon-white.svg"); - color: $white; - } - input#search-input:focus, - input#search-input:active { - color: $white; - } - .active-background { - background-color:#88888833; - } - #close-search { - color: $white; - opacity: 0.5; - &:hover { - color: $orange; - } - } - #search-icon { - background-image: url(/assets/images/search-icon-white.svg); - &:hover { - background-color:#88888833; - } - } -} - -#search-icon { - 
background-image: url(/assets/images/search-icon.svg); - color: transparent; - width: 33px; - height: 33px; - background-size: 21px 21px; - background-repeat: no-repeat; - background-position: 6px 5px; - border-radius: 25px; - cursor: pointer; - &:hover { - background-color: $light_grey; - } -} - -#mobile-search-icon { - background-image: url(/assets/images/search-icon-white.svg); - width: 30px; - height: 38px; - background-size: 16px 28px; - background-repeat: no-repeat; - background-position: 0px 5px; - cursor: pointer; - border-top-right-radius: 20px; - border-bottom-right-radius: 20px; - @include small-desktop { - height: 50px; - width: 35px; - background-size: 20px 42px; - } -} - -.navSearchWrapper { - .algolia-autocomplete .ds-dropdown-menu { - min-width: 330px; - height: 500px; - overflow-y: scroll; - @include desktop { - height: auto; - min-width: 700px; - overflow-y: hidden; - } - @include small-desktop { - height: 700px; - overflow-y: scroll; - } - @media (min-width: 769px) and (max-width: 1024px) { - min-width: 950px; - } - } -} -/* End of Search */ diff --git a/_sass/similar-posts-module.scss b/_sass/similar-posts-module.scss deleted file mode 100644 index 71d804f22bee..000000000000 --- a/_sass/similar-posts-module.scss +++ /dev/null @@ -1,55 +0,0 @@ -.similar-posts-module { - background: $light_grey; - - p.blog-date { - font-size: rem(18px); - color: $very_dark_grey; - letter-spacing: 0; - line-height: rem(24px); - } - - h4 { - a { - font-family: FreightSans; - font-size: rem(24px); - color: $black; - letter-spacing: 0; - line-height: rem(32px); - font-weight: 400; - } - } - - .module-content { - .navbar-nav { - margin-top: rem(60px); - } - - .module-heading { - text-transform: uppercase; - color: $black; - font-size: rem(24px); - letter-spacing: rem(1.33px); - line-height: rem(32px); - font-weight: 400; - } - - .nav-item:last-of-type { - @include desktop { - position: absolute; - right: 0; - a { - margin-right: 0; - } - } - } - margin-bottom: rem(35px); - } - - .see-more-posts { - color: $black; - font-size: rem(18px); - letter-spacing: -0.25px; - line-height: rem(30px); - top: rem(2px); - } -} diff --git a/_sass/syntax-highlighting.scss b/_sass/syntax-highlighting.scss deleted file mode 100644 index 36b42a5bb8f6..000000000000 --- a/_sass/syntax-highlighting.scss +++ /dev/null @@ -1,211 +0,0 @@ -/*Github syntax highlighting theme via Rouge*/ - -.highlight table td { padding: 5px; } -.highlight table pre { margin: 0; } -.highlight .cm { - color: #999988; - font-style: italic; -} -.highlight .cp { - color: #999999; - font-weight: bold; -} -.highlight .c1 { - color: #999988; - font-style: italic; -} -.highlight .cs { - color: #999999; - font-weight: bold; - font-style: italic; -} -.highlight .c, .highlight .cd { - color: #8c8c8c; - font-style: italic; -} -.highlight .err { - color: #a61717; - background-color: #e3d2d2; -} -.highlight .gd { - color: #000000; - background-color: #ffdddd; -} -.highlight .ge { - color: #000000; - font-style: italic; -} -.highlight .gr { - color: #aa0000; -} -.highlight .gh { - color: #999999; -} -.highlight .gi { - color: #000000; - background-color: #ddffdd; -} -.highlight .go { - color: #888888; -} -.highlight .gp { - color: #555555; -} -.highlight .gs { - font-weight: bold; -} -.highlight .gu { - color: #aaaaaa; -} -.highlight .gt { - color: #aa0000; -} -.highlight .kc { - color: #000000; - font-weight: bold; -} -.highlight .kd { - color: #000000; - font-weight: bold; -} -.highlight .kn { - color: #000000; - font-weight: bold; -} -.highlight .kp 
{ - color: #000000; - font-weight: bold; -} -.highlight .kr { - color: #000000; - font-weight: bold; -} -.highlight .kt { - color: #445588; - font-weight: bold; -} -.highlight .k, .highlight .kv { - color: #000000; - font-weight: bold; -} -.highlight .mf { - color: #009999; -} -.highlight .mh { - color: #009999; -} -.highlight .il { - color: #009999; -} -.highlight .mi { - color: #009999; -} -.highlight .mo { - color: #009999; -} -.highlight .m, .highlight .mb, .highlight .mx { - color: #009999; -} -.highlight .sb { - color: #d14; -} -.highlight .sc { - color: #d14; -} -.highlight .sd { - color: #d14; -} -.highlight .s2 { - color: #d14; -} -.highlight .se { - color: #d14; -} -.highlight .sh { - color: #d14; -} -.highlight .si { - color: #d14; -} -.highlight .sx { - color: #d14; -} -.highlight .sr { - color: #009926; -} -.highlight .s1 { - color: #d14; -} -.highlight .ss { - color: #990073; -} -.highlight .s { - color: #d14; -} -.highlight .na { - color: #008080; -} -.highlight .bp { - color: #999999; -} -.highlight .nb { - color: #0086B3; -} -.highlight .nc { - color: #445588; - font-weight: bold; -} -.highlight .no { - color: #008080; -} -.highlight .nd { - color: #3c5d5d; - font-weight: bold; -} -.highlight .ni { - color: #800080; -} -.highlight .ne { - color: #990000; - font-weight: bold; -} -.highlight .nf { - color: #990000; - font-weight: bold; -} -.highlight .nl { - color: #990000; - font-weight: bold; -} -.highlight .nn { - color: #555555; -} -.highlight .nt { - color: #000080; -} -.highlight .vc { - color: #008080; -} -.highlight .vg { - color: #008080; -} -.highlight .vi { - color: #008080; -} -.highlight .nv { - color: #008080; -} -.highlight .ow { - color: #000000; - font-weight: bold; -} -.highlight .o { - color: #000000; - font-weight: bold; -} -.highlight .w { - color: #bbbbbb; -} -.highlight { - background-color: #f8f8f8; -} diff --git a/_sass/videos.scss b/_sass/videos.scss deleted file mode 100644 index 9264c04a95d6..000000000000 --- a/_sass/videos.scss +++ /dev/null @@ -1,21 +0,0 @@ -.video-item { - margin-bottom: 5rem; - - a h5 { - color: $black; - margin-top: 1rem; - } - a:hover { - h5 { - color: $orange; - } - } - - .image-container { - overflow: hidden; - img { - margin: -10% 0; - width: 100%; - } - } -} \ No newline at end of file diff --git a/_style_guide/article.md b/_style_guide/article.md deleted file mode 100644 index 603e7814d5ba..000000000000 --- a/_style_guide/article.md +++ /dev/null @@ -1,122 +0,0 @@ ---- -layout: default -title: Base Style Guide ---- - -## Header 2 -This is body copy. Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea. - -### Header 3 - -This is body copy. Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea. - -#### Header 4 - -This is body copy. Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea. - -##### Header 5 - -This is body copy. Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea. 
- ---- - -This is more body copy with `code snippets`. Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. [Here is an inline link](#). Ut enim ad minim veniam, quis nostrud `torch.*.FloatTensor` ullamco laboris nisi ut aliquip ex ea commodo consequat. - -_This is italicized body copy. Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat_ - -**This is bolded body copy. Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat.** - ---- - -This is body copy before an unordered list. Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea. - -- Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum. -- Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. -- Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. - -This is body copy after an unordered list. Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea. - -1. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum. -2. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. -3. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. - -This is body copy after an ordered list. Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea. - -
-<dl>
-  <dt>Definition list</dt>
-  <dd>Lorem ipsum dolor sit amet, consectetur adipiscing elit</dd>
-
-  <dt>Definition list</dt>
-  <dd>Lorem ipsum dolor sit amet, consectetur adipiscing elit</dd>
-
-  <dt>Definition list</dt>
-  <dd>Lorem ipsum dolor sit amet, consectetur adipiscing elit</dd>
-</dl>
        - ---- - -![Here's an image](https://via.placeholder.com/1000x200/e44c2c/ffffff "Sample image") - ---- - -> "This is a blockquote. Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat" - -```sh - brew install pytorch # Here is a small code block - brew install pytorch # Here is a small code block -``` - -```python -# Here is a large code block with syntax highlighting - -# !/usr/bin/python3 - -# Dictionaries map keys to values. - -fred = { 'mike': 456, 'bill': 399, 'sarah': 521 } - -# Subscripts. -try: - print(fred) - print(fred['bill']) - print(fred['nora']) - print("Won't see this!") -except KeyError as rest: - print("Lookup failed:", rest) -print() - -# Entries can be added, udated, or deleted. -fred['bill'] = 'Sopwith Camel' -fred['wilma'] = 2233 -del fred['mike'] -print(fred) -print() - -# Get all the keys. -print(fred.keys()) -for k in fred.keys(): - print(k, "=>", fred[k]) -print() - -# Test for presence of a key. -for t in [ 'zingo', 'sarah', 'bill', 'wilma' ]: - print(t,end=' ') - if t in fred: - print('=>', fred[t]) - else: - print('is not present.') -``` - -Here is a table: - -| Data | type torch.dtype | Tensor types | -|------|------------------|--------------| -| 32-bit floating point | `torch.float32` or `torch.float` | `torch.*.FloatTensor` -| 64-bit floating point | `torch.float64` or `torch.double` | `torch.*.DoubleTensor` -| 16-bit floating point | `torch.float16` or `torch.half` | `torch.*.HalfTensor` -| 8-bit integer (unsigned) | `torch.uint8` | `torch.*.ByteTensor` -| 8-bit integer (signed) | `torch.int8` | `torch.*.CharTensor` -| 16-bit integer (signed) | `torch.int16` or `torch.short` | `torch.*.ShortTensor` -| 32-bit integer (signed) | `torch.int32` or `torch.int` | `torch.*.IntTensor` -| 64-bit integer (signed) | `torch.int64` or `torch.long` | `torch.*.LongTensor` - diff --git a/_videos/pt20qa1.md b/_videos/pt20qa1.md deleted file mode 100644 index e6b641b68485..000000000000 --- a/_videos/pt20qa1.md +++ /dev/null @@ -1,5 +0,0 @@ ---- -title: 'PyTorch 2.0 Q&A Series: How and why you should contribute to tutorials and code to PyTorch' -youtube_id: v4nDZTK_eJg -date: Dec 16, 2022 ---- diff --git a/_videos/pt20qa10.md b/_videos/pt20qa10.md deleted file mode 100644 index a3722e3e89b5..000000000000 --- a/_videos/pt20qa10.md +++ /dev/null @@ -1,5 +0,0 @@ ---- -title: 'PyTorch 2.0 Q&A: Dynamic Shapes and Calculating Maximum Batch Size' -youtube_id: 4dX4kuVbl9U -date: Feb 8, 2023 ---- diff --git a/_videos/pt20qa11.md b/_videos/pt20qa11.md deleted file mode 100644 index c1ff75f708b9..000000000000 --- a/_videos/pt20qa11.md +++ /dev/null @@ -1,5 +0,0 @@ ---- -title: 'PyTorch 2.0 Q&A: TorchRL' -youtube_id: myEfUoYrbts -date: Feb 16, 2023 ---- diff --git a/_videos/pt20qa12.md b/_videos/pt20qa12.md deleted file mode 100644 index 06a8f75a88a0..000000000000 --- a/_videos/pt20qa12.md +++ /dev/null @@ -1,5 +0,0 @@ ---- -title: 'PyTorch 2.0 Q&A: TorchMultiModal' -youtube_id: L7W2-0pwsFI -date: Feb 24, 2023 ---- diff --git a/_videos/pt20qa2.md b/_videos/pt20qa2.md deleted file mode 100644 index d05f4e1d12df..000000000000 --- a/_videos/pt20qa2.md +++ /dev/null @@ -1,5 +0,0 @@ ---- -title: 'PyTorch 2.0 Live Q&A Series: PT2 Profiling and Debugging' -youtube_id: 1FSBurHpH_Q -date: Dec 16, 2022 ---- diff --git a/_videos/pt20qa3.md b/_videos/pt20qa3.md deleted file mode 100644 index 
844d5bec7cef..000000000000 --- a/_videos/pt20qa3.md +++ /dev/null @@ -1,5 +0,0 @@ ---- -title: 'PyTorch 2.0 Live Q&A Series: A Deep Dive on TorchDynamo' -youtube_id: 5FNHwPIyHr8 -date: Dec 20, 2022 ---- diff --git a/_videos/pt20qa4.md b/_videos/pt20qa4.md deleted file mode 100644 index 1a8ae72d9c9b..000000000000 --- a/_videos/pt20qa4.md +++ /dev/null @@ -1,5 +0,0 @@ ---- -title: 'PyTorch 2.0 Live Q&A Series: PyTorch 2.0 Export' -youtube_id: U6J5hl6nXlU -date: Dec 22, 2022 ---- diff --git a/_videos/pt20qa5.md b/_videos/pt20qa5.md deleted file mode 100644 index 181fdb47a228..000000000000 --- a/_videos/pt20qa5.md +++ /dev/null @@ -1,5 +0,0 @@ ---- -title: 'PyTorch 2.0 Live Q&A Series: TorchRec and FSDP in Production' -youtube_id: NgW6gp69ssc -date: Dec 22, 2022 ---- diff --git a/_videos/pt20qa6.md b/_videos/pt20qa6.md deleted file mode 100644 index 38f64fdcc8d5..000000000000 --- a/_videos/pt20qa6.md +++ /dev/null @@ -1,5 +0,0 @@ ---- -title: 'PyTorch 2.0 Ask the Engineers Q&A Series: Deep Dive into TorchInductor and PT2 Backend Integration' -youtube_id: AaFc3C7CZAs -date: Jan 26, 2023 ---- diff --git a/_videos/pt20qa7.md b/_videos/pt20qa7.md deleted file mode 100644 index 11f736aca14d..000000000000 --- a/_videos/pt20qa7.md +++ /dev/null @@ -1,5 +0,0 @@ ---- -title: 'PyTorch 2.0 Ask the Engineers Q&A Series: PT2 and Distributed (DDP/FSDP)' -youtube_id: 6S4tH9qEswo -date: Jan 25, 2023 ---- diff --git a/_videos/pt20qa8.md b/_videos/pt20qa8.md deleted file mode 100644 index 4fc96052374c..000000000000 --- a/_videos/pt20qa8.md +++ /dev/null @@ -1,5 +0,0 @@ ---- -title: 'PyTorch 2.0 Q&A: Rethinking Data Loading with TorchData' -youtube_id: 65DvI3YrFW8 -date: Feb 2, 2023 ---- diff --git a/_videos/pt20qa9.md b/_videos/pt20qa9.md deleted file mode 100644 index 36125361fade..000000000000 --- a/_videos/pt20qa9.md +++ /dev/null @@ -1,5 +0,0 @@ ---- -title: 'PyTorch 2.0 Q&A: Optimizing Transformers for Inference' -youtube_id: ZOWjOxC80qw -date: Feb 3, 2023 ---- diff --git a/_videos/ptconf1.md b/_videos/ptconf1.md deleted file mode 100644 index 9cceb6da68d7..000000000000 --- a/_videos/ptconf1.md +++ /dev/null @@ -1,5 +0,0 @@ ---- -title: 'Lightning Talk: State of PyTorch - Alban Desmaison, Meta - Speakers: Alban Desmaison' -youtube_id: dR0lHxt3Tjo -date: Oct 25, 2023 ---- diff --git a/_videos/ptconf11.md b/_videos/ptconf11.md deleted file mode 100644 index 9d859c6eb460..000000000000 --- a/_videos/ptconf11.md +++ /dev/null @@ -1,5 +0,0 @@ ---- -title: 'Lightning Talk: Large-Scale Distributed Training with Dynamo and PyTorch/XLA SPMD - Yeounoh Chung & Jiewen Tan, Google' -youtube_id: tWH2MAHzVVc -date: Oct 25, 2023 ---- diff --git a/_videos/ptconf12.md b/_videos/ptconf12.md deleted file mode 100644 index 16ccea6a767b..000000000000 --- a/_videos/ptconf12.md +++ /dev/null @@ -1,5 +0,0 @@ ---- -title: 'Lightning Talk: PyTorch 2.0 on the ROCm Platform - Douglas Lehr, AMD' -youtube_id: lN-LrBqpeaA -date: Oct 25, 2023 ---- diff --git a/_videos/ptconf13.md b/_videos/ptconf13.md deleted file mode 100644 index 27acd9aafd72..000000000000 --- a/_videos/ptconf13.md +++ /dev/null @@ -1,5 +0,0 @@ ---- -title: 'Lightning Talk: Accelerated Inference in PyTorch 2.X with Torch-TensorRT - George Stefanakis & Dheeraj Peri, NVIDIA' -youtube_id: eGDMJ3MY4zk -date: Oct 25, 2023 ---- diff --git a/_videos/ptconf15.md b/_videos/ptconf15.md deleted file mode 100644 index 544f10dfd178..000000000000 --- a/_videos/ptconf15.md +++ /dev/null @@ -1,5 +0,0 @@ ---- -title: 'Lightning Talk: Streamlining Model Export with the New ONNX Exporter - 
Maanav Dalal & Aaron Bockover' -youtube_id: cDDWD8KhUbQ -date: Oct 25, 2023 ---- diff --git a/_videos/ptconf16.md b/_videos/ptconf16.md deleted file mode 100644 index e49da6755f1c..000000000000 --- a/_videos/ptconf16.md +++ /dev/null @@ -1,5 +0,0 @@ ---- -title: 'Lightning Talk: Efficient Inference at the Edge: Performance You Need at the Lowest Power You Deserve - Felix Baum, Qualcomm' -youtube_id: AEY64cbP4h8 -date: Oct 25, 2023 ---- diff --git a/_videos/ptconf2.md b/_videos/ptconf2.md deleted file mode 100644 index b052e39235e1..000000000000 --- a/_videos/ptconf2.md +++ /dev/null @@ -1,5 +0,0 @@ ---- -title: 'Lightning Talk: TorchFix - a Linter for PyTorch-Using Code with Autofix Support - Sergii Dymchenko' -youtube_id: qLU2JD_PtiY -date: Oct 25, 2023 ---- diff --git a/_videos/ptconf3.md b/_videos/ptconf3.md deleted file mode 100644 index 7f95cb1c6310..000000000000 --- a/_videos/ptconf3.md +++ /dev/null @@ -1,5 +0,0 @@ ---- -title: "What's New for PyTorch Developer Infrastructure - Eli Uriegas & Omkar Salpekar" -youtube_id: I95KmF6KSIA -date: Oct 25, 2023 ---- diff --git a/_videos/ptconf4.md b/_videos/ptconf4.md deleted file mode 100644 index df46c48c697e..000000000000 --- a/_videos/ptconf4.md +++ /dev/null @@ -1,5 +0,0 @@ ---- -title: 'Lightning Talk: Enhancements Made to MPS Backend in PyTorch for Applications Running on Mac Platforms - Kulin Seth, Apple' -youtube_id: Np8YEW011dg -date: Oct 25, 2023 ---- diff --git a/_videos/ptconf5.md b/_videos/ptconf5.md deleted file mode 100644 index 64893ec93344..000000000000 --- a/_videos/ptconf5.md +++ /dev/null @@ -1,5 +0,0 @@ ---- -title: 'PyTorch Korea User Group: The Beginning, Present, and Future - Junghwan Park' -youtube_id: 80MGwzKQOc4 -date: Oct 25, 2023 ---- diff --git a/_videos/ptconf6.md b/_videos/ptconf6.md deleted file mode 100644 index 7041e922780b..000000000000 --- a/_videos/ptconf6.md +++ /dev/null @@ -1,5 +0,0 @@ ---- -title: 'Lightning Talk: Triton Compiler - Thomas Raoux, OpenAI' -youtube_id: AtbnRIzpwho -date: Oct 25, 2023 ---- diff --git a/_videos/ptconf7.md b/_videos/ptconf7.md deleted file mode 100644 index 68931368a89e..000000000000 --- a/_videos/ptconf7.md +++ /dev/null @@ -1,5 +0,0 @@ ---- -title: 'Lightning Talk: Harnessing NVIDIA Tensor Cores: An Exploration of CUTLASS & OpenAI Triton - Matthew Nicely US, NVIDIA' -youtube_id: yCyZEJrlrfY -date: Oct 25, 2023 ---- diff --git a/_videos/ptconf8.md b/_videos/ptconf8.md deleted file mode 100644 index 16ccea6a767b..000000000000 --- a/_videos/ptconf8.md +++ /dev/null @@ -1,5 +0,0 @@ ---- -title: 'Lightning Talk: PyTorch 2.0 on the ROCm Platform - Douglas Lehr, AMD' -youtube_id: lN-LrBqpeaA -date: Oct 25, 2023 ---- diff --git a/_videos/vid1.md b/_videos/vid1.md deleted file mode 100644 index d42b1576182d..000000000000 --- a/_videos/vid1.md +++ /dev/null @@ -1,5 +0,0 @@ ---- -title: 'PyTorch 2.5 Live Q&A' -youtube_id: B3IgXpl4xt4 -date: Oct 21, 2024 ---- diff --git a/_videos/vid10.md b/_videos/vid10.md deleted file mode 100644 index faf1c637b5ae..000000000000 --- a/_videos/vid10.md +++ /dev/null @@ -1,5 +0,0 @@ ---- -title: 'Using PyTorch and DINOv2 for Multi-label Plant Species Classification' -youtube_id: rxVg3yrc51s -date: Mar 28, 2025 ---- diff --git a/_videos/vid11.md b/_videos/vid11.md deleted file mode 100644 index b7720dd02abb..000000000000 --- a/_videos/vid11.md +++ /dev/null @@ -1,5 +0,0 @@ ---- -title: 'PyTorch Expert Exchange – Multi-Modal Tabular Deep Learning with PyTorch Frame' -youtube_id: zPjLHf0X78w -date: Feb 20, 2025 ---- diff --git a/_videos/vid12.md 
b/_videos/vid12.md deleted file mode 100644 index f3ba5fc289fa..000000000000 --- a/_videos/vid12.md +++ /dev/null @@ -1,5 +0,0 @@ ---- -title: 'PyTorch 2.6 Release Live Q&A' -youtube_id: 1OopuwTq6oE -date: Feb 8, 2025 ---- diff --git a/_videos/vid13.md b/_videos/vid13.md deleted file mode 100644 index 747642d8aea4..000000000000 --- a/_videos/vid13.md +++ /dev/null @@ -1,5 +0,0 @@ ---- -title: 'How does batching work on modern GPUs?' -youtube_id: HTcnp9NEHGY -date: Nov 14, 2024 ---- diff --git a/_videos/vid2.md b/_videos/vid2.md deleted file mode 100644 index b7900abc0ec0..000000000000 --- a/_videos/vid2.md +++ /dev/null @@ -1,5 +0,0 @@ ---- -title: 'DistServe: disaggregating prefill and decoding for goodput-optimized LLM inference' -youtube_id: Bh-jlh5vlF0 -date: Oct 16, 2024 ---- diff --git a/_videos/vid3.md b/_videos/vid3.md deleted file mode 100644 index 92bae2c1fa3e..000000000000 --- a/_videos/vid3.md +++ /dev/null @@ -1,5 +0,0 @@ ---- -title: 'Efficient Streaming Language Models with Attention Sinks' -youtube_id: RnM84Sv9WpA -date: Oct 11, 2024 ---- diff --git a/_videos/vid4.md b/_videos/vid4.md deleted file mode 100644 index c29ec8d1e005..000000000000 --- a/_videos/vid4.md +++ /dev/null @@ -1,5 +0,0 @@ ---- -title: 'PyTorch Expert Exchange: Adapting open source models with Open-Instruct and Tulu' -youtube_id: e1qUJFAo10s -date: Sep 11, 2024 ---- diff --git a/_videos/vid5.md b/_videos/vid5.md deleted file mode 100644 index 110c90047648..000000000000 --- a/_videos/vid5.md +++ /dev/null @@ -1,5 +0,0 @@ ---- -title: 'PyTorch Expert Exchange: Efficient Generative Models: From Sparse to Distributed Inference' -youtube_id: Eqg0VIiWrgM -date: Aug 30, 2024 ---- diff --git a/_videos/vid6.md b/_videos/vid6.md deleted file mode 100644 index e456938c3d9e..000000000000 --- a/_videos/vid6.md +++ /dev/null @@ -1,5 +0,0 @@ ---- -title: 'torch.compile: The Missing Manual' -youtube_id: rew5CSUaIXg -date: Aug 13, 2024 ---- diff --git a/_videos/vid7.md b/_videos/vid7.md deleted file mode 100644 index 02bea05040a7..000000000000 --- a/_videos/vid7.md +++ /dev/null @@ -1,5 +0,0 @@ ---- -title: 'Using PyTorch for Monocular Depth Estimation Webinar' -youtube_id: xf2QgioY370 -date: Sep 27, 2024 ---- diff --git a/_videos/vid8.md b/_videos/vid8.md deleted file mode 100644 index 550290a68f3c..000000000000 --- a/_videos/vid8.md +++ /dev/null @@ -1,5 +0,0 @@ ---- -title: 'Accelerating LLM family of models on Arm Neoverse based Graviton AWS processors with KleidiAI' -youtube_id: NeHIhQWewug -date: Aug 22, 2024 ---- diff --git a/_videos/vid9.md b/_videos/vid9.md deleted file mode 100644 index 769901483585..000000000000 --- a/_videos/vid9.md +++ /dev/null @@ -1,5 +0,0 @@ ---- -title: 'PyTorch 2.4: Live Q&A' -youtube_id: ry_QgUIYX1E -date: Jul 25, 2024 ---- diff --git a/ai-powered-competitive-programming.html b/ai-powered-competitive-programming.html index 5a14b2a7b7a4..aae968437f6f 100644 --- a/ai-powered-competitive-programming.html +++ b/ai-powered-competitive-programming.html @@ -1,12 +1,310 @@ ---- -layout: default -title: "AI-Powered Competitive Programming: My HackerCup 2024 Experience" -body-class: announcement -background-class: announcement-background -permalink: /ai-powered-competitive-programming ---- - -
+[Page converted from a Jekyll front-matter template to a standalone static HTML page; the added markup was stripped during extraction. Recoverable new content: the page title "AI-Powered Competitive Programming: My HackerCup 2024 Experience | PyTorch", the "Join us at PyTorch Conference in San Francisco, October 22-23. CFP open now! Learn more." banner, the site navigation (including "PyTorch Webinars"), and the article heading "AI-Powered Competitive Programming: My HackerCup 2024 Experience".]
@@ -38,4 +336,306 @@
+[Standard site footer added: Docs ("Access comprehensive developer documentation for PyTorch", View Docs), Tutorials ("Get in-depth tutorials for beginners and advanced developers", View Tutorials), Resources ("Find development resources and get your questions answered", View Resources), plus the remaining footer navigation and script markup.]
diff --git a/announcement.html b/announcement.html deleted file mode 100644 index 90eda81bc8d7..000000000000 --- a/announcement.html +++ /dev/null @@ -1,120 +0,0 @@ ---- -layout: default -title: PyTorch Foundation -body-class: announcement -background-class: announcement-background -permalink: /foundation --- -{% assign cards = site.board_info %} -
-PyTorch Foundation
-
-Accelerating Open Source AI
-
-Welcome to the PyTorch Foundation—a vibrant, community-driven hub for open source AI. Developers, researchers, and industry pioneers collaborate here to advance the PyTorch framework and strengthen the open source AI ecosystem.
-
-From cutting-edge development to production-ready tools and libraries, the PyTorch Foundation thrives through transparent collaboration and collective innovation. As part of the Linux Foundation, we host global events, deliver specialized training, support research, and provide resources to accelerate your AI journey.
-Whether you are contributing code, sharing your expertise, or deploying real-world AI solutions, the PyTorch Foundation actively empowers you to shape the future of accessible and impactful open source AI.
-
-Our Guiding Principles
-
-Our mission is to drive the adoption of AI and deep learning by supporting an open, vendor-neutral ecosystem built around PyTorch. By making state-of-the-art tools and libraries accessible to everyone, we aim to democratize innovation in AI and ML. Learn more about the mission and values that guide us in our PyTorch Foundation Principles.
-
-PyTorch Members
-
-Our Governance
-
-The PyTorch Foundation’s Governing Board oversees the Foundation’s activities according to its Guiding Principles and the PyTorch Foundation Charter.
-
-The PyTorch Foundation Code of Conduct details our commitment to fostering an inclusive, welcoming, and safe environment for everyone involved in the PyTorch Foundation community.
-
-The technical governance structure for the PyTorch open source project is defined by the PyTorch maintainers and is available on our PyTorch Technical Governance page.
-
-How to Get Involved
-
-New to the PyTorch Foundation? Check out our guide to getting started with the PyTorch Foundation or join the PyTorch developer or user community to contribute, learn, and get your questions answered.
-
-Get in Touch
-
-The success of PyTorch is only possible with the contributions and support of our developer community and member companies. If you would like to learn how you can collaborate with your peers in the PyTorch Foundation, and would like to have a conversation with a PyTorch Foundation representative, please fill out this form.
-
-Note: for all PyTorch technical questions please go to discuss.pytorch.org
-
        diff --git a/assets/css/style.css b/assets/css/style.css new file mode 100644 index 000000000000..1f9ba713ded3 --- /dev/null +++ b/assets/css/style.css @@ -0,0 +1 @@ +/*! normalize.css v4.1.1 | MIT License | github.com/necolas/normalize.css */html{font-family:sans-serif;-ms-text-size-adjust:100%;-webkit-text-size-adjust:100%}body{margin:0}article,aside,details,figcaption,figure,footer,header,main,menu,nav,section{display:block}summary{display:list-item}audio,canvas,progress,video{display:inline-block}audio:not([controls]){display:none;height:0}progress{vertical-align:baseline}template,[hidden]{display:none !important}a{background-color:transparent}a:active,a:hover{outline-width:0}abbr[title]{border-bottom:none;text-decoration:underline;-webkit-text-decoration:underline dotted;text-decoration:underline dotted}b,strong{font-weight:inherit}b,strong{font-weight:bolder}dfn{font-style:italic}h1{font-size:2em;margin:0.67em 0}mark{background-color:#ff0;color:#000}small{font-size:80%}sub,sup{font-size:75%;line-height:0;position:relative;vertical-align:baseline}sub{bottom:-0.25em}sup{top:-0.5em}img{border-style:none}svg:not(:root){overflow:hidden}code,kbd,pre,samp{font-family:monospace, monospace;font-size:1em}figure{margin:1em 40px}hr{box-sizing:content-box;height:0;overflow:visible}button,input,select,textarea{font:inherit;margin:0}optgroup{font-weight:bold}button,input{overflow:visible}button,select{text-transform:none}button,html [type="button"],[type="reset"],[type="submit"]{-webkit-appearance:button}button::-moz-focus-inner,[type="button"]::-moz-focus-inner,[type="reset"]::-moz-focus-inner,[type="submit"]::-moz-focus-inner{border-style:none;padding:0}button:-moz-focusring,[type="button"]:-moz-focusring,[type="reset"]:-moz-focusring,[type="submit"]:-moz-focusring{outline:1px dotted ButtonText}fieldset{border:1px solid #c0c0c0;margin:0 2px;padding:0.35em 0.625em 0.75em}legend{box-sizing:border-box;color:inherit;display:table;max-width:100%;padding:0;white-space:normal}textarea{overflow:auto}[type="checkbox"],[type="radio"]{box-sizing:border-box;padding:0}[type="number"]::-webkit-inner-spin-button,[type="number"]::-webkit-outer-spin-button{height:auto}[type="search"]{-webkit-appearance:textfield;outline-offset:-2px}[type="search"]::-webkit-search-cancel-button,[type="search"]::-webkit-search-decoration{-webkit-appearance:none}::-webkit-input-placeholder{color:inherit;opacity:0.54}::-webkit-file-upload-button{-webkit-appearance:button;font:inherit}*{box-sizing:border-box}input,select,textarea,button{font-family:inherit;font-size:inherit;line-height:inherit}body{font-family:-apple-system,BlinkMacSystemFont,"Segoe UI",Helvetica,Arial,sans-serif,"Apple Color Emoji","Segoe UI Emoji","Segoe UI Symbol";font-size:14px;line-height:1.5;color:#24292e;background-color:#fff}a{color:#0366d6;text-decoration:none}a:hover{text-decoration:underline}b,strong{font-weight:600}hr,.rule{height:0;margin:15px 0;overflow:hidden;background:transparent;border:0;border-bottom:1px solid #dfe2e5}hr::before,.rule::before{display:table;content:""}hr::after,.rule::after{display:table;clear:both;content:""}table{border-spacing:0;border-collapse:collapse}td,th{padding:0}button{cursor:pointer;border-radius:0}[hidden][hidden]{display:none !important}details summary{cursor:pointer}details:not([open])>*:not(summary){display:none 
!important}h1,h2,h3,h4,h5,h6{margin-top:0;margin-bottom:0}h1{font-size:32px;font-weight:600}h2{font-size:24px;font-weight:600}h3{font-size:20px;font-weight:600}h4{font-size:16px;font-weight:600}h5{font-size:14px;font-weight:600}h6{font-size:12px;font-weight:600}p{margin-top:0;margin-bottom:10px}small{font-size:90%}blockquote{margin:0}ul,ol{padding-left:0;margin-top:0;margin-bottom:0}ol ol,ul ol{list-style-type:lower-roman}ul ul ol,ul ol ol,ol ul ol,ol ol ol{list-style-type:lower-alpha}dd{margin-left:0}tt,code{font-family:"SFMono-Regular",Consolas,"Liberation Mono",Menlo,Courier,monospace;font-size:12px}pre{margin-top:0;margin-bottom:0;font-family:"SFMono-Regular",Consolas,"Liberation Mono",Menlo,Courier,monospace;font-size:12px}.octicon{vertical-align:text-bottom}.anim-fade-in{-webkit-animation-name:fade-in;animation-name:fade-in;-webkit-animation-duration:1s;animation-duration:1s;-webkit-animation-timing-function:ease-in-out;animation-timing-function:ease-in-out}.anim-fade-in.fast{-webkit-animation-duration:300ms;animation-duration:300ms}@-webkit-keyframes fade-in{0%{opacity:0}100%{opacity:1}}@keyframes fade-in{0%{opacity:0}100%{opacity:1}}.anim-fade-out{-webkit-animation-name:fade-out;animation-name:fade-out;-webkit-animation-duration:1s;animation-duration:1s;-webkit-animation-timing-function:ease-out;animation-timing-function:ease-out}.anim-fade-out.fast{-webkit-animation-duration:0.3s;animation-duration:0.3s}@-webkit-keyframes fade-out{0%{opacity:1}100%{opacity:0}}@keyframes fade-out{0%{opacity:1}100%{opacity:0}}.anim-fade-up{opacity:0;-webkit-animation-name:fade-up;animation-name:fade-up;-webkit-animation-duration:0.3s;animation-duration:0.3s;-webkit-animation-fill-mode:forwards;animation-fill-mode:forwards;-webkit-animation-timing-function:ease-out;animation-timing-function:ease-out;-webkit-animation-delay:1s;animation-delay:1s}@-webkit-keyframes fade-up{0%{opacity:0.8;transform:translateY(100%)}100%{opacity:1;transform:translateY(0)}}@keyframes fade-up{0%{opacity:0.8;transform:translateY(100%)}100%{opacity:1;transform:translateY(0)}}.anim-fade-down{-webkit-animation-name:fade-down;animation-name:fade-down;-webkit-animation-duration:0.3s;animation-duration:0.3s;-webkit-animation-fill-mode:forwards;animation-fill-mode:forwards;-webkit-animation-timing-function:ease-in;animation-timing-function:ease-in}@-webkit-keyframes fade-down{0%{opacity:1;transform:translateY(0)}100%{opacity:0.5;transform:translateY(100%)}}@keyframes fade-down{0%{opacity:1;transform:translateY(0)}100%{opacity:0.5;transform:translateY(100%)}}.anim-grow-x{width:0%;-webkit-animation-name:grow-x;animation-name:grow-x;-webkit-animation-duration:0.3s;animation-duration:0.3s;-webkit-animation-fill-mode:forwards;animation-fill-mode:forwards;-webkit-animation-timing-function:ease;animation-timing-function:ease;-webkit-animation-delay:0.5s;animation-delay:0.5s}@-webkit-keyframes grow-x{to{width:100%}}@keyframes grow-x{to{width:100%}}.anim-shrink-x{-webkit-animation-name:shrink-x;animation-name:shrink-x;-webkit-animation-duration:0.3s;animation-duration:0.3s;-webkit-animation-fill-mode:forwards;animation-fill-mode:forwards;-webkit-animation-timing-function:ease-in-out;animation-timing-function:ease-in-out;-webkit-animation-delay:0.5s;animation-delay:0.5s}@-webkit-keyframes shrink-x{to{width:0%}}@keyframes shrink-x{to{width:0%}}.anim-scale-in{-webkit-animation-name:scale-in;animation-name:scale-in;-webkit-animation-duration:0.15s;animation-duration:0.15s;-webkit-animation-timing-function:cubic-bezier(0.2, 0, 0.13, 
1.5);animation-timing-function:cubic-bezier(0.2, 0, 0.13, 1.5)}@-webkit-keyframes scale-in{0%{opacity:0;transform:scale(0.5)}100%{opacity:1;transform:scale(1)}}@keyframes scale-in{0%{opacity:0;transform:scale(0.5)}100%{opacity:1;transform:scale(1)}}.anim-pulse{-webkit-animation-name:pulse;animation-name:pulse;-webkit-animation-duration:2s;animation-duration:2s;-webkit-animation-timing-function:linear;animation-timing-function:linear;-webkit-animation-iteration-count:infinite;animation-iteration-count:infinite}@-webkit-keyframes pulse{0%{opacity:0.3}10%{opacity:1}100%{opacity:0.3}}@keyframes pulse{0%{opacity:0.3}10%{opacity:1}100%{opacity:0.3}}.anim-pulse-in{-webkit-animation-name:pulse-in;animation-name:pulse-in;-webkit-animation-duration:0.5s;animation-duration:0.5s}@-webkit-keyframes pulse-in{0%{transform:scale3d(1, 1, 1)}50%{transform:scale3d(1.1, 1.1, 1.1)}100%{transform:scale3d(1, 1, 1)}}@keyframes pulse-in{0%{transform:scale3d(1, 1, 1)}50%{transform:scale3d(1.1, 1.1, 1.1)}100%{transform:scale3d(1, 1, 1)}}.hover-grow{transition:transform 0.3s;-webkit-backface-visibility:hidden;backface-visibility:hidden}.hover-grow:hover{transform:scale(1.025)}.border{border:1px #e1e4e8 solid !important}.border-y{border-top:1px #e1e4e8 solid !important;border-bottom:1px #e1e4e8 solid !important}.border-0{border:0 !important}.border-dashed{border-style:dashed !important}.border-blue{border-color:#0366d6 !important}.border-blue-light{border-color:#c8e1ff !important}.border-green{border-color:#34d058 !important}.border-green-light{border-color:#a2cbac !important}.border-red{border-color:#d73a49 !important}.border-red-light{border-color:#cea0a5 !important}.border-purple{border-color:#6f42c1 !important}.border-yellow{border-color:#d9d0a5 !important}.border-gray-light{border-color:#eaecef !important}.border-gray-dark{border-color:#d1d5da !important}.border-black-fade{border-color:rgba(27,31,35,0.15) !important}.border-top{border-top:1px #e1e4e8 solid !important}.border-right{border-right:1px #e1e4e8 solid !important}.border-bottom{border-bottom:1px #e1e4e8 solid !important}.border-left{border-left:1px #e1e4e8 solid !important}.border-top-0{border-top:0 !important}.border-right-0{border-right:0 !important}.border-bottom-0{border-bottom:0 !important}.border-left-0{border-left:0 !important}.rounded-0{border-radius:0 !important}.rounded-1{border-radius:3px !important}.rounded-2{border-radius:6px !important}.rounded-top-0{border-top-left-radius:0 !important;border-top-right-radius:0 !important}.rounded-top-1{border-top-left-radius:3px !important;border-top-right-radius:3px !important}.rounded-top-2{border-top-left-radius:6px !important;border-top-right-radius:6px !important}.rounded-right-0{border-top-right-radius:0 !important;border-bottom-right-radius:0 !important}.rounded-right-1{border-top-right-radius:3px !important;border-bottom-right-radius:3px !important}.rounded-right-2{border-top-right-radius:6px !important;border-bottom-right-radius:6px !important}.rounded-bottom-0{border-bottom-right-radius:0 !important;border-bottom-left-radius:0 !important}.rounded-bottom-1{border-bottom-right-radius:3px !important;border-bottom-left-radius:3px !important}.rounded-bottom-2{border-bottom-right-radius:6px !important;border-bottom-left-radius:6px !important}.rounded-left-0{border-bottom-left-radius:0 !important;border-top-left-radius:0 !important}.rounded-left-1{border-bottom-left-radius:3px !important;border-top-left-radius:3px !important}.rounded-left-2{border-bottom-left-radius:6px 
!important;border-top-left-radius:6px !important}@media (min-width: 544px){.border-sm-top{border-top:1px #e1e4e8 solid !important}.border-sm-right{border-right:1px #e1e4e8 solid !important}.border-sm-bottom{border-bottom:1px #e1e4e8 solid !important}.border-sm-left{border-left:1px #e1e4e8 solid !important}.border-sm-top-0{border-top:0 !important}.border-sm-right-0{border-right:0 !important}.border-sm-bottom-0{border-bottom:0 !important}.border-sm-left-0{border-left:0 !important}.rounded-sm-0{border-radius:0 !important}.rounded-sm-1{border-radius:3px !important}.rounded-sm-2{border-radius:6px !important}.rounded-sm-top-0{border-top-left-radius:0 !important;border-top-right-radius:0 !important}.rounded-sm-top-1{border-top-left-radius:3px !important;border-top-right-radius:3px !important}.rounded-sm-top-2{border-top-left-radius:6px !important;border-top-right-radius:6px !important}.rounded-sm-right-0{border-top-right-radius:0 !important;border-bottom-right-radius:0 !important}.rounded-sm-right-1{border-top-right-radius:3px !important;border-bottom-right-radius:3px !important}.rounded-sm-right-2{border-top-right-radius:6px !important;border-bottom-right-radius:6px !important}.rounded-sm-bottom-0{border-bottom-right-radius:0 !important;border-bottom-left-radius:0 !important}.rounded-sm-bottom-1{border-bottom-right-radius:3px !important;border-bottom-left-radius:3px !important}.rounded-sm-bottom-2{border-bottom-right-radius:6px !important;border-bottom-left-radius:6px !important}.rounded-sm-left-0{border-bottom-left-radius:0 !important;border-top-left-radius:0 !important}.rounded-sm-left-1{border-bottom-left-radius:3px !important;border-top-left-radius:3px !important}.rounded-sm-left-2{border-bottom-left-radius:6px !important;border-top-left-radius:6px !important}}@media (min-width: 768px){.border-md-top{border-top:1px #e1e4e8 solid !important}.border-md-right{border-right:1px #e1e4e8 solid !important}.border-md-bottom{border-bottom:1px #e1e4e8 solid !important}.border-md-left{border-left:1px #e1e4e8 solid !important}.border-md-top-0{border-top:0 !important}.border-md-right-0{border-right:0 !important}.border-md-bottom-0{border-bottom:0 !important}.border-md-left-0{border-left:0 !important}.rounded-md-0{border-radius:0 !important}.rounded-md-1{border-radius:3px !important}.rounded-md-2{border-radius:6px !important}.rounded-md-top-0{border-top-left-radius:0 !important;border-top-right-radius:0 !important}.rounded-md-top-1{border-top-left-radius:3px !important;border-top-right-radius:3px !important}.rounded-md-top-2{border-top-left-radius:6px !important;border-top-right-radius:6px !important}.rounded-md-right-0{border-top-right-radius:0 !important;border-bottom-right-radius:0 !important}.rounded-md-right-1{border-top-right-radius:3px !important;border-bottom-right-radius:3px !important}.rounded-md-right-2{border-top-right-radius:6px !important;border-bottom-right-radius:6px !important}.rounded-md-bottom-0{border-bottom-right-radius:0 !important;border-bottom-left-radius:0 !important}.rounded-md-bottom-1{border-bottom-right-radius:3px !important;border-bottom-left-radius:3px !important}.rounded-md-bottom-2{border-bottom-right-radius:6px !important;border-bottom-left-radius:6px !important}.rounded-md-left-0{border-bottom-left-radius:0 !important;border-top-left-radius:0 !important}.rounded-md-left-1{border-bottom-left-radius:3px !important;border-top-left-radius:3px !important}.rounded-md-left-2{border-bottom-left-radius:6px !important;border-top-left-radius:6px !important}}@media (min-width: 
1012px){.border-lg-top{border-top:1px #e1e4e8 solid !important}.border-lg-right{border-right:1px #e1e4e8 solid !important}.border-lg-bottom{border-bottom:1px #e1e4e8 solid !important}.border-lg-left{border-left:1px #e1e4e8 solid !important}.border-lg-top-0{border-top:0 !important}.border-lg-right-0{border-right:0 !important}.border-lg-bottom-0{border-bottom:0 !important}.border-lg-left-0{border-left:0 !important}.rounded-lg-0{border-radius:0 !important}.rounded-lg-1{border-radius:3px !important}.rounded-lg-2{border-radius:6px !important}.rounded-lg-top-0{border-top-left-radius:0 !important;border-top-right-radius:0 !important}.rounded-lg-top-1{border-top-left-radius:3px !important;border-top-right-radius:3px !important}.rounded-lg-top-2{border-top-left-radius:6px !important;border-top-right-radius:6px !important}.rounded-lg-right-0{border-top-right-radius:0 !important;border-bottom-right-radius:0 !important}.rounded-lg-right-1{border-top-right-radius:3px !important;border-bottom-right-radius:3px !important}.rounded-lg-right-2{border-top-right-radius:6px !important;border-bottom-right-radius:6px !important}.rounded-lg-bottom-0{border-bottom-right-radius:0 !important;border-bottom-left-radius:0 !important}.rounded-lg-bottom-1{border-bottom-right-radius:3px !important;border-bottom-left-radius:3px !important}.rounded-lg-bottom-2{border-bottom-right-radius:6px !important;border-bottom-left-radius:6px !important}.rounded-lg-left-0{border-bottom-left-radius:0 !important;border-top-left-radius:0 !important}.rounded-lg-left-1{border-bottom-left-radius:3px !important;border-top-left-radius:3px !important}.rounded-lg-left-2{border-bottom-left-radius:6px !important;border-top-left-radius:6px !important}}@media (min-width: 1280px){.border-xl-top{border-top:1px #e1e4e8 solid !important}.border-xl-right{border-right:1px #e1e4e8 solid !important}.border-xl-bottom{border-bottom:1px #e1e4e8 solid !important}.border-xl-left{border-left:1px #e1e4e8 solid !important}.border-xl-top-0{border-top:0 !important}.border-xl-right-0{border-right:0 !important}.border-xl-bottom-0{border-bottom:0 !important}.border-xl-left-0{border-left:0 !important}.rounded-xl-0{border-radius:0 !important}.rounded-xl-1{border-radius:3px !important}.rounded-xl-2{border-radius:6px !important}.rounded-xl-top-0{border-top-left-radius:0 !important;border-top-right-radius:0 !important}.rounded-xl-top-1{border-top-left-radius:3px !important;border-top-right-radius:3px !important}.rounded-xl-top-2{border-top-left-radius:6px !important;border-top-right-radius:6px !important}.rounded-xl-right-0{border-top-right-radius:0 !important;border-bottom-right-radius:0 !important}.rounded-xl-right-1{border-top-right-radius:3px !important;border-bottom-right-radius:3px !important}.rounded-xl-right-2{border-top-right-radius:6px !important;border-bottom-right-radius:6px !important}.rounded-xl-bottom-0{border-bottom-right-radius:0 !important;border-bottom-left-radius:0 !important}.rounded-xl-bottom-1{border-bottom-right-radius:3px !important;border-bottom-left-radius:3px !important}.rounded-xl-bottom-2{border-bottom-right-radius:6px !important;border-bottom-left-radius:6px !important}.rounded-xl-left-0{border-bottom-left-radius:0 !important;border-top-left-radius:0 !important}.rounded-xl-left-1{border-bottom-left-radius:3px !important;border-top-left-radius:3px !important}.rounded-xl-left-2{border-bottom-left-radius:6px !important;border-top-left-radius:6px !important}}.circle{border-radius:50% !important}.box-shadow{box-shadow:0 1px 1px rgba(27,31,35,0.1) 
!important}.box-shadow-medium{box-shadow:0 1px 5px rgba(27,31,35,0.15) !important}.box-shadow-large{box-shadow:0 1px 15px rgba(27,31,35,0.15) !important}.box-shadow-extra-large{box-shadow:0 10px 50px rgba(27,31,35,0.07) !important}.box-shadow-none{box-shadow:none !important}.bg-white{background-color:#fff !important}.bg-blue{background-color:#0366d6 !important}.bg-blue-light{background-color:#f1f8ff !important}.bg-gray-dark{background-color:#24292e !important}.bg-gray{background-color:#f6f8fa !important}.bg-gray-light{background-color:#fafbfc !important}.bg-green{background-color:#28a745 !important}.bg-green-light{background-color:#dcffe4 !important}.bg-red{background-color:#d73a49 !important}.bg-red-light{background-color:#ffdce0 !important}.bg-yellow{background-color:#ffd33d !important}.bg-yellow-light{background-color:#fff5b1 !important}.bg-purple{background-color:#6f42c1 !important}.bg-purple-light{background-color:#f5f0ff !important}.bg-shade-gradient{background-image:linear-gradient(180deg, rgba(27,31,35,0.065), rgba(27,31,35,0)) !important;background-repeat:no-repeat !important;background-size:100% 200px !important}.text-blue{color:#0366d6 !important}.text-red{color:#cb2431 !important}.text-gray-light{color:#6a737d !important}.text-gray{color:#586069 !important}.text-gray-dark{color:#24292e !important}.text-green{color:#28a745 !important}.text-orange{color:#a04100 !important}.text-orange-light{color:#e36209 !important}.text-purple{color:#6f42c1 !important}.text-white{color:#fff !important}.text-inherit{color:inherit !important}.text-pending{color:#b08800 !important}.bg-pending{color:#dbab09 !important}.link-gray{color:#586069 !important}.link-gray:hover{color:#0366d6 !important}.link-gray-dark{color:#24292e !important}.link-gray-dark:hover{color:#0366d6 !important}.link-hover-blue:hover{color:#0366d6 !important}.muted-link{color:#586069 !important}.muted-link:hover{color:#0366d6 !important;text-decoration:none}.details-overlay[open]>summary::before{position:fixed;top:0;right:0;bottom:0;left:0;z-index:80;display:block;cursor:default;content:" ";background:transparent}.details-overlay-dark[open]>summary::before{z-index:99;background:rgba(27,31,35,0.5)}.flex-row{flex-direction:row !important}.flex-row-reverse{flex-direction:row-reverse !important}.flex-column{flex-direction:column !important}.flex-wrap{flex-wrap:wrap !important}.flex-nowrap{flex-wrap:nowrap !important}.flex-justify-start{justify-content:flex-start !important}.flex-justify-end{justify-content:flex-end !important}.flex-justify-center{justify-content:center !important}.flex-justify-between{justify-content:space-between !important}.flex-justify-around{justify-content:space-around !important}.flex-items-start{align-items:flex-start !important}.flex-items-end{align-items:flex-end !important}.flex-items-center{align-items:center !important}.flex-items-baseline{align-items:baseline !important}.flex-items-stretch{align-items:stretch !important}.flex-content-start{align-content:flex-start !important}.flex-content-end{align-content:flex-end !important}.flex-content-center{align-content:center !important}.flex-content-between{align-content:space-between !important}.flex-content-around{align-content:space-around !important}.flex-content-stretch{align-content:stretch !important}.flex-auto{flex:1 1 auto !important}.flex-shrink-0{flex-shrink:0 !important}.flex-self-auto{align-self:auto !important}.flex-self-start{align-self:flex-start !important}.flex-self-end{align-self:flex-end !important}.flex-self-center{align-self:center 
!important}.flex-self-baseline{align-self:baseline !important}.flex-self-stretch{align-self:stretch !important}.flex-item-equal{flex-grow:1;flex-basis:0}@media (min-width: 544px){.flex-sm-row{flex-direction:row !important}.flex-sm-row-reverse{flex-direction:row-reverse !important}.flex-sm-column{flex-direction:column !important}.flex-sm-wrap{flex-wrap:wrap !important}.flex-sm-nowrap{flex-wrap:nowrap !important}.flex-sm-justify-start{justify-content:flex-start !important}.flex-sm-justify-end{justify-content:flex-end !important}.flex-sm-justify-center{justify-content:center !important}.flex-sm-justify-between{justify-content:space-between !important}.flex-sm-justify-around{justify-content:space-around !important}.flex-sm-items-start{align-items:flex-start !important}.flex-sm-items-end{align-items:flex-end !important}.flex-sm-items-center{align-items:center !important}.flex-sm-items-baseline{align-items:baseline !important}.flex-sm-items-stretch{align-items:stretch !important}.flex-sm-content-start{align-content:flex-start !important}.flex-sm-content-end{align-content:flex-end !important}.flex-sm-content-center{align-content:center !important}.flex-sm-content-between{align-content:space-between !important}.flex-sm-content-around{align-content:space-around !important}.flex-sm-content-stretch{align-content:stretch !important}.flex-sm-auto{flex:1 1 auto !important}.flex-sm-shrink-0{flex-shrink:0 !important}.flex-sm-self-auto{align-self:auto !important}.flex-sm-self-start{align-self:flex-start !important}.flex-sm-self-end{align-self:flex-end !important}.flex-sm-self-center{align-self:center !important}.flex-sm-self-baseline{align-self:baseline !important}.flex-sm-self-stretch{align-self:stretch !important}.flex-sm-item-equal{flex-grow:1;flex-basis:0}}@media (min-width: 768px){.flex-md-row{flex-direction:row !important}.flex-md-row-reverse{flex-direction:row-reverse !important}.flex-md-column{flex-direction:column !important}.flex-md-wrap{flex-wrap:wrap !important}.flex-md-nowrap{flex-wrap:nowrap !important}.flex-md-justify-start{justify-content:flex-start !important}.flex-md-justify-end{justify-content:flex-end !important}.flex-md-justify-center{justify-content:center !important}.flex-md-justify-between{justify-content:space-between !important}.flex-md-justify-around{justify-content:space-around !important}.flex-md-items-start{align-items:flex-start !important}.flex-md-items-end{align-items:flex-end !important}.flex-md-items-center{align-items:center !important}.flex-md-items-baseline{align-items:baseline !important}.flex-md-items-stretch{align-items:stretch !important}.flex-md-content-start{align-content:flex-start !important}.flex-md-content-end{align-content:flex-end !important}.flex-md-content-center{align-content:center !important}.flex-md-content-between{align-content:space-between !important}.flex-md-content-around{align-content:space-around !important}.flex-md-content-stretch{align-content:stretch !important}.flex-md-auto{flex:1 1 auto !important}.flex-md-shrink-0{flex-shrink:0 !important}.flex-md-self-auto{align-self:auto !important}.flex-md-self-start{align-self:flex-start !important}.flex-md-self-end{align-self:flex-end !important}.flex-md-self-center{align-self:center !important}.flex-md-self-baseline{align-self:baseline !important}.flex-md-self-stretch{align-self:stretch !important}.flex-md-item-equal{flex-grow:1;flex-basis:0}}@media (min-width: 1012px){.flex-lg-row{flex-direction:row !important}.flex-lg-row-reverse{flex-direction:row-reverse 
!important}.flex-lg-column{flex-direction:column !important}.flex-lg-wrap{flex-wrap:wrap !important}.flex-lg-nowrap{flex-wrap:nowrap !important}.flex-lg-justify-start{justify-content:flex-start !important}.flex-lg-justify-end{justify-content:flex-end !important}.flex-lg-justify-center{justify-content:center !important}.flex-lg-justify-between{justify-content:space-between !important}.flex-lg-justify-around{justify-content:space-around !important}.flex-lg-items-start{align-items:flex-start !important}.flex-lg-items-end{align-items:flex-end !important}.flex-lg-items-center{align-items:center !important}.flex-lg-items-baseline{align-items:baseline !important}.flex-lg-items-stretch{align-items:stretch !important}.flex-lg-content-start{align-content:flex-start !important}.flex-lg-content-end{align-content:flex-end !important}.flex-lg-content-center{align-content:center !important}.flex-lg-content-between{align-content:space-between !important}.flex-lg-content-around{align-content:space-around !important}.flex-lg-content-stretch{align-content:stretch !important}.flex-lg-auto{flex:1 1 auto !important}.flex-lg-shrink-0{flex-shrink:0 !important}.flex-lg-self-auto{align-self:auto !important}.flex-lg-self-start{align-self:flex-start !important}.flex-lg-self-end{align-self:flex-end !important}.flex-lg-self-center{align-self:center !important}.flex-lg-self-baseline{align-self:baseline !important}.flex-lg-self-stretch{align-self:stretch !important}.flex-lg-item-equal{flex-grow:1;flex-basis:0}}@media (min-width: 1280px){.flex-xl-row{flex-direction:row !important}.flex-xl-row-reverse{flex-direction:row-reverse !important}.flex-xl-column{flex-direction:column !important}.flex-xl-wrap{flex-wrap:wrap !important}.flex-xl-nowrap{flex-wrap:nowrap !important}.flex-xl-justify-start{justify-content:flex-start !important}.flex-xl-justify-end{justify-content:flex-end !important}.flex-xl-justify-center{justify-content:center !important}.flex-xl-justify-between{justify-content:space-between !important}.flex-xl-justify-around{justify-content:space-around !important}.flex-xl-items-start{align-items:flex-start !important}.flex-xl-items-end{align-items:flex-end !important}.flex-xl-items-center{align-items:center !important}.flex-xl-items-baseline{align-items:baseline !important}.flex-xl-items-stretch{align-items:stretch !important}.flex-xl-content-start{align-content:flex-start !important}.flex-xl-content-end{align-content:flex-end !important}.flex-xl-content-center{align-content:center !important}.flex-xl-content-between{align-content:space-between !important}.flex-xl-content-around{align-content:space-around !important}.flex-xl-content-stretch{align-content:stretch !important}.flex-xl-auto{flex:1 1 auto !important}.flex-xl-shrink-0{flex-shrink:0 !important}.flex-xl-self-auto{align-self:auto !important}.flex-xl-self-start{align-self:flex-start !important}.flex-xl-self-end{align-self:flex-end !important}.flex-xl-self-center{align-self:center !important}.flex-xl-self-baseline{align-self:baseline !important}.flex-xl-self-stretch{align-self:stretch !important}.flex-xl-item-equal{flex-grow:1;flex-basis:0}}.position-static{position:static !important}.position-relative{position:relative !important}.position-absolute{position:absolute !important}.position-fixed{position:fixed !important}.top-0{top:0 !important}.right-0{right:0 !important}.bottom-0{bottom:0 !important}.left-0{left:0 !important}.v-align-middle{vertical-align:middle !important}.v-align-top{vertical-align:top !important}.v-align-bottom{vertical-align:bottom 
!important}.v-align-text-top{vertical-align:text-top !important}.v-align-text-bottom{vertical-align:text-bottom !important}.v-align-baseline{vertical-align:baseline !important}.overflow-hidden{overflow:hidden !important}.overflow-scroll{overflow:scroll !important}.overflow-auto{overflow:auto !important}.clearfix::before{display:table;content:""}.clearfix::after{display:table;clear:both;content:""}.float-left{float:left !important}.float-right{float:right !important}.float-none{float:none !important}@media (min-width: 544px){.float-sm-left{float:left !important}.float-sm-right{float:right !important}.float-sm-none{float:none !important}}@media (min-width: 768px){.float-md-left{float:left !important}.float-md-right{float:right !important}.float-md-none{float:none !important}}@media (min-width: 1012px){.float-lg-left{float:left !important}.float-lg-right{float:right !important}.float-lg-none{float:none !important}}@media (min-width: 1280px){.float-xl-left{float:left !important}.float-xl-right{float:right !important}.float-xl-none{float:none !important}}.width-fit{max-width:100% !important}.width-full{width:100% !important}.height-fit{max-height:100% !important}.height-full{height:100% !important}.min-width-0{min-width:0 !important}.direction-rtl{direction:rtl !important}.direction-ltr{direction:ltr !important}@media (min-width: 544px){.direction-sm-rtl{direction:rtl !important}.direction-sm-ltr{direction:ltr !important}}@media (min-width: 768px){.direction-md-rtl{direction:rtl !important}.direction-md-ltr{direction:ltr !important}}@media (min-width: 1012px){.direction-lg-rtl{direction:rtl !important}.direction-lg-ltr{direction:ltr !important}}@media (min-width: 1280px){.direction-xl-rtl{direction:rtl !important}.direction-xl-ltr{direction:ltr !important}}.m-0{margin:0 !important}.mt-0{margin-top:0 !important}.mr-0{margin-right:0 !important}.mb-0{margin-bottom:0 !important}.ml-0{margin-left:0 !important}.mx-0{margin-right:0 !important;margin-left:0 !important}.my-0{margin-top:0 !important;margin-bottom:0 !important}.m-1{margin:4px !important}.mt-1{margin-top:4px !important}.mr-1{margin-right:4px !important}.mb-1{margin-bottom:4px !important}.ml-1{margin-left:4px !important}.mt-n1{margin-top:-4px !important}.mr-n1{margin-right:-4px !important}.mb-n1{margin-bottom:-4px !important}.ml-n1{margin-left:-4px !important}.mx-1{margin-right:4px !important;margin-left:4px !important}.my-1{margin-top:4px !important;margin-bottom:4px !important}.m-2{margin:8px !important}.mt-2{margin-top:8px !important}.mr-2{margin-right:8px !important}.mb-2{margin-bottom:8px !important}.ml-2{margin-left:8px !important}.mt-n2{margin-top:-8px !important}.mr-n2{margin-right:-8px !important}.mb-n2{margin-bottom:-8px !important}.ml-n2{margin-left:-8px !important}.mx-2{margin-right:8px !important;margin-left:8px !important}.my-2{margin-top:8px !important;margin-bottom:8px !important}.m-3{margin:16px !important}.mt-3{margin-top:16px !important}.mr-3{margin-right:16px !important}.mb-3{margin-bottom:16px !important}.ml-3{margin-left:16px !important}.mt-n3{margin-top:-16px !important}.mr-n3{margin-right:-16px !important}.mb-n3{margin-bottom:-16px !important}.ml-n3{margin-left:-16px !important}.mx-3{margin-right:16px !important;margin-left:16px !important}.my-3{margin-top:16px !important;margin-bottom:16px !important}.m-4{margin:24px !important}.mt-4{margin-top:24px !important}.mr-4{margin-right:24px !important}.mb-4{margin-bottom:24px !important}.ml-4{margin-left:24px !important}.mt-n4{margin-top:-24px 
!important}.mr-n4{margin-right:-24px !important}.mb-n4{margin-bottom:-24px !important}.ml-n4{margin-left:-24px !important}.mx-4{margin-right:24px !important;margin-left:24px !important}.my-4{margin-top:24px !important;margin-bottom:24px !important}.m-5{margin:32px !important}.mt-5{margin-top:32px !important}.mr-5{margin-right:32px !important}.mb-5{margin-bottom:32px !important}.ml-5{margin-left:32px !important}.mt-n5{margin-top:-32px !important}.mr-n5{margin-right:-32px !important}.mb-n5{margin-bottom:-32px !important}.ml-n5{margin-left:-32px !important}.mx-5{margin-right:32px !important;margin-left:32px !important}.my-5{margin-top:32px !important;margin-bottom:32px !important}.m-6{margin:40px !important}.mt-6{margin-top:40px !important}.mr-6{margin-right:40px !important}.mb-6{margin-bottom:40px !important}.ml-6{margin-left:40px !important}.mt-n6{margin-top:-40px !important}.mr-n6{margin-right:-40px !important}.mb-n6{margin-bottom:-40px !important}.ml-n6{margin-left:-40px !important}.mx-6{margin-right:40px !important;margin-left:40px !important}.my-6{margin-top:40px !important;margin-bottom:40px !important}.mx-auto{margin-right:auto !important;margin-left:auto !important}@media (min-width: 544px){.m-sm-0{margin:0 !important}.mt-sm-0{margin-top:0 !important}.mr-sm-0{margin-right:0 !important}.mb-sm-0{margin-bottom:0 !important}.ml-sm-0{margin-left:0 !important}.mx-sm-0{margin-right:0 !important;margin-left:0 !important}.my-sm-0{margin-top:0 !important;margin-bottom:0 !important}.m-sm-1{margin:4px !important}.mt-sm-1{margin-top:4px !important}.mr-sm-1{margin-right:4px !important}.mb-sm-1{margin-bottom:4px !important}.ml-sm-1{margin-left:4px !important}.mt-sm-n1{margin-top:-4px !important}.mr-sm-n1{margin-right:-4px !important}.mb-sm-n1{margin-bottom:-4px !important}.ml-sm-n1{margin-left:-4px !important}.mx-sm-1{margin-right:4px !important;margin-left:4px !important}.my-sm-1{margin-top:4px !important;margin-bottom:4px !important}.m-sm-2{margin:8px !important}.mt-sm-2{margin-top:8px !important}.mr-sm-2{margin-right:8px !important}.mb-sm-2{margin-bottom:8px !important}.ml-sm-2{margin-left:8px !important}.mt-sm-n2{margin-top:-8px !important}.mr-sm-n2{margin-right:-8px !important}.mb-sm-n2{margin-bottom:-8px !important}.ml-sm-n2{margin-left:-8px !important}.mx-sm-2{margin-right:8px !important;margin-left:8px !important}.my-sm-2{margin-top:8px !important;margin-bottom:8px !important}.m-sm-3{margin:16px !important}.mt-sm-3{margin-top:16px !important}.mr-sm-3{margin-right:16px !important}.mb-sm-3{margin-bottom:16px !important}.ml-sm-3{margin-left:16px !important}.mt-sm-n3{margin-top:-16px !important}.mr-sm-n3{margin-right:-16px !important}.mb-sm-n3{margin-bottom:-16px !important}.ml-sm-n3{margin-left:-16px !important}.mx-sm-3{margin-right:16px !important;margin-left:16px !important}.my-sm-3{margin-top:16px !important;margin-bottom:16px !important}.m-sm-4{margin:24px !important}.mt-sm-4{margin-top:24px !important}.mr-sm-4{margin-right:24px !important}.mb-sm-4{margin-bottom:24px !important}.ml-sm-4{margin-left:24px !important}.mt-sm-n4{margin-top:-24px !important}.mr-sm-n4{margin-right:-24px !important}.mb-sm-n4{margin-bottom:-24px !important}.ml-sm-n4{margin-left:-24px !important}.mx-sm-4{margin-right:24px !important;margin-left:24px !important}.my-sm-4{margin-top:24px !important;margin-bottom:24px !important}.m-sm-5{margin:32px !important}.mt-sm-5{margin-top:32px !important}.mr-sm-5{margin-right:32px !important}.mb-sm-5{margin-bottom:32px !important}.ml-sm-5{margin-left:32px 
!important}.mt-sm-n5{margin-top:-32px !important}.mr-sm-n5{margin-right:-32px !important}.mb-sm-n5{margin-bottom:-32px !important}.ml-sm-n5{margin-left:-32px !important}.mx-sm-5{margin-right:32px !important;margin-left:32px !important}.my-sm-5{margin-top:32px !important;margin-bottom:32px !important}.m-sm-6{margin:40px !important}.mt-sm-6{margin-top:40px !important}.mr-sm-6{margin-right:40px !important}.mb-sm-6{margin-bottom:40px !important}.ml-sm-6{margin-left:40px !important}.mt-sm-n6{margin-top:-40px !important}.mr-sm-n6{margin-right:-40px !important}.mb-sm-n6{margin-bottom:-40px !important}.ml-sm-n6{margin-left:-40px !important}.mx-sm-6{margin-right:40px !important;margin-left:40px !important}.my-sm-6{margin-top:40px !important;margin-bottom:40px !important}.mx-sm-auto{margin-right:auto !important;margin-left:auto !important}}@media (min-width: 768px){.m-md-0{margin:0 !important}.mt-md-0{margin-top:0 !important}.mr-md-0{margin-right:0 !important}.mb-md-0{margin-bottom:0 !important}.ml-md-0{margin-left:0 !important}.mx-md-0{margin-right:0 !important;margin-left:0 !important}.my-md-0{margin-top:0 !important;margin-bottom:0 !important}.m-md-1{margin:4px !important}.mt-md-1{margin-top:4px !important}.mr-md-1{margin-right:4px !important}.mb-md-1{margin-bottom:4px !important}.ml-md-1{margin-left:4px !important}.mt-md-n1{margin-top:-4px !important}.mr-md-n1{margin-right:-4px !important}.mb-md-n1{margin-bottom:-4px !important}.ml-md-n1{margin-left:-4px !important}.mx-md-1{margin-right:4px !important;margin-left:4px !important}.my-md-1{margin-top:4px !important;margin-bottom:4px !important}.m-md-2{margin:8px !important}.mt-md-2{margin-top:8px !important}.mr-md-2{margin-right:8px !important}.mb-md-2{margin-bottom:8px !important}.ml-md-2{margin-left:8px !important}.mt-md-n2{margin-top:-8px !important}.mr-md-n2{margin-right:-8px !important}.mb-md-n2{margin-bottom:-8px !important}.ml-md-n2{margin-left:-8px !important}.mx-md-2{margin-right:8px !important;margin-left:8px !important}.my-md-2{margin-top:8px !important;margin-bottom:8px !important}.m-md-3{margin:16px !important}.mt-md-3{margin-top:16px !important}.mr-md-3{margin-right:16px !important}.mb-md-3{margin-bottom:16px !important}.ml-md-3{margin-left:16px !important}.mt-md-n3{margin-top:-16px !important}.mr-md-n3{margin-right:-16px !important}.mb-md-n3{margin-bottom:-16px !important}.ml-md-n3{margin-left:-16px !important}.mx-md-3{margin-right:16px !important;margin-left:16px !important}.my-md-3{margin-top:16px !important;margin-bottom:16px !important}.m-md-4{margin:24px !important}.mt-md-4{margin-top:24px !important}.mr-md-4{margin-right:24px !important}.mb-md-4{margin-bottom:24px !important}.ml-md-4{margin-left:24px !important}.mt-md-n4{margin-top:-24px !important}.mr-md-n4{margin-right:-24px !important}.mb-md-n4{margin-bottom:-24px !important}.ml-md-n4{margin-left:-24px !important}.mx-md-4{margin-right:24px !important;margin-left:24px !important}.my-md-4{margin-top:24px !important;margin-bottom:24px !important}.m-md-5{margin:32px !important}.mt-md-5{margin-top:32px !important}.mr-md-5{margin-right:32px !important}.mb-md-5{margin-bottom:32px !important}.ml-md-5{margin-left:32px !important}.mt-md-n5{margin-top:-32px !important}.mr-md-n5{margin-right:-32px !important}.mb-md-n5{margin-bottom:-32px !important}.ml-md-n5{margin-left:-32px !important}.mx-md-5{margin-right:32px !important;margin-left:32px !important}.my-md-5{margin-top:32px !important;margin-bottom:32px !important}.m-md-6{margin:40px !important}.mt-md-6{margin-top:40px 
!important}.mr-md-6{margin-right:40px !important}.mb-md-6{margin-bottom:40px !important}.ml-md-6{margin-left:40px !important}.mt-md-n6{margin-top:-40px !important}.mr-md-n6{margin-right:-40px !important}.mb-md-n6{margin-bottom:-40px !important}.ml-md-n6{margin-left:-40px !important}.mx-md-6{margin-right:40px !important;margin-left:40px !important}.my-md-6{margin-top:40px !important;margin-bottom:40px !important}.mx-md-auto{margin-right:auto !important;margin-left:auto !important}}@media (min-width: 1012px){.m-lg-0{margin:0 !important}.mt-lg-0{margin-top:0 !important}.mr-lg-0{margin-right:0 !important}.mb-lg-0{margin-bottom:0 !important}.ml-lg-0{margin-left:0 !important}.mx-lg-0{margin-right:0 !important;margin-left:0 !important}.my-lg-0{margin-top:0 !important;margin-bottom:0 !important}.m-lg-1{margin:4px !important}.mt-lg-1{margin-top:4px !important}.mr-lg-1{margin-right:4px !important}.mb-lg-1{margin-bottom:4px !important}.ml-lg-1{margin-left:4px !important}.mt-lg-n1{margin-top:-4px !important}.mr-lg-n1{margin-right:-4px !important}.mb-lg-n1{margin-bottom:-4px !important}.ml-lg-n1{margin-left:-4px !important}.mx-lg-1{margin-right:4px !important;margin-left:4px !important}.my-lg-1{margin-top:4px !important;margin-bottom:4px !important}.m-lg-2{margin:8px !important}.mt-lg-2{margin-top:8px !important}.mr-lg-2{margin-right:8px !important}.mb-lg-2{margin-bottom:8px !important}.ml-lg-2{margin-left:8px !important}.mt-lg-n2{margin-top:-8px !important}.mr-lg-n2{margin-right:-8px !important}.mb-lg-n2{margin-bottom:-8px !important}.ml-lg-n2{margin-left:-8px !important}.mx-lg-2{margin-right:8px !important;margin-left:8px !important}.my-lg-2{margin-top:8px !important;margin-bottom:8px !important}.m-lg-3{margin:16px !important}.mt-lg-3{margin-top:16px !important}.mr-lg-3{margin-right:16px !important}.mb-lg-3{margin-bottom:16px !important}.ml-lg-3{margin-left:16px !important}.mt-lg-n3{margin-top:-16px !important}.mr-lg-n3{margin-right:-16px !important}.mb-lg-n3{margin-bottom:-16px !important}.ml-lg-n3{margin-left:-16px !important}.mx-lg-3{margin-right:16px !important;margin-left:16px !important}.my-lg-3{margin-top:16px !important;margin-bottom:16px !important}.m-lg-4{margin:24px !important}.mt-lg-4{margin-top:24px !important}.mr-lg-4{margin-right:24px !important}.mb-lg-4{margin-bottom:24px !important}.ml-lg-4{margin-left:24px !important}.mt-lg-n4{margin-top:-24px !important}.mr-lg-n4{margin-right:-24px !important}.mb-lg-n4{margin-bottom:-24px !important}.ml-lg-n4{margin-left:-24px !important}.mx-lg-4{margin-right:24px !important;margin-left:24px !important}.my-lg-4{margin-top:24px !important;margin-bottom:24px !important}.m-lg-5{margin:32px !important}.mt-lg-5{margin-top:32px !important}.mr-lg-5{margin-right:32px !important}.mb-lg-5{margin-bottom:32px !important}.ml-lg-5{margin-left:32px !important}.mt-lg-n5{margin-top:-32px !important}.mr-lg-n5{margin-right:-32px !important}.mb-lg-n5{margin-bottom:-32px !important}.ml-lg-n5{margin-left:-32px !important}.mx-lg-5{margin-right:32px !important;margin-left:32px !important}.my-lg-5{margin-top:32px !important;margin-bottom:32px !important}.m-lg-6{margin:40px !important}.mt-lg-6{margin-top:40px !important}.mr-lg-6{margin-right:40px !important}.mb-lg-6{margin-bottom:40px !important}.ml-lg-6{margin-left:40px !important}.mt-lg-n6{margin-top:-40px !important}.mr-lg-n6{margin-right:-40px !important}.mb-lg-n6{margin-bottom:-40px !important}.ml-lg-n6{margin-left:-40px !important}.mx-lg-6{margin-right:40px !important;margin-left:40px 
!important}.my-lg-6{margin-top:40px !important;margin-bottom:40px !important}.mx-lg-auto{margin-right:auto !important;margin-left:auto !important}}@media (min-width: 1280px){.m-xl-0{margin:0 !important}.mt-xl-0{margin-top:0 !important}.mr-xl-0{margin-right:0 !important}.mb-xl-0{margin-bottom:0 !important}.ml-xl-0{margin-left:0 !important}.mx-xl-0{margin-right:0 !important;margin-left:0 !important}.my-xl-0{margin-top:0 !important;margin-bottom:0 !important}.m-xl-1{margin:4px !important}.mt-xl-1{margin-top:4px !important}.mr-xl-1{margin-right:4px !important}.mb-xl-1{margin-bottom:4px !important}.ml-xl-1{margin-left:4px !important}.mt-xl-n1{margin-top:-4px !important}.mr-xl-n1{margin-right:-4px !important}.mb-xl-n1{margin-bottom:-4px !important}.ml-xl-n1{margin-left:-4px !important}.mx-xl-1{margin-right:4px !important;margin-left:4px !important}.my-xl-1{margin-top:4px !important;margin-bottom:4px !important}.m-xl-2{margin:8px !important}.mt-xl-2{margin-top:8px !important}.mr-xl-2{margin-right:8px !important}.mb-xl-2{margin-bottom:8px !important}.ml-xl-2{margin-left:8px !important}.mt-xl-n2{margin-top:-8px !important}.mr-xl-n2{margin-right:-8px !important}.mb-xl-n2{margin-bottom:-8px !important}.ml-xl-n2{margin-left:-8px !important}.mx-xl-2{margin-right:8px !important;margin-left:8px !important}.my-xl-2{margin-top:8px !important;margin-bottom:8px !important}.m-xl-3{margin:16px !important}.mt-xl-3{margin-top:16px !important}.mr-xl-3{margin-right:16px !important}.mb-xl-3{margin-bottom:16px !important}.ml-xl-3{margin-left:16px !important}.mt-xl-n3{margin-top:-16px !important}.mr-xl-n3{margin-right:-16px !important}.mb-xl-n3{margin-bottom:-16px !important}.ml-xl-n3{margin-left:-16px !important}.mx-xl-3{margin-right:16px !important;margin-left:16px !important}.my-xl-3{margin-top:16px !important;margin-bottom:16px !important}.m-xl-4{margin:24px !important}.mt-xl-4{margin-top:24px !important}.mr-xl-4{margin-right:24px !important}.mb-xl-4{margin-bottom:24px !important}.ml-xl-4{margin-left:24px !important}.mt-xl-n4{margin-top:-24px !important}.mr-xl-n4{margin-right:-24px !important}.mb-xl-n4{margin-bottom:-24px !important}.ml-xl-n4{margin-left:-24px !important}.mx-xl-4{margin-right:24px !important;margin-left:24px !important}.my-xl-4{margin-top:24px !important;margin-bottom:24px !important}.m-xl-5{margin:32px !important}.mt-xl-5{margin-top:32px !important}.mr-xl-5{margin-right:32px !important}.mb-xl-5{margin-bottom:32px !important}.ml-xl-5{margin-left:32px !important}.mt-xl-n5{margin-top:-32px !important}.mr-xl-n5{margin-right:-32px !important}.mb-xl-n5{margin-bottom:-32px !important}.ml-xl-n5{margin-left:-32px !important}.mx-xl-5{margin-right:32px !important;margin-left:32px !important}.my-xl-5{margin-top:32px !important;margin-bottom:32px !important}.m-xl-6{margin:40px !important}.mt-xl-6{margin-top:40px !important}.mr-xl-6{margin-right:40px !important}.mb-xl-6{margin-bottom:40px !important}.ml-xl-6{margin-left:40px !important}.mt-xl-n6{margin-top:-40px !important}.mr-xl-n6{margin-right:-40px !important}.mb-xl-n6{margin-bottom:-40px !important}.ml-xl-n6{margin-left:-40px !important}.mx-xl-6{margin-right:40px !important;margin-left:40px !important}.my-xl-6{margin-top:40px !important;margin-bottom:40px !important}.mx-xl-auto{margin-right:auto !important;margin-left:auto !important}}.p-0{padding:0 !important}.pt-0{padding-top:0 !important}.pr-0{padding-right:0 !important}.pb-0{padding-bottom:0 !important}.pl-0{padding-left:0 !important}.px-0{padding-right:0 !important;padding-left:0 
!important}.py-0{padding-top:0 !important;padding-bottom:0 !important}.p-1{padding:4px !important}.pt-1{padding-top:4px !important}.pr-1{padding-right:4px !important}.pb-1{padding-bottom:4px !important}.pl-1{padding-left:4px !important}.px-1{padding-right:4px !important;padding-left:4px !important}.py-1{padding-top:4px !important;padding-bottom:4px !important}.p-2{padding:8px !important}.pt-2{padding-top:8px !important}.pr-2{padding-right:8px !important}.pb-2{padding-bottom:8px !important}.pl-2{padding-left:8px !important}.px-2{padding-right:8px !important;padding-left:8px !important}.py-2{padding-top:8px !important;padding-bottom:8px !important}.p-3{padding:16px !important}.pt-3{padding-top:16px !important}.pr-3{padding-right:16px !important}.pb-3{padding-bottom:16px !important}.pl-3{padding-left:16px !important}.px-3{padding-right:16px !important;padding-left:16px !important}.py-3{padding-top:16px !important;padding-bottom:16px !important}.p-4{padding:24px !important}.pt-4{padding-top:24px !important}.pr-4{padding-right:24px !important}.pb-4{padding-bottom:24px !important}.pl-4{padding-left:24px !important}.px-4{padding-right:24px !important;padding-left:24px !important}.py-4{padding-top:24px !important;padding-bottom:24px !important}.p-5{padding:32px !important}.pt-5{padding-top:32px !important}.pr-5{padding-right:32px !important}.pb-5{padding-bottom:32px !important}.pl-5{padding-left:32px !important}.px-5{padding-right:32px !important;padding-left:32px !important}.py-5{padding-top:32px !important;padding-bottom:32px !important}.p-6{padding:40px !important}.pt-6{padding-top:40px !important}.pr-6{padding-right:40px !important}.pb-6{padding-bottom:40px !important}.pl-6{padding-left:40px !important}.px-6{padding-right:40px !important;padding-left:40px !important}.py-6{padding-top:40px !important;padding-bottom:40px !important}@media (min-width: 544px){.p-sm-0{padding:0 !important}.pt-sm-0{padding-top:0 !important}.pr-sm-0{padding-right:0 !important}.pb-sm-0{padding-bottom:0 !important}.pl-sm-0{padding-left:0 !important}.px-sm-0{padding-right:0 !important;padding-left:0 !important}.py-sm-0{padding-top:0 !important;padding-bottom:0 !important}.p-sm-1{padding:4px !important}.pt-sm-1{padding-top:4px !important}.pr-sm-1{padding-right:4px !important}.pb-sm-1{padding-bottom:4px !important}.pl-sm-1{padding-left:4px !important}.px-sm-1{padding-right:4px !important;padding-left:4px !important}.py-sm-1{padding-top:4px !important;padding-bottom:4px !important}.p-sm-2{padding:8px !important}.pt-sm-2{padding-top:8px !important}.pr-sm-2{padding-right:8px !important}.pb-sm-2{padding-bottom:8px !important}.pl-sm-2{padding-left:8px !important}.px-sm-2{padding-right:8px !important;padding-left:8px !important}.py-sm-2{padding-top:8px !important;padding-bottom:8px !important}.p-sm-3{padding:16px !important}.pt-sm-3{padding-top:16px !important}.pr-sm-3{padding-right:16px !important}.pb-sm-3{padding-bottom:16px !important}.pl-sm-3{padding-left:16px !important}.px-sm-3{padding-right:16px !important;padding-left:16px !important}.py-sm-3{padding-top:16px !important;padding-bottom:16px !important}.p-sm-4{padding:24px !important}.pt-sm-4{padding-top:24px !important}.pr-sm-4{padding-right:24px !important}.pb-sm-4{padding-bottom:24px !important}.pl-sm-4{padding-left:24px !important}.px-sm-4{padding-right:24px !important;padding-left:24px !important}.py-sm-4{padding-top:24px !important;padding-bottom:24px !important}.p-sm-5{padding:32px !important}.pt-sm-5{padding-top:32px !important}.pr-sm-5{padding-right:32px 
!important}.pb-sm-5{padding-bottom:32px !important}.pl-sm-5{padding-left:32px !important}.px-sm-5{padding-right:32px !important;padding-left:32px !important}.py-sm-5{padding-top:32px !important;padding-bottom:32px !important}.p-sm-6{padding:40px !important}.pt-sm-6{padding-top:40px !important}.pr-sm-6{padding-right:40px !important}.pb-sm-6{padding-bottom:40px !important}.pl-sm-6{padding-left:40px !important}.px-sm-6{padding-right:40px !important;padding-left:40px !important}.py-sm-6{padding-top:40px !important;padding-bottom:40px !important}}@media (min-width: 768px){.p-md-0{padding:0 !important}.pt-md-0{padding-top:0 !important}.pr-md-0{padding-right:0 !important}.pb-md-0{padding-bottom:0 !important}.pl-md-0{padding-left:0 !important}.px-md-0{padding-right:0 !important;padding-left:0 !important}.py-md-0{padding-top:0 !important;padding-bottom:0 !important}.p-md-1{padding:4px !important}.pt-md-1{padding-top:4px !important}.pr-md-1{padding-right:4px !important}.pb-md-1{padding-bottom:4px !important}.pl-md-1{padding-left:4px !important}.px-md-1{padding-right:4px !important;padding-left:4px !important}.py-md-1{padding-top:4px !important;padding-bottom:4px !important}.p-md-2{padding:8px !important}.pt-md-2{padding-top:8px !important}.pr-md-2{padding-right:8px !important}.pb-md-2{padding-bottom:8px !important}.pl-md-2{padding-left:8px !important}.px-md-2{padding-right:8px !important;padding-left:8px !important}.py-md-2{padding-top:8px !important;padding-bottom:8px !important}.p-md-3{padding:16px !important}.pt-md-3{padding-top:16px !important}.pr-md-3{padding-right:16px !important}.pb-md-3{padding-bottom:16px !important}.pl-md-3{padding-left:16px !important}.px-md-3{padding-right:16px !important;padding-left:16px !important}.py-md-3{padding-top:16px !important;padding-bottom:16px !important}.p-md-4{padding:24px !important}.pt-md-4{padding-top:24px !important}.pr-md-4{padding-right:24px !important}.pb-md-4{padding-bottom:24px !important}.pl-md-4{padding-left:24px !important}.px-md-4{padding-right:24px !important;padding-left:24px !important}.py-md-4{padding-top:24px !important;padding-bottom:24px !important}.p-md-5{padding:32px !important}.pt-md-5{padding-top:32px !important}.pr-md-5{padding-right:32px !important}.pb-md-5{padding-bottom:32px !important}.pl-md-5{padding-left:32px !important}.px-md-5{padding-right:32px !important;padding-left:32px !important}.py-md-5{padding-top:32px !important;padding-bottom:32px !important}.p-md-6{padding:40px !important}.pt-md-6{padding-top:40px !important}.pr-md-6{padding-right:40px !important}.pb-md-6{padding-bottom:40px !important}.pl-md-6{padding-left:40px !important}.px-md-6{padding-right:40px !important;padding-left:40px !important}.py-md-6{padding-top:40px !important;padding-bottom:40px !important}}@media (min-width: 1012px){.p-lg-0{padding:0 !important}.pt-lg-0{padding-top:0 !important}.pr-lg-0{padding-right:0 !important}.pb-lg-0{padding-bottom:0 !important}.pl-lg-0{padding-left:0 !important}.px-lg-0{padding-right:0 !important;padding-left:0 !important}.py-lg-0{padding-top:0 !important;padding-bottom:0 !important}.p-lg-1{padding:4px !important}.pt-lg-1{padding-top:4px !important}.pr-lg-1{padding-right:4px !important}.pb-lg-1{padding-bottom:4px !important}.pl-lg-1{padding-left:4px !important}.px-lg-1{padding-right:4px !important;padding-left:4px !important}.py-lg-1{padding-top:4px !important;padding-bottom:4px !important}.p-lg-2{padding:8px !important}.pt-lg-2{padding-top:8px !important}.pr-lg-2{padding-right:8px !important}.pb-lg-2{padding-bottom:8px 
!important}.pl-lg-2{padding-left:8px !important}.px-lg-2{padding-right:8px !important;padding-left:8px !important}.py-lg-2{padding-top:8px !important;padding-bottom:8px !important}.p-lg-3{padding:16px !important}.pt-lg-3{padding-top:16px !important}.pr-lg-3{padding-right:16px !important}.pb-lg-3{padding-bottom:16px !important}.pl-lg-3{padding-left:16px !important}.px-lg-3{padding-right:16px !important;padding-left:16px !important}.py-lg-3{padding-top:16px !important;padding-bottom:16px !important}.p-lg-4{padding:24px !important}.pt-lg-4{padding-top:24px !important}.pr-lg-4{padding-right:24px !important}.pb-lg-4{padding-bottom:24px !important}.pl-lg-4{padding-left:24px !important}.px-lg-4{padding-right:24px !important;padding-left:24px !important}.py-lg-4{padding-top:24px !important;padding-bottom:24px !important}.p-lg-5{padding:32px !important}.pt-lg-5{padding-top:32px !important}.pr-lg-5{padding-right:32px !important}.pb-lg-5{padding-bottom:32px !important}.pl-lg-5{padding-left:32px !important}.px-lg-5{padding-right:32px !important;padding-left:32px !important}.py-lg-5{padding-top:32px !important;padding-bottom:32px !important}.p-lg-6{padding:40px !important}.pt-lg-6{padding-top:40px !important}.pr-lg-6{padding-right:40px !important}.pb-lg-6{padding-bottom:40px !important}.pl-lg-6{padding-left:40px !important}.px-lg-6{padding-right:40px !important;padding-left:40px !important}.py-lg-6{padding-top:40px !important;padding-bottom:40px !important}}@media (min-width: 1280px){.p-xl-0{padding:0 !important}.pt-xl-0{padding-top:0 !important}.pr-xl-0{padding-right:0 !important}.pb-xl-0{padding-bottom:0 !important}.pl-xl-0{padding-left:0 !important}.px-xl-0{padding-right:0 !important;padding-left:0 !important}.py-xl-0{padding-top:0 !important;padding-bottom:0 !important}.p-xl-1{padding:4px !important}.pt-xl-1{padding-top:4px !important}.pr-xl-1{padding-right:4px !important}.pb-xl-1{padding-bottom:4px !important}.pl-xl-1{padding-left:4px !important}.px-xl-1{padding-right:4px !important;padding-left:4px !important}.py-xl-1{padding-top:4px !important;padding-bottom:4px !important}.p-xl-2{padding:8px !important}.pt-xl-2{padding-top:8px !important}.pr-xl-2{padding-right:8px !important}.pb-xl-2{padding-bottom:8px !important}.pl-xl-2{padding-left:8px !important}.px-xl-2{padding-right:8px !important;padding-left:8px !important}.py-xl-2{padding-top:8px !important;padding-bottom:8px !important}.p-xl-3{padding:16px !important}.pt-xl-3{padding-top:16px !important}.pr-xl-3{padding-right:16px !important}.pb-xl-3{padding-bottom:16px !important}.pl-xl-3{padding-left:16px !important}.px-xl-3{padding-right:16px !important;padding-left:16px !important}.py-xl-3{padding-top:16px !important;padding-bottom:16px !important}.p-xl-4{padding:24px !important}.pt-xl-4{padding-top:24px !important}.pr-xl-4{padding-right:24px !important}.pb-xl-4{padding-bottom:24px !important}.pl-xl-4{padding-left:24px !important}.px-xl-4{padding-right:24px !important;padding-left:24px !important}.py-xl-4{padding-top:24px !important;padding-bottom:24px !important}.p-xl-5{padding:32px !important}.pt-xl-5{padding-top:32px !important}.pr-xl-5{padding-right:32px !important}.pb-xl-5{padding-bottom:32px !important}.pl-xl-5{padding-left:32px !important}.px-xl-5{padding-right:32px !important;padding-left:32px !important}.py-xl-5{padding-top:32px !important;padding-bottom:32px !important}.p-xl-6{padding:40px !important}.pt-xl-6{padding-top:40px !important}.pr-xl-6{padding-right:40px !important}.pb-xl-6{padding-bottom:40px 
!important}.pl-xl-6{padding-left:40px !important}.px-xl-6{padding-right:40px !important;padding-left:40px !important}.py-xl-6{padding-top:40px !important;padding-bottom:40px !important}}.p-responsive{padding-right:16px !important;padding-left:16px !important}@media (min-width: 544px){.p-responsive{padding-right:40px !important;padding-left:40px !important}}@media (min-width: 1012px){.p-responsive{padding-right:16px !important;padding-left:16px !important}}.h1{font-size:26px !important}@media (min-width: 768px){.h1{font-size:32px !important}}.h2{font-size:22px !important}@media (min-width: 768px){.h2{font-size:24px !important}}.h3{font-size:18px !important}@media (min-width: 768px){.h3{font-size:20px !important}}.h4{font-size:16px !important}.h5{font-size:14px !important}.h6{font-size:12px !important}.h1,.h2,.h3,.h4,.h5,.h6{font-weight:600 !important}.f1{font-size:26px !important}@media (min-width: 768px){.f1{font-size:32px !important}}.f2{font-size:22px !important}@media (min-width: 768px){.f2{font-size:24px !important}}.f3{font-size:18px !important}@media (min-width: 768px){.f3{font-size:20px !important}}.f4{font-size:16px !important}@media (min-width: 768px){.f4{font-size:16px !important}}.f5{font-size:14px !important}.f6{font-size:12px !important}.f00-light{font-size:40px !important;font-weight:300 !important}@media (min-width: 768px){.f00-light{font-size:48px !important}}.f0-light{font-size:32px !important;font-weight:300 !important}@media (min-width: 768px){.f0-light{font-size:40px !important}}.f1-light{font-size:26px !important;font-weight:300 !important}@media (min-width: 768px){.f1-light{font-size:32px !important}}.f2-light{font-size:22px !important;font-weight:300 !important}@media (min-width: 768px){.f2-light{font-size:24px !important}}.f3-light{font-size:18px !important;font-weight:300 !important}@media (min-width: 768px){.f3-light{font-size:20px !important}}.text-small{font-size:12px !important}.lead{margin-bottom:30px;font-size:20px;font-weight:300;color:#586069}.lh-condensed-ultra{line-height:1 !important}.lh-condensed{line-height:1.25 !important}.lh-default{line-height:1.5 !important}.lh-0{line-height:0 !important}.text-right{text-align:right !important}.text-left{text-align:left !important}.text-center{text-align:center !important}@media (min-width: 544px){.text-sm-right{text-align:right !important}.text-sm-left{text-align:left !important}.text-sm-center{text-align:center !important}}@media (min-width: 768px){.text-md-right{text-align:right !important}.text-md-left{text-align:left !important}.text-md-center{text-align:center !important}}@media (min-width: 1012px){.text-lg-right{text-align:right !important}.text-lg-left{text-align:left !important}.text-lg-center{text-align:center !important}}@media (min-width: 1280px){.text-xl-right{text-align:right !important}.text-xl-left{text-align:left !important}.text-xl-center{text-align:center !important}}.text-normal{font-weight:400 !important}.text-bold{font-weight:600 !important}.text-italic{font-style:italic !important}.text-uppercase{text-transform:uppercase !important}.text-underline{text-decoration:underline !important}.no-underline{text-decoration:none !important}.no-wrap{white-space:nowrap !important}.ws-normal{white-space:normal !important}.wb-break-all{word-break:break-all !important}.text-emphasized{font-weight:600;color:#24292e}.list-style-none{list-style:none !important}.text-shadow-dark{text-shadow:0 1px 1px rgba(27,31,35,0.25),0 1px 25px rgba(27,31,35,0.75)}.text-shadow-light{text-shadow:0 1px 0 
rgba(255,255,255,0.5)}.text-mono{font-family:"SFMono-Regular",Consolas,"Liberation Mono",Menlo,Courier,monospace}.user-select-none{-webkit-user-select:none !important;-moz-user-select:none !important;-ms-user-select:none !important;user-select:none !important}.d-block{display:block !important}.d-flex{display:flex !important}.d-inline{display:inline !important}.d-inline-block{display:inline-block !important}.d-inline-flex{display:inline-flex !important}.d-none{display:none !important}.d-table{display:table !important}.d-table-cell{display:table-cell !important}@media (min-width: 544px){.d-sm-block{display:block !important}.d-sm-flex{display:flex !important}.d-sm-inline{display:inline !important}.d-sm-inline-block{display:inline-block !important}.d-sm-inline-flex{display:inline-flex !important}.d-sm-none{display:none !important}.d-sm-table{display:table !important}.d-sm-table-cell{display:table-cell !important}}@media (min-width: 768px){.d-md-block{display:block !important}.d-md-flex{display:flex !important}.d-md-inline{display:inline !important}.d-md-inline-block{display:inline-block !important}.d-md-inline-flex{display:inline-flex !important}.d-md-none{display:none !important}.d-md-table{display:table !important}.d-md-table-cell{display:table-cell !important}}@media (min-width: 1012px){.d-lg-block{display:block !important}.d-lg-flex{display:flex !important}.d-lg-inline{display:inline !important}.d-lg-inline-block{display:inline-block !important}.d-lg-inline-flex{display:inline-flex !important}.d-lg-none{display:none !important}.d-lg-table{display:table !important}.d-lg-table-cell{display:table-cell !important}}@media (min-width: 1280px){.d-xl-block{display:block !important}.d-xl-flex{display:flex !important}.d-xl-inline{display:inline !important}.d-xl-inline-block{display:inline-block !important}.d-xl-inline-flex{display:inline-flex !important}.d-xl-none{display:none !important}.d-xl-table{display:table !important}.d-xl-table-cell{display:table-cell !important}}.v-hidden{visibility:hidden !important}.v-visible{visibility:visible !important}@media (max-width: 544px){.hide-sm{display:none !important}}@media (min-width: 544px) and (max-width: 768px){.hide-md{display:none !important}}@media (min-width: 768px) and (max-width: 1012px){.hide-lg{display:none !important}}@media (min-width: 1012px){.hide-xl{display:none !important}}.table-fixed{table-layout:fixed !important}.sr-only{position:absolute;width:1px;height:1px;padding:0;overflow:hidden;clip:rect(0, 0, 0, 0);word-wrap:normal;border:0}.show-on-focus{position:absolute;width:1px;height:1px;margin:0;overflow:hidden;clip:rect(1px, 1px, 1px, 
1px)}.show-on-focus:focus{z-index:20;width:auto;height:auto;clip:auto}.container{width:980px;margin-right:auto;margin-left:auto}.container::before{display:table;content:""}.container::after{display:table;clear:both;content:""}.container-md{max-width:768px;margin-right:auto;margin-left:auto}.container-lg{max-width:1012px;margin-right:auto;margin-left:auto}.container-xl{max-width:1280px;margin-right:auto;margin-left:auto}.columns{margin-right:-10px;margin-left:-10px}.columns::before{display:table;content:""}.columns::after{display:table;clear:both;content:""}.column{float:left;padding-right:10px;padding-left:10px}.one-third{width:33.333333%}.two-thirds{width:66.666667%}.one-fourth{width:25%}.one-half{width:50%}.three-fourths{width:75%}.one-fifth{width:20%}.four-fifths{width:80%}.centered{display:block;float:none;margin-right:auto;margin-left:auto}.col-1{width:8.3333333333%}.col-2{width:16.6666666667%}.col-3{width:25%}.col-4{width:33.3333333333%}.col-5{width:41.6666666667%}.col-6{width:50%}.col-7{width:58.3333333333%}.col-8{width:66.6666666667%}.col-9{width:75%}.col-10{width:83.3333333333%}.col-11{width:91.6666666667%}.col-12{width:100%}@media (min-width: 544px){.col-sm-1{width:8.3333333333%}.col-sm-2{width:16.6666666667%}.col-sm-3{width:25%}.col-sm-4{width:33.3333333333%}.col-sm-5{width:41.6666666667%}.col-sm-6{width:50%}.col-sm-7{width:58.3333333333%}.col-sm-8{width:66.6666666667%}.col-sm-9{width:75%}.col-sm-10{width:83.3333333333%}.col-sm-11{width:91.6666666667%}.col-sm-12{width:100%}}@media (min-width: 768px){.col-md-1{width:8.3333333333%}.col-md-2{width:16.6666666667%}.col-md-3{width:25%}.col-md-4{width:33.3333333333%}.col-md-5{width:41.6666666667%}.col-md-6{width:50%}.col-md-7{width:58.3333333333%}.col-md-8{width:66.6666666667%}.col-md-9{width:75%}.col-md-10{width:83.3333333333%}.col-md-11{width:91.6666666667%}.col-md-12{width:100%}}@media (min-width: 1012px){.col-lg-1{width:8.3333333333%}.col-lg-2{width:16.6666666667%}.col-lg-3{width:25%}.col-lg-4{width:33.3333333333%}.col-lg-5{width:41.6666666667%}.col-lg-6{width:50%}.col-lg-7{width:58.3333333333%}.col-lg-8{width:66.6666666667%}.col-lg-9{width:75%}.col-lg-10{width:83.3333333333%}.col-lg-11{width:91.6666666667%}.col-lg-12{width:100%}}@media (min-width: 1280px){.col-xl-1{width:8.3333333333%}.col-xl-2{width:16.6666666667%}.col-xl-3{width:25%}.col-xl-4{width:33.3333333333%}.col-xl-5{width:41.6666666667%}.col-xl-6{width:50%}.col-xl-7{width:58.3333333333%}.col-xl-8{width:66.6666666667%}.col-xl-9{width:75%}.col-xl-10{width:83.3333333333%}.col-xl-11{width:91.6666666667%}.col-xl-12{width:100%}}.gutter{margin-right:-16px;margin-left:-16px}.gutter>[class*="col-"]{padding-right:16px !important;padding-left:16px !important}.gutter-condensed{margin-right:-8px;margin-left:-8px}.gutter-condensed>[class*="col-"]{padding-right:8px !important;padding-left:8px !important}.gutter-spacious{margin-right:-24px;margin-left:-24px}.gutter-spacious>[class*="col-"]{padding-right:24px !important;padding-left:24px !important}@media (min-width: 544px){.gutter-sm{margin-right:-16px;margin-left:-16px}.gutter-sm>[class*="col-"]{padding-right:16px !important;padding-left:16px !important}.gutter-sm-condensed{margin-right:-8px;margin-left:-8px}.gutter-sm-condensed>[class*="col-"]{padding-right:8px !important;padding-left:8px !important}.gutter-sm-spacious{margin-right:-24px;margin-left:-24px}.gutter-sm-spacious>[class*="col-"]{padding-right:24px !important;padding-left:24px !important}}@media (min-width: 
768px){.gutter-md{margin-right:-16px;margin-left:-16px}.gutter-md>[class*="col-"]{padding-right:16px !important;padding-left:16px !important}.gutter-md-condensed{margin-right:-8px;margin-left:-8px}.gutter-md-condensed>[class*="col-"]{padding-right:8px !important;padding-left:8px !important}.gutter-md-spacious{margin-right:-24px;margin-left:-24px}.gutter-md-spacious>[class*="col-"]{padding-right:24px !important;padding-left:24px !important}}@media (min-width: 1012px){.gutter-lg{margin-right:-16px;margin-left:-16px}.gutter-lg>[class*="col-"]{padding-right:16px !important;padding-left:16px !important}.gutter-lg-condensed{margin-right:-8px;margin-left:-8px}.gutter-lg-condensed>[class*="col-"]{padding-right:8px !important;padding-left:8px !important}.gutter-lg-spacious{margin-right:-24px;margin-left:-24px}.gutter-lg-spacious>[class*="col-"]{padding-right:24px !important;padding-left:24px !important}}@media (min-width: 1280px){.gutter-xl{margin-right:-16px;margin-left:-16px}.gutter-xl>[class*="col-"]{padding-right:16px !important;padding-left:16px !important}.gutter-xl-condensed{margin-right:-8px;margin-left:-8px}.gutter-xl-condensed>[class*="col-"]{padding-right:8px !important;padding-left:8px !important}.gutter-xl-spacious{margin-right:-24px;margin-left:-24px}.gutter-xl-spacious>[class*="col-"]{padding-right:24px !important;padding-left:24px !important}}.offset-1{margin-left:8.3333333333% !important}.offset-2{margin-left:16.6666666667% !important}.offset-3{margin-left:25% !important}.offset-4{margin-left:33.3333333333% !important}.offset-5{margin-left:41.6666666667% !important}.offset-6{margin-left:50% !important}.offset-7{margin-left:58.3333333333% !important}.offset-8{margin-left:66.6666666667% !important}.offset-9{margin-left:75% !important}.offset-10{margin-left:83.3333333333% !important}.offset-11{margin-left:91.6666666667% !important}@media (min-width: 544px){.offset-sm-1{margin-left:8.3333333333% !important}.offset-sm-2{margin-left:16.6666666667% !important}.offset-sm-3{margin-left:25% !important}.offset-sm-4{margin-left:33.3333333333% !important}.offset-sm-5{margin-left:41.6666666667% !important}.offset-sm-6{margin-left:50% !important}.offset-sm-7{margin-left:58.3333333333% !important}.offset-sm-8{margin-left:66.6666666667% !important}.offset-sm-9{margin-left:75% !important}.offset-sm-10{margin-left:83.3333333333% !important}.offset-sm-11{margin-left:91.6666666667% !important}}@media (min-width: 768px){.offset-md-1{margin-left:8.3333333333% !important}.offset-md-2{margin-left:16.6666666667% !important}.offset-md-3{margin-left:25% !important}.offset-md-4{margin-left:33.3333333333% !important}.offset-md-5{margin-left:41.6666666667% !important}.offset-md-6{margin-left:50% !important}.offset-md-7{margin-left:58.3333333333% !important}.offset-md-8{margin-left:66.6666666667% !important}.offset-md-9{margin-left:75% !important}.offset-md-10{margin-left:83.3333333333% !important}.offset-md-11{margin-left:91.6666666667% !important}}@media (min-width: 1012px){.offset-lg-1{margin-left:8.3333333333% !important}.offset-lg-2{margin-left:16.6666666667% !important}.offset-lg-3{margin-left:25% !important}.offset-lg-4{margin-left:33.3333333333% !important}.offset-lg-5{margin-left:41.6666666667% !important}.offset-lg-6{margin-left:50% !important}.offset-lg-7{margin-left:58.3333333333% !important}.offset-lg-8{margin-left:66.6666666667% !important}.offset-lg-9{margin-left:75% !important}.offset-lg-10{margin-left:83.3333333333% !important}.offset-lg-11{margin-left:91.6666666667% !important}}@media 
(min-width: 1280px){.offset-xl-1{margin-left:8.3333333333% !important}.offset-xl-2{margin-left:16.6666666667% !important}.offset-xl-3{margin-left:25% !important}.offset-xl-4{margin-left:33.3333333333% !important}.offset-xl-5{margin-left:41.6666666667% !important}.offset-xl-6{margin-left:50% !important}.offset-xl-7{margin-left:58.3333333333% !important}.offset-xl-8{margin-left:66.6666666667% !important}.offset-xl-9{margin-left:75% !important}.offset-xl-10{margin-left:83.3333333333% !important}.offset-xl-11{margin-left:91.6666666667% !important}}.markdown-body{font-family:-apple-system,BlinkMacSystemFont,"Segoe UI",Helvetica,Arial,sans-serif,"Apple Color Emoji","Segoe UI Emoji","Segoe UI Symbol";font-size:16px;line-height:1.5;word-wrap:break-word}.markdown-body::before{display:table;content:""}.markdown-body::after{display:table;clear:both;content:""}.markdown-body>*:first-child{margin-top:0 !important}.markdown-body>*:last-child{margin-bottom:0 !important}.markdown-body a:not([href]){color:inherit;text-decoration:none}.markdown-body .absent{color:#cb2431}.markdown-body .anchor{float:left;padding-right:4px;margin-left:-20px;line-height:1}.markdown-body .anchor:focus{outline:none}.markdown-body p,.markdown-body blockquote,.markdown-body ul,.markdown-body ol,.markdown-body dl,.markdown-body table,.markdown-body pre{margin-top:0;margin-bottom:16px}.markdown-body hr{height:.25em;padding:0;margin:24px 0;background-color:#e1e4e8;border:0}.markdown-body blockquote{padding:0 1em;color:#6a737d;border-left:0.25em solid #dfe2e5}.markdown-body blockquote>:first-child{margin-top:0}.markdown-body blockquote>:last-child{margin-bottom:0}.markdown-body kbd{display:inline-block;padding:3px 5px;font-size:11px;line-height:10px;color:#444d56;vertical-align:middle;background-color:#fafbfc;border:solid 1px #c6cbd1;border-bottom-color:#959da5;border-radius:3px;box-shadow:inset 0 -1px 0 #959da5}.markdown-body h1,.markdown-body h2,.markdown-body h3,.markdown-body h4,.markdown-body h5,.markdown-body h6{margin-top:24px;margin-bottom:16px;font-weight:600;line-height:1.25}.markdown-body h1 .octicon-link,.markdown-body h2 .octicon-link,.markdown-body h3 .octicon-link,.markdown-body h4 .octicon-link,.markdown-body h5 .octicon-link,.markdown-body h6 .octicon-link{color:#1b1f23;vertical-align:middle;visibility:hidden}.markdown-body h1:hover .anchor,.markdown-body h2:hover .anchor,.markdown-body h3:hover .anchor,.markdown-body h4:hover .anchor,.markdown-body h5:hover .anchor,.markdown-body h6:hover .anchor{text-decoration:none}.markdown-body h1:hover .anchor .octicon-link,.markdown-body h2:hover .anchor .octicon-link,.markdown-body h3:hover .anchor .octicon-link,.markdown-body h4:hover .anchor .octicon-link,.markdown-body h5:hover .anchor .octicon-link,.markdown-body h6:hover .anchor .octicon-link{visibility:visible}.markdown-body h1 tt,.markdown-body h1 code,.markdown-body h2 tt,.markdown-body h2 code,.markdown-body h3 tt,.markdown-body h3 code,.markdown-body h4 tt,.markdown-body h4 code,.markdown-body h5 tt,.markdown-body h5 code,.markdown-body h6 tt,.markdown-body h6 code{font-size:inherit}.markdown-body h1{padding-bottom:0.3em;font-size:2em;border-bottom:1px solid #eaecef}.markdown-body h2{padding-bottom:0.3em;font-size:1.5em;border-bottom:1px solid #eaecef}.markdown-body h3{font-size:1.25em}.markdown-body h4{font-size:1em}.markdown-body h5{font-size:0.875em}.markdown-body h6{font-size:0.85em;color:#6a737d}.markdown-body ul,.markdown-body ol{padding-left:2em}.markdown-body ul.no-list,.markdown-body 
ol.no-list{padding:0;list-style-type:none}.markdown-body ul ul,.markdown-body ul ol,.markdown-body ol ol,.markdown-body ol ul{margin-top:0;margin-bottom:0}.markdown-body li{word-wrap:break-all}.markdown-body li>p{margin-top:16px}.markdown-body li+li{margin-top:.25em}.markdown-body dl{padding:0}.markdown-body dl dt{padding:0;margin-top:16px;font-size:1em;font-style:italic;font-weight:600}.markdown-body dl dd{padding:0 16px;margin-bottom:16px}.markdown-body table{display:block;width:100%;overflow:auto}.markdown-body table th{font-weight:600}.markdown-body table th,.markdown-body table td{padding:6px 13px;border:1px solid #dfe2e5}.markdown-body table tr{background-color:#fff;border-top:1px solid #c6cbd1}.markdown-body table tr:nth-child(2n){background-color:#f6f8fa}.markdown-body table img{background-color:transparent}.markdown-body img{max-width:100%;box-sizing:content-box;background-color:#fff}.markdown-body img[align=right]{padding-left:20px}.markdown-body img[align=left]{padding-right:20px}.markdown-body .emoji{max-width:none;vertical-align:text-top;background-color:transparent}.markdown-body span.frame{display:block;overflow:hidden}.markdown-body span.frame>span{display:block;float:left;width:auto;padding:7px;margin:13px 0 0;overflow:hidden;border:1px solid #dfe2e5}.markdown-body span.frame span img{display:block;float:left}.markdown-body span.frame span span{display:block;padding:5px 0 0;clear:both;color:#24292e}.markdown-body span.align-center{display:block;overflow:hidden;clear:both}.markdown-body span.align-center>span{display:block;margin:13px auto 0;overflow:hidden;text-align:center}.markdown-body span.align-center span img{margin:0 auto;text-align:center}.markdown-body span.align-right{display:block;overflow:hidden;clear:both}.markdown-body span.align-right>span{display:block;margin:13px 0 0;overflow:hidden;text-align:right}.markdown-body span.align-right span img{margin:0;text-align:right}.markdown-body span.float-left{display:block;float:left;margin-right:13px;overflow:hidden}.markdown-body span.float-left span{margin:13px 0 0}.markdown-body span.float-right{display:block;float:right;margin-left:13px;overflow:hidden}.markdown-body span.float-right>span{display:block;margin:13px auto 0;overflow:hidden;text-align:right}.markdown-body code,.markdown-body tt{padding:0.2em 0.4em;margin:0;font-size:85%;background-color:rgba(27,31,35,0.05);border-radius:3px}.markdown-body code br,.markdown-body tt br{display:none}.markdown-body del code{text-decoration:inherit}.markdown-body pre{word-wrap:normal}.markdown-body pre>code{padding:0;margin:0;font-size:100%;word-break:normal;white-space:pre;background:transparent;border:0}.markdown-body .highlight{margin-bottom:16px}.markdown-body .highlight pre{margin-bottom:0;word-break:normal}.markdown-body .highlight pre,.markdown-body pre{padding:16px;overflow:auto;font-size:85%;line-height:1.45;background-color:#f6f8fa;border-radius:3px}.markdown-body pre code,.markdown-body pre tt{display:inline;max-width:auto;padding:0;margin:0;overflow:visible;line-height:inherit;word-wrap:normal;background-color:transparent;border:0}.markdown-body .csv-data td,.markdown-body .csv-data th{padding:5px;overflow:hidden;font-size:12px;line-height:1;text-align:left;white-space:nowrap}.markdown-body .csv-data .blob-num{padding:10px 8px 9px;text-align:right;background:#fff;border:0}.markdown-body .csv-data tr{border-top:0}.markdown-body .csv-data th{font-weight:600;background:#f6f8fa;border-top:0}.highlight table td{padding:5px}.highlight table pre{margin:0}.highlight 
.cm{color:#999988;font-style:italic}.highlight .cp{color:#999999;font-weight:bold}.highlight .c1{color:#999988;font-style:italic}.highlight .cs{color:#999999;font-weight:bold;font-style:italic}.highlight .c,.highlight .cd{color:#999988;font-style:italic}.highlight .err{color:#a61717;background-color:#e3d2d2}.highlight .gd{color:#000000;background-color:#ffdddd}.highlight .ge{color:#000000;font-style:italic}.highlight .gr{color:#aa0000}.highlight .gh{color:#999999}.highlight .gi{color:#000000;background-color:#ddffdd}.highlight .go{color:#888888}.highlight .gp{color:#555555}.highlight .gs{font-weight:bold}.highlight .gu{color:#aaaaaa}.highlight .gt{color:#aa0000}.highlight .kc{color:#000000;font-weight:bold}.highlight .kd{color:#000000;font-weight:bold}.highlight .kn{color:#000000;font-weight:bold}.highlight .kp{color:#000000;font-weight:bold}.highlight .kr{color:#000000;font-weight:bold}.highlight .kt{color:#445588;font-weight:bold}.highlight .k,.highlight .kv{color:#000000;font-weight:bold}.highlight .mf{color:#009999}.highlight .mh{color:#009999}.highlight .il{color:#009999}.highlight .mi{color:#009999}.highlight .mo{color:#009999}.highlight .m,.highlight .mb,.highlight .mx{color:#009999}.highlight .sb{color:#d14}.highlight .sc{color:#d14}.highlight .sd{color:#d14}.highlight .s2{color:#d14}.highlight .se{color:#d14}.highlight .sh{color:#d14}.highlight .si{color:#d14}.highlight .sx{color:#d14}.highlight .sr{color:#009926}.highlight .s1{color:#d14}.highlight .ss{color:#990073}.highlight .s{color:#d14}.highlight .na{color:#008080}.highlight .bp{color:#999999}.highlight .nb{color:#0086B3}.highlight .nc{color:#445588;font-weight:bold}.highlight .no{color:#008080}.highlight .nd{color:#3c5d5d;font-weight:bold}.highlight .ni{color:#800080}.highlight .ne{color:#990000;font-weight:bold}.highlight .nf{color:#990000;font-weight:bold}.highlight .nl{color:#990000;font-weight:bold}.highlight .nn{color:#555555}.highlight .nt{color:#000080}.highlight .vc{color:#008080}.highlight .vg{color:#008080}.highlight .vi{color:#008080}.highlight .nv{color:#008080}.highlight .ow{color:#000000;font-weight:bold}.highlight .o{color:#000000;font-weight:bold}.highlight .w{color:#bbbbbb}.highlight{background-color:#f8f8f8} diff --git a/assets/hub/CODE_OF_CONDUCT.ipynb b/assets/hub/CODE_OF_CONDUCT.ipynb new file mode 100644 index 000000000000..363fcab7ed6e --- /dev/null +++ b/assets/hub/CODE_OF_CONDUCT.ipynb @@ -0,0 +1,6 @@ +{ + "cells": [], + "metadata": {}, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/assets/hub/CONTRIBUTING.ipynb b/assets/hub/CONTRIBUTING.ipynb new file mode 100644 index 000000000000..363fcab7ed6e --- /dev/null +++ b/assets/hub/CONTRIBUTING.ipynb @@ -0,0 +1,6 @@ +{ + "cells": [], + "metadata": {}, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/assets/hub/datvuthanh_hybridnets.ipynb b/assets/hub/datvuthanh_hybridnets.ipynb new file mode 100644 index 000000000000..8afb27b0135d --- /dev/null +++ b/assets/hub/datvuthanh_hybridnets.ipynb @@ -0,0 +1,148 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "bab3a91d", + "metadata": {}, + "source": [ + "### This notebook is optionally accelerated with a GPU runtime.\n", + "### If you would like to use this acceleration, please select the menu option \"Runtime\" -> \"Change runtime type\", select \"Hardware Accelerator\" -> \"GPU\" and click \"SAVE\"\n", + "\n", + "----------------------------------------------------------------------\n", + "\n", + "# HybridNets\n", + "\n", + "*Author: Dat Vu Thanh*\n", + "\n", + "**HybridNets - End2End 
Perception Network**\n", + "\n", + "## Before You Start\n", + "\n", + "Start from a **Python>=3.7** environment with **PyTorch>=1.10** installed. To install PyTorch see [https://pytorch.org/get-started/locally/](https://pytorch.org/get-started/locally/). To install HybridNets dependencies:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9be78eea", + "metadata": {}, + "outputs": [], + "source": [ + "%%bash\n", + "pip install -qr https://raw.githubusercontent.com/datvuthanh/HybridNets/main/requirements.txt # install dependencies" + ] + }, + { + "cell_type": "markdown", + "id": "e218345e", + "metadata": {}, + "source": [ + "## Model Description\n", + " \n", + " \n", + "\n", + "HybridNets is an end2end perception network for multi-tasks. Our work focused on traffic object detection, drivable area segmentation and lane detection. HybridNets can run real-time on embedded systems, and obtains SOTA Object Detection, Lane Detection on BDD100K Dataset.\n", + "\n", + "### Results\n", + "\n", + "### Traffic Object Detection\n", + "\n", + "| Model | Recall (%) | mAP@0.5 (%) |\n", + "|:------------------:|:------------:|:---------------:|\n", + "| `MultiNet` | 81.3 | 60.2 |\n", + "| `DLT-Net` | 89.4 | 68.4 |\n", + "| `Faster R-CNN` | 77.2 | 55.6 |\n", + "| `YOLOv5s` | 86.8 | 77.2 |\n", + "| `YOLOP` | 89.2 | 76.5 |\n", + "| **`HybridNets`** | **92.8** | **77.3** |\n", + "\n", + "\n", + " \n", + "### Drivable Area Segmentation\n", + "\n", + "| Model | Drivable mIoU (%) |\n", + "|:----------------:|:-----------------:|\n", + "| `MultiNet` | 71.6 |\n", + "| `DLT-Net` | 71.3 |\n", + "| `PSPNet` | 89.6 |\n", + "| `YOLOP` | 91.5 |\n", + "| **`HybridNets`** | **90.5** |\n", + "\n", + "\n", + " \n", + "### Lane Line Detection\n", + "\n", + "| Model | Accuracy (%) | Lane Line IoU (%) |\n", + "|:----------------:|:------------:|:-----------------:|\n", + "| `Enet` | 34.12 | 14.64 |\n", + "| `SCNN` | 35.79 | 15.84 |\n", + "| `Enet-SAD` | 36.56 | 16.02 |\n", + "| `YOLOP` | 70.5 | 26.2 |\n", + "| **`HybridNets`** | **85.4** | **31.6** |\n", + "\n", + "\n", + " \n", + "\n", + " \n", + " \n", + "### Load From PyTorch Hub\n", + "\n", + "This example loads the pretrained **HybridNets** model and passes an image for inference." 
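A hedged sketch, not part of the upstream notebook: the original cell that follows feeds a random tensor of shape (1, 3, 640, 384) through the network, and the same call can be made with a real photo resized to that shape. `local_image.jpg` is a placeholder filename, and the exact normalization HybridNets expects is not covered here, so consult the HybridNets repository before relying on the predictions.

```python
import torch
import torchvision.transforms as T
from PIL import Image

# Illustrative only: shape a real photo like the (1, 3, 640, 384) random tensor
# used in the example cell. Normalization is omitted and may be required.
img = Image.open("local_image.jpg").convert("RGB")   # placeholder filename
batch = T.Compose([T.Resize((640, 384)), T.ToTensor()])(img).unsqueeze(0)

model = torch.hub.load('datvuthanh/hybridnets', 'hybridnets', pretrained=True)
model.eval()
with torch.no_grad():
    features, regression, classification, anchors, segmentation = model(batch)
```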
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4b94d3f8", + "metadata": {}, + "outputs": [], + "source": [ + "import torch\n", + "\n", + "# load model\n", + "model = torch.hub.load('datvuthanh/hybridnets', 'hybridnets', pretrained=True)\n", + "\n", + "#inference\n", + "img = torch.randn(1,3,640,384)\n", + "features, regression, classification, anchors, segmentation = model(img)" + ] + }, + { + "cell_type": "markdown", + "id": "1fdb0bae", + "metadata": {}, + "source": [ + "### Citation\n", + "\n", + "If you find our [paper](https://arxiv.org/abs/2203.09035) and [code](https://github.com/datvuthanh/HybridNets) useful for your research, please consider giving a star and citation:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "76552e35", + "metadata": { + "attributes": { + "classes": [ + "BibTeX" + ], + "id": "" + } + }, + "outputs": [], + "source": [ + "@misc{vu2022hybridnets,\n", + " title={HybridNets: End-to-End Perception Network}, \n", + " author={Dat Vu and Bao Ngo and Hung Phan},\n", + " year={2022},\n", + " eprint={2203.09035},\n", + " archivePrefix={arXiv},\n", + " primaryClass={cs.CV}\n", + "}" + ] + } + ], + "metadata": {}, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/assets/hub/facebookresearch_WSL-Images_resnext.ipynb b/assets/hub/facebookresearch_WSL-Images_resnext.ipynb index 223a36864282..874d8fe2b6c3 100644 --- a/assets/hub/facebookresearch_WSL-Images_resnext.ipynb +++ b/assets/hub/facebookresearch_WSL-Images_resnext.ipynb @@ -2,6 +2,7 @@ "cells": [ { "cell_type": "markdown", + "id": "8ef1d490", "metadata": {}, "source": [ "### This notebook is optionally accelerated with a GPU runtime.\n", @@ -21,6 +22,7 @@ { "cell_type": "code", "execution_count": null, + "id": "4cd8dad2", "metadata": {}, "outputs": [], "source": [ @@ -37,6 +39,7 @@ }, { "cell_type": "markdown", + "id": "5a74a046", "metadata": {}, "source": [ "All pre-trained models expect input images normalized in the same way,\n", @@ -50,12 +53,13 @@ { "cell_type": "code", "execution_count": null, + "id": "96400b56", "metadata": {}, "outputs": [], "source": [ "# Download an example image from the pytorch website\n", "import urllib\n", - "url, filename = (\"https://github.com/pytorch/hub/raw/master/dog.jpg\", \"dog.jpg\")\n", + "url, filename = (\"https://github.com/pytorch/hub/raw/master/images/dog.jpg\", \"dog.jpg\")\n", "try: urllib.URLopener().retrieve(url, filename)\n", "except: urllib.request.urlretrieve(url, filename)" ] @@ -63,6 +67,7 @@ { "cell_type": "code", "execution_count": null, + "id": "6a5a0f9c", "metadata": {}, "outputs": [], "source": [ @@ -86,7 +91,7 @@ "\n", "with torch.no_grad():\n", " output = model(input_batch)\n", - "# Tensor of shape 1000, with confidence scores over Imagenet's 1000 classes\n", + "# Tensor of shape 1000, with confidence scores over ImageNet's 1000 classes\n", "print(output[0])\n", "# The output has unnormalized scores. 
To get probabilities, you can run a softmax on it.\n", "print(torch.nn.functional.softmax(output[0], dim=0))\n" @@ -94,6 +99,7 @@ }, { "cell_type": "markdown", + "id": "1ab881a6", "metadata": {}, "source": [ "### Model Description\n", @@ -118,5 +124,5 @@ ], "metadata": {}, "nbformat": 4, - "nbformat_minor": 2 + "nbformat_minor": 5 } diff --git a/assets/hub/facebookresearch_pytorch-gan-zoo_dcgan.ipynb b/assets/hub/facebookresearch_pytorch-gan-zoo_dcgan.ipynb index b2502de94025..85ac76ff49f2 100644 --- a/assets/hub/facebookresearch_pytorch-gan-zoo_dcgan.ipynb +++ b/assets/hub/facebookresearch_pytorch-gan-zoo_dcgan.ipynb @@ -2,6 +2,7 @@ "cells": [ { "cell_type": "markdown", + "id": "1b258978", "metadata": {}, "source": [ "### This notebook is optionally accelerated with a GPU runtime.\n", @@ -21,6 +22,7 @@ { "cell_type": "code", "execution_count": null, + "id": "c32ce03d", "metadata": {}, "outputs": [], "source": [ @@ -32,6 +34,7 @@ }, { "cell_type": "markdown", + "id": "262b4ccb", "metadata": {}, "source": [ "The input to the model is a noise vector of shape `(N, 120)` where `N` is the number of images to be generated.\n", @@ -42,6 +45,7 @@ { "cell_type": "code", "execution_count": null, + "id": "c6c00225", "metadata": {}, "outputs": [], "source": [ @@ -59,6 +63,7 @@ }, { "cell_type": "markdown", + "id": "12cb57e2", "metadata": {}, "source": [ "You should see an image similar to the one on the left.\n", @@ -85,5 +90,5 @@ ], "metadata": {}, "nbformat": 4, - "nbformat_minor": 2 + "nbformat_minor": 5 } diff --git a/assets/hub/facebookresearch_pytorch-gan-zoo_pgan.ipynb b/assets/hub/facebookresearch_pytorch-gan-zoo_pgan.ipynb index 46f1aa89ddc1..26c0c359e205 100644 --- a/assets/hub/facebookresearch_pytorch-gan-zoo_pgan.ipynb +++ b/assets/hub/facebookresearch_pytorch-gan-zoo_pgan.ipynb @@ -2,6 +2,7 @@ "cells": [ { "cell_type": "markdown", + "id": "101b74b4", "metadata": {}, "source": [ "### This notebook is optionally accelerated with a GPU runtime.\n", @@ -23,6 +24,7 @@ { "cell_type": "code", "execution_count": null, + "id": "e69f3757", "metadata": {}, "outputs": [], "source": [ @@ -42,6 +44,7 @@ }, { "cell_type": "markdown", + "id": "5d21bcb3", "metadata": {}, "source": [ "The input to the model is a noise vector of shape `(N, 512)` where `N` is the number of images to be generated.\n", @@ -52,6 +55,7 @@ { "cell_type": "code", "execution_count": null, + "id": "cd6247e2", "metadata": {}, "outputs": [], "source": [ @@ -70,6 +74,7 @@ }, { "cell_type": "markdown", + "id": "d38bb5f5", "metadata": {}, "source": [ "You should see an image similar to the one on the left.\n", @@ -88,11 +93,11 @@ "\n", "### References\n", "\n", - "- [Progressive Growing of GANs for Improved Quality, Stability, and Variation](https://arxiv.org/abs/1710.10196)" + "[1] Tero Karras et al, \"Progressive Growing of GANs for Improved Quality, Stability, and Variation\" https://arxiv.org/abs/1710.10196" ] } ], "metadata": {}, "nbformat": 4, - "nbformat_minor": 2 + "nbformat_minor": 5 } diff --git a/assets/hub/facebookresearch_pytorchvideo_resnet.ipynb b/assets/hub/facebookresearch_pytorchvideo_resnet.ipynb new file mode 100644 index 000000000000..4b4641722270 --- /dev/null +++ b/assets/hub/facebookresearch_pytorchvideo_resnet.ipynb @@ -0,0 +1,283 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "e4f99e2c", + "metadata": {}, + "source": [ + "# 3D ResNet\n", + "\n", + "*Author: FAIR PyTorchVideo*\n", + "\n", + "**Resnet Style Video classification networks pretrained on the Kinetics 400 dataset**\n", + "\n", + "\n", + "### 
Example Usage\n", + "\n", + "#### Imports\n", + "\n", + "Load the model:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "96affaa2", + "metadata": {}, + "outputs": [], + "source": [ + "import torch\n", + "# Choose the `slow_r50` model \n", + "model = torch.hub.load('facebookresearch/pytorchvideo', 'slow_r50', pretrained=True)" + ] + }, + { + "cell_type": "markdown", + "id": "1d0359d9", + "metadata": {}, + "source": [ + "Import remaining functions:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ab84506f", + "metadata": {}, + "outputs": [], + "source": [ + "import json\n", + "import urllib\n", + "from pytorchvideo.data.encoded_video import EncodedVideo\n", + "\n", + "from torchvision.transforms import Compose, Lambda\n", + "from torchvision.transforms._transforms_video import (\n", + " CenterCropVideo,\n", + " NormalizeVideo,\n", + ")\n", + "from pytorchvideo.transforms import (\n", + " ApplyTransformToKey,\n", + " ShortSideScale,\n", + " UniformTemporalSubsample\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "06976792", + "metadata": {}, + "source": [ + "#### Setup\n", + "\n", + "Set the model to eval mode and move to desired device." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "680df0e7", + "metadata": { + "attributes": { + "classes": [ + "python " + ], + "id": "" + } + }, + "outputs": [], + "source": [ + "# Set to GPU or CPU\n", + "device = \"cpu\"\n", + "model = model.eval()\n", + "model = model.to(device)" + ] + }, + { + "cell_type": "markdown", + "id": "68096afb", + "metadata": {}, + "source": [ + "Download the id to label mapping for the Kinetics 400 dataset on which the torch hub models were trained. This will be used to get the category label names from the predicted class ids." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9c1eaa3c", + "metadata": {}, + "outputs": [], + "source": [ + "json_url = \"https://dl.fbaipublicfiles.com/pyslowfast/dataset/class_names/kinetics_classnames.json\"\n", + "json_filename = \"kinetics_classnames.json\"\n", + "try: urllib.URLopener().retrieve(json_url, json_filename)\n", + "except: urllib.request.urlretrieve(json_url, json_filename)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "134f9719", + "metadata": {}, + "outputs": [], + "source": [ + "with open(json_filename, \"r\") as f:\n", + " kinetics_classnames = json.load(f)\n", + "\n", + "# Create an id to label name mapping\n", + "kinetics_id_to_classname = {}\n", + "for k, v in kinetics_classnames.items():\n", + " kinetics_id_to_classname[v] = str(k).replace('\"', \"\")" + ] + }, + { + "cell_type": "markdown", + "id": "b53cb1e8", + "metadata": {}, + "source": [ + "#### Define input transform" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7f317a15", + "metadata": {}, + "outputs": [], + "source": [ + "side_size = 256\n", + "mean = [0.45, 0.45, 0.45]\n", + "std = [0.225, 0.225, 0.225]\n", + "crop_size = 256\n", + "num_frames = 8\n", + "sampling_rate = 8\n", + "frames_per_second = 30\n", + "\n", + "# Note that this transform is specific to the slow_R50 model.\n", + "transform = ApplyTransformToKey(\n", + " key=\"video\",\n", + " transform=Compose(\n", + " [\n", + " UniformTemporalSubsample(num_frames),\n", + " Lambda(lambda x: x/255.0),\n", + " NormalizeVideo(mean, std),\n", + " ShortSideScale(\n", + " size=side_size\n", + " ),\n", + " CenterCropVideo(crop_size=(crop_size, crop_size))\n", + " ]\n", + " ),\n", + ")\n", + "\n", + "# The duration of the input clip is also specific to the model.\n", + "clip_duration = (num_frames * sampling_rate)/frames_per_second" + ] + }, + { + "cell_type": "markdown", + "id": "2126afcc", + "metadata": {}, + "source": [ + "#### Run Inference\n", + "\n", + "Download an example video." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d22db1ed", + "metadata": {}, + "outputs": [], + "source": [ + "url_link = \"https://dl.fbaipublicfiles.com/pytorchvideo/projects/archery.mp4\"\n", + "video_path = 'archery.mp4'\n", + "try: urllib.URLopener().retrieve(url_link, video_path)\n", + "except: urllib.request.urlretrieve(url_link, video_path)" + ] + }, + { + "cell_type": "markdown", + "id": "a51f110a", + "metadata": {}, + "source": [ + "Load the video and transform it to the input format required by the model." 
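As a quick sanity check not present in the upstream notebook, the clip requested in the next cell follows directly from the transform parameters defined above for `slow_r50`: it spans roughly two seconds of video.

```python
# clip_duration as computed in the transform cell above (slow_r50 settings)
num_frames, sampling_rate, frames_per_second = 8, 8, 30
clip_duration = (num_frames * sampling_rate) / frames_per_second
print(clip_duration)  # ~2.13 seconds; end_sec below is start_sec + this value
```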
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "29a3ea72", + "metadata": {}, + "outputs": [], + "source": [ + "# Select the duration of the clip to load by specifying the start and end duration\n", + "# The start_sec should correspond to where the action occurs in the video\n", + "start_sec = 0\n", + "end_sec = start_sec + clip_duration\n", + "\n", + "# Initialize an EncodedVideo helper class and load the video\n", + "video = EncodedVideo.from_path(video_path)\n", + "\n", + "# Load the desired clip\n", + "video_data = video.get_clip(start_sec=start_sec, end_sec=end_sec)\n", + "\n", + "# Apply a transform to normalize the video input\n", + "video_data = transform(video_data)\n", + "\n", + "# Move the inputs to the desired device\n", + "inputs = video_data[\"video\"]\n", + "inputs = inputs.to(device)" + ] + }, + { + "cell_type": "markdown", + "id": "5d6b6e2c", + "metadata": {}, + "source": [ + "#### Get Predictions" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f4190298", + "metadata": {}, + "outputs": [], + "source": [ + "# Pass the input clip through the model\n", + "preds = model(inputs[None, ...])\n", + "\n", + "# Get the predicted classes\n", + "post_act = torch.nn.Softmax(dim=1)\n", + "preds = post_act(preds)\n", + "pred_classes = preds.topk(k=5).indices[0]\n", + "\n", + "# Map the predicted classes to the label names\n", + "pred_class_names = [kinetics_id_to_classname[int(i)] for i in pred_classes]\n", + "print(\"Top 5 predicted labels: %s\" % \", \".join(pred_class_names))" + ] + }, + { + "cell_type": "markdown", + "id": "15cd8cf7", + "metadata": {}, + "source": [ + "### Model Description\n", + "The model architecture is based on [1] with pretrained weights using the 8x8 setting\n", + "on the Kinetics dataset. 
\n", + "\n", + "| arch | depth | frame length x sample rate | top 1 | top 5 | Flops (G) | Params (M) |\n", + "| --------------- | ----------- | ----------- | ----------- | ----------- | ----------- | ----------- |\n", + "| Slow | R50 | 8x8 | 74.58 | 91.63 | 54.52 | 32.45 |\n", + "\n", + "\n", + "### References\n", + "[1] Christoph Feichtenhofer et al, \"SlowFast Networks for Video Recognition\"\n", + "https://arxiv.org/pdf/1812.03982.pdf" + ] + } + ], + "metadata": {}, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/assets/hub/facebookresearch_pytorchvideo_slowfast.ipynb b/assets/hub/facebookresearch_pytorchvideo_slowfast.ipynb new file mode 100644 index 000000000000..a95866528fae --- /dev/null +++ b/assets/hub/facebookresearch_pytorchvideo_slowfast.ipynb @@ -0,0 +1,308 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "43b62276", + "metadata": {}, + "source": [ + "# SlowFast\n", + "\n", + "*Author: FAIR PyTorchVideo*\n", + "\n", + "**SlowFast networks pretrained on the Kinetics 400 dataset**\n", + "\n", + "\n", + "### Example Usage\n", + "\n", + "#### Imports\n", + "\n", + "Load the model:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cad7ce41", + "metadata": {}, + "outputs": [], + "source": [ + "import torch\n", + "# Choose the `slowfast_r50` model \n", + "model = torch.hub.load('facebookresearch/pytorchvideo', 'slowfast_r50', pretrained=True)" + ] + }, + { + "cell_type": "markdown", + "id": "0105e28f", + "metadata": {}, + "source": [ + "Import remaining functions:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "21ec21fc", + "metadata": {}, + "outputs": [], + "source": [ + "from typing import Dict\n", + "import json\n", + "import urllib\n", + "from torchvision.transforms import Compose, Lambda\n", + "from torchvision.transforms._transforms_video import (\n", + " CenterCropVideo,\n", + " NormalizeVideo,\n", + ")\n", + "from pytorchvideo.data.encoded_video import EncodedVideo\n", + "from pytorchvideo.transforms import (\n", + " ApplyTransformToKey,\n", + " ShortSideScale,\n", + " UniformTemporalSubsample,\n", + " UniformCropVideo\n", + ") " + ] + }, + { + "cell_type": "markdown", + "id": "88fe7c95", + "metadata": {}, + "source": [ + "#### Setup\n", + "\n", + "Set the model to eval mode and move to desired device." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8e439a0f", + "metadata": { + "attributes": { + "classes": [ + "python " + ], + "id": "" + } + }, + "outputs": [], + "source": [ + "# Set to GPU or CPU\n", + "device = \"cpu\"\n", + "model = model.eval()\n", + "model = model.to(device)" + ] + }, + { + "cell_type": "markdown", + "id": "c9126ed1", + "metadata": {}, + "source": [ + "Download the id to label mapping for the Kinetics 400 dataset on which the torch hub models were trained. This will be used to get the category label names from the predicted class ids." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0a9f96e8", + "metadata": {}, + "outputs": [], + "source": [ + "json_url = \"https://dl.fbaipublicfiles.com/pyslowfast/dataset/class_names/kinetics_classnames.json\"\n", + "json_filename = \"kinetics_classnames.json\"\n", + "try: urllib.URLopener().retrieve(json_url, json_filename)\n", + "except: urllib.request.urlretrieve(json_url, json_filename)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "de3e0e3f", + "metadata": {}, + "outputs": [], + "source": [ + "with open(json_filename, \"r\") as f:\n", + " kinetics_classnames = json.load(f)\n", + "\n", + "# Create an id to label name mapping\n", + "kinetics_id_to_classname = {}\n", + "for k, v in kinetics_classnames.items():\n", + " kinetics_id_to_classname[v] = str(k).replace('\"', \"\")" + ] + }, + { + "cell_type": "markdown", + "id": "6866da20", + "metadata": {}, + "source": [ + "#### Define input transform" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8beb0a98", + "metadata": {}, + "outputs": [], + "source": [ + "side_size = 256\n", + "mean = [0.45, 0.45, 0.45]\n", + "std = [0.225, 0.225, 0.225]\n", + "crop_size = 256\n", + "num_frames = 32\n", + "sampling_rate = 2\n", + "frames_per_second = 30\n", + "slowfast_alpha = 4\n", + "num_clips = 10\n", + "num_crops = 3\n", + "\n", + "class PackPathway(torch.nn.Module):\n", + " \"\"\"\n", + " Transform for converting video frames as a list of tensors. \n", + " \"\"\"\n", + " def __init__(self):\n", + " super().__init__()\n", + " \n", + " def forward(self, frames: torch.Tensor):\n", + " fast_pathway = frames\n", + " # Perform temporal sampling from the fast pathway.\n", + " slow_pathway = torch.index_select(\n", + " frames,\n", + " 1,\n", + " torch.linspace(\n", + " 0, frames.shape[1] - 1, frames.shape[1] // slowfast_alpha\n", + " ).long(),\n", + " )\n", + " frame_list = [slow_pathway, fast_pathway]\n", + " return frame_list\n", + "\n", + "transform = ApplyTransformToKey(\n", + " key=\"video\",\n", + " transform=Compose(\n", + " [\n", + " UniformTemporalSubsample(num_frames),\n", + " Lambda(lambda x: x/255.0),\n", + " NormalizeVideo(mean, std),\n", + " ShortSideScale(\n", + " size=side_size\n", + " ),\n", + " CenterCropVideo(crop_size),\n", + " PackPathway()\n", + " ]\n", + " ),\n", + ")\n", + "\n", + "# The duration of the input clip is also specific to the model.\n", + "clip_duration = (num_frames * sampling_rate)/frames_per_second" + ] + }, + { + "cell_type": "markdown", + "id": "d7db0efb", + "metadata": {}, + "source": [ + "#### Run Inference\n", + "\n", + "Download an example video." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9d215227", + "metadata": {}, + "outputs": [], + "source": [ + "url_link = \"https://dl.fbaipublicfiles.com/pytorchvideo/projects/archery.mp4\"\n", + "video_path = 'archery.mp4'\n", + "try: urllib.URLopener().retrieve(url_link, video_path)\n", + "except: urllib.request.urlretrieve(url_link, video_path)" + ] + }, + { + "cell_type": "markdown", + "id": "fafecfaa", + "metadata": {}, + "source": [ + "Load the video and transform it to the input format required by the model." 
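A hedged note, not part of the upstream notebook: once the next cell has applied the transform, `video_data["video"]` is a two-element list produced by `PackPathway` rather than a single tensor. With the settings above (`num_frames=32`, `slowfast_alpha=4`, `crop_size=256`) the expected shapes are sketched below; the following cell then adds the batch dimension with `[None, ...]`.

```python
# Illustrative shape check, to run after the transform has been applied:
slow_pathway, fast_pathway = video_data["video"]
print(slow_pathway.shape)  # (3, 8, 256, 256)  -> 32 // slowfast_alpha frames
print(fast_pathway.shape)  # (3, 32, 256, 256) -> all subsampled frames
```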
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a2d91dfe", + "metadata": {}, + "outputs": [], + "source": [ + "# Select the duration of the clip to load by specifying the start and end duration\n", + "# The start_sec should correspond to where the action occurs in the video\n", + "start_sec = 0\n", + "end_sec = start_sec + clip_duration\n", + "\n", + "# Initialize an EncodedVideo helper class and load the video\n", + "video = EncodedVideo.from_path(video_path)\n", + "\n", + "# Load the desired clip\n", + "video_data = video.get_clip(start_sec=start_sec, end_sec=end_sec)\n", + "\n", + "# Apply a transform to normalize the video input\n", + "video_data = transform(video_data)\n", + "\n", + "# Move the inputs to the desired device\n", + "inputs = video_data[\"video\"]\n", + "inputs = [i.to(device)[None, ...] for i in inputs]" + ] + }, + { + "cell_type": "markdown", + "id": "f2387d0e", + "metadata": {}, + "source": [ + "#### Get Predictions" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "55825ac2", + "metadata": {}, + "outputs": [], + "source": [ + "# Pass the input clip through the model\n", + "preds = model(inputs)\n", + "\n", + "# Get the predicted classes\n", + "post_act = torch.nn.Softmax(dim=1)\n", + "preds = post_act(preds)\n", + "pred_classes = preds.topk(k=5).indices[0]\n", + "\n", + "# Map the predicted classes to the label names\n", + "pred_class_names = [kinetics_id_to_classname[int(i)] for i in pred_classes]\n", + "print(\"Top 5 predicted labels: %s\" % \", \".join(pred_class_names))" + ] + }, + { + "cell_type": "markdown", + "id": "5f95a42d", + "metadata": {}, + "source": [ + "### Model Description\n", + "SlowFast model architectures are based on [1] with pretrained weights using the 8x8 setting\n", + "on the Kinetics dataset. 
\n", + "\n", + "| arch | depth | frame length x sample rate | top 1 | top 5 | Flops (G) | Params (M) |\n", + "| --------------- | ----------- | ----------- | ----------- | ----------- | ----------- | ----------- | ----------- |\n", + "| SlowFast | R50 | 8x8 | 76.94 | 92.69 | 65.71 | 34.57 |\n", + "| SlowFast | R101 | 8x8 | 77.90 | 93.27 | 127.20 | 62.83 |\n", + "\n", + "\n", + "### References\n", + "[1] Christoph Feichtenhofer et al, \"SlowFast Networks for Video Recognition\"\n", + "https://arxiv.org/pdf/1812.03982.pdf" + ] + } + ], + "metadata": {}, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/assets/hub/facebookresearch_pytorchvideo_x3d.ipynb b/assets/hub/facebookresearch_pytorchvideo_x3d.ipynb new file mode 100644 index 000000000000..6f75fcbc1524 --- /dev/null +++ b/assets/hub/facebookresearch_pytorchvideo_x3d.ipynb @@ -0,0 +1,297 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "89d5af57", + "metadata": {}, + "source": [ + "# X3D\n", + "\n", + "*Author: FAIR PyTorchVideo*\n", + "\n", + "**X3D networks pretrained on the Kinetics 400 dataset**\n", + "\n", + "\n", + "### Example Usage\n", + "\n", + "#### Imports\n", + "\n", + "Load the model:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "daf69981", + "metadata": {}, + "outputs": [], + "source": [ + "import torch\n", + "# Choose the `x3d_s` model\n", + "model_name = 'x3d_s'\n", + "model = torch.hub.load('facebookresearch/pytorchvideo', model_name, pretrained=True)" + ] + }, + { + "cell_type": "markdown", + "id": "0f4f316f", + "metadata": {}, + "source": [ + "Import remaining functions:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "42dbe99f", + "metadata": {}, + "outputs": [], + "source": [ + "import json\n", + "import urllib\n", + "from pytorchvideo.data.encoded_video import EncodedVideo\n", + "\n", + "from torchvision.transforms import Compose, Lambda\n", + "from torchvision.transforms._transforms_video import (\n", + " CenterCropVideo,\n", + " NormalizeVideo,\n", + ")\n", + "from pytorchvideo.transforms import (\n", + " ApplyTransformToKey,\n", + " ShortSideScale,\n", + " UniformTemporalSubsample\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "ab48f59a", + "metadata": {}, + "source": [ + "#### Setup\n", + "\n", + "Set the model to eval mode and move to desired device." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "18d25fa0", + "metadata": {}, + "outputs": [], + "source": [ + "# Set to GPU or CPU\n", + "device = \"cpu\"\n", + "model = model.eval()\n", + "model = model.to(device)" + ] + }, + { + "cell_type": "markdown", + "id": "bfed8b6b", + "metadata": {}, + "source": [ + "Download the id to label mapping for the Kinetics 400 dataset on which the torch hub models were trained. This will be used to get the category label names from the predicted class ids." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "dbf220ef", + "metadata": {}, + "outputs": [], + "source": [ + "json_url = \"https://dl.fbaipublicfiles.com/pyslowfast/dataset/class_names/kinetics_classnames.json\"\n", + "json_filename = \"kinetics_classnames.json\"\n", + "try: urllib.URLopener().retrieve(json_url, json_filename)\n", + "except: urllib.request.urlretrieve(json_url, json_filename)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9ccab195", + "metadata": {}, + "outputs": [], + "source": [ + "with open(json_filename, \"r\") as f:\n", + " kinetics_classnames = json.load(f)\n", + "\n", + "# Create an id to label name mapping\n", + "kinetics_id_to_classname = {}\n", + "for k, v in kinetics_classnames.items():\n", + " kinetics_id_to_classname[v] = str(k).replace('\"', \"\")" + ] + }, + { + "cell_type": "markdown", + "id": "f2ffd57e", + "metadata": {}, + "source": [ + "#### Define input transform" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9b387fdb", + "metadata": {}, + "outputs": [], + "source": [ + "mean = [0.45, 0.45, 0.45]\n", + "std = [0.225, 0.225, 0.225]\n", + "frames_per_second = 30\n", + "model_transform_params = {\n", + " \"x3d_xs\": {\n", + " \"side_size\": 182,\n", + " \"crop_size\": 182,\n", + " \"num_frames\": 4,\n", + " \"sampling_rate\": 12,\n", + " },\n", + " \"x3d_s\": {\n", + " \"side_size\": 182,\n", + " \"crop_size\": 182,\n", + " \"num_frames\": 13,\n", + " \"sampling_rate\": 6,\n", + " },\n", + " \"x3d_m\": {\n", + " \"side_size\": 256,\n", + " \"crop_size\": 256,\n", + " \"num_frames\": 16,\n", + " \"sampling_rate\": 5,\n", + " }\n", + "}\n", + "\n", + "# Get transform parameters based on model\n", + "transform_params = model_transform_params[model_name]\n", + "\n", + "# Note that this transform is specific to the slow_R50 model.\n", + "transform = ApplyTransformToKey(\n", + " key=\"video\",\n", + " transform=Compose(\n", + " [\n", + " UniformTemporalSubsample(transform_params[\"num_frames\"]),\n", + " Lambda(lambda x: x/255.0),\n", + " NormalizeVideo(mean, std),\n", + " ShortSideScale(size=transform_params[\"side_size\"]),\n", + " CenterCropVideo(\n", + " crop_size=(transform_params[\"crop_size\"], transform_params[\"crop_size\"])\n", + " )\n", + " ]\n", + " ),\n", + ")\n", + "\n", + "# The duration of the input clip is also specific to the model.\n", + "clip_duration = (transform_params[\"num_frames\"] * transform_params[\"sampling_rate\"])/frames_per_second" + ] + }, + { + "cell_type": "markdown", + "id": "a5de0111", + "metadata": {}, + "source": [ + "#### Run Inference\n", + "\n", + "Download an example video." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "dd125847", + "metadata": {}, + "outputs": [], + "source": [ + "url_link = \"https://dl.fbaipublicfiles.com/pytorchvideo/projects/archery.mp4\"\n", + "video_path = 'archery.mp4'\n", + "try: urllib.URLopener().retrieve(url_link, video_path)\n", + "except: urllib.request.urlretrieve(url_link, video_path)" + ] + }, + { + "cell_type": "markdown", + "id": "ceb379eb", + "metadata": {}, + "source": [ + "Load the video and transform it to the input format required by the model." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5147a11a", + "metadata": {}, + "outputs": [], + "source": [ + "# Select the duration of the clip to load by specifying the start and end duration\n", + "# The start_sec should correspond to where the action occurs in the video\n", + "start_sec = 0\n", + "end_sec = start_sec + clip_duration\n", + "\n", + "# Initialize an EncodedVideo helper class and load the video\n", + "video = EncodedVideo.from_path(video_path)\n", + "\n", + "# Load the desired clip\n", + "video_data = video.get_clip(start_sec=start_sec, end_sec=end_sec)\n", + "\n", + "# Apply a transform to normalize the video input\n", + "video_data = transform(video_data)\n", + "\n", + "# Move the inputs to the desired device\n", + "inputs = video_data[\"video\"]\n", + "inputs = inputs.to(device)" + ] + }, + { + "cell_type": "markdown", + "id": "fb9be637", + "metadata": {}, + "source": [ + "#### Get Predictions" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6079fb75", + "metadata": {}, + "outputs": [], + "source": [ + "# Pass the input clip through the model\n", + "preds = model(inputs[None, ...])\n", + "\n", + "# Get the predicted classes\n", + "post_act = torch.nn.Softmax(dim=1)\n", + "preds = post_act(preds)\n", + "pred_classes = preds.topk(k=5).indices[0]\n", + "\n", + "# Map the predicted classes to the label names\n", + "pred_class_names = [kinetics_id_to_classname[int(i)] for i in pred_classes]\n", + "print(\"Top 5 predicted labels: %s\" % \", \".join(pred_class_names))" + ] + }, + { + "cell_type": "markdown", + "id": "a6e53a9a", + "metadata": {}, + "source": [ + "### Model Description\n", + "X3D model architectures are based on [1] pretrained on the Kinetics dataset.\n", + "\n", + "| arch | depth | frame length x sample rate | top 1 | top 5 | Flops (G) | Params (M) |\n", + "| --------------- | ----------- | ----------- | ----------- | ----------- | ----------- | ----------- | ----------- |\n", + "| X3D | XS | 4x12 | 69.12 | 88.63 | 0.91 | 3.79 |\n", + "| X3D | S | 13x6 | 73.33 | 91.27 | 2.96 | 3.79 |\n", + "| X3D | M | 16x5 | 75.94 | 92.72 | 6.72 | 3.79 |\n", + "\n", + "\n", + "### References\n", + "[1] Christoph Feichtenhofer, \"X3D: Expanding Architectures for\n", + " Efficient Video Recognition.\" https://arxiv.org/abs/2004.04730" + ] + } + ], + "metadata": {}, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/assets/hub/facebookresearch_semi-supervised-ImageNet1K-models_resnext.ipynb b/assets/hub/facebookresearch_semi-supervised-ImageNet1K-models_resnext.ipynb index 53efcd736110..64a285b7b6ff 100644 --- a/assets/hub/facebookresearch_semi-supervised-ImageNet1K-models_resnext.ipynb +++ b/assets/hub/facebookresearch_semi-supervised-ImageNet1K-models_resnext.ipynb @@ -2,6 +2,7 @@ "cells": [ { "cell_type": "markdown", + "id": "6c28f06b", "metadata": {}, "source": [ "### This notebook is optionally accelerated with a GPU runtime.\n", @@ -21,12 +22,13 @@ { "cell_type": "code", "execution_count": null, + "id": "73f3e3f0", "metadata": {}, "outputs": [], "source": [ "import torch\n", "\n", - "# === SEMI-WEAKLY SUPERVISED MODELSP RETRAINED WITH 940 HASHTAGGED PUBLIC CONTENT === \n", + "# === SEMI-WEAKLY SUPERVISED MODELS PRETRAINED WITH 940 HASHTAGGED PUBLIC CONTENT ===\n", "model = torch.hub.load('facebookresearch/semi-supervised-ImageNet1K-models', 'resnet18_swsl')\n", "# model = torch.hub.load('facebookresearch/semi-supervised-ImageNet1K-models', 'resnet50_swsl')\n", "# model = 
torch.hub.load('facebookresearch/semi-supervised-ImageNet1K-models', 'resnext50_32x4d_swsl')\n", @@ -45,6 +47,7 @@ }, { "cell_type": "markdown", + "id": "a25ad51a", "metadata": {}, "source": [ "All pre-trained models expect input images normalized in the same way,\n", @@ -58,12 +61,13 @@ { "cell_type": "code", "execution_count": null, + "id": "3eec8b87", "metadata": {}, "outputs": [], "source": [ "# Download an example image from the pytorch website\n", "import urllib\n", - "url, filename = (\"https://github.com/pytorch/hub/raw/master/dog.jpg\", \"dog.jpg\")\n", + "url, filename = (\"https://github.com/pytorch/hub/raw/master/images/dog.jpg\", \"dog.jpg\")\n", "try: urllib.URLopener().retrieve(url, filename)\n", "except: urllib.request.urlretrieve(url, filename)" ] @@ -71,6 +75,7 @@ { "cell_type": "code", "execution_count": null, + "id": "08b15593", "metadata": {}, "outputs": [], "source": [ @@ -94,7 +99,7 @@ "\n", "with torch.no_grad():\n", " output = model(input_batch)\n", - "# Tensor of shape 1000, with confidence scores over Imagenet's 1000 classes\n", + "# Tensor of shape 1000, with confidence scores over ImageNet's 1000 classes\n", "print(output[0])\n", "# The output has unnormalized scores. To get probabilities, you can run a softmax on it.\n", "print(torch.nn.functional.softmax(output[0], dim=0))\n" @@ -102,28 +107,29 @@ }, { "cell_type": "markdown", + "id": "77e2a4e3", "metadata": {}, "source": [ "### Model Description\n", - "This project includes the semi-supervised and semi-weakly supervised ImageNet models introduced in \"Billion-scale Semi-Supervised Learning for Image Classification\" . \n", + "This project includes the semi-supervised and semi-weakly supervised ImageNet models introduced in \"Billion-scale Semi-Supervised Learning for Image Classification\" .\n", "\n", - "\"Semi-supervised\" (SSL) ImageNet models are pre-trained on a subset of unlabeled YFCC100M public image dataset and fine-tuned with the ImageNet1K training dataset, as described by the semi-supervised training framework in the paper mentioned above. In this case, the high capacity teacher model was trained only with labeled examples. \n", + "\"Semi-supervised\" (SSL) ImageNet models are pre-trained on a subset of unlabeled YFCC100M public image dataset and fine-tuned with the ImageNet1K training dataset, as described by the semi-supervised training framework in the paper mentioned above. In this case, the high capacity teacher model was trained only with labeled examples.\n", "\n", - "\"Semi-weakly\" supervised (SWSL) ImageNet models are pre-trained on **940 million** public images with 1.5K hashtags matching with 1000 ImageNet1K synsets, followed by fine-tuning on ImageNet1K dataset. In this case, the associated hashtags are only used for building a better teacher model. During training the student model, those hashtags are ingored and the student model is pretrained with a subset of 64M images selected by the teacher model from the same 940 million public image dataset. \n", + "\"Semi-weakly\" supervised (SWSL) ImageNet models are pre-trained on **940 million** public images with 1.5K hashtags matching with 1000 ImageNet1K synsets, followed by fine-tuning on ImageNet1K dataset. In this case, the associated hashtags are only used for building a better teacher model. 
During training the student model, those hashtags are ingored and the student model is pretrained with a subset of 64M images selected by the teacher model from the same 940 million public image dataset.\n", "\n", - "Semi-weakly supervised ResNet and ResNext models provided in the table below significantly improve the top-1 accuracy on the ImageNet validation set compared to training from scratch or other training mechanisms introduced in the literature as of September 2019. For example, **We achieve state-of-the-art accuracy of 81.2% on ImageNet for the widely used/adopted ResNet-50 model architecture**. \n", + "Semi-weakly supervised ResNet and ResNext models provided in the table below significantly improve the top-1 accuracy on the ImageNet validation set compared to training from scratch or other training mechanisms introduced in the literature as of September 2019. For example, **We achieve state-of-the-art accuracy of 81.2% on ImageNet for the widely used/adopted ResNet-50 model architecture**.\n", "\n", "\n", "| Architecture | Supervision | #Parameters | FLOPS | Top-1 Acc. | Top-5 Acc. |\n", "| ------------------ | :--------------:|:----------: | :---: | :--------: | :--------: |\n", "| ResNet-18 | semi-supervised |14M | 2B | 72.8 | 91.5 |\n", - "| ResNet-50 | semi-supervised |25M | 4B | 79.3 | 94.9 | \n", + "| ResNet-50 | semi-supervised |25M | 4B | 79.3 | 94.9 |\n", "| ResNeXt-50 32x4d | semi-supervised |25M | 4B | 80.3 | 95.4 |\n", "| ResNeXt-101 32x4d | semi-supervised |42M | 8B | 81.0 | 95.7 |\n", "| ResNeXt-101 32x8d | semi-supervised |88M | 16B | 81.7 | 96.1 |\n", "| ResNeXt-101 32x16d | semi-supervised |193M | 36B | 81.9 | 96.2 |\n", "| ResNet-18 | semi-weakly supervised |14M | 2B | **73.4** | 91.9 |\n", - "| ResNet-50 | semi-weakly supervised |25M | 4B | **81.2** | 96.0 | \n", + "| ResNet-50 | semi-weakly supervised |25M | 4B | **81.2** | 96.0 |\n", "| ResNeXt-50 32x4d | semi-weakly supervised |25M | 4B | **82.2** | 96.3 |\n", "| ResNeXt-101 32x4d | semi-weakly supervised |42M | 8B | **83.4** | 96.8 |\n", "| ResNeXt-101 32x8d | semi-weakly supervised |88M | 16B | **84.3** | 97.2 |\n", @@ -138,6 +144,7 @@ { "cell_type": "code", "execution_count": null, + "id": "20db95b4", "metadata": {}, "outputs": [], "source": [ @@ -154,5 +161,5 @@ ], "metadata": {}, "nbformat": 4, - "nbformat_minor": 2 + "nbformat_minor": 5 } diff --git a/assets/hub/huggingface_pytorch-transformers.ipynb b/assets/hub/huggingface_pytorch-transformers.ipynb index c602f3239294..fc6856f7f5ba 100644 --- a/assets/hub/huggingface_pytorch-transformers.ipynb +++ b/assets/hub/huggingface_pytorch-transformers.ipynb @@ -2,6 +2,7 @@ "cells": [ { "cell_type": "markdown", + "id": "aebf87f7", "metadata": {}, "source": [ "### This notebook is optionally accelerated with a GPU runtime.\n", @@ -30,7 +31,7 @@ "5. **[XLNet](https://github.com/zihangdai/xlnet/)** (from Google/CMU) released with the paper [​XLNet: Generalized Autoregressive Pretraining for Language Understanding](https://arxiv.org/abs/1906.08237) by Zhilin Yang*, Zihang Dai*, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le.\n", "6. **[XLM](https://github.com/facebookresearch/XLM/)** (from Facebook) released together with the paper [Cross-lingual Language Model Pretraining](https://arxiv.org/abs/1901.07291) by Guillaume Lample and Alexis Conneau.\n", "7. 
**[RoBERTa](https://github.com/pytorch/fairseq/tree/master/examples/roberta)** (from Facebook), released together with the paper a [Robustly Optimized BERT Pretraining Approach](https://arxiv.org/abs/1907.11692) by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov.\n", - "8. **[DistilBERT](https://github.com/huggingface/pytorch-transformers/tree/master/examples/distillation)** (from HuggingFace), released together with the blogpost [Smaller, faster, cheaper, lighter: Introducing DistilBERT, a distilled version of BERT](https://medium.com/huggingface/distilbert-8cf3380435b) by Victor Sanh, Lysandre Debut and Thomas Wolf.\n", + "8. **[DistilBERT](https://github.com/huggingface/transformers/tree/main/examples/research_projects/distillation)** (from HuggingFace), released together with the blogpost [Smaller, faster, cheaper, lighter: Introducing DistilBERT, a distilled version of BERT](https://medium.com/huggingface/distilbert-8cf3380435b5) by Victor Sanh, Lysandre Debut and Thomas Wolf.\n", "\n", "The components available here are based on the `AutoModel` and `AutoTokenizer` classes of the `pytorch-transformers` library.\n", "\n", @@ -42,6 +43,7 @@ { "cell_type": "code", "execution_count": null, + "id": "569404ad", "metadata": {}, "outputs": [], "source": [ @@ -51,6 +53,7 @@ }, { "cell_type": "markdown", + "id": "dfccbc22", "metadata": {}, "source": [ "# Usage\n", @@ -59,7 +62,7 @@ "- `config`: returns a configuration item corresponding to the specified model or pth.\n", "- `tokenizer`: returns a tokenizer corresponding to the specified model or path\n", "- `model`: returns a model corresponding to the specified model or path\n", - "- `modelWithLMHead`: returns a model with a language modeling head corresponding to the specified model or path\n", + "- `modelForCausalLM`: returns a model with a language modeling head corresponding to the specified model or path\n", "- `modelForSequenceClassification`: returns a model with a sequence classifier corresponding to the specified model or path\n", "- `modelForQuestionAnswering`: returns a model with a question answering head corresponding to the specified model or path\n", "\n", @@ -68,7 +71,7 @@ "\n", "\n", "\n", - "The available models are listed on the [pytorch-transformers documentation, pre-trained models section](https://huggingface.co/pytorch-transformers/pretrained_models.html).\n", + "The available models are listed on the [transformers documentation, models page](https://huggingface.co/models).\n", "\n", "# Documentation\n", "\n", @@ -77,12 +80,13 @@ "\n", "## Tokenizer\n", "\n", - "The tokenizer object allows the conversion from character strings to tokens understood by the different models. Each model has its own tokenizer, and some tokenizing methods are different across tokenizers. The complete documentation can be found [here](https://huggingface.co/pytorch-transformers/main_classes/tokenizer.html)." + "The tokenizer object allows the conversion from character strings to tokens understood by the different models. Each model has its own tokenizer, and some tokenizing methods are different across tokenizers. The complete documentation can be found [here](https://huggingface.co/docs/transformers/main_classes/tokenizer)." 
] }, { "cell_type": "code", "execution_count": null, + "id": "a52f187f", "metadata": { "attributes": { "classes": [ @@ -100,16 +104,18 @@ }, { "cell_type": "markdown", + "id": "2765418b", "metadata": {}, "source": [ "## Models\n", "\n", - "The model object is a model instance inheriting from a `nn.Module`. Each model is accompanied by their saving/loading methods, either from a local file or directory, or from a pre-trained configuration (see previously described `config`). Each model works differently, a complete overview of the different models can be found in the [documentation](https://huggingface.co/pytorch-transformers/pretrained_models.html)." + "The model object is a model instance inheriting from a `nn.Module`. Each model is accompanied by their saving/loading methods, either from a local file or directory, or from a pre-trained configuration (see previously described `config`). Each model works differently, a complete overview of the different models can be found in the [documentation](https://huggingface.co/docs/transformers/main_classes/model)." ] }, { "cell_type": "code", "execution_count": null, + "id": "b170367e", "metadata": { "attributes": { "classes": [ @@ -132,6 +138,7 @@ }, { "cell_type": "markdown", + "id": "4d9e3b45", "metadata": {}, "source": [ "## Models with a language modeling head\n", @@ -142,6 +149,7 @@ { "cell_type": "code", "execution_count": null, + "id": "d2e64b72", "metadata": { "attributes": { "classes": [ @@ -153,17 +161,18 @@ "outputs": [], "source": [ "import torch\n", - "model = torch.hub.load('huggingface/pytorch-transformers', 'modelWithLMHead', 'bert-base-uncased') # Download model and configuration from S3 and cache.\n", - "model = torch.hub.load('huggingface/pytorch-transformers', 'modelWithLMHead', './test/bert_model/') # E.g. model was saved using `save_pretrained('./test/saved_model/')`\n", - "model = torch.hub.load('huggingface/pytorch-transformers', 'modelWithLMHead', 'bert-base-uncased', output_attentions=True) # Update configuration during loading\n", + "model = torch.hub.load('huggingface/transformers', 'modelForCausalLM', 'gpt2') # Download model and configuration from huggingface.co and cache.\n", + "model = torch.hub.load('huggingface/transformers', 'modelForCausalLM', './test/saved_model/') # E.g. 
model was saved using `save_pretrained('./test/saved_model/')`\n", + "model = torch.hub.load('huggingface/transformers', 'modelForCausalLM', 'gpt2', output_attentions=True) # Update configuration during loading\n", "assert model.config.output_attentions == True\n", "# Loading from a TF checkpoint file instead of a PyTorch model (slower)\n", - "config = AutoConfig.from_json_file('./tf_model/bert_tf_model_config.json')\n", - "model = torch.hub.load('huggingface/pytorch-transformers', 'modelWithLMHead', './tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config)" + "config = AutoConfig.from_pretrained('./tf_model/gpt_tf_model_config.json')\n", + "model = torch.hub.load('huggingface/transformers', 'modelForCausalLM', './tf_model/gpt_tf_checkpoint.ckpt.index', from_tf=True, config=config)" ] }, { "cell_type": "markdown", + "id": "56838e82", "metadata": {}, "source": [ "## Models with a sequence classification head\n", @@ -174,6 +183,7 @@ { "cell_type": "code", "execution_count": null, + "id": "0fede52f", "metadata": { "attributes": { "classes": [ @@ -196,6 +206,7 @@ }, { "cell_type": "markdown", + "id": "a17e2167", "metadata": {}, "source": [ "## Models with a question answering head\n", @@ -206,6 +217,7 @@ { "cell_type": "code", "execution_count": null, + "id": "2a340191", "metadata": { "attributes": { "classes": [ @@ -228,16 +240,18 @@ }, { "cell_type": "markdown", + "id": "a347055f", "metadata": {}, "source": [ "## Configuration\n", "\n", - "The configuration is optional. The configuration object holds information concerning the model, such as the number of heads/layers, if the model should output attentions or hidden states, or if it should be adapted for TorchScript. Many parameters are available, some specific to each model. The complete documentation can be found [here](https://huggingface.co/pytorch-transformers/main_classes/configuration.html)." + "The configuration is optional. The configuration object holds information concerning the model, such as the number of heads/layers, if the model should output attentions or hidden states, or if it should be adapted for TorchScript. Many parameters are available, some specific to each model. The complete documentation can be found [here](https://huggingface.co/docs/transformers/main_classes/configuration)." 
] }, { "cell_type": "code", "execution_count": null, + "id": "83bdbd7d", "metadata": { "attributes": { "classes": [ @@ -268,6 +282,7 @@ }, { "cell_type": "markdown", + "id": "4afcf83b", "metadata": {}, "source": [ "# Example Usage\n", @@ -280,6 +295,7 @@ { "cell_type": "code", "execution_count": null, + "id": "91ab7b53", "metadata": {}, "outputs": [], "source": [ @@ -295,6 +311,7 @@ }, { "cell_type": "markdown", + "id": "c057c229", "metadata": {}, "source": [ "## Using `BertModel` to encode the input sentence in a sequence of last layer hidden-states" @@ -303,6 +320,7 @@ { "cell_type": "code", "execution_count": null, + "id": "95ac4662", "metadata": {}, "outputs": [], "source": [ @@ -321,14 +339,16 @@ }, { "cell_type": "markdown", + "id": "70f4fefd", "metadata": {}, "source": [ - "## Using `modelWithLMHead` to predict a masked token with BERT" + "## Using `modelForMaskedLM` to predict a masked token with BERT" ] }, { "cell_type": "code", "execution_count": null, + "id": "48bde1ca", "metadata": {}, "outputs": [], "source": [ @@ -337,10 +357,10 @@ "indexed_tokens[masked_index] = tokenizer.mask_token_id\n", "tokens_tensor = torch.tensor([indexed_tokens])\n", "\n", - "masked_lm__model = torch.hub.load('huggingface/pytorch-transformers', 'modelWithLMHead', 'bert-base-cased')\n", + "masked_lm_model = torch.hub.load('huggingface/pytorch-transformers', 'modelForMaskedLM', 'bert-base-cased')\n", "\n", "with torch.no_grad():\n", - " predictions = masked_lm__model(tokens_tensor, token_type_ids=segments_tensors)\n", + " predictions = masked_lm_model(tokens_tensor, token_type_ids=segments_tensors)\n", "\n", "# Get the predicted token\n", "predicted_index = torch.argmax(predictions[0][0], dim=1)[masked_index].item()\n", @@ -350,6 +370,7 @@ }, { "cell_type": "markdown", + "id": "1b4a6bef", "metadata": {}, "source": [ "## Using `modelForQuestionAnswering` to do question answering with BERT" @@ -358,6 +379,7 @@ { "cell_type": "code", "execution_count": null, + "id": "d6f37585", "metadata": {}, "outputs": [], "source": [ @@ -374,10 +396,10 @@ "\n", "# Predict the start and end positions logits\n", "with torch.no_grad():\n", - " start_logits, end_logits = question_answering_model(tokens_tensor, token_type_ids=segments_tensors)\n", + " out = question_answering_model(tokens_tensor, token_type_ids=segments_tensors)\n", "\n", "# get the highest prediction\n", - "answer = question_answering_tokenizer.decode(indexed_tokens[torch.argmax(start_logits):torch.argmax(end_logits)+1])\n", + "answer = question_answering_tokenizer.decode(indexed_tokens[torch.argmax(out.start_logits):torch.argmax(out.end_logits)+1])\n", "assert answer == \"puppeteer\"\n", "\n", "# Or get the total loss which is the sum of the CrossEntropy loss for the start and end token positions (set model to train mode before if used for training)\n", @@ -387,6 +409,7 @@ }, { "cell_type": "markdown", + "id": "6ee33213", "metadata": {}, "source": [ "## Using `modelForSequenceClassification` to do paraphrase classification with BERT" @@ -395,6 +418,7 @@ { "cell_type": "code", "execution_count": null, + "id": "9384a8b0", "metadata": {}, "outputs": [], "source": [ @@ -424,5 +448,5 @@ ], "metadata": {}, "nbformat": 4, - "nbformat_minor": 2 + "nbformat_minor": 5 } diff --git a/assets/hub/hustvl_yolop.ipynb b/assets/hub/hustvl_yolop.ipynb new file mode 100644 index 000000000000..2c3496f534fc --- /dev/null +++ b/assets/hub/hustvl_yolop.ipynb @@ -0,0 +1,165 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "8ac5a855", + "metadata": {}, + "source": [ + 
"### This notebook is optionally accelerated with a GPU runtime.\n", + "### If you would like to use this acceleration, please select the menu option \"Runtime\" -> \"Change runtime type\", select \"Hardware Accelerator\" -> \"GPU\" and click \"SAVE\"\n", + "\n", + "----------------------------------------------------------------------\n", + "\n", + "# YOLOP\n", + "\n", + "*Author: Hust Visual Learning Team*\n", + "\n", + "**YOLOP pretrained on the BDD100K dataset**\n", + "\n", + "## Before You Start\n", + "To install YOLOP dependencies:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "16ed4d6d", + "metadata": {}, + "outputs": [], + "source": [ + "%%bash\n", + "pip install -qr https://github.com/hustvl/YOLOP/blob/main/requirements.txt # install dependencies" + ] + }, + { + "cell_type": "markdown", + "id": "484a5e2b", + "metadata": {}, + "source": [ + "## YOLOP: You Only Look Once for Panoptic driving Perception\n", + "\n", + "### Model Description\n", + "\n", + "\"YOLOP\n", + " \n", + "\n", + "- YOLOP is an efficient multi-task network that can jointly handle three crucial tasks in autonomous driving: object detection, drivable area segmentation and lane detection. And it is also the first to reach real-time on embedded devices while maintaining state-of-the-art level performance on the **BDD100K** dataset.\n", + "\n", + "\n", + "### Results\n", + "\n", + "#### Traffic Object Detection Result\n", + "\n", + "| Model | Recall(%) | mAP50(%) | Speed(fps) |\n", + "| -------------- | --------- | -------- | ---------- |\n", + "| `Multinet` | 81.3 | 60.2 | 8.6 |\n", + "| `DLT-Net` | 89.4 | 68.4 | 9.3 |\n", + "| `Faster R-CNN` | 77.2 | 55.6 | 5.3 |\n", + "| `YOLOv5s` | 86.8 | 77.2 | 82 |\n", + "| `YOLOP(ours)` | 89.2 | 76.5 | 41 |\n", + "\n", + "#### Drivable Area Segmentation Result\n", + "\n", + "| Model | mIOU(%) | Speed(fps) |\n", + "| ------------- | ------- | ---------- |\n", + "| `Multinet` | 71.6 | 8.6 |\n", + "| `DLT-Net` | 71.3 | 9.3 |\n", + "| `PSPNet` | 89.6 | 11.1 |\n", + "| `YOLOP(ours)` | 91.5 | 41 |\n", + "\n", + "#### Lane Detection Result\n", + "\n", + "| Model | mIOU(%) | IOU(%) |\n", + "| ------------- | ------- | ------ |\n", + "| `ENet` | 34.12 | 14.64 |\n", + "| `SCNN` | 35.79 | 15.84 |\n", + "| `ENet-SAD` | 36.56 | 16.02 |\n", + "| `YOLOP(ours)` | 70.50 | 26.20 |\n", + "\n", + "#### Ablation Studies 1: End-to-end v.s. Step-by-step\n", + "\n", + "| Training_method | Recall(%) | AP(%) | mIoU(%) | Accuracy(%) | IoU(%) |\n", + "| --------------- | --------- | ----- | ------- | ----------- | ------ |\n", + "| `ES-W` | 87.0 | 75.3 | 90.4 | 66.8 | 26.2 |\n", + "| `ED-W` | 87.3 | 76.0 | 91.6 | 71.2 | 26.1 |\n", + "| `ES-D-W` | 87.0 | 75.1 | 91.7 | 68.6 | 27.0 |\n", + "| `ED-S-W` | 87.5 | 76.1 | 91.6 | 68.0 | 26.8 |\n", + "| `End-to-end` | 89.2 | 76.5 | 91.5 | 70.5 | 26.2 |\n", + "\n", + "#### Ablation Studies 2: Multi-task v.s. Single task\n", + "\n", + "| Training_method | Recall(%) | AP(%) | mIoU(%) | Accuracy(%) | IoU(%) | Speed(ms/frame) |\n", + "| --------------- | --------- | ----- | ------- | ----------- | ------ | --------------- |\n", + "| `Det(only)` | 88.2 | 76.9 | - | - | - | 15.7 |\n", + "| `Da-Seg(only)` | - | - | 92.0 | - | - | 14.8 |\n", + "| `Ll-Seg(only)` | - | - | - | 79.6 | 27.9 | 14.8 |\n", + "| `Multitask` | 89.2 | 76.5 | 91.5 | 70.5 | 26.2 | 24.4 |\n", + "\n", + "**Notes**:\n", + "\n", + "- In table 4, E, D, S and W refer to Encoder, Detect head, two Segment heads and whole network. 
So the algorithm (first we train only the Encoder and Detect head; then we freeze the Encoder and Detect head and train the two Segmentation heads; finally, the entire network is trained jointly for all three tasks) can be marked as ED-S-W, and likewise for the others.\n", + "\n", + "### Visualization\n", + "\n", + "#### Traffic Object Detection Result\n", + "\n", + "\"Traffic\n", + " \n", + "\n", + "#### Drivable Area Segmentation Result\n", + "\n", + "\"Drivable\n", + " \n", + "\n", + "#### Lane Detection Result\n", + "\n", + "\"Lane\n", + " \n", + "\n", + "**Notes**:\n", + "\n", + "- The visualization of the lane detection result has been post-processed by quadratic fitting.\n", + "\n", + "### Deployment\n", + "\n", + "Our model can run inference in real time on a **Jetson TX2**, with a **Zed Camera** to capture images. We use **TensorRT** to speed up inference. Code for deploying and running the model is provided in the [github code](https://github.com/hustvl/YOLOP/tree/main/toolkits/deploy).\n", + "\n", + "\n", + "### Load From PyTorch Hub\n", + "This example loads the pretrained **YOLOP** model and passes an image for inference." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a50d292a", + "metadata": {}, + "outputs": [], + "source": [ + "import torch\n", + "\n", + "# load model\n", + "model = torch.hub.load('hustvl/yolop', 'yolop', pretrained=True)\n", + "\n", + "# inference\n", + "img = torch.randn(1, 3, 640, 640)\n", + "det_out, da_seg_out, ll_seg_out = model(img)" + ] + }, + { + "cell_type": "markdown", + "id": "f07a9063", + "metadata": {}, + "source": [ + "### Citation\n", + "\n", + "See the [github code](https://github.com/hustvl/YOLOP) and the [arxiv paper](https://arxiv.org/abs/2108.11250) for more details.\n", + "\n", + "If you find our paper and code useful for your research, please consider giving a star and citation:" + ] + } + ], + "metadata": {}, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/assets/hub/intelisl_midas_v2.ipynb b/assets/hub/intelisl_midas_v2.ipynb new file mode 100644 index 000000000000..9c7a4480f581 --- /dev/null +++ b/assets/hub/intelisl_midas_v2.ipynb @@ -0,0 +1,270 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "0595d980", + "metadata": {}, + "source": [ + "### This notebook is optionally accelerated with a GPU runtime.\n", + "### If you would like to use this acceleration, please select the menu option \"Runtime\" -> \"Change runtime type\", select \"Hardware Accelerator\" -> \"GPU\" and click \"SAVE\"\n", + "\n", + "----------------------------------------------------------------------\n", + "\n", + "# MiDaS\n", + "\n", + "*Author: Intel ISL*\n", + "\n", + "**MiDaS models for computing relative depth from a single image.**\n", + "\n", + "\"alt\"\n", + "\n", + "\n", + "### Model Description\n", + "\n", + "[MiDaS](https://arxiv.org/abs/1907.01341) computes relative inverse depth from a single image. The repository provides multiple models that cover different use cases ranging from a small, high-speed model to a very large model that provides the highest accuracy. The models have been trained on 10 distinct datasets using\n", + "multi-objective optimization to ensure high quality on a wide range of inputs.\n", + "\n", + "### Dependencies\n", + "\n", + "MiDaS depends on [timm](https://github.com/rwightman/pytorch-image-models). 
Install with" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "db3fd908", + "metadata": { + "attributes": { + "classes": [ + "shell" + ], + "id": "" + } + }, + "outputs": [], + "source": [ + "pip install timm" + ] + }, + { + "cell_type": "markdown", + "id": "8892d100", + "metadata": {}, + "source": [ + "### Example Usage\n", + "\n", + "Download an image from the PyTorch homepage" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "758e089f", + "metadata": {}, + "outputs": [], + "source": [ + "import cv2\n", + "import torch\n", + "import urllib.request\n", + "\n", + "import matplotlib.pyplot as plt\n", + "\n", + "url, filename = (\"https://github.com/pytorch/hub/raw/master/images/dog.jpg\", \"dog.jpg\")\n", + "urllib.request.urlretrieve(url, filename)" + ] + }, + { + "cell_type": "markdown", + "id": "3d5fb41f", + "metadata": {}, + "source": [ + "Load a model (see [https://github.com/intel-isl/MiDaS/#Accuracy](https://github.com/intel-isl/MiDaS/#Accuracy) for an overview)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "49acb469", + "metadata": {}, + "outputs": [], + "source": [ + "model_type = \"DPT_Large\" # MiDaS v3 - Large (highest accuracy, slowest inference speed)\n", + "#model_type = \"DPT_Hybrid\" # MiDaS v3 - Hybrid (medium accuracy, medium inference speed)\n", + "#model_type = \"MiDaS_small\" # MiDaS v2.1 - Small (lowest accuracy, highest inference speed)\n", + "\n", + "midas = torch.hub.load(\"intel-isl/MiDaS\", model_type)" + ] + }, + { + "cell_type": "markdown", + "id": "d785d8c2", + "metadata": {}, + "source": [ + "Move model to GPU if available" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c2aa0b2d", + "metadata": {}, + "outputs": [], + "source": [ + "device = torch.device(\"cuda\") if torch.cuda.is_available() else torch.device(\"cpu\")\n", + "midas.to(device)\n", + "midas.eval()" + ] + }, + { + "cell_type": "markdown", + "id": "1d0a6f5b", + "metadata": {}, + "source": [ + "Load transforms to resize and normalize the image for large or small model" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "763b447a", + "metadata": {}, + "outputs": [], + "source": [ + "midas_transforms = torch.hub.load(\"intel-isl/MiDaS\", \"transforms\")\n", + "\n", + "if model_type == \"DPT_Large\" or model_type == \"DPT_Hybrid\":\n", + " transform = midas_transforms.dpt_transform\n", + "else:\n", + " transform = midas_transforms.small_transform" + ] + }, + { + "cell_type": "markdown", + "id": "ce837f1c", + "metadata": {}, + "source": [ + "Load image and apply transforms" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "412901d6", + "metadata": {}, + "outputs": [], + "source": [ + "img = cv2.imread(filename)\n", + "img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)\n", + "\n", + "input_batch = transform(img).to(device)" + ] + }, + { + "cell_type": "markdown", + "id": "9d621088", + "metadata": {}, + "source": [ + "Predict and resize to original resolution" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2d0d2db5", + "metadata": {}, + "outputs": [], + "source": [ + "with torch.no_grad():\n", + " prediction = midas(input_batch)\n", + "\n", + " prediction = torch.nn.functional.interpolate(\n", + " prediction.unsqueeze(1),\n", + " size=img.shape[:2],\n", + " mode=\"bicubic\",\n", + " align_corners=False,\n", + " ).squeeze()\n", + "\n", + "output = prediction.cpu().numpy()" + ] + }, + { + "cell_type": "markdown", + "id": "991ee991", + "metadata": {}, + 
"source": [ + "Show result" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c630ed12", + "metadata": {}, + "outputs": [], + "source": [ + "plt.imshow(output)\n", + "# plt.show()" + ] + }, + { + "cell_type": "markdown", + "id": "f4c23d91", + "metadata": {}, + "source": [ + "### References\n", + "[Towards Robust Monocular Depth Estimation: Mixing Datasets for Zero-shot Cross-dataset Transfer](https://arxiv.org/abs/1907.01341)\n", + "\n", + "[Vision Transformers for Dense Prediction](https://arxiv.org/abs/2103.13413)\n", + "\n", + "Please cite our papers if you use our models:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8248831c", + "metadata": { + "attributes": { + "classes": [ + "bibtex" + ], + "id": "" + } + }, + "outputs": [], + "source": [ + "@article{Ranftl2020,\n", + "\tauthor = {Ren\\'{e} Ranftl and Katrin Lasinger and David Hafner and Konrad Schindler and Vladlen Koltun},\n", + "\ttitle = {Towards Robust Monocular Depth Estimation: Mixing Datasets for Zero-shot Cross-dataset Transfer},\n", + "\tjournal = {IEEE Transactions on Pattern Analysis and Machine Intelligence (TPAMI)},\n", + "\tyear = {2020},\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e2a1bd81", + "metadata": { + "attributes": { + "classes": [ + "bibtex" + ], + "id": "" + } + }, + "outputs": [], + "source": [ + "@article{Ranftl2021,\n", + "\tauthor = {Ren\\'{e} Ranftl and Alexey Bochkovskiy and Vladlen Koltun},\n", + "\ttitle = {Vision Transformers for Dense Prediction},\n", + "\tjournal = {ArXiv preprint},\n", + "\tyear = {2021},\n", + "}" + ] + } + ], + "metadata": {}, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/assets/hub/mateuszbuda_brain-segmentation-pytorch_unet.ipynb b/assets/hub/mateuszbuda_brain-segmentation-pytorch_unet.ipynb index ff3094d2cfb7..f91614481594 100644 --- a/assets/hub/mateuszbuda_brain-segmentation-pytorch_unet.ipynb +++ b/assets/hub/mateuszbuda_brain-segmentation-pytorch_unet.ipynb @@ -2,6 +2,7 @@ "cells": [ { "cell_type": "markdown", + "id": "e3af3710", "metadata": {}, "source": [ "### This notebook is optionally accelerated with a GPU runtime.\n", @@ -21,6 +22,7 @@ { "cell_type": "code", "execution_count": null, + "id": "76536d46", "metadata": {}, "outputs": [], "source": [ @@ -31,6 +33,7 @@ }, { "cell_type": "markdown", + "id": "a28792eb", "metadata": {}, "source": [ "Loads a U-Net model pre-trained for abnormality segmentation on a dataset of brain MRI volumes [kaggle.com/mateuszbuda/lgg-mri-segmentation](https://www.kaggle.com/mateuszbuda/lgg-mri-segmentation)\n", @@ -40,7 +43,7 @@ "\n", "This U-Net model comprises four levels of blocks containing two convolutional layers with batch normalization and ReLU activation function, and one max pooling layer in the encoding part and up-convolutional layers instead in the decoding part.\n", "The number of convolutional filters in each block is 32, 64, 128, and 256.\n", - "The buttleneck layer has 512 convolutional filters.\n", + "The bottleneck layer has 512 convolutional filters.\n", "From the encoding layers, skip connections are used to the corresponding layers in the decoding part.\n", "Input image is a 3-channel brain MRI slice from pre-contrast, FLAIR, and post-contrast sequences, respectively.\n", "Output is a one-channel probability map of abnormality regions with the same size as the input image.\n", @@ -54,6 +57,7 @@ { "cell_type": "code", "execution_count": null, + "id": "edae5a92", "metadata": {}, "outputs": [], "source": [ @@ -67,6 +71,7 @@ 
{ "cell_type": "code", "execution_count": null, + "id": "b2900236", "metadata": {}, "outputs": [], "source": [ @@ -95,6 +100,7 @@ }, { "cell_type": "markdown", + "id": "b5cdbd4e", "metadata": {}, "source": [ "### References\n", @@ -107,5 +113,5 @@ ], "metadata": {}, "nbformat": 4, - "nbformat_minor": 2 + "nbformat_minor": 5 } diff --git a/assets/hub/nicolalandro_ntsnet-cub200_ntsnet.ipynb b/assets/hub/nicolalandro_ntsnet-cub200_ntsnet.ipynb new file mode 100644 index 000000000000..53fc1a9826b6 --- /dev/null +++ b/assets/hub/nicolalandro_ntsnet-cub200_ntsnet.ipynb @@ -0,0 +1,118 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "20f62891", + "metadata": {}, + "source": [ + "### This notebook is optionally accelerated with a GPU runtime.\n", + "### If you would like to use this acceleration, please select the menu option \"Runtime\" -> \"Change runtime type\", select \"Hardware Accelerator\" -> \"GPU\" and click \"SAVE\"\n", + "\n", + "----------------------------------------------------------------------\n", + "\n", + "# ntsnet\n", + "\n", + "*Author: Moreno Caraffini and Nicola Landro*\n", + "\n", + "**classify birds using this fine-grained image classifier**\n", + "\n", + "\"alt\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c6d78a29", + "metadata": {}, + "outputs": [], + "source": [ + "import torch\n", + "model = torch.hub.load('nicolalandro/ntsnet-cub200', 'ntsnet', pretrained=True,\n", + " **{'topN': 6, 'device':'cpu', 'num_classes': 200})" + ] + }, + { + "cell_type": "markdown", + "id": "cfa847dd", + "metadata": {}, + "source": [ + "### Example Usage" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bdde7974", + "metadata": {}, + "outputs": [], + "source": [ + "from torchvision import transforms\n", + "import torch\n", + "import urllib\n", + "from PIL import Image\n", + "\n", + "transform_test = transforms.Compose([\n", + " transforms.Resize((600, 600), Image.BILINEAR),\n", + " transforms.CenterCrop((448, 448)),\n", + " # transforms.RandomHorizontalFlip(), # only if train\n", + " transforms.ToTensor(),\n", + " transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)),\n", + "])\n", + "\n", + "\n", + "model = torch.hub.load('nicolalandro/ntsnet-cub200', 'ntsnet', pretrained=True, **{'topN': 6, 'device':'cpu', 'num_classes': 200})\n", + "model.eval()\n", + "\n", + "url = 'https://raw.githubusercontent.com/nicolalandro/ntsnet-cub200/master/images/nts-net.png'\n", + "img = Image.open(urllib.request.urlopen(url))\n", + "scaled_img = transform_test(img)\n", + "torch_images = scaled_img.unsqueeze(0)\n", + "\n", + "with torch.no_grad():\n", + " top_n_coordinates, concat_out, raw_logits, concat_logits, part_logits, top_n_index, top_n_prob = model(torch_images)\n", + "\n", + " _, predict = torch.max(concat_logits, 1)\n", + " pred_id = predict.item()\n", + " print('bird class:', model.bird_classes[pred_id])" + ] + }, + { + "cell_type": "markdown", + "id": "20fe5d0c", + "metadata": {}, + "source": [ + "### Model Description\n", + "This is an nts-net pretrained with CUB200 2011 dataset, which is a fine grained dataset of birds species.\n", + "\n", + "### References\n", + "You can read the full paper at this [link](http://artelab.dista.uninsubria.it/res/research/papers/2019/2019-IVCNZ-Nawaz-Birds.pdf)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "74ed0a07", + "metadata": { + "attributes": { + "classes": [ + "bibtex" + ], + "id": "" + } + }, + "outputs": [], + "source": [ + "@INPROCEEDINGS{Gallo:2019:IVCNZ,\n", + " author={Nawaz, Shah and Calefati, Alessandro and Caraffini, Moreno and Landro, Nicola and Gallo, Ignazio},\n", + " booktitle={2019 International Conference on Image and Vision Computing New Zealand (IVCNZ 2019)},\n", + " title={Are These Birds Similar: Learning Branched Networks for Fine-grained Representations},\n", + " year={2019},\n", + " month={Dec},\n", + "}" + ] + } + ], + "metadata": {}, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/assets/hub/nvidia_deeplearningexamples_efficientnet.ipynb b/assets/hub/nvidia_deeplearningexamples_efficientnet.ipynb new file mode 100644 index 000000000000..04ec17f4104f --- /dev/null +++ b/assets/hub/nvidia_deeplearningexamples_efficientnet.ipynb @@ -0,0 +1,204 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "b913a656", + "metadata": {}, + "source": [ + "### This notebook requires a GPU runtime to run.\n", + "### Please select the menu option \"Runtime\" -> \"Change runtime type\", select \"Hardware Accelerator\" -> \"GPU\" and click \"SAVE\"\n", + "\n", + "----------------------------------------------------------------------\n", + "\n", + "# EfficientNet\n", + "\n", + "*Author: NVIDIA*\n", + "\n", + "**EfficientNets are a family of image classification models, which achieve state-of-the-art accuracy, being an order-of-magnitude smaller and faster. Trained with mixed precision using Tensor Cores.**\n", + "\n", + "\"alt\"\n", + "\n", + "\n", + "\n", + "### Model Description\n", + "\n", + "EfficientNet is an image classification model family. It was first described in [EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks](https://arxiv.org/abs/1905.11946). This notebook allows you to load and test the EfficientNet-B0, EfficientNet-B4, EfficientNet-WideSE-B0 and, EfficientNet-WideSE-B4 models.\n", + "\n", + "EfficientNet-WideSE models use Squeeze-and-Excitation layers wider than original EfficientNet models, the width of SE module is proportional to the width of Depthwise Separable Convolutions instead of block width.\n", + "\n", + "WideSE models are slightly more accurate than original models.\n", + "\n", + "This model is trained with mixed precision using Tensor Cores on Volta and the NVIDIA Ampere GPU architectures. Therefore, researchers can get results over 2x faster than training without Tensor Cores, while experiencing the benefits of mixed precision training. This model is tested against each NGC monthly container release to ensure consistent accuracy and performance over time.\n", + "\n", + "We use [NHWC data layout](https://pytorch.org/tutorials/intermediate/memory_format_tutorial.html) when training using Mixed Precision.\n", + "\n", + "### Example\n", + "\n", + "In the example below we will use the pretrained ***EfficientNet*** model to perform inference on image and present the result.\n", + "\n", + "To run the example you need some extra python packages installed. These are needed for preprocessing images and visualization." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "49342854", + "metadata": {}, + "outputs": [], + "source": [ + "!pip install validators matplotlib" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "17a365de", + "metadata": {}, + "outputs": [], + "source": [ + "import torch\n", + "from PIL import Image\n", + "import torchvision.transforms as transforms\n", + "import numpy as np\n", + "import json\n", + "import requests\n", + "import matplotlib.pyplot as plt\n", + "import warnings\n", + "warnings.filterwarnings('ignore')\n", + "%matplotlib inline\n", + "\n", + "device = torch.device(\"cuda\") if torch.cuda.is_available() else torch.device(\"cpu\")\n", + "print(f'Using {device} for inference')" + ] + }, + { + "cell_type": "markdown", + "id": "cc63c523", + "metadata": {}, + "source": [ + "Load the model pretrained on ImageNet dataset.\n", + "\n", + "You can choose among the following models:\n", + "\n", + "| TorchHub entrypoint | Description |\n", + "| :----- | :----- |\n", + "| `nvidia_efficientnet_b0` | baseline EfficientNet |\n", + "| `nvidia_efficientnet_b4` | scaled EfficientNet|\n", + "| `nvidia_efficientnet_widese_b0` | model with Squeeze-and-Excitation layers wider than baseline EfficientNet model |\n", + "| `nvidia_efficientnet_widese_b4` | model with Squeeze-and-Excitation layers wider than scaled EfficientNet model |\n", + "\n", + "There are also quantized version of the models, but they require nvidia container. See [quantized models](https://github.com/NVIDIA/DeepLearningExamples/tree/master/PyTorch/Classification/ConvNets/efficientnet#quantization)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9434f5c7", + "metadata": {}, + "outputs": [], + "source": [ + "efficientnet = torch.hub.load('NVIDIA/DeepLearningExamples:torchhub', 'nvidia_efficientnet_b0', pretrained=True)\n", + "utils = torch.hub.load('NVIDIA/DeepLearningExamples:torchhub', 'nvidia_convnets_processing_utils')\n", + "\n", + "efficientnet.eval().to(device)\n" + ] + }, + { + "cell_type": "markdown", + "id": "7303edb8", + "metadata": {}, + "source": [ + "Prepare sample input data." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "489f6768", + "metadata": {}, + "outputs": [], + "source": [ + "uris = [\n", + " 'http://images.cocodataset.org/test-stuff2017/000000024309.jpg',\n", + " 'http://images.cocodataset.org/test-stuff2017/000000028117.jpg',\n", + " 'http://images.cocodataset.org/test-stuff2017/000000006149.jpg',\n", + " 'http://images.cocodataset.org/test-stuff2017/000000004954.jpg',\n", + "]\n", + "\n", + "batch = torch.cat(\n", + " [utils.prepare_input_from_uri(uri) for uri in uris]\n", + ").to(device)" + ] + }, + { + "cell_type": "markdown", + "id": "21f12d5e", + "metadata": {}, + "source": [ + "Run inference. Use `pick_n_best(predictions=output, n=topN)` helper function to pick N most probable hypotheses according to the model." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d997b6f5", + "metadata": {}, + "outputs": [], + "source": [ + "with torch.no_grad():\n", + " output = torch.nn.functional.softmax(efficientnet(batch), dim=1)\n", + " \n", + "results = utils.pick_n_best(predictions=output, n=5)" + ] + }, + { + "cell_type": "markdown", + "id": "8b7a0638", + "metadata": {}, + "source": [ + "Display the result." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "401003b6", + "metadata": {}, + "outputs": [], + "source": [ + "for uri, result in zip(uris, results):\n", + " img = Image.open(requests.get(uri, stream=True).raw)\n", + " img.thumbnail((256,256), Image.ANTIALIAS)\n", + " plt.imshow(img)\n", + " plt.show()\n", + " print(result)" + ] + }, + { + "cell_type": "markdown", + "id": "e4780b64", + "metadata": {}, + "source": [ + "### Details\n", + "For detailed information on model input and output, training recipies, inference and performance visit:\n", + "[github](https://github.com/NVIDIA/DeepLearningExamples/tree/master/PyTorch/Classification/ConvNets/efficientnet)\n", + "and/or [NGC](https://ngc.nvidia.com/catalog/resources/nvidia:efficientnet_for_pytorch)\n", + "\n", + "### References\n", + "\n", + " - [EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks](https://arxiv.org/abs/1905.11946)\n", + " - [model on NGC](https://ngc.nvidia.com/catalog/resources/nvidia:efficientnet_for_pytorch)\n", + " - [model on github](https://github.com/NVIDIA/DeepLearningExamples/tree/master/PyTorch/Classification/ConvNets/efficientnet)\n", + " - [pretrained model on NGC (efficientnet-b0)](https://ngc.nvidia.com/catalog/models/nvidia:efficientnet_b0_pyt_amp)\n", + " - [pretrained model on NGC (efficientnet-b4)](https://ngc.nvidia.com/catalog/models/nvidia:efficientnet_b4_pyt_amp)\n", + " - [pretrained model on NGC (efficientnet-widese-b0)](https://ngc.nvidia.com/catalog/models/nvidia:efficientnet_widese_b0_pyt_amp)\n", + " - [pretrained model on NGC (efficientnet-widese-b4)](https://ngc.nvidia.com/catalog/models/nvidia:efficientnet_widese_b4_pyt_amp)\n", + " - [pretrained, quantized model on NGC (efficientnet-widese-b0)](https://ngc.nvidia.com/catalog/models/nvidia:efficientnet_widese_b0_pyt_amp)\n", + " - [pretrained, quantized model on NGC (efficientnet-widese-b4)](https://ngc.nvidia.com/catalog/models/nvidia:efficientnet_widese_b4_pyt_amp)" + ] + } + ], + "metadata": {}, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/assets/hub/nvidia_deeplearningexamples_fastpitch.ipynb b/assets/hub/nvidia_deeplearningexamples_fastpitch.ipynb new file mode 100644 index 000000000000..09ca3a568f57 --- /dev/null +++ b/assets/hub/nvidia_deeplearningexamples_fastpitch.ipynb @@ -0,0 +1,324 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "b844056c", + "metadata": {}, + "source": [ + "### This notebook requires a GPU runtime to run.\n", + "### Please select the menu option \"Runtime\" -> \"Change runtime type\", select \"Hardware Accelerator\" -> \"GPU\" and click \"SAVE\"\n", + "\n", + "----------------------------------------------------------------------\n", + "\n", + "# FastPitch 2\n", + "\n", + "*Author: NVIDIA*\n", + "\n", + "**The FastPitch model for generating mel spectrograms from text**\n", + "\n", + "\"alt\"\n", + "\n", + "\n", + "\n", + "### Model Description\n", + "\n", + "This notebook demonstrates a PyTorch implementation of the FastPitch model described in the [FastPitch](https://arxiv.org/abs/2006.06873) paper.\n", + "The FastPitch model generates mel-spectrograms and predicts a pitch contour from raw input text. In version 1.1, it does not need any pre-trained aligning model to bootstrap from. To get the audio waveform we need a second model that will produce it from the generated mel-spectrogram. 
In this notebook we use HiFi-GAN model for that second step.\n", + "\n", + "The FastPitch model is based on the [FastSpeech](https://arxiv.org/abs/1905.09263) model. The main differences between FastPitch vs FastSpeech are as follows:\n", + "* no dependence on external aligner (Transformer TTS, Tacotron 2); in version 1.1, FastPitch aligns audio to transcriptions by itself as in [One TTS Alignment To Rule Them All](https://arxiv.org/abs/2108.10447),\n", + "* FastPitch explicitly learns to predict the pitch contour,\n", + "* pitch conditioning removes harsh sounding artifacts and provides faster convergence,\n", + "* no need for distilling mel-spectrograms with a teacher model,\n", + "* capabilities to train a multi-speaker model.\n", + "\n", + "\n", + "#### Model architecture\n", + "\n", + "![FastPitch Architecture](https://raw.githubusercontent.com/NVIDIA/DeepLearningExamples/master/PyTorch/SpeechSynthesis/FastPitch/img/fastpitch_model.png)\n", + "\n", + "### Example\n", + "In the example below:\n", + "\n", + "- pretrained FastPitch and HiFiGAN models are loaded from torch.hub\n", + "- given tensor representation of an input text (\"Say this smoothly to prove you are not a robot.\"), FastPitch generates mel spectrogram\n", + "- HiFiGAN generates sound given the mel spectrogram\n", + "- the output sound is saved in an 'audio.wav' file\n", + "\n", + "To run the example you need some extra python packages installed. These are needed for preprocessing of text and audio, as well as for display and input/output handling. Finally, for better performance of FastPitch model, we download the CMU pronounciation dictionary." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "05ac615a", + "metadata": {}, + "outputs": [], + "source": [ + "%%bash\n", + "apt-get update\n", + "apt-get install -y libsndfile1 wget\n", + "pip install numpy scipy librosa unidecode inflect librosa matplotlib==3.6.3\n", + "wget https://raw.githubusercontent.com/NVIDIA/NeMo/263a30be71e859cee330e5925332009da3e5efbc/scripts/tts_dataset_files/heteronyms-052722 -qO heteronyms\n", + "wget https://raw.githubusercontent.com/NVIDIA/NeMo/263a30be71e859cee330e5925332009da3e5efbc/scripts/tts_dataset_files/cmudict-0.7b_nv22.08 -qO cmudict-0.7b" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "848828d1", + "metadata": {}, + "outputs": [], + "source": [ + "import torch\n", + "import matplotlib.pyplot as plt\n", + "from IPython.display import Audio\n", + "import warnings\n", + "warnings.filterwarnings('ignore')\n", + "\n", + "device = torch.device(\"cuda\") if torch.cuda.is_available() else torch.device(\"cpu\")\n", + "print(f'Using {device} for inference')" + ] + }, + { + "cell_type": "markdown", + "id": "a8a93fec", + "metadata": {}, + "source": [ + "Download and setup FastPitch generator model." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0de224eb", + "metadata": {}, + "outputs": [], + "source": [ + "fastpitch, generator_train_setup = torch.hub.load('NVIDIA/DeepLearningExamples:torchhub', 'nvidia_fastpitch')" + ] + }, + { + "cell_type": "markdown", + "id": "4f160e82", + "metadata": {}, + "source": [ + "Download and setup vocoder and denoiser models." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b0655df7", + "metadata": {}, + "outputs": [], + "source": [ + "hifigan, vocoder_train_setup, denoiser = torch.hub.load('NVIDIA/DeepLearningExamples:torchhub', 'nvidia_hifigan')" + ] + }, + { + "cell_type": "markdown", + "id": "9c8575d3", + "metadata": {}, + "source": [ + "Verify that generator and vocoder models agree on input parameters." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d2140a54", + "metadata": {}, + "outputs": [], + "source": [ + "CHECKPOINT_SPECIFIC_ARGS = [\n", + " 'sampling_rate', 'hop_length', 'win_length', 'p_arpabet', 'text_cleaners',\n", + " 'symbol_set', 'max_wav_value', 'prepend_space_to_text',\n", + " 'append_space_to_text']\n", + "\n", + "for k in CHECKPOINT_SPECIFIC_ARGS:\n", + "\n", + " v1 = generator_train_setup.get(k, None)\n", + " v2 = vocoder_train_setup.get(k, None)\n", + "\n", + " assert v1 is None or v2 is None or v1 == v2, \\\n", + " f'{k} mismatch in spectrogram generator and vocoder'" + ] + }, + { + "cell_type": "markdown", + "id": "e24e3c5d", + "metadata": {}, + "source": [ + "Put all models on the available device." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c7383ab5", + "metadata": {}, + "outputs": [], + "source": [ + "fastpitch.to(device)\n", + "hifigan.to(device)\n", + "denoiser.to(device)" + ] + }, + { + "cell_type": "markdown", + "id": "dd803d24", + "metadata": {}, + "source": [ + "Load the text processor." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3f512618", + "metadata": {}, + "outputs": [], + "source": [ + "tp = torch.hub.load('NVIDIA/DeepLearningExamples:torchhub', 'nvidia_textprocessing_utils', cmudict_path=\"cmudict-0.7b\", heteronyms_path=\"heteronyms\")" + ] + }, + { + "cell_type": "markdown", + "id": "c3ee8163", + "metadata": {}, + "source": [ + "Set the text to be synthesized, prepare the input, and set additional generation parameters." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fad7df55", + "metadata": {}, + "outputs": [], + "source": [ + "text = \"Say this smoothly, to prove you are not a robot.\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3bca9235", + "metadata": {}, + "outputs": [], + "source": [ + "batches = tp.prepare_input_sequence([text], batch_size=1)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3993c431", + "metadata": {}, + "outputs": [], + "source": [ + "gen_kw = {'pace': 1.0,\n", + " 'speaker': 0,\n", + " 'pitch_tgt': None,\n", + " 'pitch_transform': None}\n", + "denoising_strength = 0.005" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8d4b3ecd", + "metadata": {}, + "outputs": [], + "source": [ + "for batch in batches:\n", + " with torch.no_grad():\n", + " mel, mel_lens, *_ = fastpitch(batch['text'].to(device), **gen_kw)\n", + " audios = hifigan(mel).float()\n", + " audios = denoiser(audios.squeeze(1), denoising_strength)\n", + " audios = audios.squeeze(1) * vocoder_train_setup['max_wav_value']\n" + ] + }, + { + "cell_type": "markdown", + "id": "c48a0f58", + "metadata": {}, + "source": [ + "Plot the intermediate spectrogram."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "006163af", + "metadata": {}, + "outputs": [], + "source": [ + "plt.figure(figsize=(10,12))\n", + "res_mel = mel[0].detach().cpu().numpy()\n", + "plt.imshow(res_mel, origin='lower')\n", + "plt.xlabel('time')\n", + "plt.ylabel('frequency')\n", + "_=plt.title('Spectrogram')" + ] + }, + { + "cell_type": "markdown", + "id": "6629975b", + "metadata": {}, + "source": [ + "Synthesize audio." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "251ea5b9", + "metadata": {}, + "outputs": [], + "source": [ + "audio_numpy = audios[0].cpu().numpy()\n", + "Audio(audio_numpy, rate=22050)" + ] + }, + { + "cell_type": "markdown", + "id": "98a6104e", + "metadata": {}, + "source": [ + "Write the audio to a wav file." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c246eca4", + "metadata": {}, + "outputs": [], + "source": [ + "from scipy.io.wavfile import write\n", + "write(\"audio.wav\", vocoder_train_setup['sampling_rate'], audio_numpy)" + ] + }, + { + "cell_type": "markdown", + "id": "6a978c6c", + "metadata": {}, + "source": [ + "### Details\n", + "For detailed information on model input and output, training recipes, inference and performance visit: [github](https://github.com/NVIDIA/DeepLearningExamples/tree/master/PyTorch/SpeechSynthesis/HiFiGAN) and/or [NGC](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/dle/resources/fastpitch_pyt)\n", + "\n", + "### References\n", + "\n", + " - [FastPitch paper](https://arxiv.org/abs/2006.06873)\n", + " - [FastPitch on NGC](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/dle/resources/fastpitch_pyt)\n", + " - [HiFi-GAN on NGC](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/dle/resources/hifigan_pyt)\n", + " - [FastPitch and HiFi-GAN on github](https://github.com/NVIDIA/DeepLearningExamples/tree/master/PyTorch/SpeechSynthesis/HiFiGAN)" + ] + } + ], + "metadata": {}, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/assets/hub/nvidia_deeplearningexamples_gpunet.ipynb b/assets/hub/nvidia_deeplearningexamples_gpunet.ipynb new file mode 100644 index 000000000000..248a21648804 --- /dev/null +++ b/assets/hub/nvidia_deeplearningexamples_gpunet.ipynb @@ -0,0 +1,218 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "f003828f", + "metadata": {}, + "source": [ + "### This notebook requires a GPU runtime to run.\n", + "### Please select the menu option \"Runtime\" -> \"Change runtime type\", select \"Hardware Accelerator\" -> \"GPU\" and click \"SAVE\"\n", + "\n", + "----------------------------------------------------------------------\n", + "\n", + "# GPUNet\n", + "\n", + "*Author: NVIDIA*\n", + "\n", + "**GPUNet is a new family of Convolutional Neural Networks designed to max out the performance of NVIDIA GPU and TensorRT.**\n", + "\n", + "\"alt\"\n", + "\n", + "\n", + "\n", + "### Model Description\n", + "GPUNets are a new family of deployment and production ready Convolutional Neural Networks from NVIDIA auto-designed to max out the performance of NVIDIA GPU and TensorRT. \n", + "\n", + "Crafted by NVIDIA AI using novel Neural Architecture Search (NAS) methods, GPUNet demonstrates state-of-the-art inference performance up to 2x faster than EfficientNet-X and FBNet-V3. This notebook allows you to load and test all the GPUNet model implementations listed in our [CVPR-2022 paper](https://arxiv.org/pdf/2205.00841.pdf). 
You can use this notebook to quickly load each one of listed models to perform inference runs.\n", + "\n", + "### Example\n", + "In the example below the pretrained ***GPUNet-0*** model is loaded by default to perform inference on image and present the result. You can switch the default pre-trained model loading from GPUNet-0 to one of these: GPUNet-1, GPUNet-2, GPUNet-P0, GPUNet-P1, GPUNet-D1 or GPUNet-D2.\n", + "### Install pre-requisites\n", + "To run the example you need some extra python packages installed. These are needed for preprocessing images and visualization." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ebb1a369", + "metadata": {}, + "outputs": [], + "source": [ + "!pip install validators matplotlib\n", + "!pip install timm==0.5.4" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5747f0f9", + "metadata": {}, + "outputs": [], + "source": [ + "import torch\n", + "from PIL import Image\n", + "import torchvision.transforms as transforms\n", + "import numpy as np\n", + "import json\n", + "import requests\n", + "import matplotlib.pyplot as plt\n", + "import warnings\n", + "\n", + "warnings.filterwarnings('ignore')\n", + "%matplotlib inline\n", + "\n", + "\n", + "if torch.cuda.is_available():\n", + " device = torch.device(\"cuda\") \n", + " !nvidia-smi\n", + "else:\n", + " device = torch.device(\"cpu\")\n", + "\n", + "print(f'Using {device} for inference')" + ] + }, + { + "cell_type": "markdown", + "id": "2f6e3438", + "metadata": {}, + "source": [ + "### Load Pretrained model\n", + "Loads NVIDIA GPUNet-0 model by default pre-trained on ImageNet dataset. You can switch the default pre-trained model loading from GPUNet-0 to one of the following models listed below. \n", + "\n", + "The model architecture is visible as output of the loaded model. For details architecture and latency info please refer to [architecture section](https://github.com/NVIDIA/DeepLearningExamples/tree/torchhub/PyTorch/Classification/GPUNet#model-architecture) in the original repo and Table#[3](https://arxiv.org/pdf/2205.00841.pdf) in the CVPR-2022 paper, respectively. 
\n", + "Please choose one of the following pre-trained models:\n", + "\n", + "| TorchHub model | Description |\n", + "| :----- | :----- |\n", + "| `GPUNet-0` | GPUNet-0 has the fastest measured latency on GV100 |\n", + "| `GPUNet-1` | GPUNet-1 has improved accuracy with one additional layer on GPUNet-0 |\n", + "| `GPUNet-2` | GPUNet-2 has higher accuracy with two additional layers on GPUNet-0 |\n", + "| `GPUNet-P0` | GPUNet-P0 is a distilled model with higher accuracy than GPUNet-0 but similar latency |\n", + "| `GPUNet-P1` | GPUNet-P1 is a distilled model with even higher accuracy than GPUNet-1 but similar latency |\n", + "| `GPUNet-D1` | GPUNet-D1 has the second highest accuracy amongst all GPUNets |\n", + "| `GPUNet-D2` | GPUNet-D2 has the highest accuracy amongst all GPUNets |" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1a59a679", + "metadata": {}, + "outputs": [], + "source": [ + "model_type = \"GPUNet-0\" # select one from above\n", + "precision = \"fp32\" # select either fp32 or fp16 (for better performance on GPU)\n", + "\n", + "gpunet = torch.hub.load('NVIDIA/DeepLearningExamples:torchhub', 'nvidia_gpunet', pretrained=True, model_type=model_type, model_math=precision)\n", + "utils = torch.hub.load('NVIDIA/DeepLearningExamples:torchhub', 'nvidia_convnets_processing_utils')\n", + "\n", + "gpunet.to(device)\n", + "gpunet.eval()" + ] + }, + { + "cell_type": "markdown", + "id": "be086399", + "metadata": {}, + "source": [ + "### Prepare inference data\n", + "Prepare sample input data for inference." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "34097ec0", + "metadata": {}, + "outputs": [], + "source": [ + "uris = [\n", + " 'http://images.cocodataset.org/test-stuff2017/000000024309.jpg',\n", + " 'http://images.cocodataset.org/test-stuff2017/000000028117.jpg',\n", + " 'http://images.cocodataset.org/test-stuff2017/000000006149.jpg',\n", + " 'http://images.cocodataset.org/test-stuff2017/000000004954.jpg',\n", + "]\n", + "\n", + "batch = torch.cat(\n", + " [utils.prepare_input_from_uri(uri) for uri in uris]\n", + ").to(device)\n", + "\n", + "if precision == \"fp16\":\n", + " batch = batch.half()\n", + " \n", + "print(\"Ready to run inference...\")" + ] + }, + { + "cell_type": "markdown", + "id": "2e1d3345", + "metadata": {}, + "source": [ + "### Run inference\n", + "Use `pick_n_best(predictions=output, n=topN)` helper function to pick N most probable hypotheses according to the model." 
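The comment above suggests fp16 for better GPU throughput. As a rough, hypothetical illustration of how one could compare the two precisions, the sketch below times repeated forward passes with CUDA events; it assumes the `gpunet` model and `batch` tensor prepared in the cells above and a CUDA device, and the timing helper itself is not part of the original notebook.

```python
import torch

def time_inference_ms(model, batch, iters=20):
    # Warm-up pass so one-time CUDA initialization does not skew the timing
    with torch.no_grad():
        model(batch)
    torch.cuda.synchronize()
    start = torch.cuda.Event(enable_timing=True)
    end = torch.cuda.Event(enable_timing=True)
    start.record()
    with torch.no_grad():
        for _ in range(iters):
            model(batch)
    end.record()
    torch.cuda.synchronize()
    return start.elapsed_time(end) / iters  # average milliseconds per batch

# Example usage once gpunet and batch exist (rerun with precision = "fp16" to compare):
# print(f"{time_inference_ms(gpunet, batch):.2f} ms per batch")
```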
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "763f63b6", + "metadata": {}, + "outputs": [], + "source": [ + "with torch.no_grad():\n", + " output = torch.nn.functional.softmax(gpunet(batch), dim=1)\n", + " \n", + "results = utils.pick_n_best(predictions=output, n=5)" + ] + }, + { + "cell_type": "markdown", + "id": "af473b19", + "metadata": {}, + "source": [ + "### Display result" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2f9ecc93", + "metadata": {}, + "outputs": [], + "source": [ + "for uri, result in zip(uris, results):\n", + " img = Image.open(requests.get(uri, stream=True).raw)\n", + " img.thumbnail((256,256), Image.ANTIALIAS)\n", + " plt.imshow(img)\n", + " plt.show()\n", + " print(result)" + ] + }, + { + "cell_type": "markdown", + "id": "bc98fef5", + "metadata": {}, + "source": [ + "### Details\n", + "For detailed information on model input and output, training recipies, inference and performance visit:\n", + "[github](https://github.com/NVIDIA/DeepLearningExamples/tree/master/PyTorch/Classification/GPUNet)\n", + "\n", + "### References\n", + "\n", + " - [GPUNets: Searching Deployable Convolution Neural Networks for GPUs](https://arxiv.org/pdf/2205.00841.pdf)\n", + " - [model on github](https://github.com/NVIDIA/DeepLearningExamples/tree/master/PyTorch/Classification/GPUNet)\n", + " - [pretrained model on NGC (GPUNet-0)](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/dle/models/gpunet_0_pyt_ckpt)\n", + " - [pretrained model on NGC (GPUNet-1)](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/dle/models/gpunet_1_pyt_ckpt)\n", + " - [pretrained model on NGC (GPUNet-2)](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/dle/models/gpunet_2_pyt_ckpt)\n", + " - [pretrained distilled model on NGC (GPUNet-P0)](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/dle/models/gpunet_p0_pyt_ckpt)\n", + " - [pretrained, distilled model on NGC (GPUNet-P1)](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/dle/models/gpunet_p1_pyt_ckpt)\n", + " - [pretrained, distilled model on NGC (GPUNet-D1)](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/dle/models/gpunet_d1_pyt_ckpt)\n", + " - [pretrained, distilled model on NGC (GPUNet-D2)](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/dle/models/gpunet_d2_pyt_ckpt)" + ] + } + ], + "metadata": {}, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/assets/hub/nvidia_deeplearningexamples_hifigan.ipynb b/assets/hub/nvidia_deeplearningexamples_hifigan.ipynb new file mode 100644 index 000000000000..da04aad4410c --- /dev/null +++ b/assets/hub/nvidia_deeplearningexamples_hifigan.ipynb @@ -0,0 +1,318 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "ad9f8ba2", + "metadata": {}, + "source": [ + "### This notebook requires a GPU runtime to run.\n", + "### Please select the menu option \"Runtime\" -> \"Change runtime type\", select \"Hardware Accelerator\" -> \"GPU\" and click \"SAVE\"\n", + "\n", + "----------------------------------------------------------------------\n", + "\n", + "# HiFi GAN\n", + "\n", + "*Author: NVIDIA*\n", + "\n", + "**The HiFi GAN model for generating waveforms from mel spectrograms**\n", + "\n", + "\"alt\"\n", + "\n", + "\n", + "\n", + "### Model Description\n", + "This notebook demonstrates a PyTorch implementation of the HiFi-GAN model described in the paper: [HiFi-GAN: Generative Adversarial Networks for Efficient and High Fidelity Speech Synthesis](https://arxiv.org/abs/2010.05646).\n", + "The HiFi-GAN model implements a spectrogram inversion model that allows to synthesize 
speech waveforms from mel-spectrograms. It follows the generative adversarial network (GAN) paradigm, and is composed of a generator and a discriminator. After training, the generator is used for synthesis, and the discriminator is discarded.\n", + "\n", + "Our implementation is based on the one [published by the authors of the paper](https://github.com/jik876/hifi-gan). We modify the original hyperparameters and provide an alternative training recipe, which enables training on larger batches and faster convergence. HiFi-GAN is trained on a publicly available [LJ Speech dataset](https://keithito.com/LJ-Speech-Dataset/). The samples demonstrate speech synthesized with our publicly available FastPitch and HiFi-GAN checkpoints.\n", + "\n", + "#### Model architecture\n", + "\n", + "![HiFiGAN Architecture](https://raw.githubusercontent.com/NVIDIA/DeepLearningExamples/master/PyTorch/SpeechSynthesis/HiFiGAN/img/hifigan_model.png)\n", + "\n", + "### Example\n", + "In the example below:\n", + "\n", + "- pretrained FastPitch and HiFiGAN models are loaded from torch.hub\n", + "- given tensor representation of an input text (\"Say this smoothly to prove you are not a robot.\"), FastPitch generates mel spectrogram \n", + "- HiFiGAN generates sound given the mel spectrogram\n", + "- the output sound is saved in an 'audio.wav' file\n", + "\n", + "To run the example you need some extra python packages installed. These are needed for preprocessing of text and audio, as well as for display and input/output handling. Finally, for better performance of FastPitch model, we download the CMU pronounciation dictionary." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c2cf6412", + "metadata": {}, + "outputs": [], + "source": [ + "%%bash\n", + "pip install numpy scipy librosa unidecode inflect librosa matplotlib==3.6.3\n", + "apt-get update\n", + "apt-get install -y libsndfile1 wget\n", + "wget https://raw.githubusercontent.com/NVIDIA/NeMo/263a30be71e859cee330e5925332009da3e5efbc/scripts/tts_dataset_files/heteronyms-052722 -qO heteronyms\n", + "wget https://raw.githubusercontent.com/NVIDIA/NeMo/263a30be71e859cee330e5925332009da3e5efbc/scripts/tts_dataset_files/cmudict-0.7b_nv22.08 -qO cmudict-0.7b" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "97c0c357", + "metadata": {}, + "outputs": [], + "source": [ + "import torch\n", + "import matplotlib.pyplot as plt\n", + "from IPython.display import Audio\n", + "import warnings\n", + "warnings.filterwarnings('ignore')\n", + "\n", + "device = torch.device(\"cuda\") if torch.cuda.is_available() else torch.device(\"cpu\")\n", + "print(f'Using {device} for inference')" + ] + }, + { + "cell_type": "markdown", + "id": "c6b05df7", + "metadata": {}, + "source": [ + "Download and setup FastPitch generator model." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ac394a05", + "metadata": {}, + "outputs": [], + "source": [ + "fastpitch, generator_train_setup = torch.hub.load('NVIDIA/DeepLearningExamples:torchhub', 'nvidia_fastpitch')" + ] + }, + { + "cell_type": "markdown", + "id": "930dfcb6", + "metadata": {}, + "source": [ + "Download and setup vocoder and denoiser models." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f2157457", + "metadata": {}, + "outputs": [], + "source": [ + "hifigan, vocoder_train_setup, denoiser = torch.hub.load('NVIDIA/DeepLearningExamples:torchhub', 'nvidia_hifigan')" + ] + }, + { + "cell_type": "markdown", + "id": "334e163f", + "metadata": {}, + "source": [ + "Verify that generator and vocoder models agree on input parameters." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b07030e5", + "metadata": {}, + "outputs": [], + "source": [ + "CHECKPOINT_SPECIFIC_ARGS = [\n", + " 'sampling_rate', 'hop_length', 'win_length', 'p_arpabet', 'text_cleaners',\n", + " 'symbol_set', 'max_wav_value', 'prepend_space_to_text',\n", + " 'append_space_to_text']\n", + "\n", + "for k in CHECKPOINT_SPECIFIC_ARGS:\n", + "\n", + " v1 = generator_train_setup.get(k, None)\n", + " v2 = vocoder_train_setup.get(k, None)\n", + "\n", + " assert v1 is None or v2 is None or v1 == v2, \\\n", + " f'{k} mismatch in spectrogram generator and vocoder'" + ] + }, + { + "cell_type": "markdown", + "id": "37d00c33", + "metadata": {}, + "source": [ + "Put all models on the available device." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "78dea725", + "metadata": {}, + "outputs": [], + "source": [ + "fastpitch.to(device)\n", + "hifigan.to(device)\n", + "denoiser.to(device)" + ] + }, + { + "cell_type": "markdown", + "id": "ca87ee4b", + "metadata": {}, + "source": [ + "Load text processor." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "75ccfe9f", + "metadata": {}, + "outputs": [], + "source": [ + "tp = torch.hub.load('NVIDIA/DeepLearningExamples:torchhub', 'nvidia_textprocessing_utils', cmudict_path=\"cmudict-0.7b\", heteronyms_path=\"heteronyms\")" + ] + }, + { + "cell_type": "markdown", + "id": "711e02f7", + "metadata": {}, + "source": [ + "Set the text to be synthesized, prepare the input and set additional generation parameters." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d465b2b8", + "metadata": {}, + "outputs": [], + "source": [ + "text = \"Say this smoothly, to prove you are not a robot.\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fe72e111", + "metadata": {}, + "outputs": [], + "source": [ + "batches = tp.prepare_input_sequence([text], batch_size=1)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "97ab345f", + "metadata": {}, + "outputs": [], + "source": [ + "gen_kw = {'pace': 1.0,\n", + " 'speaker': 0,\n", + " 'pitch_tgt': None,\n", + " 'pitch_transform': None}\n", + "denoising_strength = 0.005" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ad88a994", + "metadata": {}, + "outputs": [], + "source": [ + "for batch in batches:\n", + " with torch.no_grad():\n", + " mel, mel_lens, *_ = fastpitch(batch['text'].to(device), **gen_kw)\n", + " audios = hifigan(mel).float()\n", + " audios = denoiser(audios.squeeze(1), denoising_strength)\n", + " audios = audios.squeeze(1) * vocoder_train_setup['max_wav_value']\n" + ] + }, + { + "cell_type": "markdown", + "id": "215ac622", + "metadata": {}, + "source": [ + "Plot the intermediate spectrogram." 
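Since the text-to-mel and mel-to-waveform steps above are easy to lose track of, here is a small, hypothetical convenience wrapper around the objects already loaded in this notebook (`tp`, `fastpitch`, `hifigan`, `denoiser`, `vocoder_train_setup`, `device`); it simply mirrors the cells above and is not part of the official recipe.

```python
def synthesize(text, denoising_strength=0.005):
    # Text -> mel spectrogram (FastPitch) -> waveform (HiFi-GAN) -> denoised audio
    batches = tp.prepare_input_sequence([text], batch_size=1)
    with torch.no_grad():
        mel, mel_lens, *_ = fastpitch(batches[0]['text'].to(device),
                                      pace=1.0, speaker=0,
                                      pitch_tgt=None, pitch_transform=None)
        audio = hifigan(mel).float()
        audio = denoiser(audio.squeeze(1), denoising_strength).squeeze(1)
    return (audio[0] * vocoder_train_setup['max_wav_value']).cpu().numpy(), mel

# waveform, mel = synthesize("Another sentence, rendered with the same checkpoints.")
```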
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1391d11a", + "metadata": {}, + "outputs": [], + "source": [ + "plt.figure(figsize=(10,12))\n", + "res_mel = mel[0].detach().cpu().numpy()\n", + "plt.imshow(res_mel, origin='lower')\n", + "plt.xlabel('time')\n", + "plt.ylabel('frequency')\n", + "_=plt.title('Spectrogram')" + ] + }, + { + "cell_type": "markdown", + "id": "2bc202bd", + "metadata": {}, + "source": [ + "Synthesize audio." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "dff55e0a", + "metadata": {}, + "outputs": [], + "source": [ + "audio_numpy = audios[0].cpu().numpy()\n", + "Audio(audio_numpy, rate=22050)" + ] + }, + { + "cell_type": "markdown", + "id": "911663e6", + "metadata": {}, + "source": [ + "Write audio to wav file." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1e6bb116", + "metadata": {}, + "outputs": [], + "source": [ + "from scipy.io.wavfile import write\n", + "write(\"audio.wav\", vocoder_train_setup['sampling_rate'], audio_numpy)" + ] + }, + { + "cell_type": "markdown", + "id": "927c61db", + "metadata": {}, + "source": [ + "### Details\n", + "For detailed information on model input and output, training recipes, inference and performance visit: [github](https://github.com/NVIDIA/DeepLearningExamples/tree/master/PyTorch/SpeechSynthesis/HiFiGAN) and/or [NGC](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/dle/resources/hifigan_pyt)\n", + "\n", + "### References\n", + "\n", + " - [HiFi-GAN: Generative Adversarial Networks for Efficient and High Fidelity Speech Synthesis](https://arxiv.org/abs/2010.05646)\n", + " - [Original implementation](https://github.com/jik876/hifi-gan)\n", + " - [FastPitch on NGC](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/dle/resources/fastpitch_pyt)\n", + " - [HiFi-GAN on NGC](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/dle/resources/hifigan_pyt)\n", + " - [FastPitch and HiFi-GAN on github](https://github.com/NVIDIA/DeepLearningExamples/tree/master/PyTorch/SpeechSynthesis/HiFiGAN)" + ] + } + ], + "metadata": {}, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/assets/hub/nvidia_deeplearningexamples_resnet50.ipynb b/assets/hub/nvidia_deeplearningexamples_resnet50.ipynb new file mode 100644 index 000000000000..c1bac33dbf51 --- /dev/null +++ b/assets/hub/nvidia_deeplearningexamples_resnet50.ipynb @@ -0,0 +1,192 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "66c2720d", + "metadata": {}, + "source": [ + "### This notebook requires a GPU runtime to run.\n", + "### Please select the menu option \"Runtime\" -> \"Change runtime type\", select \"Hardware Accelerator\" -> \"GPU\" and click \"SAVE\"\n", + "\n", + "----------------------------------------------------------------------\n", + "\n", + "# ResNet50\n", + "\n", + "*Author: NVIDIA*\n", + "\n", + "**ResNet50 model trained with mixed precision using Tensor Cores.**\n", + "\n", + "\"alt\"\n", + "\n", + "\n", + "### Model Description\n", + "\n", + "The **_ResNet50 v1.5_** model is a modified version of the [original ResNet50 v1 model](https://arxiv.org/abs/1512.03385).\n", + "\n", + "The difference between v1 and v1.5 is that, in the bottleneck blocks which require\n", + "downsampling, v1 has stride = 2 in the first 1x1 convolution, whereas v1.5 has stride = 2 in the 3x3 convolution.\n", + "\n", + "This difference makes ResNet50 v1.5 slightly more accurate (\~0.5% top1) than v1, but comes with a small performance drawback (\~5% imgs/sec).\n", + "\n", + "The model is initialized as described in 
[Delving deep into rectifiers: Surpassing human-level performance on ImageNet classification](https://arxiv.org/pdf/1502.01852.pdf)\n", + "\n", + "This model is trained with mixed precision using Tensor Cores on Volta, Turing, and the NVIDIA Ampere GPU architectures. Therefore, researchers can get results over 2x faster than training without Tensor Cores, while experiencing the benefits of mixed precision training. This model is tested against each NGC monthly container release to ensure consistent accuracy and performance over time.\n", + "\n", + "Note that the ResNet50 v1.5 model can be deployed for inference on the [NVIDIA Triton Inference Server](https://github.com/triton-inference-server/server) using TorchScript, ONNX Runtime or TensorRT as an execution backend. For details check [NGC](https://ngc.nvidia.com/catalog/resources/nvidia:resnet_for_triton_from_pytorch)\n", + "\n", + "### Example\n", + "\n", + "In the example below we will use the pretrained **_ResNet50 v1.5_** model to perform inference on **_images_** and present the result.\n", + "\n", + "To run the example you need some extra python packages installed. These are needed for preprocessing images and visualization." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "78f246de", + "metadata": {}, + "outputs": [], + "source": [ + "!pip install validators matplotlib" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "519f74f5", + "metadata": {}, + "outputs": [], + "source": [ + "import torch\n", + "from PIL import Image\n", + "import torchvision.transforms as transforms\n", + "import numpy as np\n", + "import json\n", + "import requests\n", + "import matplotlib.pyplot as plt\n", + "import warnings\n", + "warnings.filterwarnings('ignore')\n", + "%matplotlib inline\n", + "\n", + "device = torch.device(\"cuda\") if torch.cuda.is_available() else torch.device(\"cpu\")\n", + "print(f'Using {device} for inference')" + ] + }, + { + "cell_type": "markdown", + "id": "dee3c5cc", + "metadata": {}, + "source": [ + "Load the model pretrained on ImageNet dataset." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6839294f", + "metadata": {}, + "outputs": [], + "source": [ + "resnet50 = torch.hub.load('NVIDIA/DeepLearningExamples:torchhub', 'nvidia_resnet50', pretrained=True)\n", + "utils = torch.hub.load('NVIDIA/DeepLearningExamples:torchhub', 'nvidia_convnets_processing_utils')\n", + "\n", + "resnet50.eval().to(device)" + ] + }, + { + "cell_type": "markdown", + "id": "71c9765d", + "metadata": {}, + "source": [ + "Prepare sample input data." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2f3a02e3", + "metadata": {}, + "outputs": [], + "source": [ + "uris = [\n", + " 'http://images.cocodataset.org/test-stuff2017/000000024309.jpg',\n", + " 'http://images.cocodataset.org/test-stuff2017/000000028117.jpg',\n", + " 'http://images.cocodataset.org/test-stuff2017/000000006149.jpg',\n", + " 'http://images.cocodataset.org/test-stuff2017/000000004954.jpg',\n", + "]\n", + "\n", + "batch = torch.cat(\n", + " [utils.prepare_input_from_uri(uri) for uri in uris]\n", + ").to(device)" + ] + }, + { + "cell_type": "markdown", + "id": "c9e8c0b1", + "metadata": {}, + "source": [ + "Run inference. Use `pick_n_best(predictions=output, n=topN)` helper function to pick N most probable hypotheses according to the model." 
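Before running inference, the v1 vs. v1.5 stride placement described earlier can be made concrete with a short sketch. The helper below is purely illustrative (the function and its names are hypothetical, not taken from the repository): it only shows which convolution carries stride 2 in a downsampling bottleneck block.

```python
import torch.nn as nn

def bottleneck_convs(in_ch, mid_ch, version="v1.5"):
    # ResNet50 v1 puts stride=2 on the first 1x1 convolution of a downsampling
    # bottleneck block; v1.5 moves that stride to the 3x3 convolution instead.
    s1, s3 = (2, 1) if version == "v1" else (1, 2)
    return nn.Sequential(
        nn.Conv2d(in_ch, mid_ch, kernel_size=1, stride=s1, bias=False),
        nn.Conv2d(mid_ch, mid_ch, kernel_size=3, stride=s3, padding=1, bias=False),
        nn.Conv2d(mid_ch, mid_ch * 4, kernel_size=1, stride=1, bias=False),
    )
```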
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8f4c40b9", + "metadata": {}, + "outputs": [], + "source": [ + "with torch.no_grad():\n", + " output = torch.nn.functional.softmax(resnet50(batch), dim=1)\n", + "\n", + "results = utils.pick_n_best(predictions=output, n=5)" + ] + }, + { + "cell_type": "markdown", + "id": "5c6115ba", + "metadata": {}, + "source": [ + "Display the result." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f2d32382", + "metadata": {}, + "outputs": [], + "source": [ + "for uri, result in zip(uris, results):\n", + " img = Image.open(requests.get(uri, stream=True).raw)\n", + " img.thumbnail((256,256), Image.LANCZOS)\n", + " plt.imshow(img)\n", + " plt.show()\n", + " print(result)\n" + ] + }, + { + "cell_type": "markdown", + "id": "20e348ab", + "metadata": {}, + "source": [ + "### Details\n", + "\n", + "For detailed information on model input and output, training recipies, inference and performance visit:\n", + "[github](https://github.com/NVIDIA/DeepLearningExamples/tree/master/PyTorch/Classification/ConvNets/resnet50v1.5)\n", + "and/or [NGC](https://ngc.nvidia.com/catalog/resources/nvidia:resnet_50_v1_5_for_pytorch)\n", + "\n", + "### References\n", + "\n", + "- [Original ResNet50 v1 paper](https://arxiv.org/abs/1512.03385)\n", + "- [Delving deep into rectifiers: Surpassing human-level performance on ImageNet classification](https://arxiv.org/pdf/1502.01852.pdf)\n", + "- [model on github](https://github.com/NVIDIA/DeepLearningExamples/tree/master/PyTorch/Classification/ConvNets/resnet50v1.5)\n", + "- [model on NGC](https://ngc.nvidia.com/catalog/resources/nvidia:resnet_50_v1_5_for_pytorch)\n", + "- [pretrained model on NGC](https://ngc.nvidia.com/catalog/models/nvidia:resnet50_pyt_amp)" + ] + } + ], + "metadata": {}, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/assets/hub/nvidia_deeplearningexamples_resnext.ipynb b/assets/hub/nvidia_deeplearningexamples_resnext.ipynb new file mode 100644 index 000000000000..cc968b11d524 --- /dev/null +++ b/assets/hub/nvidia_deeplearningexamples_resnext.ipynb @@ -0,0 +1,201 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "f6bde560", + "metadata": {}, + "source": [ + "### This notebook requires a GPU runtime to run.\n", + "### Please select the menu option \"Runtime\" -> \"Change runtime type\", select \"Hardware Accelerator\" -> \"GPU\" and click \"SAVE\"\n", + "\n", + "----------------------------------------------------------------------\n", + "\n", + "# ResNeXt101\n", + "\n", + "*Author: NVIDIA*\n", + "\n", + "**ResNet with bottleneck 3x3 Convolutions substituted by 3x3 Grouped Convolutions, trained with mixed precision using Tensor Cores.**\n", + "\n", + "_ | _\n", + "- | -\n", + "![alt](https://pytorch.org/assets/images/ResNeXtArch.png) | ![alt](https://pytorch.org/assets/images/classification.jpg)\n", + "\n", + "\n", + "\n", + "### Model Description\n", + "\n", + "The ***ResNeXt101-32x4d*** is a model introduced in the [Aggregated Residual Transformations for Deep Neural Networks](https://arxiv.org/pdf/1611.05431.pdf) paper.\n", + "\n", + "It is based on regular ResNet model, substituting 3x3 convolutions inside the bottleneck block for 3x3 grouped convolutions.\n", + "\n", + "This model is trained with mixed precision using Tensor Cores on Volta, Turing, and the NVIDIA Ampere GPU architectures. Therefore, researchers can get results 3x faster than training without Tensor Cores, while experiencing the benefits of mixed precision training. 
This model is tested against each NGC monthly container release to ensure consistent accuracy and performance over time.\n", + "\n", + "We use [NHWC data layout](https://pytorch.org/tutorials/intermediate/memory_format_tutorial.html) when training using Mixed Precision.\n", + "\n", + "Note that the ResNeXt101-32x4d model can be deployed for inference on the [NVIDIA Triton Inference Server](https://github.com/triton-inference-server/server) using TorchScript, ONNX Runtime or TensorRT as an execution backend. For details check [NGC](https://ngc.nvidia.com/catalog/resources/nvidia:resnext_for_triton_from_pytorch)\n", + "\n", + "#### Model architecture\n", + "\n", + "![ResNextArch](https://pytorch.org/assets/images/ResNeXtArch.png)\n", + "\n", + "_Image source: [Aggregated Residual Transformations for Deep Neural Networks](https://arxiv.org/pdf/1611.05431.pdf)_\n", + "\n", + "Image shows difference between ResNet bottleneck block and ResNeXt bottleneck block.\n", + "\n", + "ResNeXt101-32x4d model's cardinality equals to 32 and bottleneck width equals to 4.\n", + "### Example\n", + "\n", + "In the example below we will use the pretrained ***ResNeXt101-32x4d*** model to perform inference on images and present the result.\n", + "\n", + "To run the example you need some extra python packages installed. These are needed for preprocessing images and visualization." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6284d8fa", + "metadata": {}, + "outputs": [], + "source": [ + "!pip install validators matplotlib" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8dcf5bde", + "metadata": {}, + "outputs": [], + "source": [ + "import torch\n", + "from PIL import Image\n", + "import torchvision.transforms as transforms\n", + "import numpy as np\n", + "import json\n", + "import requests\n", + "import matplotlib.pyplot as plt\n", + "import warnings\n", + "warnings.filterwarnings('ignore')\n", + "%matplotlib inline\n", + "\n", + "device = torch.device(\"cuda\") if torch.cuda.is_available() else torch.device(\"cpu\")\n", + "print(f'Using {device} for inference')" + ] + }, + { + "cell_type": "markdown", + "id": "d36102e3", + "metadata": {}, + "source": [ + "Load the model pretrained on ImageNet dataset." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9a2cad8f", + "metadata": {}, + "outputs": [], + "source": [ + "resneXt = torch.hub.load('NVIDIA/DeepLearningExamples:torchhub', 'nvidia_resneXt')\n", + "utils = torch.hub.load('NVIDIA/DeepLearningExamples:torchhub', 'nvidia_convnets_processing_utils')\n", + "\n", + "resneXt.eval().to(device)" + ] + }, + { + "cell_type": "markdown", + "id": "46a9ccd2", + "metadata": {}, + "source": [ + "Prepare sample input data." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d9a32ac0", + "metadata": {}, + "outputs": [], + "source": [ + "uris = [\n", + " 'http://images.cocodataset.org/test-stuff2017/000000024309.jpg',\n", + " 'http://images.cocodataset.org/test-stuff2017/000000028117.jpg',\n", + " 'http://images.cocodataset.org/test-stuff2017/000000006149.jpg',\n", + " 'http://images.cocodataset.org/test-stuff2017/000000004954.jpg',\n", + "]\n", + "\n", + "\n", + "batch = torch.cat(\n", + " [utils.prepare_input_from_uri(uri) for uri in uris]\n", + ").to(device)" + ] + }, + { + "cell_type": "markdown", + "id": "5eeef251", + "metadata": {}, + "source": [ + "Run inference. 
Use `pick_n_best(predictions=output, n=topN)` helper function to pick N most probable hypotheses according to the model." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0c2dacfe", + "metadata": {}, + "outputs": [], + "source": [ + "with torch.no_grad():\n", + " output = torch.nn.functional.softmax(resneXt(batch), dim=1)\n", + " \n", + "results = utils.pick_n_best(predictions=output, n=5)" + ] + }, + { + "cell_type": "markdown", + "id": "60cb6124", + "metadata": {}, + "source": [ + "Display the result." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c10f528a", + "metadata": {}, + "outputs": [], + "source": [ + "for uri, result in zip(uris, results):\n", + " img = Image.open(requests.get(uri, stream=True).raw)\n", + " img.thumbnail((256,256), Image.ANTIALIAS)\n", + " plt.imshow(img)\n", + " plt.show()\n", + " print(result)\n" + ] + }, + { + "cell_type": "markdown", + "id": "653c5f00", + "metadata": {}, + "source": [ + "### Details\n", + "For detailed information on model input and output, training recipes, inference and performance visit:\n", + "[github](https://github.com/NVIDIA/DeepLearningExamples/tree/master/PyTorch/Classification/ConvNets/resnext101-32x4d)\n", + "and/or [NGC](https://ngc.nvidia.com/catalog/resources/nvidia:resnext_for_pytorch)\n", + "\n", + "\n", + "### References\n", + "\n", + " - [Aggregated Residual Transformations for Deep Neural Networks](https://arxiv.org/pdf/1611.05431.pdf)\n", + " - [model on github](https://github.com/NVIDIA/DeepLearningExamples/tree/master/PyTorch/Classification/ConvNets/resnext101-32x4d)\n", + " - [model on NGC](https://ngc.nvidia.com/catalog/resources/nvidia:resnext_for_pytorch)\n", + " - [pretrained model on NGC](https://ngc.nvidia.com/catalog/models/nvidia:resnext101_32x4d_pyt_amp)" + ] + } + ], + "metadata": {}, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/assets/hub/nvidia_deeplearningexamples_se-resnext.ipynb b/assets/hub/nvidia_deeplearningexamples_se-resnext.ipynb new file mode 100644 index 000000000000..10268dd88a4a --- /dev/null +++ b/assets/hub/nvidia_deeplearningexamples_se-resnext.ipynb @@ -0,0 +1,201 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "0d837faa", + "metadata": {}, + "source": [ + "### This notebook requires a GPU runtime to run.\n", + "### Please select the menu option \"Runtime\" -> \"Change runtime type\", select \"Hardware Accelerator\" -> \"GPU\" and click \"SAVE\"\n", + "\n", + "----------------------------------------------------------------------\n", + "\n", + "# SE-ResNeXt101\n", + "\n", + "*Author: NVIDIA*\n", + "\n", + "**ResNeXt with Squeeze-and-Excitation module added, trained with mixed precision using Tensor Cores.**\n", + "\n", + "_ | _\n", + "- | -\n", + "![alt](https://pytorch.org/assets/images/SEArch.png) | ![alt](https://pytorch.org/assets/images/classification.jpg)\n", + "\n", + "\n", + "\n", + "### Model Description\n", + "\n", + "The ***SE-ResNeXt101-32x4d*** is a [ResNeXt101-32x4d](https://arxiv.org/pdf/1611.05431.pdf)\n", + "model with an added Squeeze-and-Excitation module introduced\n", + "in the [Squeeze-and-Excitation Networks](https://arxiv.org/pdf/1709.01507.pdf) paper.\n", + "\n", + "This model is trained with mixed precision using Tensor Cores on Volta, Turing, and the NVIDIA Ampere GPU architectures. Therefore, researchers can get results 3x faster than training without Tensor Cores, while experiencing the benefits of mixed precision training. 
This model is tested against each NGC monthly container release to ensure consistent accuracy and performance over time.\n", + "\n", + "We use [NHWC data layout](https://pytorch.org/tutorials/intermediate/memory_format_tutorial.html) when training using Mixed Precision.\n", + "\n", + "#### Model architecture\n", + "\n", + "![SEArch](https://pytorch.org/assets/images/SEArch.png)\n", + "\n", + "_Image source: [Squeeze-and-Excitation Networks](https://arxiv.org/pdf/1709.01507.pdf)_\n", + "\n", + "Image shows the architecture of SE block and where is it placed in ResNet bottleneck block.\n", + "\n", + "\n", + "Note that the SE-ResNeXt101-32x4d model can be deployed for inference on the [NVIDIA Triton Inference Server](https://github.com/triton-inference-server/server) using TorchScript, ONNX Runtime or TensorRT as an execution backend. For details check [NGC](https://catalog.ngc.nvidia.com/orgs/nvidia/resources/se_resnext_for_triton_from_pytorch).\n", + "\n", + "### Example\n", + "\n", + "In the example below we will use the pretrained ***SE-ResNeXt101-32x4d*** model to perform inference on images and present the result.\n", + "\n", + "To run the example you need some extra python packages installed. These are needed for preprocessing images and visualization." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b8f575d1", + "metadata": {}, + "outputs": [], + "source": [ + "!pip install validators matplotlib" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d1f397f1", + "metadata": {}, + "outputs": [], + "source": [ + "import torch\n", + "from PIL import Image\n", + "import torchvision.transforms as transforms\n", + "import numpy as np\n", + "import json\n", + "import requests\n", + "import matplotlib.pyplot as plt\n", + "import warnings\n", + "warnings.filterwarnings('ignore')\n", + "%matplotlib inline\n", + "\n", + "device = torch.device(\"cuda\") if torch.cuda.is_available() else torch.device(\"cpu\")\n", + "print(f'Using {device} for inference')" + ] + }, + { + "cell_type": "markdown", + "id": "7501370a", + "metadata": {}, + "source": [ + "Load the model pretrained on ImageNet dataset." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "642724b2", + "metadata": {}, + "outputs": [], + "source": [ + "resneXt = torch.hub.load('NVIDIA/DeepLearningExamples:torchhub', 'nvidia_se_resnext101_32x4d')\n", + "utils = torch.hub.load('NVIDIA/DeepLearningExamples:torchhub', 'nvidia_convnets_processing_utils')\n", + "\n", + "resneXt.eval().to(device)" + ] + }, + { + "cell_type": "markdown", + "id": "d60343f8", + "metadata": {}, + "source": [ + "Prepare sample input data." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "302c281e", + "metadata": {}, + "outputs": [], + "source": [ + "uris = [\n", + " 'http://images.cocodataset.org/test-stuff2017/000000024309.jpg',\n", + " 'http://images.cocodataset.org/test-stuff2017/000000028117.jpg',\n", + " 'http://images.cocodataset.org/test-stuff2017/000000006149.jpg',\n", + " 'http://images.cocodataset.org/test-stuff2017/000000004954.jpg',\n", + "]\n", + "\n", + "\n", + "batch = torch.cat(\n", + " [utils.prepare_input_from_uri(uri) for uri in uris]\n", + ").to(device)" + ] + }, + { + "cell_type": "markdown", + "id": "6dd2775b", + "metadata": {}, + "source": [ + "Run inference. Use `pick_n_best(predictions=output, n=topN)` helper function to pick N most probable hypotheses according to the model." 
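To make the description of the SE block above more concrete, here is a minimal, hypothetical sketch of the squeeze-and-excitation operation (global average pooling, a small bottleneck MLP, channel-wise rescaling); it is illustrative only and not the implementation used in the repository.

```python
import torch
import torch.nn as nn

class SEBlock(nn.Module):
    """Illustrative squeeze-and-excitation block: pool, re-weight, rescale channels."""
    def __init__(self, channels, reduction=16):
        super().__init__()
        self.fc1 = nn.Linear(channels, channels // reduction)
        self.fc2 = nn.Linear(channels // reduction, channels)

    def forward(self, x):
        w = x.mean(dim=(2, 3))                                # squeeze: N x C
        w = torch.sigmoid(self.fc2(torch.relu(self.fc1(w))))  # excitation
        return x * w[:, :, None, None]                        # channel-wise rescale
```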
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "dc63ff1b", + "metadata": {}, + "outputs": [], + "source": [ + "with torch.no_grad():\n", + " output = torch.nn.functional.softmax(resneXt(batch), dim=1)\n", + " \n", + "results = utils.pick_n_best(predictions=output, n=5)" + ] + }, + { + "cell_type": "markdown", + "id": "20296003", + "metadata": {}, + "source": [ + "Display the result." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6a8ef184", + "metadata": {}, + "outputs": [], + "source": [ + "for uri, result in zip(uris, results):\n", + " img = Image.open(requests.get(uri, stream=True).raw)\n", + " img.thumbnail((256,256), Image.ANTIALIAS)\n", + " plt.imshow(img)\n", + " plt.show()\n", + " print(result)\n" + ] + }, + { + "cell_type": "markdown", + "id": "0e6c679f", + "metadata": {}, + "source": [ + "### Details\n", + "For detailed information on model input and output, training recipies, inference and performance visit:\n", + "[github](https://github.com/NVIDIA/DeepLearningExamples/tree/master/PyTorch/Classification/ConvNets/se-resnext101-32x4d)\n", + "and/or [NGC](https://catalog.ngc.nvidia.com/orgs/nvidia/resources/se_resnext_for_pytorch).\n", + "\n", + "\n", + "### References\n", + "\n", + " - [Squeeze-and-Excitation Networks](https://arxiv.org/pdf/1709.01507.pdf)\n", + " - [model on github](https://github.com/NVIDIA/DeepLearningExamples/tree/master/PyTorch/Classification/ConvNets/se-resnext101-32x4d)\n", + " - [model on NGC](https://catalog.ngc.nvidia.com/orgs/nvidia/resources/se_resnext_for_pytorch)\n", + " - [pretrained model on NGC](https://catalog.ngc.nvidia.com/orgs/nvidia/models/seresnext101_32x4d_pyt_amp)" + ] + } + ], + "metadata": {}, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/assets/hub/nvidia_deeplearningexamples_ssd.ipynb b/assets/hub/nvidia_deeplearningexamples_ssd.ipynb index e91c207b5f59..c2a8a01a4e6e 100644 --- a/assets/hub/nvidia_deeplearningexamples_ssd.ipynb +++ b/assets/hub/nvidia_deeplearningexamples_ssd.ipynb @@ -2,6 +2,7 @@ "cells": [ { "cell_type": "markdown", + "id": "add05f5a", "metadata": {}, "source": [ "### This notebook requires a GPU runtime to run.\n", @@ -17,28 +18,7 @@ "\n", "_ | _\n", "- | -\n", - "![alt](https://pytorch.org/assets/images/ssd_diagram.png) | ![alt](https://pytorch.org/assets/images/ssd.png)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import torch \n", - "precision = 'fp32'\n", - "ssd_model = torch.hub.load('NVIDIA/DeepLearningExamples:torchhub', 'nvidia_ssd', model_math=precision)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "will load an SSD model pretrained on COCO dataset from Torch Hub.\n", - "\n", - "Setting precision='fp16' will load a checkpoint trained with [mixed precision](https://arxiv.org/abs/1710.03740) into architecture enabling execution on [Tensor Cores](https://developer.nvidia.com/tensor-cores).\n", - "Handling mixed precision data requires [Apex](https://github.com/NVIDIA/apex) library.\n", + "![alt](https://pytorch.org/assets/images/ssd_diagram.png) | ![alt](https://pytorch.org/assets/images/ssd.png)\n", "\n", "\n", "\n", @@ -56,7 +36,7 @@ "[Speed/accuracy trade-offs for modern convolutional object detectors](https://arxiv.org/abs/1611.10012)\n", "paper, the following enhancements were made to the backbone:\n", "* The conv5_x, avgpool, fc and softmax layers were removed from the original classification model.\n", - "* All strides in conv4_x are set to 1x1. 
\n", + "* All strides in conv4_x are set to 1x1.\n", "\n", "The backbone is followed by 5 additional convolutional layers.\n", "In addition to the convolutional layers, we attached 6 detection heads:\n", @@ -68,15 +48,15 @@ "\n", "### Example\n", "\n", - "In the example below we will use the pretrained SSD model loaded from Torch Hub to detect objects in sample images and visualize the result.\n", + "In the example below we will use the pretrained SSD model to detect objects in sample images and visualize the result.\n", "\n", - "To run the example you need some extra python packages installed.\n", - "These are needed for preprocessing images and visualization." + "To run the example you need some extra python packages installed. These are needed for preprocessing images and visualization." ] }, { "cell_type": "code", "execution_count": null, + "id": "a7799905", "metadata": {}, "outputs": [], "source": [ @@ -86,22 +66,27 @@ }, { "cell_type": "markdown", + "id": "ee048b09", "metadata": {}, "source": [ - "For convenient and comprehensive formatting of input and output of the model, load a set of utility methods." + "Load an SSD model pretrained on COCO dataset, as well as a set of utility methods for convenient and comprehensive formatting of input and output of the model." ] }, { "cell_type": "code", "execution_count": null, + "id": "57eadd0d", "metadata": {}, "outputs": [], "source": [ + "import torch\n", + "ssd_model = torch.hub.load('NVIDIA/DeepLearningExamples:torchhub', 'nvidia_ssd')\n", "utils = torch.hub.load('NVIDIA/DeepLearningExamples:torchhub', 'nvidia_ssd_processing_utils')" ] }, { "cell_type": "markdown", + "id": "f313bb4f", "metadata": {}, "source": [ "Now, prepare the loaded model for inference" @@ -110,6 +95,7 @@ { "cell_type": "code", "execution_count": null, + "id": "bc269b96", "metadata": {}, "outputs": [], "source": [ @@ -119,6 +105,7 @@ }, { "cell_type": "markdown", + "id": "f44b4baf", "metadata": {}, "source": [ "Prepare input images for object detection.\n", @@ -128,6 +115,7 @@ { "cell_type": "code", "execution_count": null, + "id": "97de9048", "metadata": {}, "outputs": [], "source": [ @@ -140,6 +128,7 @@ }, { "cell_type": "markdown", + "id": "5c7a8563", "metadata": {}, "source": [ "Format the images to comply with the network input and convert them to tensor." @@ -148,15 +137,17 @@ { "cell_type": "code", "execution_count": null, + "id": "0e6b7bce", "metadata": {}, "outputs": [], "source": [ "inputs = [utils.prepare_input(uri) for uri in uris]\n", - "tensor = utils.prepare_tensor(inputs, precision == 'fp16')" + "tensor = utils.prepare_tensor(inputs)" ] }, { "cell_type": "markdown", + "id": "ba5ef064", "metadata": {}, "source": [ "Run the SSD network to perform object detection." 
@@ -165,6 +156,7 @@ { "cell_type": "code", "execution_count": null, + "id": "7af0e311", "metadata": {}, "outputs": [], "source": [ @@ -174,6 +166,7 @@ }, { "cell_type": "markdown", + "id": "2f7b0ce7", "metadata": {}, "source": [ "By default, raw output from SSD network per input image contains\n", @@ -184,6 +177,7 @@ { "cell_type": "code", "execution_count": null, + "id": "d5e55a01", "metadata": {}, "outputs": [], "source": [ @@ -193,6 +187,7 @@ }, { "cell_type": "markdown", + "id": "21d6fcd1", "metadata": {}, "source": [ "The model was trained on COCO dataset, which we need to access in order to translate class IDs into object names.\n", @@ -202,6 +197,7 @@ { "cell_type": "code", "execution_count": null, + "id": "b0c5e835", "metadata": {}, "outputs": [], "source": [ @@ -210,6 +206,7 @@ }, { "cell_type": "markdown", + "id": "8953e6d2", "metadata": {}, "source": [ "Finally, let's visualize our detections" @@ -218,6 +215,7 @@ { "cell_type": "code", "execution_count": null, + "id": "cf2d9efe", "metadata": {}, "outputs": [], "source": [ @@ -242,24 +240,25 @@ }, { "cell_type": "markdown", + "id": "d50074e7", "metadata": {}, "source": [ "### Details\n", - "For detailed information on model input and output, \n", - "training recipies, inference and performance visit: \n", - "[github](https://github.com/NVIDIA/DeepLearningExamples/tree/master/PyTorch/Detection/SSD) \n", - "and/or [NGC](https://ngc.nvidia.com/catalog/model-scripts/nvidia:ssd_for_pytorch)\n", + "For detailed information on model input and output,\n", + "training recipies, inference and performance visit:\n", + "[github](https://github.com/NVIDIA/DeepLearningExamples/tree/master/PyTorch/Detection/SSD)\n", + "and/or [NGC](https://ngc.nvidia.com/catalog/resources/nvidia:ssd_for_pytorch)\n", "\n", "### References\n", "\n", " - [SSD: Single Shot MultiBox Detector](https://arxiv.org/abs/1512.02325) paper\n", " - [Speed/accuracy trade-offs for modern convolutional object detectors](https://arxiv.org/abs/1611.10012) paper\n", - " - [SSD on NGC](https://ngc.nvidia.com/catalog/model-scripts/nvidia:ssd_for_pytorch)\n", + " - [SSD on NGC](https://ngc.nvidia.com/catalog/resources/nvidia:ssd_for_pytorch)\n", " - [SSD on github](https://github.com/NVIDIA/DeepLearningExamples/tree/master/PyTorch/Detection/SSD)" ] } ], "metadata": {}, "nbformat": 4, - "nbformat_minor": 2 + "nbformat_minor": 5 } diff --git a/assets/hub/nvidia_deeplearningexamples_tacotron2.ipynb b/assets/hub/nvidia_deeplearningexamples_tacotron2.ipynb index 8528a37e7cf7..ecd3b7c29c3e 100644 --- a/assets/hub/nvidia_deeplearningexamples_tacotron2.ipynb +++ b/assets/hub/nvidia_deeplearningexamples_tacotron2.ipynb @@ -2,6 +2,7 @@ "cells": [ { "cell_type": "markdown", + "id": "d6a36e6b", "metadata": {}, "source": [ "### This notebook requires a GPU runtime to run.\n", @@ -15,24 +16,9 @@ "\n", "**The Tacotron 2 model for generating mel spectrograms from text**\n", "\n", - "\"alt\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import torch\n", - "tacotron2 = torch.hub.load('nvidia/DeepLearningExamples:torchhub', 'nvidia_tacotron2')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "will load the Tacotron2 model pre-trained on [LJ Speech dataset](https://keithito.com/LJ-Speech-Dataset/)\n", + "\"alt\"\n", + "\n", + "\n", "\n", "### Model Description\n", "\n", @@ -44,7 +30,7 @@ "\n", "In the example below:\n", "- pretrained Tacotron2 and Waveglow models are loaded from torch.hub\n", - "- Tacotron2 
generates mel spectrogram given tensor represantation of an input text (\"Hello world, I missed you\")\n", + "- Given a tensor representation of the input text (\"Hello world, I missed you so much\"), Tacotron2 generates a Mel spectrogram as shown on the illustration\n", "- Waveglow generates sound given the mel spectrogram\n", "- the output sound is saved in an 'audio.wav' file\n", "\n", @@ -55,95 +41,112 @@ { "cell_type": "code", "execution_count": null, + "id": "a384b737", "metadata": {}, "outputs": [], "source": [ "%%bash\n", - "pip install numpy scipy librosa unidecode inflect librosa" + "pip install numpy scipy librosa unidecode inflect librosa\n", + "apt-get update\n", + "apt-get install -y libsndfile1" + ] + }, + { + "cell_type": "markdown", + "id": "2578bea8", + "metadata": {}, + "source": [ + "Load the Tacotron2 model pre-trained on [LJ Speech dataset](https://keithito.com/LJ-Speech-Dataset/) and prepare it for inference:" ] }, { "cell_type": "code", "execution_count": null, + "id": "6d735c9f", "metadata": {}, "outputs": [], "source": [ - "import numpy as np\n", - "from scipy.io.wavfile import write" + "import torch\n", + "tacotron2 = torch.hub.load('NVIDIA/DeepLearningExamples:torchhub', 'nvidia_tacotron2', model_math='fp16')\n", + "tacotron2 = tacotron2.to('cuda')\n", + "tacotron2.eval()" ] }, { "cell_type": "markdown", + "id": "96353646", "metadata": {}, "source": [ - "Prepare tacotron2 for inference" + "Load pretrained WaveGlow model" ] }, { "cell_type": "code", "execution_count": null, + "id": "726773b0", "metadata": {}, "outputs": [], "source": [ - "tacotron2 = tacotron2.to('cuda')\n", - "tacotron2.eval()" + "waveglow = torch.hub.load('NVIDIA/DeepLearningExamples:torchhub', 'nvidia_waveglow', model_math='fp16')\n", + "waveglow = waveglow.remove_weightnorm(waveglow)\n", + "waveglow = waveglow.to('cuda')\n", + "waveglow.eval()" ] }, { "cell_type": "markdown", + "id": "0055a4a6", "metadata": {}, "source": [ - "Load waveglow from PyTorch Hub" + "Now, let's make the model say:" ] }, { "cell_type": "code", "execution_count": null, + "id": "c432a2a5", "metadata": {}, "outputs": [], "source": [ - "waveglow = torch.hub.load('nvidia/DeepLearningExamples:torchhub', 'nvidia_waveglow')\n", - "waveglow = waveglow.remove_weightnorm(waveglow)\n", - "waveglow = waveglow.to('cuda')\n", - "waveglow.eval()" + "text = \"Hello world, I missed you so much.\"" ] }, { "cell_type": "markdown", + "id": "09f3df08", "metadata": {}, "source": [ - "Now, let's make the model say *\"hello world, I missed you\"*" + "Format the input using utility methods" ] }, { "cell_type": "code", "execution_count": null, + "id": "1186aca4", "metadata": {}, "outputs": [], "source": [ - "text = \"hello world, I missed you\"" + "utils = torch.hub.load('NVIDIA/DeepLearningExamples:torchhub', 'nvidia_tts_utils')\n", + "sequences, lengths = utils.prepare_input_sequence([text])" ] }, { "cell_type": "markdown", + "id": "52b62d50", "metadata": {}, "source": [ - "Now chain pre-processing -> tacotron2 -> waveglow" + "Run the chained models:" ] }, { "cell_type": "code", "execution_count": null, + "id": "fe9a3235", "metadata": {}, "outputs": [], "source": [ - "# preprocessing\n", - "sequence = np.array(tacotron2.text_to_sequence(text, ['english_cleaners']))[None, :]\n", - "sequence = torch.from_numpy(sequence).to(device='cuda', dtype=torch.int64)\n", - "\n", - "# run the models\n", "with torch.no_grad():\n", - " _, mel, _, _ = tacotron2.infer(sequence)\n", + " mel, _, _ = tacotron2.infer(sequences, lengths)\n", " audio = 
waveglow.infer(mel)\n", "audio_numpy = audio[0].data.cpu().numpy()\n", "rate = 22050" @@ -151,6 +154,7 @@ }, { "cell_type": "markdown", + "id": "4f981ac4", "metadata": {}, "source": [ "You can write it to a file and listen to it" @@ -159,14 +163,17 @@ { "cell_type": "code", "execution_count": null, + "id": "4811ba40", "metadata": {}, "outputs": [], "source": [ + "from scipy.io.wavfile import write\n", "write(\"audio.wav\", rate, audio_numpy)" ] }, { "cell_type": "markdown", + "id": "a8484e97", "metadata": {}, "source": [ "Alternatively, play it right away in a notebook with IPython widgets" @@ -175,6 +182,7 @@ { "cell_type": "code", "execution_count": null, + "id": "4aea3333", "metadata": {}, "outputs": [], "source": [ @@ -184,21 +192,22 @@ }, { "cell_type": "markdown", + "id": "d8d9b03f", "metadata": {}, "source": [ "### Details\n", - "For detailed information on model input and output, training recipies, inference and performance visit: [github](https://github.com/NVIDIA/DeepLearningExamples/tree/master/PyTorch/SpeechSynthesis/Tacotron2) and/or [NGC](https://ngc.nvidia.com/catalog/model-scripts/nvidia:tacotron_2_and_waveglow_for_pytorch)\n", + "For detailed information on model input and output, training recipies, inference and performance visit: [github](https://github.com/NVIDIA/DeepLearningExamples/tree/master/PyTorch/SpeechSynthesis/Tacotron2) and/or [NGC](https://ngc.nvidia.com/catalog/resources/nvidia:tacotron_2_and_waveglow_for_pytorch)\n", "\n", "### References\n", "\n", " - [Natural TTS Synthesis by Conditioning WaveNet on Mel Spectrogram Predictions](https://arxiv.org/abs/1712.05884)\n", " - [WaveGlow: A Flow-based Generative Network for Speech Synthesis](https://arxiv.org/abs/1811.00002)\n", - " - [Tacotron2 and WaveGlow on NGC](https://ngc.nvidia.com/catalog/model-scripts/nvidia:tacotron_2_and_waveglow_for_pytorch)\n", + " - [Tacotron2 and WaveGlow on NGC](https://ngc.nvidia.com/catalog/resources/nvidia:tacotron_2_and_waveglow_for_pytorch)\n", " - [Tacotron2 and Waveglow on github](https://github.com/NVIDIA/DeepLearningExamples/tree/master/PyTorch/SpeechSynthesis/Tacotron2)" ] } ], "metadata": {}, "nbformat": 4, - "nbformat_minor": 2 + "nbformat_minor": 5 } diff --git a/assets/hub/nvidia_deeplearningexamples_waveglow.ipynb b/assets/hub/nvidia_deeplearningexamples_waveglow.ipynb index db60fb8846b3..be4c1c4b8b72 100644 --- a/assets/hub/nvidia_deeplearningexamples_waveglow.ipynb +++ b/assets/hub/nvidia_deeplearningexamples_waveglow.ipynb @@ -2,6 +2,7 @@ "cells": [ { "cell_type": "markdown", + "id": "1d26fab5", "metadata": {}, "source": [ "### This notebook requires a GPU runtime to run.\n", @@ -15,24 +16,9 @@ "\n", "**WaveGlow model for generating speech from mel spectrograms (generated by Tacotron2)**\n", "\n", - "\"alt\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import torch\n", - "waveglow = torch.hub.load('nvidia/DeepLearningExamples:torchhub', 'nvidia_waveglow')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "will load the WaveGlow model pre-trained on [LJ Speech dataset](https://keithito.com/LJ-Speech-Dataset/)\n", + "\"alt\"\n", + "\n", + "\n", "\n", "### Model Description\n", "\n", @@ -42,7 +28,7 @@ "\n", "In the example below:\n", "- pretrained Tacotron2 and Waveglow models are loaded from torch.hub\n", - "- Tacotron2 generates mel spectrogram given tensor represantation of an input text (\"Hello world, I missed you\")\n", + "- Given a tensor representation of the input 
text (\"Hello world, I missed you so much\"), Tacotron2 generates a Mel spectrogram as shown on the illustration\n", "- Waveglow generates sound given the mel spectrogram\n", "- the output sound is saved in an 'audio.wav' file\n", "\n", @@ -53,33 +39,47 @@ { "cell_type": "code", "execution_count": null, + "id": "406508db", "metadata": {}, "outputs": [], "source": [ "%%bash\n", - "pip install numpy scipy librosa unidecode inflect librosa" + "pip install numpy scipy librosa unidecode inflect librosa\n", + "apt-get update\n", + "apt-get install -y libsndfile1" + ] + }, + { + "cell_type": "markdown", + "id": "942e77d1", + "metadata": {}, + "source": [ + "Load the WaveGlow model pre-trained on [LJ Speech dataset](https://keithito.com/LJ-Speech-Dataset/)" ] }, { "cell_type": "code", "execution_count": null, + "id": "537f1a63", "metadata": {}, "outputs": [], "source": [ - "import numpy as np\n", - "from scipy.io.wavfile import write" + "import torch\n", + "waveglow = torch.hub.load('NVIDIA/DeepLearningExamples:torchhub', 'nvidia_waveglow', model_math='fp32')" ] }, { "cell_type": "markdown", + "id": "7a47b767", "metadata": {}, "source": [ - "Prepare the waveglow model for inference" + "Prepare the WaveGlow model for inference" ] }, { "cell_type": "code", "execution_count": null, + "id": "1fbed70c", "metadata": {}, "outputs": [], "source": [ @@ -90,58 +90,78 @@ }, { "cell_type": "markdown", + "id": "8b0dcbce", "metadata": {}, "source": [ - "Load tacotron2 from PyTorch Hub" + "Load a pretrained Tacotron2 model" ] }, { "cell_type": "code", "execution_count": null, + "id": "1e1c62ea", "metadata": {}, "outputs": [], "source": [ - "tacotron2 = torch.hub.load('nvidia/DeepLearningExamples:torchhub', 'nvidia_tacotron2')\n", + "tacotron2 = torch.hub.load('NVIDIA/DeepLearningExamples:torchhub', 'nvidia_tacotron2', model_math='fp32')\n", "tacotron2 = tacotron2.to('cuda')\n", "tacotron2.eval()" ] }, { "cell_type": "markdown", + "id": "df4cc284", "metadata": {}, "source": [ - "Now, let's make the model say *\"hello world, I missed you\"*" + "Now, let's make the model say:" ] }, { "cell_type": "code", "execution_count": null, + "id": "aa1ca779", "metadata": {}, "outputs": [], "source": [ - "text = \"hello world, I missed you\"" + "text = \"hello world, I missed you so much\"" ] }, { "cell_type": "markdown", + "id": "4ad6ebad", "metadata": {}, "source": [ - "Now chain pre-processing -> tacotron2 -> waveglow" + "Format the input using utility methods" ] }, { "cell_type": "code", "execution_count": null, + "id": "1b6dc4d1", + "metadata": {}, + "outputs": [], + "source": [ + "utils = torch.hub.load('NVIDIA/DeepLearningExamples:torchhub', 'nvidia_tts_utils')\n", + "sequences, lengths = utils.prepare_input_sequence([text])" + ] + }, + { + "cell_type": "markdown", + "id": "2de62c22", + "metadata": {}, + "source": [ + "Run the chained models" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "881b70b7", "metadata": {}, "outputs": [], "source": [ - "# preprocessing\n", - "sequence = np.array(tacotron2.text_to_sequence(text, ['english_cleaners']))[None, :]\n", - "sequence = torch.from_numpy(sequence).to(device='cuda', dtype=torch.int64)\n", - "\n", - "# run the models\n", "with torch.no_grad():\n", - " _, mel, _, _ = tacotron2.infer(sequence)\n", + " mel, _, _ = tacotron2.infer(sequences, lengths)\n", " audio = waveglow.infer(mel)\n", "audio_numpy = audio[0].data.cpu().numpy()\n", "rate = 22050" @@ -149,6 +169,7 @@ }, { "cell_type": "markdown", + "id": "9471a982", "metadata": {}, "source": [ "You can 
write it to a file and listen to it" @@ -157,14 +178,17 @@ { "cell_type": "code", "execution_count": null, + "id": "87449085", "metadata": {}, "outputs": [], "source": [ + "from scipy.io.wavfile import write\n", "write(\"audio.wav\", rate, audio_numpy)" ] }, { "cell_type": "markdown", + "id": "b8555270", "metadata": {}, "source": [ "Alternatively, play it right away in a notebook with IPython widgets" @@ -173,6 +197,7 @@ { "cell_type": "code", "execution_count": null, + "id": "1a54e376", "metadata": {}, "outputs": [], "source": [ @@ -182,21 +207,22 @@ }, { "cell_type": "markdown", + "id": "461a1cf1", "metadata": {}, "source": [ "### Details\n", - "For detailed information on model input and output, training recipies, inference and performance visit: [github](https://github.com/NVIDIA/DeepLearningExamples/tree/master/PyTorch/SpeechSynthesis/Tacotron2) and/or [NGC](https://ngc.nvidia.com/catalog/model-scripts/nvidia:tacotron_2_and_waveglow_for_pytorch)\n", + "For detailed information on model input and output, training recipies, inference and performance visit: [github](https://github.com/NVIDIA/DeepLearningExamples/tree/master/PyTorch/SpeechSynthesis/Tacotron2) and/or [NGC](https://ngc.nvidia.com/catalog/resources/nvidia:tacotron_2_and_waveglow_for_pytorch)\n", "\n", "### References\n", "\n", " - [Natural TTS Synthesis by Conditioning WaveNet on Mel Spectrogram Predictions](https://arxiv.org/abs/1712.05884)\n", " - [WaveGlow: A Flow-based Generative Network for Speech Synthesis](https://arxiv.org/abs/1811.00002)\n", - " - [Tacotron2 and WaveGlow on NGC](https://ngc.nvidia.com/catalog/model-scripts/nvidia:tacotron_2_and_waveglow_for_pytorch)\n", + " - [Tacotron2 and WaveGlow on NGC](https://ngc.nvidia.com/catalog/resources/nvidia:tacotron_2_and_waveglow_for_pytorch)\n", " - [Tacotron2 and Waveglow on github](https://github.com/NVIDIA/DeepLearningExamples/tree/master/PyTorch/SpeechSynthesis/Tacotron2)" ] } ], "metadata": {}, "nbformat": 4, - "nbformat_minor": 2 + "nbformat_minor": 5 } diff --git a/assets/hub/pytorch_fairseq_roberta.ipynb b/assets/hub/pytorch_fairseq_roberta.ipynb index a2c972670663..a0b9e24a2743 100644 --- a/assets/hub/pytorch_fairseq_roberta.ipynb +++ b/assets/hub/pytorch_fairseq_roberta.ipynb @@ -2,6 +2,7 @@ "cells": [ { "cell_type": "markdown", + "id": "a22ee80f", "metadata": {}, "source": [ "### This notebook is optionally accelerated with a GPU runtime.\n", @@ -42,15 +43,17 @@ { "cell_type": "code", "execution_count": null, + "id": "31bf82e3", "metadata": {}, "outputs": [], "source": [ "%%bash\n", - "pip install regex requests" + "pip install regex requests hydra-core omegaconf" ] }, { "cell_type": "markdown", + "id": "c661359f", "metadata": {}, "source": [ "### Example\n", @@ -61,6 +64,7 @@ { "cell_type": "code", "execution_count": null, + "id": "ea6f6c39", "metadata": {}, "outputs": [], "source": [ @@ -71,6 +75,7 @@ }, { "cell_type": "markdown", + "id": "ec181a50", "metadata": {}, "source": [ "##### Apply Byte-Pair Encoding (BPE) to input text" @@ -79,6 +84,7 @@ { "cell_type": "code", "execution_count": null, + "id": "fb01609c", "metadata": {}, "outputs": [], "source": [ @@ -89,6 +95,7 @@ }, { "cell_type": "markdown", + "id": "6903db0b", "metadata": {}, "source": [ "##### Extract features from RoBERTa" @@ -97,6 +104,7 @@ { "cell_type": "code", "execution_count": null, + "id": "637c35e5", "metadata": {}, "outputs": [], "source": [ @@ -112,6 +120,7 @@ }, { "cell_type": "markdown", + "id": "db346d27", "metadata": {}, "source": [ "##### Use RoBERTa for sentence-pair 
classification tasks" @@ -120,6 +129,7 @@ { "cell_type": "code", "execution_count": null, + "id": "898b46e2", "metadata": {}, "outputs": [], "source": [ @@ -141,6 +151,7 @@ }, { "cell_type": "markdown", + "id": "6c234073", "metadata": {}, "source": [ "##### Register a new (randomly initialized) classification head" @@ -149,6 +160,7 @@ { "cell_type": "code", "execution_count": null, + "id": "1a89094b", "metadata": {}, "outputs": [], "source": [ @@ -158,6 +170,7 @@ }, { "cell_type": "markdown", + "id": "77b22901", "metadata": {}, "source": [ "### References\n", @@ -173,5 +186,5 @@ ], "metadata": {}, "nbformat": 4, - "nbformat_minor": 2 + "nbformat_minor": 5 } diff --git a/assets/hub/pytorch_fairseq_translation.ipynb b/assets/hub/pytorch_fairseq_translation.ipynb index da8317fab5ee..1c54148e4b77 100644 --- a/assets/hub/pytorch_fairseq_translation.ipynb +++ b/assets/hub/pytorch_fairseq_translation.ipynb @@ -2,6 +2,7 @@ "cells": [ { "cell_type": "markdown", + "id": "00c45e90", "metadata": {}, "source": [ "### This notebook is optionally accelerated with a GPU runtime.\n", @@ -36,15 +37,17 @@ { "cell_type": "code", "execution_count": null, + "id": "fc44211c", "metadata": {}, "outputs": [], "source": [ "%%bash\n", - "pip install fastBPE regex requests sacremoses subword_nmt" + "pip install bitarray fastBPE hydra-core omegaconf regex requests sacremoses subword_nmt" ] }, { "cell_type": "markdown", + "id": "688cbbe5", "metadata": {}, "source": [ "### English-to-French Translation\n", @@ -56,6 +59,7 @@ { "cell_type": "code", "execution_count": null, + "id": "36199fd8", "metadata": {}, "outputs": [], "source": [ @@ -97,6 +101,7 @@ }, { "cell_type": "markdown", + "id": "66b917b7", "metadata": {}, "source": [ "### English-to-German Translation\n", @@ -118,6 +123,7 @@ { "cell_type": "code", "execution_count": null, + "id": "b6ec05f7", "metadata": {}, "outputs": [], "source": [ @@ -136,6 +142,7 @@ }, { "cell_type": "markdown", + "id": "5633bdd6", "metadata": {}, "source": [ "We can also do a round-trip translation to create a paraphrase:" @@ -144,6 +151,7 @@ { "cell_type": "code", "execution_count": null, + "id": "3c9ced10", "metadata": {}, "outputs": [], "source": [ @@ -164,6 +172,7 @@ }, { "cell_type": "markdown", + "id": "5e28c30c", "metadata": {}, "source": [ "### References\n", @@ -187,5 +196,5 @@ ], "metadata": {}, "nbformat": 4, - "nbformat_minor": 2 + "nbformat_minor": 5 } diff --git a/assets/hub/pytorch_vision_alexnet.ipynb b/assets/hub/pytorch_vision_alexnet.ipynb index c7e09f189c56..9d657a3d0f8b 100644 --- a/assets/hub/pytorch_vision_alexnet.ipynb +++ b/assets/hub/pytorch_vision_alexnet.ipynb @@ -2,6 +2,7 @@ "cells": [ { "cell_type": "markdown", + "id": "a8c51646", "metadata": {}, "source": [ "### This notebook is optionally accelerated with a GPU runtime.\n", @@ -23,16 +24,18 @@ { "cell_type": "code", "execution_count": null, + "id": "de851ed4", "metadata": {}, "outputs": [], "source": [ "import torch\n", - "model = torch.hub.load('pytorch/vision', 'alexnet', pretrained=True)\n", + "model = torch.hub.load('pytorch/vision:v0.10.0', 'alexnet', pretrained=True)\n", "model.eval()" ] }, { "cell_type": "markdown", + "id": "c6c7ae8b", "metadata": {}, "source": [ "All pre-trained models expect input images normalized in the same way,\n", @@ -46,12 +49,13 @@ { "cell_type": "code", "execution_count": null, + "id": "3e4e8088", "metadata": {}, "outputs": [], "source": [ "# Download an example image from the pytorch website\n", "import urllib\n", - "url, filename = 
(\"https://github.com/pytorch/hub/raw/master/dog.jpg\", \"dog.jpg\")\n", + "url, filename = (\"https://github.com/pytorch/hub/raw/master/images/dog.jpg\", \"dog.jpg\")\n", "try: urllib.URLopener().retrieve(url, filename)\n", "except: urllib.request.urlretrieve(url, filename)" ] @@ -59,6 +63,7 @@ { "cell_type": "code", "execution_count": null, + "id": "f52cf73f", "metadata": {}, "outputs": [], "source": [ @@ -82,25 +87,54 @@ "\n", "with torch.no_grad():\n", " output = model(input_batch)\n", - "# Tensor of shape 1000, with confidence scores over Imagenet's 1000 classes\n", + "# Tensor of shape 1000, with confidence scores over ImageNet's 1000 classes\n", "print(output[0])\n", "# The output has unnormalized scores. To get probabilities, you can run a softmax on it.\n", - "print(torch.nn.functional.softmax(output[0], dim=0))\n" + "probabilities = torch.nn.functional.softmax(output[0], dim=0)\n", + "print(probabilities)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c0d41084", + "metadata": {}, + "outputs": [], + "source": [ + "# Download ImageNet labels\n", + "!wget https://raw.githubusercontent.com/pytorch/hub/master/imagenet_classes.txt" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "813b3bab", + "metadata": {}, + "outputs": [], + "source": [ + "# Read the categories\n", + "with open(\"imagenet_classes.txt\", \"r\") as f:\n", + " categories = [s.strip() for s in f.readlines()]\n", + "# Show top categories per image\n", + "top5_prob, top5_catid = torch.topk(probabilities, 5)\n", + "for i in range(top5_prob.size(0)):\n", + " print(categories[top5_catid[i]], top5_prob[i].item())" ] }, { "cell_type": "markdown", + "id": "2b0e3dac", "metadata": {}, "source": [ "### Model Description\n", "\n", "AlexNet competed in the ImageNet Large Scale Visual Recognition Challenge on September 30, 2012. The network achieved a top-5 error of 15.3%, more than 10.8 percentage points lower than that of the runner up. 
The original paper's primary result was that the depth of the model was essential for its high performance, which was computationally expensive, but made feasible due to the utilization of graphics processing units (GPUs) during training.\n", "\n", - "The 1-crop error rates on the imagenet dataset with the pretrained model are listed below.\n", + "The 1-crop error rates on the ImageNet dataset with the pretrained model are listed below.\n", "\n", "| Model structure | Top-1 error | Top-5 error |\n", "| --------------- | ----------- | ----------- |\n", - "| alexnet | 43.45 | 20.91 |\n", + "| AlexNet | 43.45 | 20.91 |\n", "\n", "### References\n", "\n", @@ -110,5 +144,5 @@ ], "metadata": {}, "nbformat": 4, - "nbformat_minor": 2 + "nbformat_minor": 5 } diff --git a/assets/hub/pytorch_vision_deeplabv3_resnet101.ipynb b/assets/hub/pytorch_vision_deeplabv3_resnet101.ipynb index cc73b4ab4a19..9908963f7726 100644 --- a/assets/hub/pytorch_vision_deeplabv3_resnet101.ipynb +++ b/assets/hub/pytorch_vision_deeplabv3_resnet101.ipynb @@ -2,6 +2,7 @@ "cells": [ { "cell_type": "markdown", + "id": "7e0c977e", "metadata": {}, "source": [ "### This notebook is optionally accelerated with a GPU runtime.\n", @@ -9,11 +10,11 @@ "\n", "----------------------------------------------------------------------\n", "\n", - "# Deeplabv3-ResNet101\n", + "# Deeplabv3\n", "\n", "*Author: Pytorch Team*\n", "\n", - "**DeepLabV3 model with a ResNet-101 backbone**\n", + "**DeepLabV3 models with ResNet-50, ResNet-101 and MobileNet-V3 backbones**\n", "\n", "_ | _\n", "- | -\n", @@ -23,16 +24,21 @@ { "cell_type": "code", "execution_count": null, + "id": "aee2b394", "metadata": {}, "outputs": [], "source": [ "import torch\n", - "model = torch.hub.load('pytorch/vision', 'deeplabv3_resnet101', pretrained=True)\n", + "model = torch.hub.load('pytorch/vision:v0.10.0', 'deeplabv3_resnet50', pretrained=True)\n", + "# or any of these variants\n", + "# model = torch.hub.load('pytorch/vision:v0.10.0', 'deeplabv3_resnet101', pretrained=True)\n", + "# model = torch.hub.load('pytorch/vision:v0.10.0', 'deeplabv3_mobilenet_v3_large', pretrained=True)\n", "model.eval()" ] }, { "cell_type": "markdown", + "id": "ffb66f42", "metadata": {}, "source": [ "All pre-trained models expect input images normalized in the same way,\n", @@ -41,19 +47,20 @@ "and `std = [0.229, 0.224, 0.225]`.\n", "\n", "The model returns an `OrderedDict` with two Tensors that are of the same height and width as the input Tensor, but with 21 classes.\n", - "`output['out']` contains the semantic masks, and `output['aux']` contains the auxillary loss values per-pixel. In inference mode, `output['aux']` is not useful.\n", - "So, `output['out']` is of shape `(N, 21, H, W)`. More documentation can be found [here](https://pytorch.org/docs/stable/torchvision/models.html#object-detection-instance-segmentation-and-person-keypoint-detection)." + "`output['out']` contains the semantic masks, and `output['aux']` contains the auxiliary loss values per-pixel. In inference mode, `output['aux']` is not useful.\n", + "So, `output['out']` is of shape `(N, 21, H, W)`. More documentation can be found [here](https://pytorch.org/vision/stable/models.html#semantic-segmentation)." 
] }, { "cell_type": "code", "execution_count": null, + "id": "6eb1b292", "metadata": {}, "outputs": [], "source": [ "# Download an example image from the pytorch website\n", "import urllib\n", - "url, filename = (\"https://github.com/pytorch/hub/raw/master/dog.jpg\", \"dog.jpg\")\n", + "url, filename = (\"https://github.com/pytorch/hub/raw/master/images/deeplab1.png\", \"deeplab1.png\")\n", "try: urllib.URLopener().retrieve(url, filename)\n", "except: urllib.request.urlretrieve(url, filename)" ] @@ -61,6 +68,7 @@ { "cell_type": "code", "execution_count": null, + "id": "88780d40", "metadata": {}, "outputs": [], "source": [ @@ -68,6 +76,7 @@ "from PIL import Image\n", "from torchvision import transforms\n", "input_image = Image.open(filename)\n", + "input_image = input_image.convert(\"RGB\")\n", "preprocess = transforms.Compose([\n", " transforms.ToTensor(),\n", " transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),\n", @@ -88,9 +97,10 @@ }, { "cell_type": "markdown", + "id": "162a301a", "metadata": {}, "source": [ - "The output here is of shape `(21, H, W)`, and at each location, there are unnormalized proababilities corresponding to the prediction of each class.\n", + "The output here is of shape `(21, H, W)`, and at each location, there are unnormalized probabilities corresponding to the prediction of each class.\n", "To get the maximum prediction of each class, and then use it for a downstream task, you can do `output_predictions = output.argmax(0)`.\n", "\n", "Here's a small snippet that plots the predictions, with each color being assigned to each class (see the visualized image on the left)." @@ -99,6 +109,7 @@ { "cell_type": "code", "execution_count": null, + "id": "967e4c05", "metadata": {}, "outputs": [], "source": [ @@ -118,18 +129,22 @@ }, { "cell_type": "markdown", + "id": "a766996f", "metadata": {}, "source": [ "### Model Description\n", "\n", - "Deeplabv3-ResNet101 is contructed by a Deeplabv3 model with a ResNet-101 backbone.\n", + "Deeplabv3-ResNet is constructed by a Deeplabv3 model using a ResNet-50 or ResNet-101 backbone.\n", + "Deeplabv3-MobileNetV3-Large is constructed by a Deeplabv3 model using the MobileNetV3 large backbone.\n", "The pre-trained model has been trained on a subset of COCO train2017, on the 20 categories that are present in the Pascal VOC dataset.\n", "\n", "Their accuracies of the pre-trained models evaluated on COCO val2017 dataset are listed below.\n", "\n", - "| Model structure | Mean IOU | Global Pixelwise Accuracy |\n", - "| ------------------- | ----------- | --------------------------|\n", - "| deeplabv3_resnet101 | 67.4 | 92.4 |\n", + "| Model structure | Mean IOU | Global Pixelwise Accuracy |\n", + "| ---------------------------- | ----------- | --------------------------|\n", + "| deeplabv3_resnet50 | 66.4 | 92.4 |\n", + "| deeplabv3_resnet101 | 67.4 | 92.4 |\n", + "| deeplabv3_mobilenet_v3_large | 60.3 | 91.2 |\n", "\n", "### Resources\n", "\n", @@ -139,5 +154,5 @@ ], "metadata": {}, "nbformat": 4, - "nbformat_minor": 2 + "nbformat_minor": 5 } diff --git a/assets/hub/pytorch_vision_densenet.ipynb b/assets/hub/pytorch_vision_densenet.ipynb index 60e947b2ef22..fc61d267a09d 100644 --- a/assets/hub/pytorch_vision_densenet.ipynb +++ b/assets/hub/pytorch_vision_densenet.ipynb @@ -2,6 +2,7 @@ "cells": [ { "cell_type": "markdown", + "id": "7dac9025", "metadata": {}, "source": [ "### This notebook is optionally accelerated with a GPU runtime.\n", @@ -23,20 +24,22 @@ { "cell_type": "code", "execution_count": null, + "id": 
"a9367c2e", "metadata": {}, "outputs": [], "source": [ "import torch\n", - "model = torch.hub.load('pytorch/vision', 'densenet121', pretrained=True)\n", + "model = torch.hub.load('pytorch/vision:v0.10.0', 'densenet121', pretrained=True)\n", "# or any of these variants\n", - "# model = torch.hub.load('pytorch/vision', 'densenet169', pretrained=True)\n", - "# model = torch.hub.load('pytorch/vision', 'densenet201', pretrained=True)\n", - "# model = torch.hub.load('pytorch/vision', 'densenet161', pretrained=True)\n", + "# model = torch.hub.load('pytorch/vision:v0.10.0', 'densenet169', pretrained=True)\n", + "# model = torch.hub.load('pytorch/vision:v0.10.0', 'densenet201', pretrained=True)\n", + "# model = torch.hub.load('pytorch/vision:v0.10.0', 'densenet161', pretrained=True)\n", "model.eval()" ] }, { "cell_type": "markdown", + "id": "16747e9d", "metadata": {}, "source": [ "All pre-trained models expect input images normalized in the same way,\n", @@ -50,12 +53,13 @@ { "cell_type": "code", "execution_count": null, + "id": "578d3a1e", "metadata": {}, "outputs": [], "source": [ "# Download an example image from the pytorch website\n", "import urllib\n", - "url, filename = (\"https://github.com/pytorch/hub/raw/master/dog.jpg\", \"dog.jpg\")\n", + "url, filename = (\"https://github.com/pytorch/hub/raw/master/images/dog.jpg\", \"dog.jpg\")\n", "try: urllib.URLopener().retrieve(url, filename)\n", "except: urllib.request.urlretrieve(url, filename)" ] @@ -63,6 +67,7 @@ { "cell_type": "code", "execution_count": null, + "id": "9e53747c", "metadata": {}, "outputs": [], "source": [ @@ -86,21 +91,50 @@ "\n", "with torch.no_grad():\n", " output = model(input_batch)\n", - "# Tensor of shape 1000, with confidence scores over Imagenet's 1000 classes\n", + "# Tensor of shape 1000, with confidence scores over ImageNet's 1000 classes\n", "print(output[0])\n", "# The output has unnormalized scores. To get probabilities, you can run a softmax on it.\n", - "print(torch.nn.functional.softmax(output[0], dim=0))\n" + "probabilities = torch.nn.functional.softmax(output[0], dim=0)\n", + "print(probabilities)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a77ca7e1", + "metadata": {}, + "outputs": [], + "source": [ + "# Download ImageNet labels\n", + "!wget https://raw.githubusercontent.com/pytorch/hub/master/imagenet_classes.txt" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c37376b4", + "metadata": {}, + "outputs": [], + "source": [ + "# Read the categories\n", + "with open(\"imagenet_classes.txt\", \"r\") as f:\n", + " categories = [s.strip() for s in f.readlines()]\n", + "# Show top categories per image\n", + "top5_prob, top5_catid = torch.topk(probabilities, 5)\n", + "for i in range(top5_prob.size(0)):\n", + " print(categories[top5_catid[i]], top5_prob[i].item())" ] }, { "cell_type": "markdown", + "id": "5cdbff63", "metadata": {}, "source": [ "### Model Description\n", "\n", "Dense Convolutional Network (DenseNet), connects each layer to every other layer in a feed-forward fashion. Whereas traditional convolutional networks with L layers have L connections - one between each layer and its subsequent layer - our network has L(L+1)/2 direct connections. For each layer, the feature-maps of all preceding layers are used as inputs, and its own feature-maps are used as inputs into all subsequent layers. 
DenseNets have several compelling advantages: they alleviate the vanishing-gradient problem, strengthen feature propagation, encourage feature reuse, and substantially reduce the number of parameters.\n", "\n", - "The 1-crop error rates on the imagenet dataset with the pretrained model are listed below.\n", + "The 1-crop error rates on the ImageNet dataset with the pretrained model are listed below.\n", "\n", "| Model structure | Top-1 error | Top-5 error |\n", "| --------------- | ----------- | ----------- |\n", @@ -117,5 +151,5 @@ ], "metadata": {}, "nbformat": 4, - "nbformat_minor": 2 + "nbformat_minor": 5 } diff --git a/assets/hub/pytorch_vision_fcn_resnet101.ipynb b/assets/hub/pytorch_vision_fcn_resnet101.ipynb index c29506225a9b..880c4ad7b2d6 100644 --- a/assets/hub/pytorch_vision_fcn_resnet101.ipynb +++ b/assets/hub/pytorch_vision_fcn_resnet101.ipynb @@ -2,6 +2,7 @@ "cells": [ { "cell_type": "markdown", + "id": "ad4eaa3f", "metadata": {}, "source": [ "### This notebook is optionally accelerated with a GPU runtime.\n", @@ -9,11 +10,11 @@ "\n", "----------------------------------------------------------------------\n", "\n", - "# FCN-ResNet101\n", + "# FCN\n", "\n", "*Author: Pytorch Team*\n", "\n", - "**Fully-Convolutional Network model with a ResNet-101 backbone**\n", + "**Fully-Convolutional Network model with ResNet-50 and ResNet-101 backbones**\n", "\n", "_ | _\n", "- | -\n", @@ -23,16 +24,20 @@ { "cell_type": "code", "execution_count": null, + "id": "596ae4bd", "metadata": {}, "outputs": [], "source": [ "import torch\n", - "model = torch.hub.load('pytorch/vision', 'fcn_resnet101', pretrained=True)\n", + "model = torch.hub.load('pytorch/vision:v0.10.0', 'fcn_resnet50', pretrained=True)\n", + "# or\n", + "# model = torch.hub.load('pytorch/vision:v0.10.0', 'fcn_resnet101', pretrained=True)\n", "model.eval()" ] }, { "cell_type": "markdown", + "id": "2e46d2dc", "metadata": {}, "source": [ "All pre-trained models expect input images normalized in the same way,\n", @@ -48,12 +53,13 @@ { "cell_type": "code", "execution_count": null, + "id": "5b259707", "metadata": {}, "outputs": [], "source": [ "# Download an example image from the pytorch website\n", "import urllib\n", - "url, filename = (\"https://github.com/pytorch/hub/raw/master/dog.jpg\", \"dog.jpg\")\n", + "url, filename = (\"https://github.com/pytorch/hub/raw/master/images/deeplab1.png\", \"deeplab1.png\")\n", "try: urllib.URLopener().retrieve(url, filename)\n", "except: urllib.request.urlretrieve(url, filename)" ] @@ -61,6 +67,7 @@ { "cell_type": "code", "execution_count": null, + "id": "b1f6fa05", "metadata": {}, "outputs": [], "source": [ @@ -68,6 +75,7 @@ "from PIL import Image\n", "from torchvision import transforms\n", "input_image = Image.open(filename)\n", + "input_image = input_image.convert(\"RGB\")\n", "preprocess = transforms.Compose([\n", " transforms.ToTensor(),\n", " transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),\n", @@ -88,9 +96,10 @@ }, { "cell_type": "markdown", + "id": "4542090e", "metadata": {}, "source": [ - "The output here is of shape `(21, H, W)`, and at each location, there are unnormalized proababilities corresponding to the prediction of each class.\n", + "The output here is of shape `(21, H, W)`, and at each location, there are unnormalized probabilities corresponding to the prediction of each class.\n", "To get the maximum prediction of each class, and then use it for a downstream task, you can do `output_predictions = output.argmax(0)`.\n", "\n", "Here's a small snippet 
that plots the predictions, with each color being assigned to each class (see the visualized image on the left)." @@ -99,6 +108,7 @@ { "cell_type": "code", "execution_count": null, + "id": "f7277631", "metadata": {}, "outputs": [], "source": [ @@ -118,17 +128,19 @@ }, { "cell_type": "markdown", + "id": "3a5a585d", "metadata": {}, "source": [ "### Model Description\n", "\n", - "FCN-ResNet101 is contructed by a Fully-Covolutional Network model with a ResNet-101 backbone.\n", + "FCN-ResNet is constructed by a Fully-Convolutional Network model, using a ResNet-50 or a ResNet-101 backbone.\n", "The pre-trained models have been trained on a subset of COCO train2017, on the 20 categories that are present in the Pascal VOC dataset.\n", "\n", "Their accuracies of the pre-trained models evaluated on COCO val2017 dataset are listed below.\n", "\n", "| Model structure | Mean IOU | Global Pixelwise Accuracy |\n", "| --------------- | ----------- | --------------------------|\n", + "| fcn_resnet50 | 60.5 | 91.4 |\n", "| fcn_resnet101 | 63.7 | 91.9 |\n", "\n", "### Resources\n", @@ -139,5 +151,5 @@ ], "metadata": {}, "nbformat": 4, - "nbformat_minor": 2 + "nbformat_minor": 5 } diff --git a/assets/hub/pytorch_vision_ghostnet.ipynb b/assets/hub/pytorch_vision_ghostnet.ipynb new file mode 100644 index 000000000000..9625aa6efbe1 --- /dev/null +++ b/assets/hub/pytorch_vision_ghostnet.ipynb @@ -0,0 +1,154 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "28f1b67a", + "metadata": {}, + "source": [ + "### This notebook is optionally accelerated with a GPU runtime.\n", + "### If you would like to use this acceleration, please select the menu option \"Runtime\" -> \"Change runtime type\", select \"Hardware Accelerator\" -> \"GPU\" and click \"SAVE\"\n", + "\n", + "----------------------------------------------------------------------\n", + "\n", + "# GhostNet\n", + "\n", + "*Author: Huawei Noah's Ark Lab*\n", + "\n", + "**Efficient networks by generating more features from cheap operations**\n", + "\n", + "\"alt\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4d3c51de", + "metadata": {}, + "outputs": [], + "source": [ + "import torch\n", + "model = torch.hub.load('huawei-noah/ghostnet', 'ghostnet_1x', pretrained=True)\n", + "model.eval()" + ] + }, + { + "cell_type": "markdown", + "id": "f950f2af", + "metadata": {}, + "source": [ + "All pre-trained models expect input images normalized in the same way,\n", + "i.e. mini-batches of 3-channel RGB images of shape `(3 x H x W)`, where `H` and `W` are expected to be at least `224`.\n", + "The images have to be loaded in to a range of `[0, 1]` and then normalized using `mean = [0.485, 0.456, 0.406]`\n", + "and `std = [0.229, 0.224, 0.225]`.\n", + "\n", + "Here's a sample execution." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "58fbf55e", + "metadata": {}, + "outputs": [], + "source": [ + "# Download an example image from the pytorch website\n", + "import urllib\n", + "url, filename = (\"https://github.com/pytorch/hub/raw/master/images/dog.jpg\", \"dog.jpg\")\n", + "try: urllib.URLopener().retrieve(url, filename)\n", + "except: urllib.request.urlretrieve(url, filename)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ab2d59bf", + "metadata": {}, + "outputs": [], + "source": [ + "# sample execution (requires torchvision)\n", + "from PIL import Image\n", + "from torchvision import transforms\n", + "input_image = Image.open(filename)\n", + "preprocess = transforms.Compose([\n", + " transforms.Resize(256),\n", + " transforms.CenterCrop(224),\n", + " transforms.ToTensor(),\n", + " transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),\n", + "])\n", + "input_tensor = preprocess(input_image)\n", + "input_batch = input_tensor.unsqueeze(0) # create a mini-batch as expected by the model\n", + "\n", + "# move the input and model to GPU for speed if available\n", + "if torch.cuda.is_available():\n", + " input_batch = input_batch.to('cuda')\n", + " model.to('cuda')\n", + "\n", + "with torch.no_grad():\n", + " output = model(input_batch)\n", + "# Tensor of shape 1000, with confidence scores over ImageNet's 1000 classes\n", + "print(output[0])\n", + "# The output has unnormalized scores. To get probabilities, you can run a softmax on it.\n", + "probabilities = torch.nn.functional.softmax(output[0], dim=0)\n", + "print(probabilities)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2979ac25", + "metadata": {}, + "outputs": [], + "source": [ + "# Download ImageNet labels\n", + "!wget https://raw.githubusercontent.com/pytorch/hub/master/imagenet_classes.txt" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8b59152d", + "metadata": {}, + "outputs": [], + "source": [ + "# Read the categories\n", + "with open(\"imagenet_classes.txt\", \"r\") as f:\n", + " categories = [s.strip() for s in f.readlines()]\n", + "# Show top categories per image\n", + "top5_prob, top5_catid = torch.topk(probabilities, 5)\n", + "for i in range(top5_prob.size(0)):\n", + " print(categories[top5_catid[i]], top5_prob[i].item())" + ] + }, + { + "cell_type": "markdown", + "id": "1d889cf0", + "metadata": {}, + "source": [ + "### Model Description\n", + "\n", + "The GhostNet architecture is based on a Ghost module structure which generates more features from cheap operations. Based on a set of intrinsic feature maps, a series of cheap operations are applied to generate many ghost feature maps that could fully reveal information underlying intrinsic features. 
Experiments conducted on benchmarks demonstrate the superiority of GhostNet in terms of the speed and accuracy tradeoff.\n", + "\n", + "The corresponding accuracy on ImageNet dataset with pretrained model is listed below.\n", + "\n", + "| Model structure | FLOPs | Top-1 acc | Top-5 acc |\n", + "| --------------- | ----------- | ----------- | ----------- |\n", + "| GhostNet 1.0x | 142M | 73.98 | 91.46 |\n", + "\n", + "\n", + "### References\n", + "\n", + "You can read the full paper at this [link](https://arxiv.org/abs/1911.11907).\n", + "\n", + ">@inproceedings{han2019ghostnet,\n", + "> title={GhostNet: More Features from Cheap Operations},\n", + "> author={Kai Han and Yunhe Wang and Qi Tian and Jianyuan Guo and Chunjing Xu and Chang Xu},\n", + "> booktitle={CVPR},\n", + "> year={2020},\n", + ">}" + ] + } + ], + "metadata": {}, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/assets/hub/pytorch_vision_googlenet.ipynb b/assets/hub/pytorch_vision_googlenet.ipynb index 0bc6484dec47..a17e3e2097ae 100644 --- a/assets/hub/pytorch_vision_googlenet.ipynb +++ b/assets/hub/pytorch_vision_googlenet.ipynb @@ -2,6 +2,7 @@ "cells": [ { "cell_type": "markdown", + "id": "3de6cad3", "metadata": {}, "source": [ "### This notebook is optionally accelerated with a GPU runtime.\n", @@ -23,16 +24,18 @@ { "cell_type": "code", "execution_count": null, + "id": "9f47a584", "metadata": {}, "outputs": [], "source": [ "import torch\n", - "model = torch.hub.load('pytorch/vision', 'googlenet', pretrained=True)\n", + "model = torch.hub.load('pytorch/vision:v0.10.0', 'googlenet', pretrained=True)\n", "model.eval()" ] }, { "cell_type": "markdown", + "id": "535bcc73", "metadata": {}, "source": [ "All pre-trained models expect input images normalized in the same way,\n", @@ -46,12 +49,13 @@ { "cell_type": "code", "execution_count": null, + "id": "fb150def", "metadata": {}, "outputs": [], "source": [ "# Download an example image from the pytorch website\n", "import urllib\n", - "url, filename = (\"https://github.com/pytorch/hub/raw/master/dog.jpg\", \"dog.jpg\")\n", + "url, filename = (\"https://github.com/pytorch/hub/raw/master/images/dog.jpg\", \"dog.jpg\")\n", "try: urllib.URLopener().retrieve(url, filename)\n", "except: urllib.request.urlretrieve(url, filename)" ] @@ -59,6 +63,7 @@ { "cell_type": "code", "execution_count": null, + "id": "9b6cbf35", "metadata": {}, "outputs": [], "source": [ @@ -82,14 +87,43 @@ "\n", "with torch.no_grad():\n", " output = model(input_batch)\n", - "# Tensor of shape 1000, with confidence scores over Imagenet's 1000 classes\n", + "# Tensor of shape 1000, with confidence scores over ImageNet's 1000 classes\n", "print(output[0])\n", "# The output has unnormalized scores. 
To get probabilities, you can run a softmax on it.\n", - "print(torch.nn.functional.softmax(output[0], dim=0))\n" + "probabilities = torch.nn.functional.softmax(output[0], dim=0)\n", + "print(probabilities)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "59b9161f", + "metadata": {}, + "outputs": [], + "source": [ + "# Download ImageNet labels\n", + "!wget https://raw.githubusercontent.com/pytorch/hub/master/imagenet_classes.txt" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "19a93651", + "metadata": {}, + "outputs": [], + "source": [ + "# Read the categories\n", + "with open(\"imagenet_classes.txt\", \"r\") as f:\n", + " categories = [s.strip() for s in f.readlines()]\n", + "# Show top categories per image\n", + "top5_prob, top5_catid = torch.topk(probabilities, 5)\n", + "for i in range(top5_prob.size(0)):\n", + " print(categories[top5_catid[i]], top5_prob[i].item())" ] }, { "cell_type": "markdown", + "id": "84c98908", "metadata": {}, "source": [ "### Model Description\n", @@ -110,5 +144,5 @@ ], "metadata": {}, "nbformat": 4, - "nbformat_minor": 2 + "nbformat_minor": 5 } diff --git a/assets/hub/pytorch_vision_hardnet.ipynb b/assets/hub/pytorch_vision_hardnet.ipynb index 7c83db46f2ed..f362326c17e2 100644 --- a/assets/hub/pytorch_vision_hardnet.ipynb +++ b/assets/hub/pytorch_vision_hardnet.ipynb @@ -2,6 +2,7 @@ "cells": [ { "cell_type": "markdown", + "id": "7b71e157", "metadata": {}, "source": [ "### This notebook is optionally accelerated with a GPU runtime.\n", @@ -23,6 +24,7 @@ { "cell_type": "code", "execution_count": null, + "id": "c3be2b63", "metadata": {}, "outputs": [], "source": [ @@ -37,6 +39,7 @@ }, { "cell_type": "markdown", + "id": "781d2cd7", "metadata": {}, "source": [ "All pre-trained models expect input images normalized in the same way,\n", @@ -50,12 +53,13 @@ { "cell_type": "code", "execution_count": null, + "id": "7e95526f", "metadata": {}, "outputs": [], "source": [ "# Download an example image from the pytorch website\n", "import urllib\n", - "url, filename = (\"https://github.com/pytorch/hub/raw/master/dog.jpg\", \"dog.jpg\")\n", + "url, filename = (\"https://github.com/pytorch/hub/raw/master/images/dog.jpg\", \"dog.jpg\")\n", "try: urllib.URLopener().retrieve(url, filename)\n", "except: urllib.request.urlretrieve(url, filename)" ] @@ -63,6 +67,7 @@ { "cell_type": "code", "execution_count": null, + "id": "1e7c2c3e", "metadata": {}, "outputs": [], "source": [ @@ -86,14 +91,43 @@ "\n", "with torch.no_grad():\n", " output = model(input_batch)\n", - "# Tensor of shape 1000, with confidence scores over Imagenet's 1000 classes\n", + "# Tensor of shape 1000, with confidence scores over ImageNet's 1000 classes\n", "print(output[0])\n", "# The output has unnormalized scores. 
To get probabilities, you can run a softmax on it.\n", - "print(torch.nn.functional.softmax(output[0], dim=0))\n" + "probabilities = torch.nn.functional.softmax(output[0], dim=0)\n", + "print(probabilities)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a40b533b", + "metadata": {}, + "outputs": [], + "source": [ + "# Download ImageNet labels\n", + "!wget https://raw.githubusercontent.com/pytorch/hub/master/imagenet_classes.txt" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cfeff952", + "metadata": {}, + "outputs": [], + "source": [ + "# Read the categories\n", + "with open(\"imagenet_classes.txt\", \"r\") as f:\n", + " categories = [s.strip() for s in f.readlines()]\n", + "# Show top categories per image\n", + "top5_prob, top5_catid = torch.topk(probabilities, 5)\n", + "for i in range(top5_prob.size(0)):\n", + " print(categories[top5_catid[i]], top5_prob[i].item())" ] }, { "cell_type": "markdown", + "id": "b85e734e", "metadata": {}, "source": [ "### Model Description\n", @@ -106,7 +140,7 @@ "\n", "Here we have the 4 versions of hardnet models, which contains 39, 68, 85 layers\n", "w/ or w/o Depthwise Separable Conv respectively.\n", - "Their 1-crop error rates on imagenet dataset with pretrained models are listed below.\n", + "Their 1-crop error rates on ImageNet dataset with pretrained models are listed below.\n", "\n", "| Model structure | Top-1 error | Top-5 error |\n", "| --------------- | ----------- | ----------- |\n", @@ -123,5 +157,5 @@ ], "metadata": {}, "nbformat": 4, - "nbformat_minor": 2 + "nbformat_minor": 5 } diff --git a/assets/hub/pytorch_vision_ibnnet.ipynb b/assets/hub/pytorch_vision_ibnnet.ipynb new file mode 100644 index 000000000000..d5e9bbcc434e --- /dev/null +++ b/assets/hub/pytorch_vision_ibnnet.ipynb @@ -0,0 +1,164 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "981d849a", + "metadata": {}, + "source": [ + "### This notebook is optionally accelerated with a GPU runtime.\n", + "### If you would like to use this acceleration, please select the menu option \"Runtime\" -> \"Change runtime type\", select \"Hardware Accelerator\" -> \"GPU\" and click \"SAVE\"\n", + "\n", + "----------------------------------------------------------------------\n", + "\n", + "# IBN-Net\n", + "\n", + "*Author: Xingang Pan*\n", + "\n", + "**Networks with domain/appearance invariance**\n", + "\n", + "\"alt\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "638d2324", + "metadata": {}, + "outputs": [], + "source": [ + "import torch\n", + "model = torch.hub.load('XingangPan/IBN-Net', 'resnet50_ibn_a', pretrained=True)\n", + "model.eval()" + ] + }, + { + "cell_type": "markdown", + "id": "dd36e6ab", + "metadata": {}, + "source": [ + "All pre-trained models expect input images normalized in the same way,\n", + "i.e. mini-batches of 3-channel RGB images of shape `(3 x H x W)`, where `H` and `W` are expected to be at least `224`.\n", + "The images have to be loaded in to a range of `[0, 1]` and then normalized using `mean = [0.485, 0.456, 0.406]`\n", + "and `std = [0.229, 0.224, 0.225]`.\n", + "\n", + "Here's a sample execution." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "50499219", + "metadata": {}, + "outputs": [], + "source": [ + "# Download an example image from the pytorch website\n", + "import urllib\n", + "url, filename = (\"https://github.com/pytorch/hub/raw/master/images/dog.jpg\", \"dog.jpg\")\n", + "try: urllib.URLopener().retrieve(url, filename)\n", + "except: urllib.request.urlretrieve(url, filename)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "81296a41", + "metadata": {}, + "outputs": [], + "source": [ + "# sample execution (requires torchvision)\n", + "from PIL import Image\n", + "from torchvision import transforms\n", + "input_image = Image.open(filename)\n", + "preprocess = transforms.Compose([\n", + " transforms.Resize(256),\n", + " transforms.CenterCrop(224),\n", + " transforms.ToTensor(),\n", + " transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),\n", + "])\n", + "input_tensor = preprocess(input_image)\n", + "input_batch = input_tensor.unsqueeze(0) # create a mini-batch as expected by the model\n", + "\n", + "# move the input and model to GPU for speed if available\n", + "if torch.cuda.is_available():\n", + " input_batch = input_batch.to('cuda')\n", + " model.to('cuda')\n", + "\n", + "with torch.no_grad():\n", + " output = model(input_batch)\n", + "# Tensor of shape 1000, with confidence scores over ImageNet's 1000 classes\n", + "print(output[0])\n", + "# The output has unnormalized scores. To get probabilities, you can run a softmax on it.\n", + "probabilities = torch.nn.functional.softmax(output[0], dim=0)\n", + "print(probabilities)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b8f1a0cf", + "metadata": {}, + "outputs": [], + "source": [ + "# Download ImageNet labels\n", + "!wget https://raw.githubusercontent.com/pytorch/hub/master/imagenet_classes.txt" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "805ecb76", + "metadata": {}, + "outputs": [], + "source": [ + "# Read the categories\n", + "with open(\"imagenet_classes.txt\", \"r\") as f:\n", + " categories = [s.strip() for s in f.readlines()]\n", + "# Show top categories per image\n", + "top5_prob, top5_catid = torch.topk(probabilities, 5)\n", + "for i in range(top5_prob.size(0)):\n", + " print(categories[top5_catid[i]], top5_prob[i].item())" + ] + }, + { + "cell_type": "markdown", + "id": "85dcd9b3", + "metadata": {}, + "source": [ + "### Model Description\n", + "\n", + "IBN-Net is a CNN model with domain/appearance invariance.\n", + "Motivated by style transfer works, IBN-Net carefully unifies instance normalization and batch normalization in a single deep network.\n", + "It provides a simple way to increase both modeling and generalization capacities without adding model complexity.\n", + "IBN-Net is especially suitable for cross domain or person/vehicle re-identification tasks.\n", + "\n", + "The corresponding accuracies on ImageNet dataset with pretrained models are listed below.\n", + "\n", + "| Model name | Top-1 acc | Top-5 acc |\n", + "| --------------- | ----------- | ----------- |\n", + "| resnet50_ibn_a | 77.46 | 93.68 |\n", + "| resnet101_ibn_a | 78.61 | 94.41 |\n", + "| resnext101_ibn_a | 79.12 | 94.58 |\n", + "| se_resnet101_ibn_a | 78.75 | 94.49 |\n", + "\n", + "The rank1/mAP on two Re-ID benchmarks Market1501 and DukeMTMC-reID are listed below (from [michuanhaohao/reid-strong-baseline](https://github.com/michuanhaohao/reid-strong-baseline)).\n", + "\n", + "| Backbone | Market1501 | DukeMTMC-reID 
|\n", + "| --- | -- | -- |\n", + "| ResNet50 | 94.5 (85.9) | 86.4 (76.4) |\n", + "| ResNet101 | 94.5 (87.1) | 87.6 (77.6) |\n", + "| SeResNet50 | 94.4 (86.3) | 86.4 (76.5) |\n", + "| SeResNet101 | 94.6 (87.3) | 87.5 (78.0) |\n", + "| SeResNeXt50 | 94.9 (87.6) | 88.0 (78.3) |\n", + "| SeResNeXt101 | 95.0 (88.0) | 88.4 (79.0) |\n", + "| ResNet50-IBN-a | 95.0 (88.2) | 90.1 (79.1) |\n", + "\n", + "### References\n", + "\n", + " - [Two at Once: Enhancing Learning and Generalization Capacities via IBN-Net](https://arxiv.org/abs/1807.09441)" + ] + } + ], + "metadata": {}, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/assets/hub/pytorch_vision_inception_v3.ipynb b/assets/hub/pytorch_vision_inception_v3.ipynb index a90c6c0785ce..087a6201fe8f 100644 --- a/assets/hub/pytorch_vision_inception_v3.ipynb +++ b/assets/hub/pytorch_vision_inception_v3.ipynb @@ -2,6 +2,7 @@ "cells": [ { "cell_type": "markdown", + "id": "6a0633dd", "metadata": {}, "source": [ "### This notebook is optionally accelerated with a GPU runtime.\n", @@ -13,7 +14,7 @@ "\n", "*Author: Pytorch Team*\n", "\n", - "**Also called GoogleNetv3, a famous ConvNet trained on Imagenet from 2015**\n", + "**Also called GoogleNetv3, a famous ConvNet trained on ImageNet from 2015**\n", "\n", "\"alt\"" ] @@ -21,16 +22,18 @@ { "cell_type": "code", "execution_count": null, + "id": "b593a71f", "metadata": {}, "outputs": [], "source": [ "import torch\n", - "model = torch.hub.load('pytorch/vision', 'inception_v3', pretrained=True)\n", + "model = torch.hub.load('pytorch/vision:v0.10.0', 'inception_v3', pretrained=True)\n", "model.eval()" ] }, { "cell_type": "markdown", + "id": "24d6e73e", "metadata": {}, "source": [ "All pre-trained models expect input images normalized in the same way,\n", @@ -44,12 +47,13 @@ { "cell_type": "code", "execution_count": null, + "id": "f1228762", "metadata": {}, "outputs": [], "source": [ "# Download an example image from the pytorch website\n", "import urllib\n", - "url, filename = (\"https://github.com/pytorch/hub/raw/master/dog.jpg\", \"dog.jpg\")\n", + "url, filename = (\"https://github.com/pytorch/hub/raw/master/images/dog.jpg\", \"dog.jpg\")\n", "try: urllib.URLopener().retrieve(url, filename)\n", "except: urllib.request.urlretrieve(url, filename)" ] @@ -57,6 +61,7 @@ { "cell_type": "code", "execution_count": null, + "id": "4c187630", "metadata": {}, "outputs": [], "source": [ @@ -80,21 +85,50 @@ "\n", "with torch.no_grad():\n", " output = model(input_batch)\n", - "# Tensor of shape 1000, with confidence scores over Imagenet's 1000 classes\n", + "# Tensor of shape 1000, with confidence scores over ImageNet's 1000 classes\n", "print(output[0])\n", "# The output has unnormalized scores. 
To get probabilities, you can run a softmax on it.\n", - "print(torch.nn.functional.softmax(output[0], dim=0))\n" + "probabilities = torch.nn.functional.softmax(output[0], dim=0)\n", + "print(probabilities)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f4d1c366", + "metadata": {}, + "outputs": [], + "source": [ + "# Download ImageNet labels\n", + "!wget https://raw.githubusercontent.com/pytorch/hub/master/imagenet_classes.txt" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "adc090b7", + "metadata": {}, + "outputs": [], + "source": [ + "# Read the categories\n", + "with open(\"imagenet_classes.txt\", \"r\") as f:\n", + " categories = [s.strip() for s in f.readlines()]\n", + "# Show top categories per image\n", + "top5_prob, top5_catid = torch.topk(probabilities, 5)\n", + "for i in range(top5_prob.size(0)):\n", + " print(categories[top5_catid[i]], top5_prob[i].item())" ] }, { "cell_type": "markdown", + "id": "e0199fa7", "metadata": {}, "source": [ "### Model Description\n", "\n", "Inception v3: Based on the exploration of ways to scale up networks in ways that aim at utilizing the added computation as efficiently as possible by suitably factorized convolutions and aggressive regularization. We benchmark our methods on the ILSVRC 2012 classification challenge validation set demonstrate substantial gains over the state of the art: 21.2% top-1 and 5.6% top-5 error for single frame evaluation using a network with a computational cost of 5 billion multiply-adds per inference and with using less than 25 million parameters. With an ensemble of 4 models and multi-crop evaluation, we report 3.5% top-5 error on the validation set (3.6% error on the test set) and 17.3% top-1 error on the validation set.\n", "\n", - "The 1-crop error rates on the imagenet dataset with the pretrained model are listed below.\n", + "The 1-crop error rates on the ImageNet dataset with the pretrained model are listed below.\n", "\n", "| Model structure | Top-1 error | Top-5 error |\n", "| --------------- | ----------- | ----------- |\n", @@ -108,5 +142,5 @@ ], "metadata": {}, "nbformat": 4, - "nbformat_minor": 2 + "nbformat_minor": 5 } diff --git a/assets/hub/pytorch_vision_meal_v2.ipynb b/assets/hub/pytorch_vision_meal_v2.ipynb new file mode 100644 index 000000000000..bea099f0d0cb --- /dev/null +++ b/assets/hub/pytorch_vision_meal_v2.ipynb @@ -0,0 +1,199 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "87b6d7ba", + "metadata": {}, + "source": [ + "### This notebook requires a GPU runtime to run.\n", + "### Please select the menu option \"Runtime\" -> \"Change runtime type\", select \"Hardware Accelerator\" -> \"GPU\" and click \"SAVE\"\n", + "\n", + "----------------------------------------------------------------------\n", + "\n", + "# MEAL_V2\n", + "\n", + "*Author: Carnegie Mellon University*\n", + "\n", + "**Boosting Tiny and Efficient Models using Knowledge Distillation.**\n", + "\n", + "_ | _\n", + "- | -\n", + "![alt](https://pytorch.org/assets/images/MEALV2_method.png) | ![alt](https://pytorch.org/assets/images/MEALV2_results.png)\n", + "\n", + "\n", + "We require one additional Python dependency" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "041ba368", + "metadata": {}, + "outputs": [], + "source": [ + "%%bash\n", + "!pip install timm" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d29f16dc", + "metadata": {}, + "outputs": [], + "source": [ + "import torch\n", + "# list of models: 'mealv1_resnest50', 
'mealv2_resnest50', 'mealv2_resnest50_cutmix', 'mealv2_resnest50_380x380', 'mealv2_mobilenetv3_small_075', 'mealv2_mobilenetv3_small_100', 'mealv2_mobilenet_v3_large_100', 'mealv2_efficientnet_b0'\n", + "# load pretrained models, using \"mealv2_resnest50_cutmix\" as an example\n", + "model = torch.hub.load('szq0214/MEAL-V2','meal_v2', 'mealv2_resnest50_cutmix', pretrained=True)\n", + "model.eval()" + ] + }, + { + "cell_type": "markdown", + "id": "99c27a3e", + "metadata": {}, + "source": [ + "All pre-trained models expect input images normalized in the same way,\n", + "i.e. mini-batches of 3-channel RGB images of shape `(3 x H x W)`, where `H` and `W` are expected to be at least `224`.\n", + "The images have to be loaded in to a range of `[0, 1]` and then normalized using `mean = [0.485, 0.456, 0.406]`\n", + "and `std = [0.229, 0.224, 0.225]`.\n", + "\n", + "Here's a sample execution." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5aa4fa53", + "metadata": {}, + "outputs": [], + "source": [ + "# Download an example image from the pytorch website\n", + "import urllib\n", + "url, filename = (\"https://github.com/pytorch/hub/raw/master/images/dog.jpg\", \"dog.jpg\")\n", + "try: urllib.URLopener().retrieve(url, filename)\n", + "except: urllib.request.urlretrieve(url, filename)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c9f18274", + "metadata": {}, + "outputs": [], + "source": [ + "# sample execution (requires torchvision)\n", + "from PIL import Image\n", + "from torchvision import transforms\n", + "input_image = Image.open(filename)\n", + "preprocess = transforms.Compose([\n", + " transforms.Resize(256),\n", + " transforms.CenterCrop(224),\n", + " transforms.ToTensor(),\n", + " transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),\n", + "])\n", + "input_tensor = preprocess(input_image)\n", + "input_batch = input_tensor.unsqueeze(0) # create a mini-batch as expected by the model\n", + "\n", + "# move the input and model to GPU for speed if available\n", + "if torch.cuda.is_available():\n", + " input_batch = input_batch.to('cuda')\n", + " model.to('cuda')\n", + "\n", + "with torch.no_grad():\n", + " output = model(input_batch)\n", + "# Tensor of shape 1000, with confidence scores over ImageNet's 1000 classes\n", + "print(output[0])\n", + "# The output has unnormalized scores. 
To get probabilities, you can run a softmax on it.\n", + "probabilities = torch.nn.functional.softmax(output[0], dim=0)\n", + "print(probabilities)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2ce4b3fd", + "metadata": {}, + "outputs": [], + "source": [ + "# Download ImageNet labels\n", + "!wget https://raw.githubusercontent.com/pytorch/hub/master/imagenet_classes.txt" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c392ed05", + "metadata": {}, + "outputs": [], + "source": [ + "# Read the categories\n", + "with open(\"imagenet_classes.txt\", \"r\") as f:\n", + " categories = [s.strip() for s in f.readlines()]\n", + "# Show top categories per image\n", + "top5_prob, top5_catid = torch.topk(probabilities, 5)\n", + "for i in range(top5_prob.size(0)):\n", + " print(categories[top5_catid[i]], top5_prob[i].item())" + ] + }, + { + "cell_type": "markdown", + "id": "2de17ed5", + "metadata": {}, + "source": [ + "### Model Description\n", + "\n", + "MEAL V2 models are from the [MEAL V2: Boosting Vanilla ResNet-50 to 80%+ Top-1 Accuracy on ImageNet without Tricks](https://arxiv.org/pdf/2009.08453.pdf) paper.\n", + "\n", + "In this paper, we introduce a simple yet effective approach that can boost the vanilla ResNet-50 to 80%+ Top-1 accuracy on ImageNet without any tricks. Generally, our method is based on the recently proposed [MEAL](https://arxiv.org/abs/1812.02425), i.e., ensemble knowledge distillation via discriminators. We further simplify it through 1) adopting the similarity loss and discriminator only on the final outputs and 2) using the average of softmax probabilities from all teacher ensembles as the stronger supervision for distillation. One crucial perspective of our method is that the one-hot/hard label should not be used in the distillation process. We show that such a simple framework can achieve state-of-the-art results without involving any commonly-used tricks, such as 1) architecture modification; 2) outside training data beyond ImageNet; 3) autoaug/randaug; 4) cosine learning rate; 5) mixup/cutmix training; 6) label smoothing; etc.\n", + "\n", + "| Models | Resolution| #Parameters | Top-1/Top-5 |\n", + "| :---: | :-: | :-: | :------:| :------: | \n", + "| [MEAL-V1 w/ ResNet50](https://arxiv.org/abs/1812.02425) | 224 | 25.6M |**78.21/94.01** | [GitHub](https://github.com/AaronHeee/MEAL#imagenet-model) |\n", + "| MEAL-V2 w/ ResNet50 | 224 | 25.6M | **80.67/95.09** | \n", + "| MEAL-V2 w/ ResNet50| 380 | 25.6M | **81.72/95.81** | \n", + "| MEAL-V2 + CutMix w/ ResNet50| 224 | 25.6M | **80.98/95.35** | \n", + "| MEAL-V2 w/ MobileNet V3-Small 0.75| 224 | 2.04M | **67.60/87.23** | \n", + "| MEAL-V2 w/ MobileNet V3-Small 1.0| 224 | 2.54M | **69.65/88.71** | \n", + "| MEAL-V2 w/ MobileNet V3-Large 1.0 | 224 | 5.48M | **76.92/93.32** | \n", + "| MEAL-V2 w/ EfficientNet-B0| 224 | 5.29M | **78.29/93.95** | \n", + "\n", + "### References\n", + "\n", + "Please refer to our papers [MEAL V2](https://arxiv.org/pdf/2009.08453.pdf), [MEAL](https://arxiv.org/pdf/1812.02425.pdf) for more details." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1966f9f3", + "metadata": {}, + "outputs": [], + "source": [ + "@article{shen2020mealv2,\n", + " title={MEAL V2: Boosting Vanilla ResNet-50 to 80%+ Top-1 Accuracy on ImageNet without Tricks},\n", + " author={Shen, Zhiqiang and Savvides, Marios},\n", + " journal={arXiv preprint arXiv:2009.08453},\n", + " year={2020}\n", + "}" + ] + }, + { + "cell_type": "markdown", + "id": "4fed91a2", + "metadata": {}, + "source": [ + "@inproceedings{shen2019MEAL,\n", + "\t\ttitle = {MEAL: Multi-Model Ensemble via Adversarial Learning},\n", + "\t\tauthor = {Shen, Zhiqiang and He, Zhankui and Xue, Xiangyang},\n", + "\t\tbooktitle = {AAAI},\n", + "\t\tyear = {2019}\n", + "\t}" + ] + } + ], + "metadata": {}, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/assets/hub/pytorch_vision_mobilenet_v2.ipynb b/assets/hub/pytorch_vision_mobilenet_v2.ipynb index 977b2835883a..b33561619ff5 100644 --- a/assets/hub/pytorch_vision_mobilenet_v2.ipynb +++ b/assets/hub/pytorch_vision_mobilenet_v2.ipynb @@ -2,6 +2,7 @@ "cells": [ { "cell_type": "markdown", + "id": "eb5333f8", "metadata": {}, "source": [ "### This notebook is optionally accelerated with a GPU runtime.\n", @@ -23,16 +24,18 @@ { "cell_type": "code", "execution_count": null, + "id": "a9ec286f", "metadata": {}, "outputs": [], "source": [ "import torch\n", - "model = torch.hub.load('pytorch/vision', 'mobilenet_v2', pretrained=True)\n", + "model = torch.hub.load('pytorch/vision:v0.10.0', 'mobilenet_v2', pretrained=True)\n", "model.eval()" ] }, { "cell_type": "markdown", + "id": "efb840d6", "metadata": {}, "source": [ "All pre-trained models expect input images normalized in the same way,\n", @@ -46,12 +49,13 @@ { "cell_type": "code", "execution_count": null, + "id": "82571048", "metadata": {}, "outputs": [], "source": [ "# Download an example image from the pytorch website\n", "import urllib\n", - "url, filename = (\"https://github.com/pytorch/hub/raw/master/dog.jpg\", \"dog.jpg\")\n", + "url, filename = (\"https://github.com/pytorch/hub/raw/master/images/dog.jpg\", \"dog.jpg\")\n", "try: urllib.URLopener().retrieve(url, filename)\n", "except: urllib.request.urlretrieve(url, filename)" ] @@ -59,6 +63,7 @@ { "cell_type": "code", "execution_count": null, + "id": "28f02763", "metadata": {}, "outputs": [], "source": [ @@ -82,14 +87,43 @@ "\n", "with torch.no_grad():\n", " output = model(input_batch)\n", - "# Tensor of shape 1000, with confidence scores over Imagenet's 1000 classes\n", + "# Tensor of shape 1000, with confidence scores over ImageNet's 1000 classes\n", "print(output[0])\n", "# The output has unnormalized scores. 
To get probabilities, you can run a softmax on it.\n", - "print(torch.nn.functional.softmax(output[0], dim=0))\n" + "probabilities = torch.nn.functional.softmax(output[0], dim=0)\n", + "print(probabilities)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6380a714", + "metadata": {}, + "outputs": [], + "source": [ + "# Download ImageNet labels\n", + "!wget https://raw.githubusercontent.com/pytorch/hub/master/imagenet_classes.txt" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "92b2e982", + "metadata": {}, + "outputs": [], + "source": [ + "# Read the categories\n", + "with open(\"imagenet_classes.txt\", \"r\") as f:\n", + " categories = [s.strip() for s in f.readlines()]\n", + "# Show top categories per image\n", + "top5_prob, top5_catid = torch.topk(probabilities, 5)\n", + "for i in range(top5_prob.size(0)):\n", + " print(categories[top5_catid[i]], top5_prob[i].item())" ] }, { "cell_type": "markdown", + "id": "805611f2", "metadata": {}, "source": [ "### Model Description\n", @@ -109,5 +143,5 @@ ], "metadata": {}, "nbformat": 4, - "nbformat_minor": 2 + "nbformat_minor": 5 } diff --git a/assets/hub/pytorch_vision_once_for_all.ipynb b/assets/hub/pytorch_vision_once_for_all.ipynb new file mode 100644 index 000000000000..0183543a65a3 --- /dev/null +++ b/assets/hub/pytorch_vision_once_for_all.ipynb @@ -0,0 +1,210 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "a1a4b7f5", + "metadata": {}, + "source": [ + "### This notebook is optionally accelerated with a GPU runtime.\n", + "### If you would like to use this acceleration, please select the menu option \"Runtime\" -> \"Change runtime type\", select \"Hardware Accelerator\" -> \"GPU\" and click \"SAVE\"\n", + "\n", + "----------------------------------------------------------------------\n", + "\n", + "# Once-for-All\n", + "\n", + "*Author: MIT Han Lab*\n", + "\n", + "**Once-for-all (OFA) decouples training and search, and achieves efficient inference across various edge devices and resource constraints.**\n", + "\n", + "\"alt\"\n", + "\n", + "\n", + "\n", + "\n", + "### Get supernet\n", + "\n", + "You can quickly load a supernet as following" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "53f8de30", + "metadata": {}, + "outputs": [], + "source": [ + "import torch\n", + "super_net_name = \"ofa_supernet_mbv3_w10\" \n", + "# other options: \n", + "# ofa_supernet_resnet50 / \n", + "# ofa_supernet_mbv3_w12 / \n", + "# ofa_supernet_proxyless\n", + "\n", + "super_net = torch.hub.load('mit-han-lab/once-for-all', super_net_name, pretrained=True).eval()" + ] + }, + { + "cell_type": "markdown", + "id": "1fd4088d", + "metadata": {}, + "source": [ + "| OFA Network | Design Space | Resolution | Width Multiplier | Depth | Expand Ratio | kernel Size | \n", + "|----------------------|----------|----------|---------|------------|---------|------------|\n", + "| ofa_resnet50 | ResNet50D | 128 - 224 | 0.65, 0.8, 1.0 | 0, 1, 2 | 0.2, 0.25, 0.35 | 3 |\n", + "| ofa_mbv3_d234_e346_k357_w1.0 | MobileNetV3 | 128 - 224 | 1.0 | 2, 3, 4 | 3, 4, 6 | 3, 5, 7 |\n", + "| ofa_mbv3_d234_e346_k357_w1.2 | MobileNetV3 | 160 - 224 | 1.2 | 2, 3, 4 | 3, 4, 6 | 3, 5, 7 |\n", + "| ofa_proxyless_d234_e346_k357_w1.3 | ProxylessNAS | 128 - 224 | 1.3 | 2, 3, 4 | 3, 4, 6 | 3, 5, 7 |\n", + "\n", + "\n", + "Below are the usage of sampling / selecting a subnet from the supernet" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6b33c44d", + "metadata": {}, + "outputs": [], + "source": [ + "# 
Randomly sample sub-networks from OFA network\n", + "super_net.sample_active_subnet()\n", + "random_subnet = super_net.get_active_subnet(preserve_weight=True)\n", + " \n", + "# Manually set the sub-network\n", + "super_net.set_active_subnet(ks=7, e=6, d=4)\n", + "manual_subnet = super_net.get_active_subnet(preserve_weight=True)" + ] + }, + { + "cell_type": "markdown", + "id": "dd512c03", + "metadata": {}, + "source": [ + "### Get Specialized Architecture" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e1d56c24", + "metadata": {}, + "outputs": [], + "source": [ + "import torch\n", + "\n", + "# or load an architecture specialized for a certain platform\n", + "net_config = \"resnet50D_MAC_4_1B\"\n", + "\n", + "specialized_net, image_size = torch.hub.load('mit-han-lab/once-for-all', net_config, pretrained=True)\n", + "specialized_net.eval()" + ] + }, + { + "cell_type": "markdown", + "id": "157a77cd", + "metadata": {}, + "source": [ + "More models and configurations can be found in [once-for-all/model-zoo](https://github.com/mit-han-lab/once-for-all#evaluate-1)\n", + "and obtained through the following scripts" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9985bdd4", + "metadata": {}, + "outputs": [], + "source": [ + "ofa_specialized_get = torch.hub.load('mit-han-lab/once-for-all', \"ofa_specialized_get\")\n", + "model, image_size = ofa_specialized_get(\"flops@595M_top1@80.0_finetune@75\", pretrained=True)\n", + "model.eval()" + ] + }, + { + "cell_type": "markdown", + "id": "b98bdea4", + "metadata": {}, + "source": [ + "The model's prediction can be evaluated by" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2d86ac1d", + "metadata": {}, + "outputs": [], + "source": [ + "# Download an example image from pytorch website\n", + "import urllib\n", + "url, filename = (\"https://github.com/pytorch/hub/raw/master/images/dog.jpg\", \"dog.jpg\")\n", + "try: \n", + " urllib.URLopener().retrieve(url, filename)\n", + "except: \n", + " urllib.request.urlretrieve(url, filename)\n", + "\n", + "\n", + "# sample execution (requires torchvision)\n", + "from PIL import Image\n", + "from torchvision import transforms\n", + "input_image = Image.open(filename)\n", + "preprocess = transforms.Compose([\n", + " transforms.Resize(256),\n", + " transforms.CenterCrop(224),\n", + " transforms.ToTensor(),\n", + " transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),\n", + "])\n", + "input_tensor = preprocess(input_image)\n", + "input_batch = input_tensor.unsqueeze(0) # create a mini-batch as expected by the model\n", + "\n", + "# move the input and model to GPU for speed if available\n", + "if torch.cuda.is_available():\n", + " input_batch = input_batch.to('cuda')\n", + " model.to('cuda')\n", + "\n", + "with torch.no_grad():\n", + " output = model(input_batch)\n", + "# Tensor of shape 1000, with confidence scores over ImageNet's 1000 classes\n", + "print(output[0])\n", + "# The output has unnormalized scores. To get probabilities, you can run a softmax on it.\n", + "probabilities = torch.nn.functional.softmax(output[0], dim=0)\n", + "print(probabilities)\n" + ] + }, + { + "cell_type": "markdown", + "id": "db6dd8fb", + "metadata": {}, + "source": [ + "### Model Description\n", + "Once-for-all models are from [Once for All: Train One Network and Specialize it for Efficient Deployment](https://arxiv.org/abs/1908.09791). 
Conventional approaches either manually design or use neural architecture search (NAS) to find a specialized neural network and train it from scratch for each case, which is computationally prohibitive (causing CO2 emission as much as 5 cars' lifetime) thus unscalable. In this work, we propose to train a once-for-all (OFA) network that supports diverse architectural settings by decoupling training and search. Across diverse edge devices, OFA consistently outperforms state-of-the-art (SOTA) NAS methods (up to 4.0% ImageNet top1 accuracy improvement over MobileNetV3, or same accuracy but 1.5x faster than MobileNetV3, 2.6x faster than EfficientNet w.r.t measured latency) while reducing many orders of magnitude GPU hours and CO2 emission. In particular, OFA achieves a new SOTA 80.0% ImageNet top-1 accuracy under the mobile setting (<600M MACs).\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "### References" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6fa6a8b2", + "metadata": {}, + "outputs": [], + "source": [ + "@inproceedings{\n", + " cai2020once,\n", + " title={Once for All: Train One Network and Specialize it for Efficient Deployment},\n", + " author={Han Cai and Chuang Gan and Tianzhe Wang and Zhekai Zhang and Song Han},\n", + " booktitle={International Conference on Learning Representations},\n", + " year={2020},\n", + " url={https://arxiv.org/pdf/1908.09791.pdf}\n", + "}" + ] + } + ], + "metadata": {}, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/assets/hub/pytorch_vision_proxylessnas.ipynb b/assets/hub/pytorch_vision_proxylessnas.ipynb index d768796b6f4a..0bf04652b2a2 100644 --- a/assets/hub/pytorch_vision_proxylessnas.ipynb +++ b/assets/hub/pytorch_vision_proxylessnas.ipynb @@ -2,6 +2,7 @@ "cells": [ { "cell_type": "markdown", + "id": "68fb6d6f", "metadata": {}, "source": [ "### This notebook is optionally accelerated with a GPU runtime.\n", @@ -21,6 +22,7 @@ { "cell_type": "code", "execution_count": null, + "id": "c2515655", "metadata": {}, "outputs": [], "source": [ @@ -33,6 +35,7 @@ }, { "cell_type": "markdown", + "id": "57e3d1a0", "metadata": {}, "source": [ "All pre-trained models expect input images normalized in the same way,\n", @@ -46,12 +49,13 @@ { "cell_type": "code", "execution_count": null, + "id": "1366edb4", "metadata": {}, "outputs": [], "source": [ "# Download an example image from the pytorch website\n", "import urllib\n", - "url, filename = (\"https://github.com/pytorch/hub/raw/master/dog.jpg\", \"dog.jpg\")\n", + "url, filename = (\"https://github.com/pytorch/hub/raw/master/images/dog.jpg\", \"dog.jpg\")\n", "try: urllib.URLopener().retrieve(url, filename)\n", "except: urllib.request.urlretrieve(url, filename)" ] @@ -59,6 +63,7 @@ { "cell_type": "code", "execution_count": null, + "id": "55b37d3a", "metadata": {}, "outputs": [], "source": [ @@ -82,14 +87,43 @@ "\n", "with torch.no_grad():\n", " output = model(input_batch)\n", - "# Tensor of shape 1000, with confidence scores over Imagenet's 1000 classes\n", + "# Tensor of shape 1000, with confidence scores over ImageNet's 1000 classes\n", "print(output[0])\n", "# The output has unnormalized scores. 
To get probabilities, you can run a softmax on it.\n", - "print(torch.nn.functional.softmax(output[0], dim=0))\n" + "probabilities = torch.nn.functional.softmax(output[0], dim=0)\n", + "print(probabilities)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a8503aa7", + "metadata": {}, + "outputs": [], + "source": [ + "# Download ImageNet labels\n", + "!wget https://raw.githubusercontent.com/pytorch/hub/master/imagenet_classes.txt" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bec59ca2", + "metadata": {}, + "outputs": [], + "source": [ + "# Read the categories\n", + "with open(\"imagenet_classes.txt\", \"r\") as f:\n", + " categories = [s.strip() for s in f.readlines()]\n", + "# Show top categories per image\n", + "top5_prob, top5_catid = torch.topk(probabilities, 5)\n", + "for i in range(top5_prob.size(0)):\n", + " print(categories[top5_catid[i]], top5_prob[i].item())" ] }, { "cell_type": "markdown", + "id": "be80f865", "metadata": {}, "source": [ "### Model Description\n", @@ -99,16 +133,16 @@ "Conventionally, people tend to design *one efficient model* for *all hardware platforms*. But different hardware has different properties, for example, CPU has higher frequency and GPU is better at parallization. Therefore, instead of generalizing, we need to **specialize** CNN architectures for different hardware platforms. As shown in below, with similar accuracy, specialization offers free yet significant performance boost on all three platforms.\n", "\n", "| Model structure | GPU Latency | CPU Latency | Mobile Latency\n", - "| --------------- | ----------- | ----------- | ----------- | \n", + "| --------------- | ----------- | ----------- | ----------- |\n", "| proxylessnas_gpu | **5.1ms** | 204.9ms | 124ms |\n", - "| proxylessnas_cpu | 7.4ms | **138.7ms** | 116ms | \n", + "| proxylessnas_cpu | 7.4ms | **138.7ms** | 116ms |\n", "| proxylessnas_mobile | 7.2ms | 164.1ms | **78ms** |\n", "\n", "The corresponding top-1 accuracy with pretrained models are listed below.\n", "\n", "| Model structure | Top-1 error |\n", - "| --------------- | ----------- | \n", - "| proxylessnas_cpu | 24.7 | \n", + "| --------------- | ----------- |\n", + "| proxylessnas_cpu | 24.7 |\n", "| proxylessnas_gpu | 24.9 |\n", "| proxylessnas_mobile | 25.4 |\n", "| proxylessnas_mobile_14 | 23.3 |\n", @@ -121,5 +155,5 @@ ], "metadata": {}, "nbformat": 4, - "nbformat_minor": 2 + "nbformat_minor": 5 } diff --git a/assets/hub/pytorch_vision_resnest.ipynb b/assets/hub/pytorch_vision_resnest.ipynb new file mode 100644 index 000000000000..d7641840d1f9 --- /dev/null +++ b/assets/hub/pytorch_vision_resnest.ipynb @@ -0,0 +1,152 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "8521b666", + "metadata": {}, + "source": [ + "### This notebook is optionally accelerated with a GPU runtime.\n", + "### If you would like to use this acceleration, please select the menu option \"Runtime\" -> \"Change runtime type\", select \"Hardware Accelerator\" -> \"GPU\" and click \"SAVE\"\n", + "\n", + "----------------------------------------------------------------------\n", + "\n", + "# ResNeSt\n", + "\n", + "*Author: Hang Zhang*\n", + "\n", + "**A new ResNet variant.**\n", + "\n", + "\"alt\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "68bf59b7", + "metadata": {}, + "outputs": [], + "source": [ + "import torch\n", + "# get list of models\n", + "torch.hub.list('zhanghang1989/ResNeSt', force_reload=True)\n", + "# load pretrained models, using ResNeSt-50 as an 
example\n", + "model = torch.hub.load('zhanghang1989/ResNeSt', 'resnest50', pretrained=True)\n", + "model.eval()" + ] + }, + { + "cell_type": "markdown", + "id": "d8d356c0", + "metadata": {}, + "source": [ + "All pre-trained models expect input images normalized in the same way,\n", + "i.e. mini-batches of 3-channel RGB images of shape `(3 x H x W)`, where `H` and `W` are expected to be at least `224`.\n", + "The images have to be loaded in to a range of `[0, 1]` and then normalized using `mean = [0.485, 0.456, 0.406]`\n", + "and `std = [0.229, 0.224, 0.225]`.\n", + "\n", + "Here's a sample execution." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "196e3c9a", + "metadata": {}, + "outputs": [], + "source": [ + "# Download an example image from the pytorch website\n", + "import urllib\n", + "url, filename = (\"https://github.com/pytorch/hub/raw/master/images/dog.jpg\", \"dog.jpg\")\n", + "try: urllib.URLopener().retrieve(url, filename)\n", + "except: urllib.request.urlretrieve(url, filename)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "96819a60", + "metadata": {}, + "outputs": [], + "source": [ + "# sample execution (requires torchvision)\n", + "from PIL import Image\n", + "from torchvision import transforms\n", + "input_image = Image.open(filename)\n", + "preprocess = transforms.Compose([\n", + " transforms.Resize(256),\n", + " transforms.CenterCrop(224),\n", + " transforms.ToTensor(),\n", + " transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),\n", + "])\n", + "input_tensor = preprocess(input_image)\n", + "input_batch = input_tensor.unsqueeze(0) # create a mini-batch as expected by the model\n", + "\n", + "# move the input and model to GPU for speed if available\n", + "if torch.cuda.is_available():\n", + " input_batch = input_batch.to('cuda')\n", + " model.to('cuda')\n", + "\n", + "with torch.no_grad():\n", + " output = model(input_batch)\n", + "# Tensor of shape 1000, with confidence scores over ImageNet's 1000 classes\n", + "print(output[0])\n", + "# The output has unnormalized scores. To get probabilities, you can run a softmax on it.\n", + "probabilities = torch.nn.functional.softmax(output[0], dim=0)\n", + "print(probabilities)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "57aa5766", + "metadata": {}, + "outputs": [], + "source": [ + "# Download ImageNet labels\n", + "!wget https://raw.githubusercontent.com/pytorch/hub/master/imagenet_classes.txt" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3189a592", + "metadata": {}, + "outputs": [], + "source": [ + "# Read the categories\n", + "with open(\"imagenet_classes.txt\", \"r\") as f:\n", + " categories = [s.strip() for s in f.readlines()]\n", + "# Show top categories per image\n", + "top5_prob, top5_catid = torch.topk(probabilities, 5)\n", + "for i in range(top5_prob.size(0)):\n", + " print(categories[top5_catid[i]], top5_prob[i].item())" + ] + }, + { + "cell_type": "markdown", + "id": "cbaa3b10", + "metadata": {}, + "source": [ + "### Model Description\n", + "\n", + "ResNeSt models are from the [ResNeSt: Split-Attention Networks](https://arxiv.org/pdf/2004.08955.pdf) paper.\n", + "\n", + "While image classification models have recently continued to advance, most downstream applications such as object detection and semantic segmentation still employ ResNet variants as the backbone network due to their simple and modular structure. 
We present a simple and modular Split-Attention block that enables attention across feature-map groups. By stacking these Split-Attention blocks ResNet-style, we obtain a new ResNet variant which we call ResNeSt. Our network preserves the overall ResNet structure to be used in downstream tasks straightforwardly without introducing additional computational costs. ResNeSt models outperform other networks with similar model complexities, and also help downstream tasks including object detection, instance segmentation and semantic segmentation.\n", + "\n", + "| | crop size | PyTorch |\n", + "|-------------|-----------|---------|\n", + "| ResNeSt-50 | 224 | 81.03 |\n", + "| ResNeSt-101 | 256 | 82.83 |\n", + "| ResNeSt-200 | 320 | 83.84 |\n", + "| ResNeSt-269 | 416 | 84.54 |\n", + "\n", + "### References\n", + "\n", + " - [ResNeSt: Split-Attention Networks](https://arxiv.org/abs/2004.08955)." + ] + } + ], + "metadata": {}, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/assets/hub/pytorch_vision_resnet.ipynb b/assets/hub/pytorch_vision_resnet.ipynb index 3c7693bee364..2fda8ca3e66e 100644 --- a/assets/hub/pytorch_vision_resnet.ipynb +++ b/assets/hub/pytorch_vision_resnet.ipynb @@ -2,6 +2,7 @@ "cells": [ { "cell_type": "markdown", + "id": "b74d19a0", "metadata": {}, "source": [ "### This notebook is optionally accelerated with a GPU runtime.\n", @@ -21,21 +22,23 @@ { "cell_type": "code", "execution_count": null, + "id": "7de1aa61", "metadata": {}, "outputs": [], "source": [ "import torch\n", - "model = torch.hub.load('pytorch/vision', 'resnet18', pretrained=True)\n", + "model = torch.hub.load('pytorch/vision:v0.10.0', 'resnet18', pretrained=True)\n", "# or any of these variants\n", - "# model = torch.hub.load('pytorch/vision', 'resnet34', pretrained=True)\n", - "# model = torch.hub.load('pytorch/vision', 'resnet50', pretrained=True)\n", - "# model = torch.hub.load('pytorch/vision', 'resnet101', pretrained=True)\n", - "# model = torch.hub.load('pytorch/vision', 'resnet152', pretrained=True)\n", + "# model = torch.hub.load('pytorch/vision:v0.10.0', 'resnet34', pretrained=True)\n", + "# model = torch.hub.load('pytorch/vision:v0.10.0', 'resnet50', pretrained=True)\n", + "# model = torch.hub.load('pytorch/vision:v0.10.0', 'resnet101', pretrained=True)\n", + "# model = torch.hub.load('pytorch/vision:v0.10.0', 'resnet152', pretrained=True)\n", "model.eval()" ] }, { "cell_type": "markdown", + "id": "3ddf328e", "metadata": {}, "source": [ "All pre-trained models expect input images normalized in the same way,\n", @@ -49,12 +52,13 @@ { "cell_type": "code", "execution_count": null, + "id": "94991788", "metadata": {}, "outputs": [], "source": [ "# Download an example image from the pytorch website\n", "import urllib\n", - "url, filename = (\"https://github.com/pytorch/hub/raw/master/dog.jpg\", \"dog.jpg\")\n", + "url, filename = (\"https://github.com/pytorch/hub/raw/master/images/dog.jpg\", \"dog.jpg\")\n", "try: urllib.URLopener().retrieve(url, filename)\n", "except: urllib.request.urlretrieve(url, filename)" ] @@ -62,6 +66,7 @@ { "cell_type": "code", "execution_count": null, + "id": "86dcba1c", "metadata": {}, "outputs": [], "source": [ @@ -85,14 +90,43 @@ "\n", "with torch.no_grad():\n", " output = model(input_batch)\n", - "# Tensor of shape 1000, with confidence scores over Imagenet's 1000 classes\n", + "# Tensor of shape 1000, with confidence scores over ImageNet's 1000 classes\n", "print(output[0])\n", "# The output has unnormalized scores. 
To get probabilities, you can run a softmax on it.\n", - "print(torch.nn.functional.softmax(output[0], dim=0))\n" + "probabilities = torch.nn.functional.softmax(output[0], dim=0)\n", + "print(probabilities)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ace2a087", + "metadata": {}, + "outputs": [], + "source": [ + "# Download ImageNet labels\n", + "!wget https://raw.githubusercontent.com/pytorch/hub/master/imagenet_classes.txt" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8448d407", + "metadata": {}, + "outputs": [], + "source": [ + "# Read the categories\n", + "with open(\"imagenet_classes.txt\", \"r\") as f:\n", + " categories = [s.strip() for s in f.readlines()]\n", + "# Show top categories per image\n", + "top5_prob, top5_catid = torch.topk(probabilities, 5)\n", + "for i in range(top5_prob.size(0)):\n", + " print(categories[top5_catid[i]], top5_prob[i].item())" ] }, { "cell_type": "markdown", + "id": "70b4e1b9", "metadata": {}, "source": [ "### Model Description\n", @@ -100,7 +134,7 @@ "Resnet models were proposed in \"Deep Residual Learning for Image Recognition\".\n", "Here we have the 5 versions of resnet models, which contains 18, 34, 50, 101, 152 layers respectively.\n", "Detailed model architectures can be found in Table 1.\n", - "Their 1-crop error rates on imagenet dataset with pretrained models are listed below.\n", + "Their 1-crop error rates on ImageNet dataset with pretrained models are listed below.\n", "\n", "| Model structure | Top-1 error | Top-5 error |\n", "| --------------- | ----------- | ----------- |\n", @@ -118,5 +152,5 @@ ], "metadata": {}, "nbformat": 4, - "nbformat_minor": 2 + "nbformat_minor": 5 } diff --git a/assets/hub/pytorch_vision_resnext.ipynb b/assets/hub/pytorch_vision_resnext.ipynb index b3c5f07761a5..eb0f20bea282 100644 --- a/assets/hub/pytorch_vision_resnext.ipynb +++ b/assets/hub/pytorch_vision_resnext.ipynb @@ -2,6 +2,7 @@ "cells": [ { "cell_type": "markdown", + "id": "f2256586", "metadata": {}, "source": [ "### This notebook is optionally accelerated with a GPU runtime.\n", @@ -21,18 +22,20 @@ { "cell_type": "code", "execution_count": null, + "id": "6ee13ed9", "metadata": {}, "outputs": [], "source": [ "import torch\n", - "model = torch.hub.load('pytorch/vision', 'resnext50_32x4d', pretrained=True)\n", + "model = torch.hub.load('pytorch/vision:v0.10.0', 'resnext50_32x4d', pretrained=True)\n", "# or\n", - "# model = torch.hub.load('pytorch/vision', 'resnext101_32x8d', pretrained=True)\n", + "# model = torch.hub.load('pytorch/vision:v0.10.0', 'resnext101_32x8d', pretrained=True)\n", "model.eval()" ] }, { "cell_type": "markdown", + "id": "3d73fa36", "metadata": {}, "source": [ "All pre-trained models expect input images normalized in the same way,\n", @@ -46,12 +49,13 @@ { "cell_type": "code", "execution_count": null, + "id": "fd200719", "metadata": {}, "outputs": [], "source": [ "# Download an example image from the pytorch website\n", "import urllib\n", - "url, filename = (\"https://github.com/pytorch/hub/raw/master/dog.jpg\", \"dog.jpg\")\n", + "url, filename = (\"https://github.com/pytorch/hub/raw/master/images/dog.jpg\", \"dog.jpg\")\n", "try: urllib.URLopener().retrieve(url, filename)\n", "except: urllib.request.urlretrieve(url, filename)" ] @@ -59,6 +63,7 @@ { "cell_type": "code", "execution_count": null, + "id": "7753e3c1", "metadata": {}, "outputs": [], "source": [ @@ -82,14 +87,45 @@ "\n", "with torch.no_grad():\n", " output = model(input_batch)\n", - "# Tensor of shape 1000, with 
confidence scores over Imagenet's 1000 classes\n", + "# Tensor of shape 1000, with confidence scores over ImageNet's 1000 classes\n", "print(output[0])\n", "# The output has unnormalized scores. To get probabilities, you can run a softmax on it.\n", - "print(torch.nn.functional.softmax(output[0], dim=0))\n" + "probabilities = torch.nn.functional.softmax(output[0], dim=0)\n", + "print(probabilities)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0d81ccdf", + "metadata": {}, + "outputs": [], + "source": [ + "# Download ImageNet labels\n", + "!wget https://raw.githubusercontent.com/pytorch/hub/master/imagenet_classes.txt" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a71ac5d9", + "metadata": {}, + "outputs": [], + "source": [ + "# Read the categories\n", + "with open(\"imagenet_classes.txt\", \"r\") as f:\n", + " categories = [s.strip() for s in f.readlines()]\n", + "\n", + "# Show top categories per image\n", + "top5_prob, top5_catid = torch.topk(probabilities, 5)\n", + "\n", + "for i in range(top5_prob.size(0)):\n", + " print(categories[top5_catid[i]], top5_prob[i].item())" ] }, { "cell_type": "markdown", + "id": "f7ddaedf", "metadata": {}, "source": [ "### Model Description\n", @@ -97,7 +133,7 @@ "Resnext models were proposed in [Aggregated Residual Transformations for Deep Neural Networks](https://arxiv.org/abs/1611.05431).\n", "Here we have the 2 versions of resnet models, which contains 50, 101 layers repspectively.\n", "A comparison in model archetechure between resnet50 and resnext50 can be found in Table 1.\n", - "Their 1-crop error rates on imagenet dataset with pretrained models are listed below.\n", + "Their 1-crop error rates on ImageNet dataset with pretrained models are listed below.\n", "\n", "| Model structure | Top-1 error | Top-5 error |\n", "| ----------------- | ----------- | ----------- |\n", @@ -112,5 +148,5 @@ ], "metadata": {}, "nbformat": 4, - "nbformat_minor": 2 + "nbformat_minor": 5 } diff --git a/assets/hub/pytorch_vision_shufflenet_v2.ipynb b/assets/hub/pytorch_vision_shufflenet_v2.ipynb index 0bbdedf93b55..6af2a18507be 100644 --- a/assets/hub/pytorch_vision_shufflenet_v2.ipynb +++ b/assets/hub/pytorch_vision_shufflenet_v2.ipynb @@ -2,6 +2,7 @@ "cells": [ { "cell_type": "markdown", + "id": "ceddae15", "metadata": {}, "source": [ "### This notebook is optionally accelerated with a GPU runtime.\n", @@ -13,7 +14,7 @@ "\n", "*Author: Pytorch Team*\n", "\n", - "**An efficient ConvNet optimized for speed and memory, pre-trained on Imagenet**\n", + "**An efficient ConvNet optimized for speed and memory, pre-trained on ImageNet**\n", "\n", "_ | _\n", "- | -\n", @@ -23,16 +24,18 @@ { "cell_type": "code", "execution_count": null, + "id": "f5e75733", "metadata": {}, "outputs": [], "source": [ "import torch\n", - "model = torch.hub.load('pytorch/vision', 'shufflenet_v2_x1_0', pretrained=True)\n", + "model = torch.hub.load('pytorch/vision:v0.10.0', 'shufflenet_v2_x1_0', pretrained=True)\n", "model.eval()" ] }, { "cell_type": "markdown", + "id": "68f3912c", "metadata": {}, "source": [ "All pre-trained models expect input images normalized in the same way,\n", @@ -46,12 +49,13 @@ { "cell_type": "code", "execution_count": null, + "id": "9b128da6", "metadata": {}, "outputs": [], "source": [ "# Download an example image from the pytorch website\n", "import urllib\n", - "url, filename = (\"https://github.com/pytorch/hub/raw/master/dog.jpg\", \"dog.jpg\")\n", + "url, filename = 
(\"https://github.com/pytorch/hub/raw/master/images/dog.jpg\", \"dog.jpg\")\n", "try: urllib.URLopener().retrieve(url, filename)\n", "except: urllib.request.urlretrieve(url, filename)" ] @@ -59,6 +63,7 @@ { "cell_type": "code", "execution_count": null, + "id": "70d5a956", "metadata": {}, "outputs": [], "source": [ @@ -82,14 +87,43 @@ "\n", "with torch.no_grad():\n", " output = model(input_batch)\n", - "# Tensor of shape 1000, with confidence scores over Imagenet's 1000 classes\n", + "# Tensor of shape 1000, with confidence scores over ImageNet's 1000 classes\n", "print(output[0])\n", "# The output has unnormalized scores. To get probabilities, you can run a softmax on it.\n", - "print(torch.nn.functional.softmax(output[0], dim=0))\n" + "probabilities = torch.nn.functional.softmax(output[0], dim=0)\n", + "print(probabilities)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "416164dc", + "metadata": {}, + "outputs": [], + "source": [ + "# Download ImageNet labels\n", + "!wget https://raw.githubusercontent.com/pytorch/hub/master/imagenet_classes.txt" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6a9adec0", + "metadata": {}, + "outputs": [], + "source": [ + "# Read the categories\n", + "with open(\"imagenet_classes.txt\", \"r\") as f:\n", + " categories = [s.strip() for s in f.readlines()]\n", + "# Show top categories per image\n", + "top5_prob, top5_catid = torch.topk(probabilities, 5)\n", + "for i in range(top5_prob.size(0)):\n", + " print(categories[top5_catid[i]], top5_prob[i].item())" ] }, { "cell_type": "markdown", + "id": "d969a239", "metadata": {}, "source": [ "### Model Description\n", @@ -109,5 +143,5 @@ ], "metadata": {}, "nbformat": 4, - "nbformat_minor": 2 + "nbformat_minor": 5 } diff --git a/assets/hub/pytorch_vision_snnmlp.ipynb b/assets/hub/pytorch_vision_snnmlp.ipynb new file mode 100644 index 000000000000..0b2d3ec2d3bf --- /dev/null +++ b/assets/hub/pytorch_vision_snnmlp.ipynb @@ -0,0 +1,141 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "679d03cf", + "metadata": {}, + "source": [ + "### This notebook is optionally accelerated with a GPU runtime.\n", + "### If you would like to use this acceleration, please select the menu option \"Runtime\" -> \"Change runtime type\", select \"Hardware Accelerator\" -> \"GPU\" and click \"SAVE\"\n", + "\n", + "----------------------------------------------------------------------\n", + "\n", + "# SNNMLP\n", + "\n", + "*Author: Huawei Noah's Ark Lab*\n", + "\n", + "**Brain-inspired Multilayer Perceptron with Spiking Neurons**\n", + "\n", + "\"alt\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e4eb3d83", + "metadata": {}, + "outputs": [], + "source": [ + "import torch\n", + "model = torch.hub.load('huawei-noah/Efficient-AI-Backbones', 'snnmlp_t', pretrained=True)\n", + "# or\n", + "# model = torch.hub.load('huawei-noah/Efficient-AI-Backbones', 'snnmlp_s', pretrained=True)\n", + "# or\n", + "# model = torch.hub.load('huawei-noah/Efficient-AI-Backbones', 'snnmlp_b', pretrained=True)\n", + "model.eval()" + ] + }, + { + "cell_type": "markdown", + "id": "d2ac61fc", + "metadata": {}, + "source": [ + "All pre-trained models expect input images normalized in the same way,\n", + "i.e. 
mini-batches of 3-channel RGB images of shape `(3 x H x W)`, where `H` and `W` are expected to be at least `224`.\n", + "The images have to be loaded in to a range of `[0, 1]` and then normalized using `mean = [0.485, 0.456, 0.406]`\n", + "and `std = [0.229, 0.224, 0.225]`.\n", + "\n", + "Here's a sample execution." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "32db137b", + "metadata": {}, + "outputs": [], + "source": [ + "# Download an example image from the pytorch website\n", + "import urllib\n", + "url, filename = (\"https://github.com/pytorch/hub/raw/master/dog.jpg\", \"dog.jpg\")\n", + "try: urllib.URLopener().retrieve(url, filename)\n", + "except: urllib.request.urlretrieve(url, filename)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "15f2f96a", + "metadata": {}, + "outputs": [], + "source": [ + "# sample execution (requires torchvision)\n", + "from PIL import Image\n", + "from torchvision import transforms\n", + "input_image = Image.open(filename)\n", + "preprocess = transforms.Compose([\n", + " transforms.Resize(256),\n", + " transforms.CenterCrop(224),\n", + " transforms.ToTensor(),\n", + " transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),\n", + "])\n", + "input_tensor = preprocess(input_image)\n", + "input_batch = input_tensor.unsqueeze(0) # create a mini-batch as expected by the model\n", + "\n", + "# move the input and model to GPU for speed if available\n", + "if torch.cuda.is_available():\n", + " input_batch = input_batch.to('cuda')\n", + " model.to('cuda')\n", + "\n", + "with torch.no_grad():\n", + " output = model(input_batch)\n", + "# Tensor of shape 1000, with confidence scores over ImageNet's 1000 classes\n", + "print(output[0])\n", + "# The output has unnormalized scores. To get probabilities, you can run a softmax on it.\n", + "print(torch.nn.functional.softmax(output[0], dim=0))\n" + ] + }, + { + "cell_type": "markdown", + "id": "391bc7b8", + "metadata": {}, + "source": [ + "### Model Description\n", + "\n", + "SNNMLP incorporates the mechanism of LIF neurons into the MLP models, to achieve better accuracy without extra FLOPs. We propose a full-precision LIF operation to communicate between patches, including horizontal LIF and vertical LIF in different directions. We also propose to use group LIF to extract better local features. With LIF modules, our SNNMLP model achieves 81.9%, 83.3% and 83.6% top-1 accuracy on ImageNet dataset with only 4.4G, 8.5G and 15.2G FLOPs, respectively.\n", + "\n", + "The corresponding accuracy on ImageNet dataset with pretrained model is listed below.\n", + "\n", + "| Model structure | #Parameters | FLOPs | Top-1 acc |\n", + "| --------------- | ----------- | ----------- | ----------- |\n", + "| SNNMLP Tiny | 28M | 4.4G | 81.88 |\n", + "| SNNMLP Small | 50M | 8.5G | 83.30 |\n", + "| SNNMLP Base | 88M | 15.2G | 85.59 |\n", + "\n", + "\n", + "### References\n", + "\n", + "You can read the full paper [here](https://arxiv.org/abs/2203.14679)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a3bee9bc", + "metadata": {}, + "outputs": [], + "source": [ + "@inproceedings{li2022brain,\n", + " title={Brain-inspired multilayer perceptron with spiking neurons},\n", + " author={Li, Wenshuo and Chen, Hanting and Guo, Jianyuan and Zhang, Ziyang and Wang, Yunhe},\n", + " booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},\n", + " pages={783--793},\n", + " year={2022}\n", + "}" + ] + } + ], + "metadata": {}, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/assets/hub/pytorch_vision_squeezenet.ipynb b/assets/hub/pytorch_vision_squeezenet.ipynb index ef1215d57d7b..71af8b401bf9 100644 --- a/assets/hub/pytorch_vision_squeezenet.ipynb +++ b/assets/hub/pytorch_vision_squeezenet.ipynb @@ -2,6 +2,7 @@ "cells": [ { "cell_type": "markdown", + "id": "cd4df47c", "metadata": {}, "source": [ "### This notebook is optionally accelerated with a GPU runtime.\n", @@ -21,18 +22,20 @@ { "cell_type": "code", "execution_count": null, + "id": "8b58effa", "metadata": {}, "outputs": [], "source": [ "import torch\n", - "model = torch.hub.load('pytorch/vision', 'squeezenet1_0', pretrained=True)\n", + "model = torch.hub.load('pytorch/vision:v0.10.0', 'squeezenet1_0', pretrained=True)\n", "# or\n", - "# model = torch.hub.load('pytorch/vision', 'squeezenet1_1', pretrained=True)\n", + "# model = torch.hub.load('pytorch/vision:v0.10.0', 'squeezenet1_1', pretrained=True)\n", "model.eval()" ] }, { "cell_type": "markdown", + "id": "fc0fbc27", "metadata": {}, "source": [ "All pre-trained models expect input images normalized in the same way,\n", @@ -46,12 +49,13 @@ { "cell_type": "code", "execution_count": null, + "id": "b8740dd7", "metadata": {}, "outputs": [], "source": [ "# Download an example image from the pytorch website\n", "import urllib\n", - "url, filename = (\"https://github.com/pytorch/hub/raw/master/dog.jpg\", \"dog.jpg\")\n", + "url, filename = (\"https://github.com/pytorch/hub/raw/master/images/dog.jpg\", \"dog.jpg\")\n", "try: urllib.URLopener().retrieve(url, filename)\n", "except: urllib.request.urlretrieve(url, filename)" ] @@ -59,6 +63,7 @@ { "cell_type": "code", "execution_count": null, + "id": "978191be", "metadata": {}, "outputs": [], "source": [ @@ -82,14 +87,43 @@ "\n", "with torch.no_grad():\n", " output = model(input_batch)\n", - "# Tensor of shape 1000, with confidence scores over Imagenet's 1000 classes\n", + "# Tensor of shape 1000, with confidence scores over ImageNet's 1000 classes\n", "print(output[0])\n", "# The output has unnormalized scores. 
To get probabilities, you can run a softmax on it.\n", - "print(torch.nn.functional.softmax(output[0], dim=0))\n" + "probabilities = torch.nn.functional.softmax(output[0], dim=0)\n", + "print(probabilities)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2e475d7e", + "metadata": {}, + "outputs": [], + "source": [ + "# Download ImageNet labels\n", + "!wget https://raw.githubusercontent.com/pytorch/hub/master/imagenet_classes.txt" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d6f18701", + "metadata": {}, + "outputs": [], + "source": [ + "# Read the categories\n", + "with open(\"imagenet_classes.txt\", \"r\") as f:\n", + " categories = [s.strip() for s in f.readlines()]\n", + "# Show top categories per image\n", + "top5_prob, top5_catid = torch.topk(probabilities, 5)\n", + "for i in range(top5_prob.size(0)):\n", + " print(categories[top5_catid[i]], top5_prob[i].item())" ] }, { "cell_type": "markdown", + "id": "066555b8", "metadata": {}, "source": [ "### Model Description\n", @@ -99,7 +133,7 @@ "Model `squeezenet1_1` is from the [official squeezenet repo](https://github.com/DeepScale/SqueezeNet/tree/master/SqueezeNet_v1.1).\n", "It has 2.4x less computation and slightly fewer parameters than `squeezenet1_0`, without sacrificing accuracy.\n", "\n", - "Their 1-crop error rates on imagenet dataset with pretrained models are listed below.\n", + "Their 1-crop error rates on ImageNet dataset with pretrained models are listed below.\n", "\n", "| Model structure | Top-1 error | Top-5 error |\n", "| --------------- | ----------- | ----------- |\n", @@ -114,5 +148,5 @@ ], "metadata": {}, "nbformat": 4, - "nbformat_minor": 2 + "nbformat_minor": 5 } diff --git a/assets/hub/pytorch_vision_vgg.ipynb b/assets/hub/pytorch_vision_vgg.ipynb index 658af86910b9..689966eb07c5 100644 --- a/assets/hub/pytorch_vision_vgg.ipynb +++ b/assets/hub/pytorch_vision_vgg.ipynb @@ -2,6 +2,7 @@ "cells": [ { "cell_type": "markdown", + "id": "48c981c1", "metadata": {}, "source": [ "### This notebook is optionally accelerated with a GPU runtime.\n", @@ -13,7 +14,7 @@ "\n", "*Author: Pytorch Team*\n", "\n", - "**Award winning ConvNets from 2014 Imagenet ILSVRC challenge**\n", + "**Award winning ConvNets from 2014 ImageNet ILSVRC challenge**\n", "\n", "\"alt\"" ] @@ -21,24 +22,26 @@ { "cell_type": "code", "execution_count": null, + "id": "353975ab", "metadata": {}, "outputs": [], "source": [ "import torch\n", - "model = torch.hub.load('pytorch/vision', 'vgg11', pretrained=True)\n", + "model = torch.hub.load('pytorch/vision:v0.10.0', 'vgg11', pretrained=True)\n", "# or any of these variants\n", - "# model = torch.hub.load('pytorch/vision', 'vgg11_bn', pretrained=True)\n", - "# model = torch.hub.load('pytorch/vision', 'vgg13', pretrained=True)\n", - "# model = torch.hub.load('pytorch/vision', 'vgg13_bn', pretrained=True)\n", - "# model = torch.hub.load('pytorch/vision', 'vgg16', pretrained=True)\n", - "# model = torch.hub.load('pytorch/vision', 'vgg16_bn', pretrained=True)\n", - "# model = torch.hub.load('pytorch/vision', 'vgg19', pretrained=True)\n", - "# model = torch.hub.load('pytorch/vision', 'vgg19_bn', pretrained=True)\n", + "# model = torch.hub.load('pytorch/vision:v0.10.0', 'vgg11_bn', pretrained=True)\n", + "# model = torch.hub.load('pytorch/vision:v0.10.0', 'vgg13', pretrained=True)\n", + "# model = torch.hub.load('pytorch/vision:v0.10.0', 'vgg13_bn', pretrained=True)\n", + "# model = torch.hub.load('pytorch/vision:v0.10.0', 'vgg16', pretrained=True)\n", + "# model = 
torch.hub.load('pytorch/vision:v0.10.0', 'vgg16_bn', pretrained=True)\n", + "# model = torch.hub.load('pytorch/vision:v0.10.0', 'vgg19', pretrained=True)\n", + "# model = torch.hub.load('pytorch/vision:v0.10.0', 'vgg19_bn', pretrained=True)\n", "model.eval()" ] }, { "cell_type": "markdown", + "id": "49b59512", "metadata": {}, "source": [ "All pre-trained models expect input images normalized in the same way,\n", @@ -52,12 +55,13 @@ { "cell_type": "code", "execution_count": null, + "id": "30b08430", "metadata": {}, "outputs": [], "source": [ "# Download an example image from the pytorch website\n", "import urllib\n", - "url, filename = (\"https://github.com/pytorch/hub/raw/master/dog.jpg\", \"dog.jpg\")\n", + "url, filename = (\"https://github.com/pytorch/hub/raw/master/images/dog.jpg\", \"dog.jpg\")\n", "try: urllib.URLopener().retrieve(url, filename)\n", "except: urllib.request.urlretrieve(url, filename)" ] @@ -65,6 +69,7 @@ { "cell_type": "code", "execution_count": null, + "id": "f8e70afe", "metadata": {}, "outputs": [], "source": [ @@ -88,25 +93,54 @@ "\n", "with torch.no_grad():\n", " output = model(input_batch)\n", - "# Tensor of shape 1000, with confidence scores over Imagenet's 1000 classes\n", + "# Tensor of shape 1000, with confidence scores over ImageNet's 1000 classes\n", "print(output[0])\n", "# The output has unnormalized scores. To get probabilities, you can run a softmax on it.\n", - "print(torch.nn.functional.softmax(output[0], dim=0))\n" + "probabilities = torch.nn.functional.softmax(output[0], dim=0)\n", + "print(probabilities)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c29c5f9e", + "metadata": {}, + "outputs": [], + "source": [ + "# Download ImageNet labels\n", + "!wget https://raw.githubusercontent.com/pytorch/hub/master/imagenet_classes.txt" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5f0a2573", + "metadata": {}, + "outputs": [], + "source": [ + "# Read the categories\n", + "with open(\"imagenet_classes.txt\", \"r\") as f:\n", + " categories = [s.strip() for s in f.readlines()]\n", + "# Show top categories per image\n", + "top5_prob, top5_catid = torch.topk(probabilities, 5)\n", + "for i in range(top5_prob.size(0)):\n", + " print(categories[top5_catid[i]], top5_prob[i].item())" ] }, { "cell_type": "markdown", + "id": "f02f5387", "metadata": {}, "source": [ "### Model Description\n", "\n", "Here we have implementations for the models proposed in [Very Deep Convolutional Networks for Large-Scale Image Recognition](https://arxiv.org/abs/1409.1556),\n", - "for each configurations and their with bachnorm version.\n", + "for each configurations and their with batchnorm version.\n", "\n", "For example, configuration `A` presented in the paper is `vgg11`, configuration `B` is `vgg13`, configuration `D` is `vgg16`\n", "and configuration `E` is `vgg19`. 
Their batchnorm version are suffixed with `_bn`.\n", "\n", - "Their 1-crop error rates on imagenet dataset with pretrained models are listed below.\n", + "Their Top-1 error rates on ImageNet dataset with pretrained models are listed below.\n", "\n", "| Model structure | Top-1 error | Top-5 error |\n", "| --------------- | ----------- | ----------- |\n", @@ -127,5 +161,5 @@ ], "metadata": {}, "nbformat": 4, - "nbformat_minor": 2 + "nbformat_minor": 5 } diff --git a/assets/hub/pytorch_vision_wide_resnet.ipynb b/assets/hub/pytorch_vision_wide_resnet.ipynb index 4b70eb9fc158..4d81cad7d879 100644 --- a/assets/hub/pytorch_vision_wide_resnet.ipynb +++ b/assets/hub/pytorch_vision_wide_resnet.ipynb @@ -2,6 +2,7 @@ "cells": [ { "cell_type": "markdown", + "id": "a42a2c48", "metadata": {}, "source": [ "### This notebook is optionally accelerated with a GPU runtime.\n", @@ -21,19 +22,21 @@ { "cell_type": "code", "execution_count": null, + "id": "b6367742", "metadata": {}, "outputs": [], "source": [ "import torch\n", "# load WRN-50-2:\n", - "model = torch.hub.load('pytorch/vision', 'wide_resnet50_2', pretrained=True)\n", + "model = torch.hub.load('pytorch/vision:v0.10.0', 'wide_resnet50_2', pretrained=True)\n", "# or WRN-101-2\n", - "model = torch.hub.load('pytorch/vision', 'wide_resnet101_2', pretrained=True)\n", + "model = torch.hub.load('pytorch/vision:v0.10.0', 'wide_resnet101_2', pretrained=True)\n", "model.eval()" ] }, { "cell_type": "markdown", + "id": "758f9e23", "metadata": {}, "source": [ "All pre-trained models expect input images normalized in the same way,\n", @@ -47,12 +50,13 @@ { "cell_type": "code", "execution_count": null, + "id": "faf3a0f5", "metadata": {}, "outputs": [], "source": [ "# Download an example image from the pytorch website\n", "import urllib\n", - "url, filename = (\"https://github.com/pytorch/hub/raw/master/dog.jpg\", \"dog.jpg\")\n", + "url, filename = (\"https://github.com/pytorch/hub/raw/master/images/dog.jpg\", \"dog.jpg\")\n", "try: urllib.URLopener().retrieve(url, filename)\n", "except: urllib.request.urlretrieve(url, filename)" ] @@ -60,6 +64,7 @@ { "cell_type": "code", "execution_count": null, + "id": "dc6a9980", "metadata": {}, "outputs": [], "source": [ @@ -83,14 +88,43 @@ "\n", "with torch.no_grad():\n", " output = model(input_batch)\n", - "# Tensor of shape 1000, with confidence scores over Imagenet's 1000 classes\n", + "# Tensor of shape 1000, with confidence scores over ImageNet's 1000 classes\n", "print(output[0])\n", "# The output has unnormalized scores. 
To get probabilities, you can run a softmax on it.\n", - "print(torch.nn.functional.softmax(output[0], dim=0))\n" + "probabilities = torch.nn.functional.softmax(output[0], dim=0)\n", + "print(probabilities)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "28122f5d", + "metadata": {}, + "outputs": [], + "source": [ + "# Download ImageNet labels\n", + "!wget https://raw.githubusercontent.com/pytorch/hub/master/imagenet_classes.txt" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3d2f6fcb", + "metadata": {}, + "outputs": [], + "source": [ + "# Read the categories\n", + "with open(\"imagenet_classes.txt\", \"r\") as f:\n", + " categories = [s.strip() for s in f.readlines()]\n", + "# Show top categories per image\n", + "top5_prob, top5_catid = torch.topk(probabilities, 5)\n", + "for i in range(top5_prob.size(0)):\n", + " print(categories[top5_catid[i]], top5_prob[i].item())" ] }, { "cell_type": "markdown", + "id": "a9b740da", "metadata": {}, "source": [ "### Model Description\n", @@ -119,5 +153,5 @@ ], "metadata": {}, "nbformat": 4, - "nbformat_minor": 2 + "nbformat_minor": 5 } diff --git a/assets/hub/sigsep_open-unmix-pytorch_umx.ipynb b/assets/hub/sigsep_open-unmix-pytorch_umx.ipynb new file mode 100644 index 000000000000..de8bc7d3b942 --- /dev/null +++ b/assets/hub/sigsep_open-unmix-pytorch_umx.ipynb @@ -0,0 +1,120 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "99fde666", + "metadata": {}, + "source": [ + "### This notebook is optionally accelerated with a GPU runtime.\n", + "### If you would like to use this acceleration, please select the menu option \"Runtime\" -> \"Change runtime type\", select \"Hardware Accelerator\" -> \"GPU\" and click \"SAVE\"\n", + "\n", + "----------------------------------------------------------------------\n", + "\n", + "# Open-Unmix\n", + "\n", + "*Author: Inria*\n", + "\n", + "**Reference implementation for music source separation**\n", + "\n", + "\"alt\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c1b6c35b", + "metadata": {}, + "outputs": [], + "source": [ + "%%bash\n", + "# assuming you have a PyTorch >=1.6.0 installed\n", + "pip install -q torchaudio" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "869d0784", + "metadata": {}, + "outputs": [], + "source": [ + "import torch\n", + "\n", + "# loading umxhq four target separator\n", + "separator = torch.hub.load('sigsep/open-unmix-pytorch', 'umxhq')\n", + "\n", + "# generate random audio\n", + "# ... with shape (nb_samples, nb_channels, nb_timesteps)\n", + "# ... and with the same sample rate as that of the separator\n", + "audio = torch.rand((1, 2, 100000))\n", + "original_sample_rate = separator.sample_rate\n", + "\n", + "# make sure to resample the audio to models' sample rate, separator.sample_rate, if the two are different\n", + "# resampler = torchaudio.transforms.Resample(original_sample_rate, separator.sample_rate)\n", + "# audio = resampler(audio)\n", + "\n", + "estimates = separator(audio)\n", + "# estimates.shape = (1, 4, 2, 100000)" + ] + }, + { + "cell_type": "markdown", + "id": "7e8fcf3c", + "metadata": {}, + "source": [ + "### Model Description\n", + "\n", + "__Open-Unmix__ provides ready-to-use models that allow users to separate pop music into four stems: __vocals__, __drums__, __bass__ and the remaining __other__ instruments. 
The models were pre-trained on the freely available [MUSDB18](https://sigsep.github.io/datasets/musdb.html) dataset.\n", + "\n", + "Each target model is based on a three-layer bidirectional deep LSTM. The model learns to predict the magnitude spectrogram of a target source, like vocals, from the magnitude spectrogram of a mixture input. Internally, the prediction is obtained by applying a mask on the input. The model is optimized in the magnitude domain using mean squared error.\n", + "\n", + "A `Separator` meta-model (as shown in the code example above) puts together multiple _Open-unmix_ spectrogram models for each desired target, and combines their output through a multichannel generalized Wiener filter, before application of inverse STFTs using `torchaudio`.\n", + "The filtering is a differentiable (but parameter-free) version of [norbert](https://github.com/sigsep/norbert).\n", + "\n", + "### Pre-trained `Separator` models\n", + "\n", + "* __`umxhq` (default)__ trained on [MUSDB18-HQ](https://sigsep.github.io/datasets/musdb.html#uncompressed-wav) which comprises the same tracks as in MUSDB18 but un-compressed, which yields a full bandwidth of 22050 Hz.\n", + "\n", + "* __`umx`__ is trained on the regular [MUSDB18](https://sigsep.github.io/datasets/musdb.html#compressed-stems) which is bandwidth limited to 16 kHz due to AAC compression. This model should be used for comparison with other (older) methods for evaluation in [SiSEC18](https://sisec18.unmix.app).\n", + "\n", + "Furthermore, we provide a model for speech enhancement trained by [Sony Corporation](link)\n", + "\n", + "* __`umxse`__ speech enhancement model is trained on the 28-speaker version of the [Voicebank+DEMAND corpus](https://datashare.is.ed.ac.uk/handle/10283/1942?show=full).\n", + "\n", + "All three models are also available as spectrogram (core) models, which take magnitude spectrogram inputs and output separated spectrograms.\n", + "These models can be loaded using `umxhq_spec`, `umx_spec` and `umxse_spec`.\n", + "\n", + "### Details\n", + "\n", + "For additional examples and documentation, please visit [the github repo](https://github.com/sigsep/open-unmix-pytorch).\n", + "\n", + "Furthermore, the models and all utility functions to preprocess, read and save audio stems are available in a python package that can be installed via" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8ad88076", + "metadata": {}, + "outputs": [], + "source": [ + "%%bash\n", + "pip install openunmix" + ] + }, + { + "cell_type": "markdown", + "id": "2f026e5d", + "metadata": {}, + "source": [ + "### References\n", + "\n", + "- [Open-Unmix - A Reference Implementation for Music Source Separation](https://doi.org/10.21105/joss.01667)\n", + "- [SigSep - Open Resources for Music Separation](https://sigsep.github.io/)" + ] + } + ], + "metadata": {}, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/assets/hub/simplenet.ipynb b/assets/hub/simplenet.ipynb new file mode 100644 index 000000000000..b9e57af0ee25 --- /dev/null +++ b/assets/hub/simplenet.ipynb @@ -0,0 +1,169 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "72d50304", + "metadata": {}, + "source": [ + "### This notebook is optionally accelerated with a GPU runtime.\n", + "### If you would like to use this acceleration, please select the menu option \"Runtime\" -> \"Change runtime type\", select \"Hardware Accelerator\" -> \"GPU\" and click \"SAVE\"\n", + "\n", + "----------------------------------------------------------------------\n", + 
"\n", + "# SimpleNet\n", + "\n", + "*Author: Seyyed Hossein Hasanpour*\n", + "\n", + "**Lets Keep it simple, Using simple architectures to outperform deeper and more complex architectures**\n", + "\n", + "\"alt\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0d02ef84", + "metadata": {}, + "outputs": [], + "source": [ + "import torch\n", + "model = torch.hub.load(\"coderx7/simplenet_pytorch:v1.0.0\", \"simplenetv1_5m_m1\", pretrained=True)\n", + "# or any of these variants\n", + "# model = torch.hub.load(\"coderx7/simplenet_pytorch:v1.0.0\", \"simplenetv1_5m_m2\", pretrained=True)\n", + "# model = torch.hub.load(\"coderx7/simplenet_pytorch:v1.0.0\", \"simplenetv1_9m_m1\", pretrained=True)\n", + "# model = torch.hub.load(\"coderx7/simplenet_pytorch:v1.0.0\", \"simplenetv1_9m_m2\", pretrained=True)\n", + "# model = torch.hub.load(\"coderx7/simplenet_pytorch:v1.0.0\", \"simplenetv1_small_m1_05\", pretrained=True)\n", + "# model = torch.hub.load(\"coderx7/simplenet_pytorch:v1.0.0\", \"simplenetv1_small_m2_05\", pretrained=True)\n", + "# model = torch.hub.load(\"coderx7/simplenet_pytorch:v1.0.0\", \"simplenetv1_small_m1_075\", pretrained=True)\n", + "# model = torch.hub.load(\"coderx7/simplenet_pytorch:v1.0.0\", \"simplenetv1_small_m2_075\", pretrained=True)\n", + "model.eval()" + ] + }, + { + "cell_type": "markdown", + "id": "ec4ddcb6", + "metadata": {}, + "source": [ + "All pre-trained models expect input images normalized in the same way,\n", + "i.e. mini-batches of 3-channel RGB images of shape `(3 x H x W)`, where `H` and `W` are expected to be at least `224`.\n", + "The images have to be loaded in to a range of `[0, 1]` and then normalized using `mean = [0.485, 0.456, 0.406]`\n", + "and `std = [0.229, 0.224, 0.225]`.\n", + "\n", + "Here's a sample execution." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0c655a2d", + "metadata": {}, + "outputs": [], + "source": [ + "# Download an example image from the pytorch website\n", + "import urllib\n", + "url, filename = (\"https://github.com/pytorch/hub/raw/master/images/dog.jpg\", \"dog.jpg\")\n", + "try: urllib.URLopener().retrieve(url, filename)\n", + "except: urllib.request.urlretrieve(url, filename)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "43bb8ba8", + "metadata": {}, + "outputs": [], + "source": [ + "# sample execution (requires torchvision)\n", + "from PIL import Image\n", + "from torchvision import transforms\n", + "input_image = Image.open(filename)\n", + "preprocess = transforms.Compose([\n", + " transforms.Resize(256),\n", + " transforms.CenterCrop(224),\n", + " transforms.ToTensor(),\n", + " transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),\n", + "])\n", + "input_tensor = preprocess(input_image)\n", + "input_batch = input_tensor.unsqueeze(0) # create a mini-batch as expected by the model\n", + "\n", + "# move the input and model to GPU for speed if available\n", + "if torch.cuda.is_available():\n", + " input_batch = input_batch.to('cuda')\n", + " model.to('cuda')\n", + "\n", + "with torch.no_grad():\n", + " output = model(input_batch)\n", + "# Tensor of shape 1000, with confidence scores over ImageNet's 1000 classes\n", + "print(output[0])\n", + "# The output has unnormalized scores. 
To get probabilities, you can run a softmax on it.\n", + "probabilities = torch.nn.functional.softmax(output[0], dim=0)\n", + "print(probabilities)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d89b13ff", + "metadata": {}, + "outputs": [], + "source": [ + "# Download ImageNet labels\n", + "!wget https://raw.githubusercontent.com/pytorch/hub/master/imagenet_classes.txt" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ff946e58", + "metadata": {}, + "outputs": [], + "source": [ + "# Read the categories\n", + "with open(\"imagenet_classes.txt\", \"r\") as f:\n", + " categories = [s.strip() for s in f.readlines()]\n", + "# Show top categories per image\n", + "top5_prob, top5_catid = torch.topk(probabilities, 5)\n", + "for i in range(top5_prob.size(0)):\n", + " print(categories[top5_catid[i]], top5_prob[i].item())" + ] + }, + { + "cell_type": "markdown", + "id": "dbd43f60", + "metadata": {}, + "source": [ + "### Model Description\n", + "\n", + "SimpleNet models were proposed in \"Lets Keep it simple, Using simple architectures to outperform deeper and more complex architectures\". \n", + "Here we have the 8 versions of simplenet models, which contains 1.5m, 3.2m, 5.7m and 9.5m parameters respectively. \n", + "Detailed model architectures can be found in Table 1 and Table 2. \n", + "Their 1-crop errors on ImageNet dataset with pretrained models are listed below. \n", + "\n", + "The m2 variants \n", + "\n", + "| Model structure | Top-1 errors | Top-5 errors |\n", + "| :------------------------- | :-----------: | :-----------:|\n", + "| simplenetv1_small_m2_05 | 38.33 | 16.512 |\n", + "| simplenetv1_small_m2_075 | 31.494 | 11.85 |\n", + "| simplenetv1_5m_m2 | 27.97 | 9.676 |\n", + "| simplenetv1_9m_m2 | 25.77 | 8.252 |\n", + "\n", + "The m1 variants \n", + "\n", + "| Model structure | Top-1 errors | Top-5 errors |\n", + "| :------------------------- | :-----------: | :-----------:|\n", + "| simplenetv1_small_m1_05 | 38.878 | 17.012 |\n", + "| simplenetv1_small_m1_075 | 32.216 | 12.282 |\n", + "| simplenetv1_5m_m1 | 28.452 | 10.06 |\n", + "| simplenetv1_9m_m1 | 26.208 | 8.514 |\n", + "\n", + "### References\n", + "\n", + " - [Lets Keep it simple, Using simple architectures to outperform deeper and more complex architectures](https://arxiv.org/abs/1608.06037)" + ] + } + ], + "metadata": {}, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/assets/hub/snakers4_silero-models_stt.ipynb b/assets/hub/snakers4_silero-models_stt.ipynb new file mode 100644 index 000000000000..df2cd221f606 --- /dev/null +++ b/assets/hub/snakers4_silero-models_stt.ipynb @@ -0,0 +1,106 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "cadeb1df", + "metadata": {}, + "source": [ + "### This notebook is optionally accelerated with a GPU runtime.\n", + "### If you would like to use this acceleration, please select the menu option \"Runtime\" -> \"Change runtime type\", select \"Hardware Accelerator\" -> \"GPU\" and click \"SAVE\"\n", + "\n", + "----------------------------------------------------------------------\n", + "\n", + "# Silero Speech-To-Text Models\n", + "\n", + "*Author: Silero AI Team*\n", + "\n", + "**A set of compact enterprise-grade pre-trained STT Models for multiple languages.**\n", + "\n", + "_ | _\n", + "- | -\n", + "![alt](https://pytorch.org/assets/images/silero_stt_model.jpg) | ![alt](https://pytorch.org/assets/images/silero_imagenet_moment.png)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "df36f984", + "metadata": {}, 
+ "outputs": [], + "source": [ + "%%bash\n", + "# this assumes that you have a proper version of PyTorch already installed\n", + "pip install -q torchaudio omegaconf soundfile" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1ab515b9", + "metadata": {}, + "outputs": [], + "source": [ + "import torch\n", + "import zipfile\n", + "import torchaudio\n", + "from glob import glob\n", + "\n", + "device = torch.device('cpu') # gpu also works, but our models are fast enough for CPU\n", + "\n", + "model, decoder, utils = torch.hub.load(repo_or_dir='snakers4/silero-models',\n", + " model='silero_stt',\n", + " language='en', # also available 'de', 'es'\n", + " device=device)\n", + "(read_batch, split_into_batches,\n", + " read_audio, prepare_model_input) = utils # see function signature for details\n", + "\n", + "# download a single file, any format compatible with TorchAudio (soundfile backend)\n", + "torch.hub.download_url_to_file('https://opus-codec.org/static/examples/samples/speech_orig.wav',\n", + " dst ='speech_orig.wav', progress=True)\n", + "test_files = glob('speech_orig.wav')\n", + "batches = split_into_batches(test_files, batch_size=10)\n", + "input = prepare_model_input(read_batch(batches[0]),\n", + " device=device)\n", + "\n", + "output = model(input)\n", + "for example in output:\n", + " print(decoder(example.cpu()))" + ] + }, + { + "cell_type": "markdown", + "id": "84bebade", + "metadata": {}, + "source": [ + "### Model Description\n", + "\n", + "Silero Speech-To-Text models provide enterprise grade STT in a compact form-factor for several commonly spoken languages. Unlike conventional ASR models our models are robust to a variety of dialects, codecs, domains, noises, lower sampling rates (for simplicity audio should be resampled to 16 kHz). The models consume a normalized audio in the form of samples (i.e. without any pre-processing except for normalization to -1 ... 1) and output frames with token probabilities. We provide a decoder utility for simplicity (we could include it into our model itself, but scripted modules had problems with storing model artifacts i.e. labels during certain export scenarios).\n", + "\n", + "We hope that our efforts with Open-STT and Silero Models will bring the ImageNet moment in speech closer.\n", + "\n", + "### Supported Languages and Formats\n", + "\n", + "As of this page update, the following languages are supported:\n", + "\n", + "- English\n", + "- German\n", + "- Spanish\n", + "\n", + "To see the always up-to-date language list, please visit our [repo](https://github.com/snakers4/silero-models) and see the `yml` [file](https://github.com/snakers4/silero-models/blob/master/models.yml) for all available checkpoints.\n", + "\n", + "### Additional Examples and Benchmarks\n", + "\n", + "For additional examples and other model formats please visit this [link](https://github.com/snakers4/silero-models). For quality and performance benchmarks please see the [wiki](https://github.com/snakers4/silero-models/wiki). 
These resources will be updated from time to time.\n", + "\n", + "### References\n", + "\n", + "- [Silero Models](https://github.com/snakers4/silero-models)\n", + "- [Alexander Veysov, \"Toward's an ImageNet Moment for Speech-to-Text\", The Gradient, 2020](https://thegradient.pub/towards-an-imagenet-moment-for-speech-to-text/)\n", + "- [Alexander Veysov, \"A Speech-To-Text Practitioner’s Criticisms of Industry and Academia\", The Gradient, 2020](https://thegradient.pub/a-speech-to-text-practitioners-criticisms-of-industry-and-academia/)" + ] + } + ], + "metadata": {}, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/assets/hub/snakers4_silero-models_tts.ipynb b/assets/hub/snakers4_silero-models_tts.ipynb new file mode 100644 index 000000000000..5a674397cd29 --- /dev/null +++ b/assets/hub/snakers4_silero-models_tts.ipynb @@ -0,0 +1,99 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "ff883b45", + "metadata": {}, + "source": [ + "### This notebook is optionally accelerated with a GPU runtime.\n", + "### If you would like to use this acceleration, please select the menu option \"Runtime\" -> \"Change runtime type\", select \"Hardware Accelerator\" -> \"GPU\" and click \"SAVE\"\n", + "\n", + "----------------------------------------------------------------------\n", + "\n", + "# Silero Text-To-Speech Models\n", + "\n", + "*Author: Silero AI Team*\n", + "\n", + "**A set of compact enterprise-grade pre-trained TTS Models for multiple languages**" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1ce245de", + "metadata": {}, + "outputs": [], + "source": [ + "%%bash\n", + "# this assumes that you have a proper version of PyTorch already installed\n", + "pip install -q torchaudio omegaconf" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "39b1ae7f", + "metadata": {}, + "outputs": [], + "source": [ + "import torch\n", + "\n", + "language = 'en'\n", + "speaker = 'lj_16khz'\n", + "device = torch.device('cpu')\n", + "model, symbols, sample_rate, example_text, apply_tts = torch.hub.load(repo_or_dir='snakers4/silero-models',\n", + " model='silero_tts',\n", + " language=language,\n", + " speaker=speaker)\n", + "model = model.to(device) # gpu or cpu\n", + "audio = apply_tts(texts=[example_text],\n", + " model=model,\n", + " sample_rate=sample_rate,\n", + " symbols=symbols,\n", + " device=device)" + ] + }, + { + "cell_type": "markdown", + "id": "352c834f", + "metadata": {}, + "source": [ + "### Model Description\n", + "\n", + "Silero Text-To-Speech models provide enterprise grade TTS in a compact form-factor for several commonly spoken languages:\n", + "\n", + "- One-line usage\n", + "- Naturally sounding speech\n", + "- No GPU or training required\n", + "- Minimalism and lack of dependencies\n", + "- A library of voices in many languages\n", + "- Support for `16kHz` and `8kHz` out of the box\n", + "- High throughput on slow hardware. 
Decent performance on one CPU thread\n", + "\n", + "### Supported Languages and Formats\n", + "\n", + "As of this page update, the speakers of the following languages are supported both in 8 kHz and 16 kHz:\n", + "\n", + "- Russian (6 speakers)\n", + "- English (1 speaker)\n", + "- German (1 speaker)\n", + "- Spanish (1 speaker)\n", + "- French (1 speaker)\n", + "\n", + "To see the always up-to-date language list, please visit our [repo](https://github.com/snakers4/silero-models) and see the `yml` [file](https://github.com/snakers4/silero-models/blob/master/models.yml) for all available checkpoints.\n", + "\n", + "### Additional Examples and Benchmarks\n", + "\n", + "For additional examples and other model formats please visit this [link](https://github.com/snakers4/silero-models). For quality and performance benchmarks please see the [wiki](https://github.com/snakers4/silero-models/wiki). These resources will be updated from time to time.\n", + "\n", + "### References\n", + "\n", + "- [Silero Models](https://github.com/snakers4/silero-models)\n", + "- [High-Quality Speech-to-Text Made Accessible, Simple and Fast](https://habr.com/ru/post/549482/)" + ] + } + ], + "metadata": {}, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/assets/hub/snakers4_silero-vad_vad.ipynb b/assets/hub/snakers4_silero-vad_vad.ipynb new file mode 100644 index 000000000000..9cdf0c02f217 --- /dev/null +++ b/assets/hub/snakers4_silero-vad_vad.ipynb @@ -0,0 +1,95 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "4e119581", + "metadata": {}, + "source": [ + "### This notebook is optionally accelerated with a GPU runtime.\n", + "### If you would like to use this acceleration, please select the menu option \"Runtime\" -> \"Change runtime type\", select \"Hardware Accelerator\" -> \"GPU\" and click \"SAVE\"\n", + "\n", + "----------------------------------------------------------------------\n", + "\n", + "# Silero Voice Activity Detector\n", + "\n", + "*Author: Silero AI Team*\n", + "\n", + "**Pre-trained Voice Activity Detector**\n", + "\n", + "\"alt\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0a9a1b01", + "metadata": {}, + "outputs": [], + "source": [ + "%%bash\n", + "# this assumes that you have a proper version of PyTorch already installed\n", + "pip install -q torchaudio" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a638e514", + "metadata": {}, + "outputs": [], + "source": [ + "import torch\n", + "torch.set_num_threads(1)\n", + "\n", + "from IPython.display import Audio\n", + "from pprint import pprint\n", + "# download example\n", + "torch.hub.download_url_to_file('https://models.silero.ai/vad_models/en.wav', 'en_example.wav')\n", + "\n", + "model, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad',\n", + " model='silero_vad',\n", + " force_reload=True)\n", + "\n", + "(get_speech_timestamps,\n", + " _, read_audio,\n", + " *_) = utils\n", + "\n", + "sampling_rate = 16000 # also accepts 8000\n", + "wav = read_audio('en_example.wav', sampling_rate=sampling_rate)\n", + "# get speech timestamps from full audio file\n", + "speech_timestamps = get_speech_timestamps(wav, model, sampling_rate=sampling_rate)\n", + "pprint(speech_timestamps)" + ] + }, + { + "cell_type": "markdown", + "id": "9c5dc9e9", + "metadata": {}, + "source": [ + "### Model Description\n", + "\n", + "Silero VAD: pre-trained enterprise-grade Voice Activity Detector (VAD). Enterprise-grade Speech Products made refreshingly simple (see our STT models). 
**Each model is published separately**.\n", + "\n", + "Currently, there are hardly any high quality / modern / free / public voice activity detectors except for WebRTC Voice Activity Detector (link). WebRTC though starts to show its age and it suffers from many false positives.\n", + "\n", + "**(!!!) Important Notice (!!!)** - the models are intended to run on CPU only and were optimized for performance on 1 CPU thread. Note that the model is quantized.\n", + "\n", + "\n", + "### Additional Examples and Benchmarks\n", + "\n", + "For additional examples and other model formats please visit this [link](https://github.com/snakers4/silero-vad) and please refer to the extensive examples in the Colab format (including the streaming examples).\n", + "\n", + "### References\n", + "\n", + "VAD model architectures are based on similar STT architectures.\n", + "\n", + "- [Silero VAD](https://github.com/snakers4/silero-vad)\n", + "- [Alexander Veysov, \"Toward's an ImageNet Moment for Speech-to-Text\", The Gradient, 2020](https://thegradient.pub/towards-an-imagenet-moment-for-speech-to-text/)\n", + "- [Alexander Veysov, \"A Speech-To-Text Practitioner’s Criticisms of Industry and Academia\", The Gradient, 2020](https://thegradient.pub/a-speech-to-text-practitioners-criticisms-of-industry-and-academia/)" + ] + } + ], + "metadata": {}, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/assets/hub/ultralytics_yolov5.ipynb b/assets/hub/ultralytics_yolov5.ipynb new file mode 100644 index 000000000000..1dacc0f1dc8d --- /dev/null +++ b/assets/hub/ultralytics_yolov5.ipynb @@ -0,0 +1,167 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "5c265ba5", + "metadata": {}, + "source": [ + "### This notebook is optionally accelerated with a GPU runtime.\n", + "### If you would like to use this acceleration, please select the menu option \"Runtime\" -> \"Change runtime type\", select \"Hardware Accelerator\" -> \"GPU\" and click \"SAVE\"\n", + "\n", + "----------------------------------------------------------------------\n", + "\n", + "# YOLOv5\n", + "\n", + "*Author: Ultralytics*\n", + "\n", + "**Ultralytics YOLOv5 🚀 for object detection, instance segmentation and image classification.**\n", + "\n", + "_ | _\n", + "- | -\n", + "![alt](https://pytorch.org/assets/images/ultralytics_yolov5_img1.png) | ![alt](https://pytorch.org/assets/images/ultralytics_yolov5_img2.png)\n", + "\n", + "\n", + "## Before You Start\n", + "\n", + "Start from a **Python>=3.8** environment with **PyTorch>=1.7** installed. To install PyTorch see [https://pytorch.org/get-started/locally/](https://pytorch.org/get-started/locally/). To install YOLOv5 dependencies:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d6f3bfa8", + "metadata": {}, + "outputs": [], + "source": [ + "%%bash\n", + "pip install -U ultralytics" + ] + }, + { + "cell_type": "markdown", + "id": "6f248585", + "metadata": {}, + "source": [ + "## Model Description\n", + "\n", + "\"YOLO\n", + "\n", + "Ultralytics YOLOv5 🚀 is a cutting-edge, state-of-the-art (SOTA) model that builds upon the success of previous YOLO versions and introduces new features and improvements to further boost performance and flexibility. YOLOv5 is designed to be fast, accurate, and easy to use, making it an excellent choice for a wide range of object detection, instance segmentation and image classification tasks.\n", + "\n", + "We hope that the resources here will help you get the most out of YOLOv5. 
Please browse the YOLOv5 [Docs](https://docs.ultralytics.com/yolov5) for details, raise an issue on [GitHub](https://github.com/ultralytics/yolov5/issues/new/choose) for support, and join our [Discord](https://discord.gg/n6cFeSPZdD) community for questions and discussions!\n", + "\n",
+ "| Model | size<br>(pixels) | mAPval<br>50-95 | mAPval<br>50 | Speed<br>CPU b1<br>(ms) | Speed<br>V100 b1<br>(ms) | Speed<br>V100 b32<br>(ms) | params<br>(M) | FLOPs<br>@640 (B) |\n",
+ "|---|---|---|---|---|---|---|---|---|\n",
+ "| [YOLOv5n](https://github.com/ultralytics/yolov5/releases/download/v7.0/yolov5n.pt) | 640 | 28.0 | 45.7 | **45** | **6.3** | **0.6** | **1.9** | **4.5** |\n",
+ "| [YOLOv5s](https://github.com/ultralytics/yolov5/releases/download/v7.0/yolov5s.pt) | 640 | 37.4 | 56.8 | 98 | 6.4 | 0.9 | 7.2 | 16.5 |\n",
+ "| [YOLOv5m](https://github.com/ultralytics/yolov5/releases/download/v7.0/yolov5m.pt) | 640 | 45.4 | 64.1 | 224 | 8.2 | 1.7 | 21.2 | 49.0 |\n",
+ "| [YOLOv5l](https://github.com/ultralytics/yolov5/releases/download/v7.0/yolov5l.pt) | 640 | 49.0 | 67.3 | 430 | 10.1 | 2.7 | 46.5 | 109.1 |\n",
+ "| [YOLOv5x](https://github.com/ultralytics/yolov5/releases/download/v7.0/yolov5x.pt) | 640 | 50.7 | 68.9 | 766 | 12.1 | 4.8 | 86.7 | 205.7 |\n",
+ "| | | | | | | | | |\n",
+ "| [YOLOv5n6](https://github.com/ultralytics/yolov5/releases/download/v7.0/yolov5n6.pt) | 1280 | 36.0 | 54.4 | 153 | 8.1 | 2.1 | 3.2 | 4.6 |\n",
+ "| [YOLOv5s6](https://github.com/ultralytics/yolov5/releases/download/v7.0/yolov5s6.pt) | 1280 | 44.8 | 63.7 | 385 | 8.2 | 3.6 | 12.6 | 16.8 |\n",
+ "| [YOLOv5m6](https://github.com/ultralytics/yolov5/releases/download/v7.0/yolov5m6.pt) | 1280 | 51.3 | 69.3 | 887 | 11.1 | 6.8 | 35.7 | 50.0 |\n",
+ "| [YOLOv5l6](https://github.com/ultralytics/yolov5/releases/download/v7.0/yolov5l6.pt) | 1280 | 53.7 | 71.3 | 1784 | 15.8 | 10.5 | 76.8 | 111.4 |\n",
+ "| [YOLOv5x6](https://github.com/ultralytics/yolov5/releases/download/v7.0/yolov5x6.pt)<br>+ [TTA] | 1280<br>1536 | 55.0<br>**55.8** | 72.7<br>**72.7** | 3136<br>- | 26.2<br>- | 19.4<br>- | 140.7<br>- | 209.8<br>- |\n", + "\n",
+ "**Table Notes**\n", + "\n",
+ "- All checkpoints are trained to 300 epochs with default settings. Nano and Small models use [hyp.scratch-low.yaml](https://github.com/ultralytics/yolov5/blob/master/data/hyps/hyp.scratch-low.yaml) hyps, all others use [hyp.scratch-high.yaml](https://github.com/ultralytics/yolov5/blob/master/data/hyps/hyp.scratch-high.yaml).\n",
+ "- **mAPval** values are for single-model single-scale on [COCO val2017](http://cocodataset.org) dataset.<br>Reproduce by `python val.py --data coco.yaml --img 640 --conf 0.001 --iou 0.65`\n",
+ "- **Speed** averaged over COCO val images using an [AWS p3.2xlarge](https://aws.amazon.com/ec2/instance-types/p3/) instance. NMS times (~1 ms/img) not included.<br>Reproduce by `python val.py --data coco.yaml --img 640 --task speed --batch 1`\n",
+ "- **TTA** [Test Time Augmentation](https://docs.ultralytics.com/yolov5/tutorials/test_time_augmentation) includes reflection and scale augmentations.<br>Reproduce by `python val.py --data coco.yaml --img 1536 --iou 0.7 --augment`\n", + "\n", + "
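Any of the checkpoints listed in the table above can be swapped into the Hub call by name. A minimal sketch, assuming the standard `ultralytics/yolov5` Hub entry point and the `conf`/`iou` threshold attributes exposed by its AutoShape wrapper (the image URL and threshold values are only illustrative):

```python
import torch

# Load any checkpoint from the table above by name ('yolov5n' ... 'yolov5x6').
model = torch.hub.load('ultralytics/yolov5', 'yolov5m', pretrained=True)

# Illustrative post-processing thresholds (attributes assumed from the YOLOv5 Hub tutorial).
model.conf = 0.25  # NMS confidence threshold
model.iou = 0.45   # NMS IoU threshold

# Run inference at 640 px and inspect detections as a pandas DataFrame.
results = model('https://ultralytics.com/images/zidane.jpg', size=640)
print(results.pandas().xyxy[0])
```

The checkpoint name passed to `torch.hub.load` is the main speed/accuracy knob: larger checkpoints such as `yolov5x6` trade the latencies shown in the table for higher mAP.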
        \n", + "\n", + "## Load From PyTorch Hub\n", + "\n", + "This example loads a pretrained **YOLOv5s** model and passes an image for inference. YOLOv5 accepts **URL**, **Filename**, **PIL**, **OpenCV**, **Numpy** and **PyTorch** inputs, and returns detections in **torch**, **pandas**, and **JSON** output formats. See the [YOLOv5 PyTorch Hub Tutorial](https://docs.ultralytics.com/yolov5/tutorials/pytorch_hub_model_loading/) for details." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5a729163", + "metadata": {}, + "outputs": [], + "source": [ + "import torch\n", + "\n", + "# Model\n", + "model = torch.hub.load('ultralytics/yolov5', 'yolov5s', pretrained=True)\n", + "\n", + "# Images\n", + "imgs = ['https://ultralytics.com/images/zidane.jpg'] # batch of images\n", + "\n", + "# Inference\n", + "results = model(imgs)\n", + "\n", + "# Results\n", + "results.print()\n", + "results.save() # or .show()\n", + "\n", + "results.xyxy[0] # img1 predictions (tensor)\n", + "results.pandas().xyxy[0] # img1 predictions (pandas)\n", + "# xmin ymin xmax ymax confidence class name\n", + "# 0 749.50 43.50 1148.0 704.5 0.874023 0 person\n", + "# 1 433.50 433.50 517.5 714.5 0.687988 27 tie\n", + "# 2 114.75 195.75 1095.0 708.0 0.624512 0 person\n", + "# 3 986.00 304.00 1028.0 420.0 0.286865 27 tie" + ] + }, + { + "cell_type": "markdown", + "id": "6d4d3437", + "metadata": {}, + "source": [ + "## Citation\n", + "\n", + "If you use YOLOv5 or YOLOv5u in your research, please cite the Ultralytics YOLOv5 repository as follows:\n", + "\n", + "[![DOI](https://zenodo.org/badge/264818686.svg)](https://zenodo.org/badge/latestdoi/264818686)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b8e175a9", + "metadata": { + "attributes": { + "classes": [ + "bibtex" + ], + "id": "" + } + }, + "outputs": [], + "source": [ + "@software{yolov5,\n", + " title = {YOLOv5 by Ultralytics},\n", + " author = {Glenn Jocher},\n", + " year = {2020},\n", + " version = {7.0},\n", + " license = {AGPL-3.0},\n", + " url = {https://github.com/ultralytics/yolov5},\n", + " doi = {10.5281/zenodo.3908559},\n", + " orcid = {0000-0001-5950-6979}\n", + "}" + ] + }, + { + "cell_type": "markdown", + "id": "5aca7c41", + "metadata": {}, + "source": [ + "## Contact\n", + "\n", + "For YOLOv5 bug reports and feature requests please visit [GitHub Issues](https://github.com/ultralytics/yolov5/issues), and join our [Discord](https://discord.gg/n6cFeSPZdD) community for questions and discussions!\n", + "\n", + " " + ] + } + ], + "metadata": {}, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/assets/images/Cub200Dataset.png b/assets/images/Cub200Dataset.png new file mode 100644 index 000000000000..ead780b0d8ac Binary files /dev/null and b/assets/images/Cub200Dataset.png differ diff --git a/assets/images/MEALV2.png b/assets/images/MEALV2.png new file mode 100644 index 000000000000..b4e8b2088599 Binary files /dev/null and b/assets/images/MEALV2.png differ diff --git a/assets/images/MEALV2_method.png b/assets/images/MEALV2_method.png new file mode 100644 index 000000000000..02f7668d4a8c Binary files /dev/null and b/assets/images/MEALV2_method.png differ diff --git a/assets/images/MEALV2_results.png b/assets/images/MEALV2_results.png new file mode 100644 index 000000000000..947734e7044c Binary files /dev/null and b/assets/images/MEALV2_results.png differ diff --git a/assets/images/ResNeXtArch.png b/assets/images/ResNeXtArch.png new file mode 100644 index 000000000000..b75d41b64af5 Binary files /dev/null and 
b/assets/images/ResNeXtArch.png differ diff --git a/assets/images/SEArch.png b/assets/images/SEArch.png new file mode 100755 index 000000000000..a7fb8d047226 Binary files /dev/null and b/assets/images/SEArch.png differ diff --git a/assets/images/classification.jpg b/assets/images/classification.jpg new file mode 100644 index 000000000000..eb1e20641c3c Binary files /dev/null and b/assets/images/classification.jpg differ diff --git a/assets/images/dog.jpg b/assets/images/dog.jpg new file mode 100644 index 000000000000..12f0e0dd1162 Binary files /dev/null and b/assets/images/dog.jpg differ diff --git a/assets/images/fastpitch_model.png b/assets/images/fastpitch_model.png new file mode 100644 index 000000000000..f828877edfdd Binary files /dev/null and b/assets/images/fastpitch_model.png differ diff --git a/assets/images/ghostnet.png b/assets/images/ghostnet.png new file mode 100644 index 000000000000..b91337e2aea3 Binary files /dev/null and b/assets/images/ghostnet.png differ diff --git a/assets/images/hifigan_model.png b/assets/images/hifigan_model.png new file mode 100644 index 000000000000..9ba92bb6a5e8 Binary files /dev/null and b/assets/images/hifigan_model.png differ diff --git a/assets/images/hybridnets.jpg b/assets/images/hybridnets.jpg new file mode 100644 index 000000000000..ee053ce4f549 Binary files /dev/null and b/assets/images/hybridnets.jpg differ diff --git a/assets/images/ibnnet.png b/assets/images/ibnnet.png new file mode 100644 index 000000000000..d6c0ce6006da Binary files /dev/null and b/assets/images/ibnnet.png differ diff --git a/assets/images/intel-logo.png b/assets/images/intel-logo.png new file mode 100644 index 000000000000..2d022a97c15a Binary files /dev/null and b/assets/images/intel-logo.png differ diff --git a/assets/images/midas_samples.png b/assets/images/midas_samples.png new file mode 100644 index 000000000000..921e290edbae Binary files /dev/null and b/assets/images/midas_samples.png differ diff --git a/assets/images/nts-net.png b/assets/images/nts-net.png new file mode 100644 index 000000000000..b7bd97b1ec70 Binary files /dev/null and b/assets/images/nts-net.png differ diff --git a/assets/images/ofa_imagenet_results.png b/assets/images/ofa_imagenet_results.png new file mode 100644 index 000000000000..46ceae12c0c5 Binary files /dev/null and b/assets/images/ofa_imagenet_results.png differ diff --git a/assets/images/once_for_all_overview.png b/assets/images/once_for_all_overview.png new file mode 100644 index 000000000000..555bf30cc5e1 Binary files /dev/null and b/assets/images/once_for_all_overview.png differ diff --git a/assets/images/resnest.jpg b/assets/images/resnest.jpg new file mode 100644 index 000000000000..994dc6ff00ee Binary files /dev/null and b/assets/images/resnest.jpg differ diff --git a/assets/images/sigsep_logo_inria.png b/assets/images/sigsep_logo_inria.png new file mode 100644 index 000000000000..066ea8861253 Binary files /dev/null and b/assets/images/sigsep_logo_inria.png differ diff --git a/assets/images/sigsep_umx-diagram.png b/assets/images/sigsep_umx-diagram.png new file mode 100644 index 000000000000..9cb5c4a3591d Binary files /dev/null and b/assets/images/sigsep_umx-diagram.png differ diff --git a/assets/images/silero_imagenet_moment.png b/assets/images/silero_imagenet_moment.png new file mode 100644 index 000000000000..faa16dc5ce49 Binary files /dev/null and b/assets/images/silero_imagenet_moment.png differ diff --git a/assets/images/silero_logo.jpg b/assets/images/silero_logo.jpg new file mode 100644 index 000000000000..0ced1942afa6 
Binary files /dev/null and b/assets/images/silero_logo.jpg differ diff --git a/assets/images/silero_stt_model.jpg b/assets/images/silero_stt_model.jpg new file mode 100644 index 000000000000..2e67c11c2d31 Binary files /dev/null and b/assets/images/silero_stt_model.jpg differ diff --git a/assets/images/silero_vad_performance.png b/assets/images/silero_vad_performance.png new file mode 100644 index 000000000000..9d1d9f4f1479 Binary files /dev/null and b/assets/images/silero_vad_performance.png differ diff --git a/assets/images/simplenet.jpg b/assets/images/simplenet.jpg new file mode 100644 index 000000000000..e3bc71437dc9 Binary files /dev/null and b/assets/images/simplenet.jpg differ diff --git a/assets/images/slowfast.png b/assets/images/slowfast.png new file mode 100644 index 000000000000..c5f542a1f81e Binary files /dev/null and b/assets/images/slowfast.png differ diff --git a/assets/images/snnmlp.png b/assets/images/snnmlp.png new file mode 100644 index 000000000000..f08f8ea86f6d Binary files /dev/null and b/assets/images/snnmlp.png differ diff --git a/assets/images/ultralytics_yolov5_img0.jpg b/assets/images/ultralytics_yolov5_img0.jpg new file mode 100644 index 000000000000..b4147e36764a Binary files /dev/null and b/assets/images/ultralytics_yolov5_img0.jpg differ diff --git a/assets/images/ultralytics_yolov5_img1.png b/assets/images/ultralytics_yolov5_img1.png new file mode 100644 index 000000000000..73b996b237df Binary files /dev/null and b/assets/images/ultralytics_yolov5_img1.png differ diff --git a/assets/images/ultralytics_yolov5_img2.png b/assets/images/ultralytics_yolov5_img2.png new file mode 100644 index 000000000000..4e648fba938d Binary files /dev/null and b/assets/images/ultralytics_yolov5_img2.png differ diff --git a/assets/images/x3d.png b/assets/images/x3d.png new file mode 100644 index 000000000000..7f86e44b724f Binary files /dev/null and b/assets/images/x3d.png differ diff --git a/assets/images/yolop.png b/assets/images/yolop.png new file mode 100644 index 000000000000..1a6088452dc7 Binary files /dev/null and b/assets/images/yolop.png differ diff --git a/assets/main.css b/assets/main.css new file mode 100644 index 000000000000..aa7eb1db6cdd --- /dev/null +++ b/assets/main.css @@ -0,0 +1,6 @@ +/*! + * Bootstrap v4.3.1 (https://getbootstrap.com/) + * Copyright 2011-2019 The Bootstrap Authors + * Copyright 2011-2019 Twitter, Inc. 
+ * Licensed under MIT (https://github.com/twbs/bootstrap/blob/master/LICENSE) + */:root{--blue: #007bff;--indigo: #6610f2;--purple: #6f42c1;--pink: #e83e8c;--red: #dc3545;--orange: #fd7e14;--yellow: #ffc107;--green: #28a745;--teal: #20c997;--cyan: #17a2b8;--white: #fff;--gray: #6c757d;--gray-dark: #343a40;--primary: #007bff;--secondary: #6c757d;--success: #28a745;--info: #17a2b8;--warning: #ffc107;--danger: #dc3545;--light: #f8f9fa;--dark: #343a40;--breakpoint-xs: 0;--breakpoint-sm: 576px;--breakpoint-md: 768px;--breakpoint-lg: 992px;--breakpoint-xl: 1200px;--font-family-sans-serif: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, "Helvetica Neue", Arial, "Noto Sans", sans-serif, "Apple Color Emoji", "Segoe UI Emoji", "Segoe UI Symbol", "Noto Color Emoji";--font-family-monospace: SFMono-Regular, Menlo, Monaco, Consolas, "Liberation Mono", "Courier New", monospace}*,*::before,*::after{box-sizing:border-box}html{font-family:sans-serif;line-height:1.15;-webkit-text-size-adjust:100%;-webkit-tap-highlight-color:transparent}article,aside,figcaption,figure,footer,header,hgroup,main,nav,section{display:block}body{margin:0;font-family:-apple-system,BlinkMacSystemFont,"Segoe UI",Roboto,"Helvetica Neue",Arial,"Noto Sans",sans-serif,"Apple Color Emoji","Segoe UI Emoji","Segoe UI Symbol","Noto Color Emoji";font-size:1rem;font-weight:400;line-height:1.5;color:#212529;text-align:left;background-color:#fff}[tabindex="-1"]:focus{outline:0 !important}hr{box-sizing:content-box;height:0;overflow:visible}h1,h2,h3,h4,h5,h6{margin-top:0;margin-bottom:.5rem}p{margin-top:0;margin-bottom:1rem}abbr[title],abbr[data-original-title]{text-decoration:underline;-webkit-text-decoration:underline dotted;text-decoration:underline dotted;cursor:help;border-bottom:0;-webkit-text-decoration-skip-ink:none;text-decoration-skip-ink:none}address{margin-bottom:1rem;font-style:normal;line-height:inherit}ol,ul,dl{margin-top:0;margin-bottom:1rem}ol ol,ul ul,ol ul,ul ol{margin-bottom:0}dt{font-weight:700}dd{margin-bottom:.5rem;margin-left:0}blockquote{margin:0 0 1rem}b,strong{font-weight:bolder}small{font-size:80%}sub,sup{position:relative;font-size:75%;line-height:0;vertical-align:baseline}sub{bottom:-.25em}sup{top:-.5em}a{color:#007bff;text-decoration:none;background-color:transparent}a:hover{color:#0056b3;text-decoration:underline}a:not([href]):not([tabindex]){color:inherit;text-decoration:none}a:not([href]):not([tabindex]):hover,a:not([href]):not([tabindex]):focus{color:inherit;text-decoration:none}a:not([href]):not([tabindex]):focus{outline:0}pre,code,kbd,samp{font-family:SFMono-Regular,Menlo,Monaco,Consolas,"Liberation Mono","Courier New",monospace;font-size:1em}pre{margin-top:0;margin-bottom:1rem;overflow:auto}figure{margin:0 0 1rem}img{vertical-align:middle;border-style:none}svg{overflow:hidden;vertical-align:middle}table{border-collapse:collapse}caption{padding-top:.75rem;padding-bottom:.75rem;color:#6c757d;text-align:left;caption-side:bottom}th{text-align:inherit}label{display:inline-block;margin-bottom:.5rem}button{border-radius:0}button:focus{outline:1px dotted;outline:5px auto 
-webkit-focus-ring-color}input,button,select,optgroup,textarea{margin:0;font-family:inherit;font-size:inherit;line-height:inherit}button,input{overflow:visible}button,select{text-transform:none}select{word-wrap:normal}button,[type="button"],[type="reset"],[type="submit"]{-webkit-appearance:button}button:not(:disabled),[type="button"]:not(:disabled),[type="reset"]:not(:disabled),[type="submit"]:not(:disabled){cursor:pointer}button::-moz-focus-inner,[type="button"]::-moz-focus-inner,[type="reset"]::-moz-focus-inner,[type="submit"]::-moz-focus-inner{padding:0;border-style:none}input[type="radio"],input[type="checkbox"]{box-sizing:border-box;padding:0}input[type="date"],input[type="time"],input[type="datetime-local"],input[type="month"]{-webkit-appearance:listbox}textarea{overflow:auto;resize:vertical}fieldset{min-width:0;padding:0;margin:0;border:0}legend{display:block;width:100%;max-width:100%;padding:0;margin-bottom:.5rem;font-size:1.5rem;line-height:inherit;color:inherit;white-space:normal}progress{vertical-align:baseline}[type="number"]::-webkit-inner-spin-button,[type="number"]::-webkit-outer-spin-button{height:auto}[type="search"]{outline-offset:-2px;-webkit-appearance:none}[type="search"]::-webkit-search-decoration{-webkit-appearance:none}::-webkit-file-upload-button{font:inherit;-webkit-appearance:button}output{display:inline-block}summary{display:list-item;cursor:pointer}template{display:none}[hidden]{display:none !important}h1,h2,h3,h4,h5,h6,.h1,.h2,.h3,.h4,.h5,.h6{margin-bottom:.5rem;font-weight:500;line-height:1.2}h1,.h1{font-size:2.5rem}h2,.h2{font-size:2rem}h3,.h3{font-size:1.75rem}h4,.h4{font-size:1.5rem}h5,.h5{font-size:1.25rem}h6,.h6{font-size:1rem}.lead{font-size:1.25rem;font-weight:300}.display-1{font-size:6rem;font-weight:300;line-height:1.2}.display-2{font-size:5.5rem;font-weight:300;line-height:1.2}.display-3{font-size:4.5rem;font-weight:300;line-height:1.2}.display-4{font-size:3.5rem;font-weight:300;line-height:1.2}hr{margin-top:1rem;margin-bottom:1rem;border:0;border-top:1px solid rgba(0,0,0,0.1)}small,.small{font-size:80%;font-weight:400}mark,.mark{padding:.2em;background-color:#fcf8e3}.list-unstyled{padding-left:0;list-style:none}.list-inline{padding-left:0;list-style:none}.list-inline-item{display:inline-block}.list-inline-item:not(:last-child){margin-right:.5rem}.initialism{font-size:90%;text-transform:uppercase}.blockquote{margin-bottom:1rem;font-size:1.25rem}.blockquote-footer{display:block;font-size:80%;color:#6c757d}.blockquote-footer::before{content:"\2014\00A0"}.img-fluid{max-width:100%;height:auto}.img-thumbnail{padding:.25rem;background-color:#fff;border:1px solid #dee2e6;border-radius:.25rem;max-width:100%;height:auto}.figure{display:inline-block}.figure-img{margin-bottom:.5rem;line-height:1}.figure-caption{font-size:90%;color:#6c757d}code{font-size:87.5%;color:#e83e8c;word-break:break-word}a>code{color:inherit}kbd{padding:.2rem .4rem;font-size:87.5%;color:#fff;background-color:#212529;border-radius:.2rem}kbd kbd{padding:0;font-size:100%;font-weight:700}pre{display:block;font-size:87.5%;color:#212529}pre code{font-size:inherit;color:inherit;word-break:normal}.pre-scrollable{max-height:340px;overflow-y:scroll}.container{width:100%;padding-right:15px;padding-left:15px;margin-right:auto;margin-left:auto}@media (min-width: 576px){.container{max-width:540px}}@media (min-width: 768px){.container{max-width:720px}}@media (min-width: 992px){.container{max-width:960px}}@media (min-width: 
1200px){.container{max-width:1140px}}.container-fluid{width:100%;padding-right:15px;padding-left:15px;margin-right:auto;margin-left:auto}.row{display:flex;flex-wrap:wrap;margin-right:-15px;margin-left:-15px}.no-gutters{margin-right:0;margin-left:0}.no-gutters>.col,.no-gutters>[class*="col-"]{padding-right:0;padding-left:0}.col-1,.col-2,.col-3,.col-4,.col-5,.col-6,.col-7,.col-8,.col-9,.col-10,.col-11,.col-12,.col,.col-auto,.col-sm-1,.col-sm-2,.col-sm-3,.col-sm-4,.col-sm-5,.col-sm-6,.col-sm-7,.col-sm-8,.col-sm-9,.col-sm-10,.col-sm-11,.col-sm-12,.col-sm,.col-sm-auto,.col-md-1,.col-md-2,.col-md-3,.col-md-4,.col-md-5,.col-md-6,.col-md-7,.col-md-8,.col-md-9,.col-md-10,.col-md-11,.col-md-12,.col-md,.col-md-auto,.col-lg-1,.col-lg-2,.col-lg-3,.col-lg-4,.col-lg-5,.col-lg-6,.col-lg-7,.col-lg-8,.col-lg-9,.col-lg-10,.col-lg-11,.col-lg-12,.col-lg,.col-lg-auto,.col-xl-1,.col-xl-2,.col-xl-3,.col-xl-4,.col-xl-5,.col-xl-6,.col-xl-7,.col-xl-8,.col-xl-9,.col-xl-10,.col-xl-11,.col-xl-12,.col-xl,.col-xl-auto{position:relative;width:100%;padding-right:15px;padding-left:15px}.col{flex-basis:0;flex-grow:1;max-width:100%}.col-auto{flex:0 0 auto;width:auto;max-width:100%}.col-1{flex:0 0 8.3333333333%;max-width:8.3333333333%}.col-2{flex:0 0 16.6666666667%;max-width:16.6666666667%}.col-3{flex:0 0 25%;max-width:25%}.col-4{flex:0 0 33.3333333333%;max-width:33.3333333333%}.col-5{flex:0 0 41.6666666667%;max-width:41.6666666667%}.col-6{flex:0 0 50%;max-width:50%}.col-7{flex:0 0 58.3333333333%;max-width:58.3333333333%}.col-8{flex:0 0 66.6666666667%;max-width:66.6666666667%}.col-9{flex:0 0 75%;max-width:75%}.col-10{flex:0 0 83.3333333333%;max-width:83.3333333333%}.col-11{flex:0 0 91.6666666667%;max-width:91.6666666667%}.col-12{flex:0 0 100%;max-width:100%}.order-first{order:-1}.order-last{order:13}.order-0{order:0}.order-1{order:1}.order-2{order:2}.order-3{order:3}.order-4{order:4}.order-5{order:5}.order-6{order:6}.order-7{order:7}.order-8{order:8}.order-9{order:9}.order-10{order:10}.order-11{order:11}.order-12{order:12}.offset-1{margin-left:8.3333333333%}.offset-2{margin-left:16.6666666667%}.offset-3{margin-left:25%}.offset-4{margin-left:33.3333333333%}.offset-5{margin-left:41.6666666667%}.offset-6{margin-left:50%}.offset-7{margin-left:58.3333333333%}.offset-8{margin-left:66.6666666667%}.offset-9{margin-left:75%}.offset-10{margin-left:83.3333333333%}.offset-11{margin-left:91.6666666667%}@media (min-width: 576px){.col-sm{flex-basis:0;flex-grow:1;max-width:100%}.col-sm-auto{flex:0 0 auto;width:auto;max-width:100%}.col-sm-1{flex:0 0 8.3333333333%;max-width:8.3333333333%}.col-sm-2{flex:0 0 16.6666666667%;max-width:16.6666666667%}.col-sm-3{flex:0 0 25%;max-width:25%}.col-sm-4{flex:0 0 33.3333333333%;max-width:33.3333333333%}.col-sm-5{flex:0 0 41.6666666667%;max-width:41.6666666667%}.col-sm-6{flex:0 0 50%;max-width:50%}.col-sm-7{flex:0 0 58.3333333333%;max-width:58.3333333333%}.col-sm-8{flex:0 0 66.6666666667%;max-width:66.6666666667%}.col-sm-9{flex:0 0 75%;max-width:75%}.col-sm-10{flex:0 0 83.3333333333%;max-width:83.3333333333%}.col-sm-11{flex:0 0 91.6666666667%;max-width:91.6666666667%}.col-sm-12{flex:0 0 
100%;max-width:100%}.order-sm-first{order:-1}.order-sm-last{order:13}.order-sm-0{order:0}.order-sm-1{order:1}.order-sm-2{order:2}.order-sm-3{order:3}.order-sm-4{order:4}.order-sm-5{order:5}.order-sm-6{order:6}.order-sm-7{order:7}.order-sm-8{order:8}.order-sm-9{order:9}.order-sm-10{order:10}.order-sm-11{order:11}.order-sm-12{order:12}.offset-sm-0{margin-left:0}.offset-sm-1{margin-left:8.3333333333%}.offset-sm-2{margin-left:16.6666666667%}.offset-sm-3{margin-left:25%}.offset-sm-4{margin-left:33.3333333333%}.offset-sm-5{margin-left:41.6666666667%}.offset-sm-6{margin-left:50%}.offset-sm-7{margin-left:58.3333333333%}.offset-sm-8{margin-left:66.6666666667%}.offset-sm-9{margin-left:75%}.offset-sm-10{margin-left:83.3333333333%}.offset-sm-11{margin-left:91.6666666667%}}@media (min-width: 768px){.col-md{flex-basis:0;flex-grow:1;max-width:100%}.col-md-auto{flex:0 0 auto;width:auto;max-width:100%}.col-md-1{flex:0 0 8.3333333333%;max-width:8.3333333333%}.col-md-2{flex:0 0 16.6666666667%;max-width:16.6666666667%}.col-md-3{flex:0 0 25%;max-width:25%}.col-md-4{flex:0 0 33.3333333333%;max-width:33.3333333333%}.col-md-5{flex:0 0 41.6666666667%;max-width:41.6666666667%}.col-md-6{flex:0 0 50%;max-width:50%}.col-md-7{flex:0 0 58.3333333333%;max-width:58.3333333333%}.col-md-8{flex:0 0 66.6666666667%;max-width:66.6666666667%}.col-md-9{flex:0 0 75%;max-width:75%}.col-md-10{flex:0 0 83.3333333333%;max-width:83.3333333333%}.col-md-11{flex:0 0 91.6666666667%;max-width:91.6666666667%}.col-md-12{flex:0 0 100%;max-width:100%}.order-md-first{order:-1}.order-md-last{order:13}.order-md-0{order:0}.order-md-1{order:1}.order-md-2{order:2}.order-md-3{order:3}.order-md-4{order:4}.order-md-5{order:5}.order-md-6{order:6}.order-md-7{order:7}.order-md-8{order:8}.order-md-9{order:9}.order-md-10{order:10}.order-md-11{order:11}.order-md-12{order:12}.offset-md-0{margin-left:0}.offset-md-1{margin-left:8.3333333333%}.offset-md-2{margin-left:16.6666666667%}.offset-md-3{margin-left:25%}.offset-md-4{margin-left:33.3333333333%}.offset-md-5{margin-left:41.6666666667%}.offset-md-6{margin-left:50%}.offset-md-7{margin-left:58.3333333333%}.offset-md-8{margin-left:66.6666666667%}.offset-md-9{margin-left:75%}.offset-md-10{margin-left:83.3333333333%}.offset-md-11{margin-left:91.6666666667%}}@media (min-width: 992px){.col-lg{flex-basis:0;flex-grow:1;max-width:100%}.col-lg-auto{flex:0 0 auto;width:auto;max-width:100%}.col-lg-1{flex:0 0 8.3333333333%;max-width:8.3333333333%}.col-lg-2{flex:0 0 16.6666666667%;max-width:16.6666666667%}.col-lg-3{flex:0 0 25%;max-width:25%}.col-lg-4{flex:0 0 33.3333333333%;max-width:33.3333333333%}.col-lg-5{flex:0 0 41.6666666667%;max-width:41.6666666667%}.col-lg-6{flex:0 0 50%;max-width:50%}.col-lg-7{flex:0 0 58.3333333333%;max-width:58.3333333333%}.col-lg-8{flex:0 0 66.6666666667%;max-width:66.6666666667%}.col-lg-9{flex:0 0 75%;max-width:75%}.col-lg-10{flex:0 0 83.3333333333%;max-width:83.3333333333%}.col-lg-11{flex:0 0 91.6666666667%;max-width:91.6666666667%}.col-lg-12{flex:0 0 
100%;max-width:100%}.order-lg-first{order:-1}.order-lg-last{order:13}.order-lg-0{order:0}.order-lg-1{order:1}.order-lg-2{order:2}.order-lg-3{order:3}.order-lg-4{order:4}.order-lg-5{order:5}.order-lg-6{order:6}.order-lg-7{order:7}.order-lg-8{order:8}.order-lg-9{order:9}.order-lg-10{order:10}.order-lg-11{order:11}.order-lg-12{order:12}.offset-lg-0{margin-left:0}.offset-lg-1{margin-left:8.3333333333%}.offset-lg-2{margin-left:16.6666666667%}.offset-lg-3{margin-left:25%}.offset-lg-4{margin-left:33.3333333333%}.offset-lg-5{margin-left:41.6666666667%}.offset-lg-6{margin-left:50%}.offset-lg-7{margin-left:58.3333333333%}.offset-lg-8{margin-left:66.6666666667%}.offset-lg-9{margin-left:75%}.offset-lg-10{margin-left:83.3333333333%}.offset-lg-11{margin-left:91.6666666667%}}@media (min-width: 1200px){.col-xl{flex-basis:0;flex-grow:1;max-width:100%}.col-xl-auto{flex:0 0 auto;width:auto;max-width:100%}.col-xl-1{flex:0 0 8.3333333333%;max-width:8.3333333333%}.col-xl-2{flex:0 0 16.6666666667%;max-width:16.6666666667%}.col-xl-3{flex:0 0 25%;max-width:25%}.col-xl-4{flex:0 0 33.3333333333%;max-width:33.3333333333%}.col-xl-5{flex:0 0 41.6666666667%;max-width:41.6666666667%}.col-xl-6{flex:0 0 50%;max-width:50%}.col-xl-7{flex:0 0 58.3333333333%;max-width:58.3333333333%}.col-xl-8{flex:0 0 66.6666666667%;max-width:66.6666666667%}.col-xl-9{flex:0 0 75%;max-width:75%}.col-xl-10{flex:0 0 83.3333333333%;max-width:83.3333333333%}.col-xl-11{flex:0 0 91.6666666667%;max-width:91.6666666667%}.col-xl-12{flex:0 0 100%;max-width:100%}.order-xl-first{order:-1}.order-xl-last{order:13}.order-xl-0{order:0}.order-xl-1{order:1}.order-xl-2{order:2}.order-xl-3{order:3}.order-xl-4{order:4}.order-xl-5{order:5}.order-xl-6{order:6}.order-xl-7{order:7}.order-xl-8{order:8}.order-xl-9{order:9}.order-xl-10{order:10}.order-xl-11{order:11}.order-xl-12{order:12}.offset-xl-0{margin-left:0}.offset-xl-1{margin-left:8.3333333333%}.offset-xl-2{margin-left:16.6666666667%}.offset-xl-3{margin-left:25%}.offset-xl-4{margin-left:33.3333333333%}.offset-xl-5{margin-left:41.6666666667%}.offset-xl-6{margin-left:50%}.offset-xl-7{margin-left:58.3333333333%}.offset-xl-8{margin-left:66.6666666667%}.offset-xl-9{margin-left:75%}.offset-xl-10{margin-left:83.3333333333%}.offset-xl-11{margin-left:91.6666666667%}}.table{width:100%;margin-bottom:1rem;color:#212529}.table th,.table td{padding:.75rem;vertical-align:top;border-top:1px solid #dee2e6}.table thead th{vertical-align:bottom;border-bottom:2px solid #dee2e6}.table tbody+tbody{border-top:2px solid #dee2e6}.table-sm th,.table-sm td{padding:.3rem}.table-bordered{border:1px solid #dee2e6}.table-bordered th,.table-bordered td{border:1px solid #dee2e6}.table-bordered thead th,.table-bordered thead td{border-bottom-width:2px}.table-borderless th,.table-borderless td,.table-borderless thead th,.table-borderless tbody+tbody{border:0}.table-striped tbody tr:nth-of-type(odd){background-color:rgba(0,0,0,0.05)}.table-hover tbody tr:hover{color:#212529;background-color:rgba(0,0,0,0.075)}.table-primary,.table-primary>th,.table-primary>td{background-color:#b8daff}.table-primary th,.table-primary td,.table-primary thead th,.table-primary tbody+tbody{border-color:#7abaff}.table-hover .table-primary:hover{background-color:#9fcdff}.table-hover .table-primary:hover>td,.table-hover .table-primary:hover>th{background-color:#9fcdff}.table-secondary,.table-secondary>th,.table-secondary>td{background-color:#d6d8db}.table-secondary th,.table-secondary td,.table-secondary thead th,.table-secondary 
tbody+tbody{border-color:#b3b7bb}.table-hover .table-secondary:hover{background-color:#c8cbcf}.table-hover .table-secondary:hover>td,.table-hover .table-secondary:hover>th{background-color:#c8cbcf}.table-success,.table-success>th,.table-success>td{background-color:#c3e6cb}.table-success th,.table-success td,.table-success thead th,.table-success tbody+tbody{border-color:#8fd19e}.table-hover .table-success:hover{background-color:#b1dfbb}.table-hover .table-success:hover>td,.table-hover .table-success:hover>th{background-color:#b1dfbb}.table-info,.table-info>th,.table-info>td{background-color:#bee5eb}.table-info th,.table-info td,.table-info thead th,.table-info tbody+tbody{border-color:#86cfda}.table-hover .table-info:hover{background-color:#abdde5}.table-hover .table-info:hover>td,.table-hover .table-info:hover>th{background-color:#abdde5}.table-warning,.table-warning>th,.table-warning>td{background-color:#ffeeba}.table-warning th,.table-warning td,.table-warning thead th,.table-warning tbody+tbody{border-color:#ffdf7e}.table-hover .table-warning:hover{background-color:#ffe8a1}.table-hover .table-warning:hover>td,.table-hover .table-warning:hover>th{background-color:#ffe8a1}.table-danger,.table-danger>th,.table-danger>td{background-color:#f5c6cb}.table-danger th,.table-danger td,.table-danger thead th,.table-danger tbody+tbody{border-color:#ed969e}.table-hover .table-danger:hover{background-color:#f1b0b7}.table-hover .table-danger:hover>td,.table-hover .table-danger:hover>th{background-color:#f1b0b7}.table-light,.table-light>th,.table-light>td{background-color:#fdfdfe}.table-light th,.table-light td,.table-light thead th,.table-light tbody+tbody{border-color:#fbfcfc}.table-hover .table-light:hover{background-color:#ececf6}.table-hover .table-light:hover>td,.table-hover .table-light:hover>th{background-color:#ececf6}.table-dark,.table-dark>th,.table-dark>td{background-color:#c6c8ca}.table-dark th,.table-dark td,.table-dark thead th,.table-dark tbody+tbody{border-color:#95999c}.table-hover .table-dark:hover{background-color:#b9bbbe}.table-hover .table-dark:hover>td,.table-hover .table-dark:hover>th{background-color:#b9bbbe}.table-active,.table-active>th,.table-active>td{background-color:rgba(0,0,0,0.075)}.table-hover .table-active:hover{background-color:rgba(0,0,0,0.075)}.table-hover .table-active:hover>td,.table-hover .table-active:hover>th{background-color:rgba(0,0,0,0.075)}.table .thead-dark th{color:#fff;background-color:#343a40;border-color:#454d55}.table .thead-light th{color:#495057;background-color:#e9ecef;border-color:#dee2e6}.table-dark{color:#fff;background-color:#343a40}.table-dark th,.table-dark td,.table-dark thead th{border-color:#454d55}.table-dark.table-bordered{border:0}.table-dark.table-striped tbody tr:nth-of-type(odd){background-color:rgba(255,255,255,0.05)}.table-dark.table-hover tbody tr:hover{color:#fff;background-color:rgba(255,255,255,0.075)}@media (max-width: 575.98px){.table-responsive-sm{display:block;width:100%;overflow-x:auto;-webkit-overflow-scrolling:touch}.table-responsive-sm>.table-bordered{border:0}}@media (max-width: 767.98px){.table-responsive-md{display:block;width:100%;overflow-x:auto;-webkit-overflow-scrolling:touch}.table-responsive-md>.table-bordered{border:0}}@media (max-width: 991.98px){.table-responsive-lg{display:block;width:100%;overflow-x:auto;-webkit-overflow-scrolling:touch}.table-responsive-lg>.table-bordered{border:0}}@media (max-width: 
1199.98px){.table-responsive-xl{display:block;width:100%;overflow-x:auto;-webkit-overflow-scrolling:touch}.table-responsive-xl>.table-bordered{border:0}}.table-responsive{display:block;width:100%;overflow-x:auto;-webkit-overflow-scrolling:touch}.table-responsive>.table-bordered{border:0}.form-control{display:block;width:100%;height:calc(1.5em + .75rem + 2px);padding:.375rem .75rem;font-size:1rem;font-weight:400;line-height:1.5;color:#495057;background-color:#fff;background-clip:padding-box;border:1px solid #ced4da;border-radius:.25rem;transition:border-color 0.15s ease-in-out,box-shadow 0.15s ease-in-out}@media (prefers-reduced-motion: reduce){.form-control{transition:none}}.form-control::-ms-expand{background-color:transparent;border:0}.form-control:focus{color:#495057;background-color:#fff;border-color:#80bdff;outline:0;box-shadow:0 0 0 .2rem rgba(0,123,255,0.25)}.form-control::-moz-placeholder{color:#6c757d;opacity:1}.form-control:-ms-input-placeholder{color:#6c757d;opacity:1}.form-control::-ms-input-placeholder{color:#6c757d;opacity:1}.form-control::placeholder{color:#6c757d;opacity:1}.form-control:disabled,.form-control[readonly]{background-color:#e9ecef;opacity:1}select.form-control:focus::-ms-value{color:#495057;background-color:#fff}.form-control-file,.form-control-range{display:block;width:100%}.col-form-label{padding-top:calc(.375rem + 1px);padding-bottom:calc(.375rem + 1px);margin-bottom:0;font-size:inherit;line-height:1.5}.col-form-label-lg{padding-top:calc(.5rem + 1px);padding-bottom:calc(.5rem + 1px);font-size:1.25rem;line-height:1.5}.col-form-label-sm{padding-top:calc(.25rem + 1px);padding-bottom:calc(.25rem + 1px);font-size:.875rem;line-height:1.5}.form-control-plaintext{display:block;width:100%;padding-top:.375rem;padding-bottom:.375rem;margin-bottom:0;line-height:1.5;color:#212529;background-color:transparent;border:solid transparent;border-width:1px 0}.form-control-plaintext.form-control-sm,.form-control-plaintext.form-control-lg{padding-right:0;padding-left:0}.form-control-sm{height:calc(1.5em + .5rem + 2px);padding:.25rem .5rem;font-size:.875rem;line-height:1.5;border-radius:.2rem}.form-control-lg{height:calc(1.5em + 1rem + 2px);padding:.5rem 1rem;font-size:1.25rem;line-height:1.5;border-radius:.3rem}select.form-control[size],select.form-control[multiple]{height:auto}textarea.form-control{height:auto}.form-group{margin-bottom:1rem}.form-text{display:block;margin-top:.25rem}.form-row{display:flex;flex-wrap:wrap;margin-right:-5px;margin-left:-5px}.form-row>.col,.form-row>[class*="col-"]{padding-right:5px;padding-left:5px}.form-check{position:relative;display:block;padding-left:1.25rem}.form-check-input{position:absolute;margin-top:.3rem;margin-left:-1.25rem}.form-check-input:disabled ~ .form-check-label{color:#6c757d}.form-check-label{margin-bottom:0}.form-check-inline{display:inline-flex;align-items:center;padding-left:0;margin-right:.75rem}.form-check-inline .form-check-input{position:static;margin-top:0;margin-right:.3125rem;margin-left:0}.valid-feedback{display:none;width:100%;margin-top:.25rem;font-size:80%;color:#28a745}.valid-tooltip{position:absolute;top:100%;z-index:5;display:none;max-width:100%;padding:.25rem .5rem;margin-top:.1rem;font-size:.875rem;line-height:1.5;color:#fff;background-color:rgba(40,167,69,0.9);border-radius:.25rem}.was-validated .form-control:valid,.form-control.is-valid{border-color:#28a745;padding-right:calc(1.5em + .75rem);background-image:url("data:image/svg+xml,%3csvg xmlns='http://www.w3.org/2000/svg' viewBox='0 0 8 8'%3e%3cpath 
fill='%2328a745' d='M2.3 6.73L.6 4.53c-.4-1.04.46-1.4 1.1-.8l1.1 1.4 3.4-3.8c.6-.63 1.6-.27 1.2.7l-4 4.6c-.43.5-.8.4-1.1.1z'/%3e%3c/svg%3e");background-repeat:no-repeat;background-position:center right calc(.375em + .1875rem);background-size:calc(.75em + .375rem) calc(.75em + .375rem)}.was-validated .form-control:valid:focus,.form-control.is-valid:focus{border-color:#28a745;box-shadow:0 0 0 .2rem rgba(40,167,69,0.25)}.was-validated .form-control:valid ~ .valid-feedback,.was-validated .form-control:valid ~ .valid-tooltip,.form-control.is-valid ~ .valid-feedback,.form-control.is-valid ~ .valid-tooltip{display:block}.was-validated textarea.form-control:valid,textarea.form-control.is-valid{padding-right:calc(1.5em + .75rem);background-position:top calc(.375em + .1875rem) right calc(.375em + .1875rem)}.was-validated .custom-select:valid,.custom-select.is-valid{border-color:#28a745;padding-right:calc((1em + .75rem) * 3 / 4 + 1.75rem);background:url("data:image/svg+xml,%3csvg xmlns='http://www.w3.org/2000/svg' viewBox='0 0 4 5'%3e%3cpath fill='%23343a40' d='M2 0L0 2h4zm0 5L0 3h4z'/%3e%3c/svg%3e") no-repeat right .75rem center/8px 10px,url("data:image/svg+xml,%3csvg xmlns='http://www.w3.org/2000/svg' viewBox='0 0 8 8'%3e%3cpath fill='%2328a745' d='M2.3 6.73L.6 4.53c-.4-1.04.46-1.4 1.1-.8l1.1 1.4 3.4-3.8c.6-.63 1.6-.27 1.2.7l-4 4.6c-.43.5-.8.4-1.1.1z'/%3e%3c/svg%3e") #fff no-repeat center right 1.75rem/calc(.75em + .375rem) calc(.75em + .375rem)}.was-validated .custom-select:valid:focus,.custom-select.is-valid:focus{border-color:#28a745;box-shadow:0 0 0 .2rem rgba(40,167,69,0.25)}.was-validated .custom-select:valid ~ .valid-feedback,.was-validated .custom-select:valid ~ .valid-tooltip,.custom-select.is-valid ~ .valid-feedback,.custom-select.is-valid ~ .valid-tooltip{display:block}.was-validated .form-control-file:valid ~ .valid-feedback,.was-validated .form-control-file:valid ~ .valid-tooltip,.form-control-file.is-valid ~ .valid-feedback,.form-control-file.is-valid ~ .valid-tooltip{display:block}.was-validated .form-check-input:valid ~ .form-check-label,.form-check-input.is-valid ~ .form-check-label{color:#28a745}.was-validated .form-check-input:valid ~ .valid-feedback,.was-validated .form-check-input:valid ~ .valid-tooltip,.form-check-input.is-valid ~ .valid-feedback,.form-check-input.is-valid ~ .valid-tooltip{display:block}.was-validated .custom-control-input:valid ~ .custom-control-label,.custom-control-input.is-valid ~ .custom-control-label{color:#28a745}.was-validated .custom-control-input:valid ~ .custom-control-label::before,.custom-control-input.is-valid ~ .custom-control-label::before{border-color:#28a745}.was-validated .custom-control-input:valid ~ .valid-feedback,.was-validated .custom-control-input:valid ~ .valid-tooltip,.custom-control-input.is-valid ~ .valid-feedback,.custom-control-input.is-valid ~ .valid-tooltip{display:block}.was-validated .custom-control-input:valid:checked ~ .custom-control-label::before,.custom-control-input.is-valid:checked ~ .custom-control-label::before{border-color:#34ce57;background-color:#34ce57}.was-validated .custom-control-input:valid:focus ~ .custom-control-label::before,.custom-control-input.is-valid:focus ~ .custom-control-label::before{box-shadow:0 0 0 .2rem rgba(40,167,69,0.25)}.was-validated .custom-control-input:valid:focus:not(:checked) ~ .custom-control-label::before,.custom-control-input.is-valid:focus:not(:checked) ~ .custom-control-label::before{border-color:#28a745}.was-validated .custom-file-input:valid ~ 
.custom-file-label,.custom-file-input.is-valid ~ .custom-file-label{border-color:#28a745}.was-validated .custom-file-input:valid ~ .valid-feedback,.was-validated .custom-file-input:valid ~ .valid-tooltip,.custom-file-input.is-valid ~ .valid-feedback,.custom-file-input.is-valid ~ .valid-tooltip{display:block}.was-validated .custom-file-input:valid:focus ~ .custom-file-label,.custom-file-input.is-valid:focus ~ .custom-file-label{border-color:#28a745;box-shadow:0 0 0 .2rem rgba(40,167,69,0.25)}.invalid-feedback{display:none;width:100%;margin-top:.25rem;font-size:80%;color:#dc3545}.invalid-tooltip{position:absolute;top:100%;z-index:5;display:none;max-width:100%;padding:.25rem .5rem;margin-top:.1rem;font-size:.875rem;line-height:1.5;color:#fff;background-color:rgba(220,53,69,0.9);border-radius:.25rem}.was-validated .form-control:invalid,.form-control.is-invalid{border-color:#dc3545;padding-right:calc(1.5em + .75rem);background-image:url("data:image/svg+xml,%3csvg xmlns='http://www.w3.org/2000/svg' fill='%23dc3545' viewBox='-2 -2 7 7'%3e%3cpath stroke='%23dc3545' d='M0 0l3 3m0-3L0 3'/%3e%3ccircle r='.5'/%3e%3ccircle cx='3' r='.5'/%3e%3ccircle cy='3' r='.5'/%3e%3ccircle cx='3' cy='3' r='.5'/%3e%3c/svg%3E");background-repeat:no-repeat;background-position:center right calc(.375em + .1875rem);background-size:calc(.75em + .375rem) calc(.75em + .375rem)}.was-validated .form-control:invalid:focus,.form-control.is-invalid:focus{border-color:#dc3545;box-shadow:0 0 0 .2rem rgba(220,53,69,0.25)}.was-validated .form-control:invalid ~ .invalid-feedback,.was-validated .form-control:invalid ~ .invalid-tooltip,.form-control.is-invalid ~ .invalid-feedback,.form-control.is-invalid ~ .invalid-tooltip{display:block}.was-validated textarea.form-control:invalid,textarea.form-control.is-invalid{padding-right:calc(1.5em + .75rem);background-position:top calc(.375em + .1875rem) right calc(.375em + .1875rem)}.was-validated .custom-select:invalid,.custom-select.is-invalid{border-color:#dc3545;padding-right:calc((1em + .75rem) * 3 / 4 + 1.75rem);background:url("data:image/svg+xml,%3csvg xmlns='http://www.w3.org/2000/svg' viewBox='0 0 4 5'%3e%3cpath fill='%23343a40' d='M2 0L0 2h4zm0 5L0 3h4z'/%3e%3c/svg%3e") no-repeat right .75rem center/8px 10px,url("data:image/svg+xml,%3csvg xmlns='http://www.w3.org/2000/svg' fill='%23dc3545' viewBox='-2 -2 7 7'%3e%3cpath stroke='%23dc3545' d='M0 0l3 3m0-3L0 3'/%3e%3ccircle r='.5'/%3e%3ccircle cx='3' r='.5'/%3e%3ccircle cy='3' r='.5'/%3e%3ccircle cx='3' cy='3' r='.5'/%3e%3c/svg%3E") #fff no-repeat center right 1.75rem/calc(.75em + .375rem) calc(.75em + .375rem)}.was-validated .custom-select:invalid:focus,.custom-select.is-invalid:focus{border-color:#dc3545;box-shadow:0 0 0 .2rem rgba(220,53,69,0.25)}.was-validated .custom-select:invalid ~ .invalid-feedback,.was-validated .custom-select:invalid ~ .invalid-tooltip,.custom-select.is-invalid ~ .invalid-feedback,.custom-select.is-invalid ~ .invalid-tooltip{display:block}.was-validated .form-control-file:invalid ~ .invalid-feedback,.was-validated .form-control-file:invalid ~ .invalid-tooltip,.form-control-file.is-invalid ~ .invalid-feedback,.form-control-file.is-invalid ~ .invalid-tooltip{display:block}.was-validated .form-check-input:invalid ~ .form-check-label,.form-check-input.is-invalid ~ .form-check-label{color:#dc3545}.was-validated .form-check-input:invalid ~ .invalid-feedback,.was-validated .form-check-input:invalid ~ .invalid-tooltip,.form-check-input.is-invalid ~ .invalid-feedback,.form-check-input.is-invalid ~ 
.invalid-tooltip{display:block}.was-validated .custom-control-input:invalid ~ .custom-control-label,.custom-control-input.is-invalid ~ .custom-control-label{color:#dc3545}.was-validated .custom-control-input:invalid ~ .custom-control-label::before,.custom-control-input.is-invalid ~ .custom-control-label::before{border-color:#dc3545}.was-validated .custom-control-input:invalid ~ .invalid-feedback,.was-validated .custom-control-input:invalid ~ .invalid-tooltip,.custom-control-input.is-invalid ~ .invalid-feedback,.custom-control-input.is-invalid ~ .invalid-tooltip{display:block}.was-validated .custom-control-input:invalid:checked ~ .custom-control-label::before,.custom-control-input.is-invalid:checked ~ .custom-control-label::before{border-color:#e4606d;background-color:#e4606d}.was-validated .custom-control-input:invalid:focus ~ .custom-control-label::before,.custom-control-input.is-invalid:focus ~ .custom-control-label::before{box-shadow:0 0 0 .2rem rgba(220,53,69,0.25)}.was-validated .custom-control-input:invalid:focus:not(:checked) ~ .custom-control-label::before,.custom-control-input.is-invalid:focus:not(:checked) ~ .custom-control-label::before{border-color:#dc3545}.was-validated .custom-file-input:invalid ~ .custom-file-label,.custom-file-input.is-invalid ~ .custom-file-label{border-color:#dc3545}.was-validated .custom-file-input:invalid ~ .invalid-feedback,.was-validated .custom-file-input:invalid ~ .invalid-tooltip,.custom-file-input.is-invalid ~ .invalid-feedback,.custom-file-input.is-invalid ~ .invalid-tooltip{display:block}.was-validated .custom-file-input:invalid:focus ~ .custom-file-label,.custom-file-input.is-invalid:focus ~ .custom-file-label{border-color:#dc3545;box-shadow:0 0 0 .2rem rgba(220,53,69,0.25)}.form-inline{display:flex;flex-flow:row wrap;align-items:center}.form-inline .form-check{width:100%}@media (min-width: 576px){.form-inline label{display:flex;align-items:center;justify-content:center;margin-bottom:0}.form-inline .form-group{display:flex;flex:0 0 auto;flex-flow:row wrap;align-items:center;margin-bottom:0}.form-inline .form-control{display:inline-block;width:auto;vertical-align:middle}.form-inline .form-control-plaintext{display:inline-block}.form-inline .input-group,.form-inline .custom-select{width:auto}.form-inline .form-check{display:flex;align-items:center;justify-content:center;width:auto;padding-left:0}.form-inline .form-check-input{position:relative;flex-shrink:0;margin-top:0;margin-right:.25rem;margin-left:0}.form-inline .custom-control{align-items:center;justify-content:center}.form-inline .custom-control-label{margin-bottom:0}}.btn{display:inline-block;font-weight:400;color:#212529;text-align:center;vertical-align:middle;-webkit-user-select:none;-moz-user-select:none;-ms-user-select:none;user-select:none;background-color:transparent;border:1px solid transparent;padding:.375rem .75rem;font-size:1rem;line-height:1.5;border-radius:.25rem;transition:color 0.15s ease-in-out,background-color 0.15s ease-in-out,border-color 0.15s ease-in-out,box-shadow 0.15s ease-in-out}@media (prefers-reduced-motion: reduce){.btn{transition:none}}.btn:hover{color:#212529;text-decoration:none}.btn:focus,.btn.focus{outline:0;box-shadow:0 0 0 .2rem rgba(0,123,255,0.25)}.btn.disabled,.btn:disabled{opacity:.65}a.btn.disabled,fieldset:disabled a.btn{pointer-events:none}.btn-primary{color:#fff;background-color:#007bff;border-color:#007bff}.btn-primary:hover{color:#fff;background-color:#0069d9;border-color:#0062cc}.btn-primary:focus,.btn-primary.focus{box-shadow:0 0 0 .2rem 
rgba(38,143,255,0.5)}.btn-primary.disabled,.btn-primary:disabled{color:#fff;background-color:#007bff;border-color:#007bff}.btn-primary:not(:disabled):not(.disabled):active,.btn-primary:not(:disabled):not(.disabled).active,.show>.btn-primary.dropdown-toggle{color:#fff;background-color:#0062cc;border-color:#005cbf}.btn-primary:not(:disabled):not(.disabled):active:focus,.btn-primary:not(:disabled):not(.disabled).active:focus,.show>.btn-primary.dropdown-toggle:focus{box-shadow:0 0 0 .2rem rgba(38,143,255,0.5)}.btn-secondary{color:#fff;background-color:#6c757d;border-color:#6c757d}.btn-secondary:hover{color:#fff;background-color:#5a6268;border-color:#545b62}.btn-secondary:focus,.btn-secondary.focus{box-shadow:0 0 0 .2rem rgba(130,138,145,0.5)}.btn-secondary.disabled,.btn-secondary:disabled{color:#fff;background-color:#6c757d;border-color:#6c757d}.btn-secondary:not(:disabled):not(.disabled):active,.btn-secondary:not(:disabled):not(.disabled).active,.show>.btn-secondary.dropdown-toggle{color:#fff;background-color:#545b62;border-color:#4e555b}.btn-secondary:not(:disabled):not(.disabled):active:focus,.btn-secondary:not(:disabled):not(.disabled).active:focus,.show>.btn-secondary.dropdown-toggle:focus{box-shadow:0 0 0 .2rem rgba(130,138,145,0.5)}.btn-success{color:#fff;background-color:#28a745;border-color:#28a745}.btn-success:hover{color:#fff;background-color:#218838;border-color:#1e7e34}.btn-success:focus,.btn-success.focus{box-shadow:0 0 0 .2rem rgba(72,180,97,0.5)}.btn-success.disabled,.btn-success:disabled{color:#fff;background-color:#28a745;border-color:#28a745}.btn-success:not(:disabled):not(.disabled):active,.btn-success:not(:disabled):not(.disabled).active,.show>.btn-success.dropdown-toggle{color:#fff;background-color:#1e7e34;border-color:#1c7430}.btn-success:not(:disabled):not(.disabled):active:focus,.btn-success:not(:disabled):not(.disabled).active:focus,.show>.btn-success.dropdown-toggle:focus{box-shadow:0 0 0 .2rem rgba(72,180,97,0.5)}.btn-info{color:#fff;background-color:#17a2b8;border-color:#17a2b8}.btn-info:hover{color:#fff;background-color:#138496;border-color:#117a8b}.btn-info:focus,.btn-info.focus{box-shadow:0 0 0 .2rem rgba(58,176,195,0.5)}.btn-info.disabled,.btn-info:disabled{color:#fff;background-color:#17a2b8;border-color:#17a2b8}.btn-info:not(:disabled):not(.disabled):active,.btn-info:not(:disabled):not(.disabled).active,.show>.btn-info.dropdown-toggle{color:#fff;background-color:#117a8b;border-color:#10707f}.btn-info:not(:disabled):not(.disabled):active:focus,.btn-info:not(:disabled):not(.disabled).active:focus,.show>.btn-info.dropdown-toggle:focus{box-shadow:0 0 0 .2rem rgba(58,176,195,0.5)}.btn-warning{color:#212529;background-color:#ffc107;border-color:#ffc107}.btn-warning:hover{color:#212529;background-color:#e0a800;border-color:#d39e00}.btn-warning:focus,.btn-warning.focus{box-shadow:0 0 0 .2rem rgba(222,170,12,0.5)}.btn-warning.disabled,.btn-warning:disabled{color:#212529;background-color:#ffc107;border-color:#ffc107}.btn-warning:not(:disabled):not(.disabled):active,.btn-warning:not(:disabled):not(.disabled).active,.show>.btn-warning.dropdown-toggle{color:#212529;background-color:#d39e00;border-color:#c69500}.btn-warning:not(:disabled):not(.disabled):active:focus,.btn-warning:not(:disabled):not(.disabled).active:focus,.show>.btn-warning.dropdown-toggle:focus{box-shadow:0 0 0 .2rem 
rgba(222,170,12,0.5)}.btn-danger{color:#fff;background-color:#dc3545;border-color:#dc3545}.btn-danger:hover{color:#fff;background-color:#c82333;border-color:#bd2130}.btn-danger:focus,.btn-danger.focus{box-shadow:0 0 0 .2rem rgba(225,83,97,0.5)}.btn-danger.disabled,.btn-danger:disabled{color:#fff;background-color:#dc3545;border-color:#dc3545}.btn-danger:not(:disabled):not(.disabled):active,.btn-danger:not(:disabled):not(.disabled).active,.show>.btn-danger.dropdown-toggle{color:#fff;background-color:#bd2130;border-color:#b21f2d}.btn-danger:not(:disabled):not(.disabled):active:focus,.btn-danger:not(:disabled):not(.disabled).active:focus,.show>.btn-danger.dropdown-toggle:focus{box-shadow:0 0 0 .2rem rgba(225,83,97,0.5)}.btn-light{color:#212529;background-color:#f8f9fa;border-color:#f8f9fa}.btn-light:hover{color:#212529;background-color:#e2e6ea;border-color:#dae0e5}.btn-light:focus,.btn-light.focus{box-shadow:0 0 0 .2rem rgba(216,217,219,0.5)}.btn-light.disabled,.btn-light:disabled{color:#212529;background-color:#f8f9fa;border-color:#f8f9fa}.btn-light:not(:disabled):not(.disabled):active,.btn-light:not(:disabled):not(.disabled).active,.show>.btn-light.dropdown-toggle{color:#212529;background-color:#dae0e5;border-color:#d3d9df}.btn-light:not(:disabled):not(.disabled):active:focus,.btn-light:not(:disabled):not(.disabled).active:focus,.show>.btn-light.dropdown-toggle:focus{box-shadow:0 0 0 .2rem rgba(216,217,219,0.5)}.btn-dark{color:#fff;background-color:#343a40;border-color:#343a40}.btn-dark:hover{color:#fff;background-color:#23272b;border-color:#1d2124}.btn-dark:focus,.btn-dark.focus{box-shadow:0 0 0 .2rem rgba(82,88,93,0.5)}.btn-dark.disabled,.btn-dark:disabled{color:#fff;background-color:#343a40;border-color:#343a40}.btn-dark:not(:disabled):not(.disabled):active,.btn-dark:not(:disabled):not(.disabled).active,.show>.btn-dark.dropdown-toggle{color:#fff;background-color:#1d2124;border-color:#171a1d}.btn-dark:not(:disabled):not(.disabled):active:focus,.btn-dark:not(:disabled):not(.disabled).active:focus,.show>.btn-dark.dropdown-toggle:focus{box-shadow:0 0 0 .2rem rgba(82,88,93,0.5)}.btn-outline-primary{color:#007bff;border-color:#007bff}.btn-outline-primary:hover{color:#fff;background-color:#007bff;border-color:#007bff}.btn-outline-primary:focus,.btn-outline-primary.focus{box-shadow:0 0 0 .2rem rgba(0,123,255,0.5)}.btn-outline-primary.disabled,.btn-outline-primary:disabled{color:#007bff;background-color:transparent}.btn-outline-primary:not(:disabled):not(.disabled):active,.btn-outline-primary:not(:disabled):not(.disabled).active,.show>.btn-outline-primary.dropdown-toggle{color:#fff;background-color:#007bff;border-color:#007bff}.btn-outline-primary:not(:disabled):not(.disabled):active:focus,.btn-outline-primary:not(:disabled):not(.disabled).active:focus,.show>.btn-outline-primary.dropdown-toggle:focus{box-shadow:0 0 0 .2rem rgba(0,123,255,0.5)}.btn-outline-secondary{color:#6c757d;border-color:#6c757d}.btn-outline-secondary:hover{color:#fff;background-color:#6c757d;border-color:#6c757d}.btn-outline-secondary:focus,.btn-outline-secondary.focus{box-shadow:0 0 0 .2rem 
rgba(108,117,125,0.5)}.btn-outline-secondary.disabled,.btn-outline-secondary:disabled{color:#6c757d;background-color:transparent}.btn-outline-secondary:not(:disabled):not(.disabled):active,.btn-outline-secondary:not(:disabled):not(.disabled).active,.show>.btn-outline-secondary.dropdown-toggle{color:#fff;background-color:#6c757d;border-color:#6c757d}.btn-outline-secondary:not(:disabled):not(.disabled):active:focus,.btn-outline-secondary:not(:disabled):not(.disabled).active:focus,.show>.btn-outline-secondary.dropdown-toggle:focus{box-shadow:0 0 0 .2rem rgba(108,117,125,0.5)}.btn-outline-success{color:#28a745;border-color:#28a745}.btn-outline-success:hover{color:#fff;background-color:#28a745;border-color:#28a745}.btn-outline-success:focus,.btn-outline-success.focus{box-shadow:0 0 0 .2rem rgba(40,167,69,0.5)}.btn-outline-success.disabled,.btn-outline-success:disabled{color:#28a745;background-color:transparent}.btn-outline-success:not(:disabled):not(.disabled):active,.btn-outline-success:not(:disabled):not(.disabled).active,.show>.btn-outline-success.dropdown-toggle{color:#fff;background-color:#28a745;border-color:#28a745}.btn-outline-success:not(:disabled):not(.disabled):active:focus,.btn-outline-success:not(:disabled):not(.disabled).active:focus,.show>.btn-outline-success.dropdown-toggle:focus{box-shadow:0 0 0 .2rem rgba(40,167,69,0.5)}.btn-outline-info{color:#17a2b8;border-color:#17a2b8}.btn-outline-info:hover{color:#fff;background-color:#17a2b8;border-color:#17a2b8}.btn-outline-info:focus,.btn-outline-info.focus{box-shadow:0 0 0 .2rem rgba(23,162,184,0.5)}.btn-outline-info.disabled,.btn-outline-info:disabled{color:#17a2b8;background-color:transparent}.btn-outline-info:not(:disabled):not(.disabled):active,.btn-outline-info:not(:disabled):not(.disabled).active,.show>.btn-outline-info.dropdown-toggle{color:#fff;background-color:#17a2b8;border-color:#17a2b8}.btn-outline-info:not(:disabled):not(.disabled):active:focus,.btn-outline-info:not(:disabled):not(.disabled).active:focus,.show>.btn-outline-info.dropdown-toggle:focus{box-shadow:0 0 0 .2rem rgba(23,162,184,0.5)}.btn-outline-warning{color:#ffc107;border-color:#ffc107}.btn-outline-warning:hover{color:#212529;background-color:#ffc107;border-color:#ffc107}.btn-outline-warning:focus,.btn-outline-warning.focus{box-shadow:0 0 0 .2rem rgba(255,193,7,0.5)}.btn-outline-warning.disabled,.btn-outline-warning:disabled{color:#ffc107;background-color:transparent}.btn-outline-warning:not(:disabled):not(.disabled):active,.btn-outline-warning:not(:disabled):not(.disabled).active,.show>.btn-outline-warning.dropdown-toggle{color:#212529;background-color:#ffc107;border-color:#ffc107}.btn-outline-warning:not(:disabled):not(.disabled):active:focus,.btn-outline-warning:not(:disabled):not(.disabled).active:focus,.show>.btn-outline-warning.dropdown-toggle:focus{box-shadow:0 0 0 .2rem rgba(255,193,7,0.5)}.btn-outline-danger{color:#dc3545;border-color:#dc3545}.btn-outline-danger:hover{color:#fff;background-color:#dc3545;border-color:#dc3545}.btn-outline-danger:focus,.btn-outline-danger.focus{box-shadow:0 0 0 .2rem 
rgba(220,53,69,0.5)}.btn-outline-danger.disabled,.btn-outline-danger:disabled{color:#dc3545;background-color:transparent}.btn-outline-danger:not(:disabled):not(.disabled):active,.btn-outline-danger:not(:disabled):not(.disabled).active,.show>.btn-outline-danger.dropdown-toggle{color:#fff;background-color:#dc3545;border-color:#dc3545}.btn-outline-danger:not(:disabled):not(.disabled):active:focus,.btn-outline-danger:not(:disabled):not(.disabled).active:focus,.show>.btn-outline-danger.dropdown-toggle:focus{box-shadow:0 0 0 .2rem rgba(220,53,69,0.5)}.btn-outline-light{color:#f8f9fa;border-color:#f8f9fa}.btn-outline-light:hover{color:#212529;background-color:#f8f9fa;border-color:#f8f9fa}.btn-outline-light:focus,.btn-outline-light.focus{box-shadow:0 0 0 .2rem rgba(248,249,250,0.5)}.btn-outline-light.disabled,.btn-outline-light:disabled{color:#f8f9fa;background-color:transparent}.btn-outline-light:not(:disabled):not(.disabled):active,.btn-outline-light:not(:disabled):not(.disabled).active,.show>.btn-outline-light.dropdown-toggle{color:#212529;background-color:#f8f9fa;border-color:#f8f9fa}.btn-outline-light:not(:disabled):not(.disabled):active:focus,.btn-outline-light:not(:disabled):not(.disabled).active:focus,.show>.btn-outline-light.dropdown-toggle:focus{box-shadow:0 0 0 .2rem rgba(248,249,250,0.5)}.btn-outline-dark{color:#343a40;border-color:#343a40}.btn-outline-dark:hover{color:#fff;background-color:#343a40;border-color:#343a40}.btn-outline-dark:focus,.btn-outline-dark.focus{box-shadow:0 0 0 .2rem rgba(52,58,64,0.5)}.btn-outline-dark.disabled,.btn-outline-dark:disabled{color:#343a40;background-color:transparent}.btn-outline-dark:not(:disabled):not(.disabled):active,.btn-outline-dark:not(:disabled):not(.disabled).active,.show>.btn-outline-dark.dropdown-toggle{color:#fff;background-color:#343a40;border-color:#343a40}.btn-outline-dark:not(:disabled):not(.disabled):active:focus,.btn-outline-dark:not(:disabled):not(.disabled).active:focus,.show>.btn-outline-dark.dropdown-toggle:focus{box-shadow:0 0 0 .2rem rgba(52,58,64,0.5)}.btn-link{font-weight:400;color:#007bff;text-decoration:none}.btn-link:hover{color:#0056b3;text-decoration:underline}.btn-link:focus,.btn-link.focus{text-decoration:underline;box-shadow:none}.btn-link:disabled,.btn-link.disabled{color:#6c757d;pointer-events:none}.btn-lg,.btn-group-lg>.btn{padding:.5rem 1rem;font-size:1.25rem;line-height:1.5;border-radius:.3rem}.btn-sm,.btn-group-sm>.btn{padding:.25rem .5rem;font-size:.875rem;line-height:1.5;border-radius:.2rem}.btn-block{display:block;width:100%}.btn-block+.btn-block{margin-top:.5rem}input[type="submit"].btn-block,input[type="reset"].btn-block,input[type="button"].btn-block{width:100%}.fade{transition:opacity 0.15s linear}@media (prefers-reduced-motion: reduce){.fade{transition:none}}.fade:not(.show){opacity:0}.collapse:not(.show){display:none}.collapsing{position:relative;height:0;overflow:hidden;transition:height 0.35s ease}@media (prefers-reduced-motion: reduce){.collapsing{transition:none}}.dropup,.dropright,.dropdown,.dropleft{position:relative}.dropdown-toggle{white-space:nowrap}.dropdown-toggle::after{display:inline-block;margin-left:.255em;vertical-align:.255em;content:"";border-top:.3em solid;border-right:.3em solid transparent;border-bottom:0;border-left:.3em solid transparent}.dropdown-toggle:empty::after{margin-left:0}.dropdown-menu{position:absolute;top:100%;left:0;z-index:1000;display:none;float:left;min-width:10rem;padding:.5rem 0;margin:.125rem 0 
0;font-size:1rem;color:#212529;text-align:left;list-style:none;background-color:#fff;background-clip:padding-box;border:1px solid rgba(0,0,0,0.15);border-radius:.25rem}.dropdown-menu-left{right:auto;left:0}.dropdown-menu-right{right:0;left:auto}@media (min-width: 576px){.dropdown-menu-sm-left{right:auto;left:0}.dropdown-menu-sm-right{right:0;left:auto}}@media (min-width: 768px){.dropdown-menu-md-left{right:auto;left:0}.dropdown-menu-md-right{right:0;left:auto}}@media (min-width: 992px){.dropdown-menu-lg-left{right:auto;left:0}.dropdown-menu-lg-right{right:0;left:auto}}@media (min-width: 1200px){.dropdown-menu-xl-left{right:auto;left:0}.dropdown-menu-xl-right{right:0;left:auto}}.dropup .dropdown-menu{top:auto;bottom:100%;margin-top:0;margin-bottom:.125rem}.dropup .dropdown-toggle::after{display:inline-block;margin-left:.255em;vertical-align:.255em;content:"";border-top:0;border-right:.3em solid transparent;border-bottom:.3em solid;border-left:.3em solid transparent}.dropup .dropdown-toggle:empty::after{margin-left:0}.dropright .dropdown-menu{top:0;right:auto;left:100%;margin-top:0;margin-left:.125rem}.dropright .dropdown-toggle::after{display:inline-block;margin-left:.255em;vertical-align:.255em;content:"";border-top:.3em solid transparent;border-right:0;border-bottom:.3em solid transparent;border-left:.3em solid}.dropright .dropdown-toggle:empty::after{margin-left:0}.dropright .dropdown-toggle::after{vertical-align:0}.dropleft .dropdown-menu{top:0;right:100%;left:auto;margin-top:0;margin-right:.125rem}.dropleft .dropdown-toggle::after{display:inline-block;margin-left:.255em;vertical-align:.255em;content:""}.dropleft .dropdown-toggle::after{display:none}.dropleft .dropdown-toggle::before{display:inline-block;margin-right:.255em;vertical-align:.255em;content:"";border-top:.3em solid transparent;border-right:.3em solid;border-bottom:.3em solid transparent}.dropleft .dropdown-toggle:empty::after{margin-left:0}.dropleft .dropdown-toggle::before{vertical-align:0}.dropdown-menu[x-placement^="top"],.dropdown-menu[x-placement^="right"],.dropdown-menu[x-placement^="bottom"],.dropdown-menu[x-placement^="left"]{right:auto;bottom:auto}.dropdown-divider{height:0;margin:.5rem 0;overflow:hidden;border-top:1px solid #e9ecef}.dropdown-item{display:block;width:100%;padding:.25rem 1.5rem;clear:both;font-weight:400;color:#212529;text-align:inherit;white-space:nowrap;background-color:transparent;border:0}.dropdown-item:hover,.dropdown-item:focus{color:#16181b;text-decoration:none;background-color:#f8f9fa}.dropdown-item.active,.dropdown-item:active{color:#fff;text-decoration:none;background-color:#007bff}.dropdown-item.disabled,.dropdown-item:disabled{color:#6c757d;pointer-events:none;background-color:transparent}.dropdown-menu.show{display:block}.dropdown-header{display:block;padding:.5rem 1.5rem;margin-bottom:0;font-size:.875rem;color:#6c757d;white-space:nowrap}.dropdown-item-text{display:block;padding:.25rem 1.5rem;color:#212529}.btn-group,.btn-group-vertical{position:relative;display:inline-flex;vertical-align:middle}.btn-group>.btn,.btn-group-vertical>.btn{position:relative;flex:1 1 auto}.btn-group>.btn:hover,.btn-group-vertical>.btn:hover{z-index:1}.btn-group>.btn:focus,.btn-group>.btn:active,.btn-group>.btn.active,.btn-group-vertical>.btn:focus,.btn-group-vertical>.btn:active,.btn-group-vertical>.btn.active{z-index:1}.btn-toolbar{display:flex;flex-wrap:wrap;justify-content:flex-start}.btn-toolbar 
.input-group{width:auto}.btn-group>.btn:not(:first-child),.btn-group>.btn-group:not(:first-child){margin-left:-1px}.btn-group>.btn:not(:last-child):not(.dropdown-toggle),.btn-group>.btn-group:not(:last-child)>.btn{border-top-right-radius:0;border-bottom-right-radius:0}.btn-group>.btn:not(:first-child),.btn-group>.btn-group:not(:first-child)>.btn{border-top-left-radius:0;border-bottom-left-radius:0}.dropdown-toggle-split{padding-right:.5625rem;padding-left:.5625rem}.dropdown-toggle-split::after,.dropup .dropdown-toggle-split::after,.dropright .dropdown-toggle-split::after{margin-left:0}.dropleft .dropdown-toggle-split::before{margin-right:0}.btn-sm+.dropdown-toggle-split,.btn-group-sm>.btn+.dropdown-toggle-split{padding-right:.375rem;padding-left:.375rem}.btn-lg+.dropdown-toggle-split,.btn-group-lg>.btn+.dropdown-toggle-split{padding-right:.75rem;padding-left:.75rem}.btn-group-vertical{flex-direction:column;align-items:flex-start;justify-content:center}.btn-group-vertical>.btn,.btn-group-vertical>.btn-group{width:100%}.btn-group-vertical>.btn:not(:first-child),.btn-group-vertical>.btn-group:not(:first-child){margin-top:-1px}.btn-group-vertical>.btn:not(:last-child):not(.dropdown-toggle),.btn-group-vertical>.btn-group:not(:last-child)>.btn{border-bottom-right-radius:0;border-bottom-left-radius:0}.btn-group-vertical>.btn:not(:first-child),.btn-group-vertical>.btn-group:not(:first-child)>.btn{border-top-left-radius:0;border-top-right-radius:0}.btn-group-toggle>.btn,.btn-group-toggle>.btn-group>.btn{margin-bottom:0}.btn-group-toggle>.btn input[type="radio"],.btn-group-toggle>.btn input[type="checkbox"],.btn-group-toggle>.btn-group>.btn input[type="radio"],.btn-group-toggle>.btn-group>.btn input[type="checkbox"]{position:absolute;clip:rect(0, 0, 0, 0);pointer-events:none}.input-group{position:relative;display:flex;flex-wrap:wrap;align-items:stretch;width:100%}.input-group>.form-control,.input-group>.form-control-plaintext,.input-group>.custom-select,.input-group>.custom-file{position:relative;flex:1 1 auto;width:1%;margin-bottom:0}.input-group>.form-control+.form-control,.input-group>.form-control+.custom-select,.input-group>.form-control+.custom-file,.input-group>.form-control-plaintext+.form-control,.input-group>.form-control-plaintext+.custom-select,.input-group>.form-control-plaintext+.custom-file,.input-group>.custom-select+.form-control,.input-group>.custom-select+.custom-select,.input-group>.custom-select+.custom-file,.input-group>.custom-file+.form-control,.input-group>.custom-file+.custom-select,.input-group>.custom-file+.custom-file{margin-left:-1px}.input-group>.form-control:focus,.input-group>.custom-select:focus,.input-group>.custom-file .custom-file-input:focus ~ .custom-file-label{z-index:3}.input-group>.custom-file .custom-file-input:focus{z-index:4}.input-group>.form-control:not(:last-child),.input-group>.custom-select:not(:last-child){border-top-right-radius:0;border-bottom-right-radius:0}.input-group>.form-control:not(:first-child),.input-group>.custom-select:not(:first-child){border-top-left-radius:0;border-bottom-left-radius:0}.input-group>.custom-file{display:flex;align-items:center}.input-group>.custom-file:not(:last-child) .custom-file-label,.input-group>.custom-file:not(:last-child) .custom-file-label::after{border-top-right-radius:0;border-bottom-right-radius:0}.input-group>.custom-file:not(:first-child) .custom-file-label{border-top-left-radius:0;border-bottom-left-radius:0}.input-group-prepend,.input-group-append{display:flex}.input-group-prepend 
.btn,.input-group-append .btn{position:relative;z-index:2}.input-group-prepend .btn:focus,.input-group-append .btn:focus{z-index:3}.input-group-prepend .btn+.btn,.input-group-prepend .btn+.input-group-text,.input-group-prepend .input-group-text+.input-group-text,.input-group-prepend .input-group-text+.btn,.input-group-append .btn+.btn,.input-group-append .btn+.input-group-text,.input-group-append .input-group-text+.input-group-text,.input-group-append .input-group-text+.btn{margin-left:-1px}.input-group-prepend{margin-right:-1px}.input-group-append{margin-left:-1px}.input-group-text{display:flex;align-items:center;padding:.375rem .75rem;margin-bottom:0;font-size:1rem;font-weight:400;line-height:1.5;color:#495057;text-align:center;white-space:nowrap;background-color:#e9ecef;border:1px solid #ced4da;border-radius:.25rem}.input-group-text input[type="radio"],.input-group-text input[type="checkbox"]{margin-top:0}.input-group-lg>.form-control:not(textarea),.input-group-lg>.custom-select{height:calc(1.5em + 1rem + 2px)}.input-group-lg>.form-control,.input-group-lg>.custom-select,.input-group-lg>.input-group-prepend>.input-group-text,.input-group-lg>.input-group-append>.input-group-text,.input-group-lg>.input-group-prepend>.btn,.input-group-lg>.input-group-append>.btn{padding:.5rem 1rem;font-size:1.25rem;line-height:1.5;border-radius:.3rem}.input-group-sm>.form-control:not(textarea),.input-group-sm>.custom-select{height:calc(1.5em + .5rem + 2px)}.input-group-sm>.form-control,.input-group-sm>.custom-select,.input-group-sm>.input-group-prepend>.input-group-text,.input-group-sm>.input-group-append>.input-group-text,.input-group-sm>.input-group-prepend>.btn,.input-group-sm>.input-group-append>.btn{padding:.25rem .5rem;font-size:.875rem;line-height:1.5;border-radius:.2rem}.input-group-lg>.custom-select,.input-group-sm>.custom-select{padding-right:1.75rem}.input-group>.input-group-prepend>.btn,.input-group>.input-group-prepend>.input-group-text,.input-group>.input-group-append:not(:last-child)>.btn,.input-group>.input-group-append:not(:last-child)>.input-group-text,.input-group>.input-group-append:last-child>.btn:not(:last-child):not(.dropdown-toggle),.input-group>.input-group-append:last-child>.input-group-text:not(:last-child){border-top-right-radius:0;border-bottom-right-radius:0}.input-group>.input-group-append>.btn,.input-group>.input-group-append>.input-group-text,.input-group>.input-group-prepend:not(:first-child)>.btn,.input-group>.input-group-prepend:not(:first-child)>.input-group-text,.input-group>.input-group-prepend:first-child>.btn:not(:first-child),.input-group>.input-group-prepend:first-child>.input-group-text:not(:first-child){border-top-left-radius:0;border-bottom-left-radius:0}.custom-control{position:relative;display:block;min-height:1.5rem;padding-left:1.5rem}.custom-control-inline{display:inline-flex;margin-right:1rem}.custom-control-input{position:absolute;z-index:-1;opacity:0}.custom-control-input:checked ~ .custom-control-label::before{color:#fff;border-color:#007bff;background-color:#007bff}.custom-control-input:focus ~ .custom-control-label::before{box-shadow:0 0 0 .2rem rgba(0,123,255,0.25)}.custom-control-input:focus:not(:checked) ~ .custom-control-label::before{border-color:#80bdff}.custom-control-input:not(:disabled):active ~ .custom-control-label::before{color:#fff;background-color:#b3d7ff;border-color:#b3d7ff}.custom-control-input:disabled ~ .custom-control-label{color:#6c757d}.custom-control-input:disabled ~ 
.custom-control-label::before{background-color:#e9ecef}.custom-control-label{position:relative;margin-bottom:0;vertical-align:top}.custom-control-label::before{position:absolute;top:.25rem;left:-1.5rem;display:block;width:1rem;height:1rem;pointer-events:none;content:"";background-color:#fff;border:#adb5bd solid 1px}.custom-control-label::after{position:absolute;top:.25rem;left:-1.5rem;display:block;width:1rem;height:1rem;content:"";background:no-repeat 50% / 50% 50%}.custom-checkbox .custom-control-label::before{border-radius:.25rem}.custom-checkbox .custom-control-input:checked ~ .custom-control-label::after{background-image:url("data:image/svg+xml,%3csvg xmlns='http://www.w3.org/2000/svg' viewBox='0 0 8 8'%3e%3cpath fill='%23fff' d='M6.564.75l-3.59 3.612-1.538-1.55L0 4.26 2.974 7.25 8 2.193z'/%3e%3c/svg%3e")}.custom-checkbox .custom-control-input:indeterminate ~ .custom-control-label::before{border-color:#007bff;background-color:#007bff}.custom-checkbox .custom-control-input:indeterminate ~ .custom-control-label::after{background-image:url("data:image/svg+xml,%3csvg xmlns='http://www.w3.org/2000/svg' viewBox='0 0 4 4'%3e%3cpath stroke='%23fff' d='M0 2h4'/%3e%3c/svg%3e")}.custom-checkbox .custom-control-input:disabled:checked ~ .custom-control-label::before{background-color:rgba(0,123,255,0.5)}.custom-checkbox .custom-control-input:disabled:indeterminate ~ .custom-control-label::before{background-color:rgba(0,123,255,0.5)}.custom-radio .custom-control-label::before{border-radius:50%}.custom-radio .custom-control-input:checked ~ .custom-control-label::after{background-image:url("data:image/svg+xml,%3csvg xmlns='http://www.w3.org/2000/svg' viewBox='-4 -4 8 8'%3e%3ccircle r='3' fill='%23fff'/%3e%3c/svg%3e")}.custom-radio .custom-control-input:disabled:checked ~ .custom-control-label::before{background-color:rgba(0,123,255,0.5)}.custom-switch{padding-left:2.25rem}.custom-switch .custom-control-label::before{left:-2.25rem;width:1.75rem;pointer-events:all;border-radius:.5rem}.custom-switch .custom-control-label::after{top:calc(.25rem + 2px);left:calc(-2.25rem + 2px);width:calc(1rem - 4px);height:calc(1rem - 4px);background-color:#adb5bd;border-radius:.5rem;transition:transform 0.15s ease-in-out,background-color 0.15s ease-in-out,border-color 0.15s ease-in-out,box-shadow 0.15s ease-in-out}@media (prefers-reduced-motion: reduce){.custom-switch .custom-control-label::after{transition:none}}.custom-switch .custom-control-input:checked ~ .custom-control-label::after{background-color:#fff;transform:translateX(.75rem)}.custom-switch .custom-control-input:disabled:checked ~ .custom-control-label::before{background-color:rgba(0,123,255,0.5)}.custom-select{display:inline-block;width:100%;height:calc(1.5em + .75rem + 2px);padding:.375rem 1.75rem .375rem .75rem;font-size:1rem;font-weight:400;line-height:1.5;color:#495057;vertical-align:middle;background:url("data:image/svg+xml,%3csvg xmlns='http://www.w3.org/2000/svg' viewBox='0 0 4 5'%3e%3cpath fill='%23343a40' d='M2 0L0 2h4zm0 5L0 3h4z'/%3e%3c/svg%3e") no-repeat right .75rem center/8px 10px;background-color:#fff;border:1px solid #ced4da;border-radius:.25rem;-webkit-appearance:none;-moz-appearance:none;appearance:none}.custom-select:focus{border-color:#80bdff;outline:0;box-shadow:0 0 0 .2rem 
rgba(0,123,255,0.25)}.custom-select:focus::-ms-value{color:#495057;background-color:#fff}.custom-select[multiple],.custom-select[size]:not([size="1"]){height:auto;padding-right:.75rem;background-image:none}.custom-select:disabled{color:#6c757d;background-color:#e9ecef}.custom-select::-ms-expand{display:none}.custom-select-sm{height:calc(1.5em + .5rem + 2px);padding-top:.25rem;padding-bottom:.25rem;padding-left:.5rem;font-size:.875rem}.custom-select-lg{height:calc(1.5em + 1rem + 2px);padding-top:.5rem;padding-bottom:.5rem;padding-left:1rem;font-size:1.25rem}.custom-file{position:relative;display:inline-block;width:100%;height:calc(1.5em + .75rem + 2px);margin-bottom:0}.custom-file-input{position:relative;z-index:2;width:100%;height:calc(1.5em + .75rem + 2px);margin:0;opacity:0}.custom-file-input:focus ~ .custom-file-label{border-color:#80bdff;box-shadow:0 0 0 .2rem rgba(0,123,255,0.25)}.custom-file-input:disabled ~ .custom-file-label{background-color:#e9ecef}.custom-file-input:lang(en) ~ .custom-file-label::after{content:"Browse"}.custom-file-input ~ .custom-file-label[data-browse]::after{content:attr(data-browse)}.custom-file-label{position:absolute;top:0;right:0;left:0;z-index:1;height:calc(1.5em + .75rem + 2px);padding:.375rem .75rem;font-weight:400;line-height:1.5;color:#495057;background-color:#fff;border:1px solid #ced4da;border-radius:.25rem}.custom-file-label::after{position:absolute;top:0;right:0;bottom:0;z-index:3;display:block;height:calc(1.5em + .75rem);padding:.375rem .75rem;line-height:1.5;color:#495057;content:"Browse";background-color:#e9ecef;border-left:inherit;border-radius:0 .25rem .25rem 0}.custom-range{width:100%;height:calc(1rem + .4rem);padding:0;background-color:transparent;-webkit-appearance:none;-moz-appearance:none;appearance:none}.custom-range:focus{outline:none}.custom-range:focus::-webkit-slider-thumb{box-shadow:0 0 0 1px #fff,0 0 0 .2rem rgba(0,123,255,0.25)}.custom-range:focus::-moz-range-thumb{box-shadow:0 0 0 1px #fff,0 0 0 .2rem rgba(0,123,255,0.25)}.custom-range:focus::-ms-thumb{box-shadow:0 0 0 1px #fff,0 0 0 .2rem rgba(0,123,255,0.25)}.custom-range::-moz-focus-outer{border:0}.custom-range::-webkit-slider-thumb{width:1rem;height:1rem;margin-top:-.25rem;background-color:#007bff;border:0;border-radius:1rem;-webkit-transition:background-color 0.15s ease-in-out,border-color 0.15s ease-in-out,box-shadow 0.15s ease-in-out;transition:background-color 0.15s ease-in-out,border-color 0.15s ease-in-out,box-shadow 0.15s ease-in-out;-webkit-appearance:none;appearance:none}@media (prefers-reduced-motion: reduce){.custom-range::-webkit-slider-thumb{-webkit-transition:none;transition:none}}.custom-range::-webkit-slider-thumb:active{background-color:#b3d7ff}.custom-range::-webkit-slider-runnable-track{width:100%;height:.5rem;color:transparent;cursor:pointer;background-color:#dee2e6;border-color:transparent;border-radius:1rem}.custom-range::-moz-range-thumb{width:1rem;height:1rem;background-color:#007bff;border:0;border-radius:1rem;-moz-transition:background-color 0.15s ease-in-out,border-color 0.15s ease-in-out,box-shadow 0.15s ease-in-out;transition:background-color 0.15s ease-in-out,border-color 0.15s ease-in-out,box-shadow 0.15s ease-in-out;-moz-appearance:none;appearance:none}@media (prefers-reduced-motion: 
reduce){.custom-range::-moz-range-thumb{-moz-transition:none;transition:none}}.custom-range::-moz-range-thumb:active{background-color:#b3d7ff}.custom-range::-moz-range-track{width:100%;height:.5rem;color:transparent;cursor:pointer;background-color:#dee2e6;border-color:transparent;border-radius:1rem}.custom-range::-ms-thumb{width:1rem;height:1rem;margin-top:0;margin-right:.2rem;margin-left:.2rem;background-color:#007bff;border:0;border-radius:1rem;-ms-transition:background-color 0.15s ease-in-out,border-color 0.15s ease-in-out,box-shadow 0.15s ease-in-out;transition:background-color 0.15s ease-in-out,border-color 0.15s ease-in-out,box-shadow 0.15s ease-in-out;appearance:none}@media (prefers-reduced-motion: reduce){.custom-range::-ms-thumb{-ms-transition:none;transition:none}}.custom-range::-ms-thumb:active{background-color:#b3d7ff}.custom-range::-ms-track{width:100%;height:.5rem;color:transparent;cursor:pointer;background-color:transparent;border-color:transparent;border-width:.5rem}.custom-range::-ms-fill-lower{background-color:#dee2e6;border-radius:1rem}.custom-range::-ms-fill-upper{margin-right:15px;background-color:#dee2e6;border-radius:1rem}.custom-range:disabled::-webkit-slider-thumb{background-color:#adb5bd}.custom-range:disabled::-webkit-slider-runnable-track{cursor:default}.custom-range:disabled::-moz-range-thumb{background-color:#adb5bd}.custom-range:disabled::-moz-range-track{cursor:default}.custom-range:disabled::-ms-thumb{background-color:#adb5bd}.custom-control-label::before,.custom-file-label,.custom-select{transition:background-color 0.15s ease-in-out,border-color 0.15s ease-in-out,box-shadow 0.15s ease-in-out}@media (prefers-reduced-motion: reduce){.custom-control-label::before,.custom-file-label,.custom-select{transition:none}}.nav{display:flex;flex-wrap:wrap;padding-left:0;margin-bottom:0;list-style:none}.nav-link{display:block;padding:.5rem 1rem}.nav-link:hover,.nav-link:focus{text-decoration:none}.nav-link.disabled{color:#6c757d;pointer-events:none;cursor:default}.nav-tabs{border-bottom:1px solid #dee2e6}.nav-tabs .nav-item{margin-bottom:-1px}.nav-tabs .nav-link{border:1px solid transparent;border-top-left-radius:.25rem;border-top-right-radius:.25rem}.nav-tabs .nav-link:hover,.nav-tabs .nav-link:focus{border-color:#e9ecef #e9ecef #dee2e6}.nav-tabs .nav-link.disabled{color:#6c757d;background-color:transparent;border-color:transparent}.nav-tabs .nav-link.active,.nav-tabs .nav-item.show .nav-link{color:#495057;background-color:#fff;border-color:#dee2e6 #dee2e6 #fff}.nav-tabs .dropdown-menu{margin-top:-1px;border-top-left-radius:0;border-top-right-radius:0}.nav-pills .nav-link{border-radius:.25rem}.nav-pills .nav-link.active,.nav-pills .show>.nav-link{color:#fff;background-color:#007bff}.nav-fill .nav-item{flex:1 1 auto;text-align:center}.nav-justified .nav-item{flex-basis:0;flex-grow:1;text-align:center}.tab-content>.tab-pane{display:none}.tab-content>.active{display:block}.navbar{position:relative;display:flex;flex-wrap:wrap;align-items:center;justify-content:space-between;padding:.5rem 1rem}.navbar>.container,.navbar>.container-fluid{display:flex;flex-wrap:wrap;align-items:center;justify-content:space-between}.navbar-brand{display:inline-block;padding-top:.3125rem;padding-bottom:.3125rem;margin-right:1rem;font-size:1.25rem;line-height:inherit;white-space:nowrap}.navbar-brand:hover,.navbar-brand:focus{text-decoration:none}.navbar-nav{display:flex;flex-direction:column;padding-left:0;margin-bottom:0;list-style:none}.navbar-nav 
.nav-link{padding-right:0;padding-left:0}.navbar-nav .dropdown-menu{position:static;float:none}.navbar-text{display:inline-block;padding-top:.5rem;padding-bottom:.5rem}.navbar-collapse{flex-basis:100%;flex-grow:1;align-items:center}.navbar-toggler{padding:.25rem .75rem;font-size:1.25rem;line-height:1;background-color:transparent;border:1px solid transparent;border-radius:.25rem}.navbar-toggler:hover,.navbar-toggler:focus{text-decoration:none}.navbar-toggler-icon{display:inline-block;width:1.5em;height:1.5em;vertical-align:middle;content:"";background:no-repeat center center;background-size:100% 100%}@media (max-width: 575.98px){.navbar-expand-sm>.container,.navbar-expand-sm>.container-fluid{padding-right:0;padding-left:0}}@media (min-width: 576px){.navbar-expand-sm{flex-flow:row nowrap;justify-content:flex-start}.navbar-expand-sm .navbar-nav{flex-direction:row}.navbar-expand-sm .navbar-nav .dropdown-menu{position:absolute}.navbar-expand-sm .navbar-nav .nav-link{padding-right:.5rem;padding-left:.5rem}.navbar-expand-sm>.container,.navbar-expand-sm>.container-fluid{flex-wrap:nowrap}.navbar-expand-sm .navbar-collapse{display:flex !important;flex-basis:auto}.navbar-expand-sm .navbar-toggler{display:none}}@media (max-width: 767.98px){.navbar-expand-md>.container,.navbar-expand-md>.container-fluid{padding-right:0;padding-left:0}}@media (min-width: 768px){.navbar-expand-md{flex-flow:row nowrap;justify-content:flex-start}.navbar-expand-md .navbar-nav{flex-direction:row}.navbar-expand-md .navbar-nav .dropdown-menu{position:absolute}.navbar-expand-md .navbar-nav .nav-link{padding-right:.5rem;padding-left:.5rem}.navbar-expand-md>.container,.navbar-expand-md>.container-fluid{flex-wrap:nowrap}.navbar-expand-md .navbar-collapse{display:flex !important;flex-basis:auto}.navbar-expand-md .navbar-toggler{display:none}}@media (max-width: 991.98px){.navbar-expand-lg>.container,.navbar-expand-lg>.container-fluid{padding-right:0;padding-left:0}}@media (min-width: 992px){.navbar-expand-lg{flex-flow:row nowrap;justify-content:flex-start}.navbar-expand-lg .navbar-nav{flex-direction:row}.navbar-expand-lg .navbar-nav .dropdown-menu{position:absolute}.navbar-expand-lg .navbar-nav .nav-link{padding-right:.5rem;padding-left:.5rem}.navbar-expand-lg>.container,.navbar-expand-lg>.container-fluid{flex-wrap:nowrap}.navbar-expand-lg .navbar-collapse{display:flex !important;flex-basis:auto}.navbar-expand-lg .navbar-toggler{display:none}}@media (max-width: 1199.98px){.navbar-expand-xl>.container,.navbar-expand-xl>.container-fluid{padding-right:0;padding-left:0}}@media (min-width: 1200px){.navbar-expand-xl{flex-flow:row nowrap;justify-content:flex-start}.navbar-expand-xl .navbar-nav{flex-direction:row}.navbar-expand-xl .navbar-nav .dropdown-menu{position:absolute}.navbar-expand-xl .navbar-nav .nav-link{padding-right:.5rem;padding-left:.5rem}.navbar-expand-xl>.container,.navbar-expand-xl>.container-fluid{flex-wrap:nowrap}.navbar-expand-xl .navbar-collapse{display:flex !important;flex-basis:auto}.navbar-expand-xl .navbar-toggler{display:none}}.navbar-expand{flex-flow:row nowrap;justify-content:flex-start}.navbar-expand>.container,.navbar-expand>.container-fluid{padding-right:0;padding-left:0}.navbar-expand .navbar-nav{flex-direction:row}.navbar-expand .navbar-nav .dropdown-menu{position:absolute}.navbar-expand .navbar-nav .nav-link{padding-right:.5rem;padding-left:.5rem}.navbar-expand>.container,.navbar-expand>.container-fluid{flex-wrap:nowrap}.navbar-expand .navbar-collapse{display:flex !important;flex-basis:auto}.navbar-expand 
.navbar-toggler{display:none}.navbar-light .navbar-brand{color:rgba(0,0,0,0.9)}.navbar-light .navbar-brand:hover,.navbar-light .navbar-brand:focus{color:rgba(0,0,0,0.9)}.navbar-light .navbar-nav .nav-link{color:rgba(0,0,0,0.5)}.navbar-light .navbar-nav .nav-link:hover,.navbar-light .navbar-nav .nav-link:focus{color:rgba(0,0,0,0.7)}.navbar-light .navbar-nav .nav-link.disabled{color:rgba(0,0,0,0.3)}.navbar-light .navbar-nav .show>.nav-link,.navbar-light .navbar-nav .active>.nav-link,.navbar-light .navbar-nav .nav-link.show,.navbar-light .navbar-nav .nav-link.active{color:rgba(0,0,0,0.9)}.navbar-light .navbar-toggler{color:rgba(0,0,0,0.5);border-color:rgba(0,0,0,0.1)}.navbar-light .navbar-toggler-icon{background-image:url("data:image/svg+xml,%3csvg viewBox='0 0 30 30' xmlns='http://www.w3.org/2000/svg'%3e%3cpath stroke='rgba(0,0,0,0.5)' stroke-width='2' stroke-linecap='round' stroke-miterlimit='10' d='M4 7h22M4 15h22M4 23h22'/%3e%3c/svg%3e")}.navbar-light .navbar-text{color:rgba(0,0,0,0.5)}.navbar-light .navbar-text a{color:rgba(0,0,0,0.9)}.navbar-light .navbar-text a:hover,.navbar-light .navbar-text a:focus{color:rgba(0,0,0,0.9)}.navbar-dark .navbar-brand{color:#fff}.navbar-dark .navbar-brand:hover,.navbar-dark .navbar-brand:focus{color:#fff}.navbar-dark .navbar-nav .nav-link{color:rgba(255,255,255,0.5)}.navbar-dark .navbar-nav .nav-link:hover,.navbar-dark .navbar-nav .nav-link:focus{color:rgba(255,255,255,0.75)}.navbar-dark .navbar-nav .nav-link.disabled{color:rgba(255,255,255,0.25)}.navbar-dark .navbar-nav .show>.nav-link,.navbar-dark .navbar-nav .active>.nav-link,.navbar-dark .navbar-nav .nav-link.show,.navbar-dark .navbar-nav .nav-link.active{color:#fff}.navbar-dark .navbar-toggler{color:rgba(255,255,255,0.5);border-color:rgba(255,255,255,0.1)}.navbar-dark .navbar-toggler-icon{background-image:url("data:image/svg+xml,%3csvg viewBox='0 0 30 30' xmlns='http://www.w3.org/2000/svg'%3e%3cpath stroke='rgba(255,255,255,0.5)' stroke-width='2' stroke-linecap='round' stroke-miterlimit='10' d='M4 7h22M4 15h22M4 23h22'/%3e%3c/svg%3e")}.navbar-dark .navbar-text{color:rgba(255,255,255,0.5)}.navbar-dark .navbar-text a{color:#fff}.navbar-dark .navbar-text a:hover,.navbar-dark .navbar-text a:focus{color:#fff}.card{position:relative;display:flex;flex-direction:column;min-width:0;word-wrap:break-word;background-color:#fff;background-clip:border-box;border:1px solid rgba(0,0,0,0.125);border-radius:.25rem}.card>hr{margin-right:0;margin-left:0}.card>.list-group:first-child .list-group-item:first-child{border-top-left-radius:.25rem;border-top-right-radius:.25rem}.card>.list-group:last-child .list-group-item:last-child{border-bottom-right-radius:.25rem;border-bottom-left-radius:.25rem}.card-body{flex:1 1 auto;padding:1.25rem}.card-title{margin-bottom:.75rem}.card-subtitle{margin-top:-.375rem;margin-bottom:0}.card-text:last-child{margin-bottom:0}.card-link:hover{text-decoration:none}.card-link+.card-link{margin-left:1.25rem}.card-header{padding:.75rem 1.25rem;margin-bottom:0;background-color:rgba(0,0,0,0.03);border-bottom:1px solid rgba(0,0,0,0.125)}.card-header:first-child{border-radius:calc(.25rem - 1px) calc(.25rem - 1px) 0 0}.card-header+.list-group .list-group-item:first-child{border-top:0}.card-footer{padding:.75rem 1.25rem;background-color:rgba(0,0,0,0.03);border-top:1px solid rgba(0,0,0,0.125)}.card-footer:last-child{border-radius:0 0 calc(.25rem - 1px) calc(.25rem - 
1px)}.card-header-tabs{margin-right:-.625rem;margin-bottom:-0.75rem;margin-left:-.625rem;border-bottom:0}.card-header-pills{margin-right:-.625rem;margin-left:-.625rem}.card-img-overlay{position:absolute;top:0;right:0;bottom:0;left:0;padding:1.25rem}.card-img{width:100%;border-radius:calc(.25rem - 1px)}.card-img-top{width:100%;border-top-left-radius:calc(.25rem - 1px);border-top-right-radius:calc(.25rem - 1px)}.card-img-bottom{width:100%;border-bottom-right-radius:calc(.25rem - 1px);border-bottom-left-radius:calc(.25rem - 1px)}.card-deck{display:flex;flex-direction:column}.card-deck .card{margin-bottom:15px}@media (min-width: 576px){.card-deck{flex-flow:row wrap;margin-right:-15px;margin-left:-15px}.card-deck .card{display:flex;flex:1 0 0%;flex-direction:column;margin-right:15px;margin-bottom:0;margin-left:15px}}.card-group{display:flex;flex-direction:column}.card-group>.card{margin-bottom:15px}@media (min-width: 576px){.card-group{flex-flow:row wrap}.card-group>.card{flex:1 0 0%;margin-bottom:0}.card-group>.card+.card{margin-left:0;border-left:0}.card-group>.card:not(:last-child){border-top-right-radius:0;border-bottom-right-radius:0}.card-group>.card:not(:last-child) .card-img-top,.card-group>.card:not(:last-child) .card-header{border-top-right-radius:0}.card-group>.card:not(:last-child) .card-img-bottom,.card-group>.card:not(:last-child) .card-footer{border-bottom-right-radius:0}.card-group>.card:not(:first-child){border-top-left-radius:0;border-bottom-left-radius:0}.card-group>.card:not(:first-child) .card-img-top,.card-group>.card:not(:first-child) .card-header{border-top-left-radius:0}.card-group>.card:not(:first-child) .card-img-bottom,.card-group>.card:not(:first-child) .card-footer{border-bottom-left-radius:0}}.card-columns .card{margin-bottom:.75rem}@media (min-width: 576px){.card-columns{-moz-column-count:3;column-count:3;-moz-column-gap:1.25rem;column-gap:1.25rem;orphans:1;widows:1}.card-columns .card{display:inline-block;width:100%}}.accordion>.card{overflow:hidden}.accordion>.card:not(:first-of-type) .card-header:first-child{border-radius:0}.accordion>.card:not(:first-of-type):not(:last-of-type){border-bottom:0;border-radius:0}.accordion>.card:first-of-type{border-bottom:0;border-bottom-right-radius:0;border-bottom-left-radius:0}.accordion>.card:last-of-type{border-top-left-radius:0;border-top-right-radius:0}.accordion>.card .card-header{margin-bottom:-1px}.breadcrumb{display:flex;flex-wrap:wrap;padding:.75rem 1rem;margin-bottom:1rem;list-style:none;background-color:#e9ecef;border-radius:.25rem}.breadcrumb-item+.breadcrumb-item{padding-left:.5rem}.breadcrumb-item+.breadcrumb-item::before{display:inline-block;padding-right:.5rem;color:#6c757d;content:"/"}.breadcrumb-item+.breadcrumb-item:hover::before{text-decoration:underline}.breadcrumb-item+.breadcrumb-item:hover::before{text-decoration:none}.breadcrumb-item.active{color:#6c757d}.pagination{display:flex;padding-left:0;list-style:none;border-radius:.25rem}.page-link{position:relative;display:block;padding:.5rem .75rem;margin-left:-1px;line-height:1.25;color:#007bff;background-color:#fff;border:1px solid #dee2e6}.page-link:hover{z-index:2;color:#0056b3;text-decoration:none;background-color:#e9ecef;border-color:#dee2e6}.page-link:focus{z-index:2;outline:0;box-shadow:0 0 0 .2rem rgba(0,123,255,0.25)}.page-item:first-child .page-link{margin-left:0;border-top-left-radius:.25rem;border-bottom-left-radius:.25rem}.page-item:last-child .page-link{border-top-right-radius:.25rem;border-bottom-right-radius:.25rem}.page-item.active 
.page-link{z-index:1;color:#fff;background-color:#007bff;border-color:#007bff}.page-item.disabled .page-link{color:#6c757d;pointer-events:none;cursor:auto;background-color:#fff;border-color:#dee2e6}.pagination-lg .page-link{padding:.75rem 1.5rem;font-size:1.25rem;line-height:1.5}.pagination-lg .page-item:first-child .page-link{border-top-left-radius:.3rem;border-bottom-left-radius:.3rem}.pagination-lg .page-item:last-child .page-link{border-top-right-radius:.3rem;border-bottom-right-radius:.3rem}.pagination-sm .page-link{padding:.25rem .5rem;font-size:.875rem;line-height:1.5}.pagination-sm .page-item:first-child .page-link{border-top-left-radius:.2rem;border-bottom-left-radius:.2rem}.pagination-sm .page-item:last-child .page-link{border-top-right-radius:.2rem;border-bottom-right-radius:.2rem}.badge{display:inline-block;padding:.25em .4em;font-size:75%;font-weight:700;line-height:1;text-align:center;white-space:nowrap;vertical-align:baseline;border-radius:.25rem;transition:color 0.15s ease-in-out,background-color 0.15s ease-in-out,border-color 0.15s ease-in-out,box-shadow 0.15s ease-in-out}@media (prefers-reduced-motion: reduce){.badge{transition:none}}a.badge:hover,a.badge:focus{text-decoration:none}.badge:empty{display:none}.btn .badge{position:relative;top:-1px}.badge-pill{padding-right:.6em;padding-left:.6em;border-radius:10rem}.badge-primary{color:#fff;background-color:#007bff}a.badge-primary:hover,a.badge-primary:focus{color:#fff;background-color:#0062cc}a.badge-primary:focus,a.badge-primary.focus{outline:0;box-shadow:0 0 0 .2rem rgba(0,123,255,0.5)}.badge-secondary{color:#fff;background-color:#6c757d}a.badge-secondary:hover,a.badge-secondary:focus{color:#fff;background-color:#545b62}a.badge-secondary:focus,a.badge-secondary.focus{outline:0;box-shadow:0 0 0 .2rem rgba(108,117,125,0.5)}.badge-success{color:#fff;background-color:#28a745}a.badge-success:hover,a.badge-success:focus{color:#fff;background-color:#1e7e34}a.badge-success:focus,a.badge-success.focus{outline:0;box-shadow:0 0 0 .2rem rgba(40,167,69,0.5)}.badge-info{color:#fff;background-color:#17a2b8}a.badge-info:hover,a.badge-info:focus{color:#fff;background-color:#117a8b}a.badge-info:focus,a.badge-info.focus{outline:0;box-shadow:0 0 0 .2rem rgba(23,162,184,0.5)}.badge-warning{color:#212529;background-color:#ffc107}a.badge-warning:hover,a.badge-warning:focus{color:#212529;background-color:#d39e00}a.badge-warning:focus,a.badge-warning.focus{outline:0;box-shadow:0 0 0 .2rem rgba(255,193,7,0.5)}.badge-danger{color:#fff;background-color:#dc3545}a.badge-danger:hover,a.badge-danger:focus{color:#fff;background-color:#bd2130}a.badge-danger:focus,a.badge-danger.focus{outline:0;box-shadow:0 0 0 .2rem rgba(220,53,69,0.5)}.badge-light{color:#212529;background-color:#f8f9fa}a.badge-light:hover,a.badge-light:focus{color:#212529;background-color:#dae0e5}a.badge-light:focus,a.badge-light.focus{outline:0;box-shadow:0 0 0 .2rem rgba(248,249,250,0.5)}.badge-dark{color:#fff;background-color:#343a40}a.badge-dark:hover,a.badge-dark:focus{color:#fff;background-color:#1d2124}a.badge-dark:focus,a.badge-dark.focus{outline:0;box-shadow:0 0 0 .2rem rgba(52,58,64,0.5)}.jumbotron{padding:2rem 1rem;margin-bottom:2rem;background-color:#e9ecef;border-radius:.3rem}@media (min-width: 576px){.jumbotron{padding:4rem 2rem}}.jumbotron-fluid{padding-right:0;padding-left:0;border-radius:0}.alert{position:relative;padding:.75rem 1.25rem;margin-bottom:1rem;border:1px solid 
transparent;border-radius:.25rem}.alert-heading{color:inherit}.alert-link{font-weight:700}.alert-dismissible{padding-right:4rem}.alert-dismissible .close{position:absolute;top:0;right:0;padding:.75rem 1.25rem;color:inherit}.alert-primary{color:#004085;background-color:#cce5ff;border-color:#b8daff}.alert-primary hr{border-top-color:#9fcdff}.alert-primary .alert-link{color:#002752}.alert-secondary{color:#383d41;background-color:#e2e3e5;border-color:#d6d8db}.alert-secondary hr{border-top-color:#c8cbcf}.alert-secondary .alert-link{color:#202326}.alert-success{color:#155724;background-color:#d4edda;border-color:#c3e6cb}.alert-success hr{border-top-color:#b1dfbb}.alert-success .alert-link{color:#0b2e13}.alert-info{color:#0c5460;background-color:#d1ecf1;border-color:#bee5eb}.alert-info hr{border-top-color:#abdde5}.alert-info .alert-link{color:#062c33}.alert-warning{color:#856404;background-color:#fff3cd;border-color:#ffeeba}.alert-warning hr{border-top-color:#ffe8a1}.alert-warning .alert-link{color:#533f03}.alert-danger{color:#721c24;background-color:#f8d7da;border-color:#f5c6cb}.alert-danger hr{border-top-color:#f1b0b7}.alert-danger .alert-link{color:#491217}.alert-light{color:#818182;background-color:#fefefe;border-color:#fdfdfe}.alert-light hr{border-top-color:#ececf6}.alert-light .alert-link{color:#686868}.alert-dark{color:#1b1e21;background-color:#d6d8d9;border-color:#c6c8ca}.alert-dark hr{border-top-color:#b9bbbe}.alert-dark .alert-link{color:#040505}@-webkit-keyframes progress-bar-stripes{from{background-position:1rem 0}to{background-position:0 0}}@keyframes progress-bar-stripes{from{background-position:1rem 0}to{background-position:0 0}}.progress{display:flex;height:1rem;overflow:hidden;font-size:.75rem;background-color:#e9ecef;border-radius:.25rem}.progress-bar{display:flex;flex-direction:column;justify-content:center;color:#fff;text-align:center;white-space:nowrap;background-color:#007bff;transition:width 0.6s ease}@media (prefers-reduced-motion: reduce){.progress-bar{transition:none}}.progress-bar-striped{background-image:linear-gradient(45deg, rgba(255,255,255,0.15) 25%, transparent 25%, transparent 50%, rgba(255,255,255,0.15) 50%, rgba(255,255,255,0.15) 75%, transparent 75%, transparent);background-size:1rem 1rem}.progress-bar-animated{-webkit-animation:progress-bar-stripes 1s linear infinite;animation:progress-bar-stripes 1s linear infinite}@media (prefers-reduced-motion: reduce){.progress-bar-animated{-webkit-animation:none;animation:none}}.media{display:flex;align-items:flex-start}.media-body{flex:1}.list-group{display:flex;flex-direction:column;padding-left:0;margin-bottom:0}.list-group-item-action{width:100%;color:#495057;text-align:inherit}.list-group-item-action:hover,.list-group-item-action:focus{z-index:1;color:#495057;text-decoration:none;background-color:#f8f9fa}.list-group-item-action:active{color:#212529;background-color:#e9ecef}.list-group-item{position:relative;display:block;padding:.75rem 1.25rem;margin-bottom:-1px;background-color:#fff;border:1px solid rgba(0,0,0,0.125)}.list-group-item:first-child{border-top-left-radius:.25rem;border-top-right-radius:.25rem}.list-group-item:last-child{margin-bottom:0;border-bottom-right-radius:.25rem;border-bottom-left-radius:.25rem}.list-group-item.disabled,.list-group-item:disabled{color:#6c757d;pointer-events:none;background-color:#fff}.list-group-item.active{z-index:2;color:#fff;background-color:#007bff;border-color:#007bff}.list-group-horizontal{flex-direction:row}.list-group-horizontal 
.list-group-item{margin-right:-1px;margin-bottom:0}.list-group-horizontal .list-group-item:first-child{border-top-left-radius:.25rem;border-bottom-left-radius:.25rem;border-top-right-radius:0}.list-group-horizontal .list-group-item:last-child{margin-right:0;border-top-right-radius:.25rem;border-bottom-right-radius:.25rem;border-bottom-left-radius:0}@media (min-width: 576px){.list-group-horizontal-sm{flex-direction:row}.list-group-horizontal-sm .list-group-item{margin-right:-1px;margin-bottom:0}.list-group-horizontal-sm .list-group-item:first-child{border-top-left-radius:.25rem;border-bottom-left-radius:.25rem;border-top-right-radius:0}.list-group-horizontal-sm .list-group-item:last-child{margin-right:0;border-top-right-radius:.25rem;border-bottom-right-radius:.25rem;border-bottom-left-radius:0}}@media (min-width: 768px){.list-group-horizontal-md{flex-direction:row}.list-group-horizontal-md .list-group-item{margin-right:-1px;margin-bottom:0}.list-group-horizontal-md .list-group-item:first-child{border-top-left-radius:.25rem;border-bottom-left-radius:.25rem;border-top-right-radius:0}.list-group-horizontal-md .list-group-item:last-child{margin-right:0;border-top-right-radius:.25rem;border-bottom-right-radius:.25rem;border-bottom-left-radius:0}}@media (min-width: 992px){.list-group-horizontal-lg{flex-direction:row}.list-group-horizontal-lg .list-group-item{margin-right:-1px;margin-bottom:0}.list-group-horizontal-lg .list-group-item:first-child{border-top-left-radius:.25rem;border-bottom-left-radius:.25rem;border-top-right-radius:0}.list-group-horizontal-lg .list-group-item:last-child{margin-right:0;border-top-right-radius:.25rem;border-bottom-right-radius:.25rem;border-bottom-left-radius:0}}@media (min-width: 1200px){.list-group-horizontal-xl{flex-direction:row}.list-group-horizontal-xl .list-group-item{margin-right:-1px;margin-bottom:0}.list-group-horizontal-xl .list-group-item:first-child{border-top-left-radius:.25rem;border-bottom-left-radius:.25rem;border-top-right-radius:0}.list-group-horizontal-xl .list-group-item:last-child{margin-right:0;border-top-right-radius:.25rem;border-bottom-right-radius:.25rem;border-bottom-left-radius:0}}.list-group-flush .list-group-item{border-right:0;border-left:0;border-radius:0}.list-group-flush .list-group-item:last-child{margin-bottom:-1px}.list-group-flush:first-child .list-group-item:first-child{border-top:0}.list-group-flush:last-child 
.list-group-item:last-child{margin-bottom:0;border-bottom:0}.list-group-item-primary{color:#004085;background-color:#b8daff}.list-group-item-primary.list-group-item-action:hover,.list-group-item-primary.list-group-item-action:focus{color:#004085;background-color:#9fcdff}.list-group-item-primary.list-group-item-action.active{color:#fff;background-color:#004085;border-color:#004085}.list-group-item-secondary{color:#383d41;background-color:#d6d8db}.list-group-item-secondary.list-group-item-action:hover,.list-group-item-secondary.list-group-item-action:focus{color:#383d41;background-color:#c8cbcf}.list-group-item-secondary.list-group-item-action.active{color:#fff;background-color:#383d41;border-color:#383d41}.list-group-item-success{color:#155724;background-color:#c3e6cb}.list-group-item-success.list-group-item-action:hover,.list-group-item-success.list-group-item-action:focus{color:#155724;background-color:#b1dfbb}.list-group-item-success.list-group-item-action.active{color:#fff;background-color:#155724;border-color:#155724}.list-group-item-info{color:#0c5460;background-color:#bee5eb}.list-group-item-info.list-group-item-action:hover,.list-group-item-info.list-group-item-action:focus{color:#0c5460;background-color:#abdde5}.list-group-item-info.list-group-item-action.active{color:#fff;background-color:#0c5460;border-color:#0c5460}.list-group-item-warning{color:#856404;background-color:#ffeeba}.list-group-item-warning.list-group-item-action:hover,.list-group-item-warning.list-group-item-action:focus{color:#856404;background-color:#ffe8a1}.list-group-item-warning.list-group-item-action.active{color:#fff;background-color:#856404;border-color:#856404}.list-group-item-danger{color:#721c24;background-color:#f5c6cb}.list-group-item-danger.list-group-item-action:hover,.list-group-item-danger.list-group-item-action:focus{color:#721c24;background-color:#f1b0b7}.list-group-item-danger.list-group-item-action.active{color:#fff;background-color:#721c24;border-color:#721c24}.list-group-item-light{color:#818182;background-color:#fdfdfe}.list-group-item-light.list-group-item-action:hover,.list-group-item-light.list-group-item-action:focus{color:#818182;background-color:#ececf6}.list-group-item-light.list-group-item-action.active{color:#fff;background-color:#818182;border-color:#818182}.list-group-item-dark{color:#1b1e21;background-color:#c6c8ca}.list-group-item-dark.list-group-item-action:hover,.list-group-item-dark.list-group-item-action:focus{color:#1b1e21;background-color:#b9bbbe}.list-group-item-dark.list-group-item-action.active{color:#fff;background-color:#1b1e21;border-color:#1b1e21}.close{float:right;font-size:1.5rem;font-weight:700;line-height:1;color:#000;text-shadow:0 1px 0 #fff;opacity:.5}.close:hover{color:#000;text-decoration:none}.close:not(:disabled):not(.disabled):hover,.close:not(:disabled):not(.disabled):focus{opacity:.75}button.close{padding:0;background-color:transparent;border:0;-webkit-appearance:none;-moz-appearance:none;appearance:none}a.close.disabled{pointer-events:none}.toast{max-width:350px;overflow:hidden;font-size:.875rem;background-color:rgba(255,255,255,0.85);background-clip:padding-box;border:1px solid rgba(0,0,0,0.1);box-shadow:0 0.25rem 0.75rem rgba(0,0,0,0.1);-webkit-backdrop-filter:blur(10px);backdrop-filter:blur(10px);opacity:0;border-radius:.25rem}.toast:not(:last-child){margin-bottom:.75rem}.toast.showing{opacity:1}.toast.show{display:block;opacity:1}.toast.hide{display:none}.toast-header{display:flex;align-items:center;padding:.25rem 
.75rem;color:#6c757d;background-color:rgba(255,255,255,0.85);background-clip:padding-box;border-bottom:1px solid rgba(0,0,0,0.05)}.toast-body{padding:.75rem}.modal-open{overflow:hidden}.modal-open .modal{overflow-x:hidden;overflow-y:auto}.modal{position:fixed;top:0;left:0;z-index:1050;display:none;width:100%;height:100%;overflow:hidden;outline:0}.modal-dialog{position:relative;width:auto;margin:.5rem;pointer-events:none}.modal.fade .modal-dialog{transition:transform 0.3s ease-out;transform:translate(0, -50px)}@media (prefers-reduced-motion: reduce){.modal.fade .modal-dialog{transition:none}}.modal.show .modal-dialog{transform:none}.modal-dialog-scrollable{display:flex;max-height:calc(100% - 1rem)}.modal-dialog-scrollable .modal-content{max-height:calc(100vh - 1rem);overflow:hidden}.modal-dialog-scrollable .modal-header,.modal-dialog-scrollable .modal-footer{flex-shrink:0}.modal-dialog-scrollable .modal-body{overflow-y:auto}.modal-dialog-centered{display:flex;align-items:center;min-height:calc(100% - 1rem)}.modal-dialog-centered::before{display:block;height:calc(100vh - 1rem);content:""}.modal-dialog-centered.modal-dialog-scrollable{flex-direction:column;justify-content:center;height:100%}.modal-dialog-centered.modal-dialog-scrollable .modal-content{max-height:none}.modal-dialog-centered.modal-dialog-scrollable::before{content:none}.modal-content{position:relative;display:flex;flex-direction:column;width:100%;pointer-events:auto;background-color:#fff;background-clip:padding-box;border:1px solid rgba(0,0,0,0.2);border-radius:.3rem;outline:0}.modal-backdrop{position:fixed;top:0;left:0;z-index:1040;width:100vw;height:100vh;background-color:#000}.modal-backdrop.fade{opacity:0}.modal-backdrop.show{opacity:.5}.modal-header{display:flex;align-items:flex-start;justify-content:space-between;padding:1rem 1rem;border-bottom:1px solid #dee2e6;border-top-left-radius:.3rem;border-top-right-radius:.3rem}.modal-header .close{padding:1rem 1rem;margin:-1rem -1rem -1rem auto}.modal-title{margin-bottom:0;line-height:1.5}.modal-body{position:relative;flex:1 1 auto;padding:1rem}.modal-footer{display:flex;align-items:center;justify-content:flex-end;padding:1rem;border-top:1px solid #dee2e6;border-bottom-right-radius:.3rem;border-bottom-left-radius:.3rem}.modal-footer>:not(:first-child){margin-left:.25rem}.modal-footer>:not(:last-child){margin-right:.25rem}.modal-scrollbar-measure{position:absolute;top:-9999px;width:50px;height:50px;overflow:scroll}@media (min-width: 576px){.modal-dialog{max-width:500px;margin:1.75rem auto}.modal-dialog-scrollable{max-height:calc(100% - 3.5rem)}.modal-dialog-scrollable .modal-content{max-height:calc(100vh - 3.5rem)}.modal-dialog-centered{min-height:calc(100% - 3.5rem)}.modal-dialog-centered::before{height:calc(100vh - 3.5rem)}.modal-sm{max-width:300px}}@media (min-width: 992px){.modal-lg,.modal-xl{max-width:800px}}@media (min-width: 1200px){.modal-xl{max-width:1140px}}.tooltip{position:absolute;z-index:1070;display:block;margin:0;font-family:-apple-system,BlinkMacSystemFont,"Segoe UI",Roboto,"Helvetica Neue",Arial,"Noto Sans",sans-serif,"Apple Color Emoji","Segoe UI Emoji","Segoe UI Symbol","Noto Color Emoji";font-style:normal;font-weight:400;line-height:1.5;text-align:left;text-align:start;text-decoration:none;text-shadow:none;text-transform:none;letter-spacing:normal;word-break:normal;word-spacing:normal;white-space:normal;line-break:auto;font-size:.875rem;word-wrap:break-word;opacity:0}.tooltip.show{opacity:.9}.tooltip 
.arrow{position:absolute;display:block;width:.8rem;height:.4rem}.tooltip .arrow::before{position:absolute;content:"";border-color:transparent;border-style:solid}.bs-tooltip-top,.bs-tooltip-auto[x-placement^="top"]{padding:.4rem 0}.bs-tooltip-top .arrow,.bs-tooltip-auto[x-placement^="top"] .arrow{bottom:0}.bs-tooltip-top .arrow::before,.bs-tooltip-auto[x-placement^="top"] .arrow::before{top:0;border-width:.4rem .4rem 0;border-top-color:#000}.bs-tooltip-right,.bs-tooltip-auto[x-placement^="right"]{padding:0 .4rem}.bs-tooltip-right .arrow,.bs-tooltip-auto[x-placement^="right"] .arrow{left:0;width:.4rem;height:.8rem}.bs-tooltip-right .arrow::before,.bs-tooltip-auto[x-placement^="right"] .arrow::before{right:0;border-width:.4rem .4rem .4rem 0;border-right-color:#000}.bs-tooltip-bottom,.bs-tooltip-auto[x-placement^="bottom"]{padding:.4rem 0}.bs-tooltip-bottom .arrow,.bs-tooltip-auto[x-placement^="bottom"] .arrow{top:0}.bs-tooltip-bottom .arrow::before,.bs-tooltip-auto[x-placement^="bottom"] .arrow::before{bottom:0;border-width:0 .4rem .4rem;border-bottom-color:#000}.bs-tooltip-left,.bs-tooltip-auto[x-placement^="left"]{padding:0 .4rem}.bs-tooltip-left .arrow,.bs-tooltip-auto[x-placement^="left"] .arrow{right:0;width:.4rem;height:.8rem}.bs-tooltip-left .arrow::before,.bs-tooltip-auto[x-placement^="left"] .arrow::before{left:0;border-width:.4rem 0 .4rem .4rem;border-left-color:#000}.tooltip-inner{max-width:200px;padding:.25rem .5rem;color:#fff;text-align:center;background-color:#000;border-radius:.25rem}.popover{position:absolute;top:0;left:0;z-index:1060;display:block;max-width:276px;font-family:-apple-system,BlinkMacSystemFont,"Segoe UI",Roboto,"Helvetica Neue",Arial,"Noto Sans",sans-serif,"Apple Color Emoji","Segoe UI Emoji","Segoe UI Symbol","Noto Color Emoji";font-style:normal;font-weight:400;line-height:1.5;text-align:left;text-align:start;text-decoration:none;text-shadow:none;text-transform:none;letter-spacing:normal;word-break:normal;word-spacing:normal;white-space:normal;line-break:auto;font-size:.875rem;word-wrap:break-word;background-color:#fff;background-clip:padding-box;border:1px solid rgba(0,0,0,0.2);border-radius:.3rem}.popover .arrow{position:absolute;display:block;width:1rem;height:.5rem;margin:0 .3rem}.popover .arrow::before,.popover .arrow::after{position:absolute;display:block;content:"";border-color:transparent;border-style:solid}.bs-popover-top,.bs-popover-auto[x-placement^="top"]{margin-bottom:.5rem}.bs-popover-top>.arrow,.bs-popover-auto[x-placement^="top"]>.arrow{bottom:calc((.5rem + 1px) * -1)}.bs-popover-top>.arrow::before,.bs-popover-auto[x-placement^="top"]>.arrow::before{bottom:0;border-width:.5rem .5rem 0;border-top-color:rgba(0,0,0,0.25)}.bs-popover-top>.arrow::after,.bs-popover-auto[x-placement^="top"]>.arrow::after{bottom:1px;border-width:.5rem .5rem 0;border-top-color:#fff}.bs-popover-right,.bs-popover-auto[x-placement^="right"]{margin-left:.5rem}.bs-popover-right>.arrow,.bs-popover-auto[x-placement^="right"]>.arrow{left:calc((.5rem + 1px) * -1);width:.5rem;height:1rem;margin:.3rem 0}.bs-popover-right>.arrow::before,.bs-popover-auto[x-placement^="right"]>.arrow::before{left:0;border-width:.5rem .5rem .5rem 0;border-right-color:rgba(0,0,0,0.25)}.bs-popover-right>.arrow::after,.bs-popover-auto[x-placement^="right"]>.arrow::after{left:1px;border-width:.5rem .5rem .5rem 0;border-right-color:#fff}.bs-popover-bottom,.bs-popover-auto[x-placement^="bottom"]{margin-top:.5rem}.bs-popover-bottom>.arrow,.bs-popover-auto[x-placement^="bottom"]>.arrow{top:calc((.5rem + 1px) * 
-1)}.bs-popover-bottom>.arrow::before,.bs-popover-auto[x-placement^="bottom"]>.arrow::before{top:0;border-width:0 .5rem .5rem .5rem;border-bottom-color:rgba(0,0,0,0.25)}.bs-popover-bottom>.arrow::after,.bs-popover-auto[x-placement^="bottom"]>.arrow::after{top:1px;border-width:0 .5rem .5rem .5rem;border-bottom-color:#fff}.bs-popover-bottom .popover-header::before,.bs-popover-auto[x-placement^="bottom"] .popover-header::before{position:absolute;top:0;left:50%;display:block;width:1rem;margin-left:-.5rem;content:"";border-bottom:1px solid #f7f7f7}.bs-popover-left,.bs-popover-auto[x-placement^="left"]{margin-right:.5rem}.bs-popover-left>.arrow,.bs-popover-auto[x-placement^="left"]>.arrow{right:calc((.5rem + 1px) * -1);width:.5rem;height:1rem;margin:.3rem 0}.bs-popover-left>.arrow::before,.bs-popover-auto[x-placement^="left"]>.arrow::before{right:0;border-width:.5rem 0 .5rem .5rem;border-left-color:rgba(0,0,0,0.25)}.bs-popover-left>.arrow::after,.bs-popover-auto[x-placement^="left"]>.arrow::after{right:1px;border-width:.5rem 0 .5rem .5rem;border-left-color:#fff}.popover-header{padding:.5rem .75rem;margin-bottom:0;font-size:1rem;background-color:#f7f7f7;border-bottom:1px solid #ebebeb;border-top-left-radius:calc(.3rem - 1px);border-top-right-radius:calc(.3rem - 1px)}.popover-header:empty{display:none}.popover-body{padding:.5rem .75rem;color:#212529}.carousel{position:relative}.carousel.pointer-event{touch-action:pan-y}.carousel-inner{position:relative;width:100%;overflow:hidden}.carousel-inner::after{display:block;clear:both;content:""}.carousel-item{position:relative;display:none;float:left;width:100%;margin-right:-100%;-webkit-backface-visibility:hidden;backface-visibility:hidden;transition:transform .6s ease-in-out}@media (prefers-reduced-motion: reduce){.carousel-item{transition:none}}.carousel-item.active,.carousel-item-next,.carousel-item-prev{display:block}.carousel-item-next:not(.carousel-item-left),.active.carousel-item-right{transform:translateX(100%)}.carousel-item-prev:not(.carousel-item-right),.active.carousel-item-left{transform:translateX(-100%)}.carousel-fade .carousel-item{opacity:0;transition-property:opacity;transform:none}.carousel-fade .carousel-item.active,.carousel-fade .carousel-item-next.carousel-item-left,.carousel-fade .carousel-item-prev.carousel-item-right{z-index:1;opacity:1}.carousel-fade .active.carousel-item-left,.carousel-fade .active.carousel-item-right{z-index:0;opacity:0;transition:0s .6s opacity}@media (prefers-reduced-motion: reduce){.carousel-fade .active.carousel-item-left,.carousel-fade .active.carousel-item-right{transition:none}}.carousel-control-prev,.carousel-control-next{position:absolute;top:0;bottom:0;z-index:1;display:flex;align-items:center;justify-content:center;width:15%;color:#fff;text-align:center;opacity:.5;transition:opacity 0.15s ease}@media (prefers-reduced-motion: reduce){.carousel-control-prev,.carousel-control-next{transition:none}}.carousel-control-prev:hover,.carousel-control-prev:focus,.carousel-control-next:hover,.carousel-control-next:focus{color:#fff;text-decoration:none;outline:0;opacity:.9}.carousel-control-prev{left:0}.carousel-control-next{right:0}.carousel-control-prev-icon,.carousel-control-next-icon{display:inline-block;width:20px;height:20px;background:no-repeat 50% / 100% 100%}.carousel-control-prev-icon{background-image:url("data:image/svg+xml,%3csvg xmlns='http://www.w3.org/2000/svg' fill='%23fff' viewBox='0 0 8 8'%3e%3cpath d='M5.25 0l-4 4 4 4 1.5-1.5-2.5-2.5 
2.5-2.5-1.5-1.5z'/%3e%3c/svg%3e")}.carousel-control-next-icon{background-image:url("data:image/svg+xml,%3csvg xmlns='http://www.w3.org/2000/svg' fill='%23fff' viewBox='0 0 8 8'%3e%3cpath d='M2.75 0l-1.5 1.5 2.5 2.5-2.5 2.5 1.5 1.5 4-4-4-4z'/%3e%3c/svg%3e")}.carousel-indicators{position:absolute;right:0;bottom:0;left:0;z-index:15;display:flex;justify-content:center;padding-left:0;margin-right:15%;margin-left:15%;list-style:none}.carousel-indicators li{box-sizing:content-box;flex:0 1 auto;width:30px;height:3px;margin-right:3px;margin-left:3px;text-indent:-999px;cursor:pointer;background-color:#fff;background-clip:padding-box;border-top:10px solid transparent;border-bottom:10px solid transparent;opacity:.5;transition:opacity 0.6s ease}@media (prefers-reduced-motion: reduce){.carousel-indicators li{transition:none}}.carousel-indicators .active{opacity:1}.carousel-caption{position:absolute;right:15%;bottom:20px;left:15%;z-index:10;padding-top:20px;padding-bottom:20px;color:#fff;text-align:center}@-webkit-keyframes spinner-border{to{transform:rotate(360deg)}}@keyframes spinner-border{to{transform:rotate(360deg)}}.spinner-border{display:inline-block;width:2rem;height:2rem;vertical-align:text-bottom;border:.25em solid currentColor;border-right-color:transparent;border-radius:50%;-webkit-animation:spinner-border .75s linear infinite;animation:spinner-border .75s linear infinite}.spinner-border-sm{width:1rem;height:1rem;border-width:.2em}@-webkit-keyframes spinner-grow{0%{transform:scale(0)}50%{opacity:1}}@keyframes spinner-grow{0%{transform:scale(0)}50%{opacity:1}}.spinner-grow{display:inline-block;width:2rem;height:2rem;vertical-align:text-bottom;background-color:currentColor;border-radius:50%;opacity:0;-webkit-animation:spinner-grow .75s linear infinite;animation:spinner-grow .75s linear infinite}.spinner-grow-sm{width:1rem;height:1rem}.align-baseline{vertical-align:baseline !important}.align-top{vertical-align:top !important}.align-middle{vertical-align:middle !important}.align-bottom{vertical-align:bottom !important}.align-text-bottom{vertical-align:text-bottom !important}.align-text-top{vertical-align:text-top !important}.bg-primary{background-color:#007bff !important}a.bg-primary:hover,a.bg-primary:focus,button.bg-primary:hover,button.bg-primary:focus{background-color:#0062cc !important}.bg-secondary{background-color:#6c757d !important}a.bg-secondary:hover,a.bg-secondary:focus,button.bg-secondary:hover,button.bg-secondary:focus{background-color:#545b62 !important}.bg-success{background-color:#28a745 !important}a.bg-success:hover,a.bg-success:focus,button.bg-success:hover,button.bg-success:focus{background-color:#1e7e34 !important}.bg-info{background-color:#17a2b8 !important}a.bg-info:hover,a.bg-info:focus,button.bg-info:hover,button.bg-info:focus{background-color:#117a8b !important}.bg-warning{background-color:#ffc107 !important}a.bg-warning:hover,a.bg-warning:focus,button.bg-warning:hover,button.bg-warning:focus{background-color:#d39e00 !important}.bg-danger{background-color:#dc3545 !important}a.bg-danger:hover,a.bg-danger:focus,button.bg-danger:hover,button.bg-danger:focus{background-color:#bd2130 !important}.bg-light{background-color:#f8f9fa !important}a.bg-light:hover,a.bg-light:focus,button.bg-light:hover,button.bg-light:focus{background-color:#dae0e5 !important}.bg-dark{background-color:#343a40 !important}a.bg-dark:hover,a.bg-dark:focus,button.bg-dark:hover,button.bg-dark:focus{background-color:#1d2124 !important}.bg-white{background-color:#fff 
!important}.bg-transparent{background-color:transparent !important}.border{border:1px solid #dee2e6 !important}.border-top{border-top:1px solid #dee2e6 !important}.border-right{border-right:1px solid #dee2e6 !important}.border-bottom{border-bottom:1px solid #dee2e6 !important}.border-left{border-left:1px solid #dee2e6 !important}.border-0{border:0 !important}.border-top-0{border-top:0 !important}.border-right-0{border-right:0 !important}.border-bottom-0{border-bottom:0 !important}.border-left-0{border-left:0 !important}.border-primary{border-color:#007bff !important}.border-secondary{border-color:#6c757d !important}.border-success{border-color:#28a745 !important}.border-info{border-color:#17a2b8 !important}.border-warning{border-color:#ffc107 !important}.border-danger{border-color:#dc3545 !important}.border-light{border-color:#f8f9fa !important}.border-dark{border-color:#343a40 !important}.border-white{border-color:#fff !important}.rounded-sm{border-radius:.2rem !important}.rounded{border-radius:.25rem !important}.rounded-top{border-top-left-radius:.25rem !important;border-top-right-radius:.25rem !important}.rounded-right{border-top-right-radius:.25rem !important;border-bottom-right-radius:.25rem !important}.rounded-bottom{border-bottom-right-radius:.25rem !important;border-bottom-left-radius:.25rem !important}.rounded-left{border-top-left-radius:.25rem !important;border-bottom-left-radius:.25rem !important}.rounded-lg{border-radius:.3rem !important}.rounded-circle{border-radius:50% !important}.rounded-pill{border-radius:50rem !important}.rounded-0{border-radius:0 !important}.clearfix::after{display:block;clear:both;content:""}.d-none{display:none !important}.d-inline{display:inline !important}.d-inline-block{display:inline-block !important}.d-block{display:block !important}.d-table{display:table !important}.d-table-row{display:table-row !important}.d-table-cell{display:table-cell !important}.d-flex{display:flex !important}.d-inline-flex{display:inline-flex !important}@media (min-width: 576px){.d-sm-none{display:none !important}.d-sm-inline{display:inline !important}.d-sm-inline-block{display:inline-block !important}.d-sm-block{display:block !important}.d-sm-table{display:table !important}.d-sm-table-row{display:table-row !important}.d-sm-table-cell{display:table-cell !important}.d-sm-flex{display:flex !important}.d-sm-inline-flex{display:inline-flex !important}}@media (min-width: 768px){.d-md-none{display:none !important}.d-md-inline{display:inline !important}.d-md-inline-block{display:inline-block !important}.d-md-block{display:block !important}.d-md-table{display:table !important}.d-md-table-row{display:table-row !important}.d-md-table-cell{display:table-cell !important}.d-md-flex{display:flex !important}.d-md-inline-flex{display:inline-flex !important}}@media (min-width: 992px){.d-lg-none{display:none !important}.d-lg-inline{display:inline !important}.d-lg-inline-block{display:inline-block !important}.d-lg-block{display:block !important}.d-lg-table{display:table !important}.d-lg-table-row{display:table-row !important}.d-lg-table-cell{display:table-cell !important}.d-lg-flex{display:flex !important}.d-lg-inline-flex{display:inline-flex !important}}@media (min-width: 1200px){.d-xl-none{display:none !important}.d-xl-inline{display:inline !important}.d-xl-inline-block{display:inline-block !important}.d-xl-block{display:block !important}.d-xl-table{display:table !important}.d-xl-table-row{display:table-row !important}.d-xl-table-cell{display:table-cell !important}.d-xl-flex{display:flex 
!important}.d-xl-inline-flex{display:inline-flex !important}}@media print{.d-print-none{display:none !important}.d-print-inline{display:inline !important}.d-print-inline-block{display:inline-block !important}.d-print-block{display:block !important}.d-print-table{display:table !important}.d-print-table-row{display:table-row !important}.d-print-table-cell{display:table-cell !important}.d-print-flex{display:flex !important}.d-print-inline-flex{display:inline-flex !important}}.embed-responsive{position:relative;display:block;width:100%;padding:0;overflow:hidden}.embed-responsive::before{display:block;content:""}.embed-responsive .embed-responsive-item,.embed-responsive iframe,.embed-responsive embed,.embed-responsive object,.embed-responsive video{position:absolute;top:0;bottom:0;left:0;width:100%;height:100%;border:0}.embed-responsive-21by9::before{padding-top:42.8571428571%}.embed-responsive-16by9::before{padding-top:56.25%}.embed-responsive-4by3::before{padding-top:75%}.embed-responsive-1by1::before{padding-top:100%}.flex-row{flex-direction:row !important}.flex-column{flex-direction:column !important}.flex-row-reverse{flex-direction:row-reverse !important}.flex-column-reverse{flex-direction:column-reverse !important}.flex-wrap{flex-wrap:wrap !important}.flex-nowrap{flex-wrap:nowrap !important}.flex-wrap-reverse{flex-wrap:wrap-reverse !important}.flex-fill{flex:1 1 auto !important}.flex-grow-0{flex-grow:0 !important}.flex-grow-1{flex-grow:1 !important}.flex-shrink-0{flex-shrink:0 !important}.flex-shrink-1{flex-shrink:1 !important}.justify-content-start{justify-content:flex-start !important}.justify-content-end{justify-content:flex-end !important}.justify-content-center{justify-content:center !important}.justify-content-between{justify-content:space-between !important}.justify-content-around{justify-content:space-around !important}.align-items-start{align-items:flex-start !important}.align-items-end{align-items:flex-end !important}.align-items-center{align-items:center !important}.align-items-baseline{align-items:baseline !important}.align-items-stretch{align-items:stretch !important}.align-content-start{align-content:flex-start !important}.align-content-end{align-content:flex-end !important}.align-content-center{align-content:center !important}.align-content-between{align-content:space-between !important}.align-content-around{align-content:space-around !important}.align-content-stretch{align-content:stretch !important}.align-self-auto{align-self:auto !important}.align-self-start{align-self:flex-start !important}.align-self-end{align-self:flex-end !important}.align-self-center{align-self:center !important}.align-self-baseline{align-self:baseline !important}.align-self-stretch{align-self:stretch !important}@media (min-width: 576px){.flex-sm-row{flex-direction:row !important}.flex-sm-column{flex-direction:column !important}.flex-sm-row-reverse{flex-direction:row-reverse !important}.flex-sm-column-reverse{flex-direction:column-reverse !important}.flex-sm-wrap{flex-wrap:wrap !important}.flex-sm-nowrap{flex-wrap:nowrap !important}.flex-sm-wrap-reverse{flex-wrap:wrap-reverse !important}.flex-sm-fill{flex:1 1 auto !important}.flex-sm-grow-0{flex-grow:0 !important}.flex-sm-grow-1{flex-grow:1 !important}.flex-sm-shrink-0{flex-shrink:0 !important}.flex-sm-shrink-1{flex-shrink:1 !important}.justify-content-sm-start{justify-content:flex-start !important}.justify-content-sm-end{justify-content:flex-end !important}.justify-content-sm-center{justify-content:center 
!important}.justify-content-sm-between{justify-content:space-between !important}.justify-content-sm-around{justify-content:space-around !important}.align-items-sm-start{align-items:flex-start !important}.align-items-sm-end{align-items:flex-end !important}.align-items-sm-center{align-items:center !important}.align-items-sm-baseline{align-items:baseline !important}.align-items-sm-stretch{align-items:stretch !important}.align-content-sm-start{align-content:flex-start !important}.align-content-sm-end{align-content:flex-end !important}.align-content-sm-center{align-content:center !important}.align-content-sm-between{align-content:space-between !important}.align-content-sm-around{align-content:space-around !important}.align-content-sm-stretch{align-content:stretch !important}.align-self-sm-auto{align-self:auto !important}.align-self-sm-start{align-self:flex-start !important}.align-self-sm-end{align-self:flex-end !important}.align-self-sm-center{align-self:center !important}.align-self-sm-baseline{align-self:baseline !important}.align-self-sm-stretch{align-self:stretch !important}}@media (min-width: 768px){.flex-md-row{flex-direction:row !important}.flex-md-column{flex-direction:column !important}.flex-md-row-reverse{flex-direction:row-reverse !important}.flex-md-column-reverse{flex-direction:column-reverse !important}.flex-md-wrap{flex-wrap:wrap !important}.flex-md-nowrap{flex-wrap:nowrap !important}.flex-md-wrap-reverse{flex-wrap:wrap-reverse !important}.flex-md-fill{flex:1 1 auto !important}.flex-md-grow-0{flex-grow:0 !important}.flex-md-grow-1{flex-grow:1 !important}.flex-md-shrink-0{flex-shrink:0 !important}.flex-md-shrink-1{flex-shrink:1 !important}.justify-content-md-start{justify-content:flex-start !important}.justify-content-md-end{justify-content:flex-end !important}.justify-content-md-center{justify-content:center !important}.justify-content-md-between{justify-content:space-between !important}.justify-content-md-around{justify-content:space-around !important}.align-items-md-start{align-items:flex-start !important}.align-items-md-end{align-items:flex-end !important}.align-items-md-center{align-items:center !important}.align-items-md-baseline{align-items:baseline !important}.align-items-md-stretch{align-items:stretch !important}.align-content-md-start{align-content:flex-start !important}.align-content-md-end{align-content:flex-end !important}.align-content-md-center{align-content:center !important}.align-content-md-between{align-content:space-between !important}.align-content-md-around{align-content:space-around !important}.align-content-md-stretch{align-content:stretch !important}.align-self-md-auto{align-self:auto !important}.align-self-md-start{align-self:flex-start !important}.align-self-md-end{align-self:flex-end !important}.align-self-md-center{align-self:center !important}.align-self-md-baseline{align-self:baseline !important}.align-self-md-stretch{align-self:stretch !important}}@media (min-width: 992px){.flex-lg-row{flex-direction:row !important}.flex-lg-column{flex-direction:column !important}.flex-lg-row-reverse{flex-direction:row-reverse !important}.flex-lg-column-reverse{flex-direction:column-reverse !important}.flex-lg-wrap{flex-wrap:wrap !important}.flex-lg-nowrap{flex-wrap:nowrap !important}.flex-lg-wrap-reverse{flex-wrap:wrap-reverse !important}.flex-lg-fill{flex:1 1 auto !important}.flex-lg-grow-0{flex-grow:0 !important}.flex-lg-grow-1{flex-grow:1 !important}.flex-lg-shrink-0{flex-shrink:0 !important}.flex-lg-shrink-1{flex-shrink:1 
!important}.justify-content-lg-start{justify-content:flex-start !important}.justify-content-lg-end{justify-content:flex-end !important}.justify-content-lg-center{justify-content:center !important}.justify-content-lg-between{justify-content:space-between !important}.justify-content-lg-around{justify-content:space-around !important}.align-items-lg-start{align-items:flex-start !important}.align-items-lg-end{align-items:flex-end !important}.align-items-lg-center{align-items:center !important}.align-items-lg-baseline{align-items:baseline !important}.align-items-lg-stretch{align-items:stretch !important}.align-content-lg-start{align-content:flex-start !important}.align-content-lg-end{align-content:flex-end !important}.align-content-lg-center{align-content:center !important}.align-content-lg-between{align-content:space-between !important}.align-content-lg-around{align-content:space-around !important}.align-content-lg-stretch{align-content:stretch !important}.align-self-lg-auto{align-self:auto !important}.align-self-lg-start{align-self:flex-start !important}.align-self-lg-end{align-self:flex-end !important}.align-self-lg-center{align-self:center !important}.align-self-lg-baseline{align-self:baseline !important}.align-self-lg-stretch{align-self:stretch !important}}@media (min-width: 1200px){.flex-xl-row{flex-direction:row !important}.flex-xl-column{flex-direction:column !important}.flex-xl-row-reverse{flex-direction:row-reverse !important}.flex-xl-column-reverse{flex-direction:column-reverse !important}.flex-xl-wrap{flex-wrap:wrap !important}.flex-xl-nowrap{flex-wrap:nowrap !important}.flex-xl-wrap-reverse{flex-wrap:wrap-reverse !important}.flex-xl-fill{flex:1 1 auto !important}.flex-xl-grow-0{flex-grow:0 !important}.flex-xl-grow-1{flex-grow:1 !important}.flex-xl-shrink-0{flex-shrink:0 !important}.flex-xl-shrink-1{flex-shrink:1 !important}.justify-content-xl-start{justify-content:flex-start !important}.justify-content-xl-end{justify-content:flex-end !important}.justify-content-xl-center{justify-content:center !important}.justify-content-xl-between{justify-content:space-between !important}.justify-content-xl-around{justify-content:space-around !important}.align-items-xl-start{align-items:flex-start !important}.align-items-xl-end{align-items:flex-end !important}.align-items-xl-center{align-items:center !important}.align-items-xl-baseline{align-items:baseline !important}.align-items-xl-stretch{align-items:stretch !important}.align-content-xl-start{align-content:flex-start !important}.align-content-xl-end{align-content:flex-end !important}.align-content-xl-center{align-content:center !important}.align-content-xl-between{align-content:space-between !important}.align-content-xl-around{align-content:space-around !important}.align-content-xl-stretch{align-content:stretch !important}.align-self-xl-auto{align-self:auto !important}.align-self-xl-start{align-self:flex-start !important}.align-self-xl-end{align-self:flex-end !important}.align-self-xl-center{align-self:center !important}.align-self-xl-baseline{align-self:baseline !important}.align-self-xl-stretch{align-self:stretch !important}}.float-left{float:left !important}.float-right{float:right !important}.float-none{float:none !important}@media (min-width: 576px){.float-sm-left{float:left !important}.float-sm-right{float:right !important}.float-sm-none{float:none !important}}@media (min-width: 768px){.float-md-left{float:left !important}.float-md-right{float:right !important}.float-md-none{float:none !important}}@media (min-width: 
992px){.float-lg-left{float:left !important}.float-lg-right{float:right !important}.float-lg-none{float:none !important}}@media (min-width: 1200px){.float-xl-left{float:left !important}.float-xl-right{float:right !important}.float-xl-none{float:none !important}}.overflow-auto{overflow:auto !important}.overflow-hidden{overflow:hidden !important}.position-static{position:static !important}.position-relative{position:relative !important}.position-absolute{position:absolute !important}.position-fixed{position:fixed !important}.position-sticky{position:-webkit-sticky !important;position:sticky !important}.fixed-top{position:fixed;top:0;right:0;left:0;z-index:1030}.fixed-bottom{position:fixed;right:0;bottom:0;left:0;z-index:1030}@supports ((position: -webkit-sticky) or (position: sticky)){.sticky-top{position:-webkit-sticky;position:sticky;top:0;z-index:1020}}.sr-only{position:absolute;width:1px;height:1px;padding:0;overflow:hidden;clip:rect(0, 0, 0, 0);white-space:nowrap;border:0}.sr-only-focusable:active,.sr-only-focusable:focus{position:static;width:auto;height:auto;overflow:visible;clip:auto;white-space:normal}.shadow-sm{box-shadow:0 0.125rem 0.25rem rgba(0,0,0,0.075) !important}.shadow{box-shadow:0 0.5rem 1rem rgba(0,0,0,0.15) !important}.shadow-lg{box-shadow:0 1rem 3rem rgba(0,0,0,0.175) !important}.shadow-none{box-shadow:none !important}.w-25{width:25% !important}.w-50{width:50% !important}.w-75{width:75% !important}.w-100{width:100% !important}.w-auto{width:auto !important}.h-25{height:25% !important}.h-50{height:50% !important}.h-75{height:75% !important}.h-100{height:100% !important}.h-auto{height:auto !important}.mw-100{max-width:100% !important}.mh-100{max-height:100% !important}.min-vw-100{min-width:100vw !important}.min-vh-100{min-height:100vh !important}.vw-100{width:100vw !important}.vh-100{height:100vh !important}.stretched-link::after{position:absolute;top:0;right:0;bottom:0;left:0;z-index:1;pointer-events:auto;content:"";background-color:transparent}.m-0{margin:0 !important}.mt-0,.my-0{margin-top:0 !important}.mr-0,.mx-0{margin-right:0 !important}.mb-0,.my-0{margin-bottom:0 !important}.ml-0,.mx-0{margin-left:0 !important}.m-1{margin:.25rem !important}.mt-1,.my-1{margin-top:.25rem !important}.mr-1,.mx-1{margin-right:.25rem !important}.mb-1,.my-1{margin-bottom:.25rem !important}.ml-1,.mx-1{margin-left:.25rem !important}.m-2{margin:.5rem !important}.mt-2,.my-2{margin-top:.5rem !important}.mr-2,.mx-2{margin-right:.5rem !important}.mb-2,.my-2{margin-bottom:.5rem !important}.ml-2,.mx-2{margin-left:.5rem !important}.m-3{margin:1rem !important}.mt-3,.my-3{margin-top:1rem !important}.mr-3,.mx-3{margin-right:1rem !important}.mb-3,.my-3{margin-bottom:1rem !important}.ml-3,.mx-3{margin-left:1rem !important}.m-4{margin:1.5rem !important}.mt-4,.my-4{margin-top:1.5rem !important}.mr-4,.mx-4{margin-right:1.5rem !important}.mb-4,.my-4{margin-bottom:1.5rem !important}.ml-4,.mx-4{margin-left:1.5rem !important}.m-5{margin:3rem !important}.mt-5,.my-5{margin-top:3rem !important}.mr-5,.mx-5{margin-right:3rem !important}.mb-5,.my-5{margin-bottom:3rem !important}.ml-5,.mx-5{margin-left:3rem !important}.p-0{padding:0 !important}.pt-0,.py-0{padding-top:0 !important}.pr-0,.px-0{padding-right:0 !important}.pb-0,.py-0{padding-bottom:0 !important}.pl-0,.px-0{padding-left:0 !important}.p-1{padding:.25rem !important}.pt-1,.py-1{padding-top:.25rem !important}.pr-1,.px-1{padding-right:.25rem !important}.pb-1,.py-1{padding-bottom:.25rem !important}.pl-1,.px-1{padding-left:.25rem !important}.p-2{padding:.5rem 
!important}.pt-2,.py-2{padding-top:.5rem !important}.pr-2,.px-2{padding-right:.5rem !important}.pb-2,.py-2{padding-bottom:.5rem !important}.pl-2,.px-2{padding-left:.5rem !important}.p-3{padding:1rem !important}.pt-3,.py-3{padding-top:1rem !important}.pr-3,.px-3{padding-right:1rem !important}.pb-3,.py-3{padding-bottom:1rem !important}.pl-3,.px-3{padding-left:1rem !important}.p-4{padding:1.5rem !important}.pt-4,.py-4{padding-top:1.5rem !important}.pr-4,.px-4{padding-right:1.5rem !important}.pb-4,.py-4{padding-bottom:1.5rem !important}.pl-4,.px-4{padding-left:1.5rem !important}.p-5{padding:3rem !important}.pt-5,.py-5{padding-top:3rem !important}.pr-5,.px-5{padding-right:3rem !important}.pb-5,.py-5{padding-bottom:3rem !important}.pl-5,.px-5{padding-left:3rem !important}.m-n1{margin:-.25rem !important}.mt-n1,.my-n1{margin-top:-.25rem !important}.mr-n1,.mx-n1{margin-right:-.25rem !important}.mb-n1,.my-n1{margin-bottom:-.25rem !important}.ml-n1,.mx-n1{margin-left:-.25rem !important}.m-n2{margin:-.5rem !important}.mt-n2,.my-n2{margin-top:-.5rem !important}.mr-n2,.mx-n2{margin-right:-.5rem !important}.mb-n2,.my-n2{margin-bottom:-.5rem !important}.ml-n2,.mx-n2{margin-left:-.5rem !important}.m-n3{margin:-1rem !important}.mt-n3,.my-n3{margin-top:-1rem !important}.mr-n3,.mx-n3{margin-right:-1rem !important}.mb-n3,.my-n3{margin-bottom:-1rem !important}.ml-n3,.mx-n3{margin-left:-1rem !important}.m-n4{margin:-1.5rem !important}.mt-n4,.my-n4{margin-top:-1.5rem !important}.mr-n4,.mx-n4{margin-right:-1.5rem !important}.mb-n4,.my-n4{margin-bottom:-1.5rem !important}.ml-n4,.mx-n4{margin-left:-1.5rem !important}.m-n5{margin:-3rem !important}.mt-n5,.my-n5{margin-top:-3rem !important}.mr-n5,.mx-n5{margin-right:-3rem !important}.mb-n5,.my-n5{margin-bottom:-3rem !important}.ml-n5,.mx-n5{margin-left:-3rem !important}.m-auto{margin:auto !important}.mt-auto,.my-auto{margin-top:auto !important}.mr-auto,.mx-auto{margin-right:auto !important}.mb-auto,.my-auto{margin-bottom:auto !important}.ml-auto,.mx-auto{margin-left:auto !important}@media (min-width: 576px){.m-sm-0{margin:0 !important}.mt-sm-0,.my-sm-0{margin-top:0 !important}.mr-sm-0,.mx-sm-0{margin-right:0 !important}.mb-sm-0,.my-sm-0{margin-bottom:0 !important}.ml-sm-0,.mx-sm-0{margin-left:0 !important}.m-sm-1{margin:.25rem !important}.mt-sm-1,.my-sm-1{margin-top:.25rem !important}.mr-sm-1,.mx-sm-1{margin-right:.25rem !important}.mb-sm-1,.my-sm-1{margin-bottom:.25rem !important}.ml-sm-1,.mx-sm-1{margin-left:.25rem !important}.m-sm-2{margin:.5rem !important}.mt-sm-2,.my-sm-2{margin-top:.5rem !important}.mr-sm-2,.mx-sm-2{margin-right:.5rem !important}.mb-sm-2,.my-sm-2{margin-bottom:.5rem !important}.ml-sm-2,.mx-sm-2{margin-left:.5rem !important}.m-sm-3{margin:1rem !important}.mt-sm-3,.my-sm-3{margin-top:1rem !important}.mr-sm-3,.mx-sm-3{margin-right:1rem !important}.mb-sm-3,.my-sm-3{margin-bottom:1rem !important}.ml-sm-3,.mx-sm-3{margin-left:1rem !important}.m-sm-4{margin:1.5rem !important}.mt-sm-4,.my-sm-4{margin-top:1.5rem !important}.mr-sm-4,.mx-sm-4{margin-right:1.5rem !important}.mb-sm-4,.my-sm-4{margin-bottom:1.5rem !important}.ml-sm-4,.mx-sm-4{margin-left:1.5rem !important}.m-sm-5{margin:3rem !important}.mt-sm-5,.my-sm-5{margin-top:3rem !important}.mr-sm-5,.mx-sm-5{margin-right:3rem !important}.mb-sm-5,.my-sm-5{margin-bottom:3rem !important}.ml-sm-5,.mx-sm-5{margin-left:3rem !important}.p-sm-0{padding:0 !important}.pt-sm-0,.py-sm-0{padding-top:0 !important}.pr-sm-0,.px-sm-0{padding-right:0 !important}.pb-sm-0,.py-sm-0{padding-bottom:0 
!important}.pl-sm-0,.px-sm-0{padding-left:0 !important}.p-sm-1{padding:.25rem !important}.pt-sm-1,.py-sm-1{padding-top:.25rem !important}.pr-sm-1,.px-sm-1{padding-right:.25rem !important}.pb-sm-1,.py-sm-1{padding-bottom:.25rem !important}.pl-sm-1,.px-sm-1{padding-left:.25rem !important}.p-sm-2{padding:.5rem !important}.pt-sm-2,.py-sm-2{padding-top:.5rem !important}.pr-sm-2,.px-sm-2{padding-right:.5rem !important}.pb-sm-2,.py-sm-2{padding-bottom:.5rem !important}.pl-sm-2,.px-sm-2{padding-left:.5rem !important}.p-sm-3{padding:1rem !important}.pt-sm-3,.py-sm-3{padding-top:1rem !important}.pr-sm-3,.px-sm-3{padding-right:1rem !important}.pb-sm-3,.py-sm-3{padding-bottom:1rem !important}.pl-sm-3,.px-sm-3{padding-left:1rem !important}.p-sm-4{padding:1.5rem !important}.pt-sm-4,.py-sm-4{padding-top:1.5rem !important}.pr-sm-4,.px-sm-4{padding-right:1.5rem !important}.pb-sm-4,.py-sm-4{padding-bottom:1.5rem !important}.pl-sm-4,.px-sm-4{padding-left:1.5rem !important}.p-sm-5{padding:3rem !important}.pt-sm-5,.py-sm-5{padding-top:3rem !important}.pr-sm-5,.px-sm-5{padding-right:3rem !important}.pb-sm-5,.py-sm-5{padding-bottom:3rem !important}.pl-sm-5,.px-sm-5{padding-left:3rem !important}.m-sm-n1{margin:-.25rem !important}.mt-sm-n1,.my-sm-n1{margin-top:-.25rem !important}.mr-sm-n1,.mx-sm-n1{margin-right:-.25rem !important}.mb-sm-n1,.my-sm-n1{margin-bottom:-.25rem !important}.ml-sm-n1,.mx-sm-n1{margin-left:-.25rem !important}.m-sm-n2{margin:-.5rem !important}.mt-sm-n2,.my-sm-n2{margin-top:-.5rem !important}.mr-sm-n2,.mx-sm-n2{margin-right:-.5rem !important}.mb-sm-n2,.my-sm-n2{margin-bottom:-.5rem !important}.ml-sm-n2,.mx-sm-n2{margin-left:-.5rem !important}.m-sm-n3{margin:-1rem !important}.mt-sm-n3,.my-sm-n3{margin-top:-1rem !important}.mr-sm-n3,.mx-sm-n3{margin-right:-1rem !important}.mb-sm-n3,.my-sm-n3{margin-bottom:-1rem !important}.ml-sm-n3,.mx-sm-n3{margin-left:-1rem !important}.m-sm-n4{margin:-1.5rem !important}.mt-sm-n4,.my-sm-n4{margin-top:-1.5rem !important}.mr-sm-n4,.mx-sm-n4{margin-right:-1.5rem !important}.mb-sm-n4,.my-sm-n4{margin-bottom:-1.5rem !important}.ml-sm-n4,.mx-sm-n4{margin-left:-1.5rem !important}.m-sm-n5{margin:-3rem !important}.mt-sm-n5,.my-sm-n5{margin-top:-3rem !important}.mr-sm-n5,.mx-sm-n5{margin-right:-3rem !important}.mb-sm-n5,.my-sm-n5{margin-bottom:-3rem !important}.ml-sm-n5,.mx-sm-n5{margin-left:-3rem !important}.m-sm-auto{margin:auto !important}.mt-sm-auto,.my-sm-auto{margin-top:auto !important}.mr-sm-auto,.mx-sm-auto{margin-right:auto !important}.mb-sm-auto,.my-sm-auto{margin-bottom:auto !important}.ml-sm-auto,.mx-sm-auto{margin-left:auto !important}}@media (min-width: 768px){.m-md-0{margin:0 !important}.mt-md-0,.my-md-0{margin-top:0 !important}.mr-md-0,.mx-md-0{margin-right:0 !important}.mb-md-0,.my-md-0{margin-bottom:0 !important}.ml-md-0,.mx-md-0{margin-left:0 !important}.m-md-1{margin:.25rem !important}.mt-md-1,.my-md-1{margin-top:.25rem !important}.mr-md-1,.mx-md-1{margin-right:.25rem !important}.mb-md-1,.my-md-1{margin-bottom:.25rem !important}.ml-md-1,.mx-md-1{margin-left:.25rem !important}.m-md-2{margin:.5rem !important}.mt-md-2,.my-md-2{margin-top:.5rem !important}.mr-md-2,.mx-md-2{margin-right:.5rem !important}.mb-md-2,.my-md-2{margin-bottom:.5rem !important}.ml-md-2,.mx-md-2{margin-left:.5rem !important}.m-md-3{margin:1rem !important}.mt-md-3,.my-md-3{margin-top:1rem !important}.mr-md-3,.mx-md-3{margin-right:1rem !important}.mb-md-3,.my-md-3{margin-bottom:1rem !important}.ml-md-3,.mx-md-3{margin-left:1rem !important}.m-md-4{margin:1.5rem 
!important}.mt-md-4,.my-md-4{margin-top:1.5rem !important}.mr-md-4,.mx-md-4{margin-right:1.5rem !important}.mb-md-4,.my-md-4{margin-bottom:1.5rem !important}.ml-md-4,.mx-md-4{margin-left:1.5rem !important}.m-md-5{margin:3rem !important}.mt-md-5,.my-md-5{margin-top:3rem !important}.mr-md-5,.mx-md-5{margin-right:3rem !important}.mb-md-5,.my-md-5{margin-bottom:3rem !important}.ml-md-5,.mx-md-5{margin-left:3rem !important}.p-md-0{padding:0 !important}.pt-md-0,.py-md-0{padding-top:0 !important}.pr-md-0,.px-md-0{padding-right:0 !important}.pb-md-0,.py-md-0{padding-bottom:0 !important}.pl-md-0,.px-md-0{padding-left:0 !important}.p-md-1{padding:.25rem !important}.pt-md-1,.py-md-1{padding-top:.25rem !important}.pr-md-1,.px-md-1{padding-right:.25rem !important}.pb-md-1,.py-md-1{padding-bottom:.25rem !important}.pl-md-1,.px-md-1{padding-left:.25rem !important}.p-md-2{padding:.5rem !important}.pt-md-2,.py-md-2{padding-top:.5rem !important}.pr-md-2,.px-md-2{padding-right:.5rem !important}.pb-md-2,.py-md-2{padding-bottom:.5rem !important}.pl-md-2,.px-md-2{padding-left:.5rem !important}.p-md-3{padding:1rem !important}.pt-md-3,.py-md-3{padding-top:1rem !important}.pr-md-3,.px-md-3{padding-right:1rem !important}.pb-md-3,.py-md-3{padding-bottom:1rem !important}.pl-md-3,.px-md-3{padding-left:1rem !important}.p-md-4{padding:1.5rem !important}.pt-md-4,.py-md-4{padding-top:1.5rem !important}.pr-md-4,.px-md-4{padding-right:1.5rem !important}.pb-md-4,.py-md-4{padding-bottom:1.5rem !important}.pl-md-4,.px-md-4{padding-left:1.5rem !important}.p-md-5{padding:3rem !important}.pt-md-5,.py-md-5{padding-top:3rem !important}.pr-md-5,.px-md-5{padding-right:3rem !important}.pb-md-5,.py-md-5{padding-bottom:3rem !important}.pl-md-5,.px-md-5{padding-left:3rem !important}.m-md-n1{margin:-.25rem !important}.mt-md-n1,.my-md-n1{margin-top:-.25rem !important}.mr-md-n1,.mx-md-n1{margin-right:-.25rem !important}.mb-md-n1,.my-md-n1{margin-bottom:-.25rem !important}.ml-md-n1,.mx-md-n1{margin-left:-.25rem !important}.m-md-n2{margin:-.5rem !important}.mt-md-n2,.my-md-n2{margin-top:-.5rem !important}.mr-md-n2,.mx-md-n2{margin-right:-.5rem !important}.mb-md-n2,.my-md-n2{margin-bottom:-.5rem !important}.ml-md-n2,.mx-md-n2{margin-left:-.5rem !important}.m-md-n3{margin:-1rem !important}.mt-md-n3,.my-md-n3{margin-top:-1rem !important}.mr-md-n3,.mx-md-n3{margin-right:-1rem !important}.mb-md-n3,.my-md-n3{margin-bottom:-1rem !important}.ml-md-n3,.mx-md-n3{margin-left:-1rem !important}.m-md-n4{margin:-1.5rem !important}.mt-md-n4,.my-md-n4{margin-top:-1.5rem !important}.mr-md-n4,.mx-md-n4{margin-right:-1.5rem !important}.mb-md-n4,.my-md-n4{margin-bottom:-1.5rem !important}.ml-md-n4,.mx-md-n4{margin-left:-1.5rem !important}.m-md-n5{margin:-3rem !important}.mt-md-n5,.my-md-n5{margin-top:-3rem !important}.mr-md-n5,.mx-md-n5{margin-right:-3rem !important}.mb-md-n5,.my-md-n5{margin-bottom:-3rem !important}.ml-md-n5,.mx-md-n5{margin-left:-3rem !important}.m-md-auto{margin:auto !important}.mt-md-auto,.my-md-auto{margin-top:auto !important}.mr-md-auto,.mx-md-auto{margin-right:auto !important}.mb-md-auto,.my-md-auto{margin-bottom:auto !important}.ml-md-auto,.mx-md-auto{margin-left:auto !important}}@media (min-width: 992px){.m-lg-0{margin:0 !important}.mt-lg-0,.my-lg-0{margin-top:0 !important}.mr-lg-0,.mx-lg-0{margin-right:0 !important}.mb-lg-0,.my-lg-0{margin-bottom:0 !important}.ml-lg-0,.mx-lg-0{margin-left:0 !important}.m-lg-1{margin:.25rem !important}.mt-lg-1,.my-lg-1{margin-top:.25rem !important}.mr-lg-1,.mx-lg-1{margin-right:.25rem 
!important}.mb-lg-1,.my-lg-1{margin-bottom:.25rem !important}.ml-lg-1,.mx-lg-1{margin-left:.25rem !important}.m-lg-2{margin:.5rem !important}.mt-lg-2,.my-lg-2{margin-top:.5rem !important}.mr-lg-2,.mx-lg-2{margin-right:.5rem !important}.mb-lg-2,.my-lg-2{margin-bottom:.5rem !important}.ml-lg-2,.mx-lg-2{margin-left:.5rem !important}.m-lg-3{margin:1rem !important}.mt-lg-3,.my-lg-3{margin-top:1rem !important}.mr-lg-3,.mx-lg-3{margin-right:1rem !important}.mb-lg-3,.my-lg-3{margin-bottom:1rem !important}.ml-lg-3,.mx-lg-3{margin-left:1rem !important}.m-lg-4{margin:1.5rem !important}.mt-lg-4,.my-lg-4{margin-top:1.5rem !important}.mr-lg-4,.mx-lg-4{margin-right:1.5rem !important}.mb-lg-4,.my-lg-4{margin-bottom:1.5rem !important}.ml-lg-4,.mx-lg-4{margin-left:1.5rem !important}.m-lg-5{margin:3rem !important}.mt-lg-5,.my-lg-5{margin-top:3rem !important}.mr-lg-5,.mx-lg-5{margin-right:3rem !important}.mb-lg-5,.my-lg-5{margin-bottom:3rem !important}.ml-lg-5,.mx-lg-5{margin-left:3rem !important}.p-lg-0{padding:0 !important}.pt-lg-0,.py-lg-0{padding-top:0 !important}.pr-lg-0,.px-lg-0{padding-right:0 !important}.pb-lg-0,.py-lg-0{padding-bottom:0 !important}.pl-lg-0,.px-lg-0{padding-left:0 !important}.p-lg-1{padding:.25rem !important}.pt-lg-1,.py-lg-1{padding-top:.25rem !important}.pr-lg-1,.px-lg-1{padding-right:.25rem !important}.pb-lg-1,.py-lg-1{padding-bottom:.25rem !important}.pl-lg-1,.px-lg-1{padding-left:.25rem !important}.p-lg-2{padding:.5rem !important}.pt-lg-2,.py-lg-2{padding-top:.5rem !important}.pr-lg-2,.px-lg-2{padding-right:.5rem !important}.pb-lg-2,.py-lg-2{padding-bottom:.5rem !important}.pl-lg-2,.px-lg-2{padding-left:.5rem !important}.p-lg-3{padding:1rem !important}.pt-lg-3,.py-lg-3{padding-top:1rem !important}.pr-lg-3,.px-lg-3{padding-right:1rem !important}.pb-lg-3,.py-lg-3{padding-bottom:1rem !important}.pl-lg-3,.px-lg-3{padding-left:1rem !important}.p-lg-4{padding:1.5rem !important}.pt-lg-4,.py-lg-4{padding-top:1.5rem !important}.pr-lg-4,.px-lg-4{padding-right:1.5rem !important}.pb-lg-4,.py-lg-4{padding-bottom:1.5rem !important}.pl-lg-4,.px-lg-4{padding-left:1.5rem !important}.p-lg-5{padding:3rem !important}.pt-lg-5,.py-lg-5{padding-top:3rem !important}.pr-lg-5,.px-lg-5{padding-right:3rem !important}.pb-lg-5,.py-lg-5{padding-bottom:3rem !important}.pl-lg-5,.px-lg-5{padding-left:3rem !important}.m-lg-n1{margin:-.25rem !important}.mt-lg-n1,.my-lg-n1{margin-top:-.25rem !important}.mr-lg-n1,.mx-lg-n1{margin-right:-.25rem !important}.mb-lg-n1,.my-lg-n1{margin-bottom:-.25rem !important}.ml-lg-n1,.mx-lg-n1{margin-left:-.25rem !important}.m-lg-n2{margin:-.5rem !important}.mt-lg-n2,.my-lg-n2{margin-top:-.5rem !important}.mr-lg-n2,.mx-lg-n2{margin-right:-.5rem !important}.mb-lg-n2,.my-lg-n2{margin-bottom:-.5rem !important}.ml-lg-n2,.mx-lg-n2{margin-left:-.5rem !important}.m-lg-n3{margin:-1rem !important}.mt-lg-n3,.my-lg-n3{margin-top:-1rem !important}.mr-lg-n3,.mx-lg-n3{margin-right:-1rem !important}.mb-lg-n3,.my-lg-n3{margin-bottom:-1rem !important}.ml-lg-n3,.mx-lg-n3{margin-left:-1rem !important}.m-lg-n4{margin:-1.5rem !important}.mt-lg-n4,.my-lg-n4{margin-top:-1.5rem !important}.mr-lg-n4,.mx-lg-n4{margin-right:-1.5rem !important}.mb-lg-n4,.my-lg-n4{margin-bottom:-1.5rem !important}.ml-lg-n4,.mx-lg-n4{margin-left:-1.5rem !important}.m-lg-n5{margin:-3rem !important}.mt-lg-n5,.my-lg-n5{margin-top:-3rem !important}.mr-lg-n5,.mx-lg-n5{margin-right:-3rem !important}.mb-lg-n5,.my-lg-n5{margin-bottom:-3rem !important}.ml-lg-n5,.mx-lg-n5{margin-left:-3rem !important}.m-lg-auto{margin:auto 
!important}.mt-lg-auto,.my-lg-auto{margin-top:auto !important}.mr-lg-auto,.mx-lg-auto{margin-right:auto !important}.mb-lg-auto,.my-lg-auto{margin-bottom:auto !important}.ml-lg-auto,.mx-lg-auto{margin-left:auto !important}}@media (min-width: 1200px){.m-xl-0{margin:0 !important}.mt-xl-0,.my-xl-0{margin-top:0 !important}.mr-xl-0,.mx-xl-0{margin-right:0 !important}.mb-xl-0,.my-xl-0{margin-bottom:0 !important}.ml-xl-0,.mx-xl-0{margin-left:0 !important}.m-xl-1{margin:.25rem !important}.mt-xl-1,.my-xl-1{margin-top:.25rem !important}.mr-xl-1,.mx-xl-1{margin-right:.25rem !important}.mb-xl-1,.my-xl-1{margin-bottom:.25rem !important}.ml-xl-1,.mx-xl-1{margin-left:.25rem !important}.m-xl-2{margin:.5rem !important}.mt-xl-2,.my-xl-2{margin-top:.5rem !important}.mr-xl-2,.mx-xl-2{margin-right:.5rem !important}.mb-xl-2,.my-xl-2{margin-bottom:.5rem !important}.ml-xl-2,.mx-xl-2{margin-left:.5rem !important}.m-xl-3{margin:1rem !important}.mt-xl-3,.my-xl-3{margin-top:1rem !important}.mr-xl-3,.mx-xl-3{margin-right:1rem !important}.mb-xl-3,.my-xl-3{margin-bottom:1rem !important}.ml-xl-3,.mx-xl-3{margin-left:1rem !important}.m-xl-4{margin:1.5rem !important}.mt-xl-4,.my-xl-4{margin-top:1.5rem !important}.mr-xl-4,.mx-xl-4{margin-right:1.5rem !important}.mb-xl-4,.my-xl-4{margin-bottom:1.5rem !important}.ml-xl-4,.mx-xl-4{margin-left:1.5rem !important}.m-xl-5{margin:3rem !important}.mt-xl-5,.my-xl-5{margin-top:3rem !important}.mr-xl-5,.mx-xl-5{margin-right:3rem !important}.mb-xl-5,.my-xl-5{margin-bottom:3rem !important}.ml-xl-5,.mx-xl-5{margin-left:3rem !important}.p-xl-0{padding:0 !important}.pt-xl-0,.py-xl-0{padding-top:0 !important}.pr-xl-0,.px-xl-0{padding-right:0 !important}.pb-xl-0,.py-xl-0{padding-bottom:0 !important}.pl-xl-0,.px-xl-0{padding-left:0 !important}.p-xl-1{padding:.25rem !important}.pt-xl-1,.py-xl-1{padding-top:.25rem !important}.pr-xl-1,.px-xl-1{padding-right:.25rem !important}.pb-xl-1,.py-xl-1{padding-bottom:.25rem !important}.pl-xl-1,.px-xl-1{padding-left:.25rem !important}.p-xl-2{padding:.5rem !important}.pt-xl-2,.py-xl-2{padding-top:.5rem !important}.pr-xl-2,.px-xl-2{padding-right:.5rem !important}.pb-xl-2,.py-xl-2{padding-bottom:.5rem !important}.pl-xl-2,.px-xl-2{padding-left:.5rem !important}.p-xl-3{padding:1rem !important}.pt-xl-3,.py-xl-3{padding-top:1rem !important}.pr-xl-3,.px-xl-3{padding-right:1rem !important}.pb-xl-3,.py-xl-3{padding-bottom:1rem !important}.pl-xl-3,.px-xl-3{padding-left:1rem !important}.p-xl-4{padding:1.5rem !important}.pt-xl-4,.py-xl-4{padding-top:1.5rem !important}.pr-xl-4,.px-xl-4{padding-right:1.5rem !important}.pb-xl-4,.py-xl-4{padding-bottom:1.5rem !important}.pl-xl-4,.px-xl-4{padding-left:1.5rem !important}.p-xl-5{padding:3rem !important}.pt-xl-5,.py-xl-5{padding-top:3rem !important}.pr-xl-5,.px-xl-5{padding-right:3rem !important}.pb-xl-5,.py-xl-5{padding-bottom:3rem !important}.pl-xl-5,.px-xl-5{padding-left:3rem !important}.m-xl-n1{margin:-.25rem !important}.mt-xl-n1,.my-xl-n1{margin-top:-.25rem !important}.mr-xl-n1,.mx-xl-n1{margin-right:-.25rem !important}.mb-xl-n1,.my-xl-n1{margin-bottom:-.25rem !important}.ml-xl-n1,.mx-xl-n1{margin-left:-.25rem !important}.m-xl-n2{margin:-.5rem !important}.mt-xl-n2,.my-xl-n2{margin-top:-.5rem !important}.mr-xl-n2,.mx-xl-n2{margin-right:-.5rem !important}.mb-xl-n2,.my-xl-n2{margin-bottom:-.5rem !important}.ml-xl-n2,.mx-xl-n2{margin-left:-.5rem !important}.m-xl-n3{margin:-1rem !important}.mt-xl-n3,.my-xl-n3{margin-top:-1rem !important}.mr-xl-n3,.mx-xl-n3{margin-right:-1rem !important}.mb-xl-n3,.my-xl-n3{margin-bottom:-1rem 
!important}.ml-xl-n3,.mx-xl-n3{margin-left:-1rem !important}.m-xl-n4{margin:-1.5rem !important}.mt-xl-n4,.my-xl-n4{margin-top:-1.5rem !important}.mr-xl-n4,.mx-xl-n4{margin-right:-1.5rem !important}.mb-xl-n4,.my-xl-n4{margin-bottom:-1.5rem !important}.ml-xl-n4,.mx-xl-n4{margin-left:-1.5rem !important}.m-xl-n5{margin:-3rem !important}.mt-xl-n5,.my-xl-n5{margin-top:-3rem !important}.mr-xl-n5,.mx-xl-n5{margin-right:-3rem !important}.mb-xl-n5,.my-xl-n5{margin-bottom:-3rem !important}.ml-xl-n5,.mx-xl-n5{margin-left:-3rem !important}.m-xl-auto{margin:auto !important}.mt-xl-auto,.my-xl-auto{margin-top:auto !important}.mr-xl-auto,.mx-xl-auto{margin-right:auto !important}.mb-xl-auto,.my-xl-auto{margin-bottom:auto !important}.ml-xl-auto,.mx-xl-auto{margin-left:auto !important}}.text-monospace{font-family:SFMono-Regular,Menlo,Monaco,Consolas,"Liberation Mono","Courier New",monospace !important}.text-justify{text-align:justify !important}.text-wrap{white-space:normal !important}.text-nowrap{white-space:nowrap !important}.text-truncate{overflow:hidden;text-overflow:ellipsis;white-space:nowrap}.text-left{text-align:left !important}.text-right{text-align:right !important}.text-center{text-align:center !important}@media (min-width: 576px){.text-sm-left{text-align:left !important}.text-sm-right{text-align:right !important}.text-sm-center{text-align:center !important}}@media (min-width: 768px){.text-md-left{text-align:left !important}.text-md-right{text-align:right !important}.text-md-center{text-align:center !important}}@media (min-width: 992px){.text-lg-left{text-align:left !important}.text-lg-right{text-align:right !important}.text-lg-center{text-align:center !important}}@media (min-width: 1200px){.text-xl-left{text-align:left !important}.text-xl-right{text-align:right !important}.text-xl-center{text-align:center !important}}.text-lowercase{text-transform:lowercase !important}.text-uppercase{text-transform:uppercase !important}.text-capitalize{text-transform:capitalize !important}.font-weight-light{font-weight:300 !important}.font-weight-lighter{font-weight:lighter !important}.font-weight-normal{font-weight:400 !important}.font-weight-bold{font-weight:700 !important}.font-weight-bolder{font-weight:bolder !important}.font-italic{font-style:italic !important}.text-white{color:#fff !important}.text-primary{color:#007bff !important}a.text-primary:hover,a.text-primary:focus{color:#0056b3 !important}.text-secondary{color:#6c757d !important}a.text-secondary:hover,a.text-secondary:focus{color:#494f54 !important}.text-success{color:#28a745 !important}a.text-success:hover,a.text-success:focus{color:#19692c !important}.text-info{color:#17a2b8 !important}a.text-info:hover,a.text-info:focus{color:#0f6674 !important}.text-warning{color:#ffc107 !important}a.text-warning:hover,a.text-warning:focus{color:#ba8b00 !important}.text-danger{color:#dc3545 !important}a.text-danger:hover,a.text-danger:focus{color:#a71d2a !important}.text-light{color:#f8f9fa !important}a.text-light:hover,a.text-light:focus{color:#cbd3da !important}.text-dark{color:#343a40 !important}a.text-dark:hover,a.text-dark:focus{color:#121416 !important}.text-body{color:#212529 !important}.text-muted{color:#6c757d !important}.text-black-50{color:rgba(0,0,0,0.5) !important}.text-white-50{color:rgba(255,255,255,0.5) !important}.text-hide{font:0/0 a;color:transparent;text-shadow:none;background-color:transparent;border:0}.text-decoration-none{text-decoration:none !important}.text-break{word-break:break-word !important;overflow-wrap:break-word 
!important}.text-reset{color:inherit !important}.visible{visibility:visible !important}.invisible{visibility:hidden !important}@media print{*,*::before,*::after{text-shadow:none !important;box-shadow:none !important}a:not(.btn){text-decoration:underline}abbr[title]::after{content:" (" attr(title) ")"}pre{white-space:pre-wrap !important}pre,blockquote{border:1px solid #adb5bd;page-break-inside:avoid}thead{display:table-header-group}tr,img{page-break-inside:avoid}p,h2,h3{orphans:3;widows:3}h2,h3{page-break-after:avoid}@page{size:a3}body{min-width:992px !important}.container{min-width:992px !important}.navbar{display:none}.badge{border:1px solid #000}.table{border-collapse:collapse !important}.table td,.table th{background-color:#fff !important}.table-bordered th,.table-bordered td{border:1px solid #dee2e6 !important}.table-dark{color:inherit}.table-dark th,.table-dark td,.table-dark thead th,.table-dark tbody+tbody{border-color:#dee2e6}.table .thead-dark th{color:inherit;border-color:#dee2e6}}.highlight table td{padding:5px}.highlight table pre{margin:0}.highlight .cm{color:#999988;font-style:italic}.highlight .cp{color:#999999;font-weight:bold}.highlight .c1{color:#999988;font-style:italic}.highlight .cs{color:#999999;font-weight:bold;font-style:italic}.highlight .c,.highlight .cd{color:#8c8c8c;font-style:italic}.highlight .err{color:#a61717;background-color:#e3d2d2}.highlight .gd{color:#000000;background-color:#ffdddd}.highlight .ge{color:#000000;font-style:italic}.highlight .gr{color:#aa0000}.highlight .gh{color:#999999}.highlight .gi{color:#000000;background-color:#ddffdd}.highlight .go{color:#888888}.highlight .gp{color:#555555}.highlight .gs{font-weight:bold}.highlight .gu{color:#aaaaaa}.highlight .gt{color:#aa0000}.highlight .kc{color:#000000;font-weight:bold}.highlight .kd{color:#000000;font-weight:bold}.highlight .kn{color:#000000;font-weight:bold}.highlight .kp{color:#000000;font-weight:bold}.highlight .kr{color:#000000;font-weight:bold}.highlight .kt{color:#445588;font-weight:bold}.highlight .k,.highlight .kv{color:#000000;font-weight:bold}.highlight .mf{color:#009999}.highlight .mh{color:#009999}.highlight .il{color:#009999}.highlight .mi{color:#009999}.highlight .mo{color:#009999}.highlight .m,.highlight .mb,.highlight .mx{color:#009999}.highlight .sb{color:#d14}.highlight .sc{color:#d14}.highlight .sd{color:#d14}.highlight .s2{color:#d14}.highlight .se{color:#d14}.highlight .sh{color:#d14}.highlight .si{color:#d14}.highlight .sx{color:#d14}.highlight .sr{color:#009926}.highlight .s1{color:#d14}.highlight .ss{color:#990073}.highlight .s{color:#d14}.highlight .na{color:#008080}.highlight .bp{color:#999999}.highlight .nb{color:#0086B3}.highlight .nc{color:#445588;font-weight:bold}.highlight .no{color:#008080}.highlight .nd{color:#3c5d5d;font-weight:bold}.highlight .ni{color:#800080}.highlight .ne{color:#990000;font-weight:bold}.highlight .nf{color:#990000;font-weight:bold}.highlight .nl{color:#990000;font-weight:bold}.highlight .nn{color:#555555}.highlight .nt{color:#000080}.highlight .vc{color:#008080}.highlight .vg{color:#008080}.highlight .vi{color:#008080}.highlight .nv{color:#008080}.highlight .ow{color:#000000;font-weight:bold}.highlight .o{color:#000000;font-weight:bold}.highlight .w{color:#bbbbbb}.highlight{background-color:#f8f8f8}.container{padding-left:30px;padding-right:30px;max-width:1240px}.container-fluid{padding-left:0;padding-right:0}@font-face{font-family:FreightSans;font-weight:700;font-style:normal;src:url("/assets/fonts/FreightSans/freight-sans-bold.woff2") 
format("woff2"),url("/assets/fonts/FreightSans/freight-sans-bold.woff") format("woff")}@font-face{font-family:FreightSans;font-weight:700;font-style:italic;src:url("/assets/fonts/FreightSans/freight-sans-bold-italic.woff2") format("woff2"),url("/assets/fonts/FreightSans/freight-sans-bold-italic.woff") format("woff")}@font-face{font-family:FreightSans;font-weight:500;font-style:normal;src:url("/assets/fonts/FreightSans/freight-sans-medium.woff2") format("woff2"),url("/assets/fonts/FreightSans/freight-sans-medium.woff") format("woff")}@font-face{font-family:FreightSans;font-weight:500;font-style:italic;src:url("/assets/fonts/FreightSans/freight-sans-medium-italic.woff2") format("woff2"),url("/assets/fonts/FreightSans/freight-sans-medium-italic.woff") format("woff")}@font-face{font-family:FreightSans;font-weight:100;font-style:normal;src:url("/assets/fonts/FreightSans/freight-sans-light.woff2") format("woff2"),url("/assets/fonts/FreightSans/freight-sans-light.woff") format("woff")}@font-face{font-family:FreightSans;font-weight:100;font-style:italic;src:url("/assets/fonts/FreightSans/freight-sans-light-italic.woff2") format("woff2"),url("/assets/fonts/FreightSans/freight-sans-light-italic.woff") format("woff")}@font-face{font-family:FreightSans;font-weight:400;font-style:italic;src:url("/assets/fonts/FreightSans/freight-sans-book-italic.woff2") format("woff2"),url("/assets/fonts/FreightSans/freight-sans-book-italic.woff") format("woff")}@font-face{font-family:FreightSans;font-weight:400;font-style:normal;src:url("/assets/fonts/FreightSans/freight-sans-book.woff2") format("woff2"),url("/assets/fonts/FreightSans/freight-sans-book.woff") format("woff")}@font-face{font-family:IBMPlexMono;font-weight:600;font-style:normal;unicode-range:u+0020-007f;src:local("IBMPlexMono-SemiBold"),url("/assets/fonts/IBMPlexMono/IBMPlexMono-SemiBold.woff2") format("woff2"),url("/assets/fonts/IBMPlexMono/IBMPlexMono-SemiBold.woff") format("woff")}@font-face{font-family:IBMPlexMono;font-weight:500;font-style:normal;unicode-range:u+0020-007f;src:local("IBMPlexMono-Medium"),url("/assets/fonts/IBMPlexMono/IBMPlexMono-Medium.woff2") format("woff2"),url("/assets/fonts/IBMPlexMono/IBMPlexMono-Medium.woff") format("woff")}@font-face{font-family:IBMPlexMono;font-weight:400;font-style:normal;unicode-range:u+0020-007f;src:local("IBMPlexMono-Regular"),url("/assets/fonts/IBMPlexMono/IBMPlexMono-Regular.woff2") format("woff2"),url("/assets/fonts/IBMPlexMono/IBMPlexMono-Regular.woff") format("woff")}@font-face{font-family:IBMPlexMono;font-weight:300;font-style:normal;unicode-range:u+0020-007f;src:local("IBMPlexMono-Light"),url("/assets/fonts/IBMPlexMono/IBMPlexMono-Light.woff2") format("woff2"),url("/assets/fonts/IBMPlexMono/IBMPlexMono-Light.woff") format("woff")}*{font-family:FreightSans, Helvetica Neue, Helvetica, Arial, sans-serif;font-weight:400}h1,h2,h3,h4,h5,h6{font-family:FreightSans}p{margin-bottom:1.25rem}a,em,i,b,strong,u,span{font-size:inherit}a:link,a:visited,a:hover{text-decoration:none;color:#ee4c2c}p a:link,p a:visited,p a:hover{color:#ee4c2c;text-decoration:none}@media screen and (min-width: 768px){p a:hover{text-decoration:underline}p 
a.social-icon:hover{text-decoration:none}}.btn,a.btn{border-radius:0;border:none;background-color:#f3f4f7;color:#6c6c6d;font-weight:400;position:relative;letter-spacing:0.25px}.btn.btn-lg,.btn-group-lg>.btn,a.btn.btn-lg,.btn-group-lg>a.btn{font-size:1.125rem;padding-top:.5rem}.btn.btn-white,a.btn.btn-white{background-color:#fff}.btn.btn-orange,a.btn.btn-orange{background-color:#ee4c2c}.btn.btn-demo,a.btn.btn-demo{color:#fff}@media screen and (min-width: 768px){.btn:after,a.btn:after{content:"";display:block;width:0;height:1px;position:absolute;bottom:0;left:0;background-color:#ee4c2c;transition:width .250s ease-in-out}.btn:hover:after,a.btn:hover:after{width:100%}.btn:hover,a.btn:hover{color:#262626}}.navbar{padding-left:0;padding-right:0}html{position:relative;min-height:100%;font-size:12px}@media screen and (min-width: 768px){html{font-size:16px}}@media screen and (min-width: 768px){body{margin:0 0 620px}}body.no-scroll{height:100%;overflow:hidden}a.with-right-arrow,.btn.with-right-arrow{padding-right:2rem;position:relative;background-image:url("/assets/images/chevron-right-orange.svg");background-size:6px 13px;background-position:top 10px right 11px;background-repeat:no-repeat}@media screen and (min-width: 768px){a.with-right-arrow,.btn.with-right-arrow{background-size:8px 14px;background-position:top 15px right 12px;padding-right:2rem}}a.with-left-arrow,.btn.with-left-arrow{padding-left:2rem;position:relative;background-image:url("/assets/images/chevron-left-grey.svg");background-size:6px 13px;background-position:top 10px left 11px;background-repeat:no-repeat}@media screen and (min-width: 768px){a.with-left-arrow,.btn.with-left-arrow{background-size:8px 14px;background-position:top 16px left 12px;padding-left:2rem}}.main-background{position:absolute;top:0;left:0;width:100%;height:350px;background-size:100% 100%;background-repeat:no-repeat;background-image:url("/assets/images/pytorch_bg_purple.jpg")}@media screen and (min-width: 768px){.main-background{height:640px}}.main-background.home-page-background{z-index:-1;height:350px}@media screen and (min-width: 768px){.main-background.home-page-background{height:570px}}.main-background.hub-background{height:380px}@media screen and (min-width: 768px){.main-background.hub-background{height:495px}}@media screen and (min-width: 768px){.main-background.ecosystem-background{height:472px}}@media screen and (min-width: 768px){.main-background.events-background{height:472px}}@media screen and (min-width: 768px){.main-background.ecosystem-join-background{height:435px}}.main-background.resources-background{height:380px}@media screen and (min-width: 768px){.main-background.resources-background{height:472px}}.main-background.get-started-background{height:275px}@media screen and (min-width: 768px){.main-background.get-started-background{height:380px}}.main-background.comm-stories-background{height:275px}@media screen and (min-width: 768px){.main-background.comm-stories-background{height:380px}}.main-background.features-background{height:335px}@media screen and (min-width: 768px){.main-background.features-background{height:300px}}.bg-light-grey{background-color:#f3f4f7}.text-dark-grey{color:#6c6c6d}.sidebar-links .top-section{color:#000}.sidebar-links ul{list-style-type:none;padding-left:0}.sidebar-links ul li{color:#6c6c6d;margin-left:20px}.sidebar-links ul li a{color:inherit}.sidebar-links .with-sub-sections.top-section:before{content:"+ ";font-family:"Courier New", Courier, monospace;width:50px}.sidebar-links 
.with-sub-sections.top-section.open:before{content:"- ";font-family:"Courier New", Courier, monospace;width:50px}.bg-very-light-grey{background-color:#f3f4f7}.email-subscribe-form input.email{color:#ee4c2c;border:none;border-bottom:1px solid #939393;width:100%;background-color:transparent;outline:none;font-size:1.125rem;letter-spacing:0.25px;line-height:2.25rem}.email-subscribe-form ::-webkit-input-placeholder{color:#ee4c2c}.email-subscribe-form ::-moz-placeholder{color:#ee4c2c}.email-subscribe-form :-ms-input-placeholder{color:#ee4c2c}.email-subscribe-form :-moz-placeholder{color:#ee4c2c}.email-subscribe-form input[type="submit"]{position:absolute;right:0;top:10px;height:15px;width:15px;background-image:url("/assets/images/arrow-right-with-tail.svg");background-color:transparent;background-repeat:no-repeat;background-size:15px 15px;background-position:center center;-webkit-appearance:none;-moz-appearance:none;appearance:none;border:0}.email-subscribe-form-fields-wrapper{position:relative}.bg-slate{background-color:#262626}.tweets-wrapper{width:100%}.tweets-wrapper p{font-size:1rem;line-height:1.5rem;letter-spacing:0.22px}.tweets-wrapper ol{padding-left:0}.tweets-wrapper a{color:#ee4c2c}.tweets-wrapper img,.tweets-wrapper .timeline-Tweet-actions,.tweets-wrapper .timeline-Tweet-media,.tweets-wrapper .MediaCard{display:none !important}.tweet{margin-bottom:2.2rem;word-wrap:break-word}.tweet a{color:#ee4c2c;display:inline}.tweet a span{color:inherit}.tweet p,.tweet span{font-size:1rem;line-height:1.5rem;letter-spacing:0.22px;color:#A0A0A1}@media screen and (min-width: 1240px){.tweet p{padding-right:40px}}.tweet span.retweeted,.tweet span.in-reply-to{font-size:.8125rem}.tweet p.tweet-header{margin-bottom:.3125rem;line-height:.75rem}.tweet .tweet-bird:before{content:"";position:relative;left:0;background-image:url("/assets/images/logo-twitter-grey.svg");background-size:20px 16px;display:inline-block;width:20px;height:16px}@media screen and (min-width: 768px){.tweet .tweet-bird:before{margin-bottom:.625rem}}.anchorjs-link{color:#6c6c6d !important}@media screen and (min-width: 768px){.anchorjs-link:hover{color:inherit;text-decoration:none !important}}.article-page-module{background-color:#f3f4f7;padding-top:1.875rem;padding-bottom:1.875rem}@media screen and (min-width: 768px){.article-page-module{padding-top:3.75rem;padding-bottom:3.75rem}}@media screen and (min-width: 1240px){.article-page-module .col-md-3{padding-left:20px;padding-right:20px}}.article-page-module .module-link-col .btn{padding-left:0}@media screen and (min-width: 768px){.article-page-module .module-link-col{text-align:right}.article-page-module .module-link-col .btn{padding-left:inherit}}.article-page-module .module-content-wrapper{margin-top:1.25rem;margin-bottom:1.25rem}@media screen and (min-width: 768px){.article-page-module .module-content-wrapper{margin-top:0;margin-bottom:0}}.article-page-module img{margin-bottom:1.875rem;width:100%}.article-page-module h3{font-size:1.5rem;letter-spacing:1.33px;line-height:2rem;text-transform:uppercase;margin-bottom:1.25rem}@media screen and (min-width: 768px){.article-page-module h3{margin-bottom:3.75rem}}.article-page-module h5,.article-page-module p{font-size:1rem;line-height:1.5rem}.article-page-module h5{color:#262626}.article-page-module p{color:#CCCDD1;letter-spacing:0.25px}.article-page-module .module-header{position:relative}.article-page-module .module-button{padding-left:0}@media screen and (min-width: 768px){.article-page-module 
.module-button{position:absolute;right:15px;top:0;padding-top:0;padding-bottom:.125rem;background-position:center right;padding-right:16px}}article.pytorch-article .note-card{border-radius:0;border:none;background-color:#ee4c2c;color:white;padding:30px;margin-bottom:50px}article.pytorch-article .note-card h4{font-size:1.5rem;letter-spacing:1.33px;line-height:2rem;text-transform:uppercase;color:white;margin-top:0;margin-bottom:1.125rem}article.pytorch-article .note-card p{font-size:1.125rem;line-height:1.5em;margin-bottom:0;color:white}article.pytorch-article .note-card p a{color:white;font-weight:700}.ecosystem-card,.resource-card,.hub-card{border-radius:0;border:none;height:110px;margin-bottom:1.25rem;margin-bottom:1.875rem;overflow:scroll}@media screen and (min-width: 1240px){.ecosystem-card,.resource-card,.hub-card{height:150px;overflow:inherit}}@media (min-width: 768px) and (max-width: 1239px){.ecosystem-card,.resource-card,.hub-card{height:170px;overflow:inherit}}.ecosystem-card p.card-summary,.resource-card p.card-summary,.hub-card p.card-summary{font-size:1.125rem;line-height:1.5rem;margin-bottom:0;color:#6c6c6d}.ecosystem-card h4,.resource-card h4,.hub-card h4{color:#262626;margin-bottom:1.125rem;overflow:hidden;white-space:nowrap;text-overflow:ellipsis}.ecosystem-card a,.resource-card a,.hub-card a{height:100%}@media screen and (min-width: 768px){.ecosystem-card a,.resource-card a,.hub-card a{min-height:190px}}@media (min-width: 768px) and (max-width: 1239px){.ecosystem-card a,.resource-card a,.hub-card a{min-height:234px}}@media screen and (min-width: 768px){.ecosystem-card:after,.resource-card:after,.hub-card:after{content:"";display:block;width:0;height:1px;position:absolute;bottom:0;left:0;background-color:#ee4c2c;transition:width .250s ease-in-out}.ecosystem-card:hover:after,.resource-card:hover:after,.hub-card:hover:after{width:100%}.ecosystem-card:hover,.resource-card:hover,.hub-card:hover{color:#262626}}.ecosystem-card:hover p.card-summary,.resource-card:hover p.card-summary,.hub-card:hover p.card-summary{color:#262626}.ecosystem-card .card-body{background-position:top 1.25rem right 1.25rem;background-repeat:no-repeat;padding:1.5625rem 1.875rem}.ecosystem-card .card-body.reasoning{background-image:url("/assets/images/logo-elf.svg");background-size:29px 25px}.ecosystem-card .card-body.tool{background-image:url("/assets/images/logo-wav2letter.svg");background-size:29px 25px}.ecosystem-card .card-body.language{background-image:url("/assets/images/logo-parlai.svg");background-size:29px 25px}.ecosystem-card .card-body.vision{background-image:url("/assets/images/logo-detectron.svg");background-size:29px 25px}.resource-card{border:1px solid #d6d7d8;background-color:transparent;margin-bottom:1.25rem}@media screen and (min-width: 768px){.resource-card{margin-bottom:0}}@media (min-width: 768px) and (max-width: 1239px){.resource-card{height:225px}}.resource-card .pytorch-image{position:relative;height:1.25rem;width:1.25rem;top:3.125rem}.resource-card a{letter-spacing:0.25px;color:#262626}.resource-card .card-body{display:block;padding:0 15px 0 0;position:relative;top:20px;margin-left:60px}@media (min-width: 768px) and (max-width: 1239px){.resource-card .card-body{top:18px}}@media screen and (min-width: 1240px){.resource-card .card-body{top:30px;margin-left:80px;padding-right:30px}}.resource-card.slack:before,.resource-card.github:before,.resource-card.pytorch-resource:before{content:"";background-size:32px 
32px;background-repeat:no-repeat;display:block;position:absolute;height:32px;width:32px;top:15px;left:15px}@media screen and (min-width: 1240px){.resource-card.slack:before,.resource-card.github:before,.resource-card.pytorch-resource:before{left:30px;top:30px}}.resource-card.slack:before{background-image:url("/assets/images/logo-slack.svg")}.resource-card.github:before{background-image:url("/assets/images/logo-github.svg")}.resource-card.pytorch-resource:before{background-image:url("/assets/images/logo-icon.svg")}.resource-card .pytorch-discuss .discuss{color:#ee4c2c;font-weight:400}@media screen and (min-width: 768px){.resource-card:after{content:"";display:block;width:0;height:1px;position:absolute;bottom:0;left:0;background-color:#ee4c2c;transition:width .250s ease-in-out}.resource-card:hover:after{width:100%}.resource-card:hover{color:#262626}}.article-page-module.similar-projects .ecosystem-card p.card-summary{font-size:1rem;height:36px}@media screen and (min-width: 768px){.article-page-module.similar-projects .ecosystem-card p.card-summary{height:50px}}#twitter-widget iframe{display:none !important}body.general .main-content-wrapper{margin-top:80px}@media screen and (min-width: 768px){body.general .main-content-wrapper{margin-top:100px}}.domain-card{background-color:#f3f4f7;padding:40px 20px;margin:20px 0}.domain-card h4{color:#000}.domain-card p{color:#6c6c6d;margin-bottom:0}.domain-card:hover h4{color:#ee4c2c}code,kbd,pre,samp,code b{font-family:IBMPlexMono,SFMono-Regular,Menlo,Monaco,Consolas,"Liberation Mono","Courier New",monospace}code span,kbd span,pre span,samp span,code b span{font-family:IBMPlexMono,SFMono-Regular,Menlo,Monaco,Consolas,"Liberation Mono","Courier New",monospace}pre{padding:1.125rem;background-color:#f3f4f7}pre code{font-size:.875rem}pre.highlight{background-color:#f3f4f7;line-height:1.3125rem}code.highlighter-rouge{color:#6c6c6d;background-color:#f3f4f7;padding:2px 6px}a:link code.highlighter-rouge,a:visited code.highlighter-rouge,a:hover code.highlighter-rouge{color:#4974D1}a:link.has-code,a:visited.has-code,a:hover.has-code{color:#4974D1}p code,h1 code,h2 code,h3 code,h4 code,h5 code,h6 code{font-size:78.5%}.header-holder{height:68px;align-items:center;display:flex;left:0;margin-left:auto;margin-right:auto;position:fixed;right:0;top:0;width:100%;z-index:9999}@media screen and (min-width: 1200px){.header-holder{height:70px}}@media screen and (min-width: 1200px){.header-holder{top:32px}}.header-holder.blog-header,.header-holder.blog-detail-header,.header-holder.resources-header,.header-holder.get-started-header,.header-holder.features-header,.header-holder.comm-stories-header,.header-holder.ecosystem-header,.header-holder.announcement-header,.header-holder.hub-header,.header-holder.mobile-header{background-color:#fff;border-bottom:1px solid #e2e2e2}.hello-bar{display:none}@media screen and (min-width: 1200px){.hello-bar{background-color:#CC2F90;color:#fff;display:flex;letter-spacing:.34px;justify-content:center;padding:4px 0;position:fixed;top:0;text-align:center;z-index:9999;margin-left:auto;margin-right:auto;width:100%}.hello-bar a{color:#fff;text-decoration:underline}}.header-container{position:relative;display:flex;align-items:center}.header-container:before,.header-container:after{content:"";display:table}.header-container:after{clear:both}.header-container{*zoom:1}@media screen and (min-width: 
1200px){.header-container{display:block}}.header-logo{height:23px;width:93px;background-image:url("/assets/images/logo.svg");background-repeat:no-repeat;background-size:93px 23px;display:block;float:left}@media screen and (min-width: 1200px){.header-logo{background-size:108px 27px;position:absolute;height:27px;width:108px;top:4px;float:none}}.main-menu-open-button{background-image:url("/assets/images/icon-menu-dots.svg");background-position:center center;background-size:25px 7px;background-repeat:no-repeat;width:25px;height:7px;position:absolute;right:0;top:4px}@media screen and (min-width: 1200px){.main-menu-open-button{display:none}}.header-holder .main-menu{display:none}@media screen and (min-width: 1200px){.header-holder .main-menu{display:flex;align-items:center;justify-content:flex-end}}.header-holder .main-menu ul{display:flex;align-items:center;margin:0}.header-holder .main-menu ul li{display:inline-block;margin-right:34px;position:relative}.header-holder .main-menu ul li.active:after{content:"•";bottom:-24px;color:#ee4c2c;font-size:1.375rem;left:0;position:absolute;right:0;text-align:center}.header-holder .main-menu ul li.active a{color:#ee4c2c}.header-holder .main-menu ul li.active .with-down-arrow{background-image:url("/assets/images/chevron-down-orange.svg")}.header-holder .main-menu ul li.resources-active:after{left:-27px}.header-holder .main-menu ul li:last-of-type{margin-right:0}.header-holder .main-menu ul li a{color:#fff;font-size:1.2rem;letter-spacing:0;line-height:2.125rem;text-align:center;text-decoration:none;padding-bottom:10px}@media screen and (min-width: 1200px){.header-holder .main-menu ul li a:hover{color:#ffffff;border-bottom:2px solid #ffffff}}.header-holder .main-menu ul li a.with-down-arrow{cursor:default;padding-right:2rem;position:relative;background-image:url("/assets/images/chevron-down-white.svg");background-size:14px 18px;background-position:top 7px right 10px;background-repeat:no-repeat;padding-bottom:20px}.header-holder .main-menu ul li a.with-down-arrow:hover{border-bottom:none}.header-holder .main-menu ul li a.with-down-arrow .dropdown-menu{border-radius:0;padding:0}.header-holder .main-menu ul li a.with-down-arrow .dropdown-menu .dropdown-item{color:#6c6c6d;border-bottom:1px solid #e2e2e2}.header-holder .main-menu ul li a.with-down-arrow .dropdown-menu .dropdown-item:last-of-type{border-bottom-color:transparent}.header-holder .main-menu ul li a.with-down-arrow .dropdown-menu .dropdown-item:hover{background-color:#ee4c2c}.header-holder .main-menu ul li a.with-down-arrow .dropdown-menu .dropdown-item p{font-size:1rem;color:#757575}.header-holder .main-menu ul li a.with-down-arrow .dropdown-menu a.dropdown-item:hover{color:#fff}.header-holder .main-menu ul li a.with-down-arrow .dropdown-menu a.dropdown-item:hover p{color:#fff}.mobile-main-menu{display:none}.mobile-main-menu.open{background-color:#262626;display:block;height:100%;left:0;margin-left:auto;margin-right:auto;min-height:100%;position:fixed;right:0;top:0;width:100%;z-index:99999}.mobile-main-menu .container-fluid{background-color:inherit;align-items:center;display:flex;height:68px;position:relative;z-index:1}.mobile-main-menu .container-fluid:before,.mobile-main-menu .container-fluid:after{content:"";display:table}.mobile-main-menu .container-fluid:after{clear:both}.mobile-main-menu .container-fluid{*zoom:1}.mobile-main-menu.open ul{list-style-type:none;padding:0}.mobile-main-menu.open ul li a,.mobile-main-menu.open 
.resources-mobile-menu-title{font-size:2rem;color:#fff;letter-spacing:0;line-height:4rem}.mobile-main-menu.open ul li.active a{color:#ee4c2c}.main-menu-close-button{background-image:url("/assets/images/icon-close.svg");background-position:center center;background-repeat:no-repeat;background-size:24px 24px;height:24px;position:absolute;right:0;width:24px;top:-4px}.mobile-main-menu-header-container{position:relative}.mobile-main-menu-links-container{display:flex;padding-left:2.8125rem;height:100%;min-height:100%;margin-top:20px;overflow-y:scroll}@media only screen and (max-width: 320px){.mobile-main-menu-links-container .main-menu{padding-top:5rem}}@media only screen and (max-width: 320px){.mobile-main-menu-links-container .navSearchWrapper{width:75%}}#topnav-gh-icon{background-image:url(/assets/social/github-white.svg);color:white;width:33px;height:33px;background-size:23px 23px;background-repeat:no-repeat;background-position:5px 4px;border-radius:25px}#topnav-gh-icon:hover{background-color:#88888833}.blog-header .header-logo,.blog-detail-header .header-logo,.resources-header .header-logo,.get-started-header .header-logo,.features-header .header-logo,.ecosystem-header .header-logo,.announcement-header .header-logo,.comm-stories-header .header-logo,.hub-header .header-logo,.mobile-header .header-logo{background-image:url("/assets/images/logo-dark.svg")}.blog-header .main-menu ul li a,.blog-detail-header .main-menu ul li a,.resources-header .main-menu ul li a,.get-started-header .main-menu ul li a,.features-header .main-menu ul li a,.ecosystem-header .main-menu ul li a,.announcement-header .main-menu ul li a,.comm-stories-header .main-menu ul li a,.hub-header .main-menu ul li a,.mobile-header .main-menu ul li a{color:#262626}@media screen and (min-width: 1200px){.blog-header .main-menu ul li a:hover,.blog-detail-header .main-menu ul li a:hover,.resources-header .main-menu ul li a:hover,.get-started-header .main-menu ul li a:hover,.features-header .main-menu ul li a:hover,.ecosystem-header .main-menu ul li a:hover,.announcement-header .main-menu ul li a:hover,.comm-stories-header .main-menu ul li a:hover,.hub-header .main-menu ul li a:hover,.mobile-header .main-menu ul li a:hover{color:#262626;border-bottom:2px solid #262626}}.blog-header .main-menu ul li a.with-down-arrow,.blog-detail-header .main-menu ul li a.with-down-arrow,.resources-header .main-menu ul li a.with-down-arrow,.get-started-header .main-menu ul li a.with-down-arrow,.features-header .main-menu ul li a.with-down-arrow,.ecosystem-header .main-menu ul li a.with-down-arrow,.announcement-header .main-menu ul li a.with-down-arrow,.comm-stories-header .main-menu ul li a.with-down-arrow,.hub-header .main-menu ul li a.with-down-arrow,.mobile-header .main-menu ul li a.with-down-arrow{background-image:url("/assets/images/chevron-down-black.svg")}.blog-header .main-menu-open-button,.blog-detail-header .main-menu-open-button,.resources-header .main-menu-open-button,.get-started-header .main-menu-open-button,.features-header .main-menu-open-button,.ecosystem-header .main-menu-open-button,.announcement-header .main-menu-open-button,.comm-stories-header .main-menu-open-button,.hub-header .main-menu-open-button,.mobile-header .main-menu-open-button{background-image:url("/assets/images/icon-menu-dots-dark.svg")}.blog-header #topnav-gh-icon,.blog-detail-header #topnav-gh-icon,.resources-header #topnav-gh-icon,.get-started-header #topnav-gh-icon,.features-header #topnav-gh-icon,.ecosystem-header #topnav-gh-icon,.announcement-header 
#topnav-gh-icon,.comm-stories-header #topnav-gh-icon,.hub-header #topnav-gh-icon,.mobile-header #topnav-gh-icon{background-image:url(/assets/social/github-black.svg)}.ecosystem-dropdown-menu,.resources-dropdown-menu{left:-25px;width:300px;display:none;position:absolute;z-index:1000;display:none;top:45px;float:left;min-width:10rem;padding:0.5rem 0;font-size:1rem;color:#212529;text-align:left;list-style:none;background-color:#fff;background-clip:padding-box;border:1px solid rgba(0,0,0,0.15);border-radius:0.25rem}.ecosystem-dropdown:hover .ecosystem-dropdown-menu,.ecosystem-dropdown:hover .resources-dropdown-menu,.resources-dropdown:hover .ecosystem-dropdown-menu,.resources-dropdown:hover .resources-dropdown-menu,.resources-active:hover .ecosystem-dropdown-menu,.resources-active:hover .resources-dropdown-menu{display:block}.main-menu ul li .ecosystem-dropdown-menu,.main-menu ul li .resources-dropdown-menu{border-radius:0;padding:0}.main-menu ul li .ecosystem-dropdown-menu .dropdown-item,.main-menu ul li .resources-dropdown-menu .dropdown-item{color:#6c6c6d;border-bottom:1px solid #e2e2e2}.header-holder .main-menu ul li a.nav-dropdown-item{display:block;font-size:1rem;line-height:1.3125rem;width:100%;padding:0.25rem 1.5rem;clear:both;font-weight:400;color:#757575;text-align:left;background-color:transparent;border-bottom:1px solid #e2e2e2}.header-holder .main-menu ul li a.nav-dropdown-item p{margin-bottom:.5rem}.header-holder .main-menu ul li a.nav-dropdown-item:last-of-type{border-bottom-color:transparent}.header-holder .main-menu ul li a.nav-dropdown-item:hover{background-color:#ee4c2c;color:white}.header-holder .main-menu ul li a.nav-dropdown-item .dropdown-title{font-size:1.125rem;color:#212529;letter-spacing:0;line-height:34px}.header-holder .main-menu ul li a.nav-dropdown-item .docs-title{display:block;padding-top:0.5rem}.header-holder .main-menu ul li a.nav-dropdown-item:hover .dropdown-title{background-color:#ee4c2c;color:white}.mobile-main-menu-links-container ul.resources-mobile-menu-items li{padding-left:15px}.mobile-main-menu-links-container ul.resources-mobile-menu-items li a{font-size:1.5rem;line-height:3rem}.jumbotron{background-color:transparent;position:absolute;left:0;right:0;margin-right:auto;margin-left:auto;padding:0;margin-bottom:0;display:flex;align-items:center;top:68px}@media screen and (min-width: 768px){.jumbotron{height:550px;top:90px}}.jumbotron .jumbotron-content{display:flex;align-items:center}.jumbotron .lead{font-weight:400;letter-spacing:0.25px;font-size:20px;line-height:1.2}@media screen and (min-width: 768px){.jumbotron .lead{font-size:29px}}.jumbotron h1{font-size:2rem;text-transform:uppercase;font-weight:lighter;letter-spacing:1.08px;margin-bottom:.625rem;line-height:1.05;margin-top:4rem}@media screen and (min-width: 768px){.jumbotron h1{font-size:3.875rem;margin-top:0}}.jumbotron h1 img{margin-bottom:1rem}.jumbotron p{font-size:1.125rem;margin-bottom:1.25rem}@media screen and (min-width: 1200px){.jumbotron p{width:50%}}.jumbotron.on-dark-background h1,.jumbotron.on-dark-background p{color:#fff}.jumbotron .btn{padding-top:.5625rem}@media screen and (min-width: 768px){.jumbotron .btn{margin-top:.625rem}}.homepage .main-content-wrapper{margin-top:315px}@media screen and (min-width: 768px){.homepage .main-content-wrapper{margin-top:472px}}.homepage h2{margin-bottom:1.5625rem;text-transform:uppercase;letter-spacing:1.78px;line-height:2.5rem}@media screen and (min-width: 768px){.homepage h2{margin-bottom:2.0625rem}}.homepage 
h3{font-size:1.5rem;letter-spacing:1.33px;line-height:2rem;text-transform:uppercase;margin-bottom:1.25rem}.homepage h5{margin-bottom:.5rem}@media screen and (min-width: 768px){.homepage h5{margin-bottom:.9375rem}}.homepage .jumbotron{height:195px}@media screen and (min-width: 768px){.homepage .jumbotron{height:395px}}.homepage .jumbotron .btn{margin-top:.375rem}.homepage .ecosystem-row .card{background-color:#f3f4f7}.homepage .homepage-header{background-color:rgba(0,0,0,0.165)}.homepage-feature-module{padding-top:2.5rem;padding-bottom:2.5rem}@media screen and (min-width: 768px){.homepage-feature-module{padding-top:3.875rem;padding-bottom:4.5rem}.homepage-feature-module .module-button{position:absolute;right:15px;top:0}}.homepage-feature-module p{color:#6c6c6d;font-size:1.125em}.homepage-feature-module .title{color:#000;font-weight:300;font-size:1.5rem}@media (min-width: 768px) and (max-width: 1239px){.homepage-feature-module .title{font-size:1.25rem}}.homepage-feature-module .pytorch-title{font-size:1.5rem;letter-spacing:0.33px;line-height:2.25rem}.homepage-feature-module .subtext{font-size:1.125rem;color:#8c8c8c;letter-spacing:0;line-height:1.5rem}@media (min-width: 768px) and (max-width: 1239px){.homepage-feature-module .subtext{font-size:.9375rem}}.key-features-module{padding-bottom:0}@media screen and (min-width: 768px){.key-features-module{padding-bottom:1.55rem}}.key-features-module .key-features-boxes{margin-top:2rem}@media screen and (min-width: 768px){.key-features-module .key-features-boxes{margin-top:0}}.key-features-module .key-feature-box{margin-bottom:2rem}.key-features-module .key-feature-box p{margin-bottom:0;letter-spacing:0.25px}@media screen and (min-width: 768px){.key-features-module .key-feature-box{margin-bottom:2.5rem}}.community-heading{margin-top:2rem}.community-module{background-color:#fff}.community-module .ecosystem-card{height:auto}@media (min-width: 768px) and (max-width: 1239px){.community-module .ecosystem-card{padding:.625rem}}.community-module h2{margin-bottom:0}.community-module h5{text-transform:uppercase;color:#c6000a;margin-bottom:1.25rem}.community-module .h2-subheadline{margin-top:1.25rem;margin-bottom:2.6rem}@media screen and (min-width: 768px){.community-module .h2-subheadline{margin-top:0}}@media (min-width: 768px) and (max-width: 1239px){.community-module .card-body{padding:.625rem}}.community-module .module-button{background-color:#f3f4f7}.community-module p{margin-bottom:2.5rem;letter-spacing:0.25px}.community-module .module-subtext{margin-right:15.625rem}.community-module .email-subscribe-form input.email{border-bottom:1px solid #d6d7d8;font-size:1.25rem;line-height:0;padding-bottom:.75rem}.community-module .email-subscribe-form input[type="submit"]{top:6px}@media screen and (min-width: 768px){.community-module .email-subscribe-form input[type="submit"]{top:10px}}.pytorch-users-module,.homepage-bottom-wrapper{background-color:#f3f4f7}@media screen and (min-width: 768px){.pytorch-users-module{padding-bottom:1.9rem}}.community-avatar{height:60px;width:60px}.community-logo-bottom{height:200px;background-color:#f3f4f7}.university-testimonials h2{margin-bottom:2.2rem}.university-testimonials-content{margin-top:2.5rem;margin-bottom:2rem}@media screen and (min-width: 768px){.university-testimonials-content{margin-top:0}}.university-testimonials-content .col-md-4{margin-bottom:2.5rem}.university-testimonials-content .case-study-title{font-size:1.5rem;margin-bottom:1.25rem}.university-testimonials-content 
p{color:#6c6c6d;font-size:1.125rem;letter-spacing:0.25px}.university-testimonials-content .btn{background-color:#fff}.follow-us-on-twitter h2{margin-bottom:1.25rem}@media screen and (min-width: 768px){.follow-us-on-twitter h2{margin-bottom:2.5rem}}.homepage-feature-module .tweets-wrapper p{font-size:1rem}.quick-starts p{font-size:1.125rem;line-height:1.75rem}.quick-start-guides{font-size:1.5rem;letter-spacing:0.25px;line-height:2.25rem;color:#a5a5a5}.quick-start-guides .step-counter{margin-bottom:.1875rem}.quick-start-guides ul{list-style-type:none;padding-left:0}.quick-start-guides ul li{margin-bottom:0;font-size:1.125rem}@media screen and (min-width: 768px){.quick-start-guides ul li{margin-bottom:.75rem}.quick-start-guides ul li:last-of-type{margin-bottom:0}}.quick-start-guides ul li.selected{color:#ee4c2c}.quick-start-guides ul li.selected:before{content:"\2022";position:absolute;left:0}@media screen and (min-width: 768px){.quick-start-guides ul li.selected:before{left:-5px}}.quick-start-guides .select-instructions{color:#262626;border-bottom:2px solid #a5a5a5;margin-bottom:1rem;font-size:1.125rem;display:inline-block}@media screen and (min-width: 768px){.quick-start-guides .select-instructions{margin-bottom:0}}.homepage .news-banner-container{background:#000;color:#fff;text-align:center;padding:20px;width:90%}.homepage .news-banner-container .right-arrow,.homepage .news-banner-container .left-arrow{height:15px;bottom:-3px;position:relative}@media screen and (min-width: 768px){.homepage .news-banner-container .right-arrow,.homepage .news-banner-container .left-arrow{bottom:-8px}}.homepage .news-banner-container .right-arrow:hover,.homepage .news-banner-container .left-arrow:hover{cursor:pointer}.homepage .news-banner-container .right-arrow{float:right}.homepage .news-banner-container .left-arrow{float:left}.homepage #news-items .pagination{display:none !important}.banner-info{display:inline-block;overflow:hidden;text-overflow:ellipsis;white-space:nowrap;margin:auto;width:80%;font-size:1.125rem}@media screen and (min-width: 768px){.banner-info{padding-top:3px}}.banner-info:hover{cursor:pointer;color:#ee4c2c}.news-banner-text a{color:white}.news-banner-text a:hover{color:#ee4c2c}.no-banner{padding-bottom:2rem}.homepage-box-module div.col-md{background:#F3F4F7;margin:10px;padding:30px}@media screen and (min-width: 768px){.homepage-box-module div.col-md{margin:20px}}.site-footer{padding:3.75rem 0;width:100%;background:#000;background-size:100%;margin-left:0;margin-right:0}@media screen and (min-width: 768px){.site-footer{position:absolute;left:0;bottom:0;height:620px}}.site-footer p{color:#fff}.site-footer ul{list-style-type:none;padding-left:0;margin-bottom:0}.site-footer ul li{font-size:1.125rem;line-height:2rem;color:#A0A0A1;padding-bottom:.375rem}.site-footer ul li.list-title{padding-bottom:.75rem;color:#fff}.site-footer ul li.list-title p{margin-bottom:0}.site-footer a:link,.site-footer a:visited{color:inherit}@media screen and (min-width: 768px){.site-footer a:hover{color:#ee4c2c}}.site-footer .privacy-policy{background:#000000;border-top:1px solid #fff;display:flex;flex-direction:column;margin-top:40px}.site-footer .privacy-policy ul{border-bottom:1px solid white}.site-footer .privacy-policy ul .privacy-policy-links{padding-bottom:1rem;padding-top:1rem;padding-right:1rem;display:inline-flex;color:white}.site-footer .privacy-policy .copyright{padding-top:1rem}.site-footer .privacy-policy .copyright p{color:#dfdfdf;font-size:14px}.site-footer .privacy-policy .copyright 
a{color:#dfdfdf;font-weight:600}.site-footer .privacy-policy .copyright a:hover{color:#dfdfdf;font-weight:600}.docs-tutorials-resources{background-color:#262626;color:#fff;padding-top:2.5rem;padding-bottom:2.5rem}@media screen and (min-width: 768px){.docs-tutorials-resources{padding-top:4.125rem;padding-bottom:4.09rem}}.docs-tutorials-resources h2{font-size:1.5rem;letter-spacing:-0.25px;text-transform:none;margin-bottom:0.25rem}@media screen and (min-width: 768px){.docs-tutorials-resources h2{margin-bottom:1.25rem}}.docs-tutorials-resources .col-md-4{margin-bottom:2rem}@media screen and (min-width: 768px){.docs-tutorials-resources .col-md-4{margin-bottom:0}}.docs-tutorials-resources .with-right-arrow{margin-left:12px;background-position:top 3px right 11px}@media screen and (min-width: 768px){.docs-tutorials-resources .with-right-arrow{background-position:top 6px right 11px}}.docs-tutorials-resources .with-right-arrow:hover{background-image:url("/assets/images/chevron-right-white.svg")}.docs-tutorials-resources p{font-size:1rem;line-height:1.5rem;letter-spacing:0.22px;color:#A0A0A1;margin-bottom:.5rem}@media screen and (min-width: 768px){.docs-tutorials-resources p{margin-bottom:1.25rem}}.docs-tutorials-resources a{font-size:1.125rem;color:#ee4c2c}.docs-tutorials-resources a:hover{color:#fff}.footer-container{position:relative}.footer-logo-wrapper{display:none}@media screen and (min-width: 768px){.footer-logo-wrapper{display:flex;grid-column:span 6}}.footer-logo-wrapper .footer-logo img{width:40px}.footer-links-wrapper{display:flex;flex-wrap:wrap;padding-bottom:1rem;border-bottom:1px solid white}@media screen and (min-width: 768px){.footer-links-wrapper{flex-wrap:initial;justify-content:flex-end}}.footer-links-col{margin-bottom:3.75rem;width:50%}@media screen and (min-width: 768px){.footer-links-col{margin-bottom:0;width:14%;margin-right:23px}.footer-links-col.follow-us-col{width:18%;margin-right:0}}@media (min-width: 768px) and (max-width: 1239px){.footer-links-col{width:18%;margin-right:30px}}.footer-social-icons{margin:8.5625rem 0 2.5rem 0}.footer-social-icons a{height:32px;width:32px;display:inline-block;background-color:#CCCDD1;border-radius:50%;margin-right:5px}.footer-social-icons a.facebook{background-image:url("/assets/images/logo-facebook-dark.svg");background-position:center center;background-size:9px 18px;background-repeat:no-repeat}.footer-social-icons a.twitter{background-image:url("/assets/images/logo-twitter-dark.svg");background-position:center center;background-size:17px 17px;background-repeat:no-repeat}.footer-social-icons a.youtube{background-image:url("/assets/images/logo-youtube-dark.svg");background-position:center center;background-repeat:no-repeat}.site-footer .mc-field-group{margin-top:-2px}.site-footer .email-subscribe-form input[type="submit"]{top:9px}@media screen and (min-width: 768px){.site-footer .email-subscribe-form input[type="submit"]{top:13px}}.social-links{grid-column:span 12;display:grid;grid-column-gap:3%;grid-row-gap:30px;grid-template-columns:repeat(6, minmax(0, 1fr))}@media (min-width: 600px){.social-links{grid-column:span 8}}@media screen and (min-width: 768px){.social-links{grid-column:span 6;align-self:end}}@media (max-width: 999px){.social-links{margin-left:10px;margin-right:10px}}.social-links li{text-align:center}.social-links svg{height:25px;max-width:30px;fill:#fff;color:#fff}.social-links svg:hover{fill:#ee4c2c;color:#ee4c2c}.lf-grid{grid-column-gap:3%;grid-row-gap:30px;display:grid;grid-template-columns:repeat(12, 
1fr)}.hs-recaptcha{display:none}.newsletter{line-height:140%;margin-bottom:80px}.newsletter__title{line-height:140%;font-size:24px}@media (min-width: 1000px){.newsletter__title{font-size:40px}}.newsletter .legal-consent-container{display:none}.newsletter p.newsletter__privacy{max-width:860px;margin-top:30px;line-height:21px;font-size:14px;color:#dfdfdf}.newsletter p.newsletter__privacy a{color:#dfdfdf;font-weight:600}.newsletter p.newsletter__privacy a:hover{color:#dfdfdf;font-weight:600}.newsletter .hbspt-form{min-height:300px}@media (min-width: 500px){.newsletter .hbspt-form{min-height:100px}}@media (min-width: 1000px){.newsletter .hbspt-form{min-height:20px}}.newsletter .hbspt-form .hs-error-msg{display:block;margin-right:8px;color:#ee4c2c;font-size:14px;line-height:1.1em;width:95%;padding-top:15px}.newsletter .hbspt-form .hs-form{display:grid;grid-template-columns:1fr;grid-gap:30px}@media (min-width: 500px){.newsletter .hbspt-form .hs-form{grid-template-columns:minmax(0, 1fr) minmax(0, 1fr)}}@media (min-width: 700px){.newsletter .hbspt-form .hs-form{grid-template-columns:repeat(3, minmax(0, 1fr))}}@media (min-width: 950px){.newsletter .hbspt-form .hs-form{grid-template-columns:1fr 1fr 1fr 1fr 1fr;grid-row-gap:1.5rem;grid-column-gap:1.5rem}}.newsletter .hbspt-form .hs-form input[type='text'],.newsletter .hbspt-form .hs-form input[type='email']{height:50px;width:100%;background:transparent;border:none;border-bottom:2px solid #fff;border-radius:0;transition:all 0.25s ease;color:#fff;font-size:16px;line-height:105%}@media (min-width: 500px){.newsletter .hbspt-form .hs-form input[type='text'],.newsletter .hbspt-form .hs-form input[type='email']{height:42px}}@media (min-width: 500px){.newsletter .hbspt-form .hs-form input[type='text'],.newsletter .hbspt-form .hs-form input[type='email']{font-size:20px}}.newsletter .hbspt-form .hs-form input[type='text']::-moz-placeholder, .newsletter .hbspt-form .hs-form input[type='email']::-moz-placeholder{color:#fff;font-size:16px;line-height:105%}.newsletter .hbspt-form .hs-form input[type='text']:-ms-input-placeholder, .newsletter .hbspt-form .hs-form input[type='email']:-ms-input-placeholder{color:#fff;font-size:16px;line-height:105%}.newsletter .hbspt-form .hs-form input[type='text']::-ms-input-placeholder, .newsletter .hbspt-form .hs-form input[type='email']::-ms-input-placeholder{color:#fff;font-size:16px;line-height:105%}.newsletter .hbspt-form .hs-form input[type='text']::placeholder,.newsletter .hbspt-form .hs-form input[type='email']::placeholder{color:#fff;font-size:16px;line-height:105%}@media (min-width: 500px){.newsletter .hbspt-form .hs-form input[type='text']::-moz-placeholder, .newsletter .hbspt-form .hs-form input[type='email']::-moz-placeholder{font-size:20px}.newsletter .hbspt-form .hs-form input[type='text']:-ms-input-placeholder, .newsletter .hbspt-form .hs-form input[type='email']:-ms-input-placeholder{font-size:20px}.newsletter .hbspt-form .hs-form input[type='text']::-ms-input-placeholder, .newsletter .hbspt-form .hs-form input[type='email']::-ms-input-placeholder{font-size:20px}.newsletter .hbspt-form .hs-form input[type='text']::placeholder,.newsletter .hbspt-form .hs-form input[type='email']::placeholder{font-size:20px}}.newsletter .hbspt-form .hs-form input[type='text']:focus,.newsletter .hbspt-form .hs-form input[type='email']:focus{outline:0;border-bottom:2px solid #ee4c2c;transition:color 0.25s ease}.newsletter .hbspt-form .hs-form input[type='text']:focus::-moz-placeholder, .newsletter .hbspt-form .hs-form 
input[type='email']:focus::-moz-placeholder{-moz-transition:color 0.25s ease;transition:color 0.25s ease;color:transparent}.newsletter .hbspt-form .hs-form input[type='text']:focus:-ms-input-placeholder, .newsletter .hbspt-form .hs-form input[type='email']:focus:-ms-input-placeholder{-ms-transition:color 0.25s ease;transition:color 0.25s ease;color:transparent}.newsletter .hbspt-form .hs-form input[type='text']:focus::-ms-input-placeholder, .newsletter .hbspt-form .hs-form input[type='email']:focus::-ms-input-placeholder{-ms-transition:color 0.25s ease;transition:color 0.25s ease;color:transparent}.newsletter .hbspt-form .hs-form input[type='text']:focus::placeholder,.newsletter .hbspt-form .hs-form input[type='email']:focus::placeholder{transition:color 0.25s ease;color:transparent}.newsletter .hbspt-form .hs-form input:-webkit-autofill,.newsletter .hbspt-form .hs-form input:-webkit-autofill:hover,.newsletter .hbspt-form .hs-form input:-webkit-autofill:focus,.newsletter .hbspt-form .hs-form textarea:-webkit-autofill,.newsletter .hbspt-form .hs-form textarea:-webkit-autofill:hover,.newsletter .hbspt-form .hs-form textarea:-webkit-autofill:focus,.newsletter .hbspt-form .hs-form select:-webkit-autofill,.newsletter .hbspt-form .hs-form select:-webkit-autofill:hover,.newsletter .hbspt-form .hs-form select:-webkit-autofill:focus{-webkit-text-fill-color:#fff}.newsletter .hbspt-form .hs-form select{-webkit-appearance:none;-moz-appearance:none;appearance:none;background:transparent;border:0px solid transparent;border-bottom:2px solid #fff;border-radius:0;box-shadow:0 1px 0 1px transparent;display:block;height:50px;margin:0;max-width:100%;padding:0.25em 0 calc(0.25em + 1px) 5px;transition:all 0.25s ease;width:100%;color:#fff;font-size:16px;line-height:105%}@media (min-width: 500px){.newsletter .hbspt-form .hs-form select{height:42px}}@media (min-width: 500px){.newsletter .hbspt-form .hs-form select{font-size:20px}}.newsletter .hbspt-form .hs-form select::-ms-expand{display:none}.newsletter .hbspt-form .hs-form select:focus{outline:0;border-bottom:2px solid #ee4c2c}.newsletter .hbspt-form .hs-form select:focus::-moz-placeholder{-moz-transition:color 0.4s ease;transition:color 0.4s ease;color:transparent}.newsletter .hbspt-form .hs-form select:focus:-ms-input-placeholder{-ms-transition:color 0.4s ease;transition:color 0.4s ease;color:transparent}.newsletter .hbspt-form .hs-form select:focus::-ms-input-placeholder{-ms-transition:color 0.4s ease;transition:color 0.4s ease;color:transparent}.newsletter .hbspt-form .hs-form select:focus::placeholder{transition:color 0.4s ease;color:transparent}.newsletter .hbspt-form .hs-form select option{font-weight:normal;color:black}.newsletter .hbspt-form .hs-form .hs-button{border-radius:5px;margin-top:20px;border:none;background-color:#ee4c2c;color:#fff;font-weight:400;padding:11px 40px;font-size:16px;font-weight:700;text-decoration:none}.newsletter .hbspt-form .hs-form .hs-input.invalid{border-bottom:2px dashed red !important}.newsletter .hbspt-form .hs-form .hs_error_rollup{display:none}.newsletter .submitted-message{display:flex;align-content:center;align-items:center;justify-content:center;border:2px solid #fff;min-height:280px;font-size:18px;padding:20px 20px 0;line-height:1.1em}@media (min-width: 500px){.newsletter .submitted-message{min-height:80px}}@media (min-width: 1000px){.newsletter .submitted-message{min-height:unset}}.newsletter .submitted-message p{max-width:none}.main-content-wrapper{margin-top:300px}@media screen and (min-width: 
768px){.main-content-wrapper{margin-top:540px;min-height:400px}}.main-content{padding-top:1.5rem;padding-bottom:1.5rem}@media screen and (min-width: 768px){.main-content{padding-top:2.625rem}}.main-content-menu{margin-bottom:1.25rem}@media screen and (min-width: 768px){.main-content-menu{margin-bottom:5rem}}.main-content-menu .navbar-nav .nav-link{color:#262626;padding-left:1.875rem;padding-right:1.875rem}@media screen and (min-width: 768px){.main-content-menu .navbar-nav .nav-link:first-of-type{padding-left:0}}article.pytorch-article{max-width:920px;margin:0 auto;padding-bottom:90px}article.pytorch-article h2,article.pytorch-article h3,article.pytorch-article h4,article.pytorch-article h5,article.pytorch-article h6{margin-top:1.875rem;margin-bottom:1.5rem;color:#262626}article.pytorch-article h2{font-size:1.5rem;letter-spacing:1.33px;line-height:2rem;margin-top:3.125rem;text-transform:uppercase}article.pytorch-article h3{font-size:1.5rem;letter-spacing:-0.25px;line-height:1.875rem;text-transform:none}article.pytorch-article h4,article.pytorch-article h5,article.pytorch-article h6{font-size:1.125rem;letter-spacing:-0.19px;line-height:1.875rem}article.pytorch-article p{margin-bottom:1.125rem}article.pytorch-article p,article.pytorch-article ul li,article.pytorch-article ol li,article.pytorch-article dl dt,article.pytorch-article dl dd,article.pytorch-article blockquote{font-size:1.125rem;line-height:1.875rem;color:#6c6c6d}article.pytorch-article table{margin-bottom:2.5rem;width:100%}article.pytorch-article table thead{border-bottom:1px solid #cacaca}article.pytorch-article table th,article.pytorch-article table tr,article.pytorch-article table td{color:#6c6c6d;font-size:1rem;letter-spacing:-0.17px}article.pytorch-article table th{padding:.625rem;color:#262626}article.pytorch-article table td{padding:.3125rem}article.pytorch-article ul,article.pytorch-article ol{margin:1.5rem 0 3.125rem 0}@media screen and (min-width: 768px){article.pytorch-article ul,article.pytorch-article ol{padding-left:6.25rem}}article.pytorch-article ul li,article.pytorch-article ol li{margin-bottom:.625rem}article.pytorch-article dl{margin-bottom:2.5rem}article.pytorch-article dl dt{margin-bottom:.75rem;font-weight:400}article.pytorch-article pre{margin-bottom:2.5rem}article.pytorch-article hr{margin-top:4.6875rem;margin-bottom:4.6875rem}article.pytorch-article blockquote{font-size:.75rem;font-style:italic;padding:15px 15px 5px 15px;width:100%;background-color:rgba(211,211,211,0.3);border-left:2px solid #000000}article.pytorch-article h3.no_toc{margin:0px}article.pytorch-article nav{float:right;display:block;overflow-y:auto;background-color:white;margin-left:20px;border-left:1px solid #717171}article.pytorch-article nav li{font-size:12px;line-height:20px;padding-top:0px;list-style:none}article.pytorch-article nav a{color:#717171;font-weight:bold}article.pytorch-article ul#markdown-toc{padding-left:1em;margin:0px}article.pytorch-article ul#markdown-toc ul{margin:0px;padding-left:1em}article.pytorch-article ul#markdown-toc li{margin:0px}.get-started article{margin-bottom:5rem}.get-started .quick-start-guides ul{margin-bottom:0;padding-left:0}.get-started .main-content-wrapper{margin-top:275px}@media screen and (min-width: 768px){.get-started .main-content-wrapper{margin-top:350px}}.get-started .jumbotron{height:190px}@media screen and (min-width: 768px){.get-started .jumbotron{height:260px}}.get-started .main-content .navbar{background-color:#f3f4f7;padding-left:0;padding-bottom:0;padding-top:0}@media (min-width: 
992px){.get-started .main-content .navbar li:first-of-type{padding-left:3.4375rem}.get-started .main-content .navbar .nav-item{padding:1rem;cursor:pointer}.get-started .main-content .navbar .nav-link{position:relative;top:10%;transform:translateY(-50%)}}.get-started .main-content .navbar .nav-select{background-color:#fff}.get-started .main-content .navbar .nav-select .nav-link{color:#ee4c2c;font-weight:500}.get-started .main-content .navbar .nav-link{font-size:1.125rem;color:#8c8c8c}.get-started .main-content .navbar .nav-link:hover{color:#ee4c2c}.get-started .main-content .navbar .get-started-nav-link{padding-left:1.25rem;padding-right:1.25rem}@media screen and (min-width: 768px){.get-started .main-content .navbar .get-started-nav-link{padding-left:1.875rem;padding-right:1.875rem}}.get-started .main-content .navbar .nav-item{padding-top:.9375rem;padding-bottom:.9375rem}@media screen and (min-width: 768px){.get-started .main-content .navbar .nav-item{padding-bottom:0;padding-top:2rem}}@media (min-width: 768px) and (max-width: 1239px){.get-started .main-content .navbar .nav-item{padding-bottom:0;padding-top:2rem}}@media (max-width: 990px){.get-started .main-content .navbar .nav-item{padding-bottom:.625rem;padding-top:1rem}}.get-started .main-content .navbar .navbar-toggler{margin-left:2.5rem}.get-started .main-content{padding-top:0}@media screen and (min-width: 768px){.get-started .main-content{padding-top:1.9rem}}.get-started .quick-start-module{padding-bottom:0;padding-top:0;background-color:#fff}.get-started .quick-start-module .option,.get-started .quick-start-module #command{border:2px solid #fff;background:#f3f4f7}.get-started .quick-start-module .title-block{border:2px solid #fff}.get-started .quick-start-module .selected{background-color:#ee4c2c}.get-started .quick-start-module h1{font-size:2rem;letter-spacing:1.78px;line-height:2.5rem;text-transform:uppercase;margin-bottom:1.5rem}.get-started .nav-menu-wrapper{background-color:#f3f4f7}.get-started .nav-menu-wrapper .container{padding-left:0;padding-right:0}@media screen and (min-width: 768px){.get-started .nav-menu-wrapper .container{padding-left:30px;padding-right:30px}}.get-started .navbar-nav{flex-direction:row}#installation .os{display:none}#installation .selected{display:block}#cloud .platform{display:none}#cloud .selected{display:block}.screencast{display:none}.screencast iframe{width:100% !important}.get-started .quick-starts .row.ptbuild,.get-started .quick-starts .row.os,.get-started .quick-starts .row.package,.get-started .quick-starts .row.language,.get-started .quick-starts .row.cuda{margin-bottom:1.25rem}@media screen and (min-width: 768px){.get-started .quick-starts .row.ptbuild,.get-started .quick-starts .row.os,.get-started .quick-starts .row.package,.get-started .quick-starts .row.language,.get-started .quick-starts .row.cuda{margin-bottom:0}}@media (min-width: 768px) and (max-width: 1239px){.get-started .quick-starts{flex:0 0 100%;max-width:100%}}@media screen and (min-width: 768px){.get-started .quick-starts{margin-bottom:2.5rem}.get-started .quick-starts .row{margin-bottom:0}}@media screen and (min-width: 1240px){.get-started .quick-starts{margin-bottom:0}}.get-started .get-started-locally-sidebar{padding-top:2.5rem;padding-bottom:2.5rem;top:15%;z-index:385}@media screen and (min-width: 768px){.get-started .get-started-locally-sidebar{padding-top:0;max-height:100vh;overflow:auto}}.get-started .get-started-locally-sidebar ul{padding-left:0}.get-started .get-started-locally-sidebar 
li{list-style-type:none;line-height:36px}.get-started .get-started-locally-sidebar li a{color:#8c8c8c}.get-started .get-started-locally-sidebar li a.active,.get-started .get-started-locally-sidebar li a:hover{color:#ee4c2c}.get-started .get-started-locally-sidebar li .subitem{padding-left:1.25rem}.get-started .get-started-locally-sidebar li.subitem{padding-left:1.25rem}.cloud-nav{display:none}.get-started .get-started-cloud-sidebar{padding-top:3.125rem;padding-bottom:2.5rem;top:15%}.get-started .get-started-cloud-sidebar ul{padding-left:0}.get-started .get-started-cloud-sidebar li{list-style-type:none;line-height:36px}.get-started .get-started-cloud-sidebar li a{color:#8c8c8c}.get-started .get-started-cloud-sidebar li a.active,.get-started .get-started-cloud-sidebar li a:hover{color:#ee4c2c}.get-started .get-started-cloud-sidebar li .subitem{padding-left:1.25rem}.get-started .get-started-cloud-sidebar li.subitem{padding-left:1.25rem}.pytorch-2 .article-wrapper article.pytorch-article table tr td:first-of-type{padding-left:10px}.pytorch-2 .article-wrapper article.pytorch-article table,.pytorch-2 .article-wrapper article.pytorch-article td{border:1px solid #A0A0A1;padding:10px}.pytorch-2 .article-wrapper article.pytorch-article b,.pytorch-2 .article-wrapper article.pytorch-article em,.pytorch-2 .article-wrapper article.pytorch-article h3,.pytorch-2 .article-wrapper article.pytorch-article h2,.pytorch-2 .article-wrapper article.pytorch-article p,.pytorch-2 .article-wrapper article.pytorch-article a,.pytorch-2 .article-wrapper article.pytorch-article strong,.pytorch-2 .article-wrapper article.pytorch-article td,.pytorch-2 .article-wrapper article.pytorch-article tr{font-family:Verdana}.pytorch-2 .article-wrapper article.pytorch-article ul,.pytorch-2 .article-wrapper article.pytorch-article ol{margin:1.5rem 0 1.5rem 0}.pytorch-2 .article-wrapper article.pytorch-article ul li,.pytorch-2 .article-wrapper article.pytorch-article ol li{font-family:Verdana}.pytorch-2 .article-wrapper article.pytorch-article code{font-family:IBMPlexMono,SFMono-Regular,Menlo,Monaco,Consolas,"Liberation Mono","Courier New",monospace;padding:2px;color:inherit;background-color:#f1f1f1}.pytorch-2 .article-wrapper article.pytorch-article p,.pytorch-2 .article-wrapper article.pytorch-article a{font-family:Verdana;word-break:break-word}.pytorch-2 .article-wrapper article.pytorch-article p strong,.pytorch-2 .article-wrapper article.pytorch-article a strong{font-family:Verdana}@media screen and (max-width: 418px){.pytorch-2 .article-wrapper article.pytorch-article .QnATable{max-width:95vw}}.ecosystem .jumbotron{height:170px}@media screen and (min-width: 768px){.ecosystem .jumbotron{height:300px}}.ecosystem .jumbotron h1{padding-top:8.4375rem;color:#fff}.ecosystem .jumbotron p.lead{margin-bottom:1.5625rem;padding-top:1.25rem;color:#fff}.ecosystem .jumbotron .ecosystem-join{margin-bottom:3rem}.ecosystem .jumbotron svg{margin-bottom:1.25rem}@media screen and (min-width: 768px){.ecosystem .main-content{padding-top:3.25rem}}.ecosystem .main-content-wrapper{background-color:#f3f4f7;margin-top:340px}@media screen and (min-width: 768px){.ecosystem .main-content-wrapper{margin-top:435px}}.ecosystem.ecosystem-detail .main-content-wrapper{background-color:#fff}.ecosystem-cards-wrapper{margin-bottom:1.125rem;padding-top:1.25rem}@media (min-width: 768px){.ecosystem-cards-wrapper .col-md-6{flex:0 0 100%;max-width:100%}}@media screen and (min-width: 1240px){.ecosystem-cards-wrapper .col-md-6{flex:0 0 50%;max-width:50%}}.ecosystem 
.main-content-menu .navbar-nav .nav-link{font-size:1.125rem;color:#CCCDD1;padding-right:0;margin-right:1.875rem}.ecosystem .main-content-menu .navbar-nav .nav-link.selected{color:#ee4c2c;border-bottom:1px solid #ee4c2c}@media screen and (min-width: 768px){.ecosystem .main-content-menu .nav-item:last-of-type{position:absolute;right:0}.ecosystem .main-content-menu .nav-item:last-of-type a{margin-right:0}}.ecosystem.ecosystem-detail .main-content{padding-bottom:0}.ecosystem article.pytorch-article{counter-reset:article-list}.ecosystem article.pytorch-article>ol{padding-left:0;list-style-type:none}@media screen and (min-width: 1240px){.ecosystem article.pytorch-article>ol>li{position:relative}.ecosystem article.pytorch-article>ol>li:before{counter-increment:article-list;content:counter(article-list,decimal-leading-zero);color:#B932CC;line-height:2.5rem;letter-spacing:-0.34px;font-size:2rem;font-weight:300;position:absolute;left:-60px;top:-16px;padding:.625rem 0;background-color:#fff;z-index:10}.ecosystem article.pytorch-article>ol>li:after{content:"";width:2px;position:absolute;left:-42px;top:0;height:100%;background-color:#f3f3f3;z-index:9}}.ecosystem article.pytorch-article>ol>li>h4{color:#262626}.ecosystem article.pytorch-article>ol>li ul li{list-style-type:disc}.ecosystem .quick-starts{background:#ecedf1}.ecosystem .quick-starts .title-block,.ecosystem .quick-starts #command,.ecosystem .quick-starts .option,.ecosystem .quick-starts .cloud-option{border-color:#ecedf1}.ecosystem .join-link{color:inherit;text-decoration:underline}.ecosystem .join-notice{text-align:center;padding-top:1.25rem;padding-bottom:2.5rem}.ecosystem .join-notice p{color:#6c6c6d;margin-bottom:0;line-height:1.875rem}.ecosystem .join-jumbotron{width:90%}@media screen and (min-width: 768px){.ecosystem .join-jumbotron{height:262px}}.ecosystem .join-jumbotron .container{max-width:920px}.ecosystem .join-jumbotron h1{padding-top:.3125rem;color:#fff}.ecosystem .join-jumbotron h1 span{font-weight:300}.ecosystem .join-wrapper{background-color:#f3f4f7}@media screen and (min-width: 768px){.ecosystem .join-wrapper .main-content{padding-top:1.5rem}}.ecosystem .join-wrapper .container{max-width:920px}.ecosystem .join-wrapper #success-response{color:#6c6c6d}.ecosystem .join-intro{color:#6c6c6d;line-height:28px}.ecosystem .requirements span{color:#000;font-weight:bold}.ecosystem .requirements .join-number{color:#812CE5;display:flex;align-items:center}@media screen and (min-width: 768px){.ecosystem .requirements .join-number{padding-left:.625rem}}.ecosystem .requirements p{margin-bottom:0;margin-top:-.4375rem}@media screen and (min-width: 768px){.ecosystem .requirements p{padding-left:1.5rem}}@media screen and (min-width: 768px){.ecosystem .requirements .col-md-11{border-left:2px solid #f3f4f7}}.ecosystem .row.requirements{padding-bottom:2.5rem}.ecosystem .experimental .ecosystem-card-title-container{display:inline-flex}.ecosystem .experimental .ecosystem-card-title-container .experimental-badge{text-transform:uppercase;margin-left:15px;background-color:#e4e4e4;color:#262626;opacity:0.75;font-size:.625rem;letter-spacing:1px;line-height:1.375rem;height:1.25rem;width:6rem;text-align:center;margin-top:.25rem}.ecosystem .ecosystem-card-title-container .card-title{padding-left:0;font-size:1.5rem;color:#262626}.ecosystem .star-list{list-style:none;padding-left:0}.ecosystem .star-list li{display:inline}.ecosystem .star-list li.github-stars-count-whole-number{display:none}.ecosystem 
.icon-count-container{display:inline-block;vertical-align:text-bottom;margin-left:.5rem}.ecosystem .github-logo{height:15px;width:13px;margin-left:10px}.ecosystem .github-stars-count{color:#797676;position:relative;top:.25rem;font-size:14px;margin-left:0.125rem}@media screen and (min-width: 768px){.ecosystem .github-stars-count{top:.1875rem;font-size:initial}}.ecosystem-divider{position:relative;margin-bottom:4rem;margin-top:1.5rem;top:3rem}.ecosystem #dropdownSort,.ecosystem #dropdownSortLeft{margin-left:0}.ecosystem #dropdownSortLeft{font-size:19px;top:inherit;right:inherit}.ecosystem-filter-menu ul{list-style-type:none;padding-left:1.25rem}.ecosystem-filter-menu ul li{padding-right:1.25rem;word-break:break-all}.ecosystem-filter-menu ul li a{color:#797676}.ecosystem-filter-menu ul li a:hover{color:#ee4c2c}.ecosystem .ecosystem-filter{cursor:pointer}.ecosystem .ecosystem-filter ul{list-style-type:none}.ecosystem #dropdownFilter,#dropdownSort,#dropdownSortLeft{color:#797676;cursor:pointer;z-index:1;position:absolute}.ecosystem .pagination .page{border:1px solid #dee2e6;padding:0.5rem 0.75rem}.ecosystem .pagination .active .page{background-color:#dee2e6}.ecosystem-form .hbspt-form{padding-bottom:3rem}.ecosystem-form .hbspt-form .hs-form-field{width:100%}.ecosystem-form .hbspt-form .hs-form-field .input input{width:100%;border:none;border-bottom:2px solid #812CE5;height:2.75rem;outline:none;padding-left:.9375rem;margin-bottom:1.875rem}.ecosystem-form .hbspt-form .hs-richtext h3{text-transform:uppercase;padding-top:1.5625rem;padding-bottom:1.875rem}.ecosystem-form .hbspt-form label{color:#6c6c6d}.ecosystem-form .hbspt-form textarea{width:100%;border:none;border-bottom:2px solid #812CE5;outline:none;padding-left:.9375rem;margin-bottom:1.875rem;height:5.625rem;padding-top:.625rem}.ecosystem-form .hbspt-form ::-moz-placeholder{color:#6c6c6d;opacity:0.5}.ecosystem-form .hbspt-form :-ms-input-placeholder{color:#6c6c6d;opacity:0.5}.ecosystem-form .hbspt-form ::-ms-input-placeholder{color:#6c6c6d;opacity:0.5}.ecosystem-form .hbspt-form ::placeholder{color:#6c6c6d;opacity:0.5}.ecosystem-form .hbspt-form .actions{display:flex;width:100%;justify-content:center}.ecosystem-form .hbspt-form .hs-button{padding-left:.75rem;margin-top:2.5rem;background-color:#ee4c2c;color:#fff;cursor:pointer;border:none;width:30%;height:2.8125rem;text-align:left;background-repeat:no-repeat;background-image:url(/assets/images/arrow-right-with-tail-white.svg);background-size:30px 12px;background-position:right}@media screen and (min-width: 768px){.ecosystem-form .hbspt-form .hs-button{padding-left:1.125rem;background-origin:content-box;background-size:30px 15px}}.features .main-content{padding-bottom:0}.features .navbar-nav .nav-link{color:#000}.features .nav-logo{background-image:url("/assets/images/logo-dark.svg")}@media screen and (min-width: 768px){.features .main-background{height:575px}}.features .main-content-wrapper{margin-top:350px}@media screen and (min-width: 768px){.features .main-content-wrapper{margin-top:540px}}.features-row{padding-bottom:3.75rem;align-items:center}.features-row:first-of-type{margin-top:1.25rem}.features-row:last-of-type{padding-bottom:4.5rem}@media screen and (min-width: 768px){.features-row{padding-bottom:6rem}.features-row:first-of-type{margin-top:4.05rem}}.features-row h3{font-size:2rem;letter-spacing:1.78px;line-height:2.25rem;font-weight:400;text-transform:uppercase;margin-bottom:1.25rem;font-weight:300}@media (min-width: 768px) and (max-width: 1239px){.features-row h3{width:80%}}@media 
screen and (min-width: 1240px){.features-row h3{width:590px}}.features-row p{font-size:1.125rem;letter-spacing:0.25px;line-height:1.75rem;color:#6c6c6d;padding-right:1.875rem}@media (min-width: 768px) and (max-width: 1239px){.features-row p{width:80%}}@media screen and (min-width: 1240px){.features-row p{width:590px}}.features-row .feature-content-holder{width:100%}@media screen and (min-width: 1240px){.features-row .feature-content-holder{width:495px}}.features-row .feature-content-holder pre.highlight{margin-bottom:0}.features-row:nth-child(odd) .col-md-6:nth-child(1n){order:2}.features-row:nth-child(odd) .col-md-6:nth-child(2n){order:1}@media screen and (min-width: 768px){.features-row:nth-child(odd) .col-md-6:nth-child(1n){order:1}.features-row:nth-child(odd) .col-md-6:nth-child(2n){order:2}}.features-row:nth-child(1n) h3{color:#B73BC9}.features-row:nth-child(1n) .feature-content-holder{border-bottom:2px solid #B73BC9}.features-row:nth-child(2n) h3{color:#D92F4C}.features-row:nth-child(2n) .feature-content-holder{border-bottom:2px solid #D92F4C}.features-row:nth-child(3n) h3{color:#8038E0}.features-row:nth-child(3n) .feature-content-holder{border-bottom:2px solid #8038E0}@media screen and (min-width: 1240px){.features-row .col-md-6{padding-left:0;padding-right:0}}@media screen and (min-width: 768px){.features-row .col-md-6:nth-of-type(2) .feature-content{width:100%}.features-row .col-md-6:nth-of-type(2) .feature-content h3,.features-row .col-md-6:nth-of-type(2) .feature-content p,.features-row .col-md-6:nth-of-type(2) .feature-content .feature-content-holder{float:right}}.features .jumbotron{height:200px}@media screen and (min-width: 768px){.features .jumbotron{height:195px}}@media (max-width: 320px){.features .jumbotron{height:250px}}.features .jumbotron h1{padding-top:1.875rem}@media screen and (min-width: 768px){.features .jumbotron{height:468px}.features .jumbotron h1{padding-top:0}}.features .jumbotron h1,.features .jumbotron p{color:#fff}@media screen and (min-width: 768px){.features .jumbotron .btn{margin-top:.375rem}}.resources .jumbotron{align-items:flex-end;color:#fff;height:220px}@media screen and (min-width: 768px){.resources .jumbotron{height:300px}}.resources .jumbotron h1{padding-top:8.4375rem}.resources .jumbotron p.lead{margin-bottom:1.5625rem;padding-top:1.25rem}.resources .main-content-wrapper{margin-top:385px;margin-bottom:0.75rem}@media screen and (min-width: 768px){.resources .main-content-wrapper{margin-top:475px}}@media screen and (min-width: 768px){.resources .resource-card{margin-bottom:2.25rem}}.quick-starts{background:#f3f4f7}.quick-starts .col-md-2-4{position:relative;width:100%;min-height:1px;padding-right:15px;padding-left:15px}@media (min-width: 768px){.quick-starts .col-md-2-4{flex:0 0 20%;max-width:20%}}.quick-starts .start-locally-col{margin-bottom:1.25rem}.quick-starts .start-locally-col .row.ptbuild,.quick-starts .start-locally-col .row.os,.quick-starts .start-locally-col .row.package,.quick-starts .start-locally-col .row.language,.quick-starts .start-locally-col .row.cuda{margin-bottom:1.25rem}@media screen and (min-width: 768px){.quick-starts .start-locally-col .row.ptbuild,.quick-starts .start-locally-col .row.os,.quick-starts .start-locally-col .row.package,.quick-starts .start-locally-col .row.language,.quick-starts .start-locally-col .row.cuda{margin-bottom:0}}@media (min-width: 768px) and (max-width: 1239px){.quick-starts .start-locally-col{flex:0 0 100%;max-width:100%}}@media screen and (min-width: 768px){.quick-starts 
.start-locally-col{margin-bottom:2.5rem}.quick-starts .start-locally-col .row{margin-bottom:0}}@media screen and (min-width: 1240px){.quick-starts .start-locally-col{margin-bottom:0}}.quick-starts .start-locally-col pre{font-size:80% !important;background-color:#ffffff !important}.quick-starts .start-locally-col .prev-versions-btn{margin-top:30px}@media (min-width: 768px) and (max-width: 1239px){.quick-starts .cloud-options-col{flex:0 0 100%;max-width:100%;margin-left:0;margin-top:1.25rem}}.quick-starts p{font-size:1.125rem;line-height:1.75rem}.quick-starts .card-body{flex:1 1 auto}.quick-starts .cloud-option-image{margin-left:.9375rem;margin-right:1.5625rem;margin-bottom:.3125rem}.quick-starts .cloud-option-row{margin-left:0;cursor:pointer}.quick-starts .option{border:2px solid #f3f4f7;font-size:1rem;color:#6c6c6d;letter-spacing:-0.22px;line-height:1.25rem;background:#fff;cursor:pointer}.quick-starts .option:hover{background-color:#ee4c2c;color:#fff}.quick-starts .selected{background-color:#ee4c2c;color:#fff}.quick-starts .block{margin-bottom:.0625rem;height:2.5rem;display:flex;align-items:center}.quick-starts .title-block{margin:.0625rem;height:2.5rem;border:2px solid #f3f4f7;font-size:1rem;color:#6c6c6d;line-height:1.25rem;display:flex;align-items:center}.quick-starts .title-block:before{display:block;content:".";color:transparent;border-left:2px solid #CCCDD1;height:100%;position:absolute;left:0}.quick-starts #command{color:#4a4a4a;background-color:#fff;padding:.9375rem;border:2px solid #f3f4f7;word-wrap:break-word;display:table-cell;vertical-align:middle}.quick-starts #command a{font-size:125%}@media screen and (min-width: 768px){.quick-starts #command a:hover{color:#ee4c2c}}.quick-starts #command pre{word-break:break-all;white-space:normal}.quick-starts .command-container{display:table;width:100%}@media screen and (min-width: 768px){.quick-starts .command-container{min-height:5.25rem}}.quick-starts .command-container pre{margin-bottom:0px;padding:0px;font-size:75%;background-color:#f3f4f7}.quick-starts .command-block{height:5.25rem;word-wrap:break-word;color:#6c6c6d}.quick-starts .command-block:before{border-left:2px solid #000}.quick-starts .quick-start-link{color:#6c6c6d}.quick-starts .mobile-heading{display:flex;align-items:center;font-weight:400}@media screen and (min-width: 768px){.quick-starts .mobile-heading{display:none}}.quick-starts .command-mobile-heading{display:flex;align-items:center;font-weight:400;color:#000}@media screen and (min-width: 768px){.quick-starts .command-mobile-heading{display:none}}.quick-starts .headings{display:none}@media screen and (min-width: 768px){.quick-starts .headings{display:block}}.quick-starts .cloud-options-col{margin-top:1.25rem}@media screen and (min-width: 768px){.quick-starts .cloud-options-col{margin-top:0}}@media (max-width: 978px){.quick-starts .os-text{margin-top:0}}.quick-start-guides{font-size:1.125rem;letter-spacing:0.25px;line-height:2.25rem;color:#CCCDD1}.quick-start-guides .select-instructions{color:#262626;border-bottom:2px solid #CCCDD1;margin-bottom:1rem;display:inline-block}@media screen and (min-width: 768px){.quick-start-guides .select-instructions{margin-bottom:0}}.quick-start-module{padding-top:2.5rem;padding-bottom:2.5rem}.quick-start-module .option-module{float:right}@media screen and (min-width: 768px){.quick-start-module{padding-top:4rem;padding-bottom:4.125rem}}.quick-start-module p{color:#6c6c6d;font-size:1.125em;letter-spacing:0.25px;padding-bottom:.9375rem;margin-bottom:1.4rem}.quick-start-module 
h3{font-size:1.5rem;letter-spacing:1.33px;line-height:2rem;text-transform:uppercase;margin-bottom:2.1rem}.quick-starts .cloud-option-body{display:flex;align-items:center;height:64px;padding:0 0 0 5rem;position:relative;background-image:url("/assets/images/chevron-right-orange.svg");background-size:6px 13px;background-position:center right 15px;background-repeat:no-repeat}@media screen and (min-width: 768px){.quick-starts .cloud-option-body:after{content:"";display:block;width:0;height:1px;position:absolute;bottom:0;left:0;background-color:#ee4c2c;transition:width .250s ease-in-out}.quick-starts .cloud-option-body:hover:after{width:100%}.quick-starts .cloud-option-body:hover{color:#262626}}@media screen and (min-width: 768px){.quick-starts .cloud-option-body{padding-right:2rem}}@media (min-width: 768px) and (max-width: 1239px){.quick-starts .cloud-option-body{padding-right:1.25rem}}@media screen and (min-width: 768px){.quick-starts .cloud-option-body{background-size:8px 14px}}.quick-starts .cloud-option-body:before{opacity:0.5;position:absolute;left:1.875rem;top:21px}.quick-starts .cloud-option-body.aws:before{content:url("/assets/images/aws-logo.svg")}.quick-starts .cloud-option-body.microsoft-azure:before{content:url("/assets/images/microsoft-azure-logo.svg")}.quick-starts .cloud-option-body.lightning-studios:before{content:url("/assets/images/lightning-studios-logo.svg")}.quick-starts .cloud-option-body.google-cloud:before{content:url("/assets/images/google-cloud-logo.svg")}.quick-starts .cloud-option-body.colab:before{content:url("/assets/images/colab-logo.svg")}@media screen and (min-width: 768px){.quick-starts .cloud-option-body:hover:before{opacity:1}}.quick-starts .cloud-option{background-color:#fff;margin-bottom:.125rem;border:2px solid #f3f4f7;font-size:1.125rem;letter-spacing:-0.25px;line-height:1.875rem;color:#262626}.quick-starts .cloud-option #microsoft-azure p{color:#262626;margin:0;padding:0;font-size:inherit;line-height:1.3rem}.quick-starts .cloud-option #microsoft-azure span{margin-bottom:0;padding-bottom:0;color:#ee4c2c;padding:0px 35px 0px 8px;font-style:italic;line-height:1.3rem}@media (min-width: 768px) and (max-width: 1239px){.quick-starts .cloud-option{font-size:1rem}}.quick-starts .cloud-option ul{display:none;width:100%;margin:0 0 1.25rem 0;padding:0}.quick-starts .cloud-option ul li{margin-top:0;position:relative;padding-left:5rem}@media (min-width: 768px) and (max-width: 1239px){.quick-starts .cloud-option ul li{font-size:1rem}}.quick-starts .cloud-option ul li a{color:#6c6c6d;letter-spacing:-0.25px;line-height:30px}@media screen and (min-width: 768px){.quick-starts .cloud-option ul li a:hover{color:#ee4c2c}}@media screen and (min-width: 768px){.quick-starts .cloud-option ul li:hover:before{content:"\2022";color:#ee4c2c;position:absolute;left:36px}}.quick-starts .cloud-option ul li:first-of-type{margin-top:1.25rem}.quick-starts .cloud-option.open .cloud-option-body{background-image:url("/assets/images/chevron-down-orange.svg");background-size:14px 14px;border-bottom:1px solid #ee4c2c;color:#262626}@media screen and (min-width: 768px){.quick-starts .cloud-option.open .cloud-option-body{border-bottom:none}}.quick-starts .cloud-option.open .cloud-option-body:after{width:100%}.quick-starts .cloud-option.open .cloud-option-body:before{opacity:1}.quick-starts .cloud-option.open ul{display:block}.blog .navbar-nav .nav-link{color:#000}.blog .main-content{padding-bottom:1.5rem}@media screen and (min-width: 768px){.blog 
.main-content{padding-top:1.70rem;padding-bottom:3.5rem}}.blog .main-background{height:290px}@media screen and (min-width: 768px){.blog .main-background{height:485px}}.blog .blog-detail-background{height:300px}@media screen and (min-width: 768px){.blog .blog-detail-background{height:312px}}.blog .main-content-menu .navbar-nav .nav-link{text-transform:capitalize}.blog .main-content-menu .navbar-nav .nav-link.selected{color:#ee4c2c !important;text-decoration:underline;-webkit-text-decoration-color:#ee4c2c;text-decoration-color:#ee4c2c;opacity:0.75 !important}@media screen and (min-width: 768px){.blog .main-content-menu .nav-item:last-of-type{position:absolute;right:0}.blog .main-content-menu .nav-item:last-of-type a{margin-right:0}}.blog .zoom-in{cursor:zoom-in}.blog .zoomed{cursor:zoom-out}.blog .zoomed img{margin:auto !important;position:absolute;top:0;left:0;right:0;bottom:0;max-width:98%}.blog .nav-logo{background-image:url("/assets/images/logo-dark.svg")}.blog .main-content-wrapper{margin-top:275px}.blog .main-content-wrapper .row.blog-index{margin-top:30px}.blog .main-content-wrapper .row.blog-index p{color:#6c6c6d}.blog .main-content-wrapper .row.blog-vertical{display:block;max-width:100%;margin:auto}.blog .main-content-wrapper .row.blog-vertical .col-md-4{display:initial}.blog .main-content-wrapper .row.blog-vertical .btn{float:left}.blog .main-content-wrapper .vertical-blog-container{border-bottom:1px solid #E2E2E2;padding-bottom:3rem}.blog .main-content-wrapper .vertical-blog-container:last-of-type{margin-bottom:2rem}@media screen and (min-width: 768px){.blog .main-content-wrapper{margin-top:470px}.blog .main-content-wrapper .row.blog-index [class*="col-"]:not(:first-child):not(:last-child):not(:nth-child(3n)){padding-right:2.1875rem;padding-left:2.1875rem}.blog .main-content-wrapper .row.blog-index [class*="col-"]:nth-child(3n){padding-left:2.1875rem}.blog .main-content-wrapper .row.blog-index [class*="col-"]:nth-child(3n+1){padding-right:2.1875rem}.blog .main-content-wrapper .col-md-4{margin-bottom:1.4375rem}}.blog .main-content-wrapper h4 a{font-family:FreightSans;font-size:1.5rem;color:#000;letter-spacing:0;line-height:2rem;font-weight:400}.blog .main-content-wrapper .author{color:#ee4c2c;font-size:1.25rem;letter-spacing:0.25px;line-height:1.875rem;margin-bottom:1.875rem}.blog .main-content-wrapper .author-icon{position:relative;top:1.625rem;height:1.0625rem;width:1.1875rem}.blog .blog-detail-content{padding-bottom:2.8rem}@media screen and (min-width: 768px){.blog .blog-detail-wrapper{margin-top:324px}}.blog .jumbotron{top:6.5625rem}@media screen and (min-width: 768px){.blog .jumbotron{height:25.3125rem}}@media screen and (min-width: 768px){.blog .jumbotron .container{padding-bottom:2.8125rem}}.blog .jumbotron .blog-index-title{overflow:hidden;margin-top:1.5rem;white-space:nowrap;text-overflow:ellipsis;color:white}@media screen and (min-width: 768px){.blog .jumbotron .blog-index-title{overflow:unset;white-space:unset;text-overflow:unset}}.blog .jumbotron h1{letter-spacing:-1.65px;font-size:3.25rem;line-height:3.5rem;text-transform:none;color:#fff}.blog .jumbotron h1 a{color:#fff;word-wrap:break-word}.blog .jumbotron h2{color:#fff}.blog .jumbotron .blog-title{display:inline-flex}.blog .jumbotron .blog-title:hover{color:#fff}.blog .jumbotron .blog-detail-container{padding-top:4rem}@media screen and (min-width: 768px){.blog .jumbotron .blog-detail-container{padding-top:10.875rem}}.blog .jumbotron p{font-size:1.25rem;letter-spacing:0;line-height:1.875rem;color:#fff}.blog .jumbotron 
.btn{margin-top:.75rem;padding-top:.5625rem}.blog .jumbotron .blog-page-container p.blog-date{padding-top:.625rem}.blog .jumbotron .blog-page-container .btn{margin-bottom:.625rem}.blog .blog-detail-jumbotron{top:45px}@media screen and (min-width: 768px){.blog .blog-detail-jumbotron{height:107px;top:75px}}.blog p.blog-date{font-size:1.125rem;letter-spacing:0;line-height:1.5rem;margin-bottom:.625rem;color:#6c6c6d}.blog p.featured-post{font-size:1.125rem;letter-spacing:0;line-height:1.5rem;margin-bottom:.625rem;color:#fff}.blog p.featured-blog-preview{margin-bottom:.75rem}.blog #blogPostFilter .nav-link{opacity:0.53;font-size:1.25rem;color:#000;letter-spacing:0;line-height:2.125rem}.blog .page-link{font-size:1.25rem;letter-spacing:0;line-height:2.125rem;color:#ee4c2c;width:7.5rem;text-align:center}.blog .blog-modal{max-width:75%;top:5rem}.blog .blog-modal:hover{cursor:zoom-out}@media (max-width: 575px){.blog .blog-modal{max-width:100%;top:10rem}}.blog .blog-image{cursor:zoom-in}@media (max-width: 1067px){.blog .jumbotron h1{margin-right:0;margin-top:1.5rem}.blog .jumbotron h1 a{font-size:2.8125rem;line-height:2.5rem}.blog .main-content-wrapper .col-md-4{margin-bottom:4.6875rem}.blog .similar-posts{margin-bottom:3.125rem}}@media (max-width: 1050px){.blog .main-content-wrapper .author-icon{left:-1.875rem}}.blog table tr th{font-weight:600}.blog .pytorch-article .enterprise-azure-logo-container{padding-left:0}.blog .pytorch-article .enterprise-azure-logo-container img{margin-bottom:0}.blog .pytorch-article img{margin-bottom:1.125rem}twitterwidget{margin:0 auto;margin-top:1.125rem !important;margin-bottom:1.125rem !important}.pytorch-article .outlined-code-block{border:1px solid black;padding:1rem;margin-bottom:1rem}.pytorch-article .outlined-code-block pre{margin:0;padding:0;background-color:white}.pytorch-article .reference-list li{overflow-wrap:anywhere}.similar-posts-module{background:#f3f4f7}.similar-posts-module p.blog-date{font-size:1.125rem;color:#CCCDD1;letter-spacing:0;line-height:1.5rem}.similar-posts-module h4 a{font-family:FreightSans;font-size:1.5rem;color:#000;letter-spacing:0;line-height:2rem;font-weight:400}.similar-posts-module .module-content{margin-bottom:2.1875rem}.similar-posts-module .module-content .navbar-nav{margin-top:3.75rem}.similar-posts-module .module-content .module-heading{text-transform:uppercase;color:#000;font-size:1.5rem;letter-spacing:.083125rem;line-height:2rem;font-weight:400}@media screen and (min-width: 768px){.similar-posts-module .module-content .nav-item:last-of-type{position:absolute;right:0}.similar-posts-module .module-content .nav-item:last-of-type a{margin-right:0}}.similar-posts-module .see-more-posts{color:#000;font-size:1.125rem;letter-spacing:-0.25px;line-height:1.875rem;top:.125rem}input[type='search']{-moz-appearance:none;-webkit-appearance:none}.navSearchWrapper{align-items:center;align-self:center;display:flex;justify-content:center;position:relative;right:10px;top:15px;margin-left:0;padding-bottom:20px}@media screen and (min-width: 768px){.navSearchWrapper{position:absolute;margin-left:30px;display:block;padding-left:3px;padding-bottom:0}}.tabletSearchWrapper{top:0px}@media (min-width: 768px) and (max-width: 1239px){.tabletSearchWrapper{padding-bottom:20px;position:relative;margin-left:0}}.navSearchWrapper .aa-dropdown-menu{background:#f9f9f9;border:3px solid rgba(57,57,57,0.25);color:#393939;font-size:.875rem;left:auto !important;line-height:1.2em;right:0 !important}.navSearchWrapper .aa-dropdown-menu 
.algolia-docsearch-suggestion--category-header{background:#000;color:white;font-size:.875rem;font-weight:400}.navSearchWrapper .aa-dropdown-menu .algolia-docsearch-suggestion--category-header .algolia-docsearch-suggestion--highlight{background-color:#000;color:#fff}.navSearchWrapper .aa-dropdown-menu .algolia-docsearch-suggestion--title .algolia-docsearch-suggestion--highlight,.navSearchWrapper .aa-dropdown-menu .algolia-docsearch-suggestion--subcategory-column .algolia-docsearch-suggestion--highlight{color:#000}.navSearchWrapper .aa-dropdown-menu .algolia-docsearch-suggestion__secondary,.navSearchWrapper .aa-dropdown-menu .algolia-docsearch-suggestion--subcategory-column{border-color:rgba(57,57,57,0.3)}@media screen and (min-width: 768px){.navSearchWrapper .algolia-autocomplete .algolia-docsearch-suggestion--subcategory-column{word-wrap:normal}}input#search-input{background-color:inherit;border:none;border-radius:20px;color:#000;font-size:1.125rem;font-weight:300;line-height:20px;outline:none;padding-left:25px;position:relative;transition:0.5s width ease;display:none;width:220px;background-image:url("/assets/images/search-icon.svg");background-size:12px 15px;background-repeat:no-repeat;background-position:8px 5px}input#search-input:hover{background-image:url("/assets/images/search-icon-orange.svg")}input#mobile-search-input{font-size:2rem;background-color:transparent;color:#fff;border:none;outline:none;padding-left:25px;position:relative;border-top-left-radius:20px;border-bottom-left-radius:20px;width:300px;display:block}input#search-input:focus,input#search-input:active{color:#000}.navigationSlider .slidingNav .navSearchWrapper .algolia-docsearch-footer a{height:auto}@media only screen and (max-width: 735px){.navSearchWrapper{width:100%}}input::-moz-placeholder{color:#e5e5e5}input:-ms-input-placeholder{color:#e5e5e5}input::-ms-input-placeholder{color:#e5e5e5}input::placeholder{color:#e5e5e5}.hljs{padding:1.25rem 1.5rem}@media only screen and (max-width: 1024px){.reactNavSearchWrapper input#search-input{background-color:rgba(242,196,178,0.25);border:none;border-radius:20px;box-sizing:border-box;color:#393939;font-size:.875rem;line-height:20px;outline:none;padding-left:25px;position:relative;transition:background-color 0.2s cubic-bezier(0.68, -0.55, 0.265, 1.55),width 0.2s cubic-bezier(0.68, -0.55, 0.265, 1.55),color 0.2s ease;width:100%}.reactNavSearchWrapper input#search-input:focus,.reactNavSearchWrapper input#search-input:active{background-color:#000;color:#fff}.reactNavSearchWrapper .algolia-docsearch-suggestion--subcategory-inline{display:none}.reactNavSearchWrapper>span{width:100%}.reactNavSearchWrapper .aa-dropdown-menu{font-size:.75rem;line-height:2em;padding:0;border-width:1px;min-width:500px}.reactNavSearchWrapper .algolia-docsearch-suggestion__secondary{border-top:none}.aa-suggestions{min-height:140px;max-height:60vh;-webkit-overflow-scrolling:touch;overflow-y:scroll}}@media only screen and (min-width: 1024px){.navSearchWrapper{padding-left:10px;position:relative;right:auto;top:auto}}@media only screen and (min-width: 1024px) and (min-width: 768px){.navSearchWrapper{padding-left:3px;right:10px;margin-left:0}}@media only screen and (min-width: 1024px){.navSearchWrapper .algolia-autocomplete{display:block}.tabletSearchWrapper{right:10px}}@media only screen and (max-width: 735px){.reactNavSearchWrapper .aa-dropdown-menu{min-width:400px}}@media only screen and (max-width: 475px){.reactNavSearchWrapper 
.aa-dropdown-menu{min-width:300px}}.search-border{display:none;flex-direction:row;border:none;background-color:transparent;border-radius:20px;width:100%;float:right}@media screen and (min-width: 768px){.search-border{display:flex}}.mobile-search-border{flex-direction:row;border:none;background-color:rgba(255,255,255,0.1);border-radius:20px;width:100%;float:right;display:flex}@media (min-width: 768px) and (max-width: 1239px){.mobile-search-border{border-radius:25px}}#close-search{color:#ee4c2c;padding-right:10px;font-size:.99em;display:none;cursor:pointer}.active-header{margin-top:-1px}.active-search-icon{background-image:url("/assets/images/search-icon-orange.svg") !important;display:inline-block !important}.active-background{background-color:#f3f4f7;width:50%;padding:4px}.homepage-header input#search-input{background-image:url("/assets/images/search-icon-white.svg");color:#fff}.homepage-header input#search-input:focus,.homepage-header input#search-input:active{color:#fff}.homepage-header .active-background{background-color:#88888833}.homepage-header #close-search{color:#fff;opacity:0.5}.homepage-header #close-search:hover{color:#ee4c2c}.homepage-header #search-icon{background-image:url(/assets/images/search-icon-white.svg)}.homepage-header #search-icon:hover{background-color:#88888833}#search-icon{background-image:url(/assets/images/search-icon.svg);color:transparent;width:33px;height:33px;background-size:21px 21px;background-repeat:no-repeat;background-position:6px 5px;border-radius:25px;cursor:pointer}#search-icon:hover{background-color:#f3f4f7}#mobile-search-icon{background-image:url(/assets/images/search-icon-white.svg);width:30px;height:38px;background-size:16px 28px;background-repeat:no-repeat;background-position:0px 5px;cursor:pointer;border-top-right-radius:20px;border-bottom-right-radius:20px}@media (min-width: 768px) and (max-width: 1239px){#mobile-search-icon{height:50px;width:35px;background-size:20px 42px}}.navSearchWrapper .algolia-autocomplete .ds-dropdown-menu{min-width:330px;height:500px;overflow-y:scroll}@media screen and (min-width: 768px){.navSearchWrapper .algolia-autocomplete .ds-dropdown-menu{height:auto;min-width:700px;overflow-y:hidden}}@media (min-width: 768px) and (max-width: 1239px){.navSearchWrapper .algolia-autocomplete .ds-dropdown-menu{height:700px;overflow-y:scroll}}@media (min-width: 769px) and (max-width: 1024px){.navSearchWrapper .algolia-autocomplete .ds-dropdown-menu{min-width:950px}}.cookie-banner-wrapper{display:none}.cookie-banner-wrapper.is-visible{display:block;position:fixed;bottom:0;background-color:#f3f4f7;min-height:100px;width:100%;z-index:401;border-top:3px solid #ededee}.cookie-banner-wrapper .gdpr-notice{color:#6c6c6d;margin-top:1.5625rem;text-align:left;max-width:1440px}@media screen and (min-width: 768px){.cookie-banner-wrapper .gdpr-notice{width:77%}}@media (min-width: 768px) and (max-width: 1239px){.cookie-banner-wrapper .gdpr-notice{width:inherit}}.cookie-banner-wrapper .gdpr-notice .cookie-policy-link{color:#343434}.cookie-banner-wrapper .close-button{-webkit-appearance:none;-moz-appearance:none;appearance:none;background:transparent;border:1px solid #f3f4f7;height:1.3125rem;position:absolute;bottom:42px;right:0;top:0;cursor:pointer;outline:none}@media screen and (min-width: 768px){.cookie-banner-wrapper .close-button{right:20%;top:inherit}}@media (min-width: 768px) and (max-width: 1239px){.cookie-banner-wrapper .close-button{right:0;top:0}}.hub .jumbotron{height:300px}@media screen and (min-width: 768px){.hub 
.jumbotron{height:420px}}.hub .jumbotron h1{color:#fff}.hub .jumbotron h1 #hub-header,.hub .jumbotron h1 #hub-sub-header{font-weight:lighter}.hub .jumbotron p.lead,.hub .jumbotron p.hub-release-message{margin-bottom:1.5625rem;padding-top:1.5625rem;color:#fff}@media screen and (min-width: 768px){.hub .jumbotron p.lead,.hub .jumbotron p.hub-release-message{width:77%}}.hub .jumbotron p.hub-release-message{padding-top:0;font-style:italic}.hub .jumbotron svg{margin-bottom:1.25rem}.hub .jumbotron p.detail-lead{padding-top:3.125rem;color:#797676;width:100%;margin-bottom:0px}.hub .jumbotron p.lead-summary{color:#6c6c6d}.hub.hub-index .jumbotron{height:280px}@media screen and (min-width: 768px){.hub.hub-index .jumbotron{height:325px}}.hub .detail-github-link{background:#ee4c2c;color:#fff}.hub .detail-colab-link{background:#ffc107;color:#000}.hub .detail-web-demo-link{background:#4a9fb5;color:#fff}.hub .detail-colab-link,.hub .detail-github-link,.hub .detail-web-demo-link{margin-top:1rem}.hub .detail-button-container{margin-top:2.8125rem}@media (min-width: 768px) and (max-width: 1239px){.hub .detail-button-container{margin-top:1.25rem}}@media (max-width: 320px){.hub .detail-button-container{margin-top:1.25rem}}@media (max-width: 360px){.hub .detail-button-container{margin-top:1.25rem}}.hub a .detail-colab-link,.hub a .detail-github-link{padding-right:3.125rem}.hub .detail-arrow{color:#ee4c2c;font-size:2.5rem}@media screen and (min-width: 768px){.hub .detail-arrow{font-size:4.5rem}}.hub .with-right-white-arrow{padding-right:2rem;position:relative;background-image:url("/assets/images/chevron-right-white.svg");background-size:6px 13px;background-position:top 10px right 11px;background-repeat:no-repeat}@media screen and (min-width: 768px){.hub .with-right-white-arrow{background-size:8px 14px;background-position:top 15px right 12px;padding-right:2rem}}.hub .main-content{padding-top:8.75rem}@media screen and (min-width: 768px){.hub .main-content{padding-top:8.4375rem}}@media (max-width: 320px){.hub .main-content{padding-top:10rem}}.hub.hub-detail .main-content{padding-top:12.5rem}@media screen and (min-width: 768px){.hub.hub-detail .main-content{padding-top:9.375rem}}.hub.hub-detail .jumbotron{height:350px}@media screen and (min-width: 768px){.hub.hub-detail .jumbotron{height:400px}}.hub .main-content-wrapper{background-color:#f3f4f7;margin-top:300px}@media screen and (min-width: 768px){.hub .main-content-wrapper{margin-top:395px}}.hub-feedback-button{border:2px solid #e2e2e2;color:#A0A0A1;padding-left:0;padding-right:5rem;font-size:1rem;width:13rem}.hub-feedback-button:after{bottom:-1px}.hub-flag{background-image:url("/assets/images/feedback-flag.svg");background-size:15px 20px;background-position:center right 10px;background-repeat:no-repeat}#hub-icons{height:2rem}@media (max-width: 480px){#hub-icons{position:initial;padding-left:0;padding-top:1rem}}.hub.hub-detail .main-content-wrapper{margin-top:305px}@media screen and (min-width: 768px){.hub.hub-detail .main-content-wrapper{margin-top:390px}}@media (min-width: 768px) and (max-width: 1239px){.hub.hub-detail .main-content-wrapper{margin-top:490px}}@media (max-width: 320px){.hub.hub-detail .main-content-wrapper{margin-top:330px}}.hub .hub-cards-wrapper,.hub-cards-wrapper-right{margin-bottom:1.125rem;padding-top:1.25rem}.hub .hub-cards-wrapper .card-body .card-summary,.hub-cards-wrapper-right .card-body .card-summary{width:75%}.hub .hub-cards-wrapper .card-body .hub-image,.hub-cards-wrapper-right .card-body 
.hub-image{position:absolute;top:0px;right:0px;height:100%;width:25%}.hub .hub-cards-wrapper .card-body .hub-image img,.hub-cards-wrapper-right .card-body .hub-image img{height:100%;width:100%}.hub .hub-cards-wrapper .card-body .hub-image:before,.hub-cards-wrapper-right .card-body .hub-image:before{content:'';position:absolute;top:0;left:0;bottom:0;right:0;z-index:1;background:#000000;opacity:.075}.hub .github-stars-count{color:#797676;position:relative;top:.25rem;font-size:14px}@media screen and (min-width: 768px){.hub .github-stars-count{top:.1875rem;font-size:initial}}.hub .github-stars-count-whole-number{display:none}.hub .github-logo{height:15px;width:13px}.hub .icon-count-container{display:inline-block;vertical-align:text-bottom;margin-left:.5rem}.hub .detail-count{font-size:1.25rem}.hub .main-stars-container{display:flex}.hub .detail-stars-container{display:inline-flex}.hub .detail-stars-container .github-stars-image{margin-left:0}.hub .card-body .hub-card-title-container{width:75%;display:inline-flex;max-width:18.75rem}.hub .card-body .hub-card-title-container .experimental-badge{text-transform:uppercase;margin-left:.9375rem;background-color:#e4e4e4;color:#262626;opacity:0.75;font-size:.625rem;letter-spacing:1px;line-height:1.375rem;height:1.25rem;width:6rem;text-align:center;margin-top:.25rem}.hub .card-body .hub-card-title-container .card-title{padding-left:0;font-size:1.5rem;color:#262626}.hub .card-body .hub-card-title-container .star-list{list-style:none;padding-left:0}.hub .card-body .hub-card-title-container .star-list li{display:inline}.hub .card-body .hub-card-title-container .star-list li.github-stars-count-whole-number{display:none}.hub .hub-filter-menu ul{list-style-type:none;padding-left:1.25rem}.hub .hub-filter-menu ul li{padding-right:1.25rem;word-break:break-all}.hub .hub-filter-menu ul li a{color:#797676}.hub .hub-filter-menu ul li a:hover{color:#ee4c2c}.hub .hub-filter{cursor:pointer}.hub-index #dropdownSortLeft{color:#797676;cursor:pointer;z-index:1;position:absolute;top:inherit;left:23%;max-width:4rem}@media (min-width: 480px) and (max-width: 590px){.hub-index #dropdownSortLeft{left:40%}}.hub #dropdownFilter,#dropdownSort,#dropdownSortLeft{color:#797676;cursor:pointer;z-index:1;position:absolute;top:11rem;right:1rem;left:inherit}@media (min-width: 480px) and (max-width: 590px){.hub #dropdownFilter,#dropdownSort,#dropdownSortLeft{top:7rem}}@media (min-width: 590px){.hub #dropdownFilter,#dropdownSort,#dropdownSortLeft{top:5rem}}@media screen and (min-width: 768px){.hub #dropdownFilter,#dropdownSort,#dropdownSortLeft{top:5rem}}.hub .sort-menu{left:inherit;right:1rem;top:12.5rem;max-width:12rem}@media (min-width: 480px) and (max-width: 590px){.hub .sort-menu{top:8.5rem}}@media (min-width: 590px) and (max-width: 900px){.hub .sort-menu{top:6.5rem}}@media (min-width: 900px) and (max-width: 1239px){.hub .sort-menu{top:6.5rem}}@media screen and (min-width: 1240px){.hub .sort-menu{right:0;top:6.5rem}}.hub-index .sort-menu{left:23%;top:inherit;max-width:12rem}.hub .research-hub-title,.research-hub-sub-title{text-transform:uppercase;letter-spacing:1.78px;line-height:2rem}.research-hub-sub-title{padding-bottom:1.25rem}.hub .research-hub-title{color:#ee4c2c}.hub .all-models-button,.full-docs-button{font-size:1.125rem;position:relative;cursor:pointer;outline:none;padding:.625rem 1.875rem .625rem 1.25rem;background-color:#fff;margin-bottom:0.125rem;border:2px solid 
#f3f4f7;letter-spacing:-0.25px;line-height:1.75rem;color:#6c6c6d;background-image:url("/assets/images/chevron-right-orange.svg");background-size:6px 13px;background-position:center right 10px;background-repeat:no-repeat}.hub .all-models-button a,.full-docs-button a{color:#6c6c6d}@media screen and (min-width: 768px){.hub .all-models-button:after,.full-docs-button:after{content:"";display:block;width:0;height:1px;position:absolute;bottom:0;left:0;background-color:#ee4c2c;transition:width .250s ease-in-out}.hub .all-models-button:hover:after,.full-docs-button:hover:after{width:100%}.hub .all-models-button:hover,.full-docs-button:hover{color:#262626}}.hub .hub-column{padding-bottom:4.6875rem}.hub.hub-index .hub-column{padding-bottom:0}.hub .how-it-works{padding-top:3.125rem;padding-bottom:2.8125rem}.hub .how-it-works .how-it-works-text{color:#6c6c6d;font-size:1.25rem;letter-spacing:0;line-height:1.875rem}.hub .how-it-works .how-it-works-title-col{padding-bottom:3.4375rem}.hub .how-it-works .full-docs-button{margin-top:1.875rem}.hub .hub-code-text{font-size:80%;color:#262626;background-color:#e2e2e2;padding:2px}.hub .hub-code-block{display:block;border-left:3px solid #ee4c2c;padding:1.25rem 1.5625rem 1.25rem 1.5625rem;margin-bottom:3.75rem}.hub pre.highlight{background-color:#e2e2e2;border-left:2px solid #ee4c2c}.hub code.highlighter-rouge{background-color:#e2e2e2}.hub article{padding-top:1.25rem}@media screen and (min-width: 768px){.hub article{padding-top:0}}.hub article p{color:#262626}@media screen and (min-width: 768px){.hub .hub-detail-background{height:515px}}.hub .dropdown-menu{border-radius:0;padding-bottom:0}.hub .card:hover .hub-image:before{bottom:100%}.hub.hub.hub-detail .github-stars-image img{height:9px}@media screen and (min-width: 768px){.hub.hub.hub-detail .github-stars-image img{height:10px}}.hub #development-models-hide,#research-models-hide{display:none}@media (min-width: 768px){.hub .col-md-6.hub-column{flex:0 0 100%;max-width:100%}}@media screen and (min-width: 1240px){.hub .col-md-6.hub-column{flex:0 0 50%;max-width:50%}}@media (min-width: 768px){.hub .col-md-12.hub-column .col-md-6{flex:0 0 100%;max-width:100%}}@media screen and (min-width: 1240px){.hub .col-md-12.hub-column .col-md-6{flex:0 0 100%;max-width:50%}}.hub .featured-image{padding-bottom:1.25rem}.hub .coming-soon{font-weight:300;font-style:italic}@media screen and (min-width: 768px){.hub.hub-index .jumbotron{height:325px}}.hub.hub-index .jumbotron h1{padding-top:0}@media screen and (min-width: 768px){.hub.hub-index .jumbotron h1{padding-top:3.4375rem}}.hub.hub-index .jumbotron p.lead{padding-top:3.4375rem}.hub.hub-index .main-content-wrapper{margin-top:210px}@media screen and (min-width: 768px){.hub.hub-index .main-content-wrapper{margin-top:280px}}.hub .page-link{font-size:1.25rem;letter-spacing:0;line-height:2.125rem;color:#ee4c2c;width:7.5rem;text-align:center}.hub .filter-btn{color:#797676;border:1px solid #797676;display:inline-block;text-align:center;white-space:nowrap;vertical-align:middle;padding:0.375rem 0.75rem;font-size:1rem;line-height:1.5;margin-bottom:5px}.hub .filter-btn:hover{border:1px solid #ee4c2c;color:#ee4c2c}.hub .selected{border:1px solid #ee4c2c;background-color:#ee4c2c;color:#fff}.hub .selected:hover{color:#fff}.hub .all-tag-selected{background-color:#797676;color:#fff}.hub .all-tag-selected:hover{border-color:#797676;color:#fff}.hub .pagination .page{border:1px solid #dee2e6;padding:0.5rem 0.75rem}.hub .pagination .active .page{background-color:#dee2e6}.hub 
.hub-tags-container{width:60%}.hub .hub-tags-container.active{width:0}@media screen and (min-width: 768px){.hub .hub-search-wrapper{top:8px}}.hub .hub-search-wrapper .algolia-autocomplete .ds-dropdown-menu{min-width:100%;max-width:100% !important}.hub .hub-search-wrapper .algolia-autocomplete{width:100%}.hub .hub-search-wrapper.active{width:100%}.hub .hub-search-wrapper span{font-size:1.125rem;text-align:center}@media (max-width: 480px){.hub #hub-search-icon{margin-top:1rem}}#hub-search-icon{background-image:url("/assets/images/search-icon.svg");color:transparent;opacity:0.4;width:25px;height:25px;margin-left:3rem;background-size:15px 20px;background-repeat:no-repeat;right:10px;position:absolute;z-index:1;cursor:pointer}#hub-search-icon:hover{background-image:url("/assets/images/search-icon-orange.svg");opacity:1}#hub-search-input{background-color:#CCCDD1;border:none;color:#000;font-size:1.125rem;font-weight:300;line-height:20px;outline:none;position:relative;display:none;width:100%;border-radius:5px;padding:.875rem 0 .875rem .3125rem}#hub-close-search{display:none;margin-left:20px;opacity:0.4;right:10px;position:absolute;z-index:1;cursor:pointer;font-size:1.125rem}@media screen and (min-width: 768px){#hub-close-search{top:1.125rem}}#hub-close-search:hover{color:#ee4c2c;opacity:1}.hub .hub-divider{margin-bottom:2.2rem;margin-top:1.5rem}.hub .active-hub-divider{border-color:#ee4c2c}.hub .hub-search-border{display:flex;align-items:center;flex-direction:row;border:none;background-color:transparent;border-radius:20px;width:100%}.hub .hub-cards-wrapper{z-index:1000}.hub .nav-container{display:flex;width:100%;position:absolute}.compact-cards{width:100%}.compact-cards a{color:#6C6C6D}.compact-cards a:hover{color:#ee4c2c}.compact-hub-card-wrapper{padding:0}.compact-card-container{display:flex;align-items:center}.compact-card-body{padding-top:8px}.compact-card-body:hover{border-bottom:1px solid #ee4c2c;color:#ee4c2c}.compact-card-body:hover .compact-item-title{color:#ee4c2c}.compact-card-body .compact-hub-card-title-container{width:75%;display:flex}.compact-model-card{height:auto;border-bottom:1px solid #E2E2E2}.compact-item-title{padding-left:0;color:#000}.compact-card-summary{white-space:nowrap;overflow:hidden;text-overflow:ellipsis;top:5px}.compact-hub-divider{padding:0;width:100%}.hub-select-container{position:absolute;right:0;height:2rem}.compact-hub-index-cards{padding-bottom:2rem}.full-hub-icon:hover{cursor:pointer;height:3rem}.compact-hub-icon{margin-left:0.5rem;margin-right:3.125rem}.compact-hub-icon:hover{cursor:pointer}.mobile article{margin-bottom:5rem}.mobile .main-background{height:275px}@media screen and (min-width: 768px){.mobile .main-background{height:380px}}.mobile .main-content-wrapper{margin-top:275px}@media screen and (min-width: 768px){.mobile .main-content-wrapper{margin-top:350px}}.mobile .jumbotron{height:190px}@media screen and (min-width: 768px){.mobile .jumbotron{height:260px}}.mobile .main-content .navbar{background-color:#f3f4f7;padding-left:0;padding-bottom:0;padding-top:0}@media (min-width: 992px){.mobile .main-content .navbar li:first-of-type{padding-left:3.4375rem}.mobile .main-content .navbar .nav-item{padding:2rem;cursor:pointer}.mobile .main-content .navbar .nav-link{position:relative;top:10%;transform:translateY(-50%)}}.mobile .main-content .navbar .nav-select{background-color:#fff}.mobile .main-content .navbar .nav-select .nav-link{color:#ee4c2c;font-weight:500}.mobile .main-content .navbar .nav-link{font-size:1.125rem;color:#8c8c8c}@media screen and 
(min-width: 768px){.mobile .main-content .navbar .nav-link{margin-left:1.875rem}}.mobile .main-content .navbar .nav-link:hover{color:#ee4c2c}.mobile .main-content .navbar .nav-item{padding-top:.9375rem;padding-bottom:.9375rem}@media screen and (min-width: 768px){.mobile .main-content .navbar .nav-item{padding-bottom:0;padding-top:2rem}}@media (min-width: 768px) and (max-width: 1239px){.mobile .main-content .navbar .nav-item{padding-bottom:0;padding-top:2rem}}@media (max-width: 990px){.mobile .main-content .navbar .nav-item{padding-bottom:.625rem;padding-top:1rem}}.mobile .main-content .navbar .navbar-toggler{margin-left:2.5rem}.mobile .main-content{padding-top:0}@media screen and (min-width: 768px){.mobile .main-content{padding-top:1.9rem}}.mobile .nav-menu-wrapper{background-color:#f3f4f7}.mobile .navbar-nav{flex-direction:row}.mobile .mobile-page-sidebar{padding-top:2.5rem;padding-bottom:2.5rem;top:15%}@media screen and (min-width: 768px){.mobile .mobile-page-sidebar{padding-top:0}}.mobile .mobile-page-sidebar ul{padding-left:0}.mobile .mobile-page-sidebar li{list-style-type:none;line-height:23px;margin-bottom:15px}.mobile .mobile-page-sidebar li a{color:#8c8c8c}.mobile .mobile-page-sidebar li a.active,.mobile .mobile-page-sidebar li a:hover{color:#ee4c2c}@media screen and (min-width: 1240px){.deep-learning .header-container{margin-bottom:1rem}}.deep-learning .jumbotron{height:180px}@media screen and (min-width: 768px){.deep-learning .jumbotron{height:250px}}.deep-learning .jumbotron .thank-you-page-container{margin-top:0}@media (min-width: 768px) and (max-width: 1239px){.deep-learning .jumbotron .thank-you-page-container{margin-top:250px}}@media screen and (min-width: 768px){.deep-learning .jumbotron .deep-learning-jumbotron-text{margin-top:55px}.deep-learning .jumbotron .deep-learning-jumbotron-text h1{padding-top:30px}}@media (min-width: 768px) and (max-width: 1239px){.deep-learning .jumbotron .deep-learning-jumbotron-text{max-width:95%;flex-basis:100%}}.deep-learning .jumbotron .deep-learning-thank-you-text{width:80%}.deep-learning .jumbotron .deep-learning-thank-you-text .download-book-link{display:inline-block}.deep-learning .jumbotron .deep-learning-landing-text{width:100%}@media screen and (min-width: 768px){.deep-learning .jumbotron .deep-learning-landing-text{width:85%}}.deep-learning .jumbotron .deep-learning-book-container{display:none}@media screen and (min-width: 768px){.deep-learning .jumbotron .deep-learning-book-container{display:block}}@media (min-width: 768px) and (max-width: 1239px){.deep-learning .jumbotron .deep-learning-book-container{display:none}}.deep-learning .jumbotron .thank-you-book-container{display:none}@media (min-width: 768px) and (max-width: 1239px){.deep-learning .jumbotron .thank-you-book-container{display:block}}@media screen and (min-width: 768px){.deep-learning .jumbotron .thank-you-book-container{display:block}}@media screen and (min-width: 768px){.deep-learning .deep-learning-col{max-width:80%}}@media screen and (min-width: 768px){.deep-learning .deep-learning-background{height:440px}}@media screen and (min-width: 768px){.deep-learning .header-holder{height:90px}}.deep-learning .main-content-wrapper{margin-top:250px}@media screen and (min-width: 768px){.deep-learning .main-content-wrapper{margin-top:480px}}@media screen and (min-width: 768px){.deep-learning .deep-learning-content{padding-top:0}}.deep-learning .main-background{height:250px}@media screen and (min-width: 768px){.deep-learning .main-background{height:440px}}.deep-learning 
.thank-you-wrapper{margin-top:400px}@media screen and (min-width: 768px){.deep-learning .thank-you-wrapper{margin-top:275px}}.deep-learning .thank-you-background{height:438px}@media screen and (min-width: 768px){.deep-learning .thank-you-background{height:680px}}.deep-learning-container{display:flex;align-items:center}.deep-learning-logo{background-image:url("/assets/images/pytorch-logo.png")}.deep-learning-row{display:flex;align-items:center}.deep-learning-row .lead{margin-top:1rem;margin-bottom:2rem}@media (min-width: 768px) and (max-width: 1239px){.deep-learning-row h1{font-size:3rem}}@media screen and (min-width: 768px){.deep-learning-row h1{margin-top:2rem}}.deep-learning-book{max-width:100%;height:400px}.deep-learning-form{margin-left:-1rem}@media screen and (min-width: 768px){.deep-learning-form{margin-left:0;margin-top:1rem}}#deep-learning-button{margin-top:2rem}.deep-learning-form .email-subscribe-form .deep-learning-input{padding-left:.5rem;background-color:#f3f4f7}.deep-learning-form #mce-error-response{color:#ee4c2c}.video-item{margin-bottom:5rem}.video-item a h5{color:#000;margin-top:1rem}.video-item a:hover h5{color:#ee4c2c}.video-item .image-container{overflow:hidden}.video-item .image-container img{margin:-10% 0;width:100%}.ecosystem .contributor-jumbotron{width:90%}@media screen and (min-width: 768px){.ecosystem .contributor-jumbotron{height:262px}}.ecosystem .contributor-jumbotron .container{max-width:920px}.ecosystem .contributor-jumbotron h1{padding-top:0}.ecosystem .contributor-jumbotron h1 span{font-weight:300;color:#812CE5}.ecosystem .contributor-jumbotron .contributor-jumbo-text h1{color:white}.ecosystem .contributor-jumbotron .contributor-jumbo-text h2{color:white;padding-top:0}.hidden{display:none}.contributor-container-fluid{height:4rem;width:100%}@media screen and (max-width: 767px){.contributor-container-fluid{margin-top:2rem}}@media screen and (min-width: 1200px){.contributor-container-fluid{margin-left:0}}.ecosystem .contributor.main-content{padding-top:0}.ecosystem .contributor.main-content .navbar{padding-left:0;padding-bottom:0;padding-top:0}.ecosystem .contributor.main-content .navbar .nav-item{cursor:pointer}.ecosystem .contributor.main-content .navbar .nav-item:last-of-type{position:relative}@media (min-width: 992px){.ecosystem .contributor.main-content .navbar .nav-item{padding:2rem;cursor:pointer}.ecosystem .contributor.main-content .navbar .nav-link{position:relative;top:10%;transform:translateY(-50%)}}.ecosystem .contributor.main-content .navbar .nav-select{background-color:#fff}.ecosystem .contributor.main-content .navbar .nav-select .nav-link{color:#ee4c2c;font-weight:500}.ecosystem .contributor.main-content .navbar .nav-link{font-size:1.125rem;color:#8c8c8c}@media screen and (min-width: 768px){.ecosystem .contributor.main-content .navbar .nav-link{margin-left:1.875rem}}.ecosystem .contributor.main-content .navbar .nav-link:hover{color:#ee4c2c}.ecosystem .contributor.main-content .navbar .contributor-nav-link{padding-left:1.25rem;padding-right:1.25rem}@media screen and (min-width: 768px){.ecosystem .contributor.main-content .navbar .contributor-nav-link{padding-left:1.875rem;padding-right:1.875rem}}.ecosystem .contributor.main-content .navbar .contributor-nav{flex-direction:row}.ecosystem .contributor.main-content .navbar .nav-item{padding-top:.9375rem;padding-bottom:.9375rem}@media screen and (min-width: 768px){.ecosystem .contributor.main-content .navbar .nav-item{padding-bottom:0;padding-top:2rem}}@media (min-width: 768px) and (max-width: 
1239px){.ecosystem .contributor.main-content .navbar .nav-item{padding-bottom:0;padding-top:2rem}}@media (max-width: 990px){.ecosystem .contributor.main-content .navbar .nav-item{padding-bottom:.625rem;padding-top:1rem}}.ecosystem .contributor.main-content .navbar .navbar-toggler{margin-left:2.5rem}.past-issue-container{display:flex}@media (max-width: 767px){.past-issue-container{display:block}}.past-issue-container .get-started-cloud-sidebar .sticky-top{position:-webkit-sticky;position:sticky;top:15%}@media (max-width: 767px){.past-issue-container .get-started-cloud-sidebar .sticky-top{position:relative;top:0;margin-left:0}}.past-issue-container .get-started-cloud-sidebar .pytorch-article li{list-style:initial}.past-issue-container .get-started-cloud-sidebar li{list-style-type:none;line-height:36px;color:#8c8c8c}.past-issue-container .get-started-cloud-sidebar span{white-space:nowrap}#past-issues{max-width:920px;margin:auto;margin-top:0;margin-bottom:0}.contributor-container{max-width:920px;left:0;right:0;margin-left:auto;margin-right:auto;padding-left:30px;padding-right:30px;width:90%}.past-issue-container.container{padding-left:5px;padding-top:45px}.nav-background{width:100%;background-color:#f3f4f7}#get-started-contributor-sidebar-list{padding-left:0}#get-started-contributor-sidebar-list .active{color:#ee4c2c}#get-started-contributor-sidebar-list li a{color:#8c8c8c}.two-column-row{max-width:920px;margin:0 auto 0 auto;padding:0 30px 43px 30px;width:90%}@media screen and (min-width: 768px){.two-column-row{display:flex}}.two-column-row h2{text-transform:uppercase;font-weight:100;margin-bottom:30px}.two-column-row p{margin-bottom:40px}.two-column-row .content-left{flex:60%;padding-top:76px}@media screen and (min-width: 768px){.two-column-row .content-left{margin-right:62px}}.two-column-row .content-left h2{color:#ee4c2c}.two-column-row .content-left .contributor-consent-check{max-width:400px}.two-column-row .content-left .email-consent{color:#797676;font-size:14px}.two-column-row .content-left .please-accept-terms{display:none;color:#ee4c2c;font-size:14px}.two-column-row .content-right{flex:40%;padding-top:76px}.two-column-row .content-right h2{color:#812CE5}.two-column-row .contributor-form{margin:-8px 0 47px 0}.two-column-row .contributor-form .form-success,.two-column-row .contributor-form .form-fail{color:#ee4c2c;display:none;flex:none;margin:8px 0 12px 0}.two-column-row .contributor-form form{width:100%}.two-column-row .contributor-form form .contributor-form-ui{display:flex;max-width:390px;flex-wrap:wrap}.two-column-row .contributor-form form .contributor-form-ui input[type="text"]{border:1px solid #e6e6e6;border-radius:4px;flex:1 70%;padding:5px 8px 5px 8px;margin-right:10px}.two-column-row .contributor-form form .contributor-form-ui input[type="text"]::-moz-placeholder{color:silver}.two-column-row .contributor-form form .contributor-form-ui input[type="text"]:-ms-input-placeholder{color:silver}.two-column-row .contributor-form form .contributor-form-ui input[type="text"]::-ms-input-placeholder{color:silver}.two-column-row .contributor-form form .contributor-form-ui input[type="text"]::placeholder{color:silver}.two-column-row .contributor-form form .contributor-form-ui input[type="text"]:focus{border:1px solid #ee4c2c}.two-column-row .contributor-form form .contributor-form-ui input[type="submit"]{background:#e6e6e6;border:none;border-radius:4px;color:#6d6d6d}.two-column-row .contributor-form form .contributor-form-ui 
input[type="submit"]:hover{background:silver;color:#3a3a3a}.two-column-row .contributor-form input[type="checkbox"]{margin:1px 6px 0 0}.two-column-row .contributor-form .contributor-consent-check{color:#797676;margin-top:1rem}.two-column-row .contributors-button{background-image:url("/assets/images/chevron-right-orange.svg");background-color:#fff;background-size:6px 13px;background-position:center right 10px;background-repeat:no-repeat;border:2px solid #f3f4f7;color:#6c6c6d;cursor:pointer;font-size:1.125rem;outline:none;letter-spacing:-0.25px;line-height:1.75rem;margin-bottom:0.125rem;padding:.625rem 1.875rem .625rem 1.25rem}.two-column-row .contributors-button a{color:#6c6c6d}@media screen and (min-width: 768px){.two-column-row .contributors-button:after{content:"";display:block;width:0;height:1px;position:absolute;bottom:0;left:0;background-color:#ee4c2c;transition:width .250s ease-in-out}.two-column-row .contributors-button:hover:after{width:100%}.two-column-row .contributors-button:hover{color:#262626}}.mobile .enterprise-jumbotron{height:210px}@media screen and (min-width: 768px){.mobile .enterprise-jumbotron{height:280px}}.enterprise{padding-bottom:0}.enterprise p,.enterprise li{color:#6c6c6d;font-size:18px}.enterprise h2{padding-bottom:1.5rem}.enterprise .container{padding:48px 30px 48px 30px}.enterprise .enterprise-gray-container{background-color:#f3f4f7}.enterprise .pyt-enterprise-logo{background-image:url("/assets/images/PTE_lockup_PRIMARY.svg");background-repeat:no-repeat;height:60px}.enterprise .container{max-width:940px}.enterprise .enterprise-landing-azure-logo-container{float:left;padding:0}.ecosystem .events-wrapper{background-color:white}@media screen and (min-width: 768px){.ecosystem .events-wrapper{margin-top:472px}}.ecosystem .events{padding-top:0}.ecosystem .events .event-info-container{display:flex;flex-flow:column}.ecosystem .events .sticky-top{top:15%}.ecosystem .events .event-label{margin-bottom:2rem}.ecosystem .live-event-container{display:flex}@media (max-width: 767px){.ecosystem .live-event-container{flex-flow:wrap}}.ecosystem .events-section{max-width:920px;margin:0 auto 0 auto;padding:0 30px 43px 30px;width:90%}.ecosystem .events-section .event-item{padding-bottom:3rem;border-bottom:1px solid #D6D7D8}.ecosystem .events-section .event-item h2{padding-bottom:1rem}.ecosystem .community-event{margin:0;padding:3px 10px;border:1px solid #8c8c8c;border-radius:3px;text-transform:uppercase;font-size:14px;font-weight:700;color:#8c8c8c}.ecosystem .event-side-nav-container{padding-left:3rem}.ecosystem .event-side-nav-container ul{list-style:none}.ecosystem .live-events-section p{font-size:18px;margin-top:2rem}@media (min-width: 768px) and (max-width: 1239px){.ecosystem .live-events-section{width:100%;padding-left:5px;padding-right:5px}}@media (max-width: 767px){.ecosystem .live-events-section{width:100%;padding-left:5px;padding-right:5px}}.ecosystem .events.main-content{padding-top:0}.events-container-fluid{height:5rem;width:100%;padding-bottom:7rem}@media screen and (max-width: 767px){.events-container-fluid{margin-top:2rem}}@media screen and (min-width: 1200px){.events-container-fluid{margin-left:0}}.events-container{max-width:920px;left:0;right:0;margin-left:auto;margin-right:auto;padding-left:0px;padding-right:0px;width:90%}.ecosystem .events.main-content .navbar{padding-left:0;padding-bottom:0;padding-top:0}.ecosystem .events.main-content .navbar .nav-item{cursor:pointer}.ecosystem .events.main-content .navbar .nav-item:last-of-type{position:relative}@media 
(min-width: 992px){.ecosystem .events.main-content .navbar .nav-item{padding:.5rem;cursor:pointer}.ecosystem .events.main-content .navbar .nav-link{position:relative;top:10%;transform:translateY(-50%)}}.ecosystem .events.main-content .navbar .nav-select{background-color:#fff}.ecosystem .events.main-content .navbar .nav-select .nav-link{color:#ee4c2c;font-weight:500}.ecosystem .events.main-content .navbar .nav-link{font-size:1.125rem;color:#8c8c8c}@media screen and (min-width: 768px){.ecosystem .events.main-content .navbar .nav-link{margin-left:1.875rem}}.ecosystem .events.main-content .navbar .nav-link:hover{color:#ee4c2c}.ecosystem .events.main-content .navbar .events-nav-link{padding-left:.9375rem;padding-right:.3125rem}@media screen and (min-width: 768px){.ecosystem .events.main-content .navbar .events-nav-link{padding-left:1.25rem;padding-right:1.25rem}}.ecosystem .events.main-content .navbar .events-nav{flex-direction:row}.ecosystem .events.main-content .navbar .nav-item{padding-top:.9375rem;padding-bottom:.9375rem}@media screen and (min-width: 768px){.ecosystem .events.main-content .navbar .nav-item{padding-bottom:0;padding-top:2rem}}@media (min-width: 768px) and (max-width: 1239px){.ecosystem .events.main-content .navbar .nav-item{padding-bottom:0;padding-top:2rem}}@media (max-width: 990px){.ecosystem .events.main-content .navbar .nav-item{padding-bottom:.625rem;padding-top:1rem}}.ecosystem .events.main-content .navbar .navbar-toggler{margin-left:2.5rem}.events-video-wrapper{width:100%;border:1px solid #797676;background-color:#f3f4f7;height:21rem;margin-top:2.5rem}.events-video-wrapper .video-container{display:flex;top:12%}.events-video-wrapper .video-tabs{display:flex}.events-video-wrapper .events-video-nav{flex-direction:row;padding-right:0;margin-bottom:1rem}.events-video-wrapper .events-video-nav .nav-item{border-right:1px solid #797676;border-bottom:1px solid #797676}.events-video-wrapper .events-video-nav .nav-select{background-color:#fff;border-bottom:none}.events-video-wrapper .events-video-nav .nav-select .nav-link{color:#ee4c2c}.events-video-wrapper .events-nav-link{text-align:center}.events-video-wrapper .video{position:relative;height:0;padding-bottom:30%;place-self:center}.events-video-wrapper .video-info{margin-left:3rem;max-width:45%}.events-video-wrapper iframe{height:100%;width:100%;position:absolute}.video-links-container{border:1px solid #797676}.video-links-container .video-links{display:flex}.video-links-container .video-links .video-link-item{padding-left:1rem;list-style:none}.episode-header-text{font-size:26px;margin-bottom:2rem}.episode-card-row{display:block}@media screen and (min-width: 908px){.episode-card-row{display:flex;flex-wrap:wrap;margin-bottom:2rem}}.episode-card-row .episode-card.resource-card{height:14rem;margin-right:1rem;margin-bottom:1rem;background-color:#f3f4f7;border:none;max-width:31%;flex:auto}.episode-card-row .episode-card.resource-card ul{list-style:none}.episode-card-row .episode-card.resource-card a{color:inherit}.episode-card-row .episode-card.resource-card .episode-body{display:block;position:relative;top:30px;margin-left:20px}.episode-card-row .episode-card.resource-card .episode-title{margin-left:3.2rem;margin-bottom:.5rem;font-size:1.5rem}@media screen and (min-width: 768px){.episode-card-row .episode-card.resource-card .episode-title{margin-left:2.5rem}}.episode-card-row .episode-card.resource-card .guest-name{font-weight:500;font-size:1.25rem;overflow:hidden;white-space:nowrap;text-overflow:ellipsis}.episode-card-row 
.episode-card.resource-card .episode-info{display:flex;justify-content:space-between}.episode-card-row .episode-card.resource-card .episode-info span{padding-left:5px;padding-right:5px}.episode-card-row .episode-card.resource-card .info-divide{display:block;border-bottom:1px solid #D6D7D8;margin-top:.5rem;margin-bottom:.5rem}.episode-card-row .episode-card.resource-card .episode-poster{color:#ee4c2c}.episode-card-row .episode-card.resource-card .episode-date-time{display:flex;padding-left:0}.episode-card-row .episode-card.resource-card .episode-date-time span{padding-left:5px;padding-right:5px}@media screen and (max-width: 907px){.episode-card-row .episode-card.resource-card{max-width:100%;margin-bottom:1.25rem}}.episode-card-row .episode-card.resource-card.pytorch-resource:before{content:"";background-size:32px 32px;background-repeat:no-repeat;display:block;position:absolute;height:32px;width:32px;top:30px;left:15px}@media screen and (min-width: 768px){.episode-card-row .episode-card.resource-card.pytorch-resource:before{left:30px;top:30px}}.podcast-container{padding-left:0}@media screen and (min-width: 768px){.podcast-container{display:flex}.podcast-container .podcast-card:not(:first-of-type){margin-left:1rem}}.podcast-container .podcast-card{display:flex;align-items:center;justify-content:center;margin-top:2rem;border:1px solid #D6D7D8;height:8.75rem}@media screen and (min-width: 768px){.podcast-container .podcast-card:after{content:"";display:block;width:0;height:1px;position:absolute;bottom:0;left:0;background-color:#ee4c2c;transition:width .250s ease-in-out}.podcast-container .podcast-card:hover:after{width:100%}.podcast-container .podcast-card:hover{color:#262626}}.podcast-container .podcast-title{font-size:24px;font-weight:400}.comm-stories .community-stories-wrapper{background-color:white}.comm-stories .community-stories{padding-top:0}.comm-stories .community-stories .production-info-container,.comm-stories .community-stories .research-info-container{display:flex;flex-flow:column}.comm-stories .community-stories .sticky-top{top:15%}.comm-stories .production-container,.comm-stories .research-container{display:flex;padding-left:0}@media (max-width: 767px){.comm-stories .production-container,.comm-stories .research-container{flex-flow:wrap}}.comm-stories .production-section,.comm-stories .research-section{max-width:920px;margin:0 auto 0 auto;padding:0 30px 43px 30px;width:90%}.comm-stories .production-section .production-item,.comm-stories .production-section .research-item,.comm-stories .research-section .production-item,.comm-stories .research-section .research-item{padding-bottom:2rem;padding-top:2rem;border-bottom:1px solid #d6d7d8}.comm-stories .production-section .production-item h2,.comm-stories .production-section .research-item h2,.comm-stories .research-section .production-item h2,.comm-stories .research-section .research-item h2{padding-bottom:1rem}.comm-stories .production-side-nav-container #research-sidebar-list,.comm-stories .production-side-nav-container #production-sidebar-list,.comm-stories .research-side-nav-container #research-sidebar-list,.comm-stories .research-side-nav-container #production-sidebar-list{padding-left:0}.comm-stories .production-side-nav-container #research-sidebar-list .active,.comm-stories .production-side-nav-container #production-sidebar-list .active,.comm-stories .research-side-nav-container #research-sidebar-list .active,.comm-stories .research-side-nav-container #production-sidebar-list .active{color:#ee4c2c}.comm-stories 
.production-side-nav-container #research-sidebar-list ul,.comm-stories .production-side-nav-container #production-sidebar-list ul,.comm-stories .research-side-nav-container #research-sidebar-list ul,.comm-stories .research-side-nav-container #production-sidebar-list ul{padding-left:3rem;list-style:none}.comm-stories .production-side-nav-container #research-sidebar-list ul li,.comm-stories .production-side-nav-container #production-sidebar-list ul li,.comm-stories .research-side-nav-container #research-sidebar-list ul li,.comm-stories .research-side-nav-container #production-sidebar-list ul li{line-height:36px}.comm-stories .production-side-nav-container #research-sidebar-list ul li a,.comm-stories .production-side-nav-container #production-sidebar-list ul li a,.comm-stories .research-side-nav-container #research-sidebar-list ul li a,.comm-stories .research-side-nav-container #production-sidebar-list ul li a{color:#8c8c8c}.comm-stories .production-section p,.comm-stories .research-section p{font-size:18px;margin-top:2rem}@media (min-width: 768px) and (max-width: 1239px){.comm-stories .production-section,.comm-stories .research-section{width:100%;padding-left:5px;padding-right:5px}}@media (max-width: 767px){.comm-stories .production-section,.comm-stories .research-section{width:100%;padding-left:5px;padding-right:5px}}.comm-stories .main-content-wrapper{margin-top:275px}@media screen and (min-width: 768px){.comm-stories .main-content-wrapper{margin-top:380px}}.comm-stories .jumbotron{color:#fff;height:190px}@media screen and (min-width: 768px){.comm-stories .jumbotron{height:260px}}.ecosystem .community-stories.main-content{padding-top:0}.community-stories-container-fluid{height:5rem;width:100%;padding-bottom:7rem}@media screen and (max-width: 767px){.community-stories-container-fluid{margin-top:2rem}}@media screen and (min-width: 1200px){.community-stories-container-fluid{margin-left:0}}.comm-stories .community-stories.main-content .navbar{padding-left:0;padding-bottom:0;padding-top:0}.comm-stories .community-stories.main-content .navbar .nav-item{cursor:pointer}.comm-stories .community-stories.main-content .navbar .nav-item:last-of-type{position:relative}@media (min-width: 992px){.comm-stories .community-stories.main-content .navbar .nav-item{padding:2rem;cursor:pointer}.comm-stories .community-stories.main-content .navbar .nav-link{position:relative;top:10%;transform:translateY(-50%)}}.comm-stories .community-stories.main-content .navbar .nav-select{background-color:#fff}.comm-stories .community-stories.main-content .navbar .nav-select .nav-link{color:#ee4c2c;font-weight:500}.comm-stories .community-stories.main-content .navbar .nav-link{font-size:1.125rem;color:#8c8c8c}@media screen and (min-width: 768px){.comm-stories .community-stories.main-content .navbar .nav-link{margin-left:1.875rem}}.comm-stories .community-stories.main-content .navbar .nav-link:hover{color:#ee4c2c}.comm-stories .community-stories.main-content .navbar .community-stories-nav-link{padding-left:1.25rem;padding-right:1.25rem}@media screen and (min-width: 768px){.comm-stories .community-stories.main-content .navbar .community-stories-nav-link{padding-left:1.875rem;padding-right:1.875rem}}.comm-stories .community-stories.main-content .navbar .community-stories-nav{flex-direction:row}.comm-stories .community-stories.main-content .navbar .nav-item{padding-top:.9375rem;padding-bottom:.9375rem}@media screen and (min-width: 768px){.comm-stories .community-stories.main-content .navbar 
.nav-item{padding-bottom:0;padding-top:2rem}}@media (min-width: 768px) and (max-width: 1239px){.comm-stories .community-stories.main-content .navbar .nav-item{padding-bottom:0;padding-top:2rem}}@media (max-width: 990px){.comm-stories .community-stories.main-content .navbar .nav-item{padding-bottom:.625rem;padding-top:1rem}}.comm-stories .community-stories.main-content .navbar .navbar-toggler{margin-left:2.5rem}.announcement .hero-content{top:148px;height:250px;position:relative;margin-bottom:120px;justify-content:center}@media screen and (min-width: 768px){.announcement .hero-content{top:178px;height:350px}}.announcement .hero-content h1{font-size:3.75rem;text-transform:uppercase;font-weight:lighter;letter-spacing:1.08px;margin-bottom:.625rem;line-height:1.05;color:#fff}@media screen and (min-width: 768px){.announcement .hero-content h1{font-size:4.5rem}}.announcement .hero-content h1.small{font-size:40px}@media screen and (min-width: 768px){.announcement .hero-content h1.small{font-size:58px}}.announcement .hero-content .lead{margin-bottom:1.5625rem;padding-top:1.875rem;color:#fff;width:100%}.announcement .row{justify-content:center}.announcement .main-content{margin-bottom:5rem;padding-bottom:0}.announcement .main-background{height:370px}@media screen and (min-width: 768px){.announcement .main-background{height:450px}}.announcement .card-container{display:grid;grid-template-columns:repeat(2, 1fr);gap:20px;padding-top:3rem}.announcement .card-container .card{border:none;display:block}.announcement .card-container .card a{color:#000}.announcement .card-container .card .card-body{display:flex;flex-direction:column;height:100%;justify-content:space-between;padding:0}.announcement .card-container .card .card-body img{width:100%;height:207px;-o-object-fit:contain;object-fit:contain;padding:20px}@media screen and (min-width: 1000px){.announcement .card-container .card .card-body img{padding:30px}}@media screen and (min-width: 1000px){.announcement .card-container{grid-template-columns:repeat(3, 1fr);gap:36px}}.announcement .contact-us-section{background-color:#f3f4f7;padding:50px 0}.announcement .contact-us-section .row{justify-content:center}.announcement .contact-us-section .row .lead{padding-top:1.5rem}.announcement .contact-us-section .row .hbspt-form{padding:30px 0}.announcement .contact-us-section .row .hbspt-form .hs-button{background-image:url("/assets/images/chevron-right-orange.svg");background-size:6px 13px;background-position:top 16px right 11px;background-repeat:no-repeat;border-radius:0;border:none;background-color:#fff;color:#6c6c6d;font-weight:400;position:relative;letter-spacing:0.25px;padding:.75rem 2rem .75rem .75rem;margin:10px 0}@media screen and (min-width: 768px){.announcement .contact-us-section .row .hbspt-form .hs-button:after{content:"";display:block;width:0;height:1px;position:absolute;bottom:0;left:0;background-color:#ee4c2c;transition:width .250s ease-in-out}.announcement .contact-us-section .row .hbspt-form .hs-button:hover:after{width:100%}.announcement .contact-us-section .row .hbspt-form .hs-button:hover{color:#262626}}@media screen and (min-width: 768px){.announcement .contact-us-section .row .hbspt-form .hs-button{background-position:top 19px right 11px}}.announcement .contact-us-section .row .hbspt-form fieldset.form-columns-2,.announcement .contact-us-section .row .hbspt-form fieldset.form-columns-1{max-width:100%}.announcement .contact-us-section .row .hbspt-form fieldset.form-columns-2 .hs-form-field,.announcement .contact-us-section .row .hbspt-form 
fieldset.form-columns-1 .hs-form-field{max-width:100%;padding:10px 0;width:100%}.announcement .contact-us-section .row .hbspt-form fieldset.form-columns-2 .hs-form-field input,.announcement .contact-us-section .row .hbspt-form fieldset.form-columns-1 .hs-form-field input{border:none;width:100%}.announcement .contact-us-section .row .hbspt-form fieldset.form-columns-2 .hs-form-field textarea,.announcement .contact-us-section .row .hbspt-form fieldset.form-columns-1 .hs-form-field textarea{border:none;width:100%}.announcement .contact-us-section .row .hbspt-form li.hs-form-radio input[type=radio]{width:auto !important}.announcement .contact-us-section .row .hbspt-form li.hs-form-radio span{margin-left:5px}.announcement .contact-us-section .row .hbspt-form ul{list-style-type:none}.announcement .light-background-section{background-color:#fff}.announcement .light-background-section .content{padding:40px 0}.announcement .light-background-section ul li{font-size:1.25rem;font-weight:300}.announcement .darker-background-section{background-color:#f3f4f7}.announcement .darker-background-section .content{padding:40px 0}.announcement .grey-background-section{background-color:#f3f4f7;padding:60px 0}.announcement .grey-background-section img{height:100px}.announcement .grey-background-section p{font-size:14px;line-height:170%}.announcement .color-background-section{background-image:url("/assets/images/pytorch_bg_purple.jpg");background-size:100% 100%;background-repeat:no-repeat;padding:60px 0}.announcement .color-background-section h2{color:white}.announcement .body-side-text .lead{margin-bottom:1.5625rem;padding-top:1.5rem}.announcement img{width:100%}.announcement h2.upper{font-size:25px;line-height:130%;text-align:center;letter-spacing:1.75px;text-transform:uppercase;margin-bottom:30px}.announcement h3.upper{font-size:19px;text-transform:uppercase;letter-spacing:1.75px;line-height:130%;margin:25px 0}.announcement table.benefits{background-color:white;font-size:14px;text-align:center}.announcement table.benefits td.benefit{border-left:none;min-width:300px;text-align:left}@media screen and (min-width: 768px){.announcement table.benefits td.benefit{min-width:520px}}.announcement table.benefits tbody td{border-left:1px solid #812CE5;vertical-align:middle}.announcement table.benefits tbody td.benefit{font-weight:600}.announcement table.benefits thead,.announcement table.benefits tfoot{background-color:#812CE5;color:white;font-size:16px;font-weight:700}@media screen and (min-width: 768px){.announcement table.benefits thead,.announcement table.benefits tfoot{font-size:20px}}.announcement table.benefits thead td,.announcement table.benefits tfoot td{border-left:1px solid #000;vertical-align:middle;border-top:none}.announcement table.benefits thead a,.announcement table.benefits tfoot a{text-decoration:underline;color:white}.announcement table.benefits thead td.price,.announcement table.benefits tfoot td.price{font-size:14px;line-height:1.2}@media screen and (min-width: 768px){.announcement table.benefits thead td.price,.announcement table.benefits tfoot td.price{font-size:16px}}.announcement table.benefits img{width:15px}.announcement .modal-header{border-bottom:none;padding-bottom:0}.announcement .consolidated-employees tbody td{font-weight:600}.announcement .consolidated-employees td.no-border{border-left:none}.announcement .member-boxes{gap:20px;margin:0}.announcement .member-boxes div.col-sm{background-color:white}.board-member{margin:35px 0}.board-member img{margin-bottom:15px}.board-member a 
svg{margin-top:5px;height:25px;max-width:30px;fill:#000;color:#000}.board-member a:hover svg{fill:#ee4c2c;color:#ee4c2c}.announcement .cloud-credits-table{font-size:1.1rem;margin-top:40px}.announcement .cloud-credits-table ul{padding-left:20px}.announcement .cloud-credits-table ul li{margin-top:10px;font-size:1.1rem}.announcement .cloud-credits-table .col-md{border-radius:5px;margin-bottom:40px}.announcement .cloud-credits-table .card{border-radius:6px}.announcement .cloud-credits-table .thead{border-top-left-radius:5px;border-top-right-radius:5px;color:#fff;padding:14px 20px;text-align:center}.announcement .cloud-credits-table .col-md:first-child .thead{background:conic-gradient(from 53deg at 37% 100%, #828282 0, rgba(130,130,130,0.95) 100%)}.announcement .cloud-credits-table .col-md:nth-child(2) .thead{background:conic-gradient(from 53deg at 37% 100%, #ab9344 0, rgba(171,147,68,0.95) 100%)}.announcement .cloud-credits-table .col-md:nth-child(3) .thead{background:conic-gradient(from 53deg at 37% 100%, #293850 0, rgba(41,56,80,0.95) 100%)}.announcement .cloud-credits-table .tbody{border-bottom:1px solid #d0d0d0;border-left:1px solid #d0d0d0;border-right:1px solid #d0d0d0;height:100%;padding:26px 20px}.announcement .cloud-credits-table .tfoot{background-color:#000;border-bottom-left-radius:5px;border-bottom-right-radius:5px;color:#fff;padding:20px;text-align:center}.announcement .steps-columns{background-color:transparent}.announcement .steps-columns .col-md{margin-bottom:20px;padding:20px}.announcement .steps-columns h3{margin-bottom:20px}.announcement .steps-columns .step{font-size:1.5rem;margin-bottom:5px;margin-top:20px}.announcement .steps-columns ul{padding-left:20px}.announcement .steps-columns ul li{margin-top:10px} diff --git a/assets/main.scss b/assets/main.scss deleted file mode 100644 index 46de7d9d32c7..000000000000 --- a/assets/main.scss +++ /dev/null @@ -1,40 +0,0 @@ ---- ---- - -@import "bootstrap/scss/functions"; -@import "bootstrap/scss/bootstrap"; -@import "syntax-highlighting"; - -$baseurl:"{{ site.baseurl }}"; - -@import "variables"; -@import "bootstrap-overrides"; -@import "fonts"; -@import "base_styles"; -@import "code"; -@import "navigation"; -@import "jumbotron"; -@import "homepage"; -@import "footer"; -@import "main-content"; -@import "article"; -@import "get-started"; -@import "ecosystem"; -@import "features"; -@import "resources"; -@import "quick-start-module"; -@import "blog"; -@import "similar-posts-module"; -@import "search"; -@import "cookie-banner"; -@import "hub"; -@import "hub-search"; -@import "compact"; -@import "mobile"; -@import "deep-learning"; -@import "videos"; -@import "contributors"; -@import "enterprise"; -@import "events"; -@import "community-stories"; -@import "announcement"; diff --git a/autonomous-language-model-systems.html b/autonomous-language-model-systems.html index 3b065fafb852..690dffefcd70 100644 --- a/autonomous-language-model-systems.html +++ b/autonomous-language-model-systems.html @@ -1,12 +1,310 @@ ---- -layout: default -title: "Towards Autonomous Language Model Systems" -body-class: announcement -background-class: announcement-background -permalink: /autonomous-language-model-systems ---- - -
        + + + + + + + + + + + + + Towards Autonomous Language Model Systems | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
        +
        + Join us at PyTorch Conference in San Francisco, October 22-23. CFP open now! Learn more. +
        +
        +
        +
        + + +
        + + + + + + + + +
        + +
        +
        + + +
        + +

        PyTorch Webinars

        @@ -43,4 +341,306 @@

        Register now to attend this event

        -
        \ No newline at end of file + + +
        +
        +
        +
        +

        Docs

        +

        Access comprehensive developer documentation for PyTorch

        + View Docs +
        + +
        +

        Tutorials

        +

        Get in-depth tutorials for beginners and advanced developers

        + View Tutorials +
        + +
        +

        Resources

        +

        Find development resources and get your questions answered

        + View Resources +
        +
        +
        +
        + +
        + +
        + +
        +
        +
        +
        + + +
        +
        +
        + + +
        + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog.html b/blog.html deleted file mode 100644 index e39a2a2a555c..000000000000 --- a/blog.html +++ /dev/null @@ -1,10 +0,0 @@ ---- -layout: blog -title: Blog -permalink: /blog/ -body-class: blog -redirect_from: "/blog/categories/" -pagination: - enabled: true - permalink: /:num/ ---- diff --git a/blog/10/index.html b/blog/10/index.html new file mode 100644 index 000000000000..0e7df860fedc --- /dev/null +++ b/blog/10/index.html @@ -0,0 +1,991 @@ + + + + + + + + + + + + + Blog | 10 of 34 | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
        +
        + Join us at PyTorch Conference in San Francisco, October 22-23. CFP open now! Learn more. +
        +
        +
        +
        + + +
        + + + + + + + + +
        + +
        +
        + + +
        + + + + + + +
        +
        + +

        Featured Post

        +

        + Speeding up ViTs using Block Sparsity +

        + + + + Read More + + + +
        +
        + +
        +
        +
        +
        + + + + + + + + +
        +
        +

        May 02, 2024

        +

        + A Hitchhiker’s Guide to Speculative Decoding +

        +

        Speculative decoding is an optimization technique for inference that makes educated guesses about future tokens while generating the current token, all within a single forward pass. It incorporates a verification mechanism to ensure the correctness of these speculated tokens, thereby guaranteeing that the overall output of speculative decoding is identical to that of vanilla decoding. Optimizing the cost of inference of large language models (LLMs) is arguably one of the most critical factor...

        + +
        + + Read More + +
        + + + + +
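To make the speculative decoding excerpt above concrete, here is a minimal greedy-decoding sketch of the idea: a cheap draft model proposes a few tokens, and the target model verifies them in a single forward pass. The TinyLM class, the speculative_step helper, and all sizes below are illustrative stand-ins, not the models or API from the post.

```python
import torch
import torch.nn as nn

VOCAB = 100

class TinyLM(nn.Module):
    """Toy stand-in for a causal LM: maps a token-id sequence to
    per-position next-token logits (illustrative only)."""
    def __init__(self, hidden=32):
        super().__init__()
        self.emb = nn.Embedding(VOCAB, hidden)
        self.head = nn.Linear(hidden, VOCAB)

    def forward(self, ids):                      # ids: (seq_len,)
        h = self.emb(ids).cumsum(dim=0)          # crude causal mixing
        return self.head(h)                      # (seq_len, VOCAB)

@torch.no_grad()
def speculative_step(target, draft, prefix, k=4):
    # 1) The cheap draft model proposes k tokens autoregressively (greedy).
    proposed = prefix.clone()
    for _ in range(k):
        nxt = draft(proposed)[-1].argmax()
        proposed = torch.cat([proposed, nxt.view(1)])

    # 2) The target model scores prefix + proposals in ONE forward pass.
    logits = target(proposed)

    # 3) Accept drafted tokens while they match the target's greedy choice;
    #    at the first mismatch, keep the target's token and stop. The output
    #    is identical to plain greedy decoding with the target model.
    accepted = prefix
    for i in range(len(prefix), len(proposed)):
        target_tok = logits[i - 1].argmax()
        accepted = torch.cat([accepted, target_tok.view(1)])
        if target_tok != proposed[i]:
            break
    return accepted

target, draft = TinyLM(hidden=64), TinyLM(hidden=16)
seq = torch.tensor([1, 2, 3])
for _ in range(5):
    seq = speculative_step(target, draft, seq)
print(seq.tolist())
```

Production implementations, including the sampling-based scheme the post discusses, accept or reject proposals probabilistically so that the sampled distribution is preserved, not just the greedy argmax shown here.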
        +
        +

        May 02, 2024

        +

        + Announcing PyTorch Docathon June, 2024 +

        +

        We are thrilled to announce the upcoming PyTorch Docathon in June! The Docathon, akin to a hackathon, is an event dedicated to enhancing the quality of the PyTorch documentation with the invaluable assistance of our community. Documentation is a vital component of any technology. By refining it, we can simplify the process for new users to get started with PyTorch, guide them in effectively utilizing its features, and ultimately expedite the transition from research to production in machine l...

        + +
        + + Read More + +
        + + + + +
        +
        +

        May 01, 2024

        +

        + Accelerating Llama3 FP8 Inference with Triton Kernels +

        +

        1.0 Summary + +

        + +
        + + Read More + +
        + + + + +
        +
        +

        April 30, 2024

        +

        + ExecuTorch Alpha: Taking LLMs and AI to the Edge with Our Community and Partners +

        +

        We are excited to announce the release of ExecuTorch alpha, focused on deploying large language models (LLMs) and large ML models to the edge, stabilizing the API surface, and improving our installation processes. It has been an exciting few months from our 0.1 (preview) release in collaboration with our partners at Arm, Apple, and Qualcomm Technologies, Inc. + +

        + +
        + + Read More + +
        + + + + +
        +
        +

        April 24, 2024

        +

        + PyTorch 2.3 Release Blog +

        +

        We are excited to announce the release of PyTorch® 2.3 (release note)! PyTorch 2.3 offers support for user-defined Triton kernels in torch.compile, allowing for users to migrate their own Triton kernels from eager without experiencing performance regressions or graph breaks. Tensor Parallelism improves the experience for training Large Language Models using native PyTorch functions, which has been validated on training runs for 100B parameter models. As well, semi-structured sparsity implemen...

        + +
        + + Read More + +
        + + + + +
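As a rough illustration of the user-defined Triton kernel support mentioned in the PyTorch 2.3 excerpt, the sketch below defines a simple element-wise add kernel and calls it from a torch.compile'd function. It assumes a CUDA GPU and the triton package; the kernel and function names are ours, not taken from the release notes.

```python
import torch
import triton
import triton.language as tl

@triton.jit
def add_kernel(x_ptr, y_ptr, out_ptr, n_elements, BLOCK: tl.constexpr):
    pid = tl.program_id(axis=0)
    offs = pid * BLOCK + tl.arange(0, BLOCK)
    mask = offs < n_elements
    x = tl.load(x_ptr + offs, mask=mask)
    y = tl.load(y_ptr + offs, mask=mask)
    tl.store(out_ptr + offs, x + y, mask=mask)

def triton_add(x, y):
    out = torch.empty_like(x)
    n = out.numel()
    grid = (triton.cdiv(n, 1024),)
    add_kernel[grid](x, y, out, n, BLOCK=1024)
    return out

@torch.compile
def fn(x, y):
    # With 2.3, torch.compile can trace through the call into the custom
    # Triton kernel instead of breaking the graph around it.
    return triton_add(x, y) * 2.0

x = torch.randn(4096, device="cuda")
y = torch.randn(4096, device="cuda")
print(torch.allclose(fn(x, y), (x + y) * 2.0))
```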
        +
        +

        April 16, 2024

        +

        + torchtune: Easily fine-tune LLMs using PyTorch +

        +

        We’re pleased to announce the alpha release of torchtune, a PyTorch-native library for easily fine-tuning large language models. + +

        + +
        + + Read More + +
        + + + + + + +
        + +
        + +
        + +
        +
        +
        + +
        +
        +
        +
        + +

        Install PyTorch

        + +

        Select your preferences and run the install command. Stable represents the most currently tested and supported version of PyTorch. This should + be suitable for many users. Preview is available if you want the latest, not fully tested and supported, builds that are generated nightly. + Please ensure that you have met the prerequisites below (e.g., numpy), depending on your package manager. You can also + install previous versions of PyTorch. Note that LibTorch is only available for C++. +

        + +

        NOTE: Latest PyTorch requires Python 3.9 or later.

        + +
        +
        +
        +
        PyTorch Build
        +
        +
        +
        Your OS
        +
        +
        +
        Package
        +
        +
        +
        Language
        +
        +
        +
        Compute Platform
        +
        +
        +
        Run this Command:
        +
        +
        + +
        +
        +
        +
        PyTorch Build
        +
        +
        +
        Stable (1.13.0)
        +
        +
        +
        Preview (Nightly)
        +
        +
        +
        +
        +
        Your OS
        +
        +
        +
        Linux
        +
        +
        +
        Mac
        +
        +
        +
        Windows
        +
        +
        +
        +
        +
        Package
        +
        +
        +
        Pip
        +
        +
        +
        LibTorch
        +
        +
        +
        Source
        +
        +
        +
        +
        +
        Language
        +
        +
        +
        Python
        +
        +
        +
        C++ / Java
        +
        +
        +
        +
        +
        Compute Platform
        +
        +
        +
        CUDA 11.8
        +
        +
        +
        CUDA 12.1
        +
        +
        +
        CUDA 12.4
        +
        +
        +
        ROCm 5.2
        +
        +
        +
        CPU
        +
        +
        +
        +
        +
        Run this Command:
        +
        +
        +
        pip install torch torchvision
        +
        +
        +
        +
        +
        + + + + Previous versions of PyTorch + +
        + +
        +

        Quick Start With
        Cloud Partners

        + +

        Get up and running with PyTorch quickly through popular cloud platforms and machine learning services.

        + +
        + + +
        +
        +
        + Google Cloud Platform +
        + + + + + +
        +
        + +
        +
        +
        +

        Microsoft Azure

        +
        + + +
        +
        + +
        +
        +
        + Lightning Studios +
        + +
        +
        +
        + +
        +
        +
        +
        + + + + +
        +
        +
        +
        +

        Docs

        +

        Access comprehensive developer documentation for PyTorch

        + View Docs +
        + +
        +

        Tutorials

        +

        Get in-depth tutorials for beginners and advanced developers

        + View Tutorials +
        + +
        +

        Resources

        +

        Find development resources and get your questions answered

        + View Resources +
        +
        +
        +
        + +
        + +
        + +
        +
        +
        +
        + + +
        +
        +
        + + +
        + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/11/index.html b/blog/11/index.html new file mode 100644 index 000000000000..2e1cac4b1293 --- /dev/null +++ b/blog/11/index.html @@ -0,0 +1,995 @@ + + + + + + + + + + + + + Blog | 11 of 34 | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
        +
        + Join us at PyTorch Conference in San Francisco, October 22-23. CFP open now! Learn more. +
        +
        +
        +
        + + +
        + + + + + + + + +
        + +
        +
        + + +
        + + + + + + +
        +
        + +

        Featured Post

        +

        + Maximizing training throughput using PyTorch FSDP +

        + + + + Read More + + + +
        +
        + +
        +
        +
        +
        + + + + + + + + +
        +
        +

        February 06, 2024

        +

        + PyTorch 2 paper and tutorial @ ASPLOS 2024 +

        +

        The PyTorch team is excited to share that our paper on PyTorch 2 has been accepted for presentation at the ACM International Conference on Architectural Support for Programming Languages and Operating Systems (ASPLOS), scheduled to take place from April 27 to May 1, 2024, in San Diego, CA, USA. + +

        + +
        + + Read More + +
        + + + + +
        +
        +

        February 01, 2024

        +

        + What's New in PyTorch Documentation +

        +

        Greetings to the PyTorch community! Here is a quick update on PyTorch docs. + +

        + +
        + + Read More + +
        + + + + +
        +
        +

        January 30, 2024

        +

        + PyTorch 2.2: FlashAttention-v2 integration, AOTInductor +

        +

        We are excited to announce the release of PyTorch® 2.2 (release note)! PyTorch 2.2 offers ~2x performance improvements to scaled_dot_product_attention via FlashAttention-v2 integration, as well as AOTInductor, a new ahead-of-time compilation and deployment tool built for non-python server-side deployments. + +

        + +
        + + Read More + +
        + + + + +
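For readers who have not used it, the scaled_dot_product_attention mentioned in the 2.2 excerpt is torch.nn.functional.scaled_dot_product_attention, which can dispatch to the FlashAttention-v2 kernel on supported GPUs. A minimal sketch, assuming an fp16-capable CUDA device (the shapes are illustrative):

```python
import torch
import torch.nn.functional as F

# (batch, heads, seq_len, head_dim), half precision on GPU so the flash
# backend is eligible.
q, k, v = (torch.randn(8, 16, 1024, 64, device="cuda", dtype=torch.float16)
           for _ in range(3))

# Optionally restrict dispatch to the flash-attention backend in this region.
with torch.backends.cuda.sdp_kernel(enable_flash=True,
                                    enable_math=False,
                                    enable_mem_efficient=False):
    out = F.scaled_dot_product_attention(q, k, v, is_causal=True)

print(out.shape)  # torch.Size([8, 16, 1024, 64])
```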
        +
        +

        January 30, 2024

        +

        + New Library Updates in PyTorch 2.2 +

        +

        Summary + +

        + +
        + + Read More + +
        + + + + +
        +
        +

        January 23, 2024

        +

        + Accelerating Generative AI with PyTorch IV: Seamless M4T, fast +

        +

        This post is the fourth part of a multi-series blog focused on how to accelerate generative AI models with pure, native PyTorch. To skip to the code, check out our github (seamless_communication, fairseq2). We are excited to share a breadth of newly released PyTorch performance features alongside practical examples to see how far we can push PyTorch native performance. In part one, we showed how to accelerate Segment Anything over 8x using only pure, native PyTorch. In part two, we showed how...

        + +
        + + Read More + +
        + + + + + + + + + +
        +
        +

        January 16, 2024

        +

        + Accelerating Triton Dequantization Kernels for GPTQ +

        +

        TL;DR + +

        + +
        + + Read More + +
        + +
        + +
        + +
        + +
        +
        +
        + +
        +
        +
        +
        + +

        Install PyTorch

        + +

        Select your preferences and run the install command. Stable represents the most currently tested and supported version of PyTorch. This should + be suitable for many users. Preview is available if you want the latest, not fully tested and supported, builds that are generated nightly. + Please ensure that you have met the prerequisites below (e.g., numpy), depending on your package manager. You can also + install previous versions of PyTorch. Note that LibTorch is only available for C++. +

        + +

        NOTE: Latest PyTorch requires Python 3.9 or later.

        + +
        +
        +
        +
        PyTorch Build
        +
        +
        +
        Your OS
        +
        +
        +
        Package
        +
        +
        +
        Language
        +
        +
        +
        Compute Platform
        +
        +
        +
        Run this Command:
        +
        +
        + +
        +
        +
        +
        PyTorch Build
        +
        +
        +
        Stable (1.13.0)
        +
        +
        +
        Preview (Nightly)
        +
        +
        +
        +
        +
        Your OS
        +
        +
        +
        Linux
        +
        +
        +
        Mac
        +
        +
        +
        Windows
        +
        +
        +
        +
        +
        Package
        +
        +
        +
        Pip
        +
        +
        +
        LibTorch
        +
        +
        +
        Source
        +
        +
        +
        +
        +
        Language
        +
        +
        +
        Python
        +
        +
        +
        C++ / Java
        +
        +
        +
        +
        +
        Compute Platform
        +
        +
        +
        CUDA 11.8
        +
        +
        +
        CUDA 12.1
        +
        +
        +
        CUDA 12.4
        +
        +
        +
        ROCm 5.2
        +
        +
        +
        CPU
        +
        +
        +
        +
        +
        Run this Command:
        +
        +
        +
        pip install torch torchvision
        +
        +
        +
        +
        +
        + + + + Previous versions of PyTorch + +
        + +
        +

        Quick Start With
        Cloud Partners

        + +

        Get up and running with PyTorch quickly through popular cloud platforms and machine learning services.

        + +
        + + +
        +
        +
        + Google Cloud Platform +
        + + + + + +
        +
        + +
        +
        +
        +

        Microsoft Azure

        +
        + + +
        +
        + +
        +
        +
        + Lightning Studios +
        + +
        +
        +
        + +
        +
        +
        +
        + + + + +
        +
        +
        +
        +

        Docs

        +

        Access comprehensive developer documentation for PyTorch

        + View Docs +
        + +
        +

        Tutorials

        +

        Get in-depth tutorials for beginners and advanced developers

        + View Tutorials +
        + +
        +

        Resources

        +

        Find development resources and get your questions answered

        + View Resources +
        +
        +
        +
        + +
        + +
        + +
        +
        +
        +
        + + +
        +
        +
        + + +
        + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/12/index.html b/blog/12/index.html new file mode 100644 index 000000000000..1f1cee3f0ee3 --- /dev/null +++ b/blog/12/index.html @@ -0,0 +1,993 @@ + + + + + + + + + + + + + Blog | 12 of 34 | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
        +
        + Join us at PyTorch Conference in San Francisco, October 22-23. CFP open now! Learn more. +
        +
        +
        +
        + + +
        + + + + + + + + +
        + +
        +
        + + +
        + + + + + + +
        +
        + +

        Featured Post

        +

        + Finetune LLMs on your own consumer hardware using tools from PyTorch and Hugging Face ecosystem +

        + + + + Read More + + + +
        +
        + +
        +
        +
        +
        + + + + + + + + +
        +
        +

        January 09, 2024

        +

        + Accelerate AI models on GPU using Amazon SageMaker multi-model endpoints with TorchServe, saving up to 75% on inference costs +

        +

        Multi-model endpoints (MMEs) are a powerful feature of Amazon SageMaker designed to simplify the deployment and operation of machine learning (ML) models. With MMEs, you can host multiple models on a single serving container and host all the models behind a single endpoint. The SageMaker platform automatically manages the loading and unloading of models and scales resources based on traffic patterns, reducing the operational burden of managing a large quantity of models. This feature is parti...

        + +
        + + Read More + +
        + + + + +
        +
        +

        January 03, 2024

        +

        + Accelerating Generative AI Part III: Diffusion, Fast +

        +

        This post is the third part of a multi-series blog focused on how to accelerate generative AI models with pure, native PyTorch. We are excited to share a breadth of newly released PyTorch performance features alongside practical examples to see how far we can push PyTorch native performance. In part one, we showed how to accelerate Segment Anything over 8x using only pure, native PyTorch. In part two, we showed how to accelerate Llama-7B by almost 10x using only native PyTorch optimizations. ...

        + +
        + + Read More + +
        + + + + +
        +
        +

        December 19, 2023

        +

        + Understanding GPU Memory 2: Finding and Removing Reference Cycles +

        +

        This is part 2 of the Understanding GPU Memory blog series. Our first post Understanding GPU Memory 1: Visualizing All Allocations over Time shows how to use the memory snapshot tool. In this part, we will use the Memory Snapshot to visualize a GPU memory leak caused by reference cycles, and then locate and remove them in our code using the Reference Cycle Detector. + +

        + +
        + + Read More + +
        + + + + +
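For orientation, this is roughly the Memory Snapshot workflow that blog series walks through, using the underscore-prefixed (still provisional) hooks in recent PyTorch builds; the resulting pickle can be dropped into the viewer at https://pytorch.org/memory_viz. Treat the exact argument names as a sketch rather than a stable API, and note that it assumes a CUDA device.

```python
import torch

# Start recording allocator events (with stack traces) for CUDA allocations.
torch.cuda.memory._record_memory_history(max_entries=100_000)

model = torch.nn.Linear(4096, 4096, device="cuda")
for _ in range(10):
    x = torch.randn(64, 4096, device="cuda")
    model(x).sum().backward()          # allocations/frees are captured here

# Write the snapshot to disk, then stop recording.
torch.cuda.memory._dump_snapshot("snapshot.pickle")
torch.cuda.memory._record_memory_history(enabled=None)
```

Part 2 of the series then builds on these snapshots to locate tensors that stay alive because of Python reference cycles and to remove those cycles from the code.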
        +
        +

        December 18, 2023

        +

        + Training Production AI Models with PyTorch 2.0 +

        +

        1. Introduction + +

        + +
        + + Read More + +
        + + + + + + + + + +
        +
        +

        December 14, 2023

        +

        + Understanding GPU Memory 1: Visualizing All Allocations over Time +

        +

        During your time with PyTorch on GPUs, you may be familiar with this common error message: + +

        + +
        + + Read More + +
        + + + + + + +
        + +
        + +
        + +
        +
        +
        + +
        +
        +
        +
        + +

        Install PyTorch

        + +

        Select your preferences and run the install command. Stable represents the most currently tested and supported version of PyTorch. This should + be suitable for many users. Preview is available if you want the latest, not fully tested and supported, builds that are generated nightly. + Please ensure that you have met the prerequisites below (e.g., numpy), depending on your package manager. You can also + install previous versions of PyTorch. Note that LibTorch is only available for C++. +

        + +

        NOTE: Latest PyTorch requires Python 3.9 or later.

        + +
        +
        +
        +
        PyTorch Build
        +
        +
        +
        Your OS
        +
        +
        +
        Package
        +
        +
        +
        Language
        +
        +
        +
        Compute Platform
        +
        +
        +
        Run this Command:
        +
        +
        + +
        +
        +
        +
        PyTorch Build
        +
        +
        +
        Stable (1.13.0)
        +
        +
        +
        Preview (Nightly)
        +
        +
        +
        +
        +
        Your OS
        +
        +
        +
        Linux
        +
        +
        +
        Mac
        +
        +
        +
        Windows
        +
        +
        +
        +
        +
        Package
        +
        +
        +
        Pip
        +
        +
        +
        LibTorch
        +
        +
        +
        Source
        +
        +
        +
        +
        +
        Language
        +
        +
        +
        Python
        +
        +
        +
        C++ / Java
        +
        +
        +
        +
        +
        Compute Platform
        +
        +
        +
        CUDA 11.8
        +
        +
        +
        CUDA 12.1
        +
        +
        +
        CUDA 12.4
        +
        +
        +
        ROCm 5.2
        +
        +
        +
        CPU
        +
        +
        +
        +
        +
        Run this Command:
        +
        +
        +
        pip install torch torchvision
        +
        +
        +
        +
        +
        + + + + Previous versions of PyTorch + +
        + +
        +

        Quick Start With
        Cloud Partners

        + +

        Get up and running with PyTorch quickly through popular cloud platforms and machine learning services.

        + +
        + + +
        +
        +
        + Google Cloud Platform +
        + + + + + +
        +
        + +
        +
        +
        +

        Microsoft Azure

        +
        + + +
        +
        + +
        +
        +
        + Lightning Studios +
        + +
        +
        +
        + +
        +
        +
        +
        + + + + +
        +
        +
        +
        +

        Docs

        +

        Access comprehensive developer documentation for PyTorch

        + View Docs +
        + +
        +

        Tutorials

        +

        Get in-depth tutorials for beginners and advanced developers

        + View Tutorials +
        + +
        +

        Resources

        +

        Find development resources and get your questions answered

        + View Resources +
        +
        +
        +
        + +
        + +
        + +
        +
        +
        +
        + + +
        +
        +
        + + +
        + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/13/index.html b/blog/13/index.html new file mode 100644 index 000000000000..781cf915730f --- /dev/null +++ b/blog/13/index.html @@ -0,0 +1,999 @@ + + + + + + + + + + + + + Blog | 13 of 34 | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
        +
        + Join us at PyTorch Conference in San Francisco, October 22-23. CFP open now! Learn more. +
        +
        +
        +
        + + +
        + + + + + + + + +
        + +
        +
        + + +
        + + + + + + + + +
        +
        +
        +
        + + + + + + + + +
        +
        +

        November 30, 2023

        +

        + Accelerating Generative AI with PyTorch II: GPT, Fast +

        +

        This post is the second part of a multi-series blog focused on how to accelerate generative AI models with pure, native PyTorch. We are excited to share a breadth of newly released PyTorch performance features alongside practical examples to see how far we can push PyTorch native performance. In part one, we showed how to accelerate Segment Anything over 8x using only pure, native PyTorch. In this blog we’ll focus on LLM optimization. + +

        + +
        + + Read More + +
        + + + + +
        +
        +

        November 29, 2023

        +

        + PyTorch 2.1 Contains New Performance Features for AI Developers +

        +

        We are excited to see the release of PyTorch 2.1. In this blog, we discuss the five features for which Intel made significant contributions to PyTorch 2.1: + +

        + +
        + + Read More + +
        + + + + +
        +
        +

        November 16, 2023

        +

        + 🎉 PyTorch Docathon H2 2023 Wrap-up 🎉 +

        +

        We are thrilled to announce the successful completion of the Fall 2023 PyTorch Docathon! The event was a resounding success, and we want to extend our heartfelt gratitude to all the participants who made it possible. Dedication, expertise, and tireless efforts of our open-source contributors have once again helped us to improve PyTorch documentation. + +

        + +
        + + Read More + +
        + + + + +
        +
        +

        November 16, 2023

        +

        + Accelerating Generative AI with PyTorch: Segment Anything, Fast +

        +

        This post is the first part of a multi-series blog focused on how to accelerate generative AI models with pure, native PyTorch. We are excited to share a breadth of newly released PyTorch performance features alongside practical examples of how these features can be combined to see how far we can push PyTorch native performance. + +

        + +
        + + Read More + +
        + + + + +
        +
        +

        November 07, 2023

        +

        + PyTorch compile to speed up inference on Llama 2 +

        +

In this blog, we discuss how to improve the inference latencies of the Llama 2 family of models using PyTorch native optimizations such as native fast kernels, compile transformations from torch.compile, and tensor parallel for distributed inference. Our approach results in 29ms/token latency for single user requests on the 70B LLaMa model (as measured on 8 A100 GPUs). We are excited to share our findings with the community and make our code available here. + +

        + +
        + + Read More + +
        + + + + +
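The full recipe in that post combines fused kernels, torch.compile, and tensor parallelism; the fragment below only illustrates the torch.compile piece on a stand-in module (not Llama 2), and assumes a CUDA device.

```python
import torch

# Stand-in transformer block; the post itself compiles a Llama 2 model.
block = torch.nn.TransformerEncoderLayer(d_model=512, nhead=8,
                                         batch_first=True).eval().cuda()
compiled = torch.compile(block, mode="reduce-overhead")  # CUDA-graph friendly

x = torch.randn(1, 128, 512, device="cuda")
with torch.inference_mode():
    for _ in range(3):        # first calls trigger compilation (warm-up)
        compiled(x)
    out = compiled(x)         # later calls reuse the optimized kernels
print(out.shape)              # torch.Size([1, 128, 512])
```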
        +
        +

        November 06, 2023

        +

        + High-Performance Llama 2 Training and Inference with PyTorch/XLA on Cloud TPUs +

        +

        In a landscape where AI innovation is accelerating at an unprecedented pace, Meta’s Llama family of open sourced large language models (LLMs) stands out as a notable breakthrough. Llama marked a significant step forward for LLMs, demonstrating the power of pre-trained architectures for a wide range of applications. Llama 2 further pushed the boundaries of scale and capabilities, inspiring advancements in language understanding, generation, and beyond. + +

        + +
        + + Read More + +
        + + + + +
        +
        +

        November 02, 2023

        +

        + Accelerating Inference on x86-64 Machines with oneDNN Graph +

        +

        Supported in PyTorch 2.0 as a beta feature, oneDNN Graph leverages aggressive fusion patterns to accelerate inference on x86-64 machines, especially Intel® Xeon® Scalable processors. + +

        + +
        + + Read More + +
        + +
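As a hedged sketch of the TorchScript path that the oneDNN Graph excerpt refers to: fusion can be switched on before tracing and freezing a float32 model for CPU inference. Whether it helps depends on the CPU and the model, so treat this as an experiment rather than a recipe; the toy Conv-ReLU stack is ours.

```python
import torch

torch.jit.enable_onednn_fusion(True)            # opt in to oneDNN Graph fusion

# Small Conv2d + ReLU stack: a classic fusable pattern (illustrative model).
model = torch.nn.Sequential(
    torch.nn.Conv2d(3, 16, 3, padding=1), torch.nn.ReLU(),
    torch.nn.Conv2d(16, 16, 3, padding=1), torch.nn.ReLU(),
).eval()
example = torch.randn(1, 3, 224, 224)

with torch.no_grad():
    traced = torch.jit.freeze(torch.jit.trace(model, example))
    for _ in range(3):                          # warm-up runs trigger fusion
        traced(example)
    out = traced(example)
print(out.shape)
```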
        + +
        + +
        + +
        +
        +
        + +
        +
        +
        +
        + +

        Install PyTorch

        + +

        Select your preferences and run the install command. Stable represents the most currently tested and supported version of PyTorch. This should + be suitable for many users. Preview is available if you want the latest, not fully tested and supported, builds that are generated nightly. + Please ensure that you have met the prerequisites below (e.g., numpy), depending on your package manager. You can also + install previous versions of PyTorch. Note that LibTorch is only available for C++. +

        + +

        NOTE: Latest PyTorch requires Python 3.9 or later.

        + +
        +
        +
        +
        PyTorch Build
        +
        +
        +
        Your OS
        +
        +
        +
        Package
        +
        +
        +
        Language
        +
        +
        +
        Compute Platform
        +
        +
        +
        Run this Command:
        +
        +
        + +
        +
        +
        +
        PyTorch Build
        +
        +
        +
        Stable (1.13.0)
        +
        +
        +
        Preview (Nightly)
        +
        +
        +
        +
        +
        Your OS
        +
        +
        +
        Linux
        +
        +
        +
        Mac
        +
        +
        +
        Windows
        +
        +
        +
        +
        +
        Package
        +
        +
        +
        Pip
        +
        +
        +
        LibTorch
        +
        +
        +
        Source
        +
        +
        +
        +
        +
        Language
        +
        +
        +
        Python
        +
        +
        +
        C++ / Java
        +
        +
        +
        +
        +
        Compute Platform
        +
        +
        +
        CUDA 11.8
        +
        +
        +
        CUDA 12.1
        +
        +
        +
        CUDA 12.4
        +
        +
        +
        ROCm 5.2
        +
        +
        +
        CPU
        +
        +
        +
        +
        +
        Run this Command:
        +
        +
        +
        pip install torch torchvision
        +
        +
        +
        +
        +
        + + + + Previous versions of PyTorch + +
        + +
        +

        Quick Start With
        Cloud Partners

        + +

        Get up and running with PyTorch quickly through popular cloud platforms and machine learning services.

        + +
        + + +
        +
        +
        + Google Cloud Platform +
        + + + + + +
        +
        + +
        +
        +
        +

        Microsoft Azure

        +
        + + +
        +
        + +
        +
        +
        + Lightning Studios +
        + +
        +
        +
        + +
        +
        +
        +
        + + + + +
        +
        +
        +
        +

        Docs

        +

        Access comprehensive developer documentation for PyTorch

        + View Docs +
        + +
        +

        Tutorials

        +

        Get in-depth tutorials for beginners and advanced developers

        + View Tutorials +
        + +
        +

        Resources

        +

        Find development resources and get your questions answered

        + View Resources +
        +
        +
        +
        + +
        + +
        + +
        +
        +
        +
        + + +
        +
        +
        + + +
        + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/14/index.html b/blog/14/index.html new file mode 100644 index 000000000000..91d330af4bf9 --- /dev/null +++ b/blog/14/index.html @@ -0,0 +1,1001 @@ + + + + + + + + + + + + + Blog | 14 of 34 | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
        +
        + Join us at PyTorch Conference in San Francisco, October 22-23. CFP open now! Learn more. +
        +
        +
        +
        + + +
        + + + + + + + + +
        + +
        +
        + + +
        + + + + + + +
        +
        + +

        Featured Post

        +

        + AMD Extends Support for PyTorch Machine Learning Development on Select RDNA™ 3 GPUs with ROCm™ 5.7 +

        + + + + Read More + + + +
        +
        + +
        +
        +
        +
        + + + + + + + + +
        +
        +

        October 17, 2023

        +

        + PyTorch Edge: Enabling On-Device Inference Across Mobile and Edge Devices with ExecuTorch +

        +

        We are excited to announce ExecuTorch, our all-new solution for enabling on-device inference capabilities across mobile and edge devices with the backing of industry leaders like Arm, Apple, and Qualcomm Innovation Center. + +

        + +
        + + Read More + +
        + + + + +
        +
        +

        October 17, 2023

        +

        + Lightning AI Joins the PyTorch Foundation as a Premier Member +

        +

        The PyTorch Foundation, a neutral home for the deep learning community to collaborate on the open source PyTorch framework and ecosystem, is announcing today that Lightning AI has joined as a premier member. + +

        + +
        + + Read More + +
        + + + + +
        +
        +

        October 17, 2023

        +

        + Huawei Joins the PyTorch Foundation as a Premier Member +

        +

        Today, the PyTorch Foundation, a neutral home for the deep learning community to collaborate on the open source PyTorch framework and ecosystem, announced that Huawei has joined as a premier member. + +

        + +
        + + Read More + +
        + + + + +
        +
        +

        October 17, 2023

        +

        + Compiling NumPy code into C++ or CUDA via torch.compile +

        +

        Quansight engineers have implemented support for tracing through NumPy code via +torch.compile in PyTorch 2.1. This feature leverages PyTorch’s compiler to +generate efficient fused vectorized code without having to modify your original +NumPy code. Even more, it also allows for executing NumPy code on CUDA +just by running it through torch.compile under torch.device("cuda")! + +

        + +
        + + Read More + +
        + + + + +
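A small, hedged example of what the excerpt describes: an ordinary NumPy function traced by torch.compile (PyTorch 2.1+), and the same function executed on GPU by entering a torch.device("cuda") context, which requires a CUDA build. The function name and shapes are ours.

```python
import numpy as np
import torch

def pairwise_sq_dist(x, y):
    # Ordinary NumPy code, no PyTorch calls.
    return ((x[:, None, :] - y[None, :, :]) ** 2).sum(axis=-1)

compiled = torch.compile(pairwise_sq_dist)

x = np.random.randn(256, 64).astype(np.float32)
y = np.random.randn(128, 64).astype(np.float32)

out_cpu = compiled(x, y)                      # fused, vectorized CPU code
print(np.allclose(out_cpu, pairwise_sq_dist(x, y), atol=1e-4))

with torch.device("cuda"):                    # run the same NumPy code on GPU
    out_gpu = compiled(x, y)
```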
        +
        +

        October 13, 2023

        +

        + Flash-Decoding for long-context inference +

        +

        Motivation + +

        + +
        + + Read More + +
        + + + + +
        +
        +

        October 11, 2023

        +

        + ML Model Server Resource Saving - Transition From High-Cost GPUs to Intel CPUs and oneAPI powered Software with performance +

        +

Reviewers: Yunsang Ju (Naver GplaceAI Leader), Min Jean Cho (Intel), Jing Xu (Intel), Mark Saroufim (Meta) + +

        + +
        + + Read More + +
        + + + + +
        +
        +

        October 10, 2023

        +

        + Real-time Audio-visual Speech Recognition +

        +

        Audio-Visual Speech Recognition (AV-ASR, or AVSR) is the task of transcribing text from audio and visual streams, which has recently attracted a lot of research attention due to its robustness to noise. The vast majority of work to date has focused on developing AV-ASR models for non-streaming recognition; studies on streaming AV-ASR are very limited. + +

        + +
        + + Read More + +
        + +
        + +
        + +
        + +
        +
        +
        + +
        +
        +
        +
        + +

        Install PyTorch

        + +

        Select your preferences and run the install command. Stable represents the most currently tested and supported version of PyTorch. This should + be suitable for many users. Preview is available if you want the latest, not fully tested and supported, builds that are generated nightly. + Please ensure that you have met the prerequisites below (e.g., numpy), depending on your package manager. You can also + install previous versions of PyTorch. Note that LibTorch is only available for C++. +

        + +

        NOTE: Latest PyTorch requires Python 3.9 or later.

        + +
        +
        +
        +
        PyTorch Build
        +
        +
        +
        Your OS
        +
        +
        +
        Package
        +
        +
        +
        Language
        +
        +
        +
        Compute Platform
        +
        +
        +
        Run this Command:
        +
        +
        + +
        +
        +
        +
        PyTorch Build
        +
        +
        +
        Stable (1.13.0)
        +
        +
        +
        Preview (Nightly)
        +
        +
        +
        +
        +
        Your OS
        +
        +
        +
        Linux
        +
        +
        +
        Mac
        +
        +
        +
        Windows
        +
        +
        +
        +
        +
        Package
        +
        +
        +
        Pip
        +
        +
        +
        LibTorch
        +
        +
        +
        Source
        +
        +
        +
        +
        +
        Language
        +
        +
        +
        Python
        +
        +
        +
        C++ / Java
        +
        +
        +
        +
        +
        Compute Platform
        +
        +
        +
        CUDA 11.8
        +
        +
        +
        CUDA 12.1
        +
        +
        +
        CUDA 12.4
        +
        +
        +
        ROCm 5.2
        +
        +
        +
        CPU
        +
        +
        +
        +
        +
        Run this Command:
        +
        +
        +
        pip install torch torchvision
        +
        +
        +
        +
        +
        + + + + Previous versions of PyTorch + +
        + +
        +

        Quick Start With
        Cloud Partners

        + +

        Get up and running with PyTorch quickly through popular cloud platforms and machine learning services.

        + +
        + + +
        +
        +
        + Google Cloud Platform +
        + + + + + +
        +
        + +
        +
        +
        +

        Microsoft Azure

        +
        + + +
        +
        + +
        +
        +
        + Lightning Studios +
        + +
        +
        +
        + +
        +
        +
        +
        + + + + +
        +
        +
        +
        +

        Docs

        +

        Access comprehensive developer documentation for PyTorch

        + View Docs +
        + +
        +

        Tutorials

        +

        Get in-depth tutorials for beginners and advanced developers

        + View Tutorials +
        + +
        +

        Resources

        +

        Find development resources and get your questions answered

        + View Resources +
        +
        +
        +
        + +
        + +
        + +
        +
        +
        +
        + + +
        +
        +
        + + +
        + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/15/index.html b/blog/15/index.html new file mode 100644 index 000000000000..4ab53dfea986 --- /dev/null +++ b/blog/15/index.html @@ -0,0 +1,995 @@ + + + + + + + + + + + + + Blog | 15 of 34 | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
        +
        + Join us at PyTorch Conference in San Francisco, October 22-23. CFP open now! Learn more. +
        +
        +
        +
        + + +
        + + + + + + + + +
        + +
        +
        + + +
        + + + + + + +
        +
        + +

        Featured Post

        +

        + PyTorch 2.1: automatic dynamic shape compilation, distributed checkpointing +

        + + + + Read More + + + +
        +
        + +
        +
        +
        +
        + + + + + + + + +
        +
        +

        October 04, 2023

        +

        + New Library Updates in PyTorch 2.1 +

        +

        Summary + +

        + +
        + + Read More + +
        + + + + +
        +
        +

        October 04, 2023

        +

        + High performance Llama 2 deployments with AWS Inferentia2 using TorchServe +

        +

        Recently, Llama 2 was released and has attracted a lot of interest from the machine learning community. Amazon EC2 Inf2 instances, powered by AWS Inferentia2, now support training and inference of Llama 2 models. In this post, we show low-latency and cost-effective inference of Llama-2 models on Amazon EC2 Inf2 instances using the latest AWS Neuron SDK release.  We first introduce how to create, compile and deploy the Llama-2 model and explain the optimization techniques introduced by AWS Neu...

        + +
        + + Read More + +
        + + + + +
        +
        +

        October 03, 2023

        +

        + How to Build an Interactive Chat-Generation Model using DialoGPT and PyTorch +

        +

        The focus on interactive chat-generation (or conversational response-generation) models has greatly increased in the past several months. Conversational response-generation models such as ChatGPT and Google Bard have taken the AI world by storm. The purpose of interactive chat generation is to answer various questions posed by humans, and these AI based models use natural language processing (NLP) to generate conversations almost indistinguishable from those generated by humans. + +

        + +
        + + Read More + +
        + + + + +
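To give a flavour of the interactive loop described above, here is a hedged sketch using the Hugging Face transformers checkpoint microsoft/DialoGPT-medium; the exact prompt handling and generation settings in the post may differ.

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

tok = AutoTokenizer.from_pretrained("microsoft/DialoGPT-medium")
model = AutoModelForCausalLM.from_pretrained("microsoft/DialoGPT-medium").eval()

history = None
for user_text in ["Hello, how are you?", "What are you doing today?"]:
    # Each user turn is terminated with the EOS token, as DialoGPT expects.
    new_ids = tok.encode(user_text + tok.eos_token, return_tensors="pt")
    input_ids = new_ids if history is None else torch.cat([history, new_ids], dim=-1)
    history = model.generate(input_ids,
                             max_new_tokens=50,
                             pad_token_id=tok.eos_token_id)  # no pad token by default
    reply = tok.decode(history[0, input_ids.shape[-1]:], skip_special_tokens=True)
    print("Bot:", reply)
```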
        +
        +

        October 02, 2023

        +

        + Announcing PyTorch Docathon H2 2023 +

        +

        We are excited to announce that we will be holding a Docathon for PyTorch on November 1, 2023! This event is an opportunity for our community to come together and improve the quality of our documentation. + +

        + +
        + + Read More + +
        + + + + +
        +
        +

        September 25, 2023

        +

        + Inside the Matrix: Visualizing Matrix Multiplication, Attention and Beyond +

        +

        Use 3D to visualize matrix multiplication expressions, attention heads with real weights, and more. + +

        + +
        + + Read More + +
        + + + + +
        +
        +

        September 13, 2023

        +

        + Accelerated CPU Inference with PyTorch Inductor using torch.compile +

        +

        Story at a Glance + +

        + +
        + + Read More + +
        + + + + +
        +
        +

        September 12, 2023

        +

        + One Year of PyTorch Foundation +

        +

        It’s been one year since we announced the formation of the PyTorch Foundation! 🎉 + +

        + +
        + + Read More + +
        + +
        + +
        + +
        + +
        +
        +
        + +
        +
        +
        +
        + +

        Install PyTorch

        + +

        Select your preferences and run the install command. Stable represents the most currently tested and supported version of PyTorch. This should + be suitable for many users. Preview is available if you want the latest, not fully tested and supported, builds that are generated nightly. + Please ensure that you have met the prerequisites below (e.g., numpy), depending on your package manager. You can also + install previous versions of PyTorch. Note that LibTorch is only available for C++. +

        + +

        NOTE: Latest PyTorch requires Python 3.9 or later.

        + +
        +
        +
        +
        PyTorch Build
        +
        +
        +
        Your OS
        +
        +
        +
        Package
        +
        +
        +
        Language
        +
        +
        +
        Compute Platform
        +
        +
        +
        Run this Command:
        +
        +
        + +
        +
        +
        +
        PyTorch Build
        +
        +
        +
        Stable (1.13.0)
        +
        +
        +
        Preview (Nightly)
        +
        +
        +
        +
        +
        Your OS
        +
        +
        +
        Linux
        +
        +
        +
        Mac
        +
        +
        +
        Windows
        +
        +
        +
        +
        +
        Package
        +
        +
        +
        Pip
        +
        +
        +
        LibTorch
        +
        +
        +
        Source
        +
        +
        +
        +
        +
        Language
        +
        +
        +
        Python
        +
        +
        +
        C++ / Java
        +
        +
        +
        +
        +
        Compute Platform
        +
        +
        +
        CUDA 11.8
        +
        +
        +
        CUDA 12.1
        +
        +
        +
        CUDA 12.4
        +
        +
        +
        ROCm 5.2
        +
        +
        +
        CPU
        +
        +
        +
        +
        +
        Run this Command:
        +
        +
        +
        pip install torch torchvision
        +
        +
        +
        +
        +
        + + + + Previous versions of PyTorch + +
        + +
        +

        Quick Start With
        Cloud Partners

        + +

        Get up and running with PyTorch quickly through popular cloud platforms and machine learning services.

        + +
        + + +
        +
        +
        + Google Cloud Platform +
        + + + + + +
        +
        + +
        +
        +
        +

        Microsoft Azure

        +
        + + +
        +
        + +
        +
        +
        + Lightning Studios +
        + +
        +
        +
        + +
        +
        +
        +
        + + + + +
        +
        +
        +
        +

        Docs

        +

        Access comprehensive developer documentation for PyTorch

        + View Docs +
        + +
        +

        Tutorials

        +

        Get in-depth tutorials for beginners and advanced developers

        + View Tutorials +
        + +
        +

        Resources

        +

        Find development resources and get your questions answered

        + View Resources +
        +
        +
        +
        + +
        + +
        + +
        +
        +
        +
        + + +
        +
        +
        + + +
        + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/16/index.html b/blog/16/index.html new file mode 100644 index 000000000000..fadfe3d07151 --- /dev/null +++ b/blog/16/index.html @@ -0,0 +1,997 @@ + + + + + + + + + + + + + Blog | 16 of 34 | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
        +
        + Join us at PyTorch Conference in San Francisco, October 22-23. CFP open now! Learn more. +
        +
        +
        +
        + + +
        + + + + + + + + +
        + +
        +
        + + +
        + + + + + + + + +
        +
        +
        +
        + + + + + + + + +
        +
        +

        September 05, 2023

        +

        + Automated trace collection and analysis +

        +

        In this blog, we share how we enabled the collection and analysis of PyTorch Profiler traces for training workloads without any user side code instrumentation. We leveraged Dynolog - an open source daemon for CPU and GPU telemetry to collect PyTorch Profiler traces, and analyzed the collected traces using Holistic Trace Analysis - an open source library for analyzing PyTorch Profiler traces. This toolchain has allowed engineers at Meta to accelerate their performance optimization workflows. T...

        + +
        + + Read More + +
        + + + + +
        +
        +

        August 31, 2023

        +

        + PyTorch/XLA SPMD: Scale Up Model Training and Serving with Automatic Parallelization +

        +

        Today, we are delighted to announce PyTorch/XLA SPMD: the integration of GSPMD into PyTorch with an easy to use API. PyTorch developers seeking superior performance and scale can train and serve the largest neural networks while maximizing utilization of AI accelerators, such as Google Cloud TPUs. + +

        + +
        + + Read More + +
        + + + + +
        +
        +

        August 24, 2023

        +

        + Large Scale Training of Hugging Face Transformers on TPUs With PyTorch/XLA FSDP +

        +

        AI is transforming many industries through advanced capabilities such as understanding and generating language, answering questions, and delivering accurate recommendations. These capabilities are fueled by ever-increasing size and complexity of AI models, which require vast amounts of computing power to train. + +

        + +
        + + Read More + +
        + + + + + + + + + +
        +
        +

        August 07, 2023

        +

        + INT8 Quantization for x86 CPU in PyTorch +

        +

        Overview + +

        + +
        + + Read More + +
        + + + + + + + + + +
        +
        +

        August 01, 2023

        +

        + AMD's Journey to Openness and Performance +

        +

AMD has made progress in building a robust software stack that supports an open ecosystem of models, libraries, frameworks, and tools. With proven platforms gaining momentum, a leading software stack and an optimized ecosystem are key to achieving application performance. PyTorch is a key part of AMD’s AI journey, and Victor Peng, AMD President, and Soumith Chintala, founder of PyTorch, discussed the latest progress at the DC & AI Keynote on June 12. + +

        + +
        + + Read More + +
        + +
        + +
        + +
        + +
        +
        +
        + +
        +
        +
        +
        + +

        Install PyTorch

        + +

        Select your preferences and run the install command. Stable represents the most currently tested and supported version of PyTorch. This should + be suitable for many users. Preview is available if you want the latest, not fully tested and supported, builds that are generated nightly. + Please ensure that you have met the prerequisites below (e.g., numpy), depending on your package manager. You can also + install previous versions of PyTorch. Note that LibTorch is only available for C++. +

        + +

        NOTE: Latest PyTorch requires Python 3.9 or later.

        + +
        +
        +
        +
        PyTorch Build
        +
        +
        +
        Your OS
        +
        +
        +
        Package
        +
        +
        +
        Language
        +
        +
        +
        Compute Platform
        +
        +
        +
        Run this Command:
        +
        +
        + +
        +
        +
        +
        PyTorch Build
        +
        +
        +
        Stable (1.13.0)
        +
        +
        +
        Preview (Nightly)
        +
        +
        +
        +
        +
        Your OS
        +
        +
        +
        Linux
        +
        +
        +
        Mac
        +
        +
        +
        Windows
        +
        +
        +
        +
        +
        Package
        +
        +
        +
        Pip
        +
        +
        +
        LibTorch
        +
        +
        +
        Source
        +
        +
        +
        +
        +
        Language
        +
        +
        +
        Python
        +
        +
        +
        C++ / Java
        +
        +
        +
        +
        +
        Compute Platform
        +
        +
        +
        CUDA 11.8
        +
        +
        +
        CUDA 12.1
        +
        +
        +
        CUDA 12.4
        +
        +
        +
        ROCm 5.2
        +
        +
        +
        CPU
        +
        +
        +
        +
        +
        Run this Command:
        +
        +
        +
        pip install torch torchvision
        +
        +
        +
        +
        +
        + + + + Previous versions of PyTorch + +
        + +
        +

        Quick Start With
        Cloud Partners

        + +

        Get up and running with PyTorch quickly through popular cloud platforms and machine learning services.

        + +
        + + +
        +
        +
        + Google Cloud Platform +
        + + + + + +
        +
        + +
        +
        +
        +

        Microsoft Azure

        +
        + + +
        +
        + +
        +
        +
        + Lightning Studios +
        + +
        +
        +
        + +
        +
        +
        +
        + + + + +
        +
        +
        +
        +

        Docs

        +

        Access comprehensive developer documentation for PyTorch

        + View Docs +
        + +
        +

        Tutorials

        +

        Get in-depth tutorials for beginners and advanced developers

        + View Tutorials +
        + +
        +

        Resources

        +

        Find development resources and get your questions answered

        + View Resources +
        +
        +
        +
        + +
        + +
        + +
        +
        +
        +
        + + +
        +
        +
        + + +
        + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/17/index.html b/blog/17/index.html new file mode 100644 index 000000000000..2f78d308fdd6 --- /dev/null +++ b/blog/17/index.html @@ -0,0 +1,999 @@ + + + + + + + + + + + + + Blog | 17 of 34 | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
        +
        + Join us at PyTorch Conference in San Francisco, October 22-23. CFP open now! Learn more. +
        +
        +
        +
        + + +
        + + + + + + + + +
        + +
        +
        + + +
        + + + + + + + + +
        +
        +
        +
        + + + + + + + + +
        +
        +

        July 27, 2023

        +

        + IBM Joins the PyTorch Foundation as a Premier Member +

        +

        The PyTorch Foundation, part of The Linux Foundation, is pleased to announce that IBM has joined as a premier member. + +

        + +
        + + Read More + +
        + + + + +
        +
        +

        July 25, 2023

        +

        + Announcing CPP-based S3 IO DataPipes +

        +

        Training large deep learning models requires large datasets. Amazon Simple Storage Service (Amazon S3) is a scalable cloud object store service used for storing large training datasets. Machine learning (ML) practitioners need an efficient data pipe that can download data from Amazon S3, transform the data, and feed the data to GPUs for training models with high throughput and low latency. + +In this post, we introduce the new S3 IO DataPipes for PyTorch, S3FileLister and S3FileLoader. For memo...
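A rough usage sketch of these DataPipes, assuming the torchdata package with its S3 extension installed and a placeholder bucket, could look like this:

    from torchdata.datapipes.iter import IterableWrapper, S3FileLister, S3FileLoader

    # Placeholder S3 prefix; credentials and region come from the usual AWS configuration.
    prefixes = IterableWrapper(["s3://my-bucket/train-data/"])
    urls = S3FileLister(prefixes)       # expands the prefix into individual object URLs
    files = S3FileLoader(urls)          # streams each object's bytes

    for url, stream in files:
        payload = stream.read()
        print(url, len(payload))
        break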

        + +
        + + Read More + +
        + + + + +
        +
        +

        July 10, 2023

        +

        + How to Accelerate PyTorch Geometric on Intel® CPUs +

        +

        Overview + +

        + +
        + + Read More + +
        + + + + + + + + + +
        +
        +

        June 28, 2023

        +

        + The Path to Achieve Ultra-Low Inference Latency With LLaMA 65B on PyTorch/XLA +

        +

        Background & State of the Art + +

        + +
        + + Read More + +
        + + + + +
        +
        +

        June 22, 2023

        +

        + Optimized PyTorch 2.0 Inference with AWS Graviton processors +

        +

        New generations of CPUs offer significant performance improvement in machine learning (ML) inference due to specialized built-in instructions. Combined with their flexibility, high speed of development, and low operating cost, these general-purpose processors offer an alternative ML inference solution to other existing hardware solutions. + +

        + +
        + + Read More + +
        + + + + +
        +
        +

        June 16, 2023

        +

        + 🎉 PyTorch Docathon H1 2023 Wrap-up 🎉 +

        +

        Thank you to all who participated in our first ever PyTorch Docathon, the results have been nothing short of amazing! We want to extend our sincerest gratitude to all the participants who made this event a resounding success. Your passion, talent, and hard work have left an indelible mark on the PyTorch documentation. + +

        + +
        + + Read More + +
        + +
        + +
        + +
        + +
        +
        +
        + +
        +
        +
        +
        + +

        Install PyTorch

        + +

        Select your preferences and run the install command. Stable represents the most currently tested and supported version of PyTorch. This should + be suitable for many users. Preview is available if you want the latest, not fully tested and supported, builds that are generated nightly. + Please ensure that you have met the prerequisites below (e.g., numpy), depending on your package manager. You can also + install previous versions of PyTorch. Note that LibTorch is only available for C++. +

        + +

        NOTE: Latest PyTorch requires Python 3.9 or later.

        + +
        +
        +
        +
        PyTorch Build
        +
        +
        +
        Your OS
        +
        +
        +
        Package
        +
        +
        +
        Language
        +
        +
        +
        Compute Platform
        +
        +
        +
        Run this Command:
        +
        +
        + +
        +
        +
        +
        PyTorch Build
        +
        +
        +
        Stable (1.13.0)
        +
        +
        +
        Preview (Nightly)
        +
        +
        +
        +
        +
        Your OS
        +
        +
        +
        Linux
        +
        +
        +
        Mac
        +
        +
        +
        Windows
        +
        +
        +
        +
        +
        Package
        +
        +
        +
        Pip
        +
        +
        +
        LibTorch
        +
        +
        +
        Source
        +
        +
        +
        +
        +
        Language
        +
        +
        +
        Python
        +
        +
        +
        C++ / Java
        +
        +
        +
        +
        +
        Compute Platform
        +
        +
        +
        CUDA 11.8
        +
        +
        +
        CUDA 12.1
        +
        +
        +
        CUDA 12.4
        +
        +
        +
        ROCm 5.2
        +
        +
        +
        CPU
        +
        +
        +
        +
        +
        Run this Command:
        +
        +
        +
        pip install torch torchvision
        +
        +
        +
        +
        +
        + + + + Previous versions of PyTorch + +
        + +
        +

        Quick Start With
        Cloud Partners

        + +

        Get up and running with PyTorch quickly through popular cloud platforms and machine learning services.

        + +
        + + +
        +
        +
        + Google Cloud Platform +
        + + + + + +
        +
        + +
        +
        +
        +

        Microsoft Azure

        +
        + + +
        +
        + +
        +
        +
        + Lightning Studios +
        + +
        +
        +
        + +
        +
        +
        +
        + + + + +
        +
        +
        +
        +

        Docs

        +

        Access comprehensive developer documentation for PyTorch

        + View Docs +
        + +
        +

        Tutorials

        +

        Get in-depth tutorials for beginners and advanced developers

        + View Tutorials +
        + +
        +

        Resources

        +

        Find development resources and get your questions answered

        + View Resources +
        +
        +
        +
        + +
        + +
        + +
        +
        +
        +
        + + +
        +
        +
        + + +
        + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/18/index.html b/blog/18/index.html new file mode 100644 index 000000000000..f1dba17fb881 --- /dev/null +++ b/blog/18/index.html @@ -0,0 +1,991 @@ + + + + + + + + + + + + + Blog | 18 of 34 | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
        +
        + Join us at PyTorch Conference in San Francisco, October 22-23. CFP open now! Learn more. +
        +
        +
        +
        + + +
        + + + + + + + + +
        + +
        +
        + + +
        + + + + + + +
        +
        + +

        Featured Post

        +

        + Join the PyTorch Foundation: Membership Now Open +

        + + + + Read More + + + +
        +
        + +
        +
        +
        +
        + + + + + + + + +
        +
        +

        May 22, 2023

        +

        + Out of the box acceleration and memory savings of 🤗 decoder models with PyTorch 2.0 +

        +

        As part of PyTorch 2.0 release, an accelerated implementation of the attention mechanism as part of the “Better Transformer” project (and known in PyTorch as Accelerated Transformers) has been added natively into PyTorch as torch.nn.functional.scaled_dot_product_attention. This implementation leverages fused kernels from FlashAttention and Memory-efficient attention, and supports both training and inference. + +
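A minimal, self-contained sketch of calling the new operator directly (shapes chosen arbitrarily for illustration):

    import torch
    import torch.nn.functional as F

    # (batch, num_heads, sequence_length, head_dim)
    q = torch.randn(2, 8, 128, 64)
    k = torch.randn(2, 8, 128, 64)
    v = torch.randn(2, 8, 128, 64)

    out = F.scaled_dot_product_attention(q, k, v, is_causal=True)
    print(out.shape)  # torch.Size([2, 8, 128, 64])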

        + +
        + + Read More + +
        + + + + + + + + + +
        +
        +

        May 12, 2023

        +

        + Language Identification: Building an End-to-End AI Solution using PyTorch +

        +

        Language Identification is the process of identifying the primary language from multiple audio input samples. In natural language processing (NLP), language identification is an important problem and a challenging issue. There are many language-related tasks such as entering text on your phone, finding news articles you enjoy, or discovering answers to questions that you may have. All these tasks are powered by NLP models. To decide which model to invoke at a particular point in time, we must...

        + +
        + + Read More + +
        + + + + +
        +
        +

        May 03, 2023

        +

        + Announcing PyTorch Docathon 2023 +

        +

        + +

        + +
        + + Read More + +
        + + + + +
        +
        +

        May 02, 2023

        +

        + Accelerated Image Segmentation using PyTorch +

        +

        Using Intel® Extension for PyTorch to Boost Image Processing Performance + +

        + +
        + + Read More + +
        + + + + +
        +
        +

        April 27, 2023

        +

        + Introducing Hidet: A Deep Learning Compiler for Efficient Model Serving +

        +

        Hidet is a powerful deep learning compiler that simplifies the process of implementing high-performing deep learning operators on modern accelerators (e.g., NVIDIA GPUs). With the new feature of torch.compile(...) in PyTorch 2.0, integrating a novel compiler into PyTorch is easier than ever - Hidet now can be used as a torch.compile(...) backend to accelerate PyTorch models, making it an attractive option for PyTorch users who want to improve the inference performance of their models, especia...
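As the post describes, Hidet plugs in as a torch.compile backend; a minimal sketch (assuming the hidet package is installed and a CUDA GPU is available) might be:

    import torch

    model = torch.nn.Sequential(
        torch.nn.Linear(128, 256), torch.nn.ReLU(), torch.nn.Linear(256, 128)
    ).cuda().eval()

    # 'hidet' becomes available as a torch.compile backend once the hidet package is installed.
    compiled = torch.compile(model, backend="hidet")

    x = torch.randn(8, 128, device="cuda")
    with torch.no_grad():
        y = compiled(x)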

        + +
        + + Read More + +
        + + + + +
        +
        +

        April 19, 2023

        +

        + Accelerating Large Language Models with Accelerated Transformers +

        +

        TL;DR. We show how to use Accelerated PyTorch 2.0 Transformers and the newly introduced torch.compile() method to accelerate Large Language Models on the example of nanoGPT, a compact open-source implementation of the GPT model from Andrej Karpathy. Using the new scaled dot product attention operator introduced with Accelerated PT2 Transformers, we select the flash_attention custom kernel and achieve faster training time per batch (measured with Nvidia A100 GPUs), going from a ~143ms/batch ba...
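Kernel selection like the flash_attention choice mentioned above can be sketched with the PyTorch 2.0-era context manager below; newer releases expose this through torch.nn.attention.sdpa_kernel instead, so treat this as an illustration rather than the exact code from the post:

    import torch
    import torch.nn.functional as F

    q = torch.randn(4, 8, 256, 64, device="cuda", dtype=torch.float16)
    k = torch.randn(4, 8, 256, 64, device="cuda", dtype=torch.float16)
    v = torch.randn(4, 8, 256, 64, device="cuda", dtype=torch.float16)

    # Restrict scaled dot product attention to the flash-attention kernel.
    with torch.backends.cuda.sdp_kernel(
        enable_flash=True, enable_math=False, enable_mem_efficient=False
    ):
        out = F.scaled_dot_product_attention(q, k, v, is_causal=True)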

        + +
        + + Read More + +
        + +
        + +
        + +
        + +
        +
        +
        + +
        +
        +
        +
        + +

        Install PyTorch

        + +

        Select your preferences and run the install command. Stable represents the most currently tested and supported version of PyTorch. This should + be suitable for many users. Preview is available if you want the latest, not fully tested and supported, builds that are generated nightly. + Please ensure that you have met the prerequisites below (e.g., numpy), depending on your package manager. You can also + install previous versions of PyTorch. Note that LibTorch is only available for C++. +

        + +

        NOTE: Latest PyTorch requires Python 3.9 or later.

        + +
        +
        +
        +
        PyTorch Build
        +
        +
        +
        Your OS
        +
        +
        +
        Package
        +
        +
        +
        Language
        +
        +
        +
        Compute Platform
        +
        +
        +
        Run this Command:
        +
        +
        + +
        +
        +
        +
        PyTorch Build
        +
        +
        +
        Stable (1.13.0)
        +
        +
        +
        Preview (Nightly)
        +
        +
        +
        +
        +
        Your OS
        +
        +
        +
        Linux
        +
        +
        +
        Mac
        +
        +
        +
        Windows
        +
        +
        +
        +
        +
        Package
        +
        +
        +
        Pip
        +
        +
        +
        LibTorch
        +
        +
        +
        Source
        +
        +
        +
        +
        +
        Language
        +
        +
        +
        Python
        +
        +
        +
        C++ / Java
        +
        +
        +
        +
        +
        Compute Platform
        +
        +
        +
        CUDA 11.8
        +
        +
        +
        CUDA 12.1
        +
        +
        +
        CUDA 12.4
        +
        +
        +
        ROCm 5.2
        +
        +
        +
        CPU
        +
        +
        +
        +
        +
        Run this Command:
        +
        +
        +
        pip install torch torchvision
        +
        +
        +
        +
        +
        + + + + Previous versions of PyTorch + +
        + +
        +

        Quick Start With
        Cloud Partners

        + +

        Get up and running with PyTorch quickly through popular cloud platforms and machine learning services.

        + +
        + + +
        +
        +
        + Google Cloud Platform +
        + + + + + +
        +
        + +
        +
        +
        +

        Microsoft Azure

        +
        + + +
        +
        + +
        +
        +
        + Lightning Studios +
        + +
        +
        +
        + +
        +
        +
        +
        + + + + +
        +
        +
        +
        +

        Docs

        +

        Access comprehensive developer documentation for PyTorch

        + View Docs +
        + +
        +

        Tutorials

        +

        Get in-depth tutorials for beginners and advanced developers

        + View Tutorials +
        + +
        +

        Resources

        +

        Find development resources and get your questions answered

        + View Resources +
        +
        +
        +
        + +
        + +
        + +
        +
        +
        +
        + + +
        +
        +
        + + +
        + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/19/index.html b/blog/19/index.html new file mode 100644 index 000000000000..8dd787fd998b --- /dev/null +++ b/blog/19/index.html @@ -0,0 +1,993 @@ + + + + + + + + + + + + + Blog | 19 of 34 | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
        +
        + Join us at PyTorch Conference in San Francisco, October 22-23. CFP open now! Learn more. +
        +
        +
        +
        + + +
        + + + + + + + + +
        + +
        +
        + + +
        + + + + + + +
        +
        + +

        Featured Post

        +

        + Experience the power of PyTorch 2.0 on AMD Solutions +

        + + + + Read More + + + +
        +
        + +
        +
        +
        +
        + + + + + + + + +
        +
        +

        April 14, 2023

        +

        + Accelerated Generative Diffusion Models with PyTorch 2 +

        +

        TL;DR: PyTorch 2.0 nightly offers out-of-the-box performance improvement for Generative Diffusion models by using the new torch.compile() compiler and optimized implementations of Multihead Attention integrated with PyTorch 2. + +

        + +
        + + Read More + +
        + + + + +
        +
        +

        April 07, 2023

        +

        + Straggler Mitigation On PyTorch DDP By Hierarchical SGD +

        +

PyTorch DDP has been widely adopted across the industry for distributed training; by default it runs synchronous SGD to synchronize gradients across model replicas at every step. The performance of this technique is critical for fast iteration during model exploration as well as for resource and cost savings. To resolve a ubiquitous performance bottleneck introduced by slow nodes in large-scale ...

        + +
        + + Read More + +
        + + + + +
        +
        +

        April 07, 2023

        +

        + Celebrate PyTorch 2.0 with New Performance Features for AI Developers +

        +

        Congratulations to the PyTorch Foundation for its release of PyTorch 2.0! In this blog, I discuss the four features for which Intel made significant contributions to PyTorch 2.0: + +

        + +
        + + Read More + +
        + + + + +
        +
        +

        April 03, 2023

        +

        + PyTorch & OpenXLA: The Path Forward +

        +

        As we celebrate the release of OpenXLA, PyTorch 2.0, and PyTorch/XLA 2.0, it’s worth taking a step back and sharing where we see it all going in the short to medium term. With PyTorch adoption leading in the AI space and XLA supporting best-in-class compiler features, PyTorch/XLA is well positioned to provide a cutting edge development stack for both model training and inference. To achieve this, we see investments in three main areas: + +

        + +
        + + Read More + +
        + + + + +
        +
        +

        March 28, 2023

        +

        + Accelerated PyTorch 2 Transformers +

        +

        The PyTorch 2.0 release includes a new high-performance implementation of the PyTorch Transformer API with the goal of making training and deployment of state-of-the-art Transformer models affordable. Following the successful release of “fastpath” inference execution (“Better Transformer”), this release introduces high-performance support for training and inference using a custom kernel architecture for scaled dot product attention (SPDA). + +

        + +
        + + Read More + +
        + + + + +
        +
        +

        March 22, 2023

        +

        + PyTorch 2.0 & XLA—The Latest Cutting Edge Features +

        +

        Today, we are excited to share our latest work for PyTorch/XLA 2.0. The release of PyTorch 2.0 is yet another major milestone for this storied community and we are excited to continue to be part of it. When the PyTorch/XLA project started in 2018 between Google and Meta, the focus was on bringing cutting edge Cloud TPUs to help support the PyTorch community. Along the way, others in the community such as Amazon joined the project and very quickly the community expanded. We are excited about X...

        + +
        + + Read More + +
        + + + + +
        +
        +

        March 16, 2023

        +

        + Accelerated Diffusers with PyTorch 2.0 +

        +

        PyTorch 2.0 has just been released. Its flagship new feature is torch.compile(), a one-line code change that promises to automatically improve performance across codebases. We have previously checked on that promise in Hugging Face Transformers and TIMM models, and delved deep into its motivation, architecture and the road ahead. + +

        + +
        + + Read More + +
        + +
        + +
        + +
        + +
        +
        +
        + +
        +
        +
        +
        + +

        Install PyTorch

        + +

        Select your preferences and run the install command. Stable represents the most currently tested and supported version of PyTorch. This should + be suitable for many users. Preview is available if you want the latest, not fully tested and supported, builds that are generated nightly. + Please ensure that you have met the prerequisites below (e.g., numpy), depending on your package manager. You can also + install previous versions of PyTorch. Note that LibTorch is only available for C++. +

        + +

        NOTE: Latest PyTorch requires Python 3.9 or later.

        + +
        +
        +
        +
        PyTorch Build
        +
        +
        +
        Your OS
        +
        +
        +
        Package
        +
        +
        +
        Language
        +
        +
        +
        Compute Platform
        +
        +
        +
        Run this Command:
        +
        +
        + +
        +
        +
        +
        PyTorch Build
        +
        +
        +
        Stable (1.13.0)
        +
        +
        +
        Preview (Nightly)
        +
        +
        +
        +
        +
        Your OS
        +
        +
        +
        Linux
        +
        +
        +
        Mac
        +
        +
        +
        Windows
        +
        +
        +
        +
        +
        Package
        +
        +
        +
        Pip
        +
        +
        +
        LibTorch
        +
        +
        +
        Source
        +
        +
        +
        +
        +
        Language
        +
        +
        +
        Python
        +
        +
        +
        C++ / Java
        +
        +
        +
        +
        +
        Compute Platform
        +
        +
        +
        CUDA 11.8
        +
        +
        +
        CUDA 12.1
        +
        +
        +
        CUDA 12.4
        +
        +
        +
        ROCm 5.2
        +
        +
        +
        CPU
        +
        +
        +
        +
        +
        Run this Command:
        +
        +
        +
        pip install torch torchvision
        +
        +
        +
        +
        +
        + + + + Previous versions of PyTorch + +
        + +
        +

        Quick Start With
        Cloud Partners

        + +

        Get up and running with PyTorch quickly through popular cloud platforms and machine learning services.

        + +
        + + +
        +
        +
        + Google Cloud Platform +
        + + + + + +
        +
        + +
        +
        +
        +

        Microsoft Azure

        +
        + + +
        +
        + +
        +
        +
        + Lightning Studios +
        + +
        +
        +
        + +
        +
        +
        +
        + + + + +
        +
        +
        +
        +

        Docs

        +

        Access comprehensive developer documentation for PyTorch

        + View Docs +
        + +
        +

        Tutorials

        +

        Get in-depth tutorials for beginners and advanced developers

        + View Tutorials +
        + +
        +

        Resources

        +

        Find development resources and get your questions answered

        + View Resources +
        +
        +
        +
        + +
        + +
        + +
        +
        +
        +
        + + +
        +
        +
        + + +
        + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/2/index.html b/blog/2/index.html new file mode 100644 index 000000000000..9c2749687733 --- /dev/null +++ b/blog/2/index.html @@ -0,0 +1,1000 @@ + + + + + + + + + + + + + Blog | 2 of 34 | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
        +
        + Join us at PyTorch Conference in San Francisco, October 22-23. CFP open now! Learn more. +
        +
        +
        +
        + + +
        + + + + + + + + +
        + +
        +
        + + +
        + + + + + + +
        +
        + +

        Featured Post

        +

        + PyTorch 2.7 Release +

        + + + + Read More + + + +
        +
        + +
        +
        +
        +
        + + + + + + + + +
        +
        +

        April 08, 2025

        +

        + Accelerating Whisper on Arm with PyTorch and Hugging Face Transformers +

        +

        Automatic speech recognition (ASR) has revolutionized how we interact with technology, clearing the way for applications like real-time audio transcription, voice assistants, and accessibility tools. OpenAI Whisper is a powerful model for ASR, capable of multilingual speech recognition and translation. + +
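A compact sketch of running Whisper through the Hugging Face pipeline API (the checkpoint name and audio file are illustrative placeholders):

    from transformers import pipeline

    asr = pipeline(
        "automatic-speech-recognition",
        model="openai/whisper-small",   # any Whisper checkpoint on the Hub works similarly
    )

    result = asr("sample.wav")          # placeholder audio file path
    print(result["text"])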

        + +
        + + Read More + +
        + + + + +
        +
        +

        April 03, 2025

        +

        + PyTorch Day France 2025: Call For Proposals Open +

        +

        We’re pleased to announce PyTorch Day France 2025, a dedicated gathering of the PyTorch community held 7 May 2025 in Paris, France. Proudly hosted by the PyTorch Foundation and co-located with GOSIM AI Paris 2025, this event will bring together developers, researchers, and practitioners driving innovation in open source AI and machine learning. + +

        + +
        + + Read More + +
        + + + + +
        +
        +

        March 19, 2025

        +

        + PyTorch Day China 2025 Call for Proposals Open +

        +

        We’re excited to announce the first-ever PyTorch Day China! This new event, hosted by the PyTorch Foundation, will take place on June 7 in Beijing, China, bringing together AI practitioners, researchers, and industry professionals to explore the latest advancements in open source AI and machine learning. Co-located with the BAAI Conference, PyTorch Day China is a chance to connect with the community, share knowledge, and help shape the future of deep learning. + +

        + +
        + + Read More + +
        + + + + +
        +
        +

        March 13, 2025

        +

        + Introducing the New PyTorch Landscape: Your Guide to the PyTorch Ecosystem +

        +

        We’re excited to reveal our brand new PyTorch Landscape. The PyTorch Landscape helps researchers, developers, and organizations easily locate useful, curated, community-built tools that augment the PyTorch core framework. + +

        + +
        + + Read More + +
        + + + + +
        +
        +

        March 11, 2025

        +

        + Scaling Recommendation Systems Training to Thousands of GPUs with 2D Sparse Parallelism +

        +

        At Meta, recommendation systems are the cornerstone of delivering relevant and personalized ads to billions of users globally. Through technologies like PyTorch’s TorchRec, we’ve successfully developed solutions that enable model training across hundreds of GPUs. While these systems have served us well, recent research on scaling laws has revealed a compelling opportunity: we can achieve significantly better model performance by training dramatically larger neural networks. + +

        + +
        + + Read More + +
        + + + + +
        +
        +

        March 06, 2025

        +

        + Peak Performance, Minimized Memory: Optimizing torchtune’s performance with torch.compile & Liger Kernel +

        +

        LinkedIn: Shivam Sahni, Byron Hsu, Yanning Chen +Meta: Ankith Gunapal, Evan Smothers + +

        + +
        + + Read More + +
        + + + + +
        +
        +

        March 05, 2025

        +

        + Current and New Activation Checkpointing Techniques in PyTorch +

        +

        As models scale in depth, batch size, and sequence length, etc, activation memory becomes an increasingly significant contributor to the overall memory usage. To help address this, PyTorch provides utilities for activation checkpointing, which reduce the number of saved tensors by recomputing them when needed, trading off memory usage for additional compute. + +
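A minimal sketch of the basic utility (torch.utils.checkpoint) that trades recomputation for saved activation memory:

    import torch
    from torch.utils.checkpoint import checkpoint

    block = torch.nn.Sequential(
        torch.nn.Linear(512, 512), torch.nn.ReLU(), torch.nn.Linear(512, 512)
    )

    x = torch.randn(8, 512, requires_grad=True)
    # Activations inside `block` are recomputed during backward instead of being stored.
    y = checkpoint(block, x, use_reentrant=False)
    y.sum().backward()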

        + +
        + + Read More + +
        + +
        + +
        + +
        + +
        +
        +
        + +
        +
        +
        +
        + +

        Install PyTorch

        + +

        Select your preferences and run the install command. Stable represents the most currently tested and supported version of PyTorch. This should + be suitable for many users. Preview is available if you want the latest, not fully tested and supported, builds that are generated nightly. + Please ensure that you have met the prerequisites below (e.g., numpy), depending on your package manager. You can also + install previous versions of PyTorch. Note that LibTorch is only available for C++. +

        + +

        NOTE: Latest PyTorch requires Python 3.9 or later.

        + +
        +
        +
        +
        PyTorch Build
        +
        +
        +
        Your OS
        +
        +
        +
        Package
        +
        +
        +
        Language
        +
        +
        +
        Compute Platform
        +
        +
        +
        Run this Command:
        +
        +
        + +
        +
        +
        +
        PyTorch Build
        +
        +
        +
        Stable (1.13.0)
        +
        +
        +
        Preview (Nightly)
        +
        +
        +
        +
        +
        Your OS
        +
        +
        +
        Linux
        +
        +
        +
        Mac
        +
        +
        +
        Windows
        +
        +
        +
        +
        +
        Package
        +
        +
        +
        Pip
        +
        +
        +
        LibTorch
        +
        +
        +
        Source
        +
        +
        +
        +
        +
        Language
        +
        +
        +
        Python
        +
        +
        +
        C++ / Java
        +
        +
        +
        +
        +
        Compute Platform
        +
        +
        +
        CUDA 11.8
        +
        +
        +
        CUDA 12.1
        +
        +
        +
        CUDA 12.4
        +
        +
        +
        ROCm 5.2
        +
        +
        +
        CPU
        +
        +
        +
        +
        +
        Run this Command:
        +
        +
        +
        pip install torch torchvision
        +
        +
        +
        +
        +
        + + + + Previous versions of PyTorch + +
        + +
        +

        Quick Start With
        Cloud Partners

        + +

        Get up and running with PyTorch quickly through popular cloud platforms and machine learning services.

        + +
        + + +
        +
        +
        + Google Cloud Platform +
        + + + + + +
        +
        + +
        +
        +
        +

        Microsoft Azure

        +
        + + +
        +
        + +
        +
        +
        + Lightning Studios +
        + +
        +
        +
        + +
        +
        +
        +
        + + + + +
        +
        +
        +
        +

        Docs

        +

        Access comprehensive developer documentation for PyTorch

        + View Docs +
        + +
        +

        Tutorials

        +

        Get in-depth tutorials for beginners and advanced developers

        + View Tutorials +
        + +
        +

        Resources

        +

        Find development resources and get your questions answered

        + View Resources +
        +
        +
        +
        + +
        + +
        + +
        +
        +
        +
        + + +
        +
        +
        + + +
        + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/20/index.html b/blog/20/index.html new file mode 100644 index 000000000000..4de7cdad8d20 --- /dev/null +++ b/blog/20/index.html @@ -0,0 +1,993 @@ + + + + + + + + + + + + + Blog | 20 of 34 | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
        +
        + Join us at PyTorch Conference in San Francisco, October 22-23. CFP open now! Learn more. +
        +
        +
        +
        + + +
        + + + + + + + + +
        + +
        +
        + + +
        + + + + + + +
        +
        + +

        Featured Post

        +

        + PyTorch 2.0: Our next generation release that is faster, more Pythonic and Dynamic as ever +

        + + + + Read More + + + +
        +
        + +
        +
        +
        +
        + + + + + + + + +
        +
        +

        March 15, 2023

        +

        + New Library Updates in PyTorch 2.0 +

        +

        Summary + +

        + +
        + + Read More + +
        + + + + + + + + + +
        +
        +

        February 02, 2023

        +

        + Deprecation of CUDA 11.6 and Python 3.7 Support +

        +

For the upcoming PyTorch 2.0 feature release (target March 2023), we will target CUDA 11.7 as the stable version and CUDA 11.8 as the experimental version of CUDA, and Python >=3.8, <=3.11. + +

        + +
        + + Read More + +
        + + + + +
        +
        +

        January 09, 2023

        +

        + PyTorch Trace Analysis for the Masses +

        +

        We are excited to announce the public release of Holistic Trace Analysis (HTA), an open source performance analysis and visualization Python library for PyTorch users. HTA takes as input Kineto traces collected by the PyTorch profiler, which are complex and challenging to interpret, and up-levels the performance information contained in these traces. It was initially developed internally at Meta to understand and debug performance problems for large-scale distributed training jobs on GPUs. Th...

        + +
        + + Read More + +
        + + + + +
        +
        +

        December 31, 2022

        +

        + Compromised PyTorch-nightly dependency chain between December 25th and December 30th, 2022. +

        +

        If you installed PyTorch-nightly on Linux via pip between December 25, 2022 and December 30, 2022, please uninstall it and torchtriton immediately, and use the latest nightly binaries (newer than Dec 30th 2022). + +

        + +
        + + Read More + +
        + + + + +
        +
        +

        December 28, 2022

        +

        + Torchserve Performance Tuning, Animated Drawings Case-Study +

        +

        In this post we discuss performance tuning of Torchserve for serving your models in production. One of the biggest challenges in the life cycle of a ML project is deploying models in production. This requires a reliable serving solution along with solutions that address the MLOps needs. A robust serving solution needs to provide support for multi model serving, model versioning, metric logging, monitoring and scaling to serve the peak traffic. In this post, we will have an overview of Torchs...

        + +
        + + Read More + +
        + + + + +
        +
        +

        December 22, 2022

        +

        + Scaling Vision Model Training Platforms with PyTorch +

        +

        TL;DR: We demonstrate the use of PyTorch with FairScale’s FullyShardedDataParallel (FSDP) API in writing large vision transformer models. We discuss our techniques for scaling and optimizing these models on a GPU cluster. The goal of this platform scaling effort is to enable research at scale. This blog does not discuss model accuracy, new model architectures, or new training recipes. + +

        + +
        + + Read More + +
        + +
        + +
        + +
        + +
        +
        +
        + +
        +
        +
        +
        + +

        Install PyTorch

        + +

        Select your preferences and run the install command. Stable represents the most currently tested and supported version of PyTorch. This should + be suitable for many users. Preview is available if you want the latest, not fully tested and supported, builds that are generated nightly. + Please ensure that you have met the prerequisites below (e.g., numpy), depending on your package manager. You can also + install previous versions of PyTorch. Note that LibTorch is only available for C++. +

        + +

        NOTE: Latest PyTorch requires Python 3.9 or later.

        + +
        +
        +
        +
        PyTorch Build
        +
        +
        +
        Your OS
        +
        +
        +
        Package
        +
        +
        +
        Language
        +
        +
        +
        Compute Platform
        +
        +
        +
        Run this Command:
        +
        +
        + +
        +
        +
        +
        PyTorch Build
        +
        +
        +
        Stable (1.13.0)
        +
        +
        +
        Preview (Nightly)
        +
        +
        +
        +
        +
        Your OS
        +
        +
        +
        Linux
        +
        +
        +
        Mac
        +
        +
        +
        Windows
        +
        +
        +
        +
        +
        Package
        +
        +
        +
        Pip
        +
        +
        +
        LibTorch
        +
        +
        +
        Source
        +
        +
        +
        +
        +
        Language
        +
        +
        +
        Python
        +
        +
        +
        C++ / Java
        +
        +
        +
        +
        +
        Compute Platform
        +
        +
        +
        CUDA 11.8
        +
        +
        +
        CUDA 12.1
        +
        +
        +
        CUDA 12.4
        +
        +
        +
        ROCm 5.2
        +
        +
        +
        CPU
        +
        +
        +
        +
        +
        Run this Command:
        +
        +
        +
        pip install torch torchvision
        +
        +
        +
        +
        +
        + + + + Previous versions of PyTorch + +
        + +
        +

        Quick Start With
        Cloud Partners

        + +

        Get up and running with PyTorch quickly through popular cloud platforms and machine learning services.

        + +
        + + +
        +
        +
        + Google Cloud Platform +
        + + + + + +
        +
        + +
        +
        +
        +

        Microsoft Azure

        +
        + + +
        +
        + +
        +
        +
        + Lightning Studios +
        + +
        +
        +
        + +
        +
        +
        +
        + + + + +
        +
        +
        +
        +

        Docs

        +

        Access comprehensive developer documentation for PyTorch

        + View Docs +
        + +
        +

        Tutorials

        +

        Get in-depth tutorials for beginners and advanced developers

        + View Tutorials +
        + +
        +

        Resources

        +

        Find development resources and get your questions answered

        + View Resources +
        +
        +
        +
        + +
        + +
        + +
        +
        +
        +
        + + +
        +
        +
        + + +
        + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/2024-year-in-review/index.html b/blog/2024-year-in-review/index.html new file mode 100644 index 000000000000..ed442c8e5796 --- /dev/null +++ b/blog/2024-year-in-review/index.html @@ -0,0 +1,713 @@ + + + + + + + + + + + + + PyTorch Grows as the Dominant Open Source Framework for AI and ML: 2024 Year in Review | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
        +
        +
        + Join us at PyTorch Conference in San Francisco, October 22-23. CFP open now! Learn more. +
        +
        + +
        +
        + + +
        + + + + + + + + +
        + +
        +
        + + + +
        +
        +
        + +
        +

        + by + + Eli Uriegas, Meta and Jennifer Bly, PyTorch Foundation + +

        +

        This past year was a monumental year for PyTorch from major releases to the flagship PyTorch Conference. We’ve seen incredible growth in contributions from more than 3,500 individuals and 3,000 organizations. It’s safe to say PyTorch has now become the dominant deep learning framework for AI/ML. PyTorch leads the model training space with a 63% adoption rate according to the recent Shaping the Future of Generative AI Report from the Linux Foundation.

        + +

        group at a conference

        + +

        The PyTorch Foundation was formed in 2022 with the goal to drive the adoption of AI tooling by fostering and sustaining an ecosystem of open source, vendor-neutral projects centered around PyTorch and today remains a vibrant, collaborative hub created for and by the deep learning community. As we wrap up the year, let’s take a look back at a few highlights and how this year has been one of growth, collaboration, innovation, and community.

        + +

        2024 Highlights: A Year of Growth and Impact

        + +

PyTorch accelerated its growth this year. Contributions are up 133%, coming from twice as many organizations worldwide compared to last year.

        + +

        The project has seen 20% year-over-year growth in new repositories using PyTorch, and a 30% increase in forks and users this past year.

        + +

        Over 70% of AI research implementations are now using PyTorch.

        + +

        Statistics based on the 2024 Linux Foundation Annual Report.

        + +

        people at a conference

        + +

The PyTorch Tools ecosystem grew by over 25%, enhancing both software and hardware capabilities. Working with all major cloud service providers, dozens of major software vendors, and industry partners, PyTorch is setting a new bar for the pace and breadth of AI innovation.

        + +

        people at a conference

        + +

This year featured four milestone PyTorch releases: 2.2, 2.3, 2.4, and 2.5. These releases brought hallmark features such as AOTInductor, FlashAttention-2 support, Tensor Parallelism, a new Python Custom Operator API, and the introduction of FlexAttention. Engineers from across PyTorch Foundation member companies have also come together to introduce support and optimizations for platforms such as Intel GPUs (XPU) and AWS Graviton processors, as well as Inductor performance improvements.

        + +

        Throughout the year the PyTorch Team has been working hard to introduce a number of new PyTorch-native libraries! The ExecuTorch team released their alpha in collaboration with partners from Arm, Apple, and Qualcomm Technologies, Inc. then quickly followed with a beta focused on stability and adding MediaTek. TorchTune established a PyTorch-native library for easily fine-tuning large language models. TorchAO introduced a PyTorch native library that makes models faster and smaller by leveraging low bit dtypes, quantization and sparsity. TorchCodec was launched to give developers a simple, performant, and PyTorch native way to decode videos into tensors. TorchRec 1.0 was released, the first stable release of the PyTorch native recommendation systems library.

        + +

        We’ve also had a number of strong technical showcases throughout the year to highlight how PyTorch can be used! TorchTitan exhibited what an open source, PyTorch-native distributed training system could look like for training large language models (LLMs). TorchChat showcased how to seamlessly and performantly run LLMs across laptop, desktop, and mobile devices.

        + +

        As well we were very excited to include multiple new projects into the PyTorch ecosystem throughout 2024, including the introduction of vLLM into the PyTorch Ecosystem, a state-of-the-art inference engine, which gives machine learning engineers an easy, fast, and cheap way of serving LLMs. If you are interested in joining the PyTorch Ecosystem, please join!

        + +

        people at a conference

        + +

        In June in Paris, France we premiered the official PyTorch documentary on powering the AI Revolution that spotlights PyTorch’s vibrant ecosystem and its role in advancing AI innovation. The film unveiled the authentic narrative of PyTorch’s inception, attributing its existence to a dedicated group of unsung heroes driving technological innovation.

        + +

        people at a conference

        + +

        The PyTorch Conference 2024, brought in triple the registrations compared to 2023, reflecting the rapid growth of AI and machine learning communities around open source technologies. The two day event included insightful talks, hands-on sessions, and lively discussions about the future of AI, covering everything from generative AI to large language models.

        + +

        A brand new Startup Showcase featured early-stage founders pitching their AI startups to a panel of top venture capitalists, a DL Compiler Mini-Summit took a deep dive into the advances in deep learning (DL) compilers that are transforming AI workloads, and a Fine-Tuning Mini-Summit brought together a thriving community of researchers, developers, practitioners and hobbyists to discuss topics like memory efficiency, parameter-efficient fine-tuning, and performance at scale.

        + +

        speaking on stage at a conference

        + +

        Outstanding contributors were honored with PyTorch Contributor Awards. Congratulations to this year’s nominees and recipients for the outstanding individuals and teams who have played a pivotal role in PyTorch’s journey this year.

        + +

        people at a conference

        + +

        PyTorch Foundation membership is growing with the addition of Arm and Rebellions this year. At the year-end mark, Premier Members include: AMD, Arm, AWS, Google Cloud, Huawei, Hugging Face, IBM, Intel, Lightning AI, Meta, Microsoft Azure, and NVIDIA. General Members include: Graphcore, Rebellions, and Snowflake. If your organization is interested in joining, find out how you can become a member of the PyTorch Foundation.

        + +

PyTorch hosted numerous in-person and virtual events, including the PyTorch Docathon, where contributors worked to improve PyTorch documentation and foster collaboration; local meetups around the world, which brought together interested parties in locations from Shanghai to Seoul; and more than a dozen webinars that drew attendees from everywhere through our Summer Webinar Series, live Q&As, and Expert Exchanges.

        + +

        Matt speaking at a conference

        + +

        PyTorch Foundation welcomed new leadership this year. Executive Director Matt White took the reins in April and immediately began raising the profile of PyTorch across the AI landscape. The Technical Advisory Council (TAC) also elected new leadership with Luca Antiga, Lightning AI as the Chair and Jiong Gong, Intel as Vice Chair.

        + +

        The PyTorch Governing Board continued to set the direction and lead the Foundation in accomplishing its mission. The PyTorch Marketing and Outreach Committee developed programs to maximize the visibility of PyTorch and advance the interests of the community. The PyTorch CI Working Group assembled to successfully migrate the PyTorch CI pipeline to the Linux Foundation.

        + +

        Our community joined us on social media with 775 thousand followers strong across X, LinkedIn, Facebook, and YouTube with more than 12 million impressions of PyTorch content throughout the year. The PyTorch Ecosystem also grew, adding many new projects to leverage PyTorch deep learning across many vertical domains.

        + +

        people at a conference

        + +

        PyTorch was mentioned in the media in top technology publications such as The New Stack’s article on Why PyTorch Gets All the Love and InfoWorld’s article on how the TorchAO PyTorch library makes models faster and smaller.

        + +

        We published 74 technical and community blogs, and nearly ten million people visited the PyTorch website throughout the year.

        + +

        fire dancers at a conference

        + +

        Thanks to each of you who helped make this year an outstanding success! The evolution and growth we’ve seen PyTorch undergo over the past year is driven by the passion, dedication, and ingenuity of this amazing community. Looking ahead to next year, we’re excited to build on this momentum as we continue to push the boundaries of AI.

        + +

        Save the date for the PyTorch Conference which will be held October 22-23, 2025 in San Francisco. 2025 promises even greater innovation and stronger community collaboration.

        + +
        +
        +
        +
        + + +
        +
        +
        +
        +

        Docs

        +

        Access comprehensive developer documentation for PyTorch

        + View Docs +
        + +
        +

        Tutorials

        +

        Get in-depth tutorials for beginners and advanced developers

        + View Tutorials +
        + +
        +

        Resources

        +

        Find development resources and get your questions answered

        + View Resources +
        +
        +
        +
        + +
        + +
        + +
        +
        +
        +
        + + +
        +
        +
        + + +
        + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/2025-priorities-for-tac/index.html b/blog/2025-priorities-for-tac/index.html new file mode 100644 index 000000000000..58d2bb2c0553 --- /dev/null +++ b/blog/2025-priorities-for-tac/index.html @@ -0,0 +1,664 @@ + + + + + + + + + + + + + 2025 Priorities for the PyTorch Technical Advisory Council (TAC) | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
        +
        +
        + Join us at PyTorch Conference in San Francisco, October 22-23. CFP open now! Learn more. +
        +
        + +
        +
        + + +
        + + + + + + + + +
        + +
        +
        + + + +
        +
        +
        + +
        +

        + by + + Luca Antiga, PyTorch TAC Chair + +

        +

        social share

        + +

2024 has been a year of incredible growth for PyTorch. As that continues in 2025, the PyTorch Foundation has taken important steps toward evolving the governance of the project under the Linux Foundation’s vendor-neutral umbrella.

        + +

        An important piece of governance for PyTorch is represented by the Technical Advisory Council (TAC). The TAC acts as a bridge between the industry, including but not limited to the PyTorch Foundation members, the community, and the PyTorch core development team.

        + +

        Operating with transparency and inclusivity, the TAC gathers input, facilitates collaboration, and drives initiatives that enhance the experience for everyone who relies on PyTorch.

        + +

        In 2025, the TAC will focus on four key areas:

        + +
          +
1. Build an Open, Multi-Cloud Continuous Integration (CI): Building on the groundwork from 2024, the TAC will oversee the transition to an open, community-driven CI infrastructure. PyTorch’s CI is complex and upholds an extremely high bar for correctness, with many automated functional and performance test runs every day. In 2025, PyTorch’s CI infrastructure will be fully open sourced and extended to support multiple compute providers, enabling broader contribution and participation from organizations that benefit from PyTorch.
2. Support more Accelerators: The TAC is committed to creating a level playing field for the growing landscape of AI accelerators. By bringing together industry players and PyTorch developers, the TAC will facilitate efforts towards third-party device support and help define levels of integration of external CI systems with the main PyTorch CI. This will make it easier for emerging hardware to gain adoption within the PyTorch ecosystem, and for users to experiment with diverse compute options for training and inference.
3. Create a High-Quality, User-Centric Ecosystem: A big focus for the TAC in early 2025 is improving the experience and discoverability of the PyTorch ecosystem. With many projects growing organically, users often face challenges navigating projects of different scope and quality within the rapidly changing AI landscape. To address this, a newly curated ecosystem landscape tool will launch soon on the PyTorch website. We will also introduce lightweight, open processes to improve projects and give users a predictable, high-quality experience. In many ways, the experience with PyTorch is only as good as its ecosystem.
4. Gather Feedback from Industry and the Community: PyTorch has widespread adoption across research labs, startups, and enterprises. Striking the right balance between expressiveness and performance across the board is a challenging task, so the TAC set out to be one of several channels through which the Core development team receives signals. During our monthly TAC meetings, we give PyTorch Foundation members from industry and academia, as well as non-member organizations, the opportunity to present their use cases and challenges and discuss them directly with the appropriate members of the Core team. This feedback loop helps prioritize improvements, ensuring the framework stays relevant in a fast-evolving AI landscape.
        + +

        By focusing on these priorities, the TAC aims to maintain PyTorch’s position as the leading deep learning framework, while ensuring it remains open, accessible, and responsive to the needs of its diverse community.

        + +

        As members of the TAC, we’re extremely excited to contribute to the success of PyTorch and to the impact it’s having in the real world. If you are a PyTorch user or developer, consider participating in our monthly calls (they are open to everyone, and the recordings are available here). Also, if you develop or maintain a project based on PyTorch, consider contributing it to the new PyTorch ecosystem (instructions).

        + +
        +
        +
        +
        + + +
        +
        +
        +
        +

        Docs

        +

        Access comprehensive developer documentation for PyTorch

        + View Docs +
        + +
        +

        Tutorials

        +

        Get in-depth tutorials for beginners and advanced developers

        + View Tutorials +
        + +
        +

        Resources

        +

        Find development resources and get your questions answered

        + View Resources +
        +
        +
        +
        + +
        + +
        + +
        +
        +
        +
        + + +
        +
        +
        + + +
        + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/21/index.html b/blog/21/index.html new file mode 100644 index 000000000000..63c53e856beb --- /dev/null +++ b/blog/21/index.html @@ -0,0 +1,993 @@ + + + + + + + + + + + + + Blog | 21 of 34 | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
        +
        + Join us at PyTorch Conference in San Francisco, October 22-23. CFP open now! Learn more. +
        +
        +
        +
        + + +
        + + + + + + + + +
        + +
        +
        + + +
        + + + + + + +
        +
        + +

        Featured Post

        +

        + Efficient Large-Scale Training with Pytorch FSDP and AWS +

        + + + + Read More + + + +
        +
        + +
        +
        +
        +
        + + + + + + + + +
        +
        +

        December 15, 2022

        +

        + Scaling PyTorch FSDP for Training Foundation Models on IBM Cloud +

        +

        Large model training using a cloud native approach is of growing interest for many enterprises given the emergence and success of foundation models. Some AI practitioners may assume that the only way they can achieve high GPU utilization for distributed training jobs is to run them on HPC systems, such as those interconnected with InfiniBand, and may not consider Ethernet-connected systems. We demonstrate how the latest distributed training technique, Fully Sharded Data Parallel (FSDP) from P...

        + +
        + + Read More + +
        + + + + +
        +
        +

        December 02, 2022

        +

        + Get Started with PyTorch 2.0 Summary and Overview +

        +

        Introducing PyTorch 2.0, our first steps toward the next generation 2-series release of PyTorch. Over the last few years we have innovated and iterated from PyTorch 1.0 to the most recent 1.13 and moved to the newly formed PyTorch Foundation, part of the Linux Foundation. + +

        + +
        + + Read More + +
        + + + + +
        +
        +

        December 02, 2022

        +

        + Accelerating Hugging Face and TIMM models with PyTorch 2.0 +

        +

        torch.compile() makes it easy to experiment with different compiler backends to make PyTorch code faster with a single line decorator, torch.compile(). It works directly over an nn.Module, acting as a drop-in replacement for torch.jit.script() without requiring you to make any source code changes. We expect this one-line code change to provide you with 30% to 2x training time speedups on the vast majority of models that you’re already running.
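        As a minimal sketch of the one-line opt-in described above (the toy model and tensor shapes are illustrative, not from the post):

            import torch
            import torch.nn as nn

            # Illustrative toy model; any nn.Module or plain function works the same way
            model = nn.Sequential(nn.Linear(8, 32), nn.ReLU(), nn.Linear(32, 1))

            compiled_model = torch.compile(model)   # single-line opt-in to the compiler

            x = torch.randn(4, 8)
            y = compiled_model(x)                   # first call compiles; later calls reuse the compiled graph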

        + +
        + + Read More + +
        + + + + + + + + + + + + + + + + + + + +
        +
        +

        November 17, 2022

        +

        + Introducing TorchMultimodal - a library for accelerating exploration in Multimodal AI +

        +

        We are announcing TorchMultimodal Beta, a PyTorch domain library for training SoTA multi-task multimodal models at scale. The library provides composable building blocks (modules, transforms, loss functions) to accelerate model development, SoTA model architectures (FLAVA, MDETR, Omnivore) from published research, training and evaluation scripts, as well as notebooks for exploring these models. The library is under active development, and we’d love to hear your feedback! You can find more det...

        + +
        + + Read More + +
        + +
        + +
        + +
        + +
        +
        +
        + +
        +
        +
        +
        + +

        Install PyTorch

        + +

        Select your preferences and run the install command. Stable represents the most currently tested and supported version of PyTorch. This should + be suitable for many users. Preview is available if you want the latest, not fully tested and supported, builds that are generated nightly. + Please ensure that you have met the prerequisites below (e.g., numpy), depending on your package manager. You can also + install previous versions of PyTorch. Note that LibTorch is only available for C++. +

        + +

        NOTE: Latest PyTorch requires Python 3.9 or later.

        + +
        +
        +
        +
        PyTorch Build
        +
        +
        +
        Your OS
        +
        +
        +
        Package
        +
        +
        +
        Language
        +
        +
        +
        Compute Platform
        +
        +
        +
        Run this Command:
        +
        +
        + +
        +
        +
        +
        PyTorch Build
        +
        +
        +
        Stable (1.13.0)
        +
        +
        +
        Preview (Nightly)
        +
        +
        +
        +
        +
        Your OS
        +
        +
        +
        Linux
        +
        +
        +
        Mac
        +
        +
        +
        Windows
        +
        +
        +
        +
        +
        Package
        +
        +
        +
        Pip
        +
        +
        +
        LibTorch
        +
        +
        +
        Source
        +
        +
        +
        +
        +
        Language
        +
        +
        +
        Python
        +
        +
        +
        C++ / Java
        +
        +
        +
        +
        +
        Compute Platform
        +
        +
        +
        CUDA 11.8
        +
        +
        +
        CUDA 12.1
        +
        +
        +
        CUDA 12.4
        +
        +
        +
        ROCm 5.2
        +
        +
        +
        CPU
        +
        +
        +
        +
        +
        Run this Command:
        +
        +
        +
        pip install torch torchvision
        +
        +
        +
        +
        +
        + + + + Previous versions of PyTorch + +
        + +
        +

        Quick Start With
        Cloud Partners

        + +

        Get up and running with PyTorch quickly through popular cloud platforms and machine learning services.

        + +
        + + +
        +
        +
        + Google Cloud Platform +
        + + + + + +
        +
        + +
        +
        +
        +

        Microsoft Azure

        +
        + + +
        +
        + +
        +
        +
        + Lightning Studios +
        + +
        +
        +
        + +
        +
        +
        +
        + + + + +
        +
        +
        +
        +

        Docs

        +

        Access comprehensive developer documentation for PyTorch

        + View Docs +
        + +
        +

        Tutorials

        +

        Get in-depth tutorials for beginners and advanced developers

        + View Tutorials +
        + +
        +

        Resources

        +

        Find development resources and get your questions answered

        + View Resources +
        +
        +
        +
        + +
        + +
        + +
        +
        +
        +
        + + +
        +
        +
        + + +
        + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/22/index.html b/blog/22/index.html new file mode 100644 index 000000000000..270922c8f32a --- /dev/null +++ b/blog/22/index.html @@ -0,0 +1,995 @@ + + + + + + + + + + + + + Blog | 22 of 34 | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
        +
        + Join us at PyTorch Conference in San Francisco, October 22-23. CFP open now! Learn more. +
        +
        +
        +
        + + +
        + + + + + + + + +
        + +
        +
        + + +
        + + + + + + +
        +
        + +

        Featured Post

        +

        + PyTorch Enterprise Support Program Update +

        + + + + Read More + + + +
        +
        + +
        +
        +
        +
        + + + + + + + + +
        +
        +

        November 03, 2022

        +

        + Extending TorchVision’s Transforms to Object Detection, Segmentation & Video tasks +

        +

        Note: A previous version of this post was published in November 2022. We have updated this post with the most up-to-date info, in view of the upcoming 0.15 release of torchvision in March 2023, jointly with PyTorch 2.0. + +

        + +
        + + Read More + +
        + + + + +
        +
        +

        October 28, 2022

        +

        + New Library Updates in PyTorch 1.13 +

        +

        Summary + +

        + +
        + + Read More + +
        + + + + +
        +
        +

        October 28, 2022

        +

        + PyTorch 1.13 release, including beta versions of functorch and improved support for Apple’s new M1 chips. +

        +

        We are excited to announce the release of PyTorch® 1.13 (release note)! This includes Stable versions of BetterTransformer. We deprecated CUDA 10.2 and 11.3 and completed migration of CUDA 11.6 and 11.7. Beta includes improved support for Apple M1 chips and functorch, a library that offers composable vmap (vectorization) and autodiff transforms, being included in-tree with the PyTorch release. This release is composed of over 3,749 commits and 467 contributors since 1.12.1. We want to sincere...

        + +
        + + Read More + +
        + + + + +
        +
        +

        October 17, 2022

        +

        + PyTorch’s Tracing Based Selective Build +

        +

        Introduction + +

        + +
        + + Read More + +
        + + + + +
        +
        +

        October 13, 2022

        +

        + Scaling PyTorch models on Cloud TPUs with FSDP +

        +

        Introduction + +

        + +
        + + Read More + +
        + + + + +
        +
        +

        September 29, 2022

        +

        + Performance Debugging of Production PyTorch Models at Meta +

        +

        1. Meta’s AI Performance Profiling (MAIProf) + +

        + +
        + + Read More + +
        + + + + +
        +
        +

        September 26, 2022

        +

        + Announcing PyTorch Conference 2022 +

        +

        We are excited to announce that the PyTorch Conference returns in person as a satellite event to NeurIPS (Neural Information Processing Systems) in New Orleans on Dec. 2nd.

        + +
        + + Read More + +
        + +
        + +
        + +
        + +
        +
        +
        + +
        +
        +
        +
        + +

        Install PyTorch

        + +

        Select your preferences and run the install command. Stable represents the most currently tested and supported version of PyTorch. This should + be suitable for many users. Preview is available if you want the latest, not fully tested and supported, builds that are generated nightly. + Please ensure that you have met the prerequisites below (e.g., numpy), depending on your package manager. You can also + install previous versions of PyTorch. Note that LibTorch is only available for C++. +

        + +

        NOTE: Latest PyTorch requires Python 3.9 or later.

        + +
        +
        +
        +
        PyTorch Build
        +
        +
        +
        Your OS
        +
        +
        +
        Package
        +
        +
        +
        Language
        +
        +
        +
        Compute Platform
        +
        +
        +
        Run this Command:
        +
        +
        + +
        +
        +
        +
        PyTorch Build
        +
        +
        +
        Stable (1.13.0)
        +
        +
        +
        Preview (Nightly)
        +
        +
        +
        +
        +
        Your OS
        +
        +
        +
        Linux
        +
        +
        +
        Mac
        +
        +
        +
        Windows
        +
        +
        +
        +
        +
        Package
        +
        +
        +
        Pip
        +
        +
        +
        LibTorch
        +
        +
        +
        Source
        +
        +
        +
        +
        +
        Language
        +
        +
        +
        Python
        +
        +
        +
        C++ / Java
        +
        +
        +
        +
        +
        Compute Platform
        +
        +
        +
        CUDA 11.8
        +
        +
        +
        CUDA 12.1
        +
        +
        +
        CUDA 12.4
        +
        +
        +
        ROCm 5.2
        +
        +
        +
        CPU
        +
        +
        +
        +
        +
        Run this Command:
        +
        +
        +
        pip install torch torchvision
        +
        +
        +
        +
        +
        + + + + Previous versions of PyTorch + +
        + +
        +

        Quick Start With
        Cloud Partners

        + +

        Get up and running with PyTorch quickly through popular cloud platforms and machine learning services.

        + +
        + + +
        +
        +
        + Google Cloud Platform +
        + + + + + +
        +
        + +
        +
        +
        +

        Microsoft Azure

        +
        + + +
        +
        + +
        +
        +
        + Lightning Studios +
        + +
        +
        +
        + +
        +
        +
        +
        + + + + +
        +
        +
        +
        +

        Docs

        +

        Access comprehensive developer documentation for PyTorch

        + View Docs +
        + +
        +

        Tutorials

        +

        Get in-depth tutorials for beginners and advanced developers

        + View Tutorials +
        + +
        +

        Resources

        +

        Find development resources and get your questions answered

        + View Resources +
        +
        +
        +
        + +
        + +
        + +
        +
        +
        +
        + + +
        +
        +
        + + +
        + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/23/index.html b/blog/23/index.html new file mode 100644 index 000000000000..6fe200decb71 --- /dev/null +++ b/blog/23/index.html @@ -0,0 +1,997 @@ + + + + + + + + + + + + + Blog | 23 of 34 | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
        +
        + Join us at PyTorch Conference in San Francisco, October 22-23. CFP open now! Learn more. +
        +
        +
        +
        + + +
        + + + + + + + + +
        + +
        +
        + + +
        + + + + + + +
        +
        + +

        Featured Post

        +

        + PyTorch strengthens its governance by joining the Linux Foundation +

        + + + + Read More + + + +
        +
        + +
        +
        +
        +
        + + + + + + + + +
        +
        +

        August 29, 2022

        +

        + Fast Beam Search Decoding in PyTorch with TorchAudio and Flashlight Text +

        +

        Beam search decoding with industry-leading speed from Flashlight Text (part of the Flashlight ML framework) is now available with official support in TorchAudio, bringing high-performance beam search and text utilities for speech and text applications built on top of PyTorch. The current integration supports CTC-style decoding, but it can be used for any modeling setting that outputs token-level probability distributions over time steps. + +

        + +
        + + Read More + +
        + + + + +
        +
        +

        August 26, 2022

        +

        + Introducing nvFuser, a deep learning compiler for PyTorch +

        +

        nvFuser is a Deep Learning Compiler for NVIDIA GPUs that automatically just-in-time compiles fast and flexible kernels to reliably accelerate users’ networks. It provides significant speedups for deep learning networks running on Volta and later CUDA accelerators by generating fast custom “fusion” kernels at runtime. nvFuser is specifically designed to meet the unique requirements of the PyTorch community, and it supports diverse network architectures and programs with dynamic inputs of varyi...

        + +
        + + Read More + +
        + + + + +
        +
        +

        August 24, 2022

        +

        + Accelerating PyTorch Vision Models with Channels Last on CPU +

        +

        Overview + +

        + +
        + + Read More + +
        + + + + +
        +
        +

        August 18, 2022

        +

        + Easily list and initialize models with new APIs in TorchVision +

        +

        TorchVision now supports listing and initializing all available built-in models and weights by name. This new API builds upon the recently introduced Multi-weight support API, is currently in Beta, and addresses a long-standing request from the community.
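        As a rough sketch of how listing and initializing by name can look (assuming the names under which this API later shipped in torchvision; the Beta described above may differ in detail):

            from torchvision.models import get_model, list_models

            print(list_models()[:5])                          # names of available built-in models
            model = get_model("resnet18", weights="DEFAULT")  # build a model by name with its default weights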

        + +
        + + Read More + +
        + + + + + + + + + + + + + + +
        +
        +

        July 19, 2022

        +

        + What Every User Should Know About Mixed Precision Training in PyTorch +

        +

        Efficient training of modern neural networks often relies on using lower precision data types. Peak float16 matrix multiplication and convolution performance is 16x faster than peak float32 performance on A100 GPUs. And since the float16 and bfloat16 data types are only half the size of float32, they can double the performance of bandwidth-bound kernels and reduce the memory required to train a network, allowing for larger models, larger batches, or larger inputs. Using a module like torch.amp...
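        A minimal training-step sketch using automatic mixed precision (the model, data, and optimizer here are illustrative, and a CUDA device is assumed):

            import torch

            model = torch.nn.Linear(128, 10).cuda()
            optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
            scaler = torch.cuda.amp.GradScaler()          # rescales the loss so float16 gradients don't underflow

            for _ in range(10):
                x = torch.randn(32, 128, device="cuda")
                target = torch.randint(0, 10, (32,), device="cuda")
                optimizer.zero_grad()
                with torch.cuda.amp.autocast():           # eligible ops run in lower precision
                    loss = torch.nn.functional.cross_entropy(model(x), target)
                scaler.scale(loss).backward()
                scaler.step(optimizer)
                scaler.update()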

        + +
        + + Read More + +
        + +
        + +
        + +
        + +
        +
        +
        + +
        +
        +
        +
        + +

        Install PyTorch

        + +

        Select your preferences and run the install command. Stable represents the most currently tested and supported version of PyTorch. This should + be suitable for many users. Preview is available if you want the latest, not fully tested and supported, builds that are generated nightly. + Please ensure that you have met the prerequisites below (e.g., numpy), depending on your package manager. You can also + install previous versions of PyTorch. Note that LibTorch is only available for C++. +

        + +

        NOTE: Latest PyTorch requires Python 3.9 or later.

        + +
        +
        +
        +
        PyTorch Build
        +
        +
        +
        Your OS
        +
        +
        +
        Package
        +
        +
        +
        Language
        +
        +
        +
        Compute Platform
        +
        +
        +
        Run this Command:
        +
        +
        + +
        +
        +
        +
        PyTorch Build
        +
        +
        +
        Stable (1.13.0)
        +
        +
        +
        Preview (Nightly)
        +
        +
        +
        +
        +
        Your OS
        +
        +
        +
        Linux
        +
        +
        +
        Mac
        +
        +
        +
        Windows
        +
        +
        +
        +
        +
        Package
        +
        +
        +
        Pip
        +
        +
        +
        LibTorch
        +
        +
        +
        Source
        +
        +
        +
        +
        +
        Language
        +
        +
        +
        Python
        +
        +
        +
        C++ / Java
        +
        +
        +
        +
        +
        Compute Platform
        +
        +
        +
        CUDA 11.8
        +
        +
        +
        CUDA 12.1
        +
        +
        +
        CUDA 12.4
        +
        +
        +
        ROCm 5.2
        +
        +
        +
        CPU
        +
        +
        +
        +
        +
        Run this Command:
        +
        +
        +
        pip install torch torchvision
        +
        +
        +
        +
        +
        + + + + Previous versions of PyTorch + +
        + +
        +

        Quick Start With
        Cloud Partners

        + +

        Get up and running with PyTorch quickly through popular cloud platforms and machine learning services.

        + +
        + + +
        +
        +
        + Google Cloud Platform +
        + + + + + +
        +
        + +
        +
        +
        +

        Microsoft Azure

        +
        + + +
        +
        + +
        +
        +
        + Lightning Studios +
        + +
        +
        +
        + +
        +
        +
        +
        + + + + +
        +
        +
        +
        +

        Docs

        +

        Access comprehensive developer documentation for PyTorch

        + View Docs +
        + +
        +

        Tutorials

        +

        Get in-depth tutorials for beginners and advanced developers

        + View Tutorials +
        + +
        +

        Resources

        +

        Find development resources and get your questions answered

        + View Resources +
        +
        +
        +
        + +
        + +
        + +
        +
        +
        +
        + + +
        +
        +
        + + +
        + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/24/index.html b/blog/24/index.html new file mode 100644 index 000000000000..778bd83ec28b --- /dev/null +++ b/blog/24/index.html @@ -0,0 +1,996 @@ + + + + + + + + + + + + + Blog | 24 of 34 | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
        +
        + Join us at PyTorch Conference in San Francisco, October 22-23. CFP open now! Learn more. +
        +
        +
        +
        + + +
        + + + + + + + + +
        + +
        +
        + + +
        + + + + + + +
        +
        + +

        Featured Post

        +

        + Case Study: PathAI Uses PyTorch to Improve Patient Outcomes with AI-powered Pathology +

        + + + + Read More + + + +
        +
        + +
        +
        +
        +
        + + + + + + + + +
        +
        +

        July 12, 2022

        +

        + A BetterTransformer for Fast Transformer Inference +

        +

        tl;dr Transformers achieve state-of-the-art performance for NLP, and are becoming popular for a myriad of other tasks. They are computationally expensive, which has been a blocker to their widespread productionisation. Launching with PyTorch 1.12, BetterTransformer implements a backwards-compatible fast path of torch.nn.TransformerEncoder for Transformer Encoder Inference and does not require model authors to modify their models. BetterTransformer improvements can exceed 2x in speedup and thro...
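        A rough sketch of the kind of inference setup the fast path targets (shapes and sizes are illustrative; the exact conditions under which the fast path activates are simplified here):

            import torch
            import torch.nn as nn

            layer = nn.TransformerEncoderLayer(d_model=256, nhead=8, batch_first=True)
            encoder = nn.TransformerEncoder(layer, num_layers=4)
            encoder.eval()                               # inference configuration

            src = torch.rand(2, 16, 256)                 # (batch, sequence, features)
            with torch.inference_mode():                 # no autograd bookkeeping
                out = encoder(src)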

        + +
        + + Read More + +
        + + + + +
        +
        +

        June 28, 2022

        +

        + PyTorch 1.12: TorchArrow, Functional API for Modules and nvFuser, are now available +

        +

        We are excited to announce the release of PyTorch 1.12 (release note)! This release is composed of over 3,124 commits and 433 contributors. Along with 1.12, we are releasing beta versions of AWS S3 Integration, PyTorch Vision Models on Channels Last on CPU, Empowering PyTorch on Intel® Xeon® Scalable processors with Bfloat16 and FSDP API. We want to sincerely thank our dedicated community for your contributions.

        + +
        + + Read More + +
        + + + + +
        +
        +

        June 28, 2022

        +

        + New library updates in PyTorch 1.12 +

        +

        We are bringing a number of improvements to the current PyTorch libraries, alongside the PyTorch 1.12 release. These updates demonstrate our focus on developing common and extensible APIs across all domains to make it easier for our community to build ecosystem projects on PyTorch. + +

        + +
        + + Read More + +
        + + + + +
        +
        +

        June 27, 2022

        +

        + How Computational Graphs are Executed in PyTorch +

        +

        Welcome to the last entry in the series on understanding the autograd engine of PyTorch! If you haven’t read parts 1 & 2, check them now to understand how PyTorch creates the computational graph for the backward pass!

        + +
        + + Read More + +
        + + + + +
        +
        +

        June 23, 2022

        +

        + Geospatial deep learning with TorchGeo +

        +

        TorchGeo is a PyTorch domain library providing datasets, samplers, transforms, and pre-trained models specific to geospatial data. + +

        + +
        + + Read More + +
        + + + + + + + + + +
        +
        +

        May 18, 2022

        +

        + Introducing Accelerated PyTorch Training on Mac +

        +

        In collaboration with the Metal engineering team at Apple, we are excited to announce support for GPU-accelerated PyTorch training on Mac. Until now, PyTorch training on Mac only leveraged the CPU, but with the upcoming PyTorch v1.12 release, developers and researchers can take advantage of Apple silicon GPUs for significantly faster model training. This unlocks the ability to perform machine learning workflows like prototyping and fine-tuning locally, right on Mac. + +
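        A minimal sketch of targeting the Apple silicon GPU through the MPS backend (tensor shapes are illustrative):

            import torch

            device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")

            model = torch.nn.Linear(64, 8).to(device)
            x = torch.randn(16, 64, device=device)
            y = model(x)                                 # runs on the Apple GPU when MPS is available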

        + +
        + + Read More + +
        + +
        + +
        + +
        + +
        +
        +
        + +
        +
        +
        +
        + +

        Install PyTorch

        + +

        Select your preferences and run the install command. Stable represents the most currently tested and supported version of PyTorch. This should + be suitable for many users. Preview is available if you want the latest, not fully tested and supported, builds that are generated nightly. + Please ensure that you have met the prerequisites below (e.g., numpy), depending on your package manager. You can also + install previous versions of PyTorch. Note that LibTorch is only available for C++. +

        + +

        NOTE: Latest PyTorch requires Python 3.9 or later.

        + +
        +
        +
        +
        PyTorch Build
        +
        +
        +
        Your OS
        +
        +
        +
        Package
        +
        +
        +
        Language
        +
        +
        +
        Compute Platform
        +
        +
        +
        Run this Command:
        +
        +
        + +
        +
        +
        +
        PyTorch Build
        +
        +
        +
        Stable (1.13.0)
        +
        +
        +
        Preview (Nightly)
        +
        +
        +
        +
        +
        Your OS
        +
        +
        +
        Linux
        +
        +
        +
        Mac
        +
        +
        +
        Windows
        +
        +
        +
        +
        +
        Package
        +
        +
        +
        Pip
        +
        +
        +
        LibTorch
        +
        +
        +
        Source
        +
        +
        +
        +
        +
        Language
        +
        +
        +
        Python
        +
        +
        +
        C++ / Java
        +
        +
        +
        +
        +
        Compute Platform
        +
        +
        +
        CUDA 11.8
        +
        +
        +
        CUDA 12.1
        +
        +
        +
        CUDA 12.4
        +
        +
        +
        ROCm 5.2
        +
        +
        +
        CPU
        +
        +
        +
        +
        +
        Run this Command:
        +
        +
        +
        pip install torch torchvision
        +
        +
        +
        +
        +
        + + + + Previous versions of PyTorch + +
        + +
        +

        Quick Start With
        Cloud Partners

        + +

        Get up and running with PyTorch quickly through popular cloud platforms and machine learning services.

        + +
        + + +
        +
        +
        + Google Cloud Platform +
        + + + + + +
        +
        + +
        +
        +
        +

        Microsoft Azure

        +
        + + +
        +
        + +
        +
        +
        + Lightning Studios +
        + +
        +
        +
        + +
        +
        +
        +
        + + + + +
        +
        +
        +
        +

        Docs

        +

        Access comprehensive developer documentation for PyTorch

        + View Docs +
        + +
        +

        Tutorials

        +

        Get in-depth tutorials for beginners and advanced developers

        + View Tutorials +
        + +
        +

        Resources

        +

        Find development resources and get your questions answered

        + View Resources +
        +
        +
        +
        + +
        + +
        + +
        +
        +
        +
        + + +
        +
        +
        + + +
        + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/25/index.html b/blog/25/index.html new file mode 100644 index 000000000000..8307aefa4a0e --- /dev/null +++ b/blog/25/index.html @@ -0,0 +1,1000 @@ + + + + + + + + + + + + + Blog | 25 of 34 | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
        +
        + Join us at PyTorch Conference in San Francisco, October 22-23. CFP open now! Learn more. +
        +
        +
        +
        + + +
        + + + + + + + + +
        + +
        +
        + + +
        + + + + + + +
        + +
        + +
        +
        +
        +
        + + + + + + + + +
        +
        +

        March 16, 2022

        +

        + Running PyTorch Models on Jetson Nano +

        +

        Overview: NVIDIA Jetson Nano, part of the Jetson family of products or Jetson modules, is a small yet powerful Linux (Ubuntu)-based embedded computer with 2/4GB of GPU memory. With it, you can run many PyTorch models efficiently. This document summarizes our experience of running different deep learning models using 3 different mechanisms on Jetson Nano:

        + +
        + + Read More + +
        + + + + +
        +
        +

        March 14, 2022

        +

        + Introducing PyTorch Fully Sharded Data Parallel (FSDP) API +

        +

        Recent studies have shown that large model training will be beneficial for improving model quality. During the last 3 years, model size grew 10,000 times from BERT with 110M parameters to Megatron-2 with one trillion. However, training large AI models is not easy—aside from the need for large amounts of computing resources, software engineering complexity is also challenging. PyTorch has been working on building tools and infrastructure to make it easier. + +

        + +
        + + Read More + +
        + + + + +
        +
        +

        March 10, 2022

        +

        + PyTorch 1.11, TorchData, and functorch are now available +

        +

        We are excited to announce the release of PyTorch 1.11 (release notes). This release is composed of over 3,300 commits since 1.10, made by 434 contributors. Along with 1.11, we are releasing beta versions of TorchData and functorch. + +

        + +
        + + Read More + +
        + + + + +
        +
        +

        March 10, 2022

        +

        + Introducing TorchRec, and other domain library updates in PyTorch 1.11 +

        +

        We are introducing the beta release of TorchRec and a number of improvements to the current PyTorch domain libraries, alongside the PyTorch 1.11 release. These updates demonstrate our focus on developing common and extensible APIs across all domains to make it easier for our community to build ecosystem projects on PyTorch. Highlights include: + +

        + +
        + + Read More + +
        + + + + + + + + + +
        +
        +

        February 24, 2022

        +

        + Case Study: Amazon Ads Uses PyTorch and AWS Inferentia to Scale Models for Ads Processing +

        +

        Amazon Ads uses PyTorch, TorchServe, and AWS Inferentia to reduce inference costs by 71% and drive scale out. + +

        + +
        + + Read More + +
        + + + + +
        +
        +

        February 23, 2022

        +

        + Introducing TorchRec, a library for modern production recommendation systems +

        +

        We are excited to announce TorchRec, a PyTorch domain library for Recommendation Systems. This new library provides common sparsity and parallelism primitives, enabling researchers to build state-of-the-art personalization models and deploy them in production. + +

        + +
        + + Read More + +
        + +
        + +
        + +
        + +
        +
        +
        + +
        +
        +
        +
        + +

        Install PyTorch

        + +

        Select your preferences and run the install command. Stable represents the most currently tested and supported version of PyTorch. This should + be suitable for many users. Preview is available if you want the latest, not fully tested and supported, builds that are generated nightly. + Please ensure that you have met the prerequisites below (e.g., numpy), depending on your package manager. You can also + install previous versions of PyTorch. Note that LibTorch is only available for C++. +

        + +

        NOTE: Latest PyTorch requires Python 3.9 or later.

        + +
        +
        +
        +
        PyTorch Build
        +
        +
        +
        Your OS
        +
        +
        +
        Package
        +
        +
        +
        Language
        +
        +
        +
        Compute Platform
        +
        +
        +
        Run this Command:
        +
        +
        + +
        +
        +
        +
        PyTorch Build
        +
        +
        +
        Stable (1.13.0)
        +
        +
        +
        Preview (Nightly)
        +
        +
        +
        +
        +
        Your OS
        +
        +
        +
        Linux
        +
        +
        +
        Mac
        +
        +
        +
        Windows
        +
        +
        +
        +
        +
        Package
        +
        +
        +
        Pip
        +
        +
        +
        LibTorch
        +
        +
        +
        Source
        +
        +
        +
        +
        +
        Language
        +
        +
        +
        Python
        +
        +
        +
        C++ / Java
        +
        +
        +
        +
        +
        Compute Platform
        +
        +
        +
        CUDA 11.8
        +
        +
        +
        CUDA 12.1
        +
        +
        +
        CUDA 12.4
        +
        +
        +
        ROCm 5.2
        +
        +
        +
        CPU
        +
        +
        +
        +
        +
        Run this Command:
        +
        +
        +
        pip install torch torchvision
        +
        +
        +
        +
        +
        + + + + Previous versions of PyTorch + +
        + +
        +

        Quick Start With
        Cloud Partners

        + +

        Get up and running with PyTorch quickly through popular cloud platforms and machine learning services.

        + +
        + + +
        +
        +
        + Google Cloud Platform +
        + + + + + +
        +
        + +
        +
        +
        +

        Microsoft Azure

        +
        + + +
        +
        + +
        +
        +
        + Lightning Studios +
        + +
        +
        +
        + +
        +
        +
        +
        + + + + +
        +
        +
        +
        +

        Docs

        +

        Access comprehensive developer documentation for PyTorch

        + View Docs +
        + +
        +

        Tutorials

        +

        Get in-depth tutorials for beginners and advanced developers

        + View Tutorials +
        + +
        +

        Resources

        +

        Find development resources and get your questions answered

        + View Resources +
        +
        +
        +
        + +
        + +
        + +
        +
        +
        +
        + + +
        +
        +
        + + +
        + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/26/index.html b/blog/26/index.html new file mode 100644 index 000000000000..25031f242d9d --- /dev/null +++ b/blog/26/index.html @@ -0,0 +1,995 @@ + + + + + + + + + + + + + Blog | 26 of 34 | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
        +
        + Join us at PyTorch Conference in San Francisco, October 22-23. CFP open now! Learn more. +
        +
        +
        +
        + + +
        + + + + + + + + +
        + +
        +
        + + +
        + + + + + + +
        +
        + +

        Featured Post

        +

        + Practical Quantization in PyTorch +

        + + + + Read More + + + +
        +
        + +
        +
        +
        +
        + + + + + + + + +
        +
        +

        December 22, 2021

        +

        + Introducing TorchVision’s New Multi-Weight Support API +

        +

        TorchVision has a new backwards compatible API for building models with multi-weight support. The new API allows loading different pre-trained weights on the same model variant, keeps track of vital meta-data such as the classification labels and includes the preprocessing transforms necessary for using the models. In this blog post, we plan to review the prototype API, showcase its features and highlight key differences from the existing one.
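        A short sketch of the multi-weight pattern (using the names under which the API later stabilized; the prototype reviewed in the post may differ in detail):

            from torchvision.models import resnet50, ResNet50_Weights

            weights = ResNet50_Weights.IMAGENET1K_V2     # a specific pre-trained weight set
            model = resnet50(weights=weights).eval()
            preprocess = weights.transforms()            # the preprocessing that matches these weights
            print(weights.meta["categories"][:3])        # class labels travel with the weights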

        + +
        + + Read More + +
        + + + + +
        +
        +

        December 15, 2021

        +

        + Efficient PyTorch: Tensor Memory Format Matters +

        +

        Ensuring the right memory format for your inputs can significantly impact the running time of your PyTorch vision models. When in doubt, choose a Channels Last memory format. + +
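        A minimal sketch of opting a vision model and its input into the channels-last memory format (the model choice is illustrative):

            import torch
            import torchvision.models as models

            model = models.resnet50().eval().to(memory_format=torch.channels_last)
            x = torch.randn(1, 3, 224, 224).contiguous(memory_format=torch.channels_last)

            with torch.no_grad():
                out = model(x)                           # same results; data is laid out NHWC under the hood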

        + +
        + + Read More + +
        + + + + +
        +
        +

        December 08, 2021

        +

        + Announcing the Winners of the 2021 PyTorch Annual Hackathon +

        +

        More than 1,900 people worked hard in this year’s PyTorch Annual Hackathon to create unique tools and applications for PyTorch developers and researchers. + +

        + +
        + + Read More + +
        + + + + + + + + + +
        +
        +

        October 29, 2021

        +

        + Feature Extraction in TorchVision using Torch FX +

        +

        + +

        + +
        + + Read More + +
        + + + + +
        +
        +

        October 26, 2021

        +

        + Accelerating PyTorch with CUDA Graphs +

        +

        Today, we are pleased to announce a new advanced CUDA feature, CUDA Graphs, has been brought to PyTorch. Modern DL frameworks have complicated software stacks that incur significant overheads associated with the submission of each operation to the GPU. When DL workloads are strong-scaled to many GPUs for performance, the time taken by each GPU operation diminishes to just a few microseconds and, in these cases, the high work submission latencies of frameworks often lead to low utilization of ...

        + +
        + + Read More + +
        + + + + +
        +
        +

        October 21, 2021

        +

        + PyTorch 1.10 Release, including CUDA Graphs APIs, Frontend and Compiler Improvements +

        +

        We are excited to announce the release of PyTorch 1.10. This release is composed of over 3,400 commits since 1.9, made by 426 contributors. We want to sincerely thank our community for continuously improving PyTorch. + +

        + +
        + + Read More + +
        + +
        + +
        + +
        + +
        +
        +
        + +
        +
        +
        +
        + +

        Install PyTorch

        + +

        Select your preferences and run the install command. Stable represents the most currently tested and supported version of PyTorch. This should + be suitable for many users. Preview is available if you want the latest, not fully tested and supported, builds that are generated nightly. + Please ensure that you have met the prerequisites below (e.g., numpy), depending on your package manager. You can also + install previous versions of PyTorch. Note that LibTorch is only available for C++. +

        + +

        NOTE: Latest PyTorch requires Python 3.9 or later.

        + +
        +
        +
        +
        PyTorch Build
        +
        +
        +
        Your OS
        +
        +
        +
        Package
        +
        +
        +
        Language
        +
        +
        +
        Compute Platform
        +
        +
        +
        Run this Command:
        +
        +
        + +
        +
        +
        +
        PyTorch Build
        +
        +
        +
        Stable (1.13.0)
        +
        +
        +
        Preview (Nightly)
        +
        +
        +
        +
        +
        Your OS
        +
        +
        +
        Linux
        +
        +
        +
        Mac
        +
        +
        +
        Windows
        +
        +
        +
        +
        +
        Package
        +
        +
        +
        Pip
        +
        +
        +
        LibTorch
        +
        +
        +
        Source
        +
        +
        +
        +
        +
        Language
        +
        +
        +
        Python
        +
        +
        +
        C++ / Java
        +
        +
        +
        +
        +
        Compute Platform
        +
        +
        +
        CUDA 11.8
        +
        +
        +
        CUDA 12.1
        +
        +
        +
        CUDA 12.4
        +
        +
        +
        ROCm 5.2
        +
        +
        +
        CPU
        +
        +
        +
        +
        +
        Run this Command:
        +
        +
        +
        pip install torch torchvision
        +
        +
        +
        +
        +
        + + + + Previous versions of PyTorch + +
        + +
        +

        Quick Start With
        Cloud Partners

        + +

        Get up and running with PyTorch quickly through popular cloud platforms and machine learning services.

        + +
        + + +
        +
        +
        + Google Cloud Platform +
        + + + + + +
        +
        + +
        +
        +
        +

        Microsoft Azure

        +
        + + +
        +
        + +
        +
        +
        + Lightning Studios +
        + +
        +
        +
        + +
        +
        +
        +
        + + + + +
        +
        +
        +
        +

        Docs

        +

        Access comprehensive developer documentation for PyTorch

        + View Docs +
        + +
        +

        Tutorials

        +

        Get in-depth tutorials for beginners and advanced developers

        + View Tutorials +
        + +
        +

        Resources

        +

        Find development resources and get your questions answered

        + View Resources +
        +
        +
        +
        + +
        + +
        + +
        +
        +
        +
        + + +
        +
        +
        + + +
        + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/27/index.html b/blog/27/index.html new file mode 100644 index 000000000000..2807203041ab --- /dev/null +++ b/blog/27/index.html @@ -0,0 +1,995 @@ + + + + + + + + + + + + + Blog | 27 of 34 | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
        +
        + Join us at PyTorch Conference in San Francisco, October 22-23. CFP open now! Learn more. +
        +
        +
        +
        + + +
        + + + + + + + + +
        + +
        +
        + + +
        + + + + + + +
        +
        + +

        Featured Post

        +

        + New Library Releases in PyTorch 1.10, including TorchX, TorchAudio, TorchVision +

        + + + + Read More + + + +
        +
        + +
        +
        +
        +
        + + + + + + + + +
        +
        +

        September 08, 2021

        +

        + Announcing PyTorch Annual Hackathon 2021 +

        +

        We’re excited to announce the PyTorch Annual Hackathon 2021! This year, we’re looking to support the community in creating innovative PyTorch tools, libraries, and applications. 2021 is the third year we’re hosting this Hackathon, and we welcome you to join the PyTorch community and put your machine learning skills into action. Submissions start on September 8 and end on November 3. Good luck to everyone! + +

        + +
        + + Read More + +
        + + + + +
        +
        +

        August 31, 2021

        +

        + How Computational Graphs are Constructed in PyTorch +

        +

        In the previous post we went over the theoretical foundations of automatic differentiation and reviewed the implementation in PyTorch. In this post, we will be showing the parts of PyTorch involved in creating the graph and executing it. In order to understand the following contents, please read @ezyang’s wonderful blog post about PyTorch internals. + +

        + +
        + + Read More + +
        + + + + +
        +
        +

        August 23, 2021

        +

        + Announcing PyTorch Developer Day 2021 +

        +

        We are excited to announce PyTorch Developer Day (#PTD2), taking place virtually from December 1 & 2, 2021. Developer Day is designed for developers and users to discuss core technical developments, ideas, and roadmaps. + +

        + +
        + + Read More + +
        + + + + +
        +
        +

        August 18, 2021

        +

        + PipeTransformer: Automated Elastic Pipelining for Distributed Training of Large-scale Models +

        +

        In this blog post, we describe the first peer-reviewed research paper that explores accelerating the hybrid of PyTorch DDP (torch.nn.parallel.DistributedDataParallel) [1] and Pipeline (torch.distributed.pipeline) - PipeTransformer: Automated Elastic Pipelining for Distributed Training of Large-scale Models (Transformers such as BERT [2] and ViT [3]), published at ICML 2021. + +

        + +
        + + Read More + +
        + + + + +
        +
        +

        August 03, 2021

        +

        + What’s New in PyTorch Profiler 1.9? +

        +

        PyTorch Profiler v1.9 has been released! The goal of this new release (previous PyTorch Profiler release) is to provide you with new state-of-the-art tools to help diagnose and fix machine learning performance issues regardless of whether you are working on one or numerous machines. The objective is to target the execution steps that are the most costly in time and/or memory, and visualize the workload distribution between GPUs and CPUs.
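        A small sketch of collecting a trace with the torch.profiler API (the model and shapes are illustrative):

            import torch
            from torch.profiler import ProfilerActivity, profile, record_function

            model = torch.nn.Linear(512, 512)
            x = torch.randn(64, 512)

            with profile(activities=[ProfilerActivity.CPU], record_shapes=True) as prof:
                with record_function("forward_pass"):
                    model(x)

            print(prof.key_averages().table(sort_by="cpu_time_total", row_limit=5))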

        + +
        + + Read More + +
        + + + + +
        +
        +

        June 27, 2021

        +

        + Everything You Need To Know About Torchvision’s SSDlite Implementation +

        +

        In the previous article, we’ve discussed how the SSD algorithm works, covered its implementation details and presented its training process. If you have not read the previous blog post, I encourage you to check it out before continuing. + +

        + +
        + + Read More + +
        + + + + +
        +
        +

        June 23, 2021

        +

        + The torch.linalg module: Accelerated Linear Algebra with Autograd in PyTorch +

        +

        Linear algebra is essential to deep learning and scientific computing, and it’s always been a core part of PyTorch. PyTorch 1.9 extends PyTorch’s support for linear algebra operations with the torch.linalg module. This module, documented here, has 26 operators, including faster and easier to use versions of older PyTorch operators, every function from NumPy’s linear algebra module extended with accelerator and autograd support, and a few operators that are completely new. This makes the torch...
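        A short sketch of the module in use, with autograd flowing through a linear solve (shapes are illustrative):

            import torch

            A = torch.randn(4, 4, requires_grad=True)
            b = torch.randn(4)

            x = torch.linalg.solve(A, b)                 # solve A @ x = b
            x.sum().backward()                           # gradients flow back to A

            U, S, Vh = torch.linalg.svd(A.detach())      # NumPy-style naming and semantics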

        + +
        + + Read More + +
        + +
        + +
        + +
        + +
        +
        +
        + +
        +
        +
        +
        + +

        Install PyTorch

        + +

        Select your preferences and run the install command. Stable represents the most currently tested and supported version of PyTorch. This should + be suitable for many users. Preview is available if you want the latest, not fully tested and supported, builds that are generated nightly. + Please ensure that you have met the prerequisites below (e.g., numpy), depending on your package manager. You can also + install previous versions of PyTorch. Note that LibTorch is only available for C++. +

        + +

        NOTE: Latest PyTorch requires Python 3.9 or later.

        + +
        +
        +
        +
        PyTorch Build
        +
        +
        +
        Your OS
        +
        +
        +
        Package
        +
        +
        +
        Language
        +
        +
        +
        Compute Platform
        +
        +
        +
        Run this Command:
        +
        +
        + +
        +
        +
        +
        PyTorch Build
        +
        +
        +
        Stable (1.13.0)
        +
        +
        +
        Preview (Nightly)
        +
        +
        +
        +
        +
        Your OS
        +
        +
        +
        Linux
        +
        +
        +
        Mac
        +
        +
        +
        Windows
        +
        +
        +
        +
        +
        Package
        +
        +
        +
        Pip
        +
        +
        +
        LibTorch
        +
        +
        +
        Source
        +
        +
        +
        +
        +
        Language
        +
        +
        +
        Python
        +
        +
        +
        C++ / Java
        +
        +
        +
        +
        +
        Compute Platform
        +
        +
        +
        CUDA 11.8
        +
        +
        +
        CUDA 12.1
        +
        +
        +
        CUDA 12.4
        +
        +
        +
        ROCm 5.2
        +
        +
        +
        CPU
        +
        +
        +
        +
        +
        Run this Command:
        +
        +
        +
        pip install torch torchvision
        +
        +
        +
        +
        +
        + + + + Previous versions of PyTorch + +
        + +
        +

        Quick Start With
        Cloud Partners

        + +

        Get up and running with PyTorch quickly through popular cloud platforms and machine learning services.

        + +
        + + +
        +
        +
        + Google Cloud Platform +
        + + + + + +
        +
        + +
        +
        +
        +

        Microsoft Azure

        +
        + + +
        +
        + +
        +
        +
        + Lightning Studios +
        + +
        +
        +
        + +
        +
        +
        +
        + + + + +
        +
        +
        +
        +

        Docs

        +

        Access comprehensive developer documentation for PyTorch

        + View Docs +
        + +
        +

        Tutorials

        +

        Get in-depth tutorials for beginners and advanced developers

        + View Tutorials +
        + +
        +

        Resources

        +

        Find development resources and get your questions answered

        + View Resources +
        +
        +
        +
        + +
        + +
        + +
        +
        +
        +
        + + +
        +
        +
        + + +
        + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/28/index.html b/blog/28/index.html new file mode 100644 index 000000000000..0203ea43473c --- /dev/null +++ b/blog/28/index.html @@ -0,0 +1,999 @@ + + + + + + + + + + + + + Blog | 28 of 34 | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
        +
        + Join us at PyTorch Conference in San Francisco, October 22-23. CFP open now! Learn more. +
        +
        +
        +
        + + +
        + + + + + + + + +
        + +
        +
        + + +
        + + + + + + +
        +
        + +

        Featured Post

        +

        + An Overview of the PyTorch Mobile Demo Apps +

        + + + + Read More + + + +
        +
        + +
        +
        +
        +
        + + + + + + + + +
        +
        +

        June 16, 2021

        +

        + Everything You Need To Know About Torchvision’s SSD Implementation +

        +

        In TorchVision v0.10, we’ve released two new Object Detection models based on the SSD architecture. Our plan is to cover the key implementation details of the algorithms along with information on how they were trained in a two-part article. + +

        + +
        + + Read More + +
        + + + + +
        +
        +

        June 15, 2021

        +

        + PyTorch 1.9 Release, including torch.linalg and Mobile Interpreter +

        +

        We are excited to announce the release of PyTorch 1.9. The release is composed of more than 3,400 commits since 1.8, made by 398 contributors. The release notes are available here. Highlights include: + + Major improvements to support scientific computing, including torch.linalg, torch.special, and Complex Autograd + Major improvements in on-device binary size with Mobile Interpreter + Native support for elastic fault-tolerant training through the upstreaming of TorchElastic into PyTorch Core...

        + +
        + + Read More + +
        + + + + +
        +
        +

        June 15, 2021

        +

        + New PyTorch Library Releases in PyTorch 1.9, including TorchVision, TorchAudio, and more +

        +

        Today, we are announcing updates to a number of PyTorch libraries, alongside the PyTorch 1.9 release. The updates include new releases for the domain libraries including TorchVision, TorchText and TorchAudio. These releases, along with the PyTorch 1.9 release, include a number of new features and improvements that will provide a broad set of updates for the PyTorch community. + +

        + +
        + + Read More + +
        + + + + +
        +
        +

        June 08, 2021

        +

        + Overview of PyTorch Autograd Engine +

        +

        This blog post is based on PyTorch version 1.8, although it should apply for older versions too, since most of the mechanics have remained constant. + +

        + +
        + + Read More + +
        + + + + +
        +
        +

        May 26, 2021

        +

        + Everything you need to know about TorchVision’s MobileNetV3 implementation +

        +

        In TorchVision v0.9, we released a series of new mobile-friendly models that can be used for Classification, Object Detection and Semantic Segmentation. In this article, we will dig deep into the code of the models, share notable implementation details, explain how we configured and trained them, and highlight important tradeoffs we made during their tuning. Our goal is to disclose technical details that typically remain undocumented in the original papers and repos of the models. + +

        + +
        + + Read More + +
        + + + + +
        +
        +

        May 25, 2021

        +

        + Announcing the PyTorch Enterprise Support Program +

        +

        Today, we are excited to announce the PyTorch Enterprise Support Program, a participatory program that enables service providers to develop and offer tailored enterprise-grade support to their customers. This new offering, built in collaboration between Facebook and Microsoft, was created in direct response to feedback from PyTorch enterprise users who are developing models in production at scale for mission-critical applications. + +

        + +
        + + Read More + +
        + + + + +
        +
        +

        May 10, 2021

        +

        + PyTorch Ecosystem Day 2021 Recap and New Contributor Resources +

        +

        Thank you to our incredible community for making the first ever PyTorch Ecosystem Day a success! The day was filled with discussions on new developments, trends and challenges showcased through 71 posters, 32 breakout sessions and 6 keynote speakers. + +

        + +
        + + Read More + +
        + +
        + +
        + +
        + +
        +
        +
        + +
        +
        +
        +
        + +

        Install PyTorch

        + +

        Select your preferences and run the install command. Stable represents the most currently tested and supported version of PyTorch. This should + be suitable for many users. Preview is available if you want the latest, not fully tested and supported, builds that are generated nightly. + Please ensure that you have met the prerequisites below (e.g., numpy), depending on your package manager. You can also + install previous versions of PyTorch. Note that LibTorch is only available for C++. +

        + +

        NOTE: Latest PyTorch requires Python 3.9 or later.

        + +
        +
        +
        +
        PyTorch Build
        +
        +
        +
        Your OS
        +
        +
        +
        Package
        +
        +
        +
        Language
        +
        +
        +
        Compute Platform
        +
        +
        +
        Run this Command:
        +
        +
        + +
        +
        +
        +
        PyTorch Build
        +
        +
        +
        Stable (1.13.0)
        +
        +
        +
        Preview (Nightly)
        +
        +
        +
        +
        +
        Your OS
        +
        +
        +
        Linux
        +
        +
        +
        Mac
        +
        +
        +
        Windows
        +
        +
        +
        +
        +
        Package
        +
        +
        +
        Pip
        +
        +
        +
        LibTorch
        +
        +
        +
        Source
        +
        +
        +
        +
        +
        Language
        +
        +
        +
        Python
        +
        +
        +
        C++ / Java
        +
        +
        +
        +
        +
        Compute Platform
        +
        +
        +
        CUDA 11.8
        +
        +
        +
        CUDA 12.1
        +
        +
        +
        CUDA 12.4
        +
        +
        +
        ROCm 5.2
        +
        +
        +
        CPU
        +
        +
        +
        +
        +
        Run this Command:
        +
        +
        +
        pip install torch torchvision
        +
        +
        +
        +
        +
        + + + + Previous versions of PyTorch + +
        + +
        +

        Quick Start With
        Cloud Partners

        + +

        Get up and running with PyTorch quickly through popular cloud platforms and machine learning services.

        + +
        + + +
        +
        +
        + Google Cloud Platform +
        + + + + + +
        +
        + +
        +
        +
        +

        Microsoft Azure

        +
        + + +
        +
        + +
        +
        +
        + Lightning Studios +
        + +
        +
        +
        + +
        +
        +
        +
        + + + + +
        +
        +
        +
        +

        Docs

        +

        Access comprehensive developer documentation for PyTorch

        + View Docs +
        + +
        +

        Tutorials

        +

        Get in-depth tutorials for beginners and advanced developers

        + View Tutorials +
        + +
        +

        Resources

        +

        Find development resources and get your questions answered

        + View Resources +
        +
        +
        +
        + +
        + +
        + +
        +
        +
        +
        + + +
        +
        +
        + + +
diff --git a/blog/29/index.html b/blog/29/index.html
new file mode 100644
index 000000000000..2f6193ec60e1
--- /dev/null
+++ b/blog/29/index.html
@@ -0,0 +1,1000 @@
+ Blog | 29 of 34 | PyTorch
        +
        + Join us at PyTorch Conference in San Francisco, October 22-23. CFP open now! Learn more. +
        +
        +
        +
        + + +
        + + + + + + + + +
        + +
        +
        + + +
        + + + + + + +
        +
        + +

        Featured Post

        +

        + An overview of the ML models introduced in TorchVision v0.9 +

        + + + + Read More + + + +
        +
        + +
        +
        +
        +
        + + + + + + + + +
        +
        +

        March 25, 2021

        +

        + Introducing PyTorch Profiler - the new and improved performance tool +

        +

        Along with PyTorch 1.8.1 release, we are excited to announce PyTorch Profiler – the new and improved performance debugging profiler for PyTorch. Developed as part of a collaboration between Microsoft and Facebook, the PyTorch Profiler is an open-source tool that enables accurate and efficient performance analysis and troubleshooting for large-scale deep learning models. + +

        + +
        + + Read More + +
        + + + + +
        +
        +

        March 24, 2021

        +

        + PyTorch for AMD ROCm™ Platform now available as Python package +

        +

        With the PyTorch 1.8 release, we are delighted to announce a new installation option for users of +PyTorch on the ROCm™ open software platform. An installable Python package is now hosted on +pytorch.org, along with instructions for local installation in the same simple, selectable format as +PyTorch packages for CPU-only configurations and other GPU platforms. PyTorch on ROCm includes full +capability for mixed-precision and large-scale training using AMD’s MIOpen & RCCL libraries. This +prov...

        + +
        + + Read More + +
        + + + + +
        +
        +

        March 09, 2021

        +

        + Announcing PyTorch Ecosystem Day +

        +

        We’re proud to announce our first PyTorch Ecosystem Day. The virtual, one-day event will focus completely on our Ecosystem and Industry PyTorch communities! + +

        + +
        + + Read More + +
        + + + + +
        +
        +

        March 04, 2021

        +

        + PyTorch 1.8 Release, including Compiler and Distributed Training updates, and New Mobile Tutorials +

        +

        We are excited to announce the availability of PyTorch 1.8. This release is composed of more than 3,000 commits since 1.7. It includes major updates and new features for compilation, code optimization, frontend APIs for scientific computing, and AMD ROCm support through binaries that are available via pytorch.org. It also provides improved features for large-scale training for pipeline and model parallelism, and gradient compression. A few of the highlights include: + + Support for doing pytho...

        + +
        + + Read More + +
        + + + + +
        +
        +

        March 04, 2021

        +

        + New PyTorch library releases including TorchVision Mobile, TorchAudio I/O, and more +

        +

        Today, we are announcing updates to a number of PyTorch libraries, alongside the PyTorch 1.8 release. The updates include new releases for the domain libraries including TorchVision, TorchText and TorchAudio as well as new version of TorchCSPRNG. These releases include a number of new features and improvements and, along with the PyTorch 1.8 release, provide a broad set of updates for the PyTorch community to build on and leverage. + +

        + +
        + + Read More + +
        + + + + +
        +
        +

        March 03, 2021

        +

        + The torch.fft module: Accelerated Fast Fourier Transforms with Autograd in PyTorch +

        +

        The Fast Fourier Transform (FFT) calculates the Discrete Fourier Transform in O(n log n) time. It is foundational to a wide variety of numerical algorithms and signal processing techniques since it makes working in signals’ “frequency domains” as tractable as working in their spatial or temporal domains. + +
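Because torch.fft operations are differentiable like any other tensor op, a quick sketch (not from the post) shows gradients flowing back through an FFT:

```python
import torch

signal = torch.randn(256, requires_grad=True)

spectrum = torch.fft.rfft(signal)      # one-sided FFT of a real signal
power = spectrum.abs().pow(2).sum()    # scalar objective built from the spectrum

power.backward()                       # gradients propagate through the FFT
print(signal.grad.shape)               # torch.Size([256])
```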

        + +
        + + Read More + +
        + + + + +
        +
        +

        November 12, 2020

        +

        + Prototype Features Now Available - APIs for Hardware Accelerated Mobile and ARM64 Builds +

        +

        Today, we are announcing four PyTorch prototype features. The first three of these will enable Mobile machine-learning developers to execute models on the full set of hardware (HW) engines making up a system-on-chip (SOC). This gives developers options to optimize their model execution for unique performance, power, and system-level concurrency. + +

        + +
        + + Read More + +
        + +
        + +
        + +
        + +
        +
        +
        + +
        +
        +
        +
        + +

diff --git a/blog/3/index.html b/blog/3/index.html
new file mode 100644
index 000000000000..1f14b89b6cb2
--- /dev/null
+++ b/blog/3/index.html
@@ -0,0 +1,1000 @@
+ Blog | 3 of 34 | PyTorch
        +
        + Join us at PyTorch Conference in San Francisco, October 22-23. CFP open now! Learn more. +
        +
        +
        +
        + + +
        + + + + + + + + +
        + +
        +
        + + +
        + + + + + + +
        +
        + +

        Featured Post

        +

        + 📣 Submit to Speak at PyTorch Conference + Save on Registration +

        + + + + Read More + + + +
        +
        + +
        +
        +
        +
        + + + + + + + + +
        +
        +

        February 26, 2025

        +

        + Accelerating Generative AI with PyTorch: Segment Anything 2 - Fast and furious inference with low latency and fast cold starts +

        +

        This post is a follow-up to our first entry in the multi-series blog focused on how to accelerate generative AI models with pure, native PyTorch and a focus on latency and elastic scalability. We use torch.compile and torch.export to create highly optimized low latency versions of SAM2 that can be quickly scaled up on new instances. + +
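The SAM2 work relies on torch.compile and torch.export; as a generic, hedged illustration (a toy module, not SAM2 itself), capturing an exportable program looks roughly like this:

```python
import torch
from torch import nn
from torch.export import export

class TinyModel(nn.Module):
    def forward(self, x):
        return torch.relu(x) + 1

# Capture a whole-graph ExportedProgram that can be shipped to serving instances.
example_inputs = (torch.randn(4, 8),)
exported = export(TinyModel(), example_inputs)

# The exported program can be run directly or handed to ahead-of-time compilation.
out = exported.module()(torch.randn(4, 8))
print(out.shape)
```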

        + +
        + + Read More + +
        + + + + +
        +
        +

        February 11, 2025

        +

        + Unlocking the Latest Features in PyTorch 2.6 for Intel Platforms +

        +

        PyTorch* 2.6 has just been released with a set of exciting new features including torch.compile compatibility with Python 3.13, new security and performance enhancements, and a change in the default parameter for torch.load. PyTorch also announced the deprecation of its official Anaconda channel. + +
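The torch.load change mentioned here is the weights_only default; a small sketch (file name is illustrative) that passes the flag explicitly keeps code unambiguous on either side of 2.6:

```python
import torch

model = torch.nn.Linear(4, 2)
torch.save(model.state_dict(), "linear.pt")

# Passing weights_only explicitly avoids relying on the version-dependent default.
state = torch.load("linear.pt", weights_only=True)
model.load_state_dict(state)
```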

        + +
        + + Read More + +
        + + + + +
        +
        +

        February 05, 2025

        +

        + Enabling advanced GPU features in PyTorch - Warp Specialization +

        +

        Meta: Hongtao Yu, Manman Ren, Bert Maher, Shane Nay +NVIDIA: Gustav Zhu, Shuhao Jiang + +

        + +
        + + Read More + +
        + + + + +
        +
        +

        January 29, 2025

        +

        + PyTorch 2.6 Release Blog +

        +

        We are excited to announce the release of PyTorch® 2.6 (release notes)! This release features multiple improvements for PT2: torch.compile can now be used with Python 3.13; new performance-related knob torch.compiler.set_stance; several AOTInductor enhancements. Besides the PT2 improvements, another highlight is FP16 support on X86 CPUs. + +
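A minimal torch.compile sketch (not from the release notes) showing the one-line opt-in that the Python 3.13 support extends:

```python
import torch

def f(x):
    return torch.sin(x) ** 2 + torch.cos(x) ** 2

compiled_f = torch.compile(f)   # compiles on first call, caches thereafter

x = torch.randn(1024)
print(torch.allclose(compiled_f(x), f(x)))
```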

        + +
        + + Read More + +
        + + + + + + + + + +
        +
        +

        January 24, 2025

        +

        + How Intel Uses PyTorch to Empower Generative AI through Intel Arc GPUs +

        +

        Intel has long been at the forefront of technological innovation, and its recent venture into Generative AI (GenAI) solutions is no exception. With the rise of AI-powered gaming experiences, Intel sought to deliver an accessible and intuitive GenAI inferencing solution tailored for AI PCs powered by Intel’s latest GPUs. By leveraging PyTorch as the backbone for development efforts, Intel successfully launched AI Playground, an open source application that showcases advanced GenAI workloads. + +

        + +
        + + Read More + +
        + + + + +
        +
        +

        January 21, 2025

        +

        + Accelerating LLM Inference with GemLite, TorchAO and SGLang +

        +

        Large Language Models (LLMs) are typically very resource-intensive, requiring significant amounts of memory, compute and power to operate effectively. Quantization provides a solution by reducing weights and activations from 16 bit floats to lower bitrates (e.g., 8 bit, 4 bit, 2 bit), achieving significant speedup and memory savings and also enables support for larger batch sizes. + +

        + +
        + + Read More + +
        + +
        + +
        + +
        + +
        +
        +
        + +
        +
        +
        +
        + +

diff --git a/blog/30/index.html b/blog/30/index.html
new file mode 100644
index 000000000000..39179ec7c4a7
--- /dev/null
+++ b/blog/30/index.html
@@ -0,0 +1,989 @@
+ Blog | 30 of 34 | PyTorch
        +
        + Join us at PyTorch Conference in San Francisco, October 22-23. CFP open now! Learn more. +
        +
        +
        +
        + + +
        + + + + + + + + +
        + +
        +
        + + +
        + + + + + + +
        +
        + +

        Featured Post

        +

        + Announcing PyTorch Developer Day 2020 +

        + + + + Read More + + + +
        +
        + +
        +
        +
        +
        + + + + + + + + +
        +
        +

        October 27, 2020

        +

        + PyTorch 1.7 released w/ CUDA 11, New APIs for FFTs, Windows support for Distributed training and more +

        +

        Today, we’re announcing the availability of PyTorch 1.7, along with updated domain libraries. The PyTorch 1.7 release includes a number of new APIs including support for NumPy-Compatible FFT operations, profiling tools and major updates to both distributed data parallel (DDP) and remote procedure call (RPC) based distributed training. In addition, several features moved to stable including custom C++ Classes, the memory profiler, extensions via custom tensor-like objects, user async functions...

        + +
        + + Read More + +
        + + + + +
        +
        +

        October 01, 2020

        +

        + Announcing the Winners of the 2020 Global PyTorch Summer Hackathon +

        +

        More than 2,500 participants in this year’s Global PyTorch Summer Hackathon pushed the envelope to create unique new tools and applications for PyTorch developers and researchers. + +

        + +
        + + Read More + +
        + + + + +
        +
        +

        August 24, 2020

        +

        + PyTorch framework for cryptographically secure random number generation, torchcsprng, now available +

        +

        One of the key components of modern cryptography is the pseudorandom number generator. Katz and Lindell stated, “The use of badly designed or inappropriate random number generators can often leave a good cryptosystem vulnerable to attack. Particular care must be taken to use a random number generator that is designed for cryptographic use, rather than a ‘general-purpose’ random number generator which may be fine for some applications but not ones that are required to be cryptographically secu...

        + +
        + + Read More + +
        + + + + +
        +
        +

        August 18, 2020

        +

        + PyTorch 1.6 now includes Stochastic Weight Averaging +

        +

        Do you use stochastic gradient descent (SGD) or Adam? Regardless of the procedure you use to train your neural network, you can likely achieve significantly better generalization at virtually no additional cost with a simple new technique now natively supported in PyTorch 1.6, Stochastic Weight Averaging (SWA) [1]. Even if you have already trained your model, it’s easy to realize the benefits of SWA by running SWA for a small number of epochs starting with a pre-trained model. Again and again...
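A condensed sketch of the torch.optim.swa_utils workflow added in 1.6; the random data, learning rates and schedule lengths below are placeholders, not values from the post.

```python
import torch
from torch import nn, optim
from torch.optim.swa_utils import AveragedModel, SWALR

model = nn.Linear(10, 1)
optimizer = optim.SGD(model.parameters(), lr=0.1)

swa_model = AveragedModel(model)              # running average of the weights
swa_scheduler = SWALR(optimizer, swa_lr=0.05)

for step in range(100):
    x, y = torch.randn(32, 10), torch.randn(32, 1)
    loss = nn.functional.mse_loss(model(x), y)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    if step >= 75:                            # average only over the SWA phase
        swa_model.update_parameters(model)
        swa_scheduler.step()

# For models with BatchNorm, recompute running stats with
# torch.optim.swa_utils.update_bn(loader, swa_model) before evaluation.
```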

        + +
        + + Read More + +
        + + + + +
        +
        +

        August 11, 2020

        +

        + Efficient PyTorch I/O library for Large Datasets, Many Files, Many GPUs +

        +

        Data sets are growing bigger every day and GPUs are getting faster. This means there are more data sets for deep learning researchers and engineers to train and validate their models. + +

        + +
        + + Read More + +
        + + + + +
        +
        +

        July 28, 2020

        +

        + PyTorch 1.6 released w/ Native AMP Support, Microsoft joins as maintainers for Windows +

        +

        Today, we’re announcing the availability of PyTorch 1.6, along with updated domain libraries. We are also excited to announce the team at Microsoft is now maintaining Windows builds and binaries and will also be supporting the community on GitHub as well as the PyTorch Windows discussion forums. + +

        + +
        + + Read More + +
        + + + + +
        +
        +

        July 28, 2020

        +

        + PyTorch feature classification changes +

        +

        Traditionally features in PyTorch were classified as either stable or experimental with an implicit third option of testing bleeding edge features by building master or through installing nightly builds (available via prebuilt whls). This has, in a few cases, caused some confusion around the level of readiness, commitment to the feature and backward compatibility that can be expected from a user perspective. Moving forward, we’d like to better classify the 3 types of features as well as defin...

        + +
        + + Read More + +
        + +
        + +
        + +
        + +
        +
        +
        + +
        +
        +
        +
        + +

diff --git a/blog/31/index.html b/blog/31/index.html
new file mode 100644
index 000000000000..884555a7cc0c
--- /dev/null
+++ b/blog/31/index.html
@@ -0,0 +1,993 @@
+ Blog | 31 of 34 | PyTorch
        +
        + Join us at PyTorch Conference in San Francisco, October 22-23. CFP open now! Learn more. +
        +
        +
        +
        + + +
        + + + + + + + + +
        + +
        +
        + + +
        + + + + + + +
        +
        + +

        Featured Post

        +

        + Microsoft becomes maintainer of the Windows version of PyTorch +

        + + + + Read More + + + +
        +
        + +
        +
        +
        +
        + + + + + + + + +
        +
        +

        July 28, 2020

        +

        + Introducing native PyTorch automatic mixed precision for faster training on NVIDIA GPUs +

        +

        Most deep learning frameworks, including PyTorch, train with 32-bit floating point (FP32) arithmetic by default. However this is not essential to achieve full accuracy for many deep learning models. In 2017, NVIDIA researchers developed a methodology for mixed-precision training, which combined single-precision (FP32) with half-precision (e.g. FP16) format when training a network, and achieved the same accuracy as FP32 training using the same hyperparameters, with additional performance benef...
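The native API boils down to an autocast context plus a gradient scaler; a hedged sketch with a toy model follows (on CPU the scaling simply no-ops):

```python
import torch
from torch import nn

device = "cuda" if torch.cuda.is_available() else "cpu"
model = nn.Linear(512, 512).to(device)
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
scaler = torch.cuda.amp.GradScaler(enabled=(device == "cuda"))

for _ in range(10):
    x = torch.randn(64, 512, device=device)
    optimizer.zero_grad()
    with torch.cuda.amp.autocast(enabled=(device == "cuda")):
        loss = model(x).pow(2).mean()   # eligible ops run in FP16 under autocast
    scaler.scale(loss).backward()       # scale the loss to avoid FP16 gradient underflow
    scaler.step(optimizer)
    scaler.update()
```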

        + +
        + + Read More + +
        + + + + +
        +
        +

        May 05, 2020

        +

        + Updates & Improvements to PyTorch Tutorials +

        +

        PyTorch.org provides researchers and developers with documentation, installation instructions, latest news, community projects, tutorials, and more. Today, we are introducing usability and content improvements including tutorials in additional categories, a new recipe format for quickly referencing common topics, sorting using tags, and an updated homepage. + +

        + +
        + + Read More + +
        + + + + +
        +
        +

        April 21, 2020

        +

        + PyTorch library updates including new model serving library +

        +

        Along with the PyTorch 1.5 release, we are announcing new libraries for high-performance PyTorch model serving and tight integration with TorchElastic and Kubernetes. Additionally, we are releasing updated packages for torch_xla (Google Cloud TPUs), torchaudio, torchvision, and torchtext. All of these new libraries and enhanced capabilities are available today and accompany all of the core features released in PyTorch 1.5. + +

        + +
        + + Read More + +
        + + + + +
        +
        +

        April 21, 2020

        +

        + PyTorch 1.5 released, new and updated APIs including C++ frontend API parity with Python +

        +

        Today, we’re announcing the availability of PyTorch 1.5, along with new and updated libraries. This release includes several major new API additions and improvements. PyTorch now includes a significant update to the C++ frontend, ‘channels last’ memory format for computer vision models, and a stable release of the distributed RPC framework used for model-parallel training. The release also has new APIs for autograd for hessians and jacobians, and an API that allows the creation of Custom C++ ...

        + +
        + + Read More + +
        + + + + +
        +
        +

        March 26, 2020

        +

        + Introduction to Quantization on PyTorch +

        +

        It’s important to make efficient use of both server-side and on-device compute resources when developing machine learning applications. To support more efficient deployment on servers and edge devices, PyTorch added a support for model quantization using the familiar eager mode Python API. + +
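One piece of that eager-mode API is post-training dynamic quantization, which is a one-call transform; a minimal sketch with a toy network (shapes are arbitrary):

```python
import torch
from torch import nn

float_model = nn.Sequential(nn.Linear(128, 64), nn.ReLU(), nn.Linear(64, 10))

# Convert Linear weights to int8; activations are quantized dynamically at runtime.
quantized_model = torch.quantization.quantize_dynamic(
    float_model, {nn.Linear}, dtype=torch.qint8
)

x = torch.randn(1, 128)
print(quantized_model(x).shape)   # same interface, smaller and faster Linear ops
```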

        + +
        + + Read More + +
        + + + + +
        +
        +

        January 15, 2020

        +

        + PyTorch 1.4 released, domain libraries updated +

        +

        Today, we’re announcing the availability of PyTorch 1.4, along with updates to the PyTorch domain libraries. These releases build on top of the announcements from NeurIPS 2019, where we shared the availability of PyTorch Elastic, a new classification framework for image and video, and the addition of Preferred Networks to the PyTorch community. For those that attended the workshops at NeurIPS, the content can be found here. + +

        + +
        + + Read More + +
        + + + + +
        +
        +

        December 06, 2019

        +

        + PyTorch adds new tools and libraries, welcomes Preferred Networks to its community +

        +

        PyTorch continues to be used for the latest state-of-the-art research on display at the NeurIPS conference next week, making up nearly 70% of papers that cite a framework. In addition, we’re excited to welcome Preferred Networks, the maintainers of the Chainer framework, to the PyTorch community. Their teams are moving fully over to PyTorch for developing their ML capabilities and services. + +

        + +
        + + Read More + +
        + +
        + +
        + +
        + +
        +
        +
        + +
        +
        +
        +
        + +

diff --git a/blog/32/index.html b/blog/32/index.html
new file mode 100644
index 000000000000..14621617ac2d
--- /dev/null
+++ b/blog/32/index.html
@@ -0,0 +1,992 @@
+ Blog | 32 of 34 | PyTorch
        +
        + Join us at PyTorch Conference in San Francisco, October 22-23. CFP open now! Learn more. +
        +
        +
        +
        + + +
        + + + + + + + + +
        + +
        +
        + + +
        + + + + + + +
        +
        + +

        Featured Post

        +

        + OpenMined and PyTorch partner to launch fellowship funding for privacy-preserving ML community +

        + + + + Read More + + + +
        +
        + +
        +
        +
        +
        + + + + + + + + +
        +
        +

        October 10, 2019

        +

        + PyTorch 1.3 adds mobile, privacy, quantization, and named tensors +

        +

        PyTorch continues to gain momentum because of its focus on meeting the needs of researchers, its streamlined workflow for production use, and most of all because of the enthusiastic support it has received from the AI community. PyTorch citations in papers on ArXiv grew 194 percent in the first half of 2019 alone, as noted by O’Reilly, and the number of contributors to the platform has grown more than 50 percent over the last year, to nearly 1,200. Facebook, Microsoft, Uber, and other organiz...

        + +
        + + Read More + +
        + + + + +
        +
        +

        August 08, 2019

        +

        + New Releases: PyTorch 1.2, torchtext 0.4, torchaudio 0.3, and torchvision 0.4 +

        +

        Since the release of PyTorch 1.0, we’ve seen the community expand to add new tools, contribute to a growing set of models available in the PyTorch Hub, and continually increase usage in both research and production. + +

        + +
        + + Read More + +
        + + + + +
        +
        +

        July 23, 2019

        +

        + Mapillary Research: Seamless Scene Segmentation and In-Place Activated BatchNorm +

        +

        With roads in developed countries like the US changing up to 15% annually, Mapillary addresses a growing demand for keeping maps updated by combining images from any camera into a 3D visualization of the world. Mapillary’s independent and collaborative approach enables anyone to collect, share, and use street-level images for improving maps, developing cities, and advancing the automotive industry. + +

        + +
        + + Read More + +
        + + + + +
        +
        +

        July 18, 2019

        +

        + PyTorch Adds New Ecosystem Projects for Encrypted AI and Quantum Computing, Expands PyTorch Hub +

        +

        The PyTorch ecosystem includes projects, tools, models and libraries from a broad community of researchers in academia and industry, application developers, and ML engineers. The goal of this ecosystem is to support, accelerate, and aid in your exploration with PyTorch and help you push the state of the art, no matter what field you are exploring. Similarly, we are expanding the recently launched PyTorch Hub to further help you discover and reproduce the latest research. + +

        + +
        + + Read More + +
        + + + + +
        +
        +

        June 10, 2019

        +

        + Towards Reproducible Research with PyTorch Hub +

        +

        Reproducibility is an essential requirement for many fields of research including those based on machine learning techniques. However, many machine learning publications are either not reproducible or are difficult to reproduce. With the continued growth in the number of research publications, including tens of thousands of papers now hosted on arXiv and submissions to conferences at an all time high, research reproducibility is more important than ever. While many of these publications are a...

        + +
        + + Read More + +
        + + + + +
        +
        +

        May 22, 2019

        +

        + torchvision 0.3: segmentation, detection models, new datasets and more.. +

        +

        PyTorch domain libraries like torchvision provide convenient access to common datasets and models that can be used to quickly create a state-of-the-art baseline. Moreover, they also provide common abstractions to reduce boilerplate code that users might have to otherwise repeatedly write. The torchvision 0.3 release brings several new features including models for semantic segmentation, object detection, instance segmentation, and person keypoint detection, as well as custom C++ / CUDA ops sp...

        + +
        + + Read More + +
        + + + + +
        +
        +

        May 08, 2019

        +

        + Model Serving in PyTorch +

        +

        PyTorch has seen a lot of adoption in research, but people can get confused about how well PyTorch models can be taken into production. This blog post is meant to clear up any confusion people might have about the road to production in PyTorch. +Usually when people talk about taking a model “to production,” they usually mean performing inference, sometimes called model evaluation or prediction or serving. At the level of a function call, in PyTorch, inference looks something like this: + +
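The excerpt cuts off before the snippet; as a hedged stand-in (not the post's original code, and with random rather than pretrained weights), inference at the level of a function call is roughly:

```python
import torch
import torchvision

model = torchvision.models.resnet18()  # pass pretrained/weights to load trained parameters
model.eval()                           # switch off dropout/batchnorm updates

batch = torch.rand(1, 3, 224, 224)
with torch.no_grad():                  # no autograd bookkeeping during serving
    scores = model(batch)

print(scores.argmax(dim=1))
```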

        + +
        + + Read More + +
        + +
        + +
        + +
        + +
        +
        +
        + +
        +
        +
        +
        + +

diff --git a/blog/33/index.html b/blog/33/index.html
new file mode 100644
index 000000000000..1b2af7d09b40
--- /dev/null
+++ b/blog/33/index.html
@@ -0,0 +1,997 @@
+ Blog | 33 of 34 | PyTorch
        +
        + Join us at PyTorch Conference in San Francisco, October 22-23. CFP open now! Learn more. +
        +
        +
        +
        + + +
        + + + + + + + + +
        + +
        +
        + + +
        + + + + + + +
        +
        + +

        Featured Post

        +

        + Optimizing CUDA Recurrent Neural Networks with TorchScript +

        + + + + Read More + + + +
        +
        + +
        +
        +
        +
        + + + + + + + + +
        +
        +

        May 01, 2019

        +

        + PyTorch adds new dev tools as it hits production scale +

        +

        This is a partial re-post of the original blog post on the Facebook AI Blog. The full post can be viewed here + +

        + +
        + + Read More + +
        + + + + +
        +
        +

        April 29, 2019

        +

        + Stochastic Weight Averaging in PyTorch +

        +

        In this blogpost we describe the recently proposed Stochastic Weight Averaging (SWA) technique [1, 2], and its new implementation in torchcontrib. SWA is a simple procedure that improves generalization in deep learning over Stochastic Gradient Descent (SGD) at no additional cost, and can be used as a drop-in replacement for any other optimizer in PyTorch. SWA has a wide range of applications and features: + +

        + +
        + + Read More + +
        + + + + +
        +
        +

        May 02, 2018

        +

        + The road to 1.0: production ready PyTorch +

        +

        We would like to give you a preview of the roadmap for PyTorch 1.0 , the next release of PyTorch. Over the last year, we’ve had 0.2, 0.3 and 0.4 transform PyTorch from a [Torch+Chainer]-like interface into something cleaner, adding double-backwards, numpy-like functions, advanced indexing and removing Variable boilerplate. At this time, we’re confident that the API is in a reasonable and stable state to confidently release a 1.0. + +

        + +
        + + Read More + +
        + + + + +
        +
        +

        April 22, 2018

        +

        + PyTorch 0.4.0 Migration Guide +

        +

        Welcome to the migration guide for PyTorch 0.4.0. In this release we introduced many exciting new features and critical bug fixes, with the goal of providing users a better and cleaner interface. In this guide, we will cover the most important changes in migrating existing code from previous versions: + +

        + +
        + + Read More + +
        + + + + +
        +
        +

        March 05, 2018

        +

        + Tensor Comprehensions in PyTorch +

        +

        Tensor Comprehensions (TC) is a tool that lowers the barrier for writing high-performance code. It generates GPU code from a simple high-level language and autotunes the code for specific input sizes. + +

        + +
        + + Read More + +
        + + + + +
        +
        +

        January 19, 2018

        +

        + PyTorch, a year in.... +

        +

        Today marks 1 year since PyTorch was released publicly. It’s been a wild ride — our quest to build a flexible deep learning research platform. Over the last year, we’ve seen an amazing community of people using, contributing to and evangelizing PyTorch — thank you for the love. + +

        + +
        + + Read More + +
        + + + + +
        +
        +

        June 27, 2017

        +

        + PyTorch Internals Part II - The Build System +

        +

        In the first post I explained how we generate a torch.Tensor object that you can use in your Python interpreter. Next, I will explore the build system for PyTorch. The PyTorch codebase has a variety of components: + +

        + +
        + + Read More + +
        + +
        + +
        + +
        + +
        +
        +
        + +
        +
        +
        +
        + +

diff --git a/blog/34/index.html b/blog/34/index.html
new file mode 100644
index 000000000000..972cd1030a1e
--- /dev/null
+++ b/blog/34/index.html
@@ -0,0 +1,863 @@
+ Blog | 34 of 34 | PyTorch
        +
        + Join us at PyTorch Conference in San Francisco, October 22-23. CFP open now! Learn more. +
        +
        +
        +
        + + +
        + + + + + + + + +
        + +
        +
        + + +
        + + + + + + +
        +
        + +

        Featured Post

        +

        + A Tour of PyTorch Internals (Part I) +

        + + + + Read More + + + +
        +
        + +
        +
        +
        +
        + + + + + +
        + +
        + +
        + +
        +
        +
        + +
        +
        +
        +
        + +

        + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/4/index.html b/blog/4/index.html new file mode 100644 index 000000000000..e61998f1368f --- /dev/null +++ b/blog/4/index.html @@ -0,0 +1,993 @@ + + + + + + + + + + + + + Blog | 4 of 34 | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
        +
        + Join us at PyTorch Conference in San Francisco, October 22-23. CFP open now! Learn more. +
        +
        +
        +
        + + +
        + + + + + + + + +
        + +
        +
        + + +
        + + + + + + +
        +
        + +

        Featured Post

        +

        + GenAI Acceleration for PyTorch 2.5 on Intel® Xeon® Processors +

        + + + + Read More + + + +
        +
        + +
        +
        +
        +
        + + + + + + + + +
        +
        +

        January 09, 2025

        +

        + Integrating Ascend Backend with Torchtune through PyTorch Multi-Device Support +

        +

        In this blog, we will briefly introduce torchtune, the Ascend backend, and demonstrate how torchtune can be used to fine-tune models with Ascend. + +

        + +
        + + Read More + +
        + + + + +
        +
        +

        January 06, 2025

        +

        + High-Performance Low-Bit Operators for PyTorch +

        +

        We are excited to announce the addition of embedding operators with low-bit weights (1-8 bit) and linear operators with 8-bit dynamically quantized activations and low-bit weights (1-8 bit) for Arm CPUs in TorchAO, PyTorch’s native low-precision library. These operators work seamlessly across all PyTorch surfaces, including eager, torch.compile, AOTI, and ExecuTorch, and are available to use in torchchat. + +

        + +
        + + Read More + +
        + + + + +
        +
        +

        December 23, 2024

        +

        + PyTorch Grows as the Dominant Open Source Framework for AI and ML: 2024 Year in Review +

        +

        This past year was a monumental year for PyTorch from major releases to the flagship PyTorch Conference. We’ve seen incredible growth in contributions from more than 3,500 individuals and 3,000 organizations. It’s safe to say PyTorch has now become the dominant deep learning framework for AI/ML. PyTorch leads the model training space with a 63% adoption rate according to the recent Shaping the Future of Generative AI Report from the Linux Foundation. + +

        + +
        + + Read More + +
        + + + + +
        +
        +

        December 20, 2024

        +

        + Improve RAG performance with torch.compile on AWS Graviton Processors +

        +

        Large Language Models (LLMs) are trained on vast volumes of data and use billions of parameters to support tasks like answering questions, translating languages, and completing sentences. There are a few challenges when working with LLMs such as domain knowledge gaps, factuality issues, and hallucination, which affect their reliability especially for the fields that require high levels of accuracy, such as healthcare, law, or engineering. Retrieval Augmented Generation (RAG) provides a soluti...

        + +
        + + Read More + +
        + + + + +
        +
        +

        December 11, 2024

        +

        + torchcodec: Easy and Efficient Video Decoding for PyTorch +

        +

        We are pleased to officially announce torchcodec, a library for decoding videos into PyTorch tensors. It is fast, accurate, and easy to use. When running PyTorch models on videos, torchcodec is our recommended way to turn those videos into data your model can use. + +

        + +
        + + Read More + +
        + + + + +
        +
        +

        December 06, 2024

        +

        + Accelerating 2D Dynamic Block Quantized Float8 GEMMs in Triton +

        +

        2D block quantization for Float8 (FP8) holds the promise of improving the accuracy of Float8 quantization while also accelerating GEMM’s for both inference and training. In this blog, we showcase advances using Triton for the two main phases involved in doing block quantized Float8 GEMMs. + +

        + +
        + + Read More + +
        + + + + +
        +
        +

        December 02, 2024

        +

        + HadaCore: Tensor Core Accelerated Hadamard Transform Kernel +

        +

        Quantization is a method for improving model inference speeds by compressing model weights and performing (faster) computation in lower precision data types. However, quantization can result in accuracy loss due to the presence of outliers.

        + +
        + + Read More + +
        + +
        + +
        + +
        + +
        +
        +
        + +
        +
        +
        +
        + +

        +
        +
        +
        + +
        + +
        + +
        +
        +
        +
        + + +
        +
        +
        + + +
        + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/5/index.html b/blog/5/index.html new file mode 100644 index 000000000000..d6517d93c52d --- /dev/null +++ b/blog/5/index.html @@ -0,0 +1,991 @@ + + + + + + + + + + + + + Blog | 5 of 34 | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
        +
        + Join us at PyTorch Conference in San Francisco, October 22-23. CFP open now! Learn more. +
        +
        +
        +
        + + +
        + + + + + + + + +
        + +
        +
        + + +
        + + + + + + +
        +
        + +

        Featured Post

        +

        + Supercharging Training using float8 and FSDP2 +

        + + + + Read More + + + +
        +
        + +
        +
        +
        +
        + + + + + + + + +
        +
        +

        November 21, 2024

        +

        + Rebellions Joins the PyTorch Foundation as a General Member +

        +

        The PyTorch Foundation, a neutral home for the deep learning community to collaborate on the open source PyTorch framework and ecosystem, is announcing today that Rebellions has joined as a general member.

        + +
        + + Read More + +
        + + + + +
        +
        +

        November 18, 2024

        +

        + Distilling Llama3.1 8B into 1B in torchtune +

        +

        In this blog, we present a case study on distilling a Llama 3.1 8B model into Llama 3.2 1B using torchtune’s knowledge distillation recipe. We demonstrate how knowledge distillation (KD) can be used in post-training to improve instruction-following task performance and showcase how users can leverage the recipe. + +

        + +
        + + Read More + +
        + + + + +
        +
        +

        November 01, 2024

        +

        + Deep Dive on CUTLASS Ping-Pong GEMM Kernel +

        +

        In this post, we provide an overview, with relevant FP8 inference kernel benchmarking, of the CUTLASS Ping-Pong GEMM kernel.

        + +
        + + Read More + +
        + + + + +
        +
        +

        October 31, 2024

        +

        + Deploying LLMs with TorchServe + vLLM +

        +

        The vLLM engine is currently one of the top-performing ways to execute large language models (LLM). It provides the vllm serve command as an easy option to deploy a model on a single machine. While this is convenient, to serve these LLMs in production and at scale some advanced features are necessary. + +

        + +
        + + Read More + +
        + + + + +
        +
        +

        October 30, 2024

        +

        + Triton Kernel Compilation Stages +

        +

        The Triton open-source programming language and compiler offers a high-level, python-based approach to create efficient GPU code. In this blog, we highlight the underlying details of how a triton program is compiled and the intermediate representations. For an introduction to Triton, we refer readers to this blog. + +

        + +
        + + Read More + +
        + + + + +
        +
        +

        October 28, 2024

        +

        + Unleashing the Power of AI on Mobile: LLM Inference for Llama 3.2 Quantized Models with ExecuTorch and KleidiAI +

        +

        At the recent PyTorch Conference, Arm highlighted the widespread impact of its technology, spanning from cloud to edge, emphasizing its commitment to delivering its advanced AI computing capabilities seamlessly to millions of developers worldwide.

        + +
        + + Read More + +
        + + + + +
        +
        +

        October 28, 2024

        +

        + Getting started with PyTorch, ExecuTorch, and Ethos-U85 in three easy steps +

        +

        ExecuTorch support for Ethos-U85 + +

        + +
        + + Read More + +
        + +
        + +
        + +
        + +
        +
        +
        + +
        +
        +
        +
        + +

        +
        +
        +
        + +
        + +
        + +
        +
        +
        +
        + + +
        +
        +
        + + +
        + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/6/index.html b/blog/6/index.html new file mode 100644 index 000000000000..e7a393e27ff7 --- /dev/null +++ b/blog/6/index.html @@ -0,0 +1,998 @@ + + + + + + + + + + + + + Blog | 6 of 34 | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
        +
        + Join us at PyTorch Conference in San Francisco, October 22-23. CFP open now! Learn more. +
        +
        +
        +
        + + +
        + + + + + + + + +
        + +
        +
        + + +
        + + + + + + +
        +
        + +

        Featured Post

        +

        + Intel GPU Support Now Available in PyTorch 2.5 +

        + + + + Read More + + + +
        +
        + +
        +
        +
        +
        + + + + + + + + +
        +
        +

        October 24, 2024

        +

        + ExecuTorch Beta: On-Device AI and LLMs, Stability, and Acceleration with Partners +

        +

        + ExecuTorch has achieved Beta status with the release of v0.4, providing stable APIs and runtime, as well as extensive kernel coverage. + ExecuTorch is the recommended on-device inference engine for Llama 3.2 1B/3B models, offering enhanced performance and memory efficiency for both original and quantized models. + There has been a significant increase in adoption and ecosystem growth for ExecuTorch, and the focus is now on improving reliability, performance, and coverage for non-CPU backen...

        + +
        + + Read More + +
        + + + + +
        +
        +

        October 23, 2024

        +

        + TorchRec and FBGEMM 1.0 Stable Release +

        +

        We are happy to announce the stable release, 1.0, for TorchRec and FBGEMM. TorchRec is the PyTorch native recommendation systems library, powered by FBGEMM’s (Facebook GEneral Matrix Multiplication) efficient, low-level kernels. + +

        + +
        + + Read More + +
        + + + + +
        +
        +

        October 17, 2024

        +

        + PyTorch 2.5 Release Blog +

        +

        We are excited to announce the release of PyTorch® 2.5 (release note)! This release features a new cuDNN backend for SDPA, enabling speedups by default for users of SDPA on H100s or newer GPUs. As well, regional compilation of torch.compile offers a way to reduce the cold start up time for torch.compile by allowing users to compile a repeated nn.Module (e.g. a transformer layer in LLM) without recompilations. Finally, TorchInductor CPP backend offers solid performance speedup with numerous en...

        + +
        + + Read More + +
        + + + + +
        +
        +

        October 15, 2024

        +

        + The Path to Achieve PyTorch Performance Boost on Windows CPU +

        +

        The challenge of PyTorch’s lower CPU performance on Windows compared to Linux has been a significant issue. There are multiple factors leading to this performance disparity. Through our investigation, we’ve identified several reasons for poor CPU performance on Windows, two primary issues have been pinpointed: the inefficiency of the Windows default malloc memory allocator and the absence of SIMD for vectorization optimizations on the Windows platform. In this article, we show how PyTorch CPU...

        + +
        + + Read More + +
        + + + + +
        +
        +

        October 08, 2024

        +

        + PyTorch Foundation Technical Advisory Council Elects New Leadership +

        +

        We are pleased to announce the first-ever Chair and Vice Chair of the PyTorch Foundation’s Technical Advisory Council (TAC): Luca Antiga as the Chair and Jiong Gong as Vice Chair. Both leaders bring extensive experience and deep commitment to the PyTorch community, and they are set to guide the TAC in its mission to foster an open, diverse, and innovative PyTorch technical community. + +Meet the New Leadership + + + +Luca Antiga is the CTO at Lightning AI since 2022. He is an early contributor to P...

        + +
        + + Read More + +
        + + + + +
        +
        +

        October 02, 2024

        +

        + PyTorch Conference 2024 Recap: On Fire 🔥 +

        +

        + +The 2024 PyTorch Conference in San Francisco gathered nearly 1,500 AI researchers, developers, and enthusiasts. Over two days, the event featured engaging discussions, insightful keynotes, and hands-on sessions focused on artificial intelligence (AI) and advancements in PyTorch, the leading open-source machine learning framework. Attendees delved into the future of generative AI, Large Language Models (LLMs), and the crucial role open-source technology plays in driving AI innovation. Here’s...

        + +
        + + Read More + +
        + + + + +
        +
        +

        September 26, 2024

        +

        + PyTorch Native Architecture Optimization: torchao +

        +

        We’re happy to officially launch torchao, a PyTorch native library that makes models faster and smaller by leveraging low bit dtypes, quantization and sparsity. torchao is an accessible toolkit of techniques written (mostly) in easy to read PyTorch code spanning both inference and training. This blog will help you pick which techniques matter for your workloads. + +We benchmarked our techniques on popular GenAI models like LLama 3 and Diffusion models and saw minimal drops in accuracy. Unless o...

        + +
        + + Read More + +
        + +
        + +
        + +
        + +
        +
        +
        + +
        +
        +
        +
        + +

        +
        +
        +
        + +
        + +
        + +
        +
        +
        +
        + + +
        +
        +
        + + +
        + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/6x-faster-async-checkpointing/index.html b/blog/6x-faster-async-checkpointing/index.html new file mode 100644 index 000000000000..608d029a9f64 --- /dev/null +++ b/blog/6x-faster-async-checkpointing/index.html @@ -0,0 +1,746 @@ + + + + + + + + + + + + + 6x faster Async Checkpointing in PyTorch, using Cached Plans, no GIL contention | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
        +
        +
        + Join us at PyTorch Conference in San Francisco, October 22-23. CFP open now! Learn more. +
        +
        + +
        +
        + + +
        + + + + + + + + +
        + +
        +
        + + + +
        +
        +
        + +
        +

        + by + + Meta and Crusoe + +

        +

        Meta: Less Wright, Meet Vadakkanchery, Saurabh Mishra, Ela Krepska, Hamid Shojanazeri, Pradeep Fernando
        +Crusoe: Ethan Petersen, Martin Cala, Chip Smith

        + +

        PyTorch DCP (Distributed Checkpointing) has recently enabled new optimizations in asynchronous checkpointing to reduce GPU utilization drop by minimizing collective overhead and improving overall checkpointing efficiency.

        + +

        Using Crusoe’s 2K H200 cluster to train a Llama3-70B model with TorchTitan, we verified that these new features deliver substantial speedups at 1856-GPU scale, reducing the background processing time for async DCP checkpoints from ~436 seconds to ~67 seconds.

        + +

        This is roughly a 6.5x reduction in background checkpoint processing time, allowing more of the total training time to proceed at full training throughput.

        + +

        chart

        + +

        Fig 1: 1856-GPU training run with high-frequency checkpointing. The first checkpoint (the drop in tps) does not have a cached save plan, and its background processing takes far longer than the rest, where the cached plan is used.

        + +

        Background: What is Asynchronous Checkpointing?

        + +

        In a standard checkpointing workflow, GPUs are blocked while the checkpointing data is offloaded from GPU to CPU and then written to storage. After the save to physical media is complete, training can resume.

        + +

        Asynchronous checkpointing greatly reduces this downtime by enabling the actual saving to storage to be done via CPU threads, allowing GPU-based training to continue while the checkpoint data is persisted in parallel. It is used primarily for intermediate/fault-tolerant checkpoints, as it unblocks the GPUs much faster than synchronous checkpoints.
        For example, in our large-scale experiment, GPU training was blocked for less than a second (0.78 seconds at 1856-GPU scale) while checkpoint data was moved from GPU to CPU (staging). At that point, GPU training immediately continues, which is a substantial training-time improvement over traditional checkpointing. For reference, Async Checkpointing is covered in more detail here.
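        To make the workflow concrete, below is a minimal sketch of thread-based asynchronous saving with DCP. It assumes a distributed process group is already initialized and that model and optimizer are the objects being trained; the exact TorchTitan integration differs, and the helper name save_checkpoint_async is just for illustration.

import torch.distributed.checkpoint as dcp
from torch.distributed.checkpoint.state_dict import get_state_dict

def save_checkpoint_async(model, optimizer, step):
    # Gather DCP-friendly state dicts for the model and optimizer.
    model_sd, optim_sd = get_state_dict(model, optimizer)
    state_dict = {"model": model_sd, "optim": optim_sd}
    # async_save blocks only for staging (the GPU -> CPU copy); the write to
    # storage then proceeds in the background while training continues.
    future = dcp.async_save(state_dict, checkpoint_id=f"checkpoints/step-{step}")
    # Optionally call future.result() before the next save to confirm the
    # previous checkpoint finished persisting.
    return future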

        + +

        Challenges with Asynchronous Checkpointing

        + +

        However, the background processing inherent in Asynchronous Checkpointing has additional challenges that result in a temporary reduction of training throughput while the storage phase is being completed. These are highlighted below.

        + +

        GPU utilization drop from GIL contention:

        + +

        The Global Interpreter Lock (GIL) in Python is a mechanism that prevents multiple native threads from executing Python bytecode at the same time. This lock is necessary mainly because CPython’s memory management is not thread-safe.

        + +

        DCP currently uses background threads for metadata collectives and uploading to storage. Although these expensive steps are done asynchronously, they contend for the GIL with the trainer threads. This causes GPU utilization (QPS) to suffer significantly and also increases the e2e upload latency. For large-scale checkpoints, the overhead of this CPU-side processing suppresses net GPU training speed, since the CPUs also drive the training process via GPU kernel launches.

        + +

        Please refer to the following figure from our experiments:

        + +

        chart

        + +

        Fig 2: One can see a sustained drop in training QPS even after staging (i.e. blocking operation to trainer) is complete.

        + +

        The first dip in Figure 2 (marked by the purple line) indicates that staging is complete, and training can continue. However, a second drop is evident (marked by the area between the purple and yellow lines) which is due to trainer thread and checkpointing threads contending for the Python GIL, leading to degraded training QPS until the checkpoint thread completes execution.

        + +

        Collective communications cost:

        + +

        DCP performs multiple collectives today for various reasons: dedupe, global metadata for the checkpoint, resharding, and distributed exception handling. Collectives are costly as these require network I/O and pickling/unpickling of the large metadata being sent across the GPU network. These collectives become extremely expensive as the job scale grows, leading to significantly higher e2e latency and potential for collective timeouts.

        + +

        Solutions

        + +

        Process based async checkpointing

        + +

        DCP now supports async checkpoint save via a background process. This helps avoid the training QPS drop by eliminating the Python GIL contention with the trainer threads. Please see Fig 2 for checkpointing via threads and Fig 3 for checkpointing via a background process.

        + +

        Caching of the save plans

        + +

        DCP has a clear boundary between the planning and storage I/O steps. SavePlanner in DCP is a stateful component which acts as an access proxy to the state_dict. Planner manages save plans prepared by individual ranks, which carry metadata information necessary to do the write I/O. The planning step involves a collective operation to gather a comprehensive view of the checkpoint on the coordinator rank. The coordinator rank is responsible for de-duplicating parameters/weights to eliminate redundancies, validating the global plan to ensure accuracy and consistency, and creating the global metadata structs. This is followed by a scatter collective where the coordinator rank assigns I/O tasks to each rank. Any transformations done on the plans affect how the storage components finally write the data.

        + +

        During the course of a training job, multiple checkpoints are saved. In the majority of these cases, only the checkpoint data changes between different save instances, and thus, the plan remains the same. This presented an opportunity for us to cache the plans, pay the planning cost only on the first save, and then amortize that cost across all the subsequent attempts. Only the updated plans (plans which changed in the next attempt) are sent via collective, thus reducing the collective overhead significantly.

        + +

        Experiment Results

        + +

        Set up: 1856 H200 GPUs, Llama3-70B, HSDP2 with TorchTitan

        + +

        After deploying both the solutions above, the following are the key results:

        + +
          +
        • TPS drop has significantly narrowed, with a peak dip to 372 tps versus 315 tps previously, and for a greatly reduced time window (~67 seconds vs ~437 seconds). This time window is now mostly attributed to the blocking for CPU processing.
        • +
        • Subsequent checkpoint save attempts also continue to be much faster due to very low overhead at the planning stage. E2E latency is thus improved by over 6.5x. This will allow our partners to increase the checkpointing frequency and reduce the lost training progress (i.e. wasted training time).
        • +
        + +

        If you look at the very first downspike in Figure 1, this drawdown takes training throughput from 700 down to 320 tps and suppresses it for roughly 7 minutes (467 seconds). Once the CPUs have finished processing, training continues again at full speed.

        + +

        Previously, this ~7 minute suppression would be repeated at every checkpoint. However, with the new process-based checkpointing feature, only the first checkpoint has the full drawdown time (mainly due to overhead from daemon process initialization), as all future checkpoints are executed via the background process, mitigating GIL contention with the trainer threads.

        + +

        This is visually shown in all the subsequent checkpoints where the average MFU suppression time drops to just over a minute, reflected by the sharp spikes that almost immediately revert to full MFU throughput.

        + +

        chart

        + +

        Fig 3: The red box shows the non-cached plan checkpoint, which also includes Checkpoint Background Init process overhead, while the purple box highlights the first checkpoint to run with the cached plan.

        + +

        This means that even large-scale checkpointing, such as shown in Fig 2 at 1856 GPU scale, can be done with ~6x reduced training throughput impact. This enables Asynchronous DCP checkpointing to be run more frequently (thus better rollback protection) while enhancing total training throughput relative to previous Async Checkpointing overhead.

        + +

        Using DCP’s cached checkpointing:

        + +

        These features are already available as part of the PyTorch nightly builds, and you can test out PyTorch’s Asynchronous DCP checkpointing directly in TorchTitan. Following are the instructions to enable them (a combined sketch follows the list):

        + +
          +
        • Process-based asynchronous checkpointing: +
            +
          • Set the async_checkpointer_type to AsyncCheckpointerType.PROCESS in the async_save API. (file: pytorch/torch/distributed/checkpoint/state_dict_saver.py)
          • +
          +
        • +
        • Save plan caching: +
            +
          • Set the enable_plan_caching flag to true in the DefaultSavePlanner. (file: pytorch/torch/distributed/checkpoint/default_planner.py)
          • +
          +
        • +
        + +
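        The sketch below combines both settings, using the parameter names from the file references above. It is an illustrative sketch against the nightly API rather than a verbatim TorchTitan snippet, and it assumes state_dict has been prepared as in the earlier example.

import torch.distributed.checkpoint as dcp
from torch.distributed.checkpoint.default_planner import DefaultSavePlanner
from torch.distributed.checkpoint.state_dict_saver import AsyncCheckpointerType

# Reuse the same planner across saves so the plan cached on the first
# checkpoint is amortized over all subsequent ones.
planner = DefaultSavePlanner(enable_plan_caching=True)

# state_dict: the model/optimizer state prepared as in the earlier sketch.
future = dcp.async_save(
    state_dict,
    checkpoint_id="checkpoints/step-1000",
    planner=planner,
    # Run the upload in a background process instead of background threads,
    # avoiding GIL contention with the trainer threads.
    async_checkpointer_type=AsyncCheckpointerType.PROCESS,
)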

        Future work

        + +

        DCP will be rolling out additional optimizations to further reduce checkpointing cost. Currently, even though the save plans are cached, the coordinator rank still prepares the metadata. For larger jobs and models with many tensors, this overhead is non-trivial. In the next iteration, DCP will eliminate the metadata overhead and further improve e2e latency. DCP will also introduce additional optimizations, such as zero-overhead checkpointing, to enable efficient checkpointing in large-scale jobs.

        + +

        Stay tuned!

        + +
        +
        +
        +
        + + +
        +
        +
        +
        +

        Docs

        +

        Access comprehensive developer documentation for PyTorch

        + View Docs +
        + +
        +

        Tutorials

        +

        Get in-depth tutorials for beginners and advanced developers

        + View Tutorials +
        + +
        +

        Resources

        +

        Find development resources and get your questions answered

        + View Resources +
        +
        +
        +
        + +
        + +
        + +
        +
        +
        +
        + + +
        +
        +
        + + +
        + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/7/index.html b/blog/7/index.html new file mode 100644 index 000000000000..3ac8d9817833 --- /dev/null +++ b/blog/7/index.html @@ -0,0 +1,998 @@ + + + + + + + + + + + + + Blog | 7 of 34 | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
        +
        + Join us at PyTorch Conference in San Francisco, October 22-23. CFP open now! Learn more. +
        +
        +
        +
        + + +
        + + + + + + + + +
        + +
        +
        + + +
        + + + + + + + + +
        +
        +
        +
        + + + + + + + + +
        +
        +

        September 12, 2024

        +

        + Arm Joins the PyTorch Foundation as a Premier Member +

        +

        The PyTorch Foundation, a neutral home for the deep learning community to collaborate on the open source PyTorch framework and ecosystem, is announcing today that Arm has joined as a premier member. + +

        + +
        + + Read More + +
        + + + + +
        +
        +

        September 04, 2024

        +

        + CUDA-Free Inference for LLMs +

        +

        In this blog, we discuss the methods we used to achieve FP16 inference with popular LLM models such as Meta’s Llama3-8B and IBM’s Granite-8B Code, where 100% of the computation is performed using OpenAI’s Triton Language. +For single token generation times using our Triton kernel based models, we were able to approach 0.76-0.78x performance relative to the CUDA kernel dominant workflows for both Llama and Granite on Nvidia H100 GPUs, and 0.62-0.82x on Nvidia A100 GPUs. + +Why explore using 100%...

        + +
        + + Read More + +
        + + + + +
        +
        +

        August 29, 2024

        +

        + Accelerate Your AI: PyTorch 2.4 Now Supports Intel GPUs for Faster Workloads +

        +

        We have exciting news! PyTorch 2.4 now supports Intel® Data Center GPU Max Series and the SYCL software stack, making it easier to speed up your AI workflows for both training and inference. This update allows for you to have a consistent programming experience with minimal coding effort and extends PyTorch’s device and runtime capabilities, including device, stream, event, generator, allocator, and guard, to seamlessly support streaming devices. This enhancement simplifies deploying PyTorch ...

        + +
        + + Read More + +
        + + + + +
        +
        +

        August 20, 2024

        +

        + Enabling Fast Gradient Clipping and Ghost Clipping in Opacus +

        +

        Introduction and Context + +

        + +
        + + Read More + +
        + + + + + + + + + +
        +
        +

        July 30, 2024

        +

        + Introducing torchchat: Accelerating Local LLM Inference on Laptop, Desktop and Mobile +

        +

        Today, we’re releasing torchchat, a library showcasing how to seamlessly and performantly run Llama 3, 3.1, and other large language models across laptop, desktop, and mobile. + +

        + +
        + + Read More + +
        + + + + +
        +
        +

        July 30, 2024

        +

        + Quantization-Aware Training for Large Language Models with PyTorch +

        +

        In this blog, we present an end-to-end Quantization-Aware Training (QAT) flow for large language models in PyTorch. We demonstrate how QAT in PyTorch can recover up to 96% of the accuracy degradation on hellaswag and 68% of the perplexity degradation on wikitext for Llama3 compared to post-training quantization (PTQ). We present the QAT APIs in torchao and showcase how users can leverage them for fine-tuning in torchtune. + +

        + +
        + + Read More + +
        + +
        + +
        + +
        + +
        +
        +
        + +
        +
        +
        +
        + +

        +
        +
        +
        + +
        + +
        + +
        +
        +
        +
        + + +
        +
        +
        + + +
        + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/8/index.html b/blog/8/index.html new file mode 100644 index 000000000000..51e4c2782883 --- /dev/null +++ b/blog/8/index.html @@ -0,0 +1,993 @@ + + + + + + + + + + + + + Blog | 8 of 34 | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
        +
        + Join us at PyTorch Conference in San Francisco, October 22-23. CFP open now! Learn more. +
        +
        +
        +
        + + +
        + + + + + + + + +
        + +
        +
        + + +
        + + + + + + +
        +
        + +

        Featured Post

        +

        + PyTorch 2.4 Release Blog +

        + + + + Read More + + + +
        +
        + +
        +
        +
        +
        + + + + + + + + +
        +
        +

        July 22, 2024

        +

        + Deep Dive on the Hopper TMA Unit for FP8 GEMMs +

        +

        Abstract + +

        + +
        + + Read More + +
        + + + + +
        +
        +

        July 11, 2024

        +

        + FlashAttention-3: Fast and Accurate Attention with Asynchrony and Low-precision +

        +

        Attention, as a core layer of the ubiquitous Transformer architecture, is a bottleneck for large language models and long-context applications. FlashAttention (and FlashAttention-2) pioneered an approach to speed up attention on GPUs by minimizing memory reads/writes, and is now used by most libraries to accelerate Transformer training and inference. This has contributed to a massive increase in LLM context length in the last two years, from 2-4K (GPT-3, OPT) to 128K (GPT-4), or even 1M (Llam...

        + +
        + + Read More + +
        + + + + +
        +
        +

        July 10, 2024

        +

        + Learn how to develop Android applications with ExecuTorch and Llama models +

        +

        This blog is courtesy of the PyTorch team at Arm. More details can be found here. + +

        + +
        + + Read More + +
        + + + + + + + + + +
        +
        +

        July 03, 2024

        +

        + Announcing Hacker Cup AI Track at NeurIPS 2024 +

        +

        The PyTorch team in partnership with Meta Hacker Cup, and Microsoft Research, are excited to announce the Hacker Cup AI Track at NeurIPS 2024. This will be the first AI track for the popular Meta Hacker Cup programming competition designed to assess the capabilities of Generative AI in performing autonomous code generation tasks. We aim to test the limits of AI in complex coding challenges and measure the performance gap between AI systems and human programmers. We will provide access to all ...

        + +
        + + Read More + +
        + + + + +
        +
        +

        June 25, 2024

        +

        + Powering the AI Revolution: The PyTorch Documentary +

        +

        Now live: The official PyTorch Documentary! This film unveils the authentic narrative of PyTorch’s inception, attributing its existence to a dedicated group of unsung heroes driving technological innovation. + +

        + +
        + + Read More + +
        + + + + +
        +
        +

        June 23, 2024

        +

        + Training MoEs at Scale with PyTorch +

        +

        Over the past year, Mixture of Experts (MoE) models have surged in popularity, fueled by powerful open-source models like DBRX, Mixtral, DeepSeek, and many more. At Databricks, we’ve worked closely with the PyTorch team to scale training of MoE models. In this blog post, we’ll talk about how we scale to over three thousand GPUs using PyTorch Distributed and MegaBlocks, an efficient open-source MoE implementation in PyTorch. + +

        + +
        + + Read More + +
        + +
        + +
        + +
        + +
        +
        +
        + +
        +
        +
        +
        + +

        +
        +
        +
        + +
        + +
        + +
        +
        +
        +
        + + +
        +
        +
        + + +
        + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/9/index.html b/blog/9/index.html new file mode 100644 index 000000000000..c392e3e592f4 --- /dev/null +++ b/blog/9/index.html @@ -0,0 +1,998 @@ + + + + + + + + + + + + + Blog | 9 of 34 | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
        +
        + Join us at PyTorch Conference in San Francisco, October 22-23. CFP open now! Learn more. +
        +
        +
        +
        + + +
        + + + + + + + + +
        + +
        +
        + + +
        + + + + + + +
        +
        + +

        Featured Post

        +

        + 🎉 PyTorch Docathon H1 2024 Wrap-up 🎉 +

        + + + + Read More + + + +
        +
        + +
        +
        +
        +
        + + + + + + + + +
        +
        +

        June 20, 2024

        +

        + Accelerating Neural Network Training with Semi-Structured (2:4) Sparsity +

        +

        Over the past year, we’ve added support for semi-structured (2:4) sparsity into PyTorch. With just a few lines of code, we were able to show a 10% end-to-end inference speedup on segment-anything by replacing dense matrix multiplications with sparse matrix multiplications. + +

        + +
        + + Read More + +
        + + + + +
        +
        +

        June 12, 2024

        +

        + Reducing Model Checkpointing Times by Over 10x with PyTorch Distributed Asynchronous Checkpointing +

        +

        Summary: With PyTorch distributed’s new asynchronous checkpointing feature, developed with feedback from IBM, we show how IBM Research Team is able to implement and reduce effective checkpointing time by a factor of 10-20x. Example: 7B model ‘down time’ for a checkpoint goes from an average of 148.8 seconds to 6.3 seconds, or 23.62x faster. + +

        + +
        + + Read More + +
        + + + + +
        +
        +

        June 11, 2024

        +

        + PyTorch Foundation Welcomes New Executive Director +

        +

        +The PyTorch Foundation is excited to welcome Matt White, our new executive director. The PyTorch Foundation formed in 2022 with the goal to drive adoption of AI tooling by fostering and sustaining an ecosystem of open source, vendor-neutral projects with PyTorch. Over the past 2 years, we’ve seen excellent growth across the project – with both contributor and member growth. + +

        + +
        + + Read More + +
        + + + + +
        +
        +

        June 06, 2024

        +

        + INT4 Decoding GQA CUDA Optimizations for LLM Inference +

        +

        An efficient decoding Grouped-Query Attention with low-precision KV cache + +

        + +
        + + Read More + +
        + + + + +
        +
        +

        June 04, 2024

        +

        + Ready, Set, Contribute: PyTorch Docathon Kickoff H1 2024 +

        +

        The PyTorch Docathon is now live! This event is dedicated to enhancing the quality of the PyTorch documentation with the invaluable assistance of our community. Our hope with this Docathon is to simplify the process for new users to get started with PyTorch, guide them in effectively utilizing its features, and ultimately expedite the transition from research to production in machine learning. + +

        + +
        + + Read More + +
        + + + + +
        +
        +

        May 21, 2024

        +

        + Maximizing Training Throughput Using PyTorch FSDP and Torch.compile +

        +

        Recently, we demonstrated how FSDP and selective activation checkpointing can be used to achieve 57% MFU (Model Flops Utilization) for training a 7B model on A100 GPUs. We also demonstrated how it can train a high quality model, which we open sourced as Granite 7B base model on Hugging Face Hub under the Apache v2.0 license. + +

        + +
        + + Read More + +
        + + + + +
        +
        +

        May 15, 2024

        +

        + Achieving Sustainability Goals with PyTorch and Intel AI +

        +

        This post was contributed by Intel AI in partnership with the PyTorch Foundation. + +

        + +
        + + Read More + +
        + +
        + +
        + +
        + +
        +
        +
        + +
        +
        +
        +
        + +

        Install PyTorch

        + +

        Select your preferences and run the install command. Stable represents the most currently tested and supported version of PyTorch. This should be suitable for many users. Preview is available if you want the latest, not fully tested and supported, builds that are generated nightly. Please ensure that you have met the prerequisites below (e.g., numpy), depending on your package manager. You can also install previous versions of PyTorch. Note that LibTorch is only available for C++. +

        + +

        NOTE: Latest PyTorch requires Python 3.9 or later.

        + +
        +
        +
        +
        PyTorch Build
        +
        +
        +
        Your OS
        +
        +
        +
        Package
        +
        +
        +
        Language
        +
        +
        +
        Compute Platform
        +
        +
        +
        Run this Command:
        +
        +
        + +
        +
        +
        +
        PyTorch Build
        +
        +
        +
        Stable (1.13.0)
        +
        +
        +
        Preview (Nightly)
        +
        +
        +
        +
        +
        Your OS
        +
        +
        +
        Linux
        +
        +
        +
        Mac
        +
        +
        +
        Windows
        +
        +
        +
        +
        +
        Package
        +
        +
        +
        Pip
        +
        +
        +
        LibTorch
        +
        +
        +
        Source
        +
        +
        +
        +
        +
        Language
        +
        +
        +
        Python
        +
        +
        +
        C++ / Java
        +
        +
        +
        +
        +
        Compute Platform
        +
        +
        +
        CUDA 11.8
        +
        +
        +
        CUDA 12.1
        +
        +
        +
        CUDA 12.4
        +
        +
        +
        ROCm 5.2
        +
        +
        +
        CPU
        +
        +
        +
        +
        +
        Run this Command:
        +
        +
        +
        pip install torch torchvision
        +
        +
        +
        +
        +
        + + + + Previous versions of PyTorch + +
        + +
        +

        Quick Start With
        Cloud Partners

        + +

        Get up and running with PyTorch quickly through popular cloud platforms and machine learning services.

        + +
        + + +
        +
        +
        + Google Cloud Platform +
        + + + + + +
        +
        + +
        +
        +
        +

        Microsoft Azure

        +
        + + +
        +
        + +
        +
        +
        + Lightning Studios +
        + +
        +
        +
        + +
        +
        +
        +
        + + + + +
        +
        +
        +
        +

        Docs

        +

        Access comprehensive developer documentation for PyTorch

        + View Docs +
        + +
        +

        Tutorials

        +

        Get in-depth tutorials for beginners and advanced developers

        + View Tutorials +
        + +
        +

        Resources

        +

        Find development resources and get your questions answered

        + View Resources +
        +
        +
        +
        + +
        + +
        + +
        +
        +
        +
        + + +
        +
        +
        + + +
        + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/Accelerating-Hugging-Face-and-TIMM-models/index.html b/blog/Accelerating-Hugging-Face-and-TIMM-models/index.html new file mode 100644 index 000000000000..4b27f246f81e --- /dev/null +++ b/blog/Accelerating-Hugging-Face-and-TIMM-models/index.html @@ -0,0 +1,809 @@ + + + + + + + + + + + + + Accelerating Hugging Face and TIMM models with PyTorch 2.0 | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
        +
        +
        + Join us at PyTorch Conference in San Francisco, October 22-23. CFP open now! Learn more. +
        +
        + +
        +
        + + +
        + + + + + + + + +
        + +
        +
        + + + +
        +
        +
        + +
        +

        + by + + Mark Saroufim + +

        +

        torch.compile() makes it easy to experiment with different compiler backends to make PyTorch code faster with a single line decorator, torch.compile(). It works directly over an nn.Module as a drop-in replacement for torch.jit.script(), but without requiring you to make any source code changes. We expect this one line code change to provide you with between 30%-2x training time speedups on the vast majority of models that you’re already running.

        + +
        
        +opt_module = torch.compile(module)
        +
        +
        + +

        torch.compile supports arbitrary PyTorch code, control flow, mutation and comes with experimental support for dynamic shapes. We’re so excited about this development that we call it PyTorch 2.0.

        + +

        What makes this announcement different for us is we’ve already benchmarked some of the most popular open source PyTorch models and gotten substantial speedups ranging from 30% to 2x https://github.com/pytorch/torchdynamo/issues/681.

        + +

        There are no tricks here: we pip installed popular libraries like https://github.com/huggingface/transformers, https://github.com/huggingface/accelerate and https://github.com/rwightman/pytorch-image-models, ran torch.compile() on them, and that’s it.

        + +

        It’s rare to get both performance and convenience, but this is why the core team finds PyTorch 2.0 so exciting. The Hugging Face team is also excited, in their words:

        + +

        Ross Wightman the primary maintainer of TIMM: “PT 2.0 works out of the box with majority of timm models for inference and train workloads and no code changes”

        + +

        Sylvain Gugger, the primary maintainer of transformers and accelerate: “With just one line of code to add, PyTorch 2.0 gives a speedup between 1.5x and 2x in training Transformers models. This is the most exciting thing since mixed precision training was introduced!”

        + +

        This tutorial will show you exactly how to replicate those speedups so you can be as excited about PyTorch 2.0 as we are.

        + +

        Requirements and Setup

        + +

        For GPU (newer generation GPUs will see drastically better performance)

        + +
        pip3 install numpy --pre torch --force-reinstall --extra-index-url https://download.pytorch.org/whl/nightly/cu117
        +
        +
        + +

        For CPU

        + +
        pip3 install --pre torch --extra-index-url https://download.pytorch.org/whl/nightly/cpu
        +
        +
        + +

        Optional: Verify Installation

        + +
        git clone https://github.com/pytorch/pytorch
        +cd pytorch/tools/dynamo
        +python verify_dynamo.py
        +
        + +

        Optional: Docker installation

        + +

        We also provide all the required dependencies in the PyTorch nightly binaries which you can download with

        + +
        docker pull ghcr.io/pytorch/pytorch-nightly
        +
        +
        + +

        And for ad hoc experiments just make sure that your container has access to all your GPUs

        + +
        docker run --gpus all -it ghcr.io/pytorch/pytorch-nightly:latest /bin/bash
        +
        +
        + +

        Getting started

        + +

        a toy example

        + +

        Let’s start with a simple example and make things more complicated step +by step. Please note that you’re likely to see more significant speedups the newer your GPU is.

        + +
        import torch
        +def fn(x, y):
        +    a = torch.sin(x).cuda()
        +    b = torch.sin(y).cuda()
        +    return a + b
        +new_fn = torch.compile(fn, backend="inductor")
        +input_tensor = torch.randn(10000).to(device="cuda:0")
        +a = new_fn(input_tensor, input_tensor)
        +
        + +

        This example won’t actually run faster but it’s educational.

        + +

        The example features torch.sin() (and torch.cos() behaves the same way); these are examples of pointwise ops, meaning they operate element by element on a vector. A more famous pointwise op you might actually want to use would be something like torch.relu().

        + +

        Pointwise ops in eager mode are suboptimal because each one would need to read a tensor from memory, make some changes and then write back those changes.

        + +

        The single most important optimization that PyTorch 2.0 does for you is fusion.

        + +

        So back to our example: fusion lets us turn 2 reads and 2 writes into 1 read and 1 write, which is crucial especially for newer GPUs where the bottleneck is memory bandwidth (how quickly you can send data to a GPU) instead of compute (how quickly your GPU can crunch floating point operations).
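        To make that read/write accounting concrete, here is a minimal sketch (our own illustration, not from the original post) of the function being fused; torch.compile with the Inductor backend can emit a single kernel that reads each input once and writes the result once:

        import torch

        def pointwise(x, y):
            # In eager mode each op is a separate kernel: sin(x) reads x and writes a
            # temporary, sin(y) reads y and writes a temporary, and the add reads both
            # temporaries and writes the output.
            return torch.sin(x) + torch.sin(y)

        # A fused kernel can instead read x and y once, compute the sins and the add
        # in registers, and write the result once.
        fused_pointwise = torch.compile(pointwise)

        x = torch.randn(10_000)
        y = torch.randn(10_000)
        out = fused_pointwise(x, y)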

        + +

        The second most important optimization that PyTorch 2.0 does for you is CUDA graphs.

        + +

        CUDA graphs help eliminate the overhead from launching individual kernels from a python program.
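        If you want to opt into the CUDA-graphs path explicitly, torch.compile exposes it through its mode argument; a small sketch (treat the mode name as an assumption to verify against your installed version):

        import torch

        model = torch.nn.Sequential(torch.nn.Linear(512, 512), torch.nn.ReLU())
        # "reduce-overhead" asks the compiler to use CUDA graphs to amortize kernel
        # launch overhead, which helps most when many small kernels are launched.
        opt_model = torch.compile(model, mode="reduce-overhead")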

        + +

        torch.compile() supports many different backends, but the one we’re particularly excited about is Inductor, which generates Triton kernels (https://github.com/openai/triton) that are written in Python yet outperform the vast majority of handwritten CUDA kernels. Suppose our example above was called trig.py; we can actually inspect the generated Triton kernels by running:

        + +
        TORCH_COMPILE_DEBUG=1 python trig.py
        +
        + +
        
        +@pointwise(size_hints=[16384], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
        +@triton.jit
        +def kernel(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
        +    xnumel = 10000
        +    xoffset = tl.program_id(0) * XBLOCK
        +    xindex = xoffset + tl.reshape(tl.arange(0, XBLOCK), [XBLOCK])
        +    xmask = xindex < xnumel
        +    x0 = xindex
        +    tmp0 = tl.load(in_ptr0 + (x0), xmask)
        +    tmp1 = tl.sin(tmp0)
        +    tmp2 = tl.sin(tmp1)
        +    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp2, xmask)
        +
        +
        + +

        And you can verify that fusing the two sins did actually occur because the two sin operations occur within a single Triton kernel and the temporary variables are held in registers with very fast access.

        + +

        a real model

        + +

        As a next step let’s try a real model like resnet18 from the PyTorch hub.

        + +
        import torch
        +model = torch.hub.load('pytorch/vision:v0.10.0', 'resnet18', pretrained=True)
        +opt_model = torch.compile(model, backend="inductor")
        +opt_model(torch.randn(1,3,64,64))  # run the compiled model
        +
        +
        + +

        If you actually run this you may be surprised that the first run is slow; that’s because the model is being compiled. Subsequent runs will be faster, so it’s common practice to warm up your model before you start benchmarking it.
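        As a rough sketch of that warm-up pattern (our own example; the shapes and iteration counts are arbitrary):

        import time
        import torch

        model = torch.hub.load('pytorch/vision:v0.10.0', 'resnet18', pretrained=True).cuda()
        opt_model = torch.compile(model, backend="inductor")
        x = torch.randn(16, 3, 224, 224, device="cuda")

        # Warm up: the first call triggers compilation, so exclude it from timing.
        for _ in range(3):
            opt_model(x)
        torch.cuda.synchronize()

        start = time.perf_counter()
        for _ in range(10):
            opt_model(x)
        torch.cuda.synchronize()
        print(f"average forward time: {(time.perf_counter() - start) / 10:.4f}s")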

        + +

        You may have noticed that we also passed in the name of a compiler explicitly here with “inductor”, but it’s not the only available backend; you can run torch._dynamo.list_backends() in a REPL to see the full list of available backends. For fun you should try out aot_cudagraphs or nvfuser.
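        For example (backend names vary between releases, so treat the ones below as illustrative):

        import torch
        import torch._dynamo as dynamo

        print(dynamo.list_backends())  # e.g. ['aot_eager', 'cudagraphs', 'inductor', ...]

        # Any listed backend can be passed by name:
        opt_fn = torch.compile(lambda x: torch.sin(x) + torch.relu(x), backend="aot_eager")
        print(opt_fn(torch.randn(8)))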

        + +

        Hugging Face models

        + +

        Let’s do something a bit more interesting now. Our community frequently uses pretrained models from transformers https://github.com/huggingface/transformers or TIMM https://github.com/rwightman/pytorch-image-models, and one of our design goals for PyTorch 2.0 was that any new compiler stack needs to work out of the box with the vast majority of models people actually run.

        + +

        So we’re going to directly download a pretrained model from the Hugging Face hub and optimize it:

        + +
        
        +import torch
        +from transformers import BertTokenizer, BertModel
        +# Copy pasted from here https://huggingface.co/bert-base-uncased
        +tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        +model = BertModel.from_pretrained("bert-base-uncased").to(device="cuda:0")
        +model = torch.compile(model) # This is the only line of code that we changed
        +text = "Replace me by any text you'd like."
        +encoded_input = tokenizer(text, return_tensors='pt').to(device="cuda:0")
        +output = model(**encoded_input)
        +
        +
        + +

        If you remove the to(device="cuda:0") from the model and encoded_input, then PyTorch 2.0 will generate C++ kernels that will be optimized for running on your CPU. You can inspect either the Triton or the C++ kernels for BERT; they’re obviously more complex than the trigonometry example we had above, but you can similarly skim them and understand them if you understand PyTorch.

        + +

        The same code also works just fine if used with https://github.com/huggingface/accelerate and DDP

        + +

        Similarly let’s try out a TIMM example

        + +
        import timm
        +import torch
        +model = timm.create_model('resnext101_32x8d', pretrained=True, num_classes=2)
        +opt_model = torch.compile(model, backend="inductor")
        +opt_model(torch.randn(64,3,7,7))
        +
        + +

        Our goal with PyTorch 2.0 was to build a breadth-first compiler that would speed up the vast majority of actual models people run in open source. The Hugging Face Hub ended up being an extremely valuable benchmarking tool for us, ensuring that any optimization we work on actually helps accelerate models people want to run.

        + +

        So please try out PyTorch 2.0, enjoy the free perf and if you’re not seeing it then please open an issue and we will make sure your model is supported https://github.com/pytorch/torchdynamo/issues

        + +

        After all, we can’t claim we’ve created a breadth-first compiler unless YOUR models actually run faster.

        + +
        +
        +
        +
        + + +
        +
        +
        +
        +

        +
        +
        +
        + +
        + +
        + +
        +
        +
        +
        + + +
        +
        +
        + + +
        + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/FX-feature-extraction-torchvision/index.html b/blog/FX-feature-extraction-torchvision/index.html new file mode 100644 index 000000000000..b264bd7bcd75 --- /dev/null +++ b/blog/FX-feature-extraction-torchvision/index.html @@ -0,0 +1,1096 @@ + + + + + + + + + + + + + Feature Extraction in TorchVision using Torch FX | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
        +
        +
        +
        + +
        +
        + + +
        + + + + + + + + +
        + +
        +
        + +
        + +
        + +
        +
        +
        + +
        +

        + by + + Alexander Soare and Francisco Massa + +

        + + +

        Introduction

        + +

        FX based feature extraction is a new TorchVision utility that lets us access intermediate transformations of an input during the forward pass of a PyTorch Module. It does so by symbolically tracing the forward method to produce a graph where each node represents a single operation. Nodes are named in a human-readable manner such that one may easily specify which nodes they want to access.

        + +

        Did that all sound a little complicated? Not to worry as there’s a little in this article for everyone. Whether you’re a beginner or an advanced deep-vision practitioner, chances are you will want to know about FX feature extraction. If you still want more background on feature extraction in general, read on. If you’re already comfortable with that and want to know how to do it in PyTorch, skim ahead to Existing Methods in PyTorch: Pros and Cons. And if you already know about the challenges of doing feature extraction in PyTorch, feel free to skim forward to FX to The Rescue.

        + +

        A Recap On Feature Extraction

        + +

        We’re all used to the idea of having a deep neural network (DNN) that takes inputs and produces outputs, and we don’t necessarily think of what happens in between. Let’s just consider a ResNet-50 classification model as an example:

        + +

        + ResNet-50 takes an image of a bird and transforms that into the abstract concept 'bird' +
        + Figure 1: ResNet-50 takes an image of a bird and transforms that into the abstract concept "bird". Source: Bird image from ImageNet. +

        + +

        We know though, that there are many sequential “layers” within the ResNet-50 architecture that transform the input step-by-step. In Figure 2 below, we peek under the hood to show the layers within ResNet-50, and we also show the intermediate transformations of the input as it passes through those layers.

        + +

        + ResNet-50 transforms the input image in multiple steps. Conceptually, we may access the intermediate transformation of the image after each one of these steps. +
        + Figure 2: ResNet-50 transforms the input image in multiple steps. Conceptually, we may access the intermediate transformation of the image after each one of these steps. Source: Bird image from ImageNet. +

        + +

        Existing Methods In PyTorch: Pros and Cons

        + +

        There were already a few ways of doing feature extraction in PyTorch prior to FX based feature extraction being introduced.

        + +

        To illustrate these, let’s consider a simple convolutional neural network that does the following

        + +
          +
        • Applies several “blocks” each with several convolution layers within.
        • +
        • After several blocks, it uses a global average pool and flatten operation.
        • +
        • Finally it uses a single output classification layer.
        • +
        + +
        import torch
        +from torch import nn
        +
        +
        +class ConvBlock(nn.Module):
        +   """
        +   Applies `num_layers` 3x3 convolutions each followed by ReLU then downsamples
        +   via 2x2 max pool.
        +   """
        +
        +   def __init__(self, num_layers, in_channels, out_channels):
        +       super().__init__()
        +       self.convs = nn.ModuleList(
        +           [nn.Sequential(
        +               nn.Conv2d(in_channels if i==0 else out_channels, out_channels, 3, padding=1),
        +               nn.ReLU()
        +            )
        +            for i in range(num_layers)]
        +       )
        +       self.downsample = nn.MaxPool2d(kernel_size=2, stride=2)
        +      
        +   def forward(self, x):
        +       for conv in self.convs:
        +           x = conv(x)
        +       x = self.downsample(x)
        +       return x
        +      
        +
        +class CNN(nn.Module):
        +   """
        +   Applies several ConvBlocks each doubling the number of channels, and
        +   halving the feature map size, before taking a global average and classifying.
        +   """
        +
        +   def __init__(self, in_channels, num_blocks, num_classes):
        +       super().__init__()
        +       first_channels = 64
        +       self.blocks = nn.ModuleList(
        +           [ConvBlock(
        +               2 if i==0 else 3,
        +               in_channels=(in_channels if i == 0 else first_channels*(2**(i-1))),
        +               out_channels=first_channels*(2**i))
        +            for i in range(num_blocks)]
        +       )
        +       self.global_pool = nn.AdaptiveAvgPool2d((1, 1))
        +       self.cls = nn.Linear(first_channels*(2**(num_blocks-1)), num_classes)
        +
        +   def forward(self, x):
        +       for block in self.blocks:
        +           x = block(x)
        +       x = self.global_pool(x)
        +       x = x.flatten(1)
        +       x = self.cls(x)
        +       return x
        +
        +
        +model = CNN(3, 4, 10)
        +out = model(torch.zeros(1, 3, 32, 32))  # This will be the final logits over classes
        +
        +
        + +

        Let’s say we want to get the final feature map before global average pooling. We could do the following:

        + +

        Modify the forward method

        + +
        def forward(self, x):
        +   for block in self.blocks:
        +       x = block(x)
        +   self.final_feature_map = x
        +   x = self.global_pool(x)
        +   x = x.flatten(1)
        +   x = self.cls(x)
        +   return x
        +
        + +

        Or return it directly:

        + +
        def forward(self, x):
        +   for block in self.blocks:
        +       x = block(x)
        +   final_feature_map = x
        +   x = self.global_pool(x)
        +   x = x.flatten(1)
        +   x = self.cls(x)
        +   return x, final_feature_map
        +
        +

        That looks pretty easy. But there are some downsides here which all stem from the same underlying issue: that is, modifying the source code is not ideal:

        + +
          +
        • It’s not always easy to access and change given the practical considerations of a project.
        • +
        • If we want flexibility (switching feature extraction on or off, or having variations on it), we need to further adapt the source code to support that.
        • +
        • It’s not always just a question of inserting a single line of code. Think about how you would go about getting the feature map from one of the intermediate blocks with the way I’ve written this module.
        • +
        • Overall, we’d rather avoid the overhead of maintaining source code for a model, when we actually don’t need to change anything about how it works.
        • +
        + +

        One can see how this downside can start to get a lot more thorny when dealing with larger, more complicated models, and trying to get at features from within nested submodules.

        + +

        Write a new module using the parameters from the original one

        + +

        Following on the example from above, say we want to get a feature map from each block. We could write a new module like so:

        + +
        class CNNFeatures(nn.Module):
        +   def __init__(self, backbone):
        +       super().__init__()
        +       self.blocks = backbone.blocks
        +
        +   def forward(self, x):
        +       feature_maps = []
        +       for block in self.blocks:
        +           x = block(x)
        +           feature_maps.append(x)
        +       return feature_maps
        +
        +
        +backbone = CNN(3, 4, 10)
        +model = CNNFeatures(backbone)
        +out = model(torch.zeros(1, 3, 32, 32))  # This is now a list of Tensors, each representing a feature map
        +
        + +

        In fact, this is much like the method that TorchVision used internally to make many of its detection models.

        + +

        Although this approach solves some of the issues with modifying the source code directly, there are still some major downsides:

        + +
          +
        • It’s only really straight-forward to access the outputs of top-level submodules. Dealing with nested submodules rapidly becomes complicated.
        • +
        • We have to be careful not to miss any important operations in between the input and the output. We introduce potential for errors in transcribing the exact functionality of the original module to the new module.
        • +
        + +

        Overall, this method and the last both have the complication of tying in feature extraction with the model’s source code itself. Indeed, if we examine the source code for TorchVision models we might suspect that some of the design choices were influenced by the desire to use them in this way for downstream tasks.

        + +

        Use hooks

        + +

        Hooks move us away from the paradigm of writing source code, towards one of specifying outputs. Considering our toy CNN example above, and the goal of getting feature maps for each layer, we could use hooks like this:

        + +
        model = CNN(3, 4, 10)
        +feature_maps = []  # This will be a list of Tensors, each representing a feature map
        +
        +def hook_feat_map(mod, inp, out):
        +	feature_maps.append(out)
        +
        +for block in model.blocks:
        +	block.register_forward_hook(hook_feat_map)
        +
        +out = model(torch.zeros(1, 3, 32, 32))  # This will be the final logits over classes
        +
        + +

        Now we have full flexibility in terms of accessing nested submodules, and we free ourselves of the responsibilities of fiddling with the source code. But this approach comes with its own downsides:

        + +
          +
        • We can only apply hooks to modules. If we have functional operations (reshape, view, functional non-linearities, etc) for which we want the outputs, hooks won’t work directly on them.
        • +
        • We have not modified anything about the source code, so the whole forward pass is executed, regardless of the hooks. If we only need to access early features without any need for the final output, this could result in a lot of useless computation.
        • +
        • Hooks are not TorchScript friendly.
        • +
        + +

        Here’s a summary of the different methods and their pros/cons:

        + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
        | | Can use source code as is without any modifications or rewriting | Full flexibility in accessing features | Drops unnecessary computational steps | TorchScript friendly |
        |---|---|---|---|---|
        | Modify forward method | NO | Technically yes. Depends on how much code you’re willing to write. So in practice, NO. | YES | YES |
        | New module that reuses submodules / parameters of original module | NO | Technically yes. Depends on how much code you’re willing to write. So in practice, NO. | YES | YES |
        | Hooks | YES | Mostly YES. Only outputs of submodules | NO | NO |
        + +

        Table 1: The pros (or cons) of some of the existing methods for feature extraction with PyTorch

        + +

        In the next section of this article, let’s see how we can get YES across the board.

        + +

        FX to The Rescue

        + +

        The natural question for some new-starters in Python and coding at this point might be: “Can’t we just point to a line of code and tell Python or PyTorch that we want the result of that line?” For those who have spent more time coding, the reason this can’t be done is clear: multiple operations can happen in one line of code, whether they are explicitly written there, or they are implicit as sub-operations. Just take this simple module as an example:

        + +
        class MyModule(torch.nn.Module):
        +    def __init__(self):
        +        super().__init__()
        +        self.param = torch.nn.Parameter(torch.rand(3, 4))
        +        self.submodule = MySubModule()
        +
        +    def forward(self, x):
        +        return self.submodule(x + self.param).clamp(min=0.0, max=1.0)
        +
        + +

        The forward method has a single line of code which we can unravel as:

        + +
          +
        1. Add self.param to x
        2. +
        3. Pass x through self.submodule. Here we would need to consider the steps happening in that submodule. I’m just going to use dummy operation names for illustration: I. submodule.op_1, II. submodule.op_2
        4. +
        5. Apply the clamp operation
        6. +
        + +

        So even if we point at this one line, the question then is: “For which step do we want to extract the output?”.

        + +

        FX is a core PyTorch toolkit that (oversimplifying) does the unravelling I just mentioned. It does something called “symbolic tracing”, which means the Python code is interpreted and stepped through, operation-by-operation, using some dummy proxy for a real input. Introducing some nomenclature, each step as described above is considered a “node”, and consecutive nodes are connected to one another to form a “graph” (not unlike the common mathematical notion of a graph). Here are the “steps” above translated to this concept of a graph.

        + +
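        You can see this unravelling for yourself with core FX; here is a minimal sketch in which MySubModule is a hypothetical stand-in, since the snippet above leaves it undefined:

        import torch
        from torch import nn

        class MySubModule(nn.Module):
            # Hypothetical stand-in for the submodule in the example above.
            def forward(self, x):
                return torch.relu(x) * 2

        class MyModule(nn.Module):
            def __init__(self):
                super().__init__()
                self.param = nn.Parameter(torch.rand(3, 4))
                self.submodule = MySubModule()

            def forward(self, x):
                return self.submodule(x + self.param).clamp(min=0.0, max=1.0)

        # Symbolic tracing steps through the forward pass with a proxy input and
        # records one node per operation (the add, the submodule's ops, the clamp, ...).
        gm = torch.fx.symbolic_trace(MyModule())
        print(gm.graph)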

        + Graphical representation of the result of symbolically tracing our example of a simple forward method. +
        + Figure 3: Graphical representation of the result of symbolically tracing our example of a simple forward method. +

        + +

        Note that we call this a graph, and not just a set of steps, because it’s possible for the graph to branch off and recombine. Think of the skip connection in a residual block. This would look something like:

        + +

        + Graphical representation of a residual skip connection. The middle node is like the main branch of a residual block, and the final node represents the sum of the input and output of the main branch. +
        + Figure 4: Graphical representation of a residual skip connection. The middle node is like the main branch of a residual block, and the final node represents the sum of the input and output of the main branch. +

        + +

        Now, TorchVision’s get_graph_node_names function applies FX as described above, and in the process of doing so, tags each node with a human readable name. Let’s try this with our toy CNN model from the previous section:

        + +
        model = CNN(3, 4, 10)
        +from torchvision.models.feature_extraction import get_graph_node_names
        +nodes, _ = get_graph_node_names(model)
        +print(nodes)
        +
        +

        which will result in:

        +
        ['x', 'blocks.0.convs.0.0', 'blocks.0.convs.0.1', 'blocks.0.convs.1.0', 'blocks.0.convs.1.1', 'blocks.0.downsample', 'blocks.1.convs.0.0', 'blocks.1.convs.0.1', 'blocks.1.convs.1.0', 'blocks.1.convs.1.1', 'blocks.1.convs.2.0', 'blocks.1.convs.2.1', 'blocks.1.downsample', 'blocks.2.convs.0.0', 'blocks.2.convs.0.1', 'blocks.2.convs.1.0', 'blocks.2.convs.1.1', 'blocks.2.convs.2.0', 'blocks.2.convs.2.1', 'blocks.2.downsample', 'blocks.3.convs.0.0', 'blocks.3.convs.0.1', 'blocks.3.convs.1.0', 'blocks.3.convs.1.1', 'blocks.3.convs.2.0', 'blocks.3.convs.2.1', 'blocks.3.downsample', 'global_pool', 'flatten', 'cls']
        +
        + +

        We can read these node names as hierarchically organised “addresses” for the operations of interest. For example ‘blocks.1.downsample’ refers to the MaxPool2d layer in the second ConvBlock.

        + +

        create_feature_extractor, which is where all the magic happens, goes a few steps further than get_graph_node_names. It takes desired node names as one of the input arguments, and then uses more FX core functionality to:

        + +
          +
        1. Assign the desired nodes as outputs.
        2. +
        3. Prune unnecessary downstream nodes and their associated parameters.
        4. +
        5. Translate the resulting graph back into Python code.
        6. +
        7. Return another PyTorch Module to the user. This has the python code from step 3 as the forward method.
        8. +
        + +

        As a demonstration, here’s how we would apply create_feature_extractor to get the 4 feature maps from our toy CNN model

        + +
        from torchvision.models.feature_extraction import create_feature_extractor
        +# Confused about the node specification here?
        +# We are allowed to provide truncated node names, and `create_feature_extractor`
        +# will choose the last node with that prefix.
        +feature_extractor = create_feature_extractor(
        +	model, return_nodes=['blocks.0', 'blocks.1', 'blocks.2', 'blocks.3'])
        +# `out` will be a dict of Tensors, each representing a feature map
        +out = feature_extractor(torch.zeros(1, 3, 32, 32))
        +
        + +

        It’s as simple as that. When it comes down to it, FX feature extraction is just a way of making it possible to do what some of us would have naively hoped for when we first started programming: “just give me the output of this code (points finger at screen)”*.

        + +
          +
        • … does not require us to fiddle with source code.
        • +
        • … provides full flexibility in terms of accessing any intermediate transformation of our inputs, whether they are the results of a module or a functional operation
        • +
        • … does drop unnecessary computations steps once features have been extracted
        • +
        • … and I didn’t mention this before, but it’s also TorchScript friendly!
        • +
        + +

        Here’s that table again with another row added for FX feature extraction

        + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
        | | Can use source code as is without any modifications or rewriting | Full flexibility in accessing features | Drops unnecessary computational steps | TorchScript friendly |
        |---|---|---|---|---|
        | Modify forward method | NO | Technically yes. Depends on how much code you’re willing to write. So in practice, NO. | YES | YES |
        | New module that reuses submodules / parameters of original module | NO | Technically yes. Depends on how much code you’re willing to write. So in practice, NO. | YES | YES |
        | Hooks | YES | Mostly YES. Only outputs of submodules | NO | NO |
        | FX | YES | YES | YES | YES |
        + +

        Table 2: A copy of Table 1 with an added row for FX feature extraction. FX feature extraction gets YES across the board!

        + +

        Current FX Limitations

        + +

        Although I would have loved to end the post there, FX does have some of its own limitations which boil down to:

        + +
          +
        1. There may be some Python code that isn’t yet handled by FX when it comes to the step of interpretation and translation into a graph.
        2. +
        3. Dynamic control flow can’t be represented in terms of a static graph.
        4. +
        + +

        The easiest thing to do when these problems crop up is to bundle the underlying code into a “leaf node”. Recall the example graph from Figure 3? Conceptually, we may agree that the submodule should be treated as a node in itself rather than a set of nodes representing the underlying operations. If we do so, we can redraw the graph as:

        + +

        + The individual operations within `submodule` (left, within the red box) may be consolidated into one node (right, node #2) if we consider `submodule` as a 'leaf' node. +
        + Figure 5: The individual operations within `submodule` (left, within the red box) may be consolidated into one node (right, node #2) if we consider `submodule` as a "leaf" node. +

        + +

        We would want to do so if there is some problematic code within the submodule, but we don’t have any need for extracting any intermediate transformations from within it. In practice, this is easily achievable by providing a keyword argument to create_feature_extractor or get_graph_node_names.

        + +
        model = CNN(3, 4, 10)
        +nodes, _ = get_graph_node_names(model, tracer_kwargs={'leaf_modules': [ConvBlock]})
        +print(nodes)
        +
        + +

        for which the output will be:

        + +
        ['x', 'blocks.0', 'blocks.1', 'blocks.2', 'blocks.3', 'global_pool', 'flatten', 'cls']
        +
        + +

        Notice how, as compared to previously, all the nodes for any given ConvBlock are consolidated into a single node.

        + +

        We could do something similar with functions. For example, Python’s inbuilt len needs to be wrapped and the result should be treated as a leaf node. Here’s how you can do that with core FX functionality:

        + +
        torch.fx.wrap('len')
        +
        +class MyModule(nn.Module):
        +   def forward(self, x):
        +       x += 1
        +       len(x)
        +
        +model = MyModule()
        +feature_extractor = create_feature_extractor(model, return_nodes=['add'])
        +
        + +

        For functions you define, you may instead use another keyword argument to create_feature_extractor (minor detail: here’s why you might want to do it this way instead):

        + +
        def myfunc(x):
        +   return len(x)
        +
        +class MyModule(nn.Module):
        +   def forward(self, x):
        +       x += 1
        +       myfunc(x)
        +
        +model = MyModule()
        +feature_extractor = create_feature_extractor(
        +   model, return_nodes=['add'], tracer_kwargs={'autowrap_functions': [myfunc]})
        +
        + +

        Notice that none of the fixes above involved modifying source code.

        + +

        Of course, there may be times when the very intermediate transformation one is trying to get access to is within the same forward method or function that is causing problems. Here, we can’t just treat that module or function as a leaf node, because then we can’t access the intermediate transformations within. In these cases, some rewriting of the source code will be needed. Here are some examples (not exhaustive)

        + +
          +
        • FX will raise an error when trying to trace through code with an assert statement. In this case you may need to remove that assertion or switch it with torch._assert (this is not a public function - so consider it a bandaid and use with caution).
        • +
        • Symbolically tracing in-place changes to slices of tensors is not supported. You will need to make a new variable for the slice, apply the operation, then reconstruct the original tensor using concatenation or stacking (see the sketch after this list).
        • +
        • Representing dynamic control flow in a static graph is just not logically possible. See if you can distill the coded logic down to something that is not dynamic - see FX documentation for tips.
        • +
        + +
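        As a rough sketch of that slice workaround (our own illustrative example, not taken from TorchVision):

        import torch
        from torch import nn

        class InPlaceSlice(nn.Module):
            # Problematic for symbolic tracing: writes into a slice of x in place.
            def forward(self, x):
                x[:, 0] = torch.relu(x[:, 0])
                return x

        class TraceableSlice(nn.Module):
            # Workaround: compute the modified slice as a new tensor, then rebuild
            # the full tensor with torch.cat instead of assigning in place.
            def forward(self, x):
                first = torch.relu(x[:, :1])
                return torch.cat([first, x[:, 1:]], dim=1)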

        In general, you may consult the FX documentation for more detail on the limitations of symbolic tracing and the possible workarounds.

        + +

        Conclusion

        + +

        We did a quick recap on feature extraction and why one might want to do it. Although there are existing methods for doing feature extraction in PyTorch they all have rather significant shortcomings. We learned how TorchVision’s FX feature extraction utility works and what makes it so versatile compared to the existing methods. While there are still some minor kinks to iron out for the latter, we understand the limitations, and can trade them off against the limitations of other methods depending on our use case. Hopefully by adding this new utility to your PyTorch toolkit, you’re now equipped to handle the vast majority of feature extraction requirements you may come across.

        + +

        Happy coding!

        + +
        +
        +
        +
        + + +
        +
        +
        +
        +

        +
        +
        +
        + +
        + +
        + +
        +
        +
        +
        + + +
        +
        +
        + + +
        + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/PathAI-Uses-PyTorch-to-Improve-Patient-Outcomes-with-AI-powered-Pathology/index.html b/blog/PathAI-Uses-PyTorch-to-Improve-Patient-Outcomes-with-AI-powered-Pathology/index.html new file mode 100644 index 000000000000..f8d9f320fcfd --- /dev/null +++ b/blog/PathAI-Uses-PyTorch-to-Improve-Patient-Outcomes-with-AI-powered-Pathology/index.html @@ -0,0 +1,757 @@ + + + + + + + + + + + + + Case Study: PathAI Uses PyTorch to Improve Patient Outcomes with AI-powered Pathology | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
        +
        +
        +
        + +
        +
        + + +
        + + + + + + + + +
        + +
        +
        + + + +
        +
        +
        + +
        +

        + by + + Logan Kilpatrick - Sr. Technology Advocate, Harshith Padigela - ML Engineer, Syed Ashar Javed - ML Technical Lead, Robert Egger - Biomedical Data Scientist + +

        +

        ​PathAI is the leading provider of AI-powered technology tools and services for pathology (the study of disease). Our platform was built to enable substantial improvements to the accuracy of diagnosis and the measurement of therapeutic efficacy for complex diseases, leveraging modern approaches in machine learning like image segmentation, graph neural networks, and multiple instance learning.

        + +

        + +

        + +

        Traditional manual pathology is prone to subjectivity and observer variability that can negatively affect diagnoses and drug development trials. Before we dive into how we use PyTorch to improve our diagnosis workflow, let us first lay out the traditional analog Pathology workflow without machine learning.

        + +

        How Traditional Biopharma Works

        + +

        There are many avenues that biopharma companies take to discover novel therapeutics or diagnostics. One of those avenues relies heavily on the analysis of pathology slides to answer a variety of questions: how does a particular cellular communication pathway work? Can a specific disease state be linked to the presence or lack of a particular protein? Why did a particular drug in a clinical trial work for some patients but not others? Might there be an association between patient outcomes and a novel biomarker?

        + +

        To help answer these questions, biopharma companies rely on expert pathologists to analyze slides and help evaluate the questions they might have. 

        + +

        As you might imagine, it takes an expert board certified pathologist to make accurate interpretations and diagnosis. In one study, a single biopsy result was given to 36 different pathologists and the outcome was 18 different diagnoses varying in severity from no treatment to aggressive treatment necessary. Pathologists also often solicit feedback from colleagues in difficult edge cases. Given the complexity of the problem, even with expert training and collaboration, pathologists can still have a hard time making a correct diagnosis. This potential variance can be the difference between a drug being approved and it failing the clinical trial.

        + +

        How PathAI utilizes machine learning to power drug development

        + +

        PathAI develops machine learning models which provide insights for drug development R&D, for powering clinical trials, and for making diagnoses. To this end, PathAI leverages PyTorch for slide level inference using a variety of methods including graph neural networks (GNN) as well as multiple instance learning. In this context, “slides” refers to full size scanned images of glass slides, which are pieces of glass with a thin slice of tissue between them, stained to show various cell formations. PyTorch enables our teams using these different methodologies to share a common framework which is robust enough to work in all the conditions we need. PyTorch’s high level, imperative, and pythonic syntax allows us to prototype models quickly and then take those models to scale once we have the results we want. 

        + +

        Multi-instance learning on gigabyte images

        + +

        One of the uniquely challenging aspects of applying ML to pathology is the immense size of the images. These digital slides can often be 100,000 x 100,000 pixels or more in resolution and gigabytes in size. Loading the full image in GPU memory and applying traditional computer vision algorithms on them is an almost impossible task. It also takes both a considerable amount of time and resources to have a full slide image (100k x 100k) annotated, especially when annotators need to be domain experts (board-certified pathologists). We often build models to predict image-level labels, like the presence of cancer, on a patient slide which covers a few thousand pixels in the whole image. The cancerous area is sometimes a tiny fraction of the entire slide, which makes the ML problem similar to finding a needle in a haystack. On the other hand, some problems like the prediction of certain histological biomarkers require an aggregation of information from the whole slide which is again hard due to the size of the images. All these factors add significant algorithmic, computational, and logistical complexity when applying ML techniques to pathology problems.

        + +

        Breaking down the image into smaller patches, learning patch representations, and then pooling those representations to predict an image-level label is one way to solve this problem as is depicted in the image below. One popular method for doing this is called Multiple Instance Learning (MIL). Each patch is considered an ‘instance’ and a set of patches forms a ‘bag’. The individual patch representations are pooled together to predict a final bag-level label. Algorithmically, the individual patch instances in the bag do not require labels and hence allow us to learn bag-level labels in a weakly-supervised way. They also use permutation invariant pooling functions which make the prediction independent of the order of patches and allows for an efficient aggregation of information. Typically, attention based pooling functions are used which not only allow for efficient aggregation but also provide attention values for each patch in the bag. These values indicate the importance of the corresponding patch in the prediction and can be visualized to better understand the model predictions. This element of interpretability can be very important to drive adoption of these models in the real world and we use variations like Additive MIL models to enable such spatial explainability. Computationally, MIL models circumvent the problem of applying neural networks to large image sizes since patch representations are obtained independently of the size of the image.

        + +

        + +

        + +
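        To make the attention-pooling idea above concrete, here is a minimal, generic sketch of attention-based MIL pooling (our own illustration, not PathAI's internal implementation):

        import torch
        from torch import nn

        class AttentionMILPooling(nn.Module):
            """Pools a bag of patch embeddings into one bag embedding using attention."""

            def __init__(self, embed_dim, hidden_dim=128):
                super().__init__()
                self.attention = nn.Sequential(
                    nn.Linear(embed_dim, hidden_dim),
                    nn.Tanh(),
                    nn.Linear(hidden_dim, 1),
                )

            def forward(self, patch_embeddings):
                # patch_embeddings: (num_patches, embed_dim) for a single bag
                scores = self.attention(patch_embeddings)      # (num_patches, 1)
                weights = torch.softmax(scores, dim=0)         # importance of each patch
                bag_embedding = (weights * patch_embeddings).sum(dim=0)
                # The pooled embedding is invariant to patch order; the per-patch weights
                # can be visualized for interpretability.
                return bag_embedding, weights

        bag = torch.randn(12, 1024)  # 12 patches, each a 1024-d embedding
        bag_vec, attn = AttentionMILPooling(1024)(bag)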

        At PathAI, we use custom MIL models based on deep nets to predict image-level labels. The overview of this process is as follows:

        + +
          +
        1. Select patches from a slide using different sampling approaches.
        2. +
        3. Construct a bag of patches based on random sampling or heuristic rules.
        4. +
        5. Generate patch representations for each instance based on pre-trained models or large-scale representation learning models.
        6. +
        7. Apply permutation invariant pooling functions to get the final slide-level score.
        8. +
        + +

        Now that we have walked through some of the high-level details around MIL in PyTorch, let’s look at some code to see how simple it is to go from ideation to code in production with PyTorch. We begin by defining a sampler, transformations, and our MIL dataset:

        + +
        # Create a bag sampler which randomly samples patches from a slide
        +bag_sampler = RandomBagSampler(bag_size=12)
        +
        +# Setup the transformations
        +crop_transform = FlipRotateCenterCrop(use_flips=True)
        +
        +# Create the dataset which loads patches for each bag
        +train_dataset = MILDataset(
        +  bag_sampler=bag_sampler,
        +  samples_loader=sample_loader,
        +  transform=crop_transform,
        +)
        +
        + +

        After we have defined our sampler and dataset, we need to define the model we will actually train with said dataset. PyTorch’s familiar model definition syntax makes this easy to do while also allowing us to create bespoke models at the same time.

        + +
        classifier = DefaultPooledClassifier(hidden_dims=[256, 256], input_dims=1024, output_dims=1)
        +
        +pooling = DefaultAttentionModule(
        +  input_dims=1024,
        +  hidden_dims=[256, 256],
        +  output_activation=StableSoftmax()
        +)
        +
        +# Define the model which is a composition of the featurizer, pooling module and a classifier
        +model = DefaultMILGraph(featurizer=ShuffleNetV2(), classifier=classifier, pooling = pooling)
        +
        + +

        Since these models are trained end-to-end, they offer a powerful way to go directly from a gigapixel whole slide image to a single label. Due to their wide applicability to different biological problems, two aspects of their implementation and deployment are important:

        + +
          +
        1. Configurable control over each part of the pipeline including the data loaders, the modular parts of the model, and their interaction with each other.
        2. +
        3. Ability to rapidly iterate through the ideate-implement-experiment-productionize loop.
        4. +
        + +

        PyTorch has various advantages when it comes to MIL modeling. It offers an intuitive way to create dynamic computational graphs with flexible control flow which is great for rapid research experimentation. The map-style datasets, configurable sampler and batch-samplers allow us to customize how we construct bags of patches, enabling faster experimentation. Since MIL models are IO heavy, data parallelism and pythonic data loaders make the task very efficient and user friendly. Lastly, the object-oriented nature of PyTorch enables building of reusable modules which aid in the rapid experimentation, maintainable implementation and ease of building compositional components of the pipeline.

        + +

        Exploring spatial tissue organization with GNNs in PyTorch

        + +

        + +

        + +

        In both healthy and diseased tissue, the spatial arrangement and structure of cells can oftentimes be as important as the cells themselves. For example, when assessing lung cancers, pathologists try to look at the overall grouping and structure of tumor cells (do they form solid sheets? Or do they occur in smaller, localized clusters?) to determine if the cancer belongs to specific subtypes which can have vastly different prognosis. Such spatial relationships between cells and other tissue structures can be modeled using graphs to capture tissue topology and cellular composition at the same time. Graph Neural Networks (GNNs) allow learning spatial patterns within these graphs that relate to other clinical variables, for example overexpression of genes in certain cancers.

        + +

        In late 2020, when PathAI started using GNNs on tissue samples, PyTorch had the best and most mature support for GNN functionality via the PyG package. This made PyTorch the natural choice for our team given that GNN models were something that we knew would be an important ML concept we wanted to explore. 

        + +

        One of the main value-adds of GNN’s in the context of tissue samples is that the graph itself can uncover spatial relationships that would otherwise be very difficult to find by visual inspection alone. In our recent AACR publication, we showed that by using GNNs, we can better understand the way the presence of immune cell aggregates (specifically tertiary lymphoid structures, or TLS) in the tumor microenvironment can influence patient prognosis. In this case, the GNN approach was used to predict expression of genes associated with the presence of TLS, and identify histological features beyond the TLS region itself that are relevant to TLS. Such insights into gene expression are difficult to identify from tissue sample images when unassisted by ML models. 

        + +

        One of the most promising GNN variations we have had success with is self attention graph pooling. Let’s take a look at how we define our Self Attention Graph Pooling (SAGPool) model using PyTorch and PyG:

        + +
        class SAGPool(torch.nn.Module):
        +  def __init__(self, ...):
        +    super().__init__()
        +    self.conv1 = GraphConv(in_features, hidden_features, aggr='mean')
        +    self.convs = torch.nn.ModuleList()
        +    self.pools = torch.nn.ModuleList()
        +    self.convs.extend([GraphConv(hidden_features, hidden_features, aggr='mean') for i in range(num_layers - 1)])
        +    self.pools.extend([SAGPooling(hidden_features, ratio, GNN=GraphConv, min_score=min_score) for i in range((num_layers) // 2)])
        +    self.jump = JumpingKnowledge(mode='cat')
        +    self.lin1 = Linear(num_layers * hidden_features, hidden_features)
        +    self.lin2 = Linear(hidden_features, out_features)
        +    self.out_activation = out_activation
        +    self.dropout = dropout
        +
        + +

        In the above code, we begin by defining a single convolutional graph layer and then add two module list layers which allow us to pass in a variable number of layers. We then take our empty module list and append a variable number of GraphConv layers followed by a variable number of SAGPooling layers. We finish up our SAGPool definition by adding a JumpingKnowledge Layer, two linear layers, our activation function, and our dropout value. PyTorch’s intuitive syntax allows us to abstract away the complexity of working with state of the art methods like SAG Poolings while also maintaining the common approach to model development we are familiar with.
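        For completeness, here is one plausible forward pass for the module above, assuming PyG's standard GraphConv, SAGPooling, JumpingKnowledge and global_mean_pool interfaces; this is our own reconstruction rather than PathAI's exact code:

        import torch.nn.functional as F
        from torch_geometric.nn import global_mean_pool

        def forward(self, x, edge_index, batch):
            # Sketch of SAGPool.forward, continuing the __init__ shown above.
            x = F.relu(self.conv1(x, edge_index))
            xs = [global_mean_pool(x, batch)]
            for i, conv in enumerate(self.convs):
                x = F.relu(conv(x, edge_index))
                xs.append(global_mean_pool(x, batch))
                if i % 2 == 0 and i // 2 < len(self.pools):
                    # SAGPooling scores nodes with a small GNN and keeps the top fraction.
                    x, edge_index, _, batch, _, _ = self.pools[i // 2](x, edge_index, batch=batch)
            x = self.jump(xs)  # concatenate the per-layer graph embeddings
            x = F.relu(self.lin1(x))
            x = F.dropout(x, p=self.dropout, training=self.training)
            return self.out_activation(self.lin2(x))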

        + +

        Models like our SAG Pool one described above are just one example of how GNNs with PyTorch are allowing us to explore new and novel ideas. We also recently explored multimodal CNN - GNN hybrid models which ended up being 20% more accurate than traditional Pathologist consensus scores. These innovations and interplay between traditional CNNs and GNNs are again enabled by the short research to production model development loop.

        + +

        Improving Patient Outcomes

        +

        In order to achieve our mission of improving patient outcomes with AI-powered pathology, PathAI needs to rely on an ML development framework that (1) facilitates quick iteration and easy extension (i.e. Model configuration as code) during initial phases of development and exploration (2) scales model training and inference to massive images (3) easily and robustly serves models for production uses of our products (in clinical trials and beyond). As we’ve demonstrated, PyTorch offers us all of these capabilities and more. We are incredibly excited about the future of PyTorch and cannot wait to see what other impactful challenges we can solve using the framework.

        + +
        +
        +
        +
        + + +
        +
        +
        +
        +

        +
        +
        +
        + +
        + +
        + +
        +
        +
        +
        + + +
        +
        +
        + + +
        + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/PyTorch-1.13-release/index.html b/blog/PyTorch-1.13-release/index.html new file mode 100644 index 000000000000..d597cecad1db --- /dev/null +++ b/blog/PyTorch-1.13-release/index.html @@ -0,0 +1,812 @@ + + + + + + + + + + + + + PyTorch 1.13 release, including beta versions of functorch and improved support for Apple’s new M1 chips. | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +

by Team PyTorch

        +

We are excited to announce the release of PyTorch® 1.13 (release note)! This includes Stable versions of BetterTransformer. We deprecated CUDA 10.2 and 11.3 and completed migration to CUDA 11.6 and 11.7. Beta features include improved support for Apple M1 chips and functorch, a library that offers composable vmap (vectorization) and autodiff transforms, now included in-tree with the PyTorch release. This release is composed of over 3,749 commits from 467 contributors since 1.12.1. We want to sincerely thank our dedicated community for your contributions.

        + +

        Summary:

        + +
          +
        • +

The BetterTransformer feature set supports fastpath execution for common Transformer models during inference out-of-the-box, without the need to modify the model. Additional improvements include accelerated add+matmul linear algebra kernels for sizes commonly used in Transformer models, and Nested Tensor use is now enabled by default.

          +
        • +
        • +

Deprecating older CUDA versions in a timely manner allows us to adopt the latest CUDA versions as they are released by Nvidia®, and hence enables support for C++17 in PyTorch and for the new NVIDIA Open GPU Kernel Modules.

          +
        • +
        • +

Previously, functorch was released out-of-tree in a separate package. After installing PyTorch, users can import functorch and use it without needing to install another package.

          +
        • +
        • +

          PyTorch is offering native builds for Apple® silicon machines that use Apple’s new M1 chip as a beta feature, providing improved support across PyTorch’s APIs.

          +
        • +
        + + + +

Along with 1.13, we are also releasing major updates to the PyTorch libraries; more details can be found in this blog.

        + +

        Stable Features

        + +

        (Stable) BetterTransformer API

        + +

        The BetterTransformer feature set, first released in PyTorch 1.12, is stable. PyTorch BetterTransformer supports fastpath execution for common Transformer models during Inference out-of-the-box, without the need to modify the model. To complement the improvements in Better Transformer, we have also accelerated add+matmul linear algebra kernels for sizes commonly used in Transformer models.

        + +

Reflecting the performance benefits for many NLP users, Nested Tensor use for BetterTransformer is now enabled by default. To ensure compatibility, a mask check is performed to verify that a contiguous mask is supplied. In TransformerEncoder, the mask check for src_key_padding_mask may be suppressed by setting mask_check=False. This accelerates processing for users who can guarantee that only aligned masks are provided. Finally, better error messages are provided to diagnose incorrect inputs, together with improved diagnostics explaining why fastpath execution cannot be used.
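As an illustrative sketch (my own example, not from the release notes; shapes and hyperparameters are arbitrary), suppressing the mask check for callers that can guarantee aligned masks might look like this:

import torch

layer = torch.nn.TransformerEncoderLayer(d_model=512, nhead=8, batch_first=True)
# mask_check=False skips the contiguous-mask validation for callers that can
# guarantee well-aligned src_key_padding_mask inputs
encoder = torch.nn.TransformerEncoder(layer, num_layers=6,
                                      enable_nested_tensor=True, mask_check=False)
encoder.eval()

src = torch.rand(8, 64, 512)                   # (batch, sequence, embedding)
padding = torch.zeros(8, 64, dtype=torch.bool)
padding[:, 48:] = True                         # True marks padding positions
with torch.no_grad():                          # inference mode is required for the fastpath
    out = encoder(src, src_key_padding_mask=padding)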

        + +

Better Transformer is directly integrated into the PyTorch TorchText library, enabling TorchText users to transparently and automatically take advantage of BetterTransformer’s speed and efficiency benefits. (Tutorial)

        + +

        + +

        + +

        + +

        +Figure: BetterTransformer fastpath execution is now stable and enables sparsity optimization using Nested Tensor representation as default +

        + +

        Introduction of CUDA 11.6 and 11.7 and deprecation of CUDA 10.2 and 11.3

        + +

Deprecating older CUDA versions in a timely manner allows us to adopt the latest CUDA versions as they are released by Nvidia®, and hence allows developers to use the latest CUDA features and benefit from correctness fixes provided by the latest version.

        + +

Decommissioning of CUDA 10.2: CUDA 11 is the first CUDA version to support C++17, so decommissioning legacy CUDA 10.2 was a major step toward adding support for C++17 in PyTorch. It also helps improve PyTorch code by eliminating legacy CUDA 10.2-specific instructions.

        + +

Decommissioning of CUDA 11.3 and introduction of CUDA 11.7: this brings compatibility support for the new NVIDIA Open GPU Kernel Modules, and another significant highlight is lazy loading support. CUDA 11.7 ships with cuDNN 8.5.0, which contains a number of optimizations accelerating transformer-based models, a 30% reduction in library size, and various improvements in the runtime fusion engine. Learn more about CUDA 11.7 in our release notes.

        + +

        Beta Features

        + +

        (Beta) functorch

        + +

        Inspired by Google® JAX, functorch is a library that offers composable vmap (vectorization) and autodiff transforms. It enables advanced autodiff use cases that would otherwise be tricky to express in PyTorch. Examples include:

        + + + +

We’re excited to announce that, as a first step towards closer integration with PyTorch, functorch has moved inside the PyTorch library and no longer requires the installation of a separate functorch package. After installing PyTorch via conda or pip, you’ll be able to import functorch in your program. Learn more with our detailed instructions, nightly and release notes.
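For instance, here is a minimal sketch (my own example, not taken from the release notes) of computing per-sample gradients by composing grad and vmap from the in-tree functorch package:

import torch
from functorch import grad, vmap  # ships with PyTorch 1.13; no separate install needed

def loss(weights, example):
    # A toy per-example loss: squared value of a linear projection
    return (example @ weights).pow(2).sum()

weights = torch.randn(5)
batch = torch.randn(16, 5)

# grad(loss) differentiates w.r.t. the first argument; vmap maps over the batch dimension
per_sample_grads = vmap(grad(loss), in_dims=(None, 0))(weights, batch)
print(per_sample_grads.shape)  # torch.Size([16, 5])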

        + +

        (Beta) Intel® VTune™ Profiler’s Instrumentation and Tracing Technology APIs (ITT) integration

        + +

PyTorch users can visualize the op-level timeline of PyTorch script execution in Intel® VTune™ Profiler when they need to analyze per-op performance with low-level performance metrics on Intel platforms.

        + +
        with torch.autograd.profiler.emit_itt():
        +    for i in range(10):
        +        torch.itt.range_push('step_{}'.format(i))
        +        model(input)
        +        torch.itt.range_pop()
        +
        + +

        +Learn more with our tutorial.

        + +

        (Beta) NNC: Add BF16 and Channels last support

        + +

TorchScript graph-mode inference performance on x86 CPUs is boosted by adding channels-last and BF16 support to NNC. PyTorch users may benefit from the channels-last optimization on the most popular x86 CPUs and from the BF16 optimization on Intel Cooper Lake and Sapphire Rapids processors. A >2X geomean performance boost is observed on a broad set of vision models with these two optimizations on Intel Cooper Lake processors.

        + +

The performance benefit can be obtained with the existing TorchScript, channels-last, and BF16 autocast APIs, as shown in the code snippet below. We will migrate the optimizations in NNC to the new PyTorch DL compiler, TorchInductor.

        + +

        + +
import torch
import torchvision.models as models

model = models.resnet50(pretrained=True)
# Convert the model to channels-last
model = model.to(memory_format=torch.channels_last)
model.eval()
data = torch.rand(1, 3, 224, 224)
# Convert the data to channels-last
data = data.to(memory_format=torch.channels_last)
# Enable autocast to run with BF16
with torch.cpu.amp.autocast(), torch.no_grad():
    # Trace the model
    model = torch.jit.trace(model, torch.rand(1, 3, 224, 224))
    model = torch.jit.freeze(model)
    # Run the traced model
    model(data)
        +
        + +

        (Beta) Support for M1 Devices

        + +

        Since v1.12, PyTorch has been offering native builds for Apple® silicon machines that use Apple’s new M1 chip as a prototype feature. In this release, we bring this feature to beta, providing improved support across PyTorch’s APIs.

        + +

        We now run tests for all submodules except torch.distributed on M1 macOS 12.6 instances. With this improved testing, we were able to fix features such as cpp extension and convolution correctness for certain inputs.

        + +

        To get started, just install PyTorch v1.13 on your Apple silicon Mac running macOS 12 or later with a native version (arm64) of Python. Learn more with our release notes.
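As a quick sanity check (my own snippet, not part of the original announcement), you can verify that the Apple silicon (MPS) backend is usable before moving work onto it:

import torch

# MPS is the Metal-based backend used by the Apple silicon builds for GPU acceleration
if torch.backends.mps.is_available():
    device = torch.device("mps")
    x = torch.ones(4, device=device)
    print((x * 2).to("cpu"))
else:
    print("MPS backend not available; falling back to CPU")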

        + +

        Prototype Features

        + +

        + +

        (Prototype) Arm® Compute Library (ACL) backend support for AWS Graviton

        + +

We achieved substantial improvements for CV and NLP inference on aarch64 CPUs by enabling an Arm Compute Library (ACL) backend for the PyTorch and torch-xla modules. Highlights include the following (a quick verification snippet follows the list):

        + +
          +
        • Enabled mkldnn + acl as the default backend for aarch64 torch wheel.
        • +
        • Enabled mkldnn matmul operator for aarch64 bf16 device.
        • +
• Brought the TensorFlow xla+ACL feature into torch-xla. We enhanced TensorFlow XLA with an Arm Compute Library runtime for aarch64 CPUs. These changes are included in TensorFlow master and the upcoming TF 2.10. Once the torch-xla repo is updated for the TensorFlow commit, torch-xla will gain compilation support as well. We observed a ~2.5-3x improvement for MLPerf BERT inference compared to the torch 1.12 wheel on Graviton3.
        • +
        + +
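To confirm that a given aarch64 wheel was built with the oneDNN (mkldnn) backend described above, a quick check such as the following can help (a generic sketch, not Graviton-specific):

import torch

print(torch.backends.mkldnn.is_available())  # True if the wheel was built with oneDNN/mkldnn support
print(torch.__config__.show())               # full build configuration string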

        (Prototype) CUDA Sanitizer

        + +

        When enabled, the sanitizer begins to analyze low-level CUDA operations invoked as a result of the user’s PyTorch code to detect data race errors caused by unsynchronized data access from different CUDA streams. The errors found are then printed along with stack traces of faulty accesses, much like Thread Sanitizer does. An example of a simple error and the output produced by the sanitizer can be viewed here. It will be especially useful for machine learning applications, where corrupted data can be easy to miss for a human and the errors may not always manifest themselves; the sanitizer will always be able to detect them.
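The kind of code the sanitizer is designed to flag looks roughly like the following sketch (my own example; the TORCH_CUDA_SANITIZER environment variable name is taken from the prototype documentation and should be treated as an assumption):

# Run under the sanitizer with something like: TORCH_CUDA_SANITIZER=1 python race.py
import torch

a = torch.zeros(10_000, device="cuda")
side = torch.cuda.Stream()

with torch.cuda.stream(side):
    a.add_(1)   # kernel enqueued on the side stream writes to `a`

b = a * 2       # the default stream reads `a` without waiting for the side stream
torch.cuda.synchronize()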

        + +

        (Prototype) Limited Python 3.11 support

        + +

        Binaries for Linux with Python 3.11 support are available to download via pip. Please follow the instructions on the get started page. Please note that Python 3.11 support is only a preview. In particular, features including Distributed, Profiler, FX and JIT might not be fully functional yet.

        + +
diff --git a/blog/PyTorchfoundation/index.html b/blog/PyTorchfoundation/index.html new file mode 100644 index 000000000000..11412fcaa8cf --- /dev/null +++ b/blog/PyTorchfoundation/index.html @@ -0,0 +1,658 @@
PyTorch strengthens its governance by joining the Linux Foundation | PyTorch

by Soumith Chintala

        +

        Today, I am proud to announce that PyTorch is moving to the Linux Foundation (LF) as a top-level project under the name PyTorch Foundation. The core mission of the Linux Foundation is the collaborative development of open source software. With a governing board of leaders from AMD, Amazon Web Services (AWS), Google Cloud, Meta, Microsoft Azure and NVIDIA, this model aligns with where PyTorch stands today and what it needs to travel forward. The creation of the PyTorch Foundation will ensure business decisions are being made in a transparent and open manner by a diverse group of members for years to come. The technical decisions remain in control of individual maintainers. I’m excited that the Linux Foundation will be our new home as they have notable experience supporting large open-source projects like ours such as Kubernetes and NodeJS. At this pivotal moment, I want to take a look back at how we started, share why we are moving, and what’s ahead.

        + +

        This January, PyTorch celebrated its 5 year anniversary! I reflected on what it meant to me in this tweet thread, and this conversation with my colleagues Mike Schroepfer, Lin Qiao, and Yann LeCun. When we started PyTorch development in 2016, it was a collective effort by a band of people from the [Lua]Torch community with a big chunk of people and funding from Meta and individuals contributing from NVIDIA, Twitter and other entities.

        + +

        Since 2017, PyTorch has grown far beyond our initial vision. With over 2,400 contributors who have built nearly 154,000 projects using PyTorch as a foundation, PyTorch has become one of the primary platforms for AI research, as well as commercial production use. We’ve seen its impact across industry and academia, from large companies to numerous university courses at Stanford, NYU, EPFL, Oxford, and other academic institutions. As a maintainer of PyTorch, the journey has been extremely fulfilling, with the impact of the project seen in various fields from self-driving cars to healthcare to aerospace.

        + +

        As PyTorch grew, many companies have made foundational investments around it. While Meta remains the largest contributor to PyTorch, companies such as AMD, Amazon Web Services (AWS), Google Cloud, HuggingFace, Lightning AI, Microsoft Azure, Nvidia, and many others have made significant investments, including both technical contributions and community building efforts. They’ve established teams around PyTorch or filled significant voids within the PyTorch community and sent countless contributions to the PyTorch core and to the ecosystem around it — PyTorch is an important part of their future. With PyTorch continuing to grow as a multi-stakeholder project, it’s time to move to a broader open-source foundation.

        + +

The business governance of PyTorch was fairly unstructured for quite some time after launch – we operated like a scrappy startup. Team members at Meta spent the time and energy to structure this properly and organize PyTorch into an organizationally healthier entity. Meta helped PyTorch introduce many structures, such as Contributor License Agreements, Branding Guidelines, and Trademark registration. Keeping PyTorch organizationally healthy is essential and beneficial for the community. The next stage of our organizational progress is to support the interests of multiple stakeholders, hence the move to a foundation. We chose the Linux Foundation because it has vast experience hosting large multi-stakeholder open-source projects with the right balance of organizational structure and finding specific solutions for these projects.

        + +

        Simultaneously, the technical governance of PyTorch has been a loosely structured community model of open-source development — A set of people maintaining PyTorch by area with their responsibility often tied to their individual identity rather than their employment. While we kept a codified list at the PyTorch - Maintainers page, the technical governance was not formalized nor codified. As PyTorch scales as a community, the next step is to structure and codify. The PyTorch Technical Governance now supports a hierarchical maintainer structure and clear outlining of processes around day to day work and escalations. This doesn’t change how we run things, but it does add discipline and openness that at our scale feels essential and timely.

        + +

        It’s been an exciting journey since 2016. I am grateful for the experiences and people I’ve met along the way. PyTorch started with a small group of contributors which have grown and diversified over the years, all bringing in new ideas and innovations that would not have been possible without our community. We want to continue the open-source spirit – for the community and by the community. Thank you to our contributors, maintainers, users, supporters and new foundation members. We look forward to the next chapter of PyTorch with the PyTorch Foundation.

        + + +
diff --git a/blog/a-better-transformer-for-fast-transformer-encoder-inference/index.html b/blog/a-better-transformer-for-fast-transformer-encoder-inference/index.html new file mode 100644 index 000000000000..ac13b2e4aa8c --- /dev/null +++ b/blog/a-better-transformer-for-fast-transformer-encoder-inference/index.html @@ -0,0 +1,715 @@
A BetterTransformer for Fast Transformer Inference | PyTorch

by Michael Gschwind, Eric Han, Scott Wolchok, Rui Zhu, Christian Puhrsch

        +

tl;dr Transformers achieve state-of-the-art performance for NLP, and are becoming popular for a myriad of other tasks. They are computationally expensive, which has been a blocker to their widespread productionisation. Launching with PyTorch 1.12, BetterTransformer implements a backwards-compatible fast path of torch.nn.TransformerEncoder for Transformer Encoder Inference and does not require model authors to modify their models. BetterTransformer improvements can exceed 2x in speedup and throughput for many common execution scenarios. To use BetterTransformer, install PyTorch 1.12 and start using high-quality, high-performance Transformer models with the PyTorch API today.

        + +

        + +

        + +

        +Diagram of the Transformer Encoder Architecture (from "Attention Is All You Need"). During Inference, the entire module will execute as a single PyTorch-native function. +

        + +

In this blog post, we cover the following topics: performance improvements, backwards compatibility, and taking advantage of the fastpath. Learn more about these topics below.

        + +

        Performance Improvements

        + +

BetterTransformer launches with accelerated native implementations of MultiHeadAttention and TransformerEncoderLayer for CPUs and GPUs. These fast paths are integrated in the standard PyTorch Transformer APIs, and will accelerate the TransformerEncoder, TransformerEncoderLayer and MultiHeadAttention nn.modules. These new modules implement two types of optimizations: (1) fused kernels that combine multiple individual operators normally used to implement Transformers into a more efficient implementation, and (2) exploiting sparsity in the inputs to avoid performing unnecessary operations on padding tokens. Padding tokens frequently account for a large fraction of input batches in many Transformer models used for natural language processing.

        + +

        Backwards compatibility

        + +

        Advantageously, no model changes are necessary to benefit from the performance boost offered by BetterTransformer. To benefit from fast path execution, inputs and operating conditions must satisfy some access conditions (see below). While the internal implementation of Transformer APIs has changed, PyTorch 1.12 maintains strict compatibility with Transformer modules shipped in previous versions, enabling PyTorch users to use models created and trained with previous PyTorch releases while benefiting from BetterTransformer improvements.

        + +

        In addition to enabling the PyTorch nn.Modules, BetterTransformer provides improvements for PyTorch libraries. Performance benefits will become available through two different enablement paths:

        + +
          +
        1. +

          Transparent acceleration: Current users of PyTorch nn.Modules such as MultiHeadAttention as well as higher-level Transformer components will benefit from the improved performance of the new nn.Modules automatically. An example of this is the visual transformer (ViT) implementation used in the torchvision library (code link).

          +
        2. +
        3. +

          Torchtext library acceleration: As part of this project, we have optimized Torchtext to build on the PyTorch core API to benefit from BetterTransformer enhancements while maintaining strict and transparent compatibility with previous library versions and models trained with previous Torchtext versions. Using PyTorch Transformers in Torchtext also ensures that Torchtext will benefit from expected future enhancements to the PyTorch Transformer implementation.

          +
        4. +
        + +

        Taking advantage of the Fastpath

        + +

        BetterTransformer is a fastpath for the PyTorch Transformer API. The fastpath is a native, specialized implementation of key Transformer functions for CPU and GPU that applies to common Transformer use cases.

        + +

        To take advantage of input sparsity (i.e. padding) in accelerating your model (see Figure 2), set the keyword argument enable_nested_tensor=True when instantiating a TransformerEncoder and pass in the src_key_padding_mask argument (which denotes padding tokens) during inference. This requires the padding mask to be contiguous, which is the typical case.
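A minimal sketch of that setup (my own example; the shapes and hyperparameters are illustrative only) is:

import torch

layer = torch.nn.TransformerEncoderLayer(d_model=256, nhead=8, batch_first=True)
encoder = torch.nn.TransformerEncoder(layer, num_layers=6, enable_nested_tensor=True)
encoder.eval()

src = torch.rand(32, 128, 256)                       # (batch, sequence, embedding)
padding_mask = torch.zeros(32, 128, dtype=torch.bool)
padding_mask[:, 96:] = True                          # True marks padding tokens

with torch.no_grad():                                # fastpath requires inference without gradients
    out = encoder(src, src_key_padding_mask=padding_mask)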

        + +

        Currently, the BetterTransformer speedup only applies to transformer encoder models used in inference. To benefit from fastpath execution, models must be composed of any of the following components: TransformerEncoder, TransformerEncoderLayer or MultiheadAttention (MHA). Fastpath execution is also subject to some criteria. Most importantly, the model must be executed in inference mode and operate on input tensors that do not collect gradient tape information (e.g., running with torch.no_grad). The full list of conditions can be found at these links for nn.MultiHeadAttention and nn.TransformerEncoder, respectively. If the criteria are not met, control flows to the legacy PyTorch 1.11 Transformer implementation which has the same API, but lacks the fastpath performance boost.

        + +

        Other transformer models (such as decoder models) which use the PyTorch MultiheadAttention module will benefit from the BetterTransformer fastpath. Planned future work is to expand the end-to-end BetterTransformer fastpath to models based on TransformerDecoder to support popular seq2seq and decoder-only (e.g., OPT) model architectures, and to training.

        + +

        Speedups

        + +

        The following graphs show the performance achieved for the BERT-base model with small and large-scale inputs:

        + +

        + +

        + +

        +Figure 1: PyTorch 1.12 Improvements with BetterTransformer fastpath execution +

        + +

        + +

        + +

        +Figure 2: PyTorch 1.12 Improvements with BetterTransformer fastpath execution
        +with sparsity optimization enabled by enable_nested_tensor=True
        +

        + +

        BetterTransformer includes two types of optimization: (1) fused kernels implementing multiple operations more efficiently in a single kernel, and (2) exploiting sparsity by avoiding unnecessary processing on padding tokens. Enhanced performance for small input sizes benefits primarily from the fused kernel implementations, and shows a constant performance improvement regardless of padding amount. While large inputs still benefit from fused kernels, the computation heavy processing limits the benefits that may be obtained by the fused kernels as baseline performance is already closer to the theoretical peak. However, as we increase the amount of padding, performance increases dramatically as increasingly large amounts of computation can be avoided by exploiting the sparsity introduced by padding in NLP workloads.

        + +

        Future Work

        + +

        As part of our ongoing work on PyTorch BetterTransformer, we are working on extending BetterTransformer improvements to Transformer Decoders. We aim to expand beyond inference to training as well.

        + +

        We are partnering to enable BetterTransformer on additional libraries such as FairSeq, MetaSeq, and HuggingFace to benefit all Transformer-based PyTorch models. We’ll provide future updates on the progress of BetterTransformer accelerations for the larger PyTorch ecosystem as part of this blog series.

        + +

        Acknowledgements: The authors would like to thank Lin Qiao, Ajit Mathews, Andrew Tulloch, Dmytro Dzhulgakov, Natalia Gimelshein, Emad El-Haraty, Mark Saroufim, Adnan Aziz, Geeta Chauhan, and Hamid Shojanazeri for their support, contributions and many helpful suggestions throughout the course of this project, and in the preparation of this blog.

        + +
diff --git a/blog/a-tour-of-pytorch-internals-1/index.html b/blog/a-tour-of-pytorch-internals-1/index.html new file mode 100644 index 000000000000..1f387b1550bf --- /dev/null +++ b/blog/a-tour-of-pytorch-internals-1/index.html @@ -0,0 +1,957 @@
A Tour of PyTorch Internals (Part I) | PyTorch

        May 11, 2017

        +

A Tour of PyTorch Internals (Part I)


by Trevor Killeen

        +

The fundamental unit in PyTorch is the Tensor. This post will serve as an overview of how we implement Tensors in PyTorch, such that the user can interact with them from the Python shell. In particular, we want to answer four main questions:

        + +
          +
        • How does PyTorch extend the Python interpreter to define a Tensor type that can be manipulated from Python code?
        • +
        • How does PyTorch wrap the C libraries that actually define the Tensor’s properties and methods?
        • +
        • How does PyTorch cwrap work to generate code for Tensor methods?
        • +
        • How does PyTorch’s build system take all of these components to compile and generate a workable application?
        • +
        + +

        Extending the Python Interpreter

        + +

        PyTorch defines a new package torch. In this post we will consider the ._C module. This module is known as an “extension module” - a Python module written in C. Such modules allow us to define new built-in object types (e.g. the Tensor) and to call C/C++ functions.

        + +

        The ._C module is defined in torch/csrc/Module.cpp. The init_C() / PyInit__C() function creates the module and adds the method definitions as appropriate. This module is passed around to a number of different __init() functions that add further objects to the module, register new types, etc.

        + +

        One collection of these __init() calls is the following:

        + +
        ASSERT_TRUE(THPDoubleTensor_init(module));
        +ASSERT_TRUE(THPFloatTensor_init(module));
        +ASSERT_TRUE(THPHalfTensor_init(module));
        +ASSERT_TRUE(THPLongTensor_init(module));
        +ASSERT_TRUE(THPIntTensor_init(module));
        +ASSERT_TRUE(THPShortTensor_init(module));
        +ASSERT_TRUE(THPCharTensor_init(module));
        +ASSERT_TRUE(THPByteTensor_init(module));
        +
        + +

        These __init() functions add the Tensor object for each type to the ._C module so that they can be used in the module. Let’s learn how these methods work.

        + +

        The THPTensor Type

        + +

        Much like the underlying TH and THC libraries, PyTorch defines a “generic” Tensor which is then specialized to a number of different types. Before considering how this specialization works, let’s first consider how defining a new type in Python works, and how we create the generic THPTensor type.

        + +

The Python runtime sees all Python objects as variables of type PyObject *, which serves as a “base type” for all Python objects. Every Python object contains the refcount for the object, and a pointer to the object’s type object. The type object determines the properties of the type. For example, it might contain a list of methods associated with the type, and which C functions get called to implement those methods. The object also contains any fields necessary to represent its state.

        + +

        The formula for defining a new type is as follows:

        + +
          +
        • Create a struct that defines what the new object will contain
        • +
        • Define the type object for the type
        • +
        + +

The struct itself could be very simple. In Python, all floating point types are actually objects on the heap. The Python float struct is defined as:

        +
        typedef struct {
        +    PyObject_HEAD
        +    double ob_fval;
        +} PyFloatObject;
        +
        +

        The PyObject_HEAD is a macro that brings in the code that implements an object’s reference counting, and a pointer to the corresponding type object. So in this case, to implement a float, the only other “state” needed is the floating point value itself.

        + +

        Now, let’s see the struct for our THPTensor type:

        +
        struct THPTensor {
        +    PyObject_HEAD
        +    THTensor *cdata;
        +};
        +
        +

        Pretty simple, right? We are just wrapping the underlying TH tensor by storing a pointer to it.

        + +

        The key part is defining the “type object” for a new type. An example definition of a type object for our Python float takes the form:

        +
        static PyTypeObject py_FloatType = {
        +    PyVarObject_HEAD_INIT(NULL, 0)
        +    "py.FloatObject",          /* tp_name */
        +    sizeof(PyFloatObject),     /* tp_basicsize */
        +    0,                         /* tp_itemsize */
        +    0,                         /* tp_dealloc */
        +    0,                         /* tp_print */
        +    0,                         /* tp_getattr */
        +    0,                         /* tp_setattr */
        +    0,                         /* tp_as_async */
        +    0,                         /* tp_repr */
        +    0,                         /* tp_as_number */
        +    0,                         /* tp_as_sequence */
        +    0,                         /* tp_as_mapping */
        +    0,                         /* tp_hash  */
        +    0,                         /* tp_call */
        +    0,                         /* tp_str */
        +    0,                         /* tp_getattro */
        +    0,                         /* tp_setattro */
        +    0,                         /* tp_as_buffer */
        +    Py_TPFLAGS_DEFAULT,        /* tp_flags */
        +    "A floating point number", /* tp_doc */
        +};
        +
        +

        The easiest way to think of a type object is as a set of fields which define the properties of the object. For example, the tp_basicsize field is set to sizeof(PyFloatObject). This is so that Python knows how much memory to allocate when calling PyObject_New() for a PyFloatObject. The full list of fields you can set is defined in object.h in the CPython backend: +https://github.com/python/cpython/blob/master/Include/object.h.

        + +

        The type object for our THPTensor is THPTensorType, defined in csrc/generic/Tensor.cpp. This object defines the name, size, mapping methods, etc. for a THPTensor.

        + +

        As an example, let’s take a look at the tp_new function we set in the PyTypeObject:

        + +
        PyTypeObject THPTensorType = {
        +  PyVarObject_HEAD_INIT(NULL, 0)
        +  ...
        +  THPTensor_(pynew), /* tp_new */
        +};
        +
        +

        The tp_new function enables object creation. It is responsible for creating (as opposed to initializing) objects of that type and is equivalent to the __new__() method at the Python level. The C implementation is a static method that is passed the type being instantiated and any arguments, and returns a newly created object.

        + +
        static PyObject * THPTensor_(pynew)(PyTypeObject *type, PyObject *args, PyObject *kwargs)
        +{
        +  HANDLE_TH_ERRORS
        +  Py_ssize_t num_args = args ? PyTuple_Size(args) : 0;
        +
        +  THPTensorPtr self = (THPTensor *)type->tp_alloc(type, 0);
        +// more code below
        +
        +

The first thing our new function does is allocate the THPTensor. It then runs through a series of initializations based on the args passed to the function. For example, when creating a THPTensor x from another THPTensor y, we set the newly created THPTensor’s cdata field to be the result of calling THTensor_(newWithTensor) with y’s underlying TH Tensor as an argument. Similar constructors exist for sizes, storages, NumPy arrays, and sequences.
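At the Python level, each of these constructor paths corresponds to a different way of calling the (legacy) typed constructors, for example (an illustrative sketch, not an exhaustive list):

import torch

a = torch.FloatTensor(2, 3)              # from sizes
b = torch.FloatTensor(a)                 # from another tensor (routes through newWithTensor)
c = torch.FloatTensor([[1, 2], [3, 4]])  # from a nested Python sequence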

        + +

        ** Note that we solely use tp_new, and not a combination of tp_new and tp_init (which corresponds to the __init__() function).

        + +

        The other important thing defined in Tensor.cpp is how indexing works. PyTorch Tensors support Python’s Mapping Protocol. This allows us to do things like:

        +
        x = torch.Tensor(10).fill_(1)
+y = x[3]  # y == 1
        +x[4] = 2
+# etc.
        +
        +

** Note that this indexing extends to Tensors with more than one dimension

        + +

        We are able to use the []-style notation by defining the three mapping methods described here.

        + +

The most important methods are THPTensor_(getValue) and THPTensor_(setValue), which describe how to index a Tensor, either returning a new Tensor/Scalar or updating the values of an existing Tensor in place. Read through these implementations to better understand how PyTorch supports basic tensor indexing.

        + +

        Generic Builds (Part One)

        + +

        We could spend a ton of time exploring various aspects of the THPTensor and how it relates to defining a new Python object. But we still need to see how the THPTensor_(init)() function is translated to the THPIntTensor_init() we used in our module initialization. How do we take our Tensor.cpp file that defines a “generic” Tensor and use it to generate Python objects for all the permutations of types? To put it another way, Tensor.cpp is littered with lines of code like:

        +
        return THPTensor_(New)(THTensor_(new)(LIBRARY_STATE_NOARGS));
        +
        +

        This illustrates both cases we need to make type-specific:

        + +
          +
        • Our output code will call THP<Type>Tensor_New(...) in place of THPTensor_(New)
        • +
        • Our output code will call TH<Type>Tensor_new(...) in place of THTensor_(new)
        • +
        + +

        In other words, for all supported Tensor types, we need to “generate” source code that has done the above substitutions. This is part of the “build” process for PyTorch. PyTorch relies on Setuptools (https://setuptools.readthedocs.io/en/latest/) for building the package, and we define a setup.py file in the top-level directory to customize the build process.

        + +

One component of building an Extension module using Setuptools is listing the source files involved in the compilation. However, our csrc/generic/Tensor.cpp file is not listed! So how does the code in this file end up being a part of the end product?

        + +

        Recall that we are calling the THPTensor* functions (such as init) from the directory above generic. If we take a look in this directory, there is another file Tensor.cpp defined. The last line of this file is important:

        +
        //generic_include TH torch/csrc/generic/Tensor.cpp
        +
        +

        Note that this Tensor.cpp file is included in setup.py, but it is wrapped in a call to a Python helper function called split_types. This function takes as input a file, and looks for the “//generic_include” string in the file contents. If it is found, it generates a new output file for each Tensor type, with the following changes:

        + +
          +
        • The output file is renamed to Tensor<Type>.cpp
        • +
        • The output file is slightly modified as follows:
        • +
        + +
        # Before:
        +//generic_include TH torch/csrc/generic/Tensor.cpp
        +
        +# After:
        +#define TH_GENERIC_FILE "torch/src/generic/Tensor.cpp"
        +#include "TH/THGenerate<Type>Type.h"
        +
        +

        Including the header file on the second line has the side effect of including the source code in Tensor.cpp with some additional context defined. Let’s take a look at one of the headers:

        + +
        #ifndef TH_GENERIC_FILE
        +#error "You must define TH_GENERIC_FILE before including THGenerateFloatType.h"
        +#endif
        +
        +#define real float
        +#define accreal double
        +#define TH_CONVERT_REAL_TO_ACCREAL(_val) (accreal)(_val)
        +#define TH_CONVERT_ACCREAL_TO_REAL(_val) (real)(_val)
        +#define Real Float
        +#define THInf FLT_MAX
        +#define TH_REAL_IS_FLOAT
        +#line 1 TH_GENERIC_FILE
        +#include TH_GENERIC_FILE
        +#undef accreal
        +#undef real
        +#undef Real
        +#undef THInf
        +#undef TH_REAL_IS_FLOAT
        +#undef TH_CONVERT_REAL_TO_ACCREAL
        +#undef TH_CONVERT_ACCREAL_TO_REAL
        +
        +#ifndef THGenerateManyTypes
        +#undef TH_GENERIC_FILE
        +#endif
        +
        + +

        What this is doing is bringing in the code from the generic Tensor.cpp file and surrounding it with the following macro definitions. For example, we define real as a float, so any code in the generic Tensor implementation that refers to something as a real will have that real replaced with a float. In the corresponding file THGenerateIntType.h, the same macro would replace real with int.

        + +

        These output files are returned from split_types and added to the list of source files, so we can see how the .cpp code for different types is created.

        + +

        There are a few things to note here: First, the split_types function is not strictly necessary. We could wrap the code in Tensor.cpp in a single file, repeating it for each type. The reason we split the code into separate files is to speed up compilation. Second, what we mean when we talk about the type replacement (e.g. replace real with a float) is that the C preprocessor will perform these substitutions during compilation. Merely surrounding the source code with these macros has no side effects until preprocessing.

        + +

        Generic Builds (Part Two)

        + +

        Now that we have source files for all the Tensor types, we need to consider how the corresponding header declarations are created, and also how the conversions from THTensor_(method) and THPTensor_(method) to TH<Type>Tensor_method and THP<Type>Tensor_method work. For example, csrc/generic/Tensor.h has declarations like:

        +
        THP_API PyObject * THPTensor_(New)(THTensor *ptr);
        +
        +

        We use the same strategy for generating code in the source files for the headers. In csrc/Tensor.h, we do the following:

        +
        #include "generic/Tensor.h"
        +#include <TH/THGenerateAllTypes.h>
        +
        +#include "generic/Tensor.h"
        +#include <TH/THGenerateHalfType.h>
        +
        +

        This has the same effect, where we draw in the code from the generic header, wrapped with the same macro definitions, for each type. The only difference is that the resulting code is contained all within the same header file, as opposed to being split into multiple source files.

        + +

        Lastly, we need to consider how we “convert” or “substitute” the function types. If we look in the same header file, we see a bunch of #define statements, including:

        +
        #define THPTensor_(NAME)            TH_CONCAT_4(THP,Real,Tensor_,NAME)
        +
        +

This macro says that any string in the source code matching the format THPTensor_(NAME) should be replaced with THPRealTensor_NAME, where Real is derived from whatever the symbol Real is #define‘d to be at the time. Because our header code and source code are surrounded by macro definitions for all the types as seen above, after the preprocessor has run, the resulting code is what we would expect. The code in the TH library defines the same macro for THTensor_(NAME), supporting the translation of those functions as well. In this way, we end up with header and source files with specialized code.

        + +

        Module Objects and Type Methods

        + +

Now that we have seen how we wrap TH’s Tensor definition in THP and generate THP methods such as THPFloatTensor_init(...), we can explore what the above code actually does in terms of the module we are creating. The key line in THPTensor_(init) is:

        +
        # THPTensorBaseStr, THPTensorType are also macros that are specific
        +# to each type
        +PyModule_AddObject(module, THPTensorBaseStr, (PyObject *)&THPTensorType);
        +
        +

        This function registers our Tensor objects to the extension module, so we can use THPFloatTensor, THPIntTensor, etc. in our Python code.

        + +

        Just being able to create Tensors isn’t very useful - we need to be able to call all the methods that TH defines. A simple example shows calling the in-place zero_ method on a Tensor.

        +
        x = torch.FloatTensor(10)
        +x.zero_()
        +
        +

        Let’s start by seeing how we add methods to newly defined types. One of the fields in the “type object” is tp_methods. This field holds an array of method definitions (PyMethodDefs) and is used to associate methods (and their underlying C/C++ implementations) with a type. Suppose we wanted to define a new method on our PyFloatObject that replaces the value. We could implement this as follows:

        +
        static PyObject * replace(PyFloatObject *self, PyObject *args) {
        +	double val;
        +	if (!PyArg_ParseTuple(args, "d", &val))
        +		return NULL;
        +	self->ob_fval = val;
+	Py_RETURN_NONE;
        +}
        +
        +

        This is equivalent to the Python method:

        +
        def replace(self, val):
        +	self.ob_fval = val
        +
        +

        It is instructive to read more about how defining methods works in CPython. In general, methods take as the first parameter the instance of the object, and optionally parameters for the positional arguments and keyword arguments. This static function is registered as a method on our float:

        +
        static PyMethodDef float_methods[] = {
        +	{"replace", (PyCFunction)replace, METH_VARARGS,
        +	"replace the value in the float"
        +	},
        +	{NULL} /* Sentinel */
+};
        +
        +

        This registers a method called replace, which is implemented by the C function of the same name. The METH_VARARGS flag indicates that the method takes a tuple of arguments representing all the arguments to the function. This array is set to the tp_methods field of the type object, and then we can use the replace method on objects of that type.

        + +

        We would like to be able to call all of the methods for TH tensors on our THP tensor equivalents. However, writing wrappers for all of the TH methods would be time-consuming and error prone. We need a better way to do this.

        + +

        PyTorch cwrap

        + +

        PyTorch implements its own cwrap tool to wrap the TH Tensor methods for use in the Python backend. We define a .cwrap file containing a series of C method declarations in our custom YAML format. The cwrap tool takes this file and outputs .cpp source files containing the wrapped methods in a format that is compatible with our THPTensor Python object and the Python C extension method calling format. This tool is used to generate code to wrap not only TH, but also CuDNN. It is defined to be extensible.

        + +

        An example YAML “declaration” for the in-place addmv_ function is as follows:

        +
        [[
        +  name: addmv_
        +  cname: addmv
        +  return: self
        +  arguments:
        +    - THTensor* self
        +    - arg: real beta
        +      default: AS_REAL(1)
        +    - THTensor* self
        +    - arg: real alpha
        +      default: AS_REAL(1)
        +    - THTensor* mat
        +    - THTensor* vec
        +]]
        +
        +

        The architecture of the cwrap tool is very simple. It reads in a file, and then processes it with a series of plugins. See tools/cwrap/plugins/__init__.py for documentation on all the ways a plugin can alter the code.

        + +

        The source code generation occurs in a series of passes. First, the YAML “declaration” is parsed and processed. Then the source code is generated piece-by-piece - adding things like argument checks and extractions, defining the method header, and the actual call to the underlying library such as TH. Finally, the cwrap tool allows for processing the entire file at a time. The resulting output for addmv_ can be explored here.

        + +

        In order to interface with the CPython backend, the tool generates an array of PyMethodDefs that can be stored or appended to the THPTensor’s tp_methods field.

        + +

        In the specific case of wrapping Tensor methods, the build process first generates the output source file from TensorMethods.cwrap. This source file is #include‘d in the generic Tensor source file. This all occurs before the preprocessor does its magic. As a result, all of the method wrappers that are generated undergo the same pass as the THPTensor code above. Thus a single generic declaration and definition is specialized for each type as well.

        + +

        Putting It All Together

        + +

        So far, we have shown how we extend the Python interpreter to create a new extension module, how such a module defines our new THPTensor type, and how we can generate source code for Tensors of all types that interface with TH. Briefly, we will touch on compilation.

        + +

        Setuptools allows us to define an Extension for compilation. The entire torch._C extension is compiled by collecting all of the source files, header files, libraries, etc. and creating a setuptools Extension. Then setuptools handles building the extension itself. I will explore the build process more in a subsequent post.
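As a highly simplified, hypothetical sketch of that pattern (PyTorch’s real Extension collects far more sources, include directories, libraries, and compile flags), a setup.py along these lines is all Setuptools needs:

from setuptools import setup, Extension

C = Extension(
    name="torch._C",                       # import path of the compiled extension module
    sources=["torch/csrc/Module.cpp"],     # in reality, a long (partly generated) list of sources
    include_dirs=["torch/lib/include"],
    language="c++",
)

setup(name="torch", ext_modules=[C])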

        + +

        To summarize, let’s revisit our four questions:

        + +
          +
        • How does PyTorch extend the Python interpreter to define a Tensor type that can be manipulated from Python code?
        • +
        + +

        It uses CPython’s framework for extending the Python interpreter and defining new types, while taking special care to generate code for all types.

        + +
          +
        • How does PyTorch wrap the C libraries that actually define the Tensor’s properties and methods?
        • +
        + +

        It does so by defining a new type, THPTensor, that is backed by a TH Tensor. Function calls are forwarded to this tensor via the CPython backend’s conventions.

        + +
          +
        • How does PyTorch cwrap work to generate code for Tensor methods?
        • +
        + +

        It takes our custom YAML-formatted code and generates source code for each method by processing it through a series of steps using a number of plugins.

        + +
          +
        • How does PyTorch’s build system take all of these components to compile and generate a workable application?
        • +
        + +

        It takes a bunch of source/header files, libraries, and compilation directives to build an extension using Setuptools.

        + +

This is just a snapshot of parts of the build system for PyTorch. There is more nuance and detail, but I hope this serves as a gentle introduction to a lot of the components of our Tensor library.

        + +

        Resources:

        + + + +
diff --git a/blog/a-tour-of-pytorch-internals-2/index.html b/blog/a-tour-of-pytorch-internals-2/index.html new file mode 100644 index 000000000000..436351f2f4e7 --- /dev/null +++ b/blog/a-tour-of-pytorch-internals-2/index.html @@ -0,0 +1,1203 @@
PyTorch Internals Part II - The Build System | PyTorch

by Trevor Killeen

        +

        In the first post I explained how we generate a torch.Tensor object that you can use in your Python interpreter. Next, I will explore the build system for PyTorch. The PyTorch codebase has a variety of components:

        + +
          +
        • The core Torch libraries: TH, THC, THNN, THCUNN
        • +
        • Vendor libraries: CuDNN, NCCL
        • +
        • Python Extension libraries
        • +
        • Additional third-party libraries: NumPy, MKL, LAPACK
        • +
        + +

        How does a simple invocation of python setup.py install do the work that allows you to call import torch and use the PyTorch library in your code?

        + +

The first part of this document will explain the build process from an end-user point of view. This will explain how we take the components above to build the library. The second part of the document will be important for PyTorch developers. It will document ways to improve your iteration speed by building only a subset of the code that you are working on.

        + +

Setuptools and PyTorch’s setup() function

        + +

        Python uses Setuptools to build the library. Setuptools is an extension to the original distutils system from the core Python library. The core component of Setuptools is the setup.py file which contains all the information needed to build the project. The most important function is the setup() function which serves as the main entry point. Let’s take a look at the one in PyTorch:

        + +
        setup(name="torch", version=version,
        +      description="Tensors and Dynamic neural networks in Python with strong GPU acceleration",
        +      ext_modules=extensions,
        +      cmdclass={
        +          'build': build,
        +          'build_py': build_py,
        +          'build_ext': build_ext,
        +          'build_deps': build_deps,
        +          'build_module': build_module,
        +          'develop': develop,
        +          'install': install,
        +          'clean': clean,
        +      },
        +      packages=packages,
        +      package_data={'torch': [
        +          'lib/*.so*', 'lib/*.dylib*',
        +          'lib/torch_shm_manager',
        +          'lib/*.h',
        +          'lib/include/TH/*.h', 'lib/include/TH/generic/*.h',
        +          'lib/include/THC/*.h', 'lib/include/THC/generic/*.h']},
        +      install_requires=['pyyaml'],
        +      )
        +
        + +

        The function is composed entirely of keyword arguments, which serve two purposes:

        + +
          +
        • Metadata (e.g. name, description, version)
        • +
        • The contents of the package
        • +
        + +

        We are concerned with #2. Let’s break down the individual components:

        + +
          +
        • ext_modules: Python modules are either “pure” modules, containing only Python code, or “extension” modules written in the low-level language of the Python implementation. Here we are listing the extension modules in the build, including the main torch._C library that contains our Python Tensor
        • +
        • cmdclass: When using the setup.py script from the command line, the user must specify one or more “commands”, code snippets that perform a specific action. For example, the “install” command builds and installs the package. This mapping routes specific commands to functions in setup.py that implement them
        • +
        • packages: The list of packages in the project. These are “pure” - i.e. they only contain Python code. These are defined elsewhere in setup.py
        • +
        • package_data: Additional files that need to be installed into a package: in this case the header files and shared libraries that the build will generate must be included in our installation
        • +
        • install_requires: In order to build PyTorch, we need pyyaml. Setuptools will handle making sure that pyyaml will be available, downloading and installing it if necessary
        • +
        + +

        We will consider these components in more detail, but for now it is instructive to look at the end product of an installation – i.e. what Setuptools does after building the code.

        + +

        site_packages

        + +

Third party packages are by default installed into the lib/<version>/site_packages directory associated with your Python binary. For example, because I am using a Miniconda environment, my Python binary is found at:

        + +
        (p3) killeent@devgpu047:pytorch (master)$ which python
        +~/local/miniconda2/envs/p3/bin/python
        +
        +

        And thus packages are installed into:

        + +
        /home/killeent/local/miniconda2/envs/p3/lib/python3.6/site-packages
        +
        +

Having installed PyTorch, let’s take a look at the torch folder in site-packages:

        + +
        (p3) killeent@devgpu047:site-packages$ cd torch
        +(p3) killeent@devgpu047:torch$ ls
        +autograd  backends  _C.cpython-36m-x86_64-linux-gnu.so  cuda  distributed  _dl.cpython-36m-x86_64-linux-gnu.so  functional.py  __init__.py  legacy  lib  multiprocessing  nn  optim  __pycache__  serialization.py  _six.py  sparse  storage.py  _tensor_docs.py  tensor.py  _tensor_str.py  _thnn  _torch_docs.py  utils  _utils.py  version.py
        +
        + +

        Note that everything we would expect to be here is here:

        + +
          +
• All the “pure” packages from setup.py are here
        • +
        • The extension libraries are here - the ._C* and ._dl* shared libraries
        • +
        • The package_data is here: the contents of lib/ match exactly what we described in the setup function:
        • +
        + +
        (p3) killeent@devgpu047:torch$ ls lib/
        +include     libnccl.so.1  libTHC.so.1   libTHCUNN.so.1  libTHNN.so.1  libTH.so.1   THCUNN.h  torch_shm_manager libnccl.so  libshm.so     libTHCS.so.1  libTHD.so.1     libTHPP.so.1  libTHS.so.1  THNN.h
        +
        + +

The Python interpreter looks into site-packages during an import. If we call import torch in our Python code, it will find the module here and initialize and import it. You can read more about the import system here.
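As a quick sanity check of where the interpreter will resolve torch from, we can ask the import machinery directly (a small illustrative snippet; the printed path will vary by environment):

import importlib.util

# Ask the import system where "torch" would be loaded from,
# without actually importing it.
spec = importlib.util.find_spec("torch")
print(spec.origin)  # e.g. .../site-packages/torch/__init__.py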

        + +

        Building Individual Parts

        + +

        Next, we will look at the various individual components of the build from start to finish. This will illustrate how we combine all the code we mentioned in the introduction.

        + +

        Backend Torch and Vendor Libraries

        + +

        Let’s take a look at the install cmd override in PyTorch’s setup.py:

        + +
        class install(setuptools.command.install.install):
        +
        +    def run(self):
        +        if not self.skip_build:
        +            self.run_command('build_deps')
        +        setuptools.command.install.install.run(self)
        +
        + +

We note that the first thing it does is run a command called “build_deps” - let’s take a look at its run() method:

        + +
        def run(self):
        +        from tools.nnwrap import generate_wrappers as generate_nn_wrappers
        +        build_all_cmd = ['bash', 'torch/lib/build_all.sh']
        +        if WITH_CUDA:
        +            build_all_cmd += ['--with-cuda']
        +        if WITH_NCCL and not SYSTEM_NCCL:
        +            build_all_cmd += ['--with-nccl']
        +        if WITH_DISTRIBUTED:
        +            build_all_cmd += ['--with-distributed']
        +        if subprocess.call(build_all_cmd) != 0:
        +            sys.exit(1)
        +        generate_nn_wrappers()
        +
        + +

Here we note that we have a shell script build_all.sh in the torch/lib/ directory. The script is configured by flags indicating whether we are on a system with CUDA enabled, whether the NCCL library is enabled, and whether PyTorch’s distributed library is enabled.

        + +

        Let’s take a look in torch/lib:

        + +
        (p3) killeent@devgpu047:lib (master)$ ls
        +build_all.sh  libshm  nccl  README.md  TH  THC  THCS  THCUNN  THD  THNN  THPP  THS
        +
        + +

        Here we see the directories for all the backend libraries. TH, THC, THNN, THCUNN, and nccl are git subtrees that are in sync with the libraries in e.g. github.com/torch. THS, THCS, THD, THPP and libshm are libraries specific to PyTorch. All of the libraries contain CMakeLists.txt - indicating they are built with CMake.

        + +

The build_all.sh script essentially runs the CMake configure step on each of these libraries, followed by make install. Let’s run ./build_all.sh and see what we are left with:

        + +
        (p3) killeent@devgpu047:lib (master)$ ./build_all.sh --with-cuda --with-nccl --with-distributed
        +[various CMake output logs]
        +(p3) killeent@devgpu047:lib (master)$ ls
        +build  build_all.sh  include  libnccl.so  libnccl.so.1  libshm  libshm.so  libTHC.so.1  libTHCS.so.1  libTHCUNN.so.1  libTHD.so.1  libTHNN.so.1  libTHPP.so.1  libTH.so.1  libTHS.so.1  nccl  README.md  TH  THC  THCS  THCUNN  THCUNN.h  THD  THNN  THNN.h  THPP  THS  tmp_install  torch_shm_manager
        +
        + +

        Now there are a number of extra things in the directory:

        + +
          +
        • Shared library files for each library
        • +
        • Headers for THNN and THCUNN
        • +
        • build and tmp_install directories
        • +
        • The torch_shm_manager executable
        • +
        + +

        Let’s explore further. In the shell script, we create the build directory and a subdir for each library to build:

        + +
        # We create a build directory for the library, which will
        +# contain the cmake output. $1 is the library to be built
        +  mkdir -p build/$1
        +  cd build/$1
        +
        + +

        Thus e.g. build/TH contains the CMake configuration output including the Makefile for building TH, and also the result of running make install in this directory.

        + +

        Let’s also look at tmp_install:

        + +
        (p3) killeent@devgpu047:lib (master)$ ls tmp_install/
        +bin  include  lib  share
        +
        + +

        tmp_install looks like a standard install directory containing binaries, header files and library files. For example, tmp_install/include/TH contains all the TH headers, and tmp_install/lib/ contains the libTH.so.1 file.

        + +

        So why have this directory? It is used to compile the libraries that depend on each other. For example, the THC library depends on the TH library and its headers. This is referenced in the build shell script as arguments to the cmake command:

        + +
        # install_dir is tmp_install
        +cmake ...
        +	-DTH_INCLUDE_PATH="$INSTALL_DIR/include" \
        +	-DTH_LIB_PATH="$INSTALL_DIR/lib" \
        +
        + +

        And indeed if we look at the THC library we built:

        + +
        (p3) killeent@devgpu047:lib (master)$ ldd libTHC.so.1
        +	...
        +	libTH.so.1 => /home/killeent/github/pytorch/torch/lib/tmp_install/lib/./libTH.so.1 (0x00007f84478b7000)
        +
        + +

The way build_all.sh specifies the include and library paths is a little messy, but this is representative of the overall idea. Finally, at the end of the script:

        + +
        # If all the builds succeed we copy the libraries, headers,
        +# binaries to torch/lib
        +cp $INSTALL_DIR/lib/* .
        +cp THNN/generic/THNN.h .
        +cp THCUNN/generic/THCUNN.h .
        +cp -r $INSTALL_DIR/include .
        +cp $INSTALL_DIR/bin/* .
        +
        + +

        As we can see, at the end, we copy everything to the top-level torch/lib directory - explaining the contents we saw above. We’ll see why we do this next:

        + +

        NN Wrappers

        + +

        Briefly, let’s touch on the last part of the build_deps command: generate_nn_wrappers(). We bind into the backend libraries using PyTorch’s custom cwrap tooling, which we touched upon in a previous post. For binding TH and THC we manually write the YAML declarations for each function. However, due to the relative simplicity of the THNN and THCUNN libraries, we auto-generate both the cwrap declarations and the resulting C++ code.

        + +

        The reason we copy the THNN.h and THCUNN.h header files into torch/lib is that this is where the generate_nn_wrappers() code expects these files to be located. generate_nn_wrappers() does a few things:

        + +
          +
        1. Parses the header files, generating cwrap YAML declarations and writing them to output .cwrap files
        2. +
        3. Calls cwrap with the appropriate plugins on these .cwrap files to generate source code for each
        4. +
        5. Parses the headers a second time to generate THNN_generic.h - a library that takes THPP Tensors, PyTorch’s “generic” C++ Tensor Library, and calls into the appropriate THNN/THCUNN library function based on the dynamic type of the Tensor
        6. +
        + +

        If we take a look into torch/csrc/nn after running generate_nn_wrappers() we can see the output:

        + +
        (p3) killeent@devgpu047:nn (master)$ ls
        +THCUNN.cpp  THCUNN.cwrap  THNN.cpp  THNN.cwrap  THNN_generic.cpp  THNN_generic.cwrap  THNN_generic.h  THNN_generic.inc.h
        +
        + +

For example, the code generates cwrap declarations like:

        + +
        [[
        +  name: FloatBatchNormalization_updateOutput
        +  return: void
        +  cname: THNN_FloatBatchNormalization_updateOutput
        +  arguments:
        +    - void* state
        +    - THFloatTensor* input
        +    - THFloatTensor* output
        +    - type: THFloatTensor*
        +      name: weight
        +      nullable: True
        +    - type: THFloatTensor*
        +      name: bias
        +      nullable: True
        +    - THFloatTensor* running_mean
        +    - THFloatTensor* running_var
        +    - THFloatTensor* save_mean
        +    - THFloatTensor* save_std
        +    - bool train
        +    - double momentum
        +    - double eps
        +]]
        +
        + +

        with corresponding .cpp:

        + +
        extern "C" void THNN_FloatBatchNormalization_updateOutput(void*, THFloatTensor*, THFloatTensor*, THFloatTensor*, THFloatTensor*, THFloatTensor*, THFloatTensor*, THFloatTensor*, THFloatTensor*, bool, double, double);
        +
        +PyObject * FloatBatchNormalization_updateOutput(PyObject *_unused, PyObject *args) {
        +	// argument checking, unpacking
        +	 PyThreadState *_save = NULL;
        +      try {
        +        Py_UNBLOCK_THREADS;
        +        THNN_FloatBatchNormalization_updateOutput(arg_state, arg_input, arg_output, arg_weight, arg_bias, arg_running_mean, arg_running_var, arg_save_mean, arg_save_std, arg_train, arg_momentum, arg_eps);
        +        Py_BLOCK_THREADS;
        +        Py_RETURN_NONE;
        +      } catch (...) {
        +        if (_save) {
        +          Py_BLOCK_THREADS;
        +        }
        +        throw;
        +      }
        +
        +    ...
        +}
        +
        + +

        In the THPP generated code, the function looks like this:

        + +
        void BatchNormalization_updateOutput(thpp::Tensor* input, thpp::Tensor* output, thpp::Tensor* weight, thpp::Tensor* bias, thpp::Tensor* running_mean, thpp::Tensor* running_var, thpp::Tensor* save_mean, thpp::Tensor* save_std, bool train, double momentum, double eps) {
        +	// Call appropriate THNN function based on tensor type, whether its on CUDA, etc.
        +}
        +
        + +

        We will look a little more at how these source files are used later.

        + +

        “Building” the Pure Python Modules

        + +

        Now that we have built the backend libraries (the “dependencies”) we can move forward with building the actual PyTorch code. The next Setuptools command that runs is build_py, which is used to build all the “Pure” python modules in our library. These are the “packages” passed to setup.py.

        + +

The packages are found using Setuptools’ utility function find_packages():

        + +
        packages = find_packages(exclude=('tools.*',))
        +['torch', 'torch._thnn', 'torch.autograd', 'torch.backends', 'torch.cuda', 'torch.distributed', 'torch.legacy', 'torch.multiprocessing', 'torch.nn', 'torch.optim', 'torch.sparse', 'torch.utils', 'torch.autograd._functions', 'torch.backends.cudnn', 'torch.legacy.nn', 'torch.legacy.optim', 'torch.nn._functions', 'torch.nn.backends', 'torch.nn.modules', 'torch.nn.parallel', 'torch.nn.utils', 'torch.nn._functions.thnn', 'torch.utils.data', 'torch.utils.ffi', 'torch.utils.serialization', 'torch.utils.trainer', 'torch.utils.backcompat', 'torch.utils.trainer.plugins']
        +
        + +

As we can see, find_packages has recursively traversed the torch directory, finding all the directory paths that have an __init__.py file.
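Conceptually, find_packages() just walks the source tree collecting every directory that contains an __init__.py. A rough sketch of the idea (not Setuptools’ actual implementation, which also handles exclude patterns like the 'tools.*' one above):

import os

def find_packages_sketch(root="torch"):
    # Collect every directory under `root` that contains an __init__.py,
    # converting paths like torch/nn/modules into dotted package names.
    packages = []
    for dirpath, dirnames, filenames in os.walk(root):
        if "__init__.py" in filenames:
            packages.append(dirpath.replace(os.sep, "."))
    return packages

print(find_packages_sketch())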

        + +

        When building with Setuptools, the tool creates a build directory in the distribution root, i.e. the same location as the setup.py file. Because PyTorch is composed of both “Pure” python modules and Extension Modules, we need to preserve information about the Operating System and Python version used when performing the build. So if we look in my build directory, we see:

        + +
        (p3) killeent@devgpu047:pytorch (master)$ ls build
        +lib.linux-x86_64-3.6  temp.linux-x86_64-3.6
        +
        + +

This indicates that I’ve built the project on linux-x86_64 using Python 3.6. The lib directory contains the library files, while the temp directory contains files generated during the build that aren’t needed in the final installation.

        + +

        Because “Pure” python modules are just Python code, and don’t need to be “compiled”, the build_py process simply copies files from their locations as found by find_packages to the equivalent location in build/. So our build output is littered with lines like:

        + +
        copying torch/autograd/_functions/blas.py -> build/lib.linux-x86_64-3.6/torch/autograd/_functions
        +
        + +

        We also noted earlier that we could pass files and directories to the package_data keyword argument to the main setup() function, and that Setuptools would handle copying those files to the installation location. During build_py, these files are copied to the build/ directory, so we also see lines like:

        + +
        copying torch/lib/libTH.so.1 -> build/lib.linux-x86_64-3.6/torch/lib
        +...
        +copying torch/lib/include/THC/generic/THCTensor.h -> build/lib.linux-x86_64-3.6/torch/lib/include/THC/generic
        +
        + +

        Building the Extension Modules

        + +

        Finally, we need to build the Extension Modules, i.e. the PyTorch modules written in C++ using the CPython backend. This also constitutes the majority of the code logic in setup.py. Our overridden build_ext Command has some special logic before the extensions themselves are actually built:

        + +
        from tools.cwrap import cwrap
        +from tools.cwrap.plugins.THPPlugin import THPPlugin
        +from tools.cwrap.plugins.ArgcountSortPlugin import ArgcountSortPlugin
        +from tools.cwrap.plugins.AutoGPU import AutoGPU
        +from tools.cwrap.plugins.BoolOption import BoolOption
        +from tools.cwrap.plugins.KwargsPlugin import KwargsPlugin
        +from tools.cwrap.plugins.NullableArguments import NullableArguments
        +from tools.cwrap.plugins.CuDNNPlugin import CuDNNPlugin
        +from tools.cwrap.plugins.WrapDim import WrapDim
        +from tools.cwrap.plugins.AssertNDim import AssertNDim
        +from tools.cwrap.plugins.Broadcast import Broadcast
        +from tools.cwrap.plugins.ProcessorSpecificPlugin import ProcessorSpecificPlugin
        +        thp_plugin = THPPlugin()
        +        cwrap('torch/csrc/generic/TensorMethods.cwrap', plugins=[
        +            ProcessorSpecificPlugin(), BoolOption(), thp_plugin,
        +            AutoGPU(condition='IS_CUDA'), ArgcountSortPlugin(), KwargsPlugin(),
        +            AssertNDim(), WrapDim(), Broadcast()
        +        ])
        +        cwrap('torch/csrc/cudnn/cuDNN.cwrap', plugins=[
        +            CuDNNPlugin(), NullableArguments()
        +        ])
        +
        + +

        Recall above that I documented that we auto-generated C++ code for calling into the THNN etc. libraries. Here is where we bind TH, THC and CuDNN. We take the YAML declarations in TensorMethods.cwrap, and use them to generate output C++ source files that contain implementations that work within PyTorch’s C++ Ecosystem. For example, a simple declaration like zero_:

        + +
        [[
        +  name: zero_
        +  cname: zero
        +  return: self
        +  arguments:
        +    - THTensor* self
        +]]
        +
        + +

        Generates code like:

        + +
         PyObject * THPTensor_(zero_)(PyObject *self, PyObject *args, PyObject *kwargs) {
        +	...
        +	THTensor_(zero)(LIBRARY_STATE arg_self);
        +	...
        +}
        +
        + +

In the previous post we documented how these functions are tied to specific Tensor types, so I won’t expand on that here. For the build process it’s enough to know that these C++ files are generated prior to the extension being built, because these source files are used during Extension compilation.

        + +

        Specifying the Extensions

        + +

Unlike pure modules, it’s not enough just to list modules or packages and expect Setuptools to go out and find the right files; you have to specify the extension name, source file(s), and any compile/link requirements (include directories, libraries to link with, etc.).

        + +

The bulk (~200 LOC at the time of this writing) of setup.py goes into specifying how to build these Extensions. Here, some of the choices we make in build_all.sh begin to make sense. For example, we saw that our build script specified a tmp_install directory where we installed our backend libraries. In our setup.py code, we reference this directory when adding to the list of directories containing header files to include:

        + +
        # tmp_install_path is torch/lib/tmp_install
        +include_dirs += [
        +    cwd,
        +    os.path.join(cwd, "torch", "csrc"),
        +    tmp_install_path + "/include",
        +    tmp_install_path + "/include/TH",
        +    tmp_install_path + "/include/THPP",
        +    tmp_install_path + "/include/THNN",
        +
        + +

Similarly, we copied the shared object libraries to torch/lib at the end of the build_all.sh script. We reference these locations directly in our setup.py code when identifying libraries that we may link against:

        + +
        # lib_path is torch/lib
        +TH_LIB = os.path.join(lib_path, 'libTH.so.1')
        +THS_LIB = os.path.join(lib_path, 'libTHS.so.1')
        +THC_LIB = os.path.join(lib_path, 'libTHC.so.1')
        +THCS_LIB = os.path.join(lib_path, 'libTHCS.so.1')
        +THNN_LIB = os.path.join(lib_path, 'libTHNN.so.1')
        +# ...
        +
        + +

        Let’s consider how we build the main torch._C Extension Module:

        + +
        C = Extension("torch._C",
        +              libraries=main_libraries,
        +              sources=main_sources,
        +              language='c++',
        +              extra_compile_args=main_compile_args + extra_compile_args,
        +              include_dirs=include_dirs,
        +              library_dirs=library_dirs,
        +              extra_link_args=extra_link_args + main_link_args + [make_relative_rpath('lib')],
        +              )
        +
        + +
          +
        • The main libraries are all the libraries we link against. This includes things like shm, PyTorch’s shared memory management library, and also system libraries like cudart and cudnn. Note that the TH libraries are not listed here
        • +
        • The main sources are the C++ files that make up the C++ backend for PyTorch
        • +
        • The compile args are various flags that configure compilation. For example, we might want to add debug flags when compiling in debug mode
        • +
        • The include dirs are the paths to all the directories containing header files. This is also another example where the build_all.sh script is important - for example, we look for the TH header files in torch/lib/tmp_install/include/TH - which is the install location we specified with our CMake configuration
        • +
        • The library dirs are directories to search for shared libraries at link time. For example, we include torch/lib - the location we copied our .so files to at the end of build_all.sh, but also the paths to the CUDA and CuDNN directories
        • +
• The link arguments are used when linking object files together to create the extension. In PyTorch, this includes normal options like deciding to link libstdc++ statically. However, there is one key component: this is where we link the backend TH libraries. Note that we have lines like:
        • +
        + +
        # The explicit paths to .so files we described above
        +main_link_args = [TH_LIB, THS_LIB, THPP_LIB, THNN_LIB]
        +
        + +

        You might be wondering why we do this as opposed to adding these libraries to the list we pass to the libraries keyword argument. After all, that is a list of libraries to link against. The issue is that Lua Torch installs often set the LD_LIBRARY_PATH variable, and thus we could mistakenly link against a TH library built for Lua Torch, instead of the library we have built locally. This would be problematic because the code could be out of date, and also there are various configuration options for Lua Torch’s TH that would not play nicely with PyTorch.

        + +

        As such, we manually specify the paths to the shared libraries we generated directly to the linker.

        + +

There are other extensions needed to power PyTorch, and they are built in a similar way. The Setuptools library invokes the C++ compiler and linker to build all of these extensions. If the builds succeed, we have successfully built the PyTorch library and we can move on to installation.

        + +

        Installation

        + +

        After building has finished, installation is quite simple. We simply have to copy everything from our build/lib.linux-x86_64-3.6 directory to the appropriate installation directory. Recall that we noted above that this directory is the site_packages directory associated with our Python binary. As a result, we see lines like:

        + +
        running install_lib
        +creating /home/killeent/local/miniconda2/envs/p3/lib/python3.6/site-packages/torch
        +copying build/lib.linux-x86_64-3.6/torch/_C.cpython-36m-x86_64-linux-gnu.so -> /home/killeent/local/miniconda2/envs/p3/lib/python3.6/site-packages/torch
        +copying build/lib.linux-x86_64-3.6/torch/_dl.cpython-36m-x86_64-linux-gnu.so -> /home/killeent/local/miniconda2/envs/p3/lib/python3.6/site-packages/torch
        +creating /home/killeent/local/miniconda2/envs/p3/lib/python3.6/site-packages/torch/_thnn
        +copying build/lib.linux-x86_64-3.6/torch/_thnn/_THNN.cpython-36m-x86_64-linux-gnu.so -> /home/killeent/local/miniconda2/envs/p3/lib/python3.6/site-packages/torch/_thnn
        +copying build/lib.linux-x86_64-3.6/torch/_thnn/_THCUNN.cpython-36m-x86_64-linux-gnu.so -> /home/killeent/local/miniconda2/envs/p3/lib/python3.6/site-packages/torch/_thnn
        +
        + +

Finally, let’s power up the Python interpreter. When the Python interpreter executes an import statement, it searches for Python code and extension modules along a search path. A default value for the path is configured into the Python binary when the interpreter is built.

        + +
        # note we are now in my home directory
        +(p3) killeent@devgpu047:~$ python
        +Python 3.6.1 |Continuum Analytics, Inc.| (default, Mar 22 2017, 19:54:23)
        +[GCC 4.4.7 20120313 (Red Hat 4.4.7-1)] on linux
        +Type "help", "copyright", "credits" or "license" for more information.
        +>>> import sys
        +>>> sys.path
        +['', '/home/killeent/local/miniconda2/envs/p3/lib/python36.zip', '/home/killeent/local/miniconda2/envs/p3/lib/python3.6', '/home/killeent/local/miniconda2/envs/p3/lib/python3.6/lib-dynload', '/home/killeent/.local/lib/python3.6/site-packages', '/home/killeent/local/miniconda2/envs/p3/lib/python3.6/site-packages', '/home/killeent/github/pytorch', '/home/killeent/local/miniconda2/envs/p3/lib/python3.6/site-packages/setuptools-27.2.0-py3.6.egg']
        +
        + +

As we can see, the site-packages directory we copied our PyTorch installation to is part of the search path. Now let’s load the torch module and see its location:

        + +
        >>> import torch
        +>>> import inspect
        +>>> inspect.getfile(torch)
        +'/home/killeent/local/miniconda2/envs/p3/lib/python3.6/site-packages/torch/__init__.py'
        +
        + +

As we can see, we have loaded the module from site-packages as expected - our build and installation were successful!

        + +

        Note: Python prepends the empty string to sys.path to represent the current working directory - making it the first place we search for a module. So if we run Python from the pytorch directory, we would accidentally load the local version of PyTorch rather than our installed version. This is something to watch out for.
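A quick way to make the pitfall visible (run interactively from inside the pytorch checkout; output is illustrative):

import sys
import torch

# In an interactive session the first search-path entry is the empty
# string, i.e. the current working directory.
print(sys.path[0])

# Because the checkout is searched first, torch resolves to the local
# source tree rather than the copy installed in site-packages.
print(torch.__file__)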

        + +

        Addendum - Developer Efficiency, 3rd Party Libraries, Things I Didn’t Cover

        + +

The entire installation loop for PyTorch can be quite time-consuming. On my devserver, it takes around 5 minutes for an installation from source. Often, when developing PyTorch, we only want to work on a subset of the entire project, and re-build only that subset in order to test changes. Fortunately, our build system enables this.

        + +

        Setuptools Develop Mode

        + +

The main tool that supports this is Setuptools’ develop command. The documentation states that:

        + +
        +

        This command allows you to deploy your project’s source for use in one or more “staging areas” where it will be available for importing. This deployment is done in such a way that changes to the project source are immediately available in the staging area(s), without needing to run a build or install step after each change.

        +
        + +

        But how does it work? Suppose we run python setup.py build develop in the PyTorch directory. The build command is run, building our dependencies (TH, THPP, etc.) and the extension libraries. However, if we look inside site-packages:

        + +
        (p3) killeent@devgpu047:site-packages$ ls -la torch*
        +-rw-r--r--. 1 killeent users 31 Jun 27 08:02 torch.egg-link
        +
        + +

        Looking at the contents of the torch.egg-link file, it simply references the PyTorch directory:

        + +
        (p3) killeent@devgpu047:site-packages$ cat torch.egg-link
        +/home/killeent/github/pytorch
        +
        + +

        If we navigate back to the PyTorch directory, we see there is a new directory torch.egg-info:

        + +
        (p3) killeent@devgpu047:pytorch (master)$ ls -la torch.egg-info/
        +total 28
        +drwxr-xr-x.  2 killeent users  4096 Jun 27 08:09 .
        +drwxr-xr-x. 10 killeent users  4096 Jun 27 08:01 ..
        +-rw-r--r--.  1 killeent users     1 Jun 27 08:01 dependency_links.txt
        +-rw-r--r--.  1 killeent users   255 Jun 27 08:01 PKG-INFO
        +-rw-r--r--.  1 killeent users     7 Jun 27 08:01 requires.txt
        +-rw-r--r--.  1 killeent users 16080 Jun 27 08:01 SOURCES.txt
        +-rw-r--r--.  1 killeent users    12 Jun 27 08:01 top_level.txt
        +
        + +

This directory contains metadata about the PyTorch project. For example, requires.txt lists all of the dependencies for setting up PyTorch:

        + +
        (p3) killeent@devgpu047:pytorch (master)$ cat torch.egg-info/requires.txt
        +pyyaml
        +
        + +

        Without going into too much detail, develop allows us to essentially treat the PyTorch repo itself as if it were in site-packages, so we can import the module and it just works:

        + +
        (p3) killeent@devgpu047:~$ python
        +Python 3.6.1 |Continuum Analytics, Inc.| (default, Mar 22 2017, 19:54:23)
        +[GCC 4.4.7 20120313 (Red Hat 4.4.7-1)] on linux
        +Type "help", "copyright", "credits" or "license" for more information.
        +>>> import torch
        +>>> torch.__file__
        +'/home/killeent/github/pytorch/torch/__init__.py'
        +
        + +

        As a result, the following consequences hold:

        + +
          +
        • If we change a Python source file, the changes are automatically picked up, and we don’t have to run any commands to let the Python interpreter see this change
        • +
• If we change a C++ source file in one of the extension libraries, we can re-run the develop command and it will re-build the extension
        • +
        + +

Thus we can develop the PyTorch codebase seamlessly, and test our changes in an easy way.

        + +

        Working on the Dependency Libraries

        + +

        If we are working on the dependencies (e.g. TH, THPP, etc.) we can re-build our changes more quickly by simply running the build_deps command directly. This will automatically call into build_all.sh to re-build our libraries, and copy the generated libraries appropriately. If we are using Setuptools develop mode, we will be using the local extension library built in the PyTorch directory. Because we have specified the paths to the shared libraries when compiling our Extension Libraries, the changes will be picked up:

        + +
        # we are using the local extension
        +(p3) killeent@devgpu047:~$ python
        +Python 3.6.1 |Continuum Analytics, Inc.| (default, Mar 22 2017, 19:54:23)
        +[GCC 4.4.7 20120313 (Red Hat 4.4.7-1)] on linux
        +Type "help", "copyright", "credits" or "license" for more information.
        +>>> import torch
        +>>> torch._C.__file__
        +'/home/killeent/github/pytorch/torch/_C.cpython-36m-x86_64-linux-gnu.so'
        +
        +# it references the local shared object library we just re-built
        +(p3) killeent@devgpu047:~$ ldd /home/killeent/github/pytorch/torch/_C.cpython-36m-x86_64-linux-gnu.so
        +# ...
        +libTH.so.1 => /home/killeent/github/pytorch/torch/lib/libTH.so.1 (0x00007f543d0e2000)
        +# ...
        +
        + +

        As such, we can test any changes here without having to do a full rebuild.

        + +

        3rd Party Libraries

        + +

        PyTorch has dependencies on some 3rd party libraries. The usual mechanism for using these libraries is to install them via Anaconda, and then link against them. For example, we can use the mkl library with PyTorch by doing:

        + +
        # installed to miniconda2/envs/p3/lib/libmkl_intel_lp64.so
        +conda install mkl
        +
        + +

        And then as long as we have the path to this lib directory on our $CMAKE_PREFIX_PATH, it will successfully find this library when compiling:

        + +
        # in the site-packages dir
        +(p3) killeent@devgpu047:torch$ ldd _C.cpython-36m-x86_64-linux-gnu.so
        +# ...
        +libmkl_intel_lp64.so => /home/killeent/local/miniconda2/envs/p3/lib/libmkl_intel_lp64.so (0x00007f3450bba000)
        +# ...
        +
        + +

        Not Covered, But Also Relevant

        + +
          +
        • How ccache is used to speed up build times
        • +
        • How PyTorch’s top-level __init__.py file handles the initial module import and pulling together all the various modules and extension libraries
        • +
        • The CMake build system, how the backend libraries are configured and built with CMake
        • +
        + +
diff --git a/blog/a-year-in/index.html b/blog/a-year-in/index.html new file mode 100644 index 000000000000..8ccd37a1bef1 --- /dev/null +++ b/blog/a-year-in/index.html @@ -0,0 +1,818 @@ + PyTorch, a year in.... | PyTorch

        January 19, 2018

        +

PyTorch, a year in....

        +
        +
        + +
        +
        +
        + +
        +

by The PyTorch Team

        +

        Today marks 1 year since PyTorch was released publicly. It’s been a wild ride — our quest to build a flexible deep learning research platform. Over the last year, we’ve seen an amazing community of people using, contributing to and evangelizing PyTorch — thank you for the love.

        + +

        Looking back, we wanted to summarize PyTorch over the past year: the progress, the news and highlights from the community.

        + +

        Community

        + +

        We’ve been blessed with a strong organic community of researchers and engineers who fell in love with PyTorch. The core team has engineers and researchers from multiple countries, companies and universities, and we couldn’t have made PyTorch what it is without each contribution.

        + +

        Research papers, packages and Github

        + +

        Within days of release, users from the community started to implement their favorite research papers in PyTorch and release the code on Github. Open-source code is a primary and essential tool for researchers today.

        + +

        Folks came together to create torchtext, torchvision and torchaudio packages to help facilitate and democratize research in different domains.

        + +

        The first community package based on PyTorch came from Brandon Amos, titled Block, and helped with easier manipulation of block matrices. The Locus Lab at CMU subsequently went on to publish PyTorch packages and implementations for most of their research. The first research paper code came from Sergey Zagoruyko titled Paying more attention to attention.

        + +

Jun-Yan Zhu, Taesung Park, Phillip Isola, Alyosha Efros and team from U.C. Berkeley released the hugely popular Cycle-GAN and pix2pix, which do image-to-image transforms.

        + +
        + +
        + +

        The researchers at HarvardNLP and Systran started developing and improving OpenNMT in PyTorch, seeded by initial reimplementation of the [Lua]Torch code from Adam Lerer at Facebook.

        + +

        The MagicPony team at Twitter contributed implementations of their Super-resolution work early on into PyTorch’s examples.

        + +

        Salesforce Research released several packages, including their highlight release of PyTorch-QRNN, a type of RNN that is 2x to 17x faster than standard LSTMs optimized by CuDNN. James Bradbury and team form one of the most active and engaging forces in the PyTorch community.

        + + + + +

        Researchers from Uber, Northeastern and Stanford came together to form an active probabilistic programming community around their packages Pyro and ProbTorch. They are actively developing the torch.distributions core package. This community is so active and fast-moving, we had our first pytorch-probabilistic-programming meetup at NIPS 2017 with Fritz Obermeyer, Noah Goodman, Jan-Willem van de Meent, Brooks Paige, Dustin Tran and 22 additional attendees discussing how to make the world bayesian.

        + +
        + +
        + +

        NVIDIA Researchers released three high-quality repositories that implemented pix2pix-HD, Sentiment Neuron and FlowNet2 papers. Their analysis of scalability of different Data Parallel models in PyTorch was helpful to the community.

        + +
        + +
        + +

        The Allen Institute for AI released AllenNLP which includes several state-of-the-art models in NLP — reference implementations and easy to use web demos for standard NLP tasks.

        + +
        + +
        + +

We also had our first Kaggle-winning team, grt123, in July. They won the Data Science Bowl 2017 on lung cancer detection and subsequently released their PyTorch implementations.

        + +

        On the visualization front, Tzu-Wei Huang implemented a TensorBoard-PyTorch plugin and Facebook AI Research released PyTorch compatibility for their visdom visualization package.

        + +
        + + +
        + +

        Lastly, Facebook AI Research released several projects such as ParlAI, fairseq-py, VoiceLoop and FaderNetworks that implemented cutting-edge models and interfaced datasets in multiple domains.

        + +

There are countless good projects that we haven’t highlighted for lack of space; you can find a curated list here.

        + +

        We would also like to give a huge shout-out to folks who actively help others out on the Forums, especially ptrblck, jpeg729, QuantScientist, albanD, Thomas Viehmann and chenyuntc. You are providing an invaluable service, thank you so much!

        + +

        Metrics

        + +

        In terms of sheer numbers,

        + +
          +
        • 87,769 lines of Python code on github that import torch
        • +
        • 3,983 repositories on Github that mention PyTorch in their name or description
        • +
        • More than half a million downloads of PyTorch binaries. 651,916 to be precise.
        • +
        • 5,400 users wrote 21,500 posts discussing 5,200 topics on our forums discuss.pytorch.org (http://discuss.pytorch.org/)
        • +
        • 131 mentions of PyTorch on Reddit’s /r/machinelearning since the day of release. In the same period, TensorFlow was mentioned 255 times.
        • +
        + +

        Research Metrics

        + +

        PyTorch is a research-focused framework. So one of the metrics of interest is to see the usage of PyTorch in machine learning research papers.

        + +
          +
        • +

          In the recent ICLR2018 conference submissions, PyTorch was mentioned in 87 papers, compared to TensorFlow at 228 papers, Keras at 42 papers, Theano and Matlab at 32 papers.

          +
        • +
        • +

          Monthly arxiv.org mentions for frameworks had PyTorch at 72 mentions, with TensorFlow at 273 mentions, Keras at 100 mentions, Caffe at 94 mentions and Theano at 53 mentions.

          +
        • +
        + +

        Courses, Tutorials and Books

        + +

        When we released PyTorch, we had good API documentation, but our tutorials were limited to a few ipython notebooks — helpful, but not good enough.

        + +

        Sasank Chilamkurthy took it upon himself to revamp the tutorials into the beautiful website that it is today.

        + +
        + +
        + +

Sean Robertson and Justin Johnson wrote great new tutorials — in NLP, and to learn by example. Yunjey Choi wrote a beautiful tutorial where most models were implemented in 30 lines or less. Each new tutorial helped users find their way faster, with different approaches to learning.

        + +

        Goku Mohandas and Delip Rao switched the code content of their book-in-progress to use PyTorch.

        + +

        We’ve seen quite a few university machine learning courses being taught with PyTorch as the primary tool, such as Harvard’s CS287. Taking it one step further and democratizing learning, we had three online courses pop up that teach using PyTorch.

        + + + +

        Engineering

        + +

Over the last year we implemented multiple features, improved performance across the board and fixed lots of bugs. A full list of the work we’ve done is found in our release notes. Here are highlights from our work over the last year:

        + +

        Higher-order gradients

        + +

With the release of several papers implementing gradient penalties, and with ongoing research in second-order gradient methods, this was an essential and sought-after feature. In August, we implemented a generalized interface that can take n-th order derivatives, and over time we increased the coverage of functions that support higher-order gradients, such that at the moment of writing almost all ops support this.
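For example, a gradient-penalty style term can be built by differentiating through a gradient (a minimal sketch using today’s tensor API):

import torch

x = torch.randn(3, requires_grad=True)
y = (x ** 3).sum()

# First-order gradient; create_graph=True keeps the graph so that the
# gradient itself can be differentiated.
(grad_x,) = torch.autograd.grad(y, x, create_graph=True)

# A scalar built from the gradient, e.g. a gradient penalty.
penalty = grad_x.pow(2).sum()
penalty.backward()

print(x.grad)  # second-order information: d(penalty)/dx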

        + +

        Distributed PyTorch

        + +

        In August, we released a small distributed package that followed the highly popular MPI-collective approach. The package has multiple backends such as TCP, MPI, Gloo and NCCL2 to support various types of CPU/GPU collective operations and use-cases, and integrates distributed technologies such as Infiniband and RoCE. Distributed is hard, and we had bugs in the initial iteration. Over subsequent releases, we made the package more stable and improved performance.
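Usage follows the familiar collective-communication style; a minimal single-process sketch (assuming the launcher sets the usual MASTER_ADDR/MASTER_PORT environment variables):

import torch
import torch.distributed as dist

# Each process joins the job; the backend could also be 'nccl' or 'mpi'.
dist.init_process_group(backend="gloo", init_method="env://",
                        rank=0, world_size=1)

t = torch.ones(4)
# Sum the tensor across all processes in the job, in place.
dist.all_reduce(t, op=dist.ReduceOp.SUM)
print(t)

dist.destroy_process_group()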

        + +

        Closer to NumPy

        + +

One of the biggest demands from users was for the NumPy features they were familiar with. Features such as broadcasting and advanced indexing are convenient and save users a lot of verbosity. We implemented these features and started to align our API to be closer to NumPy’s. Over time, we expect to get closer and closer to NumPy’s API where appropriate.
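Both features behave the way NumPy users expect; a small example using the current API:

import torch

a = torch.arange(6.).reshape(2, 3)   # shape (2, 3)
b = torch.tensor([10., 20., 30.])    # shape (3,)

# Broadcasting: b is virtually expanded to shape (2, 3) before the add.
print(a + b)

# Advanced indexing: pick element (0, 2) and element (1, 0).
rows = torch.tensor([0, 1])
cols = torch.tensor([2, 0])
print(a[rows, cols])                 # tensor([2., 3.])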

        + +

        Sparse Tensors

        + +

In March, we released a small package supporting sparse Tensors, and in May we released CUDA support for the sparse package. The package is small and limited in functionality, and is used for implementing sparse embeddings and commonly used sparse paradigms in deep learning. This package is still small in scope and there’s demand to expand it — if you are interested in working on expanding the sparse package, reach out to us on our discussion boards.
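A sparse tensor is described by the coordinates of its non-zero entries plus their values; for example, using the current COO constructor (a sketch; the API has evolved since the original release):

import torch

# Two non-zero entries of a 3x4 matrix in COO format:
# value 3.0 at position (0, 1) and value 5.0 at position (2, 3).
indices = torch.tensor([[0, 2],
                        [1, 3]])
values = torch.tensor([3.0, 5.0])

s = torch.sparse_coo_tensor(indices, values, size=(3, 4))
print(s)
print(s.to_dense())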

        + +

        Performance

        + +

Performance is always an ongoing battle, especially for PyTorch, which is a dynamic framework that wants to maximize flexibility. Over the last year, we’ve improved performance across the board, from our core Tensor library to the neural network operators, writing faster micro-optimized code throughout.

        + +
          +
        • We’ve added specialized AVX and AVX2 intrinsics for Tensor operations
        • +
        • Wrote faster GPU kernels for frequent workloads like concatenation and Softmax (among many other things)
        • +
        • Rewrote the code for several neural network operators (too many to list), but notably nn.Embedding and group convolutions.
        • +
        + +

Reducing framework overhead by 10x across the board

        + +

Since PyTorch is a dynamic graph framework, we create a new graph on the fly at every iteration of a training loop. Hence, the framework overhead has to be low, or the workload has to be large enough that the framework overhead is hidden. In August, the authors of DyNet (Graham Neubig and team) showcased that DyNet is much faster than PyTorch on small NLP models. This was an interesting challenge; we didn’t realize that models of those sizes were being trained. In a multi-month (and ongoing) effort, we embarked upon a significant rewrite of PyTorch internals that reduced the framework overhead from more than 10 microseconds per operator execution to as little as 1 microsecond.

        + +

        ATen

        + +

        As we embarked upon a redesign of the PyTorch internals, we built the ATen C++11 library that now powers all of the PyTorch backend. ATen has an API that mirrors PyTorch’s Python API, which makes it a convenient C++ library for Tensor computation. ATen can be built and used independently of PyTorch.

        + +

        Exporting models to production — ONNX Support and the JIT compiler

        + +

One of the common requests we’ve received was to export PyTorch models to another framework. Users engaged in a rapid research cycle in PyTorch, and when they were done, they wanted to ship it to larger projects with C++-only requirements.

        + +

With this in mind, we built a tracer for PyTorch — which can export PyTorch models into an intermediate representation. The subsequent trace can either be used to run the current PyTorch model more efficiently (by running optimization passes on it), or converted to the ONNX format to be shipped to other frameworks such as Caffe2, MXNet, TensorFlow and others, or directly to hardware-accelerated libraries like CoreML or TensorRT. Over the next year, you will hear more about the JIT compiler for performance improvements.
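Exporting works by tracing the model with an example input; a minimal sketch (the model choice and file name are placeholders, shown with a recent torchvision/ONNX API):

import torch
import torchvision

model = torchvision.models.resnet18(weights=None).eval()
dummy_input = torch.randn(1, 3, 224, 224)

# Trace the model with the example input and serialize the graph as ONNX.
torch.onnx.export(model, dummy_input, "resnet18.onnx",
                  input_names=["input"], output_names=["output"])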

        + +

        Users being funny :)

        + +

Our users express their support in funny ways that made us laugh. Thanks for this :)

diff --git a/blog/accelerate-pytorch-models/index.html b/blog/accelerate-pytorch-models/index.html new file mode 100644 index 000000000000..478015b029f5 --- /dev/null +++ b/blog/accelerate-pytorch-models/index.html @@ -0,0 +1,877 @@ + Accelerate PyTorch Models Using Quantization Techniques with Intel Extension for PyTorch | PyTorch

by Intel

        +

        Overview

        + +

        PyTorch is a Python-based framework for developing deep learning models. It is one of the most popular industry-standard AI frameworks and is used for a wide variety of computer vision and natural language processing applications. PyTorch was developed by Meta and is now part of The Linux Foundation. Intel works with the open source PyTorch project to optimize the PyTorch framework for Intel® hardware. The newest optimizations and features are first released in Intel® Extension for PyTorch before upstreaming them into PyTorch. The Intel extension provides quantization features to deliver good accuracy results for large deep learning models.

        + +

This article introduces quantization and its types, and walks through a code sample showing how to accelerate PyTorch-based models by applying Intel Extension for PyTorch quantization.

        + +

        What Is Quantization?

        + +

        Quantization is a systematic reduction of the precision of all or several layers within the model. This means a higher-precision type (like single precision floating-point (FP32) that is mostly used in deep learning) is converted into a lower-precision type, such as FP16 (16 bits) or int8 (8 bits).
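The mapping is typically affine: a float value is stored roughly as round(x / scale) + zero_point in the lower-precision type. A small illustrative snippet using PyTorch’s built-in per-tensor quantization (independent of the Intel extension):

import torch

x = torch.tensor([0.0, 0.5, 1.0, -1.0])

# Map float32 values to 8-bit integers using a scale and a zero point.
q = torch.quantize_per_tensor(x, scale=0.01, zero_point=0, dtype=torch.qint8)

print(q.int_repr())    # the stored int8 values
print(q.dequantize())  # float values reconstructed from them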

        + +

        This helps to achieve:

        + +
          +
        • Lower memory bandwidth
        • +
        • Lower storage
        • +
        • Higher performance with minimum to zero accuracy loss
        • +
        + +

        Quantization is especially important with large models such as those based on the Transformer architecture (like BERT or GPT).

        + +

        There are two types of quantization:

        + +
          +
        • Static: This quantizes the weights and activations of the model, and is used when memory bandwidth and compute savings are important.
        • +
        • Dynamic: The weights are quantized ahead of time, but the activations are dynamically quantized during inference.
        • +
        + +

        How to Perform Static Quantization and Dynamic Quantization

        + +

        The Intel extension extends PyTorch with up-to-date features and optimizations for an extra performance boost on Intel hardware.

        + +

        Installation Instructions for Intel Extension for PyTorch

        + +

        The extension can be loaded as a Python module or linked as a C++ library. Python users can enable it dynamically by importing intel_extension_for_pytorch. The extension provides built-in quantization to deliver good statistical accuracy for most popular deep learning workloads including convolutional neural networks (CNN), natural language processing (NLP), and recommendation models. The quantization functionality in the Intel extension currently supports post-training quantization.
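For example (a short sketch; the module layout shown, with prepare and convert under intel_extension_for_pytorch.quantization, follows recent IPEX releases and is how the snippets below use them):

import torch
import intel_extension_for_pytorch as ipex

# The prepare/convert helpers referenced throughout this article are
# assumed to live in the extension's quantization module.
from intel_extension_for_pytorch.quantization import prepare, convert

print(ipex.__version__)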

        + +

        To quantize the existing FP32 model to an int8 model using static quantization:

        + +
          +
        1. Prepare the quantization configuration. For default static quantization configuration, use ipex.quantization.default_static_qconfig.
        2. +
        3. Prepare the model for calibration using the ipex.quantization.prepare method.
        4. +
        5. Perform calibration against the dataset. This calibration is specific for static quantization as it needs the representative dataset to determine the optimal quantization parameters, so the user should provide data to the model in batches to calibrate it.
        6. +
        7. Convert the model from FP32 to int8 using the ipex.quantization.convert method. This function converts the FP32 model to int8 based on the applied calibration and configuration.
        8. +
        + +

        To quantize the existing FP32 model to an int8 model using dynamic quantization, which is similar to static quantization:

        + +
          +
        1. Prepare the quantization configuration. For default dynamic quantization configuration, use ipex.quantization.default_dynamic_qconfig.
        2. +
3. Prepare the FP32 model by using the ipex.quantization.prepare method. Provide the parameters, such as the FP32 model to quantize, the prepared configuration, example inputs, and other information.
        4. +
        5. Convert the model from FP32 to int8 using the ipex.quantization.convert method. The input model is the model prepared in Step 2.
        6. +
        + +

        Code Sample

        + +

        Dataset

        + +

For static quantization, the model is calibrated with the CIFAR-10 dataset. CIFAR-10 is a subset of the 80 million tiny images dataset collected by Alex Krizhevsky, Vinod Nair, and Geoffrey Hinton.

        + +

This dataset contains 60,000 images in 10 classes (airplane, automobile, bird, cat, deer, dog, frog, horse, ship, and truck). Every class has exactly 6,000 images. All images are 32 x 32 pixel color images. Also, the classes are completely mutually exclusive, which means there is no overlap between classes.

        + +

        Implementation

        + +

        The code sample demonstrates how to quantize (using static and dynamic quantization) a ResNet*-50 model using Intel Extension for PyTorch. The following steps are implemented in the code sample:

        + +

        Download and Prepare the Dataset

        + +

        Here, we use the CIFAR-10 dataset available in torchvision.

        + +
          +
        1. To make data fit the model:
        2. +
        + +
          +
        • Transform the data.
        • +
        • Change the size of the images from 32 x 32 pixels to 224 x 224 pixels.
        • +
        • Convert them to tensors.
        • +
        • Normalize them.
        • +
        + +
          +
        1. Prepare transformations of the dataset as shown:
        2. +
        + +
        transform = torchvision.transforms.Compose([
        +torchvision.transforms.Resize((224, 224)),
        +torchvision.transforms.ToTensor(),
        +torchvision.transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])
        +
        +
        + +
          +
        1. Initialize the dataset.
        2. +
        + +
test_dataset = torchvision.datasets.CIFAR10(root=DATA, train=False, transform=transform, download=True)
        +
        + +

        Prepare the Data Loader

        + +

        To load a dataset for static quantization calibration in specific size batches, create the loader as shown:

        + +
        calibration_data_loader = torch.utils.data.DataLoader(
        +dataset=test_dataset,
        +batch_size=128
        +)
        +
        + +

        Create the Model

        + +

        Use the pretrained ResNet-50 model available in the Torchvision library with default weights. The prepared model is FP32.

        + +
        model_fp32 = torchvision.models.resnet50(weights=torchvision.models.ResNet50_Weights.DEFAULT)
        +
        + +

        Apply Static Quantization

        + +

        Create a staticQuantize function that implements the steps described previously.

        + +
          +
        1. To perform static quantization, we need:
        2. +
        + +
          +
        • FP32 model loaded earlier
        • +
        • Example data
        • +
        • Calibration dataset
        • +
        + +
          +
        1. Prepare the quantization configuration:
        2. +
        + +
qconfig_static = ipex.quantization.default_static_qconfig
        +
        + +

In this code sample, we are using the default quantization configuration, but you can also define your own.

        + +
          +
        1. Prepare the model using the declared configuration:
        2. +
        + +
        prepared_model_static = prepare(model_fp32,
        +qconfig_static,
        +example_inputs=data,
        +inplace=False)
        +
        + +
          +
        1. Calibrate the model with the calibration dataset. Feed the model with successive batches of data from the dataset.
        2. +
        + +
        for batch_idx, (data, target) in enumerate(calibration_data_loader):
        +prepared_model_static(data)
        +if batch_idx % 10 == 0:
        +print("Batch %d/%d complete, continue ..." %(batch_idx+1, len(calibration_data_loader)))
        +
        + +
          +
        1. Convert the model.
        2. +
        + +
        converted_model_static = convert(prepared_model_static)
        +
        + +

        Apply Dynamic Quantization

        + +

        Create the dynamicQuantize function similar to the staticQuantize function.

        + +
          +
        1. To perform dynamic quantization, we only need:
        2. +
        + +
          +
        • The FP32 model loaded earlier
        • +
        • Example data
        • +
        + +
          +
        1. Prepare the quantization configuration:
        2. +
        + +
        qconfig_dynamic = ipex.quantization.default_dynamic_qconfig
        +
        + +
          +
        1. Prepare the model.
        2. +
        + +
        prepared_model_dynamic = prepare(model_fp32,
        +qconfig_dynamic,
        +example_inputs=data,
        +inplace=False)
        +
        + +
          +
        1. Convert the model from FP32 to int8.
        2. +
        + +
        converted_model_dynamic = convert(prepared_model_dynamic)
        +
        + +

        In this way, two functions are created to take advantage of the optimizations that quantization offers:

        + +
          +
• dynamicQuantize for dynamic quantization of models
• +
• staticQuantize for static model quantization
        • +
        + +

        Next Steps

        + +

        Get started with Intel Extension for PyTorch quantization today and use it to achieve better accuracy results for deep learning workloads. Additionally, Intel® Neural Compressor provides quantization to improve the speed of inference.


        Check out and incorporate Intel’s other AI and machine learning framework optimizations and end-to-end portfolio of tools into your AI workflow.


        Learn about the unified, open, standards-based oneAPI programming model that forms the foundation of Intel’s AI Software Portfolio to help you prepare, build, deploy, and scale your AI solutions.


        For more details about the 4th gen Intel® Xeon® Scalable processors, visit the Intel® AI platform overview where you can learn how Intel is empowering developers to run end-to-end AI pipelines on these powerful CPUs.


        Additional Resources

diff --git a/blog/accelerated-cpu-inference/index.html b/blog/accelerated-cpu-inference/index.html
new file mode 100644
index 000000000000..e395bbf19d5f
--- /dev/null
+++ b/blog/accelerated-cpu-inference/index.html
@@ -0,0 +1,1097 @@

Accelerated CPU Inference with PyTorch Inductor using torch.compile | PyTorch

by Intel


        Story at a Glance

• Although the PyTorch* Inductor C++/OpenMP* backend has enabled users to take advantage of modern CPU architectures and parallel processing, it has lacked optimizations, resulting in the backend performing worse than eager mode in terms of end-to-end performance.
• Intel optimized the Inductor backend using a hybrid strategy that classified operations into two categories: Conv/GEMM and non-Conv/GEMM element-wise and reduction ops.
• For popular deep learning models, this hybrid strategy demonstrates promising performance improvements compared to eager mode and improves the C++/OpenMP backend’s efficiency and reliability for PyTorch models.

        Inductor Backend Challenges


        The PyTorch Inductor C++/OpenMP backend enables users to take advantage of modern CPU architectures and parallel processing to accelerate computations.
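For orientation, reaching this backend requires nothing more than torch.compile; a minimal sketch (the model choice and input shape are illustrative, not taken from the benchmarks below), where Inductor is the default backend and, on a CPU-only run, the generated kernels are C++/OpenMP:

import torch
import torchvision

model = torchvision.models.resnet50().eval()
compiled_model = torch.compile(model)  # Inductor is the default backend

with torch.no_grad():
    y = compiled_model(torch.randn(1, 3, 224, 224))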


        However, during the early stages of its development, the backend lacked some optimizations, which prevented it from fully utilizing the CPU computation capabilities. As a result, for most models the C++/OpenMP backend performed worse than eager mode in terms of end-to-end performance, with 45% of TorchBench, 100% of Hugging Face, and 75% of TIMM models performing worse than eager mode.


        In this post, we highlight Intel’s optimizations to the Inductor CPU backend, including the technologies and results.


        We optimized the backend by using a hybrid strategy that classified operations into two categories: Conv/GEMM and non-Conv/GEMM element-wise and reduction ops. Post-op fusion and weight prepacking using the oneDNN performance library were utilized to optimize the former, while explicit vectorization in C++ codegen was used to optimize the latter.


This hybrid strategy demonstrated promising performance improvements compared to eager mode, particularly on popular deep learning models from the Inductor Hugging Face, Inductor TorchBench, and Inductor TIMM benchmark suites. Overall, Intel’s optimizations improve the C++/OpenMP backend’s efficiency and reliability for PyTorch models.


Figure 1: Performance Speedup Ratio Trend

        Performance Status of Intel Hybrid Optimizations


        Compared to eager mode with the hybrid optimizations, the C++/OpenMP backend shows promising performance improvements. We measured the performance of the three Inductor benchmark suites—TorchBench, Hugging Face, and TIMM—and the results are as follows. (Note: we publish our performance data twice per week on GitHub.)


        Overall, these optimizations help to ensure that the C++/OpenMP backend provides efficient and reliable support for PyTorch models.


        Passrate

+----------+------------+-------------+-------------+
| Compiler | torchbench | huggingface | timm_models |
+----------+------------+-------------+-------------+
| inductor | 93%, 56/60 | 96%, 44/46  | 100%, 61/61 |
+----------+------------+-------------+-------------+

        Geometric mean speedup (Single-Socket Multi-threads)

+----------+------------+-------------+-------------+
| Compiler | torchbench | huggingface | timm_models |
+----------+------------+-------------+-------------+
| inductor |   1.39x    |    1.20x    |    1.73x    |
+----------+------------+-------------+-------------+

        Individual Model Performance

Figure 2: TorchBench FP32 Performance (Single-Socket Multi-threads)

Figure 3: Hugging Face FP32 Performance (Single-Socket Multi-threads)

Figure 4: TIMM FP32 Performance (Single-Socket Multi-threads)

        Geometric mean speedup (Single-core Single-thread)

+----------+------------+-------------+-------------+
| Compiler | torchbench | huggingface | timm_models |
+----------+------------+-------------+-------------+
| inductor |    1.29x   |    1.15x    |    1.37x    |
+----------+------------+-------------+-------------+

Figure 5: TorchBench FP32 Performance (Single-Socket Single-thread)

Figure 6: Hugging Face FP32 Performance (Single-Socket Single-thread)

Figure 7: TIMM FP32 Performance (Single-Socket Single-thread)


        Technical Deep Dive


        Now, let’s take a closer look at the two primary optimizations used in the Inductor C++/OpenMP backend:

1. weight prepacking and post-operation fusion via the oneDNN library
2. explicit vectorization in Inductor C++ codegen

Weight Prepacking & Post-op Fusion via oneDNN


The oneDNN library (shorthand for Intel® oneAPI Deep Neural Network Library) provides a range of post-op fusions (i.e., fusing convolution and matmul with their consecutive operations) that can benefit popular models. The Intel® Extension for PyTorch has implemented most of these fusions and has achieved significant performance improvements. As a result, we have upstreamed all of the fusions applied in Intel’s PyTorch extension to Inductor, enabling a wider range of models to benefit from these optimizations. We have defined these fusions as operators under the mkldnn namespace, which allows the Python module to invoke these mkldnn operations directly.


Currently, the defined fused operations are as follows; you can find them in RegisterMkldnnOpContextClass.cpp.

• _linear_pointwise: Fuses Linear and its post-unary element-wise operations
• _linear_pointwise.binary: Fuses Linear and its post-binary element-wise operations
• _convolution_pointwise: Fuses Convolution and its post-unary element-wise operations
• _convolution_pointwise.binary: Fuses Convolution and its post-binary element-wise operations

The detailed fusion patterns are defined in the mkldnn.py file: convolution/linear + sigmoid/hardsigmoid/tanh/hardtanh/hardswish/leaky_relu/gelu/relu/relu6/silu, and convolution/linear + add/add_/iadd/sub/sub_.


        On the Inductor side, we apply these fusions on the FX graph that has been lowered. We have defined mkldnn_fuse_fx as the entry point to apply all the fusions. The code snippet for this is as follows:

def mkldnn_fuse_fx(gm: torch.fx.GraphModule, example_inputs):
    ...
    gm = fuse_unary(gm)
    gm = fuse_binary(gm)
    ...
    if config.cpp.weight_prepack:
        gm = pack_module(gm)
    return gm

        In the mkldnn_fuse_fx function, we apply fusion on the FX graph that hasn’t been lowered yet. To fuse convolution/linear and its consecutive elementwise operations, we invoke fuse_unary and fuse_binary as follows:

gm = fuse_unary(gm)
gm = fuse_binary(gm)

        In addition to the post-op fusion, we apply weight prepacking to improve the Conv/GEMM performance further:

gm = pack_module(gm)

        Weight prepacking involves rearranging the weight tensor in a blocked layout, which:

• can improve vectorization and cache reuse compared to plain formats like NCHW or NHWC;
• can help avoid weight reordering at runtime, which reduces overhead and improves performance; and
• increases memory usage as the tradeoff.

For these reasons, we provide the config.cpp.weight_prepack flag in Inductor to give users more control over this optimization, allowing them to enable it based on their specific needs.
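A minimal sketch of toggling the flag (assumption: it is exposed as torch._inductor.config.cpp.weight_prepack, set before calling torch.compile):

import torch
import torchvision
import torch._inductor.config as inductor_config

# Disable weight prepacking if the extra memory used by blocked weight layouts
# is a concern; leave it at its default to keep the Conv/GEMM speedups.
inductor_config.cpp.weight_prepack = False

model = torchvision.models.resnet18().eval()
compiled_model = torch.compile(model)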


        Explicit Vectorization in Inductor C++ Codegen


        Vectorization is a key optimization technique that can significantly improve the performance of numerical computations. By utilizing SIMD (Single Instruction, Multiple Data) instructions, vectorization enables multiple computations to be performed simultaneously on a single processor core, which can lead to significant performance improvements.


        In the Inductor C++/OpenMP backend, we use Intel® AVX2 and Intel® AVX-512 ISA (Instruction Set Architecture) options for vectorization by leveraging the aten vectorization library to facilitate the implementation. Aten vectorization supports multiple platforms, including x86 and Arm, as well as multiple data types. It can be extended to support other ISAs easily by adding more VecISA sub-classes. This allows Inductor to easily support other platforms and data types in the future.


        Due to differences in platforms, the C++/OpenMP backend of Inductor starts by detecting the CPU features to determine the vectorization bit width at the beginning of code generation. By default, if the machine supports both AVX-512 and AVX2, the backend will choose 512-bit vectorization.


If the hardware supports vectorization, the C++/OpenMP backend first detects whether the loop body can be vectorized. There are primarily three scenarios in which we are not able to generate a vectorized kernel:

1. Loop body lacks vector intrinsics support, e.g., rand and atomic_add.
2. Loop body lacks efficient vector intrinsics support, e.g., non-contiguous load/store.
3. Data types with vectorization not yet supported but work in progress, e.g., integer, double, half, and bfloat16.

To address this issue, the C++/OpenMP backend uses CppVecKernelChecker to detect whether all operations in a particular loop body can be vectorized. In general, we classify the operations into two categories by identifying whether they depend on the context.


        For most elementwise operations such as add, sub, relu, vectorization is straightforward, and their execution does not depend on context.


However, for certain other operations, the semantics are more complex, and their execution depends on context that must be determined through static analysis.


        For example, let’s consider the where operation that takes in mask, true_value, and false_value while the mask value is loaded from a uint8 tensor. The fx graph could be as follows:

graph():
    %ops : [#users=9] = placeholder[target=ops]
    %get_index : [#users=1] = call_module[target=get_index](args = (index0,), kwargs = {})
    %load : [#users=1] = call_method[target=load](args = (%ops, arg1_1, %get_index), kwargs = {})
    %to_dtype : [#users=1] = call_method[target=to_dtype](args = (%ops, %load, torch.bool), kwargs = {})
    ...
    %where : [#users=1] = call_method[target=where](args = (%ops, %to_dtype, %to_dtype_2, %to_dtype_3), kwargs = {})

uint8 is a general data type that can be used for arbitrary computation; it is not limited to serving as a Boolean mask. Hence, we need to analyze its context statically. In particular, the CppVecKernelChecker checks whether a uint8 tensor is only used by to_dtype and to_dtype is only used by where. If so, it can be vectorized; otherwise, it falls back to the scalar version. The generated code could be as follows:


        Scalar Version

auto tmp0 = in_ptr0[i1 + (17*i0)];
auto tmp3 = in_ptr1[i1 + (17*i0)];
auto tmp1 = static_cast<bool>(tmp0);
auto tmp2 = static_cast<float>(-33.0);
auto tmp4 = tmp1 ? tmp2 : tmp3;
tmp5 = std::max(tmp5, tmp4);

        Vectorization Version

float g_tmp_buffer_in_ptr0[16] = {0};
// Convert the flag to float for vectorization.
flag_to_float(in_ptr0 + (16*i1) + (17*i0), g_tmp_buffer_in_ptr0, 16);
auto tmp0 = at::vec::Vectorized<float>::loadu(g_tmp_buffer_in_ptr0);
auto tmp3 = at::vec::Vectorized<float>::loadu(in_ptr1 + (16*i1) + (17*i0));
auto tmp1 = (tmp0);
auto tmp2 = at::vec::Vectorized<float>(static_cast<float>(-33.0));
auto tmp4 = decltype(tmp2)::blendv(tmp3, tmp2, tmp1);

        In addition to context analysis, the C++/OpenMP backend also incorporates several other vectorization-related optimizations. These include:

• Tiled kernel implementation for supporting transpose load - cpp.py
• Data type demotion based on value range - cpp.py
• Replacement of sleef implementation with oneDNN/oneMKL implementation for optimizing aten vectorization - #94577, #92289, #91613

        In summary, we examined vectorization optimization in Inductor C++ backend for FP32 training and inference of 150 benchmark models with 90% of inference kernels and 71% of training kernels being vectorized.


        In terms of inference, a total of 28,185 CPP kernels were generated, with 25,579 (90%) of them being vectorized, while the remaining 10% were scalar. As for training, 103,084 kernels were generated, with 73,909 (71%) being vectorized and 29% not vectorized.


        The results indicate that the vectorization of inference kernels is quite impressive (there is still some work to be done in training kernels since we just started to work on the training). The remaining non-vectorized kernels are analyzed in different categories, highlighting the next steps to improve vectorization coverage: index-related operations, int64 support, vertical reduction, vectorization with fallback, and more.


        In addition, we also optimized the C++/OpenMP backend with other optimizations like buffer-reuse and CppWrapper.


        Future Work


As the next step, we will continue optimizing the C++/OpenMP backend and extend it to support more data types. This includes:

1. Improve vectorization coverage
2. Support and optimize low-precision kernels, including BF16, FP16, and quantization
3. Training optimization
4. Loop tiling
5. Autotune
6. Further fusion optimization of Conv/GEMM kernels
7. Explore alternative codegen paths: clang/llvm/triton

        Summary


The Inductor C++/OpenMP backend is a flexible and efficient backend for the CPU. This blog describes the optimizations used in the C++/OpenMP backend of Inductor for inference and training of three benchmark suites – TorchBench, Hugging Face, and TIMM. The primary optimizations include weight prepacking and post-operation fusion via the oneDNN library, as well as explicit vectorization in Inductor C++ codegen using AVX2 and AVX-512 instructions.


The results show that 90% of inference kernels and 71% of training kernels are vectorized, indicating impressive vectorization coverage for inference and room for improvement in training. In addition, we also applied other optimizations like buffer reuse and CppWrapper, and we will continue to focus on the future work mentioned above to further improve performance.


        Acknowledgements


        The results presented in this blog post are the culmination of a collaborative effort between the Intel PyTorch team and Meta. We would like to express our sincere gratitude to @jansel, @desertfire, and @Chillee for their invaluable contributions and unwavering support throughout the development process. Their expertise and dedication have been instrumental in achieving the optimizations and performance improvements discussed here.


        Configuration Details


        Hardware Details

Item             | Value
-----------------|----------------------------------------------------------------
Manufacturer     | Amazon EC2
Product Name     | c6i.16xlarge
CPU Model        | Intel(R) Xeon(R) Platinum 8375C CPU @ 2.90GHz
Installed Memory | 128GB (1x128GB DDR4 3200 MT/s [Unknown])
OS               | Ubuntu 22.04.2 LTS
Kernel           | 5.19.0-1022-aws
Microcode        | 0xd000389
GCC              | gcc (Ubuntu 11.3.0-1ubuntu1~22.04) 11.3.0
GLIBC            | ldd (Ubuntu GLIBC 2.35-0ubuntu3.1) 2.35
Binutils         | GNU ld (GNU Binutils for Ubuntu) 2.38
Python           | Python 3.10.6
OpenSSL          | OpenSSL 3.0.2 15 Mar 2022 (Library: OpenSSL 3.0.2 15 Mar 2022)

        Software Details

SW                | Nightly commit | Main commit
------------------|----------------|------------
Pytorch           | a977a12        | 0b1b063
Torchbench        | /              | a0848e19
torchaudio        | 0a652f5        | d5b2996
torchtext         | c4ad5dd        | 79100a6
torchvision       | f2009ab        | b78d98b
torchdata         | 5cb3e6d        | f2bfd3d
dynamo_benchmarks | fea73cb        | /

        Configuration

• Intel OpenMP
• Jemalloc - oversize_threshold:1,background_thread:true,metadata_thp:auto,dirty_decay_ms:-1,muzzy_decay_ms:-1
• Single-Socket Multi-threads: #of Instances: 1; Cores/Instance: 32
• Single-Core Single-thread: #of Instances: 1; Cores/Instance: 1
diff --git a/blog/accelerated-diffusers-pt-20/index.html b/blog/accelerated-diffusers-pt-20/index.html
new file mode 100644
index 000000000000..d66b22162c0f
--- /dev/null
+++ b/blog/accelerated-diffusers-pt-20/index.html
@@ -0,0 +1,741 @@

Accelerated Diffusers with PyTorch 2.0 | PyTorch

        March 16, 2023


Accelerated Diffusers with PyTorch 2.0


by Pedro Cuenca, Patrick von Platen, Suraj Patil, Sayak Paul


        PyTorch 2.0 has just been released. Its flagship new feature is torch.compile(), a one-line code change that promises to automatically improve performance across codebases. We have previously checked on that promise in Hugging Face Transformers and TIMM models, and delved deep into its motivation, architecture and the road ahead.


        As important as torch.compile() is, there’s much more to PyTorch 2.0. Notably, PyTorch 2.0 incorporates several strategies to accelerate transformer blocks, and these improvements are very relevant for diffusion models too. Techniques such as FlashAttention, for example, have become very popular in the diffusion community thanks to their ability to significantly speed up Stable Diffusion and achieve larger batch sizes, and they are now part of PyTorch 2.0.


In this post we discuss how attention layers are optimized in PyTorch 2.0 and how these optimizations are applied to the popular 🧨 Diffusers library. We finish with a benchmark that shows how the use of PyTorch 2.0 and Diffusers immediately translates to significant performance improvements across different hardware.


        Update (June 2023): a new section has been added to show dramatic performance improvements of torch.compile() with the latest version of PyTorch (2.0.1), after going through the process of fixing graph breaks in the diffusers codebase. A more detailed analysis of how to find and fix graph breaks will be published in a separate post.


        Accelerating transformer blocks


        PyTorch 2.0 includes a scaled dot-product attention function as part of torch.nn.functional. This function encompasses several implementations that can be applied depending on the inputs and the hardware in use. Before PyTorch 2.0, you had to search for third-party implementations and install separate packages in order to take advantage of memory optimized algorithms, such as FlashAttention. The available implementations are:

• FlashAttention, from the official FlashAttention project.
• Memory-Efficient Attention, from the xFormers project.
• A native C++ implementation suitable for non-CUDA devices or when high-precision is required.

        All these methods are available by default, and PyTorch will try to select the optimal one automatically through the use of the new scaled dot-product attention (SDPA) API. You can also individually toggle them for finer-grained control, see the documentation for details.
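As a minimal sketch of the SDPA API itself (the tensor shapes here are arbitrary illustrations):

import torch
import torch.nn.functional as F

# (batch, heads, sequence_length, head_dim)
q = torch.randn(2, 8, 64, 40)
k = torch.randn(2, 8, 64, 40)
v = torch.randn(2, 8, 64, 40)

# PyTorch picks the best available implementation (FlashAttention,
# memory-efficient attention, or the C++ math fallback) automatically.
out = F.scaled_dot_product_attention(q, k, v)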


        Using scaled dot-product attention in diffusers


        The incorporation of Accelerated PyTorch 2.0 Transformer attention to the Diffusers library was achieved through the use of the set_attn_processor method, which allows for pluggable attention modules to be configured. In this case, a new attention processor was created, which is enabled by default when PyTorch 2.0 is available. For clarity, this is how you could enable it manually (but it’s usually not necessary since diffusers will automatically take care of it):

from diffusers import StableDiffusionPipeline
from diffusers.models.cross_attention import AttnProcessor2_0

pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")
pipe.to("cuda")
pipe.unet.set_attn_processor(AttnProcessor2_0())

prompt = "a photo of an astronaut riding a horse on mars"
image = pipe(prompt).images[0]

        Stable Diffusion Benchmark


        We ran a number of tests using accelerated dot-product attention from PyTorch 2.0 in Diffusers. We installed diffusers from pip and used nightly versions of PyTorch 2.0, since our tests were performed before the official release. We also used torch.set_float32_matmul_precision('high') to enable additional fast matrix multiplication algorithms.


        We compared results with the traditional attention implementation in diffusers (referred to as vanilla below) as well as with the best-performing solution in pre-2.0 PyTorch: PyTorch 1.13.1 with the xFormers package (v0.0.16) installed.


        Results were measured without compilation (i.e., no code changes at all), and also with a single call to torch.compile() to wrap the UNet module. We did not compile the image decoder because most of the time is spent in the 50 denoising iterations that run UNet evaluations.
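In code, the compiled configuration looks roughly like the following sketch (the prompt is illustrative, and the exact benchmarking scripts are not reproduced here):

import torch
from diffusers import StableDiffusionPipeline

torch.set_float32_matmul_precision("high")

pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")
pipe.to("cuda")

# Only the UNet is compiled; the image decoder accounts for little of the runtime.
pipe.unet = torch.compile(pipe.unet)

image = pipe("a photo of an astronaut riding a horse on mars").images[0]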


        Results in float32


        Diffusers Speedup vs xFormers float32


        The following figures explore performance improvement vs batch size for various representative GPUs belonging to different generations. We collected data for each combination until we reached maximum memory utilization. Vanilla attention runs out of memory earlier than xFormers or PyTorch 2.0, which explains the missing bars for larger batch sizes. Similarly, A100 (we used the 40 GB version) is capable of running batch sizes of 64, but the other GPUs could only reach 32 in our tests.


        Diffusers Inference Speedup vs Vanilla and xFormers Attention (A100, float32)


        Diffusers Inference Speedup vs Vanilla and xFormers Attention (3090, float32)


        Diffusers Inference Speedup vs Vanilla and xFormers Attention (4090, float32)


        Diffusers Inference Speedup vs Vanilla and xFormers Attention (V100, float32)


        We found very significant performance improvements over vanilla attention across the board, without even using torch.compile(). An out of the box installation of PyTorch 2.0 and diffusers yields about 50% speedup on A100 and between 35% and 50% on 4090 GPUs, depending on batch size. Performance improvements are more pronounced for modern CUDA architectures such as Ada (4090) or Ampere (A100), but they are still very significant for older architectures still heavily in use in cloud services.


        In addition to faster speeds, the accelerated transformers implementation in PyTorch 2.0 allows much larger batch sizes to be used. A single 40GB A100 GPU runs out of memory with a batch size of 10, and 24 GB high-end consumer cards such as 3090 and 4090 cannot generate 8 images at once. Using PyTorch 2.0 and diffusers we could achieve batch sizes of 48 for 3090 and 4090, and 64 for A100. This is of great significance for cloud services and applications, as they can efficiently process more images at a time.


        When compared with PyTorch 1.13.1 + xFormers, the new accelerated transformers implementation is still faster and requires no additional packages or dependencies. In this case we found moderate speedups of up to 2% on datacenter cards such as A100 or T4, but performance was great on the two last generations of consumer cards: up to 20% speed improvement on 3090 and between 10% and 45% on 4090, depending on batch size.


When torch.compile() is used, we get an additional performance boost of (typically) 2% to 3% over the previous improvements. As compilation takes some time, this is better geared towards user-facing inference services or training. Update: improvements achieved by torch.compile() are much larger when graph breaks are minimized; see the new section for details.


        Results in float16


        Diffusers Speedup vs xFormers float16


        Diffusers Inference Speedup vs Vanilla and xFormers Attention (A100, float16)


        Diffusers Inference Speedup vs Vanilla and xFormers Attention (4090, float16)


        Diffusers Inference Speedup vs Vanilla and xFormers Attention (3090, float16)


        When we consider float16 inference, the performance improvements of the accelerated transformers implementation in PyTorch 2.0 are between 20% and 28% over standard attention, across all the GPUs we tested, except for the 4090, which belongs to the more modern Ada architecture. This GPU benefits from a dramatic performance improvement when using PyTorch 2.0 nightlies. With respect to optimized SDPA vs xFormers, results are usually on par for most GPUs, except again for the 4090. Adding torch.compile() to the mix boosts performance a few more percentage points across the board.


        Performance of torch.compile() after minimizing graph breaks


        In the previous sections we saw that using the accelerated transformers implementation of PyTorch 2.0 provides important performance improvements with respect to earlier versions of PyTorch (with or without xFormers). However, torch.compile() only contributed modest marginal improvements. With the help of the PyTorch team we discovered that the reason for those moderate improvements was that some operations in the diffusers source code were causing graph breaks, which prevented torch.compile() from taking full advantage of graph optimizations.


After fixing the graph breaks (see these PRs for details), we measured the additional improvement of torch.compile() vs the uncompiled version of PyTorch 2, and we saw very important incremental performance gains. The following chart was obtained using a nightly version of PyTorch 2 downloaded on May 1st, 2023, and it shows improvements in the range of ~13% to 22% for most workloads. The performance gains get better for modern GPU families, achieving more than 30% for A100. There are also two outliers in the chart. First, we see a performance decrease on T4 for a batch size of 16, which imposes a huge memory pressure on that card. At the opposite end of the spectrum, we see a performance increase on A100 of more than 100% when using a batch size of only 1, which is interesting but not representative of real-world use of a GPU with such a large amount of RAM – larger batch sizes capable of serving multiple customers will usually be more interesting for service deployment on A100.


        Diffusers Speedup using torch.compile() in float16


        To stress it again, these performance gains are additional to the ones achieved by migrating to PyTorch 2 and using the accelerated transformers scaled dot-product attention implementation. We recommend using torch.compile() when deploying diffusers in production.


        Conclusions


        PyTorch 2.0 comes with multiple features to optimize the crucial components of the foundational transformer block, and they can be further improved with the use of torch.compile. These optimizations lead to significant memory and time improvements for diffusion models, and remove the need for third-party library installations.


        To take advantage of these speed and memory improvements all you have to do is upgrade to PyTorch 2.0 and use diffusers >= 0.13.0.


        For more examples and in-detail benchmark numbers, please also have a look at the Diffusers with PyTorch 2.0 docs.


        Acknowledgement


        The authors are grateful to the PyTorch team for creating such excellent software.

diff --git a/blog/accelerated-generative-diffusion-models/index.html b/blog/accelerated-generative-diffusion-models/index.html
new file mode 100644
index 000000000000..dfda1620ec09
--- /dev/null
+++ b/blog/accelerated-generative-diffusion-models/index.html
@@ -0,0 +1,1122 @@

Accelerated Generative Diffusion Models with PyTorch 2 | PyTorch

by Grigory Sizov, Michael Gschwind, Hamid Shojanazeri, Driss Guessous, Daniel Haziza, Christian Puhrsch


        TL;DR: PyTorch 2.0 nightly offers out-of-the-box performance improvement for Generative Diffusion models by using the new torch.compile() compiler and optimized implementations of Multihead Attention integrated with PyTorch 2.


        Introduction


        A large part of the recent progress in Generative AI came from denoising diffusion models, which allow producing high quality images and videos from text prompts. This family includes Imagen, DALLE, Latent Diffusion, and others. However, all models in this family share a common drawback: generation is rather slow, due to the iterative nature of the sampling process by which the images are produced. This makes it important to optimize the code running inside the sampling loop.


        We took an open source implementation of a popular text-to-image diffusion model as a starting point and accelerated its generation using two optimizations available in PyTorch 2: compilation and fast attention implementation. Together with a few minor memory processing improvements in the code these optimizations give up to 49% inference speedup relative to the original implementation without xFormers, and 39% inference speedup relative to using the original code with xFormers (excluding the compilation time), depending on the GPU architecture and batch size. Importantly, the speedup comes without a need to install xFormers or any other extra dependencies.


        The table below shows the improvement in runtime between the original implementation with xFormers installed and our optimized version with PyTorch-integrated memory efficient attention (originally developed for and released in the xFormers library) and PyTorch compilation. The compilation time is excluded.


        Runtime improvement in % compared to original+xFormers


        See the absolute runtime numbers in section “Benchmarking setup and results summary”

GPU                   | Batch size 1 | Batch size 2 | Batch size 4
----------------------|--------------|--------------|-------------
P100 (no compilation) | -3.8         | 0.44         | 5.47
T4                    | 2.12         | 10.51        | 14.2
A10                   | -2.34        | 8.99         | 10.57
V100                  | 18.63        | 6.39         | 10.43
A100                  | 38.5         | 20.33        | 12.17

        One can notice the following:

• The improvements are significant for powerful GPUs like A100 and V100. For those GPUs the improvement is most pronounced for batch size 1
• For less powerful GPUs we observe smaller speedups (or in two cases slight regressions). The batch size trend is reversed here: improvement is larger for larger batches

        In the following sections we describe the applied optimizations and provide detailed benchmarking data, comparing the generation time with various optimization features on/off.


        Specifically, we benchmark 5 configurations and the plots below compare their absolute performance for different GPUs and batch sizes. For definitions of these configurations see section “Benchmarking setup and results”.


Benchmark of denoising diffusion text-to-image generation across GPU architectures, batch size 1

Benchmark of denoising diffusion text-to-image generation across GPU architectures, batch size 2

Benchmark of denoising diffusion text-to-image generation across GPU architectures, batch size 4

        Optimizations


        Here we’ll go into more detail about the optimizations introduced into the model code. These optimizations rely on features of PyTorch 2.0 which has been released recently.


        Optimized Attention


One part of the code which we optimized is the scaled dot-product attention. Attention is known to be a heavy operation: naive implementation materializes the attention matrix, leading to time and memory complexity quadratic in sequence length. It is common for diffusion models to use attention (CrossAttention) as part of Transformer blocks in multiple parts of the U-Net. Since the U-Net runs at every sampling step, this becomes a critical point to optimize. Instead of a custom attention implementation, one can use torch.nn.MultiheadAttention, which in PyTorch 2 has an optimized attention implementation integrated into it. This optimization schematically boils down to the following pseudocode:

class CrossAttention(nn.Module):
    def __init__(self, ...):
        # Create matrices: Q, K, V, out_proj
        ...
    def forward(self, x, context=None, mask=None):
        # Compute out = SoftMax(Q*K/sqrt(d))V
        # Return out_proj(out)
        ...

        gets replaced with

class CrossAttention(nn.Module):
    def __init__(self, ...):
        self.mha = nn.MultiheadAttention(...)
    def forward(self, x, context):
        return self.mha(x, context, context)

        The optimized implementation of attention was available already in PyTorch 1.13 (see here) and widely adopted (see e.g. HuggingFace transformers library example). In particular, it integrates memory-efficient attention from the xFormers library and flash attention from https://arxiv.org/abs/2205.14135. PyTorch 2.0 expands this to additional attention functions such as cross attention and custom kernels for further acceleration, making it applicable to diffusion models.


        Flash attention is available on GPUs with compute capability SM 7.5 or SM 8.x - for example, on T4, A10, and A100, which are included in our benchmark (you can check compute capability of each NVIDIA GPU here). However, in our tests on A100 the memory efficient attention performed better than flash attention for the particular case of diffusion models, due to the small number of attention heads and small batch size. PyTorch understands this and in this case chooses memory efficient attention over flash attention when both are available (see the logic here). For full control over the attention backends (memory-efficient attention, flash attention, “vanilla math”, or any future ones), power users can enable and disable them manually with the help of the context manager torch.backends.cuda.sdp_kernel.
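A minimal sketch of that manual control (shapes and dtype are arbitrary; torch.backends.cuda.sdp_kernel is the context manager mentioned above):

import torch
import torch.nn.functional as F

q = k = v = torch.randn(1, 8, 128, 64, device="cuda", dtype=torch.float16)

# Force the memory-efficient backend only.
with torch.backends.cuda.sdp_kernel(
    enable_flash=False, enable_math=False, enable_mem_efficient=True
):
    out = F.scaled_dot_product_attention(q, k, v)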


        Compilation


        Compilation is a new feature of PyTorch 2.0, enabling significant speedups with a very simple user experience. To invoke the default behavior, simply wrap a PyTorch module or a function into torch.compile:

model = torch.compile(model)

The PyTorch compiler then turns Python code into a set of instructions which can be executed efficiently without Python overhead. The compilation happens dynamically the first time the code is executed. With the default behavior, under the hood PyTorch utilizes TorchDynamo to compile the code and TorchInductor to further optimize it. See this tutorial for more details.


        Although the one-liner above is enough for compilation, certain modifications in the code can squeeze a larger speedup. In particular, one should avoid so-called graph breaks - places in the code which PyTorch can’t compile. As opposed to previous PyTorch compilation approaches (like TorchScript), PyTorch 2 compiler doesn’t break in this case. Instead it falls back on eager execution - so the code runs, but with reduced performance. We introduced a few minor changes to the model code to get rid of graph breaks. This included eliminating functions from libraries not supported by the compiler, such as inspect.isfunction and einops.rearrange. See this doc to learn more about graph breaks and how to eliminate them.


        Theoretically, one can apply torch.compile on the whole diffusion sampling loop. However, in practice it is enough to just compile the U-Net. The reason is that torch.compile doesn’t yet have a loop analyzer and would recompile the code for each iteration of the sampling loop. Moreover, compiled sampler code is likely to generate graph breaks - so one would need to adjust it if one wants to get a good performance from the compiled version.


        Note that compilation requires GPU compute capability >= SM 7.0 to run in non-eager mode. This covers all GPUs in our benchmarks - T4, V100, A10, A100 - except for P100 (see the full list).


        Other optimizations


        In addition, we have improved efficiency of GPU memory operations by eliminating some common pitfalls, e.g. creating a tensor on GPU directly rather than creating it on CPU and later moving to GPU. The places where such optimizations were necessary were determined by line-profiling and looking at CPU/GPU traces and Flame Graphs.
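A minimal sketch of that kind of change (the tensor shape is illustrative):

import torch

# Preferred: allocate the tensor on the GPU directly.
noise = torch.randn(4, 4, 64, 64, device="cuda")

# Avoid: create on the CPU first and copy to the GPU in a separate step.
noise = torch.randn(4, 4, 64, 64).to("cuda")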


        Benchmarking setup and results summary


        We have two versions of code to compare: original and optimized. On top of this, several optimization features (xFormers, PyTorch memory efficient attention, compilation) can be turned on/off. Overall, as mentioned in the introduction, we will be benchmarking 5 configurations:

• Original code without xFormers
• Original code with xFormers
• Optimized code with vanilla math attention backend and no compilation
• Optimized code with memory-efficient attention backend and no compilation
• Optimized code with memory-efficient attention backend and compilation

        As the original version we took the version of the code which uses PyTorch 1.12 and a custom implementation of attention. The optimized version uses nn.MultiheadAttention in CrossAttention and PyTorch 2.0.0.dev20230111+cu117. It also has a few other minor optimizations in PyTorch-related code.


The table below shows the runtime of each version of the code in seconds, and the percentage improvement compared to the original with xFormers. The compilation time is excluded.


        Runtimes for batch size 1. In parenthesis - relative improvement with respect to the “Original with xFormers” row

Configuration                                            | P100           | T4             | A10            | V100           | A100
---------------------------------------------------------|----------------|----------------|----------------|----------------|--------------
Original without xFormers                                | 30.4s (-19.3%) | 29.8s (-77.3%) | 13.0s (-83.9%) | 10.9s (-33.1%) | 8.0s (-19.3%)
Original with xFormers                                   | 25.5s (0.0%)   | 16.8s (0.0%)   | 7.1s (0.0%)    | 8.2s (0.0%)    | 6.7s (0.0%)
Optimized with vanilla math attention, no compilation    | 27.3s (-7.0%)  | 19.9s (-18.7%) | 13.2s (-87.2%) | 7.5s (8.7%)    | 5.7s (15.1%)
Optimized with mem. efficient attention, no compilation  | 26.5s (-3.8%)  | 16.8s (0.2%)   | 7.1s (-0.8%)   | 6.9s (16.0%)   | 5.3s (20.6%)
Optimized with mem. efficient attention and compilation  | -              | 16.4s (2.1%)   | 7.2s (-2.3%)   | 6.6s (18.6%)   | 4.1s (38.5%)

        Runtimes for batch size 2

Configuration                                            | P100           | T4             | A10            | V100           | A100
---------------------------------------------------------|----------------|----------------|----------------|----------------|--------------
Original without xFormers                                | 58.0s (-21.6%) | 57.6s (-84.0%) | 24.4s (-95.2%) | 18.6s (-63.0%) | 12.0s (-50.6%)
Original with xFormers                                   | 47.7s (0.0%)   | 31.3s (0.0%)   | 12.5s (0.0%)   | 11.4s (0.0%)   | 8.0s (0.0%)
Optimized with vanilla math attention, no compilation    | 49.3s (-3.5%)  | 37.9s (-21.0%) | 17.8s (-42.2%) | 12.7s (-10.7%) | 7.8s (1.8%)
Optimized with mem. efficient attention, no compilation  | 47.5s (0.4%)   | 31.2s (0.5%)   | 12.2s (2.6%)   | 11.5s (-0.7%)  | 7.0s (12.6%)
Optimized with mem. efficient attention and compilation  | -              | 28.0s (10.5%)  | 11.4s (9.0%)   | 10.7s (6.4%)   | 6.4s (20.3%)

        Runtimes for batch size 4

Configuration                                            | P100            | T4              | A10             | V100           | A100
---------------------------------------------------------|-----------------|-----------------|-----------------|----------------|--------------
Original without xFormers                                | 117.9s (-20.0%) | 112.4s (-81.8%) | 47.2s (-101.7%) | 35.8s (-71.9%) | 22.8s (-78.9%)
Original with xFormers                                   | 98.3s (0.0%)    | 61.8s (0.0%)    | 23.4s (0.0%)    | 20.8s (0.0%)   | 12.7s (0.0%)
Optimized with vanilla math attention, no compilation    | 101.1s (-2.9%)  | 73.0s (-18.0%)  | 28.3s (-21.0%)  | 23.3s (-11.9%) | 14.5s (-13.9%)
Optimized with mem. efficient attention, no compilation  | 92.9s (5.5%)    | 61.1s (1.2%)    | 23.9s (-1.9%)   | 20.8s (-0.1%)  | 12.8s (-0.9%)
Optimized with mem. efficient attention and compilation  | -               | 53.1s (14.2%)   | 20.9s (10.6%)   | 18.6s (10.4%)  | 11.2s (12.2%)

To minimize fluctuations and external influence on the performance of the benchmarked code, we ran each version of the code one after another, and then repeated this sequence 10 times: A, B, C, D, E, A, B, … So the results of a typical run would look like the one in the picture below. Note that one shouldn’t rely on comparison of absolute run times between different graphs, but comparison of run times inside one graph is pretty reliable, thanks to our benchmarking setup.


        Denoising diffusion model generation benchmarks


        Each run of text-to-image generation script produces several batches, the number of which is regulated by the CLI parameter --n_iter. In the benchmarks we used n_iter = 2, but introduced an additional “warm-up” iteration, which doesn’t contribute to the run time. This was necessary for the runs with compilation, because compilation happens the first time the code runs, and so the first iteration is much longer than all subsequent. To make comparison fair, we also introduced this additional “warm-up” iteration to all other runs.


        The numbers in the table above are for number of iterations 2 (plus a “warm-up one”), prompt ”A photo”, seed 1, PLMS sampler, and autocast turned on.


        Benchmarks were done using P100, V100, A100, A10 and T4 GPUs. The T4 benchmarks were done in Google Colab Pro. The A10 benchmarks were done on g5.4xlarge AWS instances with 1 GPU.


        Conclusions and next steps


        We have shown that new features of PyTorch 2 - compiler and optimized attention implementation - give performance improvements exceeding or comparable with what previously required installation of an external dependency (xFormers). PyTorch achieved this, in particular, by integrating memory efficient attention from xFormers into its codebase. This is a significant improvement for user experience, given that xFormers, being a state-of-the-art library, in many scenarios requires custom installation process and long builds.


        There are a few natural directions in which this work can be continued:

• The optimizations we implemented and described here are only benchmarked for text-to-image inference so far. It would be interesting to see how they affect training performance. PyTorch compilation can be directly applied to training; enabling training with PyTorch optimized attention is on the roadmap
• We intentionally minimized changes to the original model code. Further profiling and optimization can probably bring more improvements
• At the moment compilation is applied only to the U-Net model inside the sampler. Since there is a lot happening outside of U-Net (e.g. operations directly in the sampling loop), it would be beneficial to compile the whole sampler. However, this would require analysis of the compilation process to avoid recompilation at every sampling step
• Current code only applies compilation within the PLMS sampler, but it should be trivial to extend it to other samplers
• Besides text-to-image generation, diffusion models are also applied to other tasks - image-to-image and inpainting. It would be interesting to measure how their performance improves from PyTorch 2 optimizations

        See if you can increase performance of open source diffusion models using the methods we described, and share the results!


        Resources


        Acknowledgements


        We would like to thank Geeta Chauhan, Natalia Gimelshein, Patrick Labatut, Bert Maher, Mark Saroufim, Michael Voznesensky and Francisco Massa for their valuable advice and early feedback on the text.


        Special thanks to Yudong Tao initiating the work on using PyTorch native attention in diffusion models.

diff --git a/blog/accelerated-image-seg/index.html b/blog/accelerated-image-seg/index.html
new file mode 100644
index 000000000000..be1fa3e207ad
--- /dev/null
+++ b/blog/accelerated-image-seg/index.html
@@ -0,0 +1,878 @@

Accelerated Image Segmentation using PyTorch | PyTorch

by Intel


        Using Intel® Extension for PyTorch to Boost Image Processing Performance


        PyTorch delivers great CPU performance, and it can be further accelerated with Intel® Extension for PyTorch. I trained an AI image segmentation model using PyTorch 1.13.1 (with ResNet34 + UNet architecture) to identify roads and speed limits from satellite images, all on the 4th Gen Intel® Xeon® Scalable processor.


        I will walk you through the steps to work with a satellite image dataset called SpaceNet5 and how I optimized the code to make deep learning workloads feasible on CPUs just by flipping a few key switches.
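One of those switches is typically Intel Extension for PyTorch’s ipex.optimize() call; a minimal inference-side sketch (the actual ResNet34 + UNet training code lives in the repository linked below, so a torchvision model stands in here):

import torch
import torchvision
import intel_extension_for_pytorch as ipex

model = torchvision.models.resnet34(weights=None).eval()
model = ipex.optimize(model)  # apply Intel CPU optimizations

with torch.no_grad():
    out = model(torch.rand(1, 3, 224, 224))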


        Before we get started, some housekeeping…


        The code accompanying this article is available in the examples folder in the Intel Extension for PyTorch repository. I borrowed heavily from the City-Scale Road Extraction from Satellite Imagery (CRESI) repository. I adapted it for the 4th Gen Intel Xeon processors with PyTorch optimizations and Intel Extension for PyTorch optimizations. In particular, I was able to piece together a workflow using the notebooks here.


        You can find the accompanying talk I gave on YouTube.


        I also highly recommend these articles for a detailed explanation of how to get started with the SpaceNet5 data:


        I referenced two Hugging Face blogs by Julien Simon; he ran his tests on the AWS instance r7iz.metal-16xl:


        The potential cost savings from using a CPU instance instead of a GPU instance on the major cloud service providers (CSP) can be significant. The latest processors are still being rolled out to the CSPs, so I’m using a 4th Gen Intel Xeon processor that is hosted on the Intel® Developer Cloud (you can sign up for the Beta here: cloud.intel.com).


        On AWS, you can select from the r7iz.* EC2 instances after you sign up for the preview here (Figure 1). At the time of writing, the new AI-acceleration engine, Intel® Advanced Matrix Extensions (Intel® AMX), is only available on bare metal but it should soon be enabled on the virtual machines.


Figure 1. List of 4th Gen Xeon instances on AWS EC2 (image by author)


        On Google Cloud* Platform, you can select from the 4th Gen Xeon Scalable processors C3 VMs (Figure 2).


Figure 2. List of 4th Gen Intel Xeon Scalable processor instances on Google Cloud Platform (image by author)


        Hardware Introduction and Optimizations


        The 4th Gen Intel Xeon processors were released January 2023, and the bare-metal instance I am using has two sockets (each with 56 physical cores), 504 GB of memory, and Intel AMX acceleration. I installed a few key libraries in the backend to take control and monitor the sockets, memory, and cores that I am using on the CPU:

        + +

        numactl (with sudo apt-get install numactl)

        + +

        libjemalloc-dev (with sudo apt-get install libjemalloc)

        + +

        intel-openmp (with conda install intel-openmp)

        + +

        gperftools (with conda install gperftools -c conda-forge)

        + +

        Both PyTorch and Intel Extension for PyTorch have helper scripts so that one does not need to explicitly use intel-openmp and numactl, but they do need to be installed in the backend. In case you want to set them up for other work, here is what I used for OpenMP* …

        + +
        export OMP_NUM_THREADS=36
        +export KMP_AFFINITY=granularity=fine,compact,1,0
        +export KMP_BLOCKTIME=1
        +
        + +

… where OMP_NUM_THREADS is the number of threads allocated to the job, KMP_AFFINITY controls thread affinity settings (including how threads are packed close to each other and whether threads are pinned), and KMP_BLOCKTIME sets the time in milliseconds that an idle thread should wait before going to sleep.

        + +

        Here’s what I used for numactl

        + +
        numactl -C 0-35 --membind=0 train.py
        +
        + +

        …where -C specifies which cores to use and --membind instructs the program to only use one socket (socket 0 in this case).

        + +

        SpaceNet Data

        + +

        I am using a satellite image dataset from the SpaceNet 5 Challenge. Different cities can be downloaded for free from an AWS S3 bucket:

        + +
        aws s3 ls s3://spacenet-dataset/spacenet/SN5_roads/tarballs/ --human-readable
        +
        + +
        2019-09-03 20:59:32    5.8 GiB SN5_roads_test_public_AOI_7_Moscow.tar.gz
        +2019-09-24 08:43:02    3.2 GiB SN5_roads_test_public_AOI_8_Mumbai.tar.gz
        +2019-09-24 08:43:47    4.9 GiB SN5_roads_test_public_AOI_9_San_Juan.tar.gz
        +2019-09-14 13:13:26   35.0 GiB SN5_roads_train_AOI_7_Moscow.tar.gz
        +2019-09-14 13:13:34   18.5 GiB SN5_roads_train_AOI_8_Mumbai.tar.gz
        +
        + +

        You can use the following commands to download and unpack a file:

        + +
        aws s3 cp s3://spacenet-dataset/spacenet/SN5_roads/tarballs/SN5_roads_train_AOI_7_Moscow.tar.gz .
        +tar -xvzf ~/spacenet5data/moscow/SN5_roads_train_AOI_7_Moscow.tar.gz
        +
        + +

        Dataset Preparation

        + +

        I used the Moscow satellite image dataset, which consists of 1,352 images of 1,300 by 1,300 pixels with corresponding street labels in separate text files. The dataset contains both 8-band multispectral images and 3-band RGB images. Figure 3 shows four sample RGB satellite images and their corresponding generated masks. I used the speed_masks.py script from the CRESI repository to generate the segmentation masks.

        + +

        Satellite image 3-channel RGB chips from Moscow (top row) and corresponding pixel segmentation masks with varying speed limits

        + +

        Figure 3. Satellite image 3-channel RGB chips from Moscow (top row) and corresponding pixel segmentation masks with varying speed limits (bottom row) (image by author)

        + +

There is a JSON configuration file that must be updated for all remaining components: the training/validation split, training, and inference. An example configuration can be found here. I perform an 80:20 training/validation split, making sure to point to the correct folder of satellite images and corresponding masks for training. The configuration parameters are explained in more detail in the notebook under examples in GitHub for Intel Extension for PyTorch here.
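As an aside, here is a minimal, purely illustrative Python sketch of one way to produce an 80:20 split of image chips before pointing the configuration at the resulting lists; the directory path is a placeholder, and in practice CRESI drives the split through the JSON configuration rather than a snippet like this.

import random
from pathlib import Path

# Hypothetical location of the Moscow image chips; adjust to your layout.
images = sorted(Path("data/moscow/images").glob("*.tif"))
random.seed(42)
random.shuffle(images)

split = int(0.8 * len(images))              # 80:20 training/validation split
train_imgs, val_imgs = images[:split], images[split:]
print(f"train: {len(train_imgs)}, val: {len(val_imgs)}")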

        + +

        Training a ResNet34 + UNet Model

        + +

I made the changes described below to the cresi code in order to run on a CPU and optimize training. To run natively on a CPU, replace self.model = nn.DataParallel(model).cuda() with self.model = nn.DataParallel(model) in the train.py script. In the 01_train.py script, remove torch.randn(10).cuda().
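For clarity, the two edits above amount to the following (a sketch showing only the changed lines, with the surrounding cresi code elided):

# train.py -- run the model natively on the CPU instead of CUDA
# before: self.model = nn.DataParallel(model).cuda()
self.model = nn.DataParallel(model)

# 01_train.py -- delete the stray CUDA tensor allocation
# removed: torch.randn(10).cuda()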

        + +

        To optimize training, add import intel_extension_for_pytorch as ipex to the import statements in the train.py script. Just after defining the model and optimizer as follows:

        + +
        self.model = nn.DataParallel(model)
        +self.optimizer = optimizer(self.model.parameters(), lr=config.lr)
        +
        + +

Add the ipex.optimize line to use BF16 precision instead of FP32:

        + +
        self.model, self.optimizer = ipex.optimize(self.model, 
        +    optimizer=self.optimizer,dtype=torch.bfloat16)
        +
        + +

        Add a line to do mixed-precision training just before running a forward pass and calculating the loss function:

        + +
        with torch.cpu.amp.autocast():
        +    if verbose:
        +        print("input.shape, target.shape:", input.shape, target.shape)
        +    output = self.model(input)
        +    meter = self.calculate_loss_single_channel(output, target, meter, training, iter_size)
        +
        + +

        Now that we have optimized our training code, we can move onto training our model.

        + +

Like the winner of the SpaceNet 5 competition, I trained a ResNet34 encoder + UNet decoder model. The model is pretrained with ImageNet weights, and the backbone is left completely unfrozen during training. The training can be run with the 01_train.py script, but in order to control the use of the hardware I used a helper script. There are actually two helper scripts that accomplish the same thing: the one that comes with stock PyTorch is torch.backends.xeon.run_cpu, and the one that comes with Intel Extension for PyTorch is ipexrun.

        + +

Here is what I ran on the command line:

        + +
        python -m torch.backends.xeon.run_cpu --ninstances 1 \
        +  --ncores_per_instance 32 \
        +  --log_path /home/devcloud/spacenet5data/moscow/v10_xeon4_devcloud22.04/logs/run_cpu_logs \
        +  /home/devcloud/cresi/cresi/01_train.py \
        +  /home/devcloud/cresi/cresi/configs/ben/v10_xeon4_baseline_ben.json --fold=0
        +
        + +
        ipexrun --ninstances 1 \
        +--ncore_per_instance 32 \
        +/home/devcloud/cresi/cresi/01_train.py \
        +/home/devcloud/cresi/cresi/configs/ben/v10_xeon4_baseline_ben.json --fold=0
        +
        + +

In both cases, I am asking PyTorch to run training on one socket with 32 cores. Upon running, I get a printout of the environment variables that get set in the backend, which shows how PyTorch is using the hardware:

        + +
        INFO - Use TCMalloc memory allocator
        +INFO - OMP_NUM_THREADS=32
        +INFO - Using Intel OpenMP
        +INFO - KMP_AFFINITY=granularity=fine,compact,1,0
        +INFO - KMP_BLOCKTIME=1
        +INFO - LD_PRELOAD=/home/devcloud/.conda/envs/py39/lib/libiomp5.so:/home/devcloud/.conda/envs/py39/lib/libtcmalloc.so
        +INFO - numactl -C 0-31 -m 0 /home/devcloud/.conda/envs/py39/bin/python -u 01_train.py configs/ben/v10_xeon4_baseline_ben.json --fold=0
        +
        + +

        During training, I make sure that my total loss function is decreasing (i.e., the model is converging on a solution).

        + +

        Inference

        + +

        After training a model, we can start to make predictions from satellite images alone. In the eval.py inference script, add import intel_extension_for_pytorch as ipex to the import statements. After loading the PyTorch model, use Intel Extension for PyTorch to optimize the model for BF16 inference:

        + +
        model = torch.load(os.path.join(path_model_weights, 
        +    'fold{}_best.pth'.format(fold)), 
        +    map_location = lambda storage, 
        +    loc: storage)
        +model.eval()
        +model = ipex.optimize(model, dtype = torch.bfloat16)
        +
        + +

        Just prior to running prediction, add two lines for mixed precision:

        + +
        with torch.no_grad():
        +    with torch.cpu.amp.autocast():
        +        for data in pbar:
        +            samples = torch.autograd.Variable(data['image'], volatile=True)
        +            predicted = predict(model, samples, flips=self.flips)
        +
        + +

        To run inference, we can use the 02_eval.py script. Now that we have a trained model, we can make predictions on satellite images (Figure 4). We can see that it does seem to map the roads closely to the image!

        + +

        Moscow satellite image and accompanying prediction of roads

        + +

        Figure 4. Moscow satellite image and accompanying prediction of roads (image by author)

        + +

        I realize that the model I’ve trained is overfit to the Moscow image data and probably won’t generalize well to other cities. However, the winning solution to this challenge used data from six cities (Las Vegas, Paris, Shanghai, Khartoum, Moscow, Mumbai) and performs well on new cities. In the future, one thing that would be worth testing is training on all six cities and running inference on another city to reproduce their results.

        + +

        Note on Post-Processing

        + +

        There are further post-processing steps that can be performed to add the mask as graph features to maps. You can read more about the post-processing steps here:

        + +

        The SpaceNet 5 Baseline — Part 3: Extracting Road Speed Vectors from Satellite Imagery

        + +

        Post-processing scripts

        + +

        Conclusions

        + +

        In summary, we:

        + +
          +
        • Created 1,352 image training masks (with speed limits) to correspond to our training satellite image data (from .geojson text file labels)
        • +
        • Defined our configuration file for training and inference
        • +
        • Split up our data into training and validation sets
        • +
        • Optimized our code for CPU training, including using Intel Extension for PyTorch and BF16
        • +
        • Trained a performant ResNet34 + UNet model on a 4th Gen Intel Xeon CPU
        • +
        • Ran initial inference to see the prediction of a speed limit mask
        • +
        + +

You can find detailed benchmarks for the 4th Gen Intel Xeon CPU here.

        + +

        Next Steps

        + +

        Extend the optimizations on an Intel CPU by using the Intel Extension for PyTorch:

        + +

        pip install intel-extension-for-pytorch

        + +

        git clone https://github.com/intel/intel-extension-for-pytorch

        + +

        Get in touch with me on LinkedIn if you have any more questions!

        + +

        More information about the Intel Extension for PyTorch can be found here.

        + +

        Get the Software

        + +

        I encourage you to check out Intel’s other AI Tools and Framework optimizations and learn about the open, standards-based oneAPI multiarchitecture, multivendor programming model that forms the foundation of Intel’s AI software portfolio.

        + +

For more details about the 4th Gen Intel Xeon Scalable processor, visit the AI Platform, where you can learn how Intel is empowering developers to run high-performance, efficient end-to-end AI pipelines.

        + +

        PyTorch Resources

        + + + +
        +
        +
        +
        + + +
        +
        +
        +
        +

        +
        +
        +
        + +
        + +
        + +
        +
        +
        +
        + + +
        +
        +
        + + +
diff --git a/blog/accelerated-pytorch-2/index.html b/blog/accelerated-pytorch-2/index.html new file mode 100644 index 000000000000..727cbe888481 --- /dev/null +++ b/blog/accelerated-pytorch-2/index.html @@ -0,0 +1,691 @@ + Accelerated PyTorch 2 Transformers | PyTorch
        +
        +
        +
        + +
        +
        + + +
        + + + + + + + + +
        + +
        +
        + +
        +
        +

        March 28, 2023

        +

        + Accelerated PyTorch 2 Transformers +

        +
        +
        + +
        +
        +
        + +
        +

        + by + + Michael Gschwind, Driss Guessous, Christian Puhrsch + +

        +

The PyTorch 2.0 release includes a new high-performance implementation of the PyTorch Transformer API with the goal of making training and deployment of state-of-the-art Transformer models affordable. Following the successful release of “fastpath” inference execution (“Better Transformer”), this release introduces high-performance support for training and inference using a custom kernel architecture for scaled dot product attention (SDPA).

        + +

You can take advantage of the new fused SDPA kernels either by calling the new SDPA operator directly (as described in the SDPA tutorial), or transparently via integration into the pre-existing PyTorch Transformer API. All features of the PyTorch Transformer API will continue to work compatibly: many features are mapped to high-performance SDPA kernels, some features cannot be supported with higher performance (e.g., need_weights, as described below), and expanded high-performance support for other features may still be under active development.
        +
        +Similar to the “fastpath” architecture, custom kernels are fully integrated into the PyTorch Transformer API – thus, using the native Transformer and MultiHeadAttention API will enable users to transparently see significant speed improvements. Unlike the “fastpath” architecture, the newly introduced “custom kernels” support many more use cases including models using Cross-Attention, Transformer Decoders, and for training models, in addition to the existing fastpath inference for fixed and variable sequence length Transformer Encoder and Self Attention use cases.

        + +

        To take full advantage of different hardware models and Transformer use cases, multiple SDPA custom kernels are supported, with custom kernel selection logic that will pick the highest-performance kernel for a given model and hardware type. In particular, the first custom kernels included with the PyTorch 2.0 release are the Flash Attention kernel (sdpa_flash, for 16-bit floating point training and inference on Nvidia GPUs with SM80+ architecture level) and the xFormers memory-efficient attention kernel (sdpa_mem_eff, for 16-bit and 32-bit floating point training and inference on a broad range of Nvidia GPUs). A general-purpose kernel sdpa_math provides an implementation when the custom kernels are not applicable.

        + +

As mentioned, custom kernels provide a wider range of support for execution scenarios. To ensure efficient execution (e.g., to use GPU tensor cores), model configurations need to meet a small number of requirements. This list of requirements will evolve over time, prospectively relaxing constraints that limit the usage of currently supported custom kernels, or providing additional kernels in the future.

        + +

        For the most up to date list of custom kernels and dispatch constraints, you can refer to sdp_utils.h. As of PyTorch 2.0, the existing fused SDPA kernels have the following constraints:

        + +
          +
        • Flash Attention only supports 16 bit floating point data types (float16 and bfloat16).
        • +
        • The head dimension must be a multiple of 8 for 16-bit floating point numbers and a multiple of 4 for 32-bit floating point numbers. At present, the maximum head_dim support for the Flash Attention custom kernel is 128.
        • +
        • The CUDA architecture level must be sm5x or better for the mem_efficient kernel, and sm80 for Flash Attention.
        • +
• Flash Attention supports arbitrary dropout; in PyTorch 2.0, the mem_efficient kernel does not support dropout (i.e., dropout must be set to zero for this kernel to be selected in PyTorch 2.0).
        • +
        • To support variable-sequence length batches, all SDPA kernels support Nested Tensor inputs that combine input data and padding information using variable sequence length tensors for forward. (You can find more information about Nested Tensors in the Nested Tensor tutorial.)
        • +
        • You can specify both a key_padding_mask and an attn_mask by combining them before passing them to the SDPA operator. In particular, you can use the per-batch-element key padding mask of the nn.Transformer API to implement training for variable-sequence length inputs in a batch.
        • +
        • At present, the only attention mask supported by fused kernel implementation is the causal mask commonly used for training. To specify the causal mask in custom kernels, it must be specified with the is_causal boolean and attn_mask must be None.
        • +
        • Support for Nested Tensors is still under development. Specifically, in PyTorch 2.0, only the sdpa_math kernel supports training with Nested Tensors. Also, PyTorch 2.0 does not support Nested Tensors as part of code being compiled with torch.compile().
        • +
• The SDPA operator does not support returning averaged attention weights because computing them defeats the optimizations that enabled fused kernels to execute more efficiently. The argument need_weights for torch.nn.MultiheadAttention’s forward function defaults to True. In order to use the fused kernels, need_weights must be set to False (see the sketch after this list).
        • +
        + +
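As a minimal sketch of that last point (the shapes and dimensions here are illustrative assumptions, not values from this post), passing need_weights=False keeps nn.MultiheadAttention eligible for the fused kernels:

import torch
import torch.nn as nn

mha = nn.MultiheadAttention(embed_dim=512, num_heads=8, batch_first=True)
x = torch.randn(2, 128, 512)                     # (batch, sequence, embedding)

# need_weights defaults to True; set it to False so the fused kernels can be used.
out, attn_weights = mha(x, x, x, need_weights=False)
print(out.shape, attn_weights)                   # attn_weights is None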

        We find that an attention mask is rarely used in real-world applications, except for the causal mask during training. Consequently, we reduce kernel complexity and compute cost by building in the option to use a causal mask as attention mask, and select this new capability with the is_causal parameter introduced in conjunction with the new SDPA operator.

        + +

Providing the is_causal Boolean flag for the frequently used causal mask also obviates the expensive and memory-intensive allocation of a causal mask, increasing training memory efficiency by allowing more memory to be used for large batch sizes, and reducing memory bandwidth and cache contention – which are both at a premium in GPU accelerators – by not needing to load an attention mask tensor.

        + +

If none of the available custom kernels’ constraints are met, training falls back to the default sdpa_math kernel, which implements the mathematical equations for scaled dot product attention using a sequence of PyTorch operators. This is the most general “catch-all” fallback kernel that ensures successful training for all models.

        + +

In addition to the existing Transformer API, model developers may also use the scaled dot product attention kernels directly by calling the new scaled_dot_product_attention() operator. This operator may be used to efficiently implement multi-head attention by combining it with in-projection and out-projection, as described in the SDPA tutorial.
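As a brief sketch of calling the operator directly (the tensor shapes and CUDA device below are illustrative assumptions; see the SDPA tutorial for authoritative usage):

import torch
import torch.nn.functional as F

# (batch, heads, sequence length, head_dim); head_dim is a multiple of 8 as noted above
q = torch.randn(8, 16, 1024, 64, dtype=torch.float16, device="cuda")
k = torch.randn(8, 16, 1024, 64, dtype=torch.float16, device="cuda")
v = torch.randn(8, 16, 1024, 64, dtype=torch.float16, device="cuda")

# is_causal=True selects the built-in causal mask; attn_mask must then be None.
out = F.scaled_dot_product_attention(q, k, v, attn_mask=None, dropout_p=0.0, is_causal=True)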

        + +

        In addition to adding custom kernels, Accelerated PyTorch 2 Transformers are integrated with PyTorch 2.0 compilation. To use your model while benefiting from the additional acceleration of PT2-compilation (for inference or training), pre-process the model with

        + +
        model = torch.compile(model)
        +
        + +

        We have achieved major speedups for training transformer models and in particular large language models with Accelerated PyTorch 2 Transformers using a combination of custom kernels and torch.compile().

        + +

        Better Transformer chart +Figure: Using scaled dot product attention with custom kernels and torch.compile delivers significant speedups for training large language models, such as for nanoGPT shown here.

        + +

        Finally, because the custom kernels are much more memory efficient, try to increase the size of training batches to achieve faster training with increased batch size.

        + +

        In addition to automatic kernel selection, a context manager enables developers to override the kernel selection algorithm – this is not required for day to day operation, but enables developers to debug their code as well as enable performance engineers to override kernel selection. The SDPA tutorial provides additional information on using the SDPA context manager.
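A minimal sketch of overriding the kernel selection with the PyTorch 2.0 context manager (torch.backends.cuda.sdp_kernel); the tensors are illustrative, and the SDPA tutorial remains the authoritative reference:

import torch
import torch.nn.functional as F
from torch.backends.cuda import sdp_kernel

q = k = v = torch.randn(8, 16, 1024, 64, dtype=torch.float16, device="cuda")

# Force the Flash Attention kernel (disable the math and mem-efficient paths)
# while debugging or profiling kernel selection.
with sdp_kernel(enable_flash=True, enable_math=False, enable_mem_efficient=False):
    out = F.scaled_dot_product_attention(q, k, v, is_causal=True)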

        + +

        In addition to availability as part of the nn.Transformer API, Accelerated PyTorch 2 Transformer custom kernels are also available in conjunction with the torchtext, torchvision, and fairseq domain libraries with the launch of PyTorch 2.0.

        + +
        +
        +
        +
        + + +
        +
        +
        +
        +

        +
        +
        +
        + +
        + +
        + +
        +
        +
        +
        + + +
        +
        +
        + + +
diff --git a/blog/accelerated-pytorch-inference/index.html b/blog/accelerated-pytorch-inference/index.html new file mode 100644 index 000000000000..12b4baa23f2d --- /dev/null +++ b/blog/accelerated-pytorch-inference/index.html @@ -0,0 +1,1047 @@ + Accelerated PyTorch inference with torch.compile on AWS Graviton processors | PyTorch
        +
        +
        +
        + +
        +
        + + +
        + + + + + + + + +
        + +
        +
        + + + +
        +
        +
        + +
        +

        + by + + Sunita Nadampalli + +

        +

        Summary

        + +

Originally, PyTorch used an eager mode where each PyTorch operation that forms the model is run independently as soon as it’s reached. PyTorch 2.0 introduced torch.compile to speed up PyTorch code over the default eager mode. In contrast to eager mode, torch.compile pre-compiles the entire model into a single graph in a manner that’s optimal for running on a given hardware platform. AWS optimized the PyTorch torch.compile feature for AWS Graviton3 processors. This optimization results in up to 2x better performance for Hugging Face model inference (based on geomean of performance improvement for 33 models) and up to 1.35x better performance for TorchBench model inference (geomean of performance improvement for 45 models) compared to the default eager mode inference across several natural language processing (NLP), computer vision (CV), and recommendation models on AWS Graviton3-based Amazon EC2 instances. Starting with PyTorch 2.3.1, the optimizations are available in torch Python wheels and the AWS Graviton PyTorch deep learning container (DLC).

        + +

        In this blog post, we show how we optimized torch.compile performance on AWS Graviton3-based EC2 instances, how to use the optimizations to improve inference performance, and the resulting speedups.

        + +

        Why torch.compile and what’s the goal?

        + +

In eager mode, operators in a model are run immediately as they are encountered. It’s easier to use and more suitable for machine learning (ML) researchers, and hence is the default mode. However, eager mode incurs runtime overhead because of redundant kernel launches and memory reads. In torch compile mode, by contrast, operators are first synthesized into a graph, wherein one operator is merged with another to reduce and localize memory reads and total kernel launch overhead.
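As a small illustrative example (not taken from this post), switching a function or model from eager mode to compile mode is a one-line change:

import torch

def toy_fn(x, w):
    # several small ops that eager mode would dispatch one by one
    return torch.relu(x @ w) + 1.0

compiled_fn = torch.compile(toy_fn)     # synthesizes the ops into a single graph

x, w = torch.randn(256, 512), torch.randn(512, 128)
out_eager = toy_fn(x, w)
out_compiled = compiled_fn(x, w)        # the first call triggers compilation
print(torch.allclose(out_eager, out_compiled, atol=1e-5))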

        + +

        The goal for the AWS Graviton team was to optimize torch.compile backend for Graviton3 processors. PyTorch eager mode was already optimized for Graviton3 processors with Arm Compute Library (ACL) kernels using oneDNN (also known as MKLDNN). So, the question was, how to reuse those kernels in torch.compile mode to get the best of graph compilation and the optimized kernel performance together?

        + +

        Results

        + +

The AWS Graviton team extended the torch inductor and oneDNN primitives to reuse the ACL kernels and optimized compile mode performance on Graviton3 processors. Starting with PyTorch 2.3.1, the optimizations are available in the torch Python wheels and AWS Graviton DLC. Please see the Running an inference section that follows for the instructions on installation, runtime configuration, and how to run the tests.

        + +

To demonstrate the performance improvements, we used NLP, CV, and recommendation models from TorchBench and the most downloaded NLP models from Hugging Face across Question Answering, Text Classification, Token Classification, Translation, Zero-Shot Classification, Summarization, Feature Extraction, Text Generation, Text2Text Generation, Fill-Mask, and Sentence Similarity tasks to cover a wide variety of customer use cases.

        + +

We started with measuring TorchBench model inference latency, in milliseconds (msec), for the eager mode, which is marked 1.0 with a red dotted line in the following graph. Then we compared the improvements from torch.compile for the same model inference; the normalized results are plotted in the graph. You can see that for the 45 models we benchmarked, there is a 1.35x latency improvement (geomean for the 45 models).

        + +

        PyTorch model inference performance improvement with torch.compile on AWS Graviton3-based c7g instance using TorchBench framework

        + +

        Image 1: PyTorch model inference performance improvement with torch.compile on AWS Graviton3-based c7g instance using TorchBench framework. The reference eager mode performance is marked as 1.0. (higher is better)

        + +

Similar to the preceding TorchBench inference performance graph, we started with measuring the Hugging Face NLP model inference latency, in msec, for the eager mode, which is marked 1.0 with a red dotted line in the following graph. Then we compared the improvements from torch.compile for the same model inference; the normalized results are plotted in the graph. You can see that for the 33 models we benchmarked, there is around a 2x performance improvement (geomean for the 33 models).

        + +

        Hugging Face NLP model inference performance improvement with torch.compile on AWS Graviton3-based c7g instance using Hugging Face example scripts

        + +

        Image 2: Hugging Face NLP model inference performance improvement with torch.compile on AWS Graviton3-based c7g instance using Hugging Face example scripts. The reference eager mode performance is marked as 1.0. (higher is better)

        + +

        Running an inference

        + +

        Starting with PyTorch 2.3.1, the optimizations are available in the torch Python wheel and in AWS Graviton PyTorch DLC. This section shows how to run inference in eager and torch.compile modes using torch Python wheels and benchmarking scripts from Hugging Face and TorchBench repos.

        + +

        To successfully run the scripts and reproduce the speedup numbers mentioned in this post, you need an instance from the Graviton3 family (c7g/r7g/m7g/hpc7g) of hardware. For this post, we used the c7g.4xl (16 vcpu) instance. The instance, the AMI details, and the required torch library versions are mentioned in the following snippet.

        + +
        Instance: c7g.4xl instance
        +Region: us-west-2
        +AMI: ami-05cc25bfa725a144a (Ubuntu 22.04/Jammy with 6.5.0-1017-aws kernel)
        +
        +# Install Python
        +sudo apt-get update
        +sudo apt-get install -y python3 python3-pip
        +
        +# Upgrade pip3 to the latest version
        +python3 -m pip install --upgrade pip
        +
        +# Install PyTorch and extensions
        +python3 -m pip install torch==2.3.1 torchvision==0.18.1 torchaudio==2.3.1
        +
        + +

The generic runtime tunings implemented for eager mode inference are equally applicable to torch.compile mode, so we set the following environment variables to further improve the torch.compile performance on AWS Graviton3 processors.

        + +
        # Enable the fast math GEMM kernels, to accelerate fp32 inference with bfloat16 gemm
        +export DNNL_DEFAULT_FPMATH_MODE=BF16
        +
        +# Enable Linux Transparent Huge Page (THP) allocations,
        +# to reduce the tensor memory allocation latency
        +export THP_MEM_ALLOC_ENABLE=1
        +
        +# Set LRU Cache capacity to cache the primitives and avoid redundant
        +# memory allocations
        +export LRU_CACHE_CAPACITY=1024
        +
        + +

        TORCHBENCH BENCHMARKING SCRIPTS

        + +

TorchBench is a collection of open source benchmarks used to evaluate PyTorch performance. We benchmarked 45 models using the scripts from the TorchBench repo. The following code shows how to run the scripts for the eager mode and the compile mode with the inductor backend.

        + +
        # Set OMP_NUM_THREADS to number of vcpus, 16 for c7g.4xl instance
        +export OMP_NUM_THREADS=16
        +
        +# Install the dependencies
        +sudo apt-get install -y libgl1-mesa-glx
        +sudo apt-get install -y libpangocairo-1.0-0
        +python3 -m pip install psutil numpy transformers pynvml numba onnx onnxruntime scikit-learn timm effdet gym doctr opencv-python h5py==3.10.0 python-doctr 
        +
        +# Clone pytorch benchmark repo
        +git clone https://github.com/pytorch/benchmark.git
        +cd benchmark
        +# PyTorch benchmark repo doesn't have any release tags. So,
        +# listing the commit we used for collecting the performance numbers
        +git checkout 9a5e4137299741e1b6fb7aa7f5a6a853e5dd2295
        +
        +# Setup the models
        +python3 install.py 
        +
+# Collect eager mode performance using the following command. The results will be
        +# stored at .userbenchmark/cpu/metric-<timestamp>.json.
        +python3 run_benchmark.py cpu --model BERT_pytorch,hf_Bert,hf_Bert_large,hf_GPT2,hf_Albert,hf_Bart,hf_BigBird,hf_DistilBert,hf_GPT2_large,dlrm,hf_T5,mnasnet1_0,mobilenet_v2,mobilenet_v3_large,squeezenet1_1,timm_efficientnet,shufflenet_v2_x1_0,timm_regnet,resnet50,soft_actor_critic,phlippe_densenet,resnet152,resnet18,resnext50_32x4d,densenet121,phlippe_resnet,doctr_det_predictor,timm_vovnet,alexnet,doctr_reco_predictor,vgg16,dcgan,yolov3,pytorch_stargan,hf_Longformer,timm_nfnet,timm_vision_transformer,timm_vision_transformer_large,nvidia_deeprecommender,demucs,tts_angular,hf_Reformer,pytorch_CycleGAN_and_pix2pix,functorch_dp_cifar10,pytorch_unet --test eval --metrics="latencies,cpu_peak_mem"
        +
        +# Collect torch.compile mode performance with inductor backend
        +# and weights pre-packing enabled. The results will be stored at
        +# .userbenchmark/cpu/metric-<timestamp>.json
        +python3 run_benchmark.py cpu --model BERT_pytorch,hf_Bert,hf_Bert_large,hf_GPT2,hf_Albert,hf_Bart,hf_BigBird,hf_DistilBert,hf_GPT2_large,dlrm,hf_T5,mnasnet1_0,mobilenet_v2,mobilenet_v3_large,squeezenet1_1,timm_efficientnet,shufflenet_v2_x1_0,timm_regnet,resnet50,soft_actor_critic,phlippe_densenet,resnet152,resnet18,resnext50_32x4d,densenet121,phlippe_resnet,doctr_det_predictor,timm_vovnet,alexnet,doctr_reco_predictor,vgg16,dcgan,yolov3,pytorch_stargan,hf_Longformer,timm_nfnet,timm_vision_transformer,timm_vision_transformer_large,nvidia_deeprecommender,demucs,tts_angular,hf_Reformer,pytorch_CycleGAN_and_pix2pix,functorch_dp_cifar10,pytorch_unet --test eval --torchdynamo inductor --freeze_prepack_weights --metrics="latencies,cpu_peak_mem"
        +
        + +

        On successful completion of the inference runs, the script stores the results in JSON format. The following is the sample output:

        + +
        {
        + "name": "cpu"
        + "environ": {
        +     "pytorch_git_version": "d44533f9d073df13895333e70b66f81c513c1889"
        +  },
        +  
        +  "metrics": {
        +       "BERT_pytorch-eval_latency": 56.3769865,
        +       "BERT_pytorch-eval_cmem": 0.4169921875
        +  }
        +}
        +
        + +

        HUGGING FACE BENCHMARKING SCRIPTS

        + +

The Google T5 Small Text Translation model is one of the around 30 Hugging Face models we benchmarked. We’re using it as a sample model to demonstrate how to run inference in eager and compile modes. The additional configurations and APIs required to run it in compile mode are the torch._inductor.config settings and the torch.compile() call in the script. Save the following script as google_t5_small_text_translation.py.

        + +
        import argparse
        +from transformers import T5Tokenizer, T5Model
        +import torch
        +from torch.profiler import profile, record_function, ProfilerActivity
        +import torch._inductor.config as config
        +config.cpp.weight_prepack=True
        +config.freezing=True
        +
        +def test_inference(mode, num_iter):
        +    tokenizer = T5Tokenizer.from_pretrained("t5-small")
        +    model = T5Model.from_pretrained("t5-small")
        +
        +    input_ids = tokenizer(
        +        "Studies have been shown that owning a dog is good for you", return_tensors="pt"
        +    ).input_ids  # Batch size 1
        +    decoder_input_ids = tokenizer("Studies show that", return_tensors="pt").input_ids  # Batch size 1
        +
        +    if (mode == 'compile'):
        +        model = torch.compile(model)
        +
        +    with torch.no_grad():
        +        for _ in range(50):
        +            outputs = model(input_ids=input_ids, decoder_input_ids=decoder_input_ids)
        +
        +        with profile(activities=[ProfilerActivity.CPU]) as prof:
        +            with record_function("model_inference"):
        +                for _ in range(num_iter):
        +                    outputs = model(input_ids=input_ids, decoder_input_ids=decoder_input_ids)
        +
        +    print(prof.key_averages().table(sort_by="self_cpu_time_total"))
        +
        +def main() -> None:
        +    global m, args
        +    parser = argparse.ArgumentParser(__doc__)
        +    parser.add_argument(
        +        "-m",
        +        "--mode",
        +        choices=["eager", "compile"],
        +        default="eager",
        +        help="Which test to run.",
        +    )
        +    parser.add_argument(
        +        "-n",
        +        "--number",
        +        type=int,
        +        default=100,
        +        help="how many iterations to run.",
        +    )
        +    args = parser.parse_args()
        +    test_inference(args.mode, args.number)
        +
        +if __name__ == "__main__":
        +    main()
        +
        + +

        Run the script with the following steps:

        + +
        # Set OMP_NUM_THREADS to number of vcpus to 4 because
        +# the scripts are running inference in sequence, and
        +# they don't need large number of vcpus
        +export OMP_NUM_THREADS=4
        +
        +# Install the dependencies
        +python3 -m pip install transformers
        +
        +# Run the inference script in Eager mode
        +# using number of iterations as 1 just to show the torch profiler output
        +# but for the benchmarking, we used 1000 iterations.
        +python3 google_t5_small_text_translation.py -n 1 -m eager
        +
        +# Run the inference script in torch compile mode
        +python3 google_t5_small_text_translation.py -n 1 -m compile
        +
        + +

        On successful completion of the inference runs, the script prints the torch profiler output with the latency breakdown for the torch operators. The following is the sample output from torch profiler:

        + +
        # Torch profiler output for the eager mode run on c7g.xl (4vcpu)
        +------------------------  ------------  ------------  ------------  ------------  ------------  ------------  
        +                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg    # of Calls  
        +------------------------  ------------  ------------  ------------  ------------  ------------  ------------  
        +                aten::mm        40.71%      12.502ms        40.71%      12.502ms     130.229us            96  
        +         model_inference        26.44%       8.118ms       100.00%      30.708ms      30.708ms             1  
        +               aten::bmm         6.85%       2.102ms         9.47%       2.908ms      80.778us            36  
        +            aten::matmul         3.73%       1.146ms        57.26%      17.583ms     133.205us           132  
        +            aten::select         1.88%     576.000us         1.90%     583.000us       0.998us           584  
        +         aten::transpose         1.51%     464.000us         1.83%     563.000us       3.027us           186  
        +------------------------ ------------ ------------ ------------ ------------ ------------ -------------------
        +Self CPU time total: 30.708ms
        +
        +# Torch profiler output for the compile mode run for the same model on the same instance
        +---------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  
        +                             Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg    # of Calls  
        +---------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  
        +        mkldnn::_linear_pointwise        37.98%       5.461ms        45.91%       6.602ms      68.771us            96  
        +            Torch-Compiled Region        29.56%       4.251ms        98.53%      14.168ms      14.168ms             1  
        +                        aten::bmm        14.90%       2.143ms        21.73%       3.124ms      86.778us            36  
        +                     aten::select         4.51%     648.000us         4.62%     665.000us       1.155us           576  
        +                       aten::view         3.29%     473.000us         3.29%     473.000us       1.642us           288  
        +                      aten::empty         2.53%     364.000us         2.53%     364.000us       3.165us           115  
        +--------------------------------- ------------ ------------ ------------ ------------ ------------ --------------------
        +Self CPU time total: 14.379ms
        +
        + +

        Technical deep dive: What are the challenges and optimization details

        + +

        Underpinning torch.compile are new technologies – TorchDynamo, AOTDispatcher, and TorchInductor.

        + +

TorchDynamo captures PyTorch programs safely using Python Frame Evaluation Hooks.
        +AOTDispatcher overloads PyTorch’s autograd engine as a tracing autodiff for generating ahead-of-time backward traces.
        +TorchInductor is a deep learning compiler that generates fast code for multiple accelerators and backends.

        + +

        The PyTorch compilation process source

        + +

        Image 3: The PyTorch compilation process

        + +

When torch.compile is invoked, torch dynamo rewrites Python bytecode to extract sequences of PyTorch operations into an FX Graph, which is then compiled with the inductor backend. For a typical inference scenario where the graph is frozen and gradient calculations are disabled, the inductor invokes platform-specific optimizations like graph rewrites into more performant operators, operator fusion, and weights pre-packing.
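As a hedged sketch of what that capture looks like in practice (a toy function rather than the benchmarked models), a custom torch.compile backend can print the FX graph that TorchDynamo extracts before handing execution back:

import torch

def inspect_backend(gm: torch.fx.GraphModule, example_inputs):
    gm.graph.print_tabular()    # show the FX nodes TorchDynamo extracted
    return gm.forward           # run the captured graph unmodified

@torch.compile(backend=inspect_backend)
def toy(x):
    return torch.nn.functional.gelu(x @ x.transpose(0, 1))

toy(torch.randn(32, 32))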

        + +

        However, on Graviton3, the inductor wasn’t able to perform any of those optimizations because there was no aarch64 backend defined. To fix this, we extended the inductor’s FX passes to pick oneDNN operators for linear layer compilation on Graviton3 processors with ACL backend. The code snippet for this follows:

        + +
        packed_weight_op = (
        +    mkldnn._reorder_linear_weight
        +    if (is_bf16_weight or mkldnn._is_mkldnn_acl_supported())
        +                    
        +packed_linear_inputs: Tuple[Any, ...] = (input, packed_weight_node)
        +if is_bf16_weight or mkldnn._is_mkldnn_acl_supported():
        +    packed_linear_inputs += (bias, "none", [], "")
        +    packed_linear_op = mkldnn._linear_pointwise.default
        +
        + +

After this was done, the FX pass was successful in compiling the matmul operators to linear_pointwise. The following snippet highlights the matmul operator in the original model:

        + +
         %attention_scores   : [num_users=1] = call_function[target=torch.matmul](args = (%query_layer, %transpose), kwargs = {})
        + %attention_scores_1 : [num_users=1] = call_function[target=operator.truediv](args = (%attention_scores, 8.0), kwargs = {})
        + %attention_scores_2 : [num_users=1] = call_function[target=operator.add](args = (%attention_scores_1, %extended_attention_mask_3), kwargs = {})
        +
        + +

        The following snippet highlights the linear_pointwise operator in the compiled graph:

        + +
        %_linear_pointwise_default_140 : [num_users=2] = call_function[target=torch.ops.mkldnn._linear_pointwise.default](args = (%add_7, %_frozen_param278, %_frozen_param16, none, [], ), kwargs = {})
        +%mul_5 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%_linear_pointwise_default_140, 0.5), kwargs = {})
        +%mul_6 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%_linear_pointwise_default_140, 0.7071067811865476), kwargs = {})
        +%erf   : [num_users=1] = call_function[target=torch.ops.aten.erf.default](args = (%mul_6,), kwargs = {})
        +%add_8 : [num_users=1] = call_function[target=torch.ops.aten.add.Tensor](args = (%erf, 1), kwargs = {})
        +
        + +

        This completes the torch inductor changes required to compile the graph into optimized operators on AWS Graviton3 processors. Next comes the actual inference where the compiled graph is dispatched to be run. OneDNN with ACL was the backend we chose during the inductor compilation, so, the new operators were dispatched to oneDNN as expected, for example, mkldnn._linear_pointwise. However, due to gaps in oneDNN ACL primitives, the operators were run with C++ reference kernels instead of the optimized ACL kernels. Hence, the compile performance was still significantly behind the eager mode performance.

        + +

There were mainly three areas where oneDNN ACL primitives lacked support for torch.compile mode. The following sections discuss them in detail.

        + +

        1. ACL primitives didn’t have support for weights in blocked layout

        + +

        ACL primitives originally designed for eager mode supported weights only in the standard channels last (NHWC) format, without any pre-packing. Whereas weights pre-packing into blocked layout is one of the main optimizations in the inductor compilation passes where the weights are reordered into blocks specific to the runtime platform. This avoids the redundant and on-the-fly reorders when running the General Matrix Multiplication (GEMM), which otherwise would be the bottleneck for inference performance. But the ACL primitives didn’t have support for blocked layout and hence the operators were run with oneDNN C++ reference kernels instead.

        + +

        2. Mixed precision primitives weren’t supported in oneDNN

        + +

        AWS Graviton3 processors support bfloat16 MMLA instructions which can be used to accelerate fp32 inference with bfloat16 GEMM as a mixed precision compute. ACL supports bfloat16 mixed precision GEMM kernels, and are integrated into oneDNN as a fast math compute option for the existing fp32 operators. However, the fast math approach didn’t work for compile mode because of weights pre-packing optimization. The compile mode requires explicit mixed precision primitive implementation in oneDNN in order to use bfloat16 acceleration.

        + +

        3. ACL primitives didn’t support fused kernels for some of the activation functions

        + +

In eager mode, operators are dispatched individually, since each operation is run as soon as it’s reached. In compile mode, by contrast, operator fusion is another important optimization, where operators are fused for runtime efficiency. For example, Gaussian Error Linear Unit (GELU) is one of the most widely used activation functions in transformers-based neural network architectures, so it’s typical to have a linear layer (with matrix multiplications) followed by GELU activation. As part of compiling the model into efficient operators, the torch inductor fuses matmul and GELU into a single linearpointwise+gelu operator. However, oneDNN ACL primitives didn’t have support for fused kernels with GELU.
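For reference, here is a tiny sketch of that linear-followed-by-GELU pattern (the dimensions are illustrative assumptions); this is the kind of subgraph the inductor would fuse into a single linearpointwise+gelu operator when fusion is supported:

import torch
import torch.nn as nn

class FeedForward(nn.Module):
    # Typical transformer MLP sub-block: Linear (matmul) followed by GELU.
    def __init__(self, dim: int = 768):
        super().__init__()
        self.fc = nn.Linear(dim, 4 * dim)
        self.act = nn.GELU()

    def forward(self, x):
        return self.act(self.fc(x))     # candidate for linear+gelu fusion

ff = torch.compile(FeedForward())
ff(torch.randn(8, 128, 768))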

        + +

        We addressed these gaps by extending oneDNN primitives to handle the additional layouts and new primitive definitions. The following sections talk about the optimizations in detail.

        + +

        Optimization 1: Extended ACL primitives to accept weight tensors in blocked layout

        + +

We extended the ACL primitives to accept blocked layout in addition to the standard NHWC format. The code snippet for this is as follows:

        + +
        const bool is_weights_md_format_ok
        +                    = utils::one_of(weights_format_kind_received,
        +                      format_kind::any, format_kind::blocked);
        +
        +
        +const memory_desc_t weights_md_received = weights_md_;
        +acl_utils::reorder_to_weight_format(aip.wei_tensor_info,
        +             weights_md_, expected_weight_format, inner_dim, o_dim,
        +             remaining_dims, {});
        +
        +ACL_CHECK_SUPPORT(
        +     (weights_format_kind_received == format_kind::blocked)
        +      && !(dnnl_memory_desc_equal(
        +      &weights_md_received, &weights_md_)),
        +      "specified blocked format not supported by ACL, use "
        +      "format_kind_t::any to find a supported blocked format for "
        +      "your platform");
        +
        + +

        Optimization 2: Defined new ACL primitives to handle mixed precision operators (weights in bfloat16 and activations in fp32)

        + +

        We defined mixed precision primitive definitions and updated the existing oneDNN ACL fp32 primitives to handle bfloat16 tensors.

        + +
         /* With graph compilation, we are able to reorder and pre-pack the weights during the model load
        +  * and compilation phase itself so that redundant and on-the-fly reorders can be avoided.
        +  * This primitive definition is to support gemm fastmath mode for the compile scenario where src is
        +  * in fp32 and weights are in bf16
        +  */
        + {{forward, f32, bf16, f32}, {
        +    CPU_INSTANCE_AARCH64_ACL(acl_inner_product_fwd_t)
        +    nullptr,
        + }},
        +
        + +

        Optimization 3: Disabled operator fusion pass in torch inductor

        + +

We bypassed the operator fusion pass in torch inductor so that the compiled graph doesn’t contain GELU fused operators. This is a temporary solution to enable ACL kernels in torch.compile. There is work in progress to enable the operator fusion pass for future PyTorch releases. With this workaround, we were able to successfully dispatch the linear layer to ACL. As shown in the following torch.profiler output, the aten::addmm (one of the variants of the matmul operator) and aten::gelu in the original model (as highlighted in Image 4) were compiled to mkldnn::_linear_pointwise without gelu operator fusion (as highlighted in Image 5).

        + +
        ---------------------------  ------------  ------------  ------------  ------------  ------------  ------------  
        +                       Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg    # of Calls  
        +---------------------------  ------------  ------------  ------------  ------------  ------------  ------------  
        +                aten::addmm        73.32%      46.543ms        74.49%      47.287ms     647.767us            73  
        +            model_inference         9.92%       6.296ms       100.00%      63.479ms      63.479ms             1  
        +                  aten::bmm         4.37%       2.776ms         5.46%       3.467ms     144.458us            24  
        +                aten::copy_         1.74%       1.102ms         1.74%       1.102ms       8.103us           136  
        +                 aten::gelu         1.50%     950.000us         1.50%     950.000us      79.167us            12  
        +
        + +

        Image 4: torch.profiler output for Hugging Face bert base model inference in Eager mode, showing addmm and gelu operators

        +
         
        + +
        -----------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  
        +                                                 Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg    # of Calls  
        +-----------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  
        +                            mkldnn::_linear_pointwise        53.61%      15.529ms        57.53%      16.665ms     228.288us            73  
        +                                Torch-Compiled Region        36.95%      10.705ms        99.31%      28.769ms      28.769ms             1  
        +    aten::_scaled_dot_product_flash_attention_for_cpu         3.67%       1.064ms         4.43%       1.284ms     107.000us            12  
        +                                           aten::view         1.97%     572.000us         1.97%     572.000us       2.509us           228  
        +                                          aten::empty         1.38%     399.000us         1.38%     399.000us       3.270us           122 
        +
        + +

        Image 5: torch.profiler output for Hugging Face Bert base model inference in torch.compile mode, showing linear_pointwise operator without gelu fusion

        + +

        Lastly, the gelu operator was compiled into erf (error function) and was dispatched to an inductor auto vectorization backend. The following snippets show the erf operator in the compiled graph and running it using libm.so.

        + +
        %_linear_pointwise_default_140 : [num_users=2] = call_function[target=torch.ops.mkldnn._linear_pointwise.default](args = (%add_7, %_frozen_param278, %_frozen_param16, none, [], ), kwargs = {})
        +%mul_5 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%_linear_pointwise_default_140, 0.5), kwargs = {})
        +%mul_6 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%_linear_pointwise_default_140, 0.7071067811865476), kwargs = {})
        +%erf   : [num_users=1] = call_function[target=torch.ops.aten.erf.default](args = (%mul_6,), kwargs = {})
        +%add_8 : [num_users=1] = call_function[target=torch.ops.aten.add.Tensor](args = (%erf, 1), kwargs = {})
        +
        + +

        Image 6: snippet after post grad pass showing erf function in the compiled graph

        +
         
        + +
             0.82%     0.40%  python3  libm.so.6            [.] erff32
        +     0.05%     0.00%  python3  libtorch_python.so   [.] torch::autograd::THPVariable_erf
        +     0.05%     0.00%  python3  libtorch_cpu.so      [.] at::_ops::erf::call
        +
        + +

        Image 7: Linux perf report showing erf dispatch to libm.so

        + +

        With this work, we were able to optimize torch.compile performance on Graviton3 processors by using inductor graph compilation along with the oneDNN+ACL backend.

        + +

        TorchBench enhancements

        + +

To demonstrate the torch.compile performance improvements on AWS Graviton3 processors, we extended the TorchBench framework to add a new argument that enables graph freezing and weights pre-packing, and disables torch autograd for the eval test mode. The code snippet for this is as follows:

        + +
        parser.add_argument(
        + "—freeze_prepack_weights",
        + action='store_true',
        + help="set to freeze the graph and prepack weights",
        + )
        +
        +if args.freeze_prepack_weights:
        + torch._inductor.config.freezing=True
        + torch._inductor.config.cpp.weight_prepack=True
        +
        + +

        Image 8: Added freeze_prepack_weights option for torchdynamo backend in TorchBench to demonstrate torch.compile performance improvements on AWS Graviton3 processors

        + +

        We have upstreamed all the optimizations, and starting with PyTorch 2.3.1, these are supported in torch Python wheels and AWS Graviton PyTorch DLC.

        + +

        What’s next

        + +

Next, we’re extending the torch inductor CPU backend support to compile the Llama model, and adding support for fused GEMM kernels to enable the torch inductor operator fusion optimization on AWS Graviton3 processors.

        + +

        Conclusion

        + +

        In this tutorial, we covered how we optimized torch.compile performance on AWS Graviton3-based EC2 instances, how to use the optimizations to improve PyTorch model inference performance, and demonstrated the resulting speedups. We hope that you will give it a try! If you need any support with ML software on Graviton, please open an issue on the AWS Graviton Technical Guide GitHub.

        + +

        Acknowledgements

        + +

        We would like to thank the PyTorch community for the baseline torch.compile framework and their continued efforts to optimize it further.

        + +

        References: https://pytorch.org/assets/pytorch2-2.pdf

        + +

        Author

        + +

        Sunita Nadampalli is a Software Development Manager and AI/ML expert at AWS. She leads AWS Graviton software performance optimizations for AI/ML and HPC workloads. She is passionate about open source software development and delivering high-performance and sustainable software solutions for SoCs based on the Arm ISA.

        + +
        +
        +
        +
        + + +
        +
        +
        +
        +

        +
        +
        +
        + +
        + +
        + +
        +
        +
        +
        + + +
        +
        +
        + + +
diff --git a/blog/accelerating-gemms-triton/index.html b/blog/accelerating-gemms-triton/index.html new file mode 100644 index 000000000000..cdce705fc69a --- /dev/null +++ b/blog/accelerating-gemms-triton/index.html @@ -0,0 +1,753 @@ + Accelerating 2D Dynamic Block Quantized Float8 GEMMs in Triton | PyTorch

by Meta: Less Wright, IBM: Adnan Hoque

        2D block quantization for Float8 (FP8) holds the promise of improving the accuracy of Float8 quantization while also accelerating GEMM’s for both inference and training. In this blog, we showcase advances using Triton for the two main phases involved in doing block quantized Float8 GEMMs.

        + +

        For the incoming quantization of A and B tensors from high precision (BFloat16) to Float8, we showcase GridQuant which leverages a mini-grid stride loop style of processing with nearly 2x speedups (99.31%) over a current 2D block quantization kernel.

        + +

        For the Float8 GEMM, we showcase 3 new developments for Triton - Warp Specialization, TMA and a persistent kernel to effectively create a cooperative style kernel (an alternative to the Ping-Pong schedule). As a result, we achieve ~1.2x speedup over our best-performing SplitK kernel from last year.

        + +

        Figure 1: A comparison of the 2D quantization speedup over a current baseline, across a range of sizes.

        + +

        Figure 1: A comparison of the 2D quantization speedup over a current baseline, across a range of sizes. (lower-is-better)

        + +

        Why 2D Blockwise Quantization for FP8?

        + +

        Generally speaking, the accuracy of fp8 quantization improves as we move from tensor-wise scaling, to row-wise scaling, to 2D block-wise, and then finally to column-wise scaling. This is because features for a given token are stored in each column, and thus each column in that tensor is more similarly scaled.

        + +

        To minimize the number of outliers of a given numerical set, we want to find commonality so that numbers are being scaled in a similar fashion. For transformers, this means column based quantization could be optimal…however, columnar memory access is massively inefficient due to the data being laid out in memory in a rowwise contiguous manner. Thus columnwise loading would require memory access involving large strides in memory to pull isolated values, contrary to the core tenets of efficient memory access.

        + +

        However, 2D is the next best option as it includes some aspects of columnar while being more memory efficient to pull since we can vectorize these loads with 2D vectorization. Therefore, we want to find ways to improve the speed for 2D block quantization which is why we developed the GridQuant kernel.

        + +

        For the quantization process, we need to 2D block quantize both the higher precision BF16 incoming tensors (A = input activations, B = weights) and then proceed to do the Float8 matmul using the quantized tensors and their 2D block scaling values, and return an output C tensor in BF16.

        + +

        How does GridQuant improve 2D block quantization efficiency?

        + +

The GridQuant kernel improves on the initial baseline quantization implementation, which was a standard tile-based implementation. GridQuant makes two full passes through the entire input tensor and works as follows:

        + +

        Phase 1 - Determine the max abs value for each 256x256 sub block from the incoming high precision tensor.

        + +

        1 - We divide the BF16 tensor into 256 x 256 sub blocks. This quantization size is configurable, but 256x256 is the default as it provides a blend of quantization precision and processing efficiency.

        + +

        2 - Each 256x256 sub-block is subdivided into 64 sub-blocks arranged in an 8x8 pattern, with each sub-block processing a 32x32 element block. A single warp (32 threads) handles the computation for all elements within its assigned 32x32 block.

        + +

        3 - We declare a 32x32 max_vals array in shared memory. This will store the current max val for each position i,j as the 2d vector block moves across the entire 256x256 sub_block.

        + +

        This is an important improvement because it means we can do vectorized, rather than scalar, updates to the max vals scoring system and allows for much more efficient updates.

        + +

        Figure 2: The Fractionalized layout of an incoming tensor - a grid of 256x256 is created across the tensor, and within each 256x256 block, it is further refined into 32x32 sub blocks. A 32x32 max_vals is created for each 256x256 block.

        + +

        Figure 2: The Fractionalized layout of an incoming tensor - a grid of 256x256 is created across the tensor, and within each 256x256 block, it is further refined into 32x32 sub blocks. A 32x32 max_vals is created for each 256x256 block.

        + +

        4 - Each warp processes a 32x32 chunk and because we are using 4 warps, we ensure the Triton compiler can pipeline the memory loads for the next 32x32 chunk with the actual processing of absmax calculations for the current chunk. This ensures that the warp scheduler is able to toggle warps loading data with those processing and keep the SM continuously busy.

        + +

        5 - The 32x32 2D vector block processing is moved across and through the entire 256x256 subblock in a grid stride looping fashion, with each warp updating the shared memory 32x32 max_vals against its current 32x32 sub-block. Thus max_vals[i,j] holds the latest max value as each sub block is processed.

        + +

After completing the 256x256 block grid stride loop, the max_vals matrix is then itself reduced to find the absolute single max value for that entire 256x256 block.

        + +

        This gives us our final scaling factor value for this 2D 256 x 256 block.

        + +

        Phase 2 - Quantize the 256x256 block values to Float8, by using the single max value scaling factor found during Phase 1.

        + +

Next, we make a second pass through the entire 256x256 block to rescale all the numbers using the max value found in Phase 1, converting them to the Float8 format.

        + +

        Because we know we need to do 2 complete passes, for the loads during the phase 1 portion we instruct the triton compiler to keep these values in cache at higher priority (evict policy = last).

        + +

        This means that during the second pass, we can get a high hit rate from the L2 cache which provides much faster memory access than going all the way to HBM.

        + +

Once all 256x256 blocks have been processed, the 2D block quantization is complete and we can return the new Float8 quantized tensor along with its scaling factor matrix, which we’ll use in the next phase of the GEMM processing. This input quantization is repeated for the second input tensor as well, meaning we end up with A_Float8 and A_scaling_matrix, and B_Float8 and B_scaling_matrix.
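To make the two-phase flow concrete, here is a simplified, eager-mode PyTorch sketch of 2D block quantization with one scale per 256x256 tile. It is only an illustration of the math (it assumes the tensor dimensions divide evenly by the block size and a PyTorch build with Float8 dtypes), not the Triton GridQuant kernel itself.

import torch

def block_quantize_fp8(x: torch.Tensor, block: int = 256):
    # One absmax-based scale per (block x block) tile; sketch assumes divisible shapes.
    M, N = x.shape
    tiles = x.reshape(M // block, block, N // block, block)
    absmax = tiles.abs().amax(dim=(1, 3))                    # Phase 1: per-tile max
    fp8_max = torch.finfo(torch.float8_e4m3fn).max
    scale = absmax.clamp(min=1e-12) / fp8_max                # (M//block, N//block)
    x_fp8 = (tiles / scale[:, None, :, None]).reshape(M, N).to(torch.float8_e4m3fn)
    return x_fp8, scale                                      # dequant: x_fp8.to(bf16) * per-tile scale

A = torch.randn(512, 512, dtype=torch.bfloat16, device="cuda")
A_fp8, A_scale = block_quantize_fp8(A)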

        + +

        GridQuant - GEMM Kernel

        + +

        The GridQuant-GEMM kernel takes in the four outputs from the quantization above for processing. Our high-performance GEMM kernel features several new Triton developments to achieve SOTA performance for matrix shape profiles relevant in LLM inference during the decoding phase.

        + +

        These new features are commonly found in Hopper optimized kernels like FlashAttention-3 and Machete, built using CUTLASS 3.x. Here, we discuss these methods and showcase the performance benefits that can be achieved leveraging them in Triton.

        + +

        Tensor Memory Accelerator (TMA)

        + +

The TMA unit on NVIDIA Hopper GPUs is a dedicated hardware unit for load/store operations that act on multidimensional tensors commonly found in AI workloads. This has several important benefits.

        + +

Transferring data between global and shared memory can occur without involving other resources on the GPU SMs, freeing up registers and CUDA Cores. Further, when used in warp-specialized kernels, light-weight TMA operations can be assigned to a producer warp, allowing for a high degree of overlap between memory transfers and computation.

        + +

        For more details on how TMA is used in Triton see our previous blog.

        + +

        Warp-Specialization (Cooperative Persistent Kernel Design)

        + +

        Warp Specialization is a technique to leverage pipeline parallelism on GPUs. This experimental feature enables the expression of specialized threads through a tl.async_task API, allowing the user to specify how operations in a Triton program should be “split” amongst warps. The cooperative Triton kernel performs different types of computation and loads that each take place on their own dedicated hardware. Having dedicated hardware for each of these specialized tasks makes it possible to realize parallelism efficiently for operations that have no data dependency.

        + +

        Figure 3. Logical view of dedicated HW units in NVIDIA H100 SM

        + +

        Figure 3. Logical view of dedicated HW units in NVIDIA H100 SM

        + +

        The operations in our kernel that create the pipeline are:

        + +

        A - Load per-block scale from GMEM into SMEM (cp.async engine)

        + +

        B - Load activation (A) and Weight (B) tiles from GMEM into SMEM (TMA)

        + +

        C - Matrix-Multiplication of A tile and B tile = C tile (Tensor Core)

        + +

        D - Scale C tile with per-block scale from A and per-block scale from B (CUDA core)

        + +

These steps can be assigned to “tasks” which are carried out by specialized warp groups in a threadblock. The cooperative strategy has three warp groups: a producer warp group responsible for feeding the compute units, and two consumer warp groups that perform the computation. The two consumer warp groups each work on half of the same output tile.

        + +

        Figure 4. Warp-Specialized Persistent Cooperative kernel

        + +

        Figure 4. Warp-Specialized Persistent Cooperative kernel (source: NVIDIA)

        + +

        This is different from the ping-pong schedule we discussed in our previous blog, where each consumer warp group works on different output tiles. We note that the Tensor Core ops are not overlapped with the epilogue computation. Decreased utilization of the Tensor Core pipeline during the epilogue phase of the computation will reduce register pressure for the consumer warp group compared to ping-pong which always keeps the Tensor Core busy, thus allowing for larger tile sizes.

        + +

Lastly, our kernel is designed to be persistent when the grid size exceeds the number of available compute units on H100 GPUs (132). Persistent kernels remain active on the GPU for an extended period and compute multiple output tiles during their lifetime. Our kernel leverages TMA async shared-to-global memory stores, while continuing to do work on the next output tile, as opposed to incurring the cost of scheduling multiple threadblocks.

        + +

        Microbenchmarks

        + +

        Figure 5: Latency comparison (us) of Gridquant-GEMM vs our best performing SplitK kernel for small batch regime and Llama3 8192 N,K sizing.

        + +

        Figure 5: Latency comparison (us) of Gridquant-GEMM vs our best performing SplitK kernel for small batch regime and Llama3 8192 N,K sizing. (lower-is-better)

        + +

        The Warp-Specialized Triton kernel achieves SOTA performance at the above small-M and square matrix shapes, achieving a nearly 1.2x speedup over the SplitK Triton kernel, which was the previous best performing strategy for Triton GEMMs in this low arithmetic intensity regime. For future work, we plan to tune our kernel performance for the medium-to-large M regime and non-square matrices.

        + +

        Conclusion and Future Work

        + +

Future work includes benchmarking GridQuant on end-to-end workflows. In addition, we plan to run more extensive benchmarks on non-square (rectangular) matrices as well as medium-to-large M sizes. Finally, we plan to explore ping-pong style warp-specialization in Triton versus the current cooperative implementation.

        + +
diff --git a/blog/accelerating-generative-ai-2/index.html b/blog/accelerating-generative-ai-2/index.html new file mode 100644 index 000000000000..016bb9edaa7f --- /dev/null +++ b/blog/accelerating-generative-ai-2/index.html

Accelerating Generative AI with PyTorch II: GPT, Fast | PyTorch

by Team PyTorch

        This post is the second part of a multi-series blog focused on how to accelerate generative AI models with pure, native PyTorch. We are excited to share a breadth of newly released PyTorch performance features alongside practical examples to see how far we can push PyTorch native performance. In part one, we showed how to accelerate Segment Anything over 8x using only pure, native PyTorch. In this blog we’ll focus on LLM optimization.

        + +

        Over the past year, generative AI use cases have exploded in popularity. Text generation has been one particularly popular area, with lots of innovation among open-source projects such as llama.cpp, vLLM, and MLC-LLM.

        + +

        While these projects are performant, they often come with tradeoffs in ease of use, such as requiring model conversion to specific formats or building and shipping new dependencies. This begs the question: how fast can we run transformer inference with only pure, native PyTorch?

        + +

As announced during our recent PyTorch Developer Conference, the PyTorch team wrote a from-scratch LLM implementation that is almost 10x faster than baseline, with no loss of accuracy, all using native PyTorch optimizations. We leverage a breadth of optimizations including:

        + + + +

        And, even better, we can do it in less than 1000 lines of native PyTorch code.

        + +

        If this excites you enough to jump straight into the code, check it out at https://github.com/pytorch-labs/gpt-fast!

        + +

        Screen recording

        + +

        Note: We will be focusing on latency (i.e. batch size=1) for all of these benchmarks. Unless otherwise specified, all benchmarks are run on an A100-80GB, power limited to 330W.

        + +

        Starting Point (25.5 tok/s)

        + +

        Let’s start off with an extremely basic and simple implementation.

        + +

        simple implementation

        + +

        Sadly, this does not perform very well. But why? Looking at a trace reveals the answer - it’s heavily CPU overhead bound! What this means is that our CPU is not able to tell the GPU what to do fast enough for the GPU to be fully utilized.

        + +

        trace

        + +

        Imagine the GPU as this super massive factory with a ridiculous amount of compute available. Then, imagine the CPU as some messenger shuttling instructions back and forth to the GPU. Remember, in large scale deep learning systems, the GPU is responsible for doing 100% of the work! In such systems, the only role of the CPU is to tell the GPU what work it should be doing.

        + +

        factory

        + +

        So, the CPU runs over and tells the GPU to do an “add”, but by the time the CPU can give the GPU another chunk of work, the GPU has long finished the previous chunk of work.

        + +

        Despite the fact that the GPU needs to perform thousands of computations while the CPU only needs to do orchestration work, this is surprisingly common! There’s a variety of reasons for this, ranging from the fact that the CPU is likely running some single-threaded Python to the fact that GPUs are just incredibly fast nowadays.

        + +

        Regardless of the reason, we now find ourselves in the overhead-bound regime. So, what can we do? One, we could rewrite our implementation in C++, perhaps even eschew frameworks entirely and write raw CUDA. Or…. we could just send more work to the GPU at once.

        + +

        factory

        + +

        By just sending a massive chunk of work at once, we can keep our GPU busy! Although during training, this may just be accomplished by increasing your batch size, how do we do this during inference?

        + +

        Enter torch.compile.

        + +

        Step 1: Reducing CPU overhead through torch.compile and a static kv-cache (107.0 tok/s)

        + +

Torch.compile allows us to capture a larger region of computation into a single compiled region, and, particularly when run with mode=”reduce-overhead”, is very effective at reducing CPU overhead. Here, we also specify fullgraph=True, which validates that there are no “graph breaks” in your model (i.e. portions that torch.compile cannot compile). In other words, it ensures that torch.compile is running to its fullest potential.

        + +

        To apply it, we simply wrap a function (or a module) with it.

        + +
        torch.compile(decode_one_token, mode="reduce-overhead", fullgraph=True)
        +
        + +

        However, there are a couple of nuances here that make it somewhat nontrivial for folks to get significant performance boosts from applying torch.compile to text generation.

        + +

        The first obstacle is the kv-cache. The kv-cache is an inference-time optimization that caches the activations computed for the previous tokens (see here for a more in-depth explanation). However, as we generate more tokens, the “logical length” of the kv-cache grows. This is problematic for two reasons. One is that reallocating (and copying!) the kv-cache every time the cache grows is simply expensive. The other one is that this dynamism makes it harder to reduce the overhead, as we are no longer able to leverage approaches like cudagraphs.

        + +

        To resolve this, we use a “static” kv-cache, which means that we statically allocate the maximum size of the kv-cache, and then mask out the unused values in the attention portion of the computation.

        + +

        code
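As a rough illustration of the idea (a simplified sketch, not the exact gpt-fast code), a static kv-cache boils down to pre-allocating max-length buffers once and writing each step's keys/values into fixed positions, so tensor shapes never change during decoding:

import torch

class StaticKVCache(torch.nn.Module):
    # Minimal sketch: pre-allocate kv buffers at max length and update them in place.
    def __init__(self, max_seq_len, n_heads, head_dim, dtype=torch.bfloat16):
        super().__init__()
        shape = (1, n_heads, max_seq_len, head_dim)  # batch size 1 for this sketch
        self.register_buffer("k_cache", torch.zeros(shape, dtype=dtype))
        self.register_buffer("v_cache", torch.zeros(shape, dtype=dtype))

    def update(self, input_pos, k_new, v_new):
        # input_pos holds the positions being written; shapes stay fixed every step,
        # which is what lets torch.compile(mode="reduce-overhead") use CUDA graphs.
        self.k_cache[:, :, input_pos] = k_new
        self.v_cache[:, :, input_pos] = v_new
        return self.k_cache, self.v_cache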

        + +

        The second obstacle is the prefill phase. Transformer text generation is best thought of as a two phase process: 1. The prefill where the entire prompt is processed, and 2. Decoding where each token is generated autoregressively.

        + +

        Although decoding can be made entirely static once the kv-cache is made static, the prefill stage still requires significantly more dynamism, due to having a variable prompt length. Thus, we actually need to compile the two stages with separate compilation strategies.

        + +

        compile
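In code, this roughly amounts to compiling the two functions with different options. The sketch below follows the shape of gpt-fast (decode_one_token and prefill are the repository's function names); treat the exact flag combination as illustrative:

import torch

# Decoding: shapes are fixed thanks to the static kv-cache, so we can use
# CUDA-graph-style overhead reduction and insist on a single full graph.
decode_one_token = torch.compile(decode_one_token, mode="reduce-overhead", fullgraph=True)

# Prefill: the prompt length varies between requests, so compile with dynamic shapes.
prefill = torch.compile(prefill, fullgraph=True, dynamic=True)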

        + +

        Although these details are a bit tricky, the actual implementation is not very difficult at all (see gpt-fast)! And the performance boost is dramatic.

        + +

        chart

        + +

        All of a sudden, our performance improves by more than 4x! Such performance gains are often common when one’s workload is overhead bound.

        + +

        Sidenote: How is torch.compile helping?

        + +

        It is worth disentangling how exactly torch.compile is improving performance. There’s 2 main factors leading to torch.compile’s performance.

        + +

The first factor, as mentioned above, is overhead reduction. torch.compile is able to reduce overhead through a variety of optimizations, but one of the most effective ones is CUDAGraphs. torch.compile applies this automatically for you when “reduce-overhead” is set, saving the extra work and code you would need to write to do this yourself manually without torch.compile.

        + +

        The second factor, however, is that torch.compile simply generates faster kernels. In the decoding benchmark above, torch.compile actually generates every single kernel from scratch, including both the matrix multiplications and the attention! And even cooler, these kernels are actually faster than the built in alternatives (CuBLAS and FlashAttention2)!

        + +

        This may sound implausible to many of you, considering how hard it is to write efficient matrix multiplication/attention kernels, and how much manpower has been put into CuBLAS and FlashAttention. The key here, however, is that transformer decoding has very unusual computational properties. In particular, because of the KV-cache, for BS=1 every single matrix multiplication in a transformer is actually a matrix vector multiplication.

        + +

        This means that the computations are completely memory-bandwidth bound, and as such, are well within the range of compilers to automatically generate. And in fact, when we benchmark torch.compile’s matrix-vector multiplications against CuBLAS, we find that torch.compile’s kernels are actually quite a bit faster!

        + +

        code

        + +

        code

        + +

        Step 2: Alleviating memory bandwidth bottleneck through int8 weight-only quantization (157.4 tok/s)

        + +

        So, given that we’ve already seen massive speedups from applying torch.compile, is it possible to do even better? One way to think about this problem is to compute how close we are to the theoretical peak. In this case, the largest bottleneck is the cost of loading the weights from GPU global memory to registers. In other words, each forward pass requires us to “touch” every single parameter on the GPU. So, how fast can we theoretically “touch” every single parameter in a model?

        + +

        weights

        + +

        To measure this, we can use Model Bandwidth Utilization (MBU). This measures what percentage of our memory bandwidth we’re able to use during inference.

        + +

        Computing it is pretty simple. We simply take the total size of our model (# params * bytes per param) and multiply it by the number of inferences we can do per second. Then, we divide this by the peak bandwidth of the GPU to get our MBU.

        + +

        MBU

        + +

        For example, for our above case, we have a 7B parameter model. Each parameter is stored in fp16 (2 bytes per parameter), and we achieved 107 tokens/s. Finally, our A100-80GB has a theoretical 2 TB/s of memory bandwidth.

        + +

        MBU

        + +

Putting this all together, we get 72% MBU! This is quite good, considering that even just copying memory struggles to break 85%.
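Spelling out that arithmetic as a quick back-of-the-envelope script (the constants are the approximate ones used in this example; the exact parameter count and achievable bandwidth shift the result by a few percent, landing at roughly 72%):

num_params      = 7e9    # ~7B parameters
bytes_per_param = 2      # fp16 / bf16
tokens_per_sec  = 107    # measured decoding throughput
peak_bw_bytes   = 2e12   # ~2 TB/s theoretical A100-80GB memory bandwidth

achieved_bw = num_params * bytes_per_param * tokens_per_sec  # bytes "touched" per second
mbu = achieved_bw / peak_bw_bytes
print(f"MBU: {mbu:.0%}")  # ~75% with these rounded constants, ~72% with exact ones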

        + +

But… it does mean that we’re pretty close to the theoretical limit here, and that we’re clearly bottlenecked on just loading our weights from memory. It doesn’t matter what we do - without changing the problem statement in some manner, we might only be able to eke out another 10% in performance.

        + +

        Let’s take another look at the above equation. We can’t really change the number of parameters in our model. We can’t really change the memory bandwidth of our GPU (well, without paying more money). But, we can change how many bytes each parameter is stored in!

        + +

        MBU

        + +

        Thus, we arrive at our next technique - int8 quantization. The idea here is simple. If loading our weights from memory is our main bottleneck, why don’t we just make the weights smaller?

        + +

        MBU

        + +

        Note that this is quantizing only the weights - the computation itself is still done in bf16. This makes this form of quantization easy to apply with very little to no accuracy degradation.
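Conceptually, int8 weight-only quantization with a per-row (per-output-channel) scale looks like the sketch below. Real implementations (e.g. in gpt-fast) fuse the dequantization into the matmul kernel via torch.compile rather than materializing the bf16 weights as done here:

import torch
import torch.nn.functional as F

def quantize_int8_rowwise(w: torch.Tensor):
    # Symmetric per-output-channel quantization: one scale per weight row.
    scale = w.abs().amax(dim=1, keepdim=True).clamp(min=1e-8) / 127.0
    w_int8 = torch.clamp(torch.round(w / scale), -128, 127).to(torch.int8)
    return w_int8, scale

def int8_weight_only_linear(x, w_int8, scale, bias=None):
    # The computation itself still happens in bf16: dequantize, then matmul.
    return F.linear(x, w_int8.to(x.dtype) * scale, bias)

w = torch.randn(4096, 4096, dtype=torch.bfloat16)
w_int8, scale = quantize_int8_rowwise(w)
y = int8_weight_only_linear(torch.randn(1, 4096, dtype=torch.bfloat16), w_int8, scale)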

        + +

        Moreover, torch.compile can also easily generate efficient code for int8 quantization. Let’s look again at the above benchmark, this time with int8 weight-only quantization included.

        + +

        code

        + +

        code

        + +

        As you can see from the dark blue line (torch.compile + int8), there is a significant performance improvement when using torch.compile + int8 weight-only quantization! Moreover, the light-blue line (no torch.compile + int8) is actually much worse than even the fp16 performance! This is because in order to take advantage of the perf benefits of int8 quantization, we need the kernels to be fused. This shows one of the benefits of torch.compile - these kernels can be automatically generated for the user!

        + +

        Applying int8 quantization to our model, we see a nice 50% performance improvement, bringing us up to 157.4 tokens/s!

        + +

        chart

        + +

        Step 3: Reframing the problem using speculative decoding

        + +

        Even after using techniques like quantization, we’re still faced with another problem. In order to generate 100 tokens, we must load our weights 100 times.

        + +

        diagram

        + +

        Even if the weights are quantized, we still must load our weights over and over, once for each token we generate! Is there any way around this?

        + +

        At first glance, the answer might seem like no - there’s a strict serial dependency in our autoregressive generation. However, as it turns out, by utilizing speculative decoding, we’re able to break this strict serial dependency and obtain speedups!

        + +

        engineers

        + +

        Imagine you had a senior engineer (called Verity), who makes the right technical decisions but is rather slow at writing code. However, you also have a junior engineer (called Drake), who doesn’t always make the right technical decisions but can write code much faster (and cheaper!) than Verity. How can we take advantage of Drake (the junior engineer) to write code faster while ensuring that we are still making the right technical decisions?

        + +

        engineers

        + +

        First, Drake goes through the labor-intensive process of writing the code, making technical decisions along the way. Next, we give the code to Verity to review.

        + +

        engineers

        + +

        Upon reviewing the code, Verity might decide that the first 3 technical decisions Drake made are correct, but the last 2 need to be redone. So, Drake goes back, throws away his last 2 decisions, and restarts coding from there.

        + +

        Notably, although Verity (the senior engineer) has only looked at the code once, we are able to generate 3 pieces of validated code identical to what she would have written! Thus, assuming Verity is able to review the code faster than it would have taken her to write those 3 pieces herself, this approach comes out ahead.

        + +

In the context of transformer inference, Verity would play the role of the larger model whose outputs we want for our task, called the verifier model. Similarly, Drake would be played by a smaller model that’s able to generate text much faster than the larger model, called the draft model. So, we would generate 8 tokens using the draft model, and then process all eight tokens in parallel using the verifier model, throwing out the ones that don’t match.

        + +

        Like mentioned above, one crucial property of speculative decoding is that it does not change the quality of the output. As long as the time it takes for generating the tokens using the draft model + verifying the tokens is less than it would have taken to generate those tokens, we come out ahead.

        + +

        One of the great things about doing this all in native PyTorch is that this technique is actually really easy to implement! Here’s the entirety of the implementation, in about 50 lines of native PyTorch.

        + +

        code
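The figure above points at the actual ~50-line implementation; purely to convey the control flow, here is a heavily simplified, greedy-only sketch (draft_model and verifier_model are placeholder callables that return logits, and a full implementation would also sample a bonus token when every draft token is accepted):

import torch

@torch.no_grad()
def speculative_step(draft_model, verifier_model, tokens, k=8):
    # 1) The draft model proposes k tokens autoregressively (cheap per token).
    draft = tokens
    for _ in range(k):
        next_tok = draft_model(draft)[:, -1:].argmax(dim=-1)
        draft = torch.cat([draft, next_tok], dim=1)

    # 2) The verifier scores all k proposed positions in one parallel forward pass.
    logits = verifier_model(draft[:, :-1])
    verified = logits[:, -k:].argmax(dim=-1)      # tokens the verifier would emit
    proposed = draft[:, -k:]

    # 3) Accept the longest matching prefix, then take the verifier's correction.
    matches = (verified == proposed)[0].int()
    n_accept = int(matches.cumprod(dim=0).sum())
    accepted = proposed[:, :n_accept]
    correction = verified[:, n_accept:n_accept + 1]   # empty if everything matched
    return torch.cat([tokens, accepted, correction], dim=1)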

        + +

        Although speculative decoding guarantees that we have mathematically identical results compared to regular generation, it does have the property that the runtime performance varies depending on the generated text, as well as how aligned the draft and verifier model are. For example, when running CodeLlama-34B + CodeLlama-7B, we’re able to obtain a 2x boost in tokens/s for generating code. On the other hand, when using Llama-7B + TinyLlama-1B, we’re only able to obtain about a 1.3x boost in tokens/s.

        + +

        Sidenote: Running this on AMD

        + +

        Like mentioned above, every single kernel in decoding is generated from scratch by torch.compile, and is converted into OpenAI Triton. As AMD has a torch.compile backend (and also a Triton backend), we can simply go through all of the optimizations above… but on an AMD GPU! With int8 quantization, we’re able to achieve 102.5 tokens/s with one GCD (i.e. one half) of a MI250x!

        + +

        chart

        + +

        Step 4: Reducing the size of the weights even more with int4 quantization and GPTQ (202.1 tok/s)

        + +

        Of course, if reducing the weights down from 16 bits to 8 bits allows for speedups by reducing the number of bytes we need to load, reducing the weights down to 4 bits would result in even larger speedups!

        + +

        Unfortunately, when reducing weights down to 4-bits, the accuracy of the model starts to become a much larger concern. From our preliminary evals, we see that although using int8 weight-only quantization has no perceptible accuracy degradation, using int4 weight-only quantization does.

        + +

        table

        + +

        There are 2 main tricks we can use to limit the accuracy degradation of int4 quantization.

        + +

        The first one is to have a more granular scaling factor. One way to think about the scaling factor is that when we have a quantized tensor representation, it is on a sliding scale between a floating point tensor (each value has a scaling factor) and an integer tensor (no values have a scaling factor). For example, with int8 quantization, we had one scaling factor per row. If we want higher accuracy, however, we can change that to “one scaling factor per 32 elements”. We choose a group size of 32 to minimize accuracy degradation, and this is also a common choice among the community.
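As a small illustration of what “one scaling factor per 32 elements” means (group-wise scales only; the int4 bit-packing and the fused dequantize-matmul kernels discussed below are omitted, and the group size is assumed to divide the row length):

import torch

def groupwise_int4_scales(w: torch.Tensor, group_size: int = 32):
    # One symmetric scale per group of 32 weights along each row.
    out_features, in_features = w.shape
    groups = w.reshape(out_features, in_features // group_size, group_size)
    scales = (groups.abs().amax(dim=-1) / 7.0).clamp(min=1e-6)   # int4 range ~[-8, 7]
    q = torch.clamp(torch.round(groups / scales.unsqueeze(-1)), -8, 7)
    return q, scales        # q holds int4-valued numbers, left unpacked in this sketch

w = torch.randn(4096, 4096, dtype=torch.bfloat16)
q, scales = groupwise_int4_scales(w)   # scales shape: (4096, 4096 // 32)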

        + +

        The other one is to use a more advanced quantization strategy than simply rounding the weights. For example, approaches like GPTQ leverage example data in order to calibrate the weights more accurately. In this case, we prototype an implementation of GPTQ in the repository based off of PyTorch’s recently released torch.export.

        + +

        In addition, we need kernels that fuse int4 dequantize with the matrix vector multiplication. In this case, torch.compile is unfortunately not able to generate these kernels from scratch, so we leverage some handwritten CUDA kernels in PyTorch.

        + +

        These techniques require some additional work, but putting them all together results in even better performance!

        + +

        chart

        + +

        Step 5: Combining everything together (244.7 tok/s)

        + +

        Finally, we can compose all of the techniques together to achieve even better performance!

        + +

        chart

        + +

        Step 6: Using Tensor Parallelism

        + +

        So far, we’ve been restricting ourselves to minimizing latency while on a single GPU. In many settings, however, we have access to multiple GPUs. This allows us to improve our latency further!

        + +

        To get an intuitive sense of why this would allow us to improve our latency, let’s take a look at the prior equation for MBU, particularly the denominator. Running on multiple GPUs gives us access to more memory bandwidth, and thus, higher potential performance.

        + +

        MBU

        + +

        As for which parallelism strategy to pick, note that in order to reduce our latency for one example, we need to be able to leverage our memory bandwidth across more devices simultaneously. This means that we need to split the processing of one token across multiple devices. In other words, we need to use tensor parallelism.

        + +

        Luckily, PyTorch also provides low-level tools for tensor-parallelism that compose with torch.compile. We are also working on higher-level APIs for expressing tensor parallelism, stay tuned for those!

        + +

        However, even without a higher-level API, it’s actually still quite easy to add tensor parallelism. Our implementation comes in at 150 lines of code, and doesn’t require any model changes.

        + +

        code
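The figure above points at the actual tp.py implementation; purely to convey the idea, here is a single-process sketch of tensor-parallel math for one MLP, where a plain ReLU MLP stands in for the actual transformer feed-forward block: the first projection is split column-wise, the second row-wise, and the partial outputs are summed (on real multi-GPU setups each shard lives on its own device and the sum becomes an all-reduce over torch.distributed):

import torch
import torch.nn.functional as F

def tp_mlp(x, w1, w2, n_shards=2):
    # Column-parallel first linear: split w1's output features across shards.
    w1_shards = w1.chunk(n_shards, dim=0)        # each: (hidden / n_shards, d_model)
    # Row-parallel second linear: split w2's input features across shards.
    w2_shards = w2.chunk(n_shards, dim=1)        # each: (d_model, hidden / n_shards)

    partials = []
    for w1_s, w2_s in zip(w1_shards, w2_shards):
        h = F.relu(x @ w1_s.t())                 # local activation shard
        partials.append(h @ w2_s.t())            # local partial output
    return sum(partials)                         # all-reduce on a real multi-GPU setup

x = torch.randn(1, 4096)
w1 = torch.randn(11008, 4096)                    # (hidden, d_model)
w2 = torch.randn(4096, 11008)                    # (d_model, hidden)
out = tp_mlp(x, w1, w2)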

        + +

        We are still able to take advantage of all the optimizations mentioned previously, which all can continue to compose with tensor parallelism. Combining these together, we’re able to serve Llama-70B at 55 tokens/s with int8 quantization!

        + +

        chart

        + +

        Conclusion

        + +

        Let’s take a look at what we’re able to accomplish.

        + +
1. Simplicity: Ignoring quantization, model.py (244 LOC) + generate.py (371 LOC) + tp.py (151 LOC) comes out to 766 LOC to implement fast inference + speculative decoding + tensor-parallelism.
2. Performance: With Llama-7B, we’re able to use compile + int4 quant + speculative decoding to reach 241 tok/s. With llama-70B, we’re able to also throw in tensor-parallelism to reach 80 tok/s. These are both close to or surpassing SOTA performance numbers!

        PyTorch has always allowed for simplicity, ease of use, and flexibility. However, with torch.compile, we can throw in performance as well.

        + +

        The code can be found here: https://github.com/pytorch-labs/gpt-fast. We hope that the community finds it useful. Our goal with this repo is not to provide another library or framework for people to import. Instead, we encourage users to copy-paste, fork, and modify the code in the repo.

        + +

        Acknowledgements

        + +

        We would like to thank the vibrant open source community for their continual support of scaling LLMs, including:

        + +
• Lightning AI for supporting pytorch and work in flash attention, int8 quantization, and LoRA fine-tuning.
• GGML for driving forward fast, on-device inference of LLMs
• Andrej Karpathy for spearheading simple, interpretable and fast LLM implementations
• MLC-LLM for pushing 4-bit quantization performance on heterogeneous hardware
diff --git a/blog/accelerating-generative-ai-3/index.html b/blog/accelerating-generative-ai-3/index.html new file mode 100644 index 000000000000..ff1463ec8868 --- /dev/null +++ b/blog/accelerating-generative-ai-3/index.html

Accelerating Generative AI Part III: Diffusion, Fast | PyTorch

by Sayak Paul and Patrick von Platen (Hugging Face 🤗)

This post is the third part of a multi-series blog focused on how to accelerate generative AI models with pure, native PyTorch. We are excited to share a breadth of newly released PyTorch performance features alongside practical examples to see how far we can push PyTorch native performance. In part one, we showed how to accelerate Segment Anything over 8x using only pure, native PyTorch. In part two, we showed how to accelerate Llama-7B by almost 10x using only native PyTorch optimizations. In this blog, we’ll focus on speeding up text-to-image diffusion models by up to 3x.

        + +

        We will leverage an array of optimizations including:

        + +
• Running with the bfloat16 precision
• scaled_dot_product_attention (SDPA)
• torch.compile
• Combining q,k,v projections for attention computation
• Dynamic int8 quantization

        We will primarily focus on Stable Diffusion XL (SDXL), demonstrating a latency improvement of 3x. These techniques are PyTorch-native, which means you don’t have to rely on any third-party libraries or any C++ code to take advantage of them.

        + +

        Enabling these optimizations with the 🤗Diffusers library takes just a few lines of code. If you’re already feeling excited and cannot wait to jump to the code, check out the accompanying repository here: https://github.com/huggingface/diffusion-fast.

        + +

        SDXL Chart

        + +

        (The discussed techniques are not SDXL-specific and can be used to speed up other text-to-image diffusion systems, as shown later.)

        + +

        Below, you can find some blog posts on similar topics:

        + + + +

        Setup

        + +

        We will demonstrate the optimizations and their respective speed-up gains using the 🤗Diffusers library. Apart from that, we will make use of the following PyTorch-native libraries and environments:

        + +
• Torch nightly (to benefit from the fastest kernels for efficient attention; 2.3.0.dev20231218+cu121)
• 🤗 PEFT (version: 0.7.1)
• torchao (commit SHA: 54bcd5a10d0abbe7b0c045052029257099f83fd9)
• CUDA 12.1

        For an easier reproduction environment, you can also refer to this Dockerfile. The benchmarking numbers presented in this post come from a 400W 80GB A100 GPU (with its clock rate set to its maximum capacity).

        + +

        Since we use an A100 GPU (Ampere architecture) here, we can specify torch.set_float32_matmul_precision("high") to benefit from the TF32 precision format.

        + +

        Run inference using a reduced precision

        + +

        Running SDXL in Diffusers just takes a few lines of code:

        + +
        from diffusers import StableDiffusionXLPipeline
        +
        +## Load the pipeline in full-precision and place its model components on CUDA.
        +pipe = StableDiffusionXLPipeline.from_pretrained("stabilityai/stable-diffusion-xl-base-1.0").to("cuda")
        +
        +## Run the attention ops without efficiency.
        +pipe.unet.set_default_attn_processor()
        +pipe.vae.set_default_attn_processor()
        +
        +prompt = "Astronaut in a jungle, cold color palette, muted colors, detailed, 8k"
        +image = pipe(prompt, num_inference_steps=30).images[0]
        +
        + +

        But this isn’t very practical as it takes 7.36 seconds to generate a single image with 30 steps. This is our baseline which we will try to optimize one step at a time.

        + +

        SDXL Chart

        + +

        Here, we’re running the pipeline with the full precision. We can immediately cut down the inference time by using a reduced precision such as bfloat16. Besides, modern GPUs come with dedicated cores for running accelerated computation benefiting from reduced precision. To run the computations of the pipeline in the bfloat16 precision, we just need to specify the data type while initializing the pipeline:

        + +
        from diffusers import StableDiffusionXLPipeline
        +
        +pipe = StableDiffusionXLPipeline.from_pretrained(
        +	"stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.bfloat16
        +).to("cuda")
        +
        +## Run the attention ops without efficiency.
        +pipe.unet.set_default_attn_processor()
        +pipe.vae.set_default_attn_processor()
        +prompt = "Astronaut in a jungle, cold color palette, muted colors, detailed, 8k"
        +image = pipe(prompt, num_inference_steps=30).images[0]
        +
        + +

        SDXL Chart

        + +

        By using a reduced precision, we’re able to cut down the inference latency from 7.36 seconds to 4.63 seconds.

        + +

        Some notes on the use of bfloat16

        + +
• Using a reduced numerical precision (such as float16, bfloat16) to run inference doesn’t affect the generation quality but significantly improves latency.
• The benefits of using the bfloat16 numerical precision as compared to float16 are hardware-dependent. Modern generations of GPUs tend to favor bfloat16.
• Furthermore, in our experiments, we found bfloat16 to be much more resilient when used with quantization in comparison to float16.

        (We later ran the experiments in float16 and found out that the recent versions of torchao do not incur numerical problems from float16.)

        + +

        Use SDPA for performing attention computations

        + +

By default, Diffusers uses scaled_dot_product_attention (SDPA) for performing attention-related computations when using PyTorch 2. SDPA provides faster and more efficient kernels to run intensive attention-related operations. To run the pipeline with SDPA, we simply don’t set any attention processor, like so:

        + +
        from diffusers import StableDiffusionXLPipeline
        +
        +pipe = StableDiffusionXLPipeline.from_pretrained(
        +	"stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.bfloat16
        +).to("cuda")
        +
        +prompt = "Astronaut in a jungle, cold color palette, muted colors, detailed, 8k"
        +image = pipe(prompt, num_inference_steps=30).images[0]
        +
        + +

        SDPA gives a nice boost from 4.63 seconds to 3.31 seconds.

        + +

        SDXL Chart

        + +

        Compiling the UNet and VAE

        + +

        We can ask PyTorch to perform some low-level optimizations (such as operator fusion and launching faster kernels with CUDA graphs) by using torch.compile. For the StableDiffusionXLPipeline, we compile the denoiser (UNet) and the VAE:

        + +
        from diffusers import StableDiffusionXLPipeline
        +import torch
        +
        +pipe = StableDiffusionXLPipeline.from_pretrained(
        +    "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.bfloat16
        +).to("cuda")
        +
        +## Compile the UNet and VAE.
        +pipe.unet = torch.compile(pipe.unet, mode="max-autotune", fullgraph=True)
        +pipe.vae.decode = torch.compile(pipe.vae.decode, mode="max-autotune", fullgraph=True)
        +
        +prompt = "Astronaut in a jungle, cold color palette, muted colors, detailed, 8k"
        +
        +## First call to `pipe` will be slow, subsequent ones will be faster.
        +image = pipe(prompt, num_inference_steps=30).images[0]
        +
        + +

        Using SDPA attention and compiling both the UNet and VAE reduces the latency from 3.31 seconds to 2.54 seconds.

        + +

        SDXL Chart

        + +

        Notes on torch.compile

        + +

torch.compile offers different backends and modes. As we’re aiming for maximum inference speed, we opt for the inductor backend using the “max-autotune” mode. “max-autotune” uses CUDA graphs and optimizes the compilation graph specifically for latency. Using CUDA graphs greatly reduces the overhead of launching GPU operations. It saves time by using a mechanism to launch multiple GPU operations through a single CPU operation.

        + +

        Specifying fullgraph to be True ensures that there are no graph breaks in the underlying model, ensuring the fullest potential of torch.compile. In our case, the following compiler flags were also important to be explicitly set:

        + +
        torch._inductor.config.conv_1x1_as_mm = True
        +torch._inductor.config.coordinate_descent_tuning = True
        +torch._inductor.config.epilogue_fusion = False
        +torch._inductor.config.coordinate_descent_check_all_directions = True
        +
        + +

        For the full list of compiler flags, refer to this file.

        + +

        We also change the memory layout of the UNet and the VAE to “channels_last” when compiling them to ensure maximum speed:

        + +
        pipe.unet.to(memory_format=torch.channels_last)
        +pipe.vae.to(memory_format=torch.channels_last)
        +
        + +

        In the next section, we’ll show how to improve the latency even further.

        + +

        Additional optimizations

        + +

        No graph breaks during torch.compile

        + +

        Ensuring that the underlying model/method can be fully compiled is crucial for performance (torch.compile with fullgraph=True). This means having no graph breaks. We did this for the UNet and VAE by changing how we access the returning variables. Consider the following example:

        + +

        code example

        + +

        Getting rid of GPU syncs after compilation

        + +

During the iterative reverse diffusion process, we call step() on the scheduler each time after the denoiser predicts the less noisy latent embeddings. Inside step(), the sigmas variable is indexed. If the sigmas array is placed on the GPU, indexing causes a communication sync between the CPU and GPU. This adds latency, and it becomes more evident when the denoiser has already been compiled.

        + +

        But if the sigmas array always stays on the CPU (refer to this line), this sync doesn’t take place, hence improved latency. In general, any CPU <-> GPU communication sync should be none or be kept to a bare minimum as it can impact inference latency.

        + +

        Using combined projections for attention ops

        + +

        Both the UNet and the VAE used in SDXL make use of Transformer-like blocks. A Transformer block consists of attention blocks and feed-forward blocks.

        + +

        In an attention block, the input is projected into three sub-spaces using three different projection matrices – Q, K, and V. In the naive implementation, these projections are performed separately on the input. But we can horizontally combine the projection matrices into a single matrix and perform the projection in one shot. This increases the size of the matmuls of the input projections and improves the impact of quantization (to be discussed next).

        + +

        Enabling this kind of computation in Diffusers just takes a single line of code:

        + +
        pipe.fuse_qkv_projections()
        +
        + +

        This will make the attention operations for both the UNet and the VAE take advantage of the combined projections. For the cross-attention layers, we only combine the key and value matrices. To learn more, you can refer to the official documentation here. It’s worth noting that we leverage PyTorch’s scaled_dot_product_attention here internally.
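The underlying idea can be sketched in a few lines: concatenate the three projection weights and run one larger matmul, then split the result. This is an illustration of the math only, not the Diffusers implementation:

import torch

d_model = 1024
x = torch.randn(2, 77, d_model)                  # (batch, tokens, dim)
w_q, w_k, w_v = (torch.randn(d_model, d_model) for _ in range(3))

# Naive: three separate, smaller matmuls.
q, k, v = x @ w_q, x @ w_k, x @ w_v

# Fused: one larger matmul followed by a split -- same result, but a bigger GEMM
# that keeps the GPU busier and quantizes more favorably.
w_qkv = torch.cat([w_q, w_k, w_v], dim=1)        # (d_model, 3 * d_model)
q2, k2, v2 = (x @ w_qkv).chunk(3, dim=-1)

assert torch.allclose(q, q2, atol=1e-4)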

        + +

        These additional techniques improved the inference latency from 2.54 seconds to 2.52 seconds.

        + +

        SDXL Chart

        + +

        Dynamic int8 quantization

        + +

        We selectively apply dynamic int8 quantization to both the UNet and the VAE. This is because quantization adds additional conversion overhead to the model that is hopefully made up for by faster matmuls (dynamic quantization). If the matmuls are too small, these techniques may degrade performance.

        + +

        Through experimentation, we found that certain linear layers in the UNet and the VAE don’t benefit from dynamic int8 quantization. You can check out the full code for filtering those layers here (referred to as dynamic_quant_filter_fn below).

        + +

        We leverage the ultra-lightweight pure PyTorch library torchao to use its user-friendly APIs for quantization:

        + +
        from torchao.quantization import apply_dynamic_quant
        +
        +apply_dynamic_quant(pipe.unet, dynamic_quant_filter_fn)
        +apply_dynamic_quant(pipe.vae, dynamic_quant_filter_fn)
        +
        + +

        Since this quantization support is limited to linear layers only, we also turn suitable pointwise convolution layers into linear layers to maximize the benefit. We also specify the following compiler flags when using this option:

        + +
        torch._inductor.config.force_fuse_int_mm_with_mul = True
        +torch._inductor.config.use_mixed_mm = True
        +
        + +

        To prevent any numerical issues stemming from quantization, we run everything in the bfloat16 format.

        + +

        Applying quantization this way improved the latency from 2.52 seconds to 2.43 seconds.

        + +

        SDXL Chart

        + +

        Resources

        + +

        We welcome you to check out the following codebases to reproduce these numbers and extend the techniques to other text-to-image diffusion systems as well:

        + + + +

        Other links

        + + + +

        Improvements in other pipelines

        + +

        We applied these techniques to other pipelines to test the generality of our approach. Below are our findings:

        + +

        SSD-1B

        + +

        SSD-1B Chart

        + +

        Stable Diffusion v1-5

        + +

        Stable Diffusion v1-5 chart

        + +

        PixArt-alpha/PixArt-XL-2-1024-MS

        + +

        It’s worth noting that PixArt-Alpha uses a Transformer-based architecture as its denoiser for the reverse diffusion process instead of a UNet.

        + +

        PixArt-alpha/PixArt-XL-2-1024-MS chart

        + +

        Note that for Stable Diffusion v1-5 and PixArt-Alpha, we didn’t explore the best shape combination criteria for applying dynamic int8 quantization. It might be possible to get better numbers with a better combination.

        + +

        Collectively, the methods we presented offer substantial speedup over the baseline without degradation in the generation quality. Furthermore, we believe that these methods should complement other optimization methods popular in the community (such as DeepCache, Stable Fast, etc.).

        + +

        Conclusion and next steps

        + +

        In this post, we presented a basket of simple yet effective techniques that can help improve the inference latency of text-to-image Diffusion models in pure PyTorch. In summary:

        + +
• Using a reduced precision to perform our computations
• Scaled dot-product attention for running the attention blocks efficiently
• torch.compile with “max-autotune” to optimize for latency
• Combining the different projections together for computing attention
• Dynamic int8 quantization

        We believe there’s a lot to be explored in terms of how we apply quantization to a text-to-image diffusion system. We didn’t exhaustively explore which layers in the UNet and the VAE tend to benefit from dynamic quantization. There might be opportunities to further speed things up with a better combination of the layers being targeted for quantization.

        + +

        We kept the text encoders of SDXL untouched other than just running them in bfloat16. Optimizing them might also lead to improvements in latency.

        + +

        Acknowledgements

        + +

        Thanks to Ollin Boer Bohan whose VAE was used throughout the benchmarking process as it is numerically more stable under reduced numerical precisions.

        + +

        Thanks to Hugo Larcher from Hugging Face for helping with infrastructure.

        + +
diff --git a/blog/accelerating-generative-ai-4/index.html b/blog/accelerating-generative-ai-4/index.html new file mode 100644 index 000000000000..adb243eb09ef --- /dev/null +++ b/blog/accelerating-generative-ai-4/index.html

Accelerating Generative AI with PyTorch IV: Seamless M4T, fast | PyTorch

by Yejin Lee, Carole-Jean Wu, Christian Puhrsch, Joel Schlosser, Driss Guessous, Jeffrey Wan, Joe Isaacson, Can Balioglu, Juan Pino

        This post is the fourth part of a multi-series blog focused on how to accelerate generative AI models with pure, native PyTorch. To skip to the code, check out our github (seamless_communication, fairseq2). We are excited to share a breadth of newly released PyTorch performance features alongside practical examples to see how far we can push PyTorch native performance. In part one, we showed how to accelerate Segment Anything over 8x using only pure, native PyTorch. In part two, we showed how to accelerate Llama-7B by almost 10x using only native PyTorch optimizations. In part three, we showed how to accelerate text-to-image diffusion models up to 3x using only native Pytorch optimizations.

        + +

In this blog, we’ll focus on speeding up FAIR’s Seamless M4T-v2 model, achieving a 2x speedup for the text decoder module and a 30x speedup for the vocoder module, which results in a 2.7x speedup for end-to-end inference, with no loss of accuracy, by using CUDA Graph and native PyTorch optimizations:

        + + + +

        End to End Inference Speedup

        + +

        Introduction

        + +

        Seamless M4T is an open-source foundational speech/text translation and transcription technology developed by FAIR. Seamless M4T is a massively multilingual and multimodal machine translation model, with the latest version (Seamless M4T-v2) released on November 30th, 2023. The high-level model architecture of Seamless M4T-v2 is illustrated in Figure 1.

        + +

        Model Architecture of Seamless M4T-v2

        + +

        Figure 1. Model Architecture of Seamless M4T-v2.

        + +

        Accelerating inference latency is crucial for translation models to improve user experience through faster communication across languages. In particular, batch_size=1 is often used for fast translation where latency matters a lot in applications such as chatbots, speech translation, and live subtitling. Therefore, we conducted the performance analysis on inference with batch_size=1, as shown in Figure 2 to understand the Amdahl’s Law bottleneck. Our results indicate that the text decoder and vocoder are the most time-consuming modules, accounting for 61% and 23% of the inference time, respectively.

        + +

Text decoder and vocoder are the most time-consuming modules. Breakdown of inference time by modules for English-Spanish S2ST (Speech-to-Speech-Text) task for batch_size=1 on A100 GPU.

        + +

Figure 2. Text decoder and vocoder are the most time-consuming modules. Breakdown of inference time by modules for English-Spanish S2ST (Speech-to-Speech-Text) task for batch_size=1 on A100 GPU.

        + +

To take a closer look at the performance bottleneck of the text decoder and vocoder, we analyzed GPU traces for both modules on the 8th sample of the English-Spanish translation example from the FLEURS dataset, as shown in Figure 3. It revealed that the text decoder and vocoder are heavily CPU-bound modules. We observed a significant gap incurred by CPU overhead that delayed the launch of GPU kernels, resulting in a substantial increase in the execution time for both modules.

        + +

        CPU and GPU trace for Text Decoder

        + +

        (a) CPU and GPU trace for Text Decoder

        + +

        CPU and GPU trace for Vocoder

        + +

        (b) CPU and GPU trace for Vocoder

        + +

Figure 3. Text Decoder and Vocoder are heavily CPU-bound modules. CPU and GPU trace for (a) Text Decoder and (b) Vocoder for the 8th sample of the English-Spanish translation example from the FLEURS dataset. The trace is obtained by running inference with batch_size=1 on an A100 GPU.

        + +

Based on these real-system performance analysis results, which show that the text decoder and vocoder are heavily CPU-bound modules in Seamless M4T-v2, we enabled torch.compile + CUDA Graph for those modules. In this post, we share the modifications required to enable torch.compile + CUDA Graph on each module for the batch_size=1 inference scenario, a discussion of CUDA Graph, and our plans for next steps.

        + +

        Torch.compile with CUDA Graph

        + +

torch.compile is a PyTorch API that compiles PyTorch models into optimized kernels; it is generally used to improve model performance by removing unnecessary framework overhead.

        + +

        CUDA Graph is a feature provided by NVIDIA that allows for the optimization of kernel launches in CUDA applications. It creates an execution graph of CUDA kernels, which can be pre-processed and optimized by the driver before being executed on the GPU. The main advantage of using CUDA Graph is that it reduces the overhead associated with launching individual kernels, as the graph can be launched as a single unit, reducing the number of API calls and data transfers between the host and device. This can lead to significant performance improvements, especially for applications that have a large number of small kernels or repeat the same set of kernels multiple times. If this is something you are interested in learning more about, check out this paper that highlights the important role of data for accelerated computing: Where is the data? Why you cannot debate CPU vs. GPU performance without the answer by our own Kim Hazelwood! This is when NVIDIA was heavily investing in general-purpose GPU (GPGPUs) and before deep learning revolutionized the computing industry!
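For reference, here is a minimal, hedged illustration of the API on a stand-in module (the module below is a placeholder, not Seamless’ text decoder): "max-autotune" lets the inductor backend capture the module into CUDA Graphs where it can, while "max-autotune-no-cudagraphs" keeps the tuned kernels without graph capture.

import torch

# Placeholder module, used only to show the call; any nn.Module works the same way.
module = torch.nn.Sequential(torch.nn.Linear(512, 512), torch.nn.GELU()).cuda().eval()

compiled = torch.compile(module, mode="max-autotune")                  # CUDA Graphs where possible
# compiled = torch.compile(module, mode="max-autotune-no-cudagraphs")  # fused/tuned kernels only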

        + +

However, because CUDA Graph operates on 1) fixed memory pointers and 2) fixed tensor shapes, both recorded at compile time, we introduced the following improvements so that a CUDA Graph can be reused across inputs of different sizes. This prevents generating a new CUDA Graph for each iteration and lets the data inside the CUDA Graph be reused across different runs, so the KV cache can be shared across multiple decoding steps.

        + +

        Text Decoder

        + +

The text decoder in Seamless is a decoder from NLLB [1] that performs T2TT (Text-to-Text Translation). This module is CPU-bound: the GPU execution time is not long enough to hide the CPU overhead, because auto-regressive generation requires sequential processing of tokens, which limits the amount of parallelism that can be achieved on the GPU. Based on this observation, we enabled torch.compile + CUDA Graph for the text decoder to reduce the dominating CPU overhead, as shown in Figure 4.

        + +

        CPU and GPU trace for Text Decoder after torch.compile + CUDA Graph are enabled

        + +

        Figure 4. CPU and GPU trace for Text Decoder after torch.compile + CUDA Graph are enabled.

        + +

        1. Updating and retrieving KV cache

        + +

During inference, the text decoder has two computation phases: a prefill phase that consumes the prompt and an incremental generation phase that generates output tokens one by one. Given a high enough batch size or input length, prefill operates on a sufficiently high number of tokens in parallel; GPU performance is the bottleneck and the CPU overheads do not impact performance significantly. On the other hand, incremental token generation is always executed with sequence length 1 and it is often executed with a small batch size (even 1), e.g. for interactive use cases. Incremental generation can therefore be limited by CPU speed, which makes it a good candidate for torch.compile + CUDA Graph.

        + +

However, during the incremental token generation phase, the sequence_length dimension of the key and value involved in the attention computation increases by one with each step, while the sequence length of the query always remains 1. Specifically, the key/value are generated by appending the newly computed key/value of sequence length 1 to the key/value stored in the KV cache so far. But as mentioned above, CUDA Graph records all tensor shapes during compilation and replays with the recorded shapes. Thus, a few modifications have been made to address this issue, following the great work here.

        + +

        a) We modify the KV-cache handling to take the indices in which to write new values in a CUDA Tensor (i.e., valid_seq_pos) rather than a Python integer.

        + +

        Modification to KV cache append and get

        + +

        Figure 5. Modification to KV cache append and get
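To make the idea concrete, here is a minimal, hedged sketch of this kind of change (tensor names and shapes are illustrative, not Seamless’ actual code; a CUDA device is assumed):

import torch

bsz, heads, max_seq_len, head_dim = 1, 16, 128, 64
step = 5                                                         # current decoding step
cache_k = torch.zeros(bsz, heads, max_seq_len, head_dim, device="cuda")
new_k = torch.randn(bsz, heads, 1, head_dim, device="cuda")
valid_seq_pos = torch.tensor([step], device="cuda")              # write index lives on the GPU

# In-place write at a tensor-valued position: the cache keeps its shape and its
# memory pointer, so the recorded CUDA Graph can be replayed at every step.
cache_k.index_copy_(2, valid_seq_pos, new_k)
k = cache_k                                                      # attention reads the fixed-shape cache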

        + +

        b) We also modify attention to work with the fixed shape of key and value over the max_seq_length. We only compute softmax over the sequence positions up to the current decoding step (i.e., valid_seq_pos) . To mask out sequence positions > current decoding step (i.e., valid_seq_pos), we create a boolean mask tensor (i.e., mask) where sequence positions > valid_seq_pos are set to False.

        + +

        Helper function to generate valid_seq_pos and mask

        + +

        Figure 6. Helper function to generate valid_seq_pos and mask
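A hedged sketch of what such a helper might look like (illustrative only, not the code pictured in Figure 6; a CUDA device is assumed):

import torch

def make_valid_seq_pos_and_mask(step: int, max_seq_len: int, device="cuda"):
    # Position(s) decoded at this step (length 1 for incremental generation).
    valid_seq_pos = torch.arange(step, step + 1, device=device)
    # True for cache positions <= the current step, False for the padded tail,
    # so attention is only taken over positions that have actually been written.
    mask = torch.arange(max_seq_len, device=device) <= step
    return valid_seq_pos, mask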

        + +

It’s important to note that these modifications result in an increase in the amount of computation required, as we compute attention over more sequence positions than necessary (up to max_seq_length). However, despite this drawback, our results demonstrate that torch.compile + CUDA Graph still provide significant performance benefits compared to standard PyTorch code.

        + +

c) As different inference samples have different sequence lengths, the inputs that are projected to key and value in the cross-attention layers also have different shapes. Thus, we pad the input to a static shape and generate a padding mask to mask out the padded output.

        + +

        2. Memory Pointer Management

        + +

As CUDA Graph records memory pointers along with tensor shapes, it is important to make different inference samples correctly reference the recorded memory pointers (e.g., for the KV cache) to avoid compiling a new CUDA Graph for each inference sample. However, some parts of the Seamless codebase made different inference samples refer to different memory addresses, so we made modifications to fix this.

        + +

e) Seamless adopts beam search as its text decoding strategy. In the beam search process, we need to reorder the KV cache of all the attention layers at each incremental decoding step to make sure each selected beam continues with its corresponding KV cache, as shown in the code snippet below.

        + +

        KV cache reordering operation for beam search decoding strategy

        + +

        Figure 8. KV cache reordering operation for beam search decoding strategy.

        + +

The above code allocates new memory space and overwrites the original memory pointers for cache_k and cache_v. Thus, we modified the KV cache reordering to keep the memory pointer of each cache as recorded during compilation, by using the copy_ operator.

        + +

        In-place update for KV cache using copy_ operator

        + +

        Figure 9. In-place update for KV cache using copy_ operator
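A minimal sketch of the in-place pattern (names and shapes are illustrative, not Seamless’ actual code; a CUDA device is assumed):

import torch

beams, heads, max_seq_len, head_dim = 4, 16, 128, 64
cache_k = torch.randn(beams, heads, max_seq_len, head_dim, device="cuda")
new_order = torch.tensor([2, 0, 1, 1], device="cuda")   # beams selected this step

# Before: rebinding to a freshly allocated tensor changes the memory pointer.
#   cache_k = cache_k.index_select(0, new_order)
# After: gather into a temporary, then copy back into the recorded buffer in place.
cache_k.copy_(cache_k.index_select(0, new_order))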

        + +

f) After enabling torch.compile + CUDA Graph for the text decoder by modifying the code as described above, the overhead of the text decoder shifts to KV cache reordering, as shown in Figure 10. KV cache reordering repeatedly calls index_select 96 times (assuming 24 decoder layers, where each layer consists of two types of attention layers with caches for key and value).

        + +

        CPU and GPU trace for Text Decoder after enabling torch.compile + CUDA Graph

        + +

        Figure 10. CPU and GPU trace for Text Decoder after enabling torch.compile + CUDA Graph.

        + +

As part of accelerating the text decoder, we additionally applied torch.compile to the KV cache reordering to benefit from kernel fusion, as shown in Figure 11. Note that we cannot use CUDA Graph here (mode='max-autotune'), because the copy_ operation modifies the inputs, which violates the static input requirement of the CUDA Graph mode in torch.compile.

        + +

        Applying torch.compile to KV Cache reordering

        + +

        Figure 11. Applying torch.compile to KV Cache reordering.
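A hedged sketch of what this could look like (the function and argument names are illustrative, not Seamless’ actual code):

import torch

# Compiling the reordering loop fuses its many small index_select/copy_ kernels.
# CUDA Graphs are disabled here because copy_ mutates its inputs, which violates the
# static input requirement of torch.compile's CUDA Graph mode.
@torch.compile(mode="max-autotune-no-cudagraphs")
def reorder_kv_caches(caches, new_order):
    for cache_k, cache_v in caches:
        cache_k.copy_(cache_k.index_select(0, new_order))
        cache_v.copy_(cache_v.index_select(0, new_order))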

        + +

As a result of applying torch.compile to the KV cache reordering, the GPU kernels that were launched separately (Figure 12(a)) are now fused, so there are far fewer GPU kernels to launch (Figure 12(b)).

        + +

        CPU and GPU trace for KV cache reordering before enabling torch.compile

        + +

        (a) CPU and GPU trace for KV cache reordering before enabling torch.compile

        + +

        CPU and GPU trace for KV cache reordering after enabling torch.compile

        + +

        (b) CPU and GPU trace for KV cache reordering after enabling torch.compile

        + +

        Figure 12. CPU and GPU trace for KV cache reordering (a) before and (b) after enabling torch.compile

        + +

        Vocoder

        + +

The vocoder in Seamless is a HiFi-GAN unit-vocoder that converts generated units to waveform output, where a unit is a representation of speech that combines different aspects such as phonemes and syllables, which can be used to generate sounds that are audible to humans. The vocoder is a relatively simple module that consists of Conv1d and ConvTranspose1d layers and is a CPU-bound module, as shown in Figure 3. Based on this observation, we decided to enable torch.compile + CUDA Graph for the vocoder to reduce the disproportionately large CPU overhead, as shown in Figure 13. But there were several fixes to be made.

        + +

        CPU and GPU trace for Vocoder after torch.compile + CUDA Graph are enabled

        + +

        Figure 13. CPU and GPU trace for Vocoder after torch.compile + CUDA Graph are enabled.

        + +

        a) The input tensor shape of the vocoder is different across different inference samples. But as CUDA Graph records the shape of tensors and replays them, we had to pad the input to the fixed size with zeros. Since vocoder only consists of Conv1d layers, we do not need an additional padding mask, and padding with zeros is sufficient.

        + +

b) The vocoder consists of conv1d layers wrapped with torch.nn.utils.weight_norm (see here). However, applying torch.compile directly to the vocoder incurs a graph break as shown below, which leads to suboptimal performance. The graph break happens inside the hook-handling part of the weight_norm PyTorch code.

        + +
        [1/0_2] torch._dynamo.symbolic_convert.__graph_breaks: [DEBUG] Graph break: setattr(UserDefinedObjectVariable) <function Module.__setattr__ at 0x7fac8f483c10> from user code at:
        +[1/0_2] torch._dynamo.symbolic_convert.__graph_breaks: [DEBUG]   File "/mnt/fsx-home/yejinlee/yejinlee/seamless_communication/src/seamless_communication/models/vocoder/vocoder.py", line 49, in forward
+[1/0_2] torch._dynamo.symbolic_convert.__graph_breaks: [DEBUG]   File "/mnt/fsx-home/yejinlee/yejinlee/seamless_communication/src/seamless_communication/models/vocoder/vocoder.py", line 49, in forward
+[1/0_2] torch._dynamo.symbolic_convert.__graph_breaks: [DEBUG]     return self.code_generator(x, dur_prediction)  # type: ignore[no-any-return]
+[1/0_2] torch._dynamo.symbolic_convert.__graph_breaks: [DEBUG]   File "/data/home/yejinlee/mambaforge/envs/fairseq2_12.1/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
        +[1/0_2] torch._dynamo.symbolic_convert.__graph_breaks: [DEBUG]     return forward_call(*args, **kwargs)
        +[2023-12-13 04:26:16,822] [1/0_2] torch._dynamo.symbolic_convert.__graph_breaks: [DEBUG]   File "/mnt/fsx-home/yejinlee/yejinlee/seamless_communication/src/seamless_communication/models/vocoder/codehifigan.py", line 101, in forward
        +[1/0_2] torch._dynamo.symbolic_convert.__graph_breaks: [DEBUG]     return super().forward(x)
        +[1/0_2] torch._dynamo.symbolic_convert.__graph_breaks: [DEBUG]   File "/mnt/fsx-home/yejinlee/yejinlee/seamless_communication/src/seamless_communication/models/vocoder/hifigan.py", line 185, in forward
        +[1/0_2] torch._dynamo.symbolic_convert.__graph_breaks: [DEBUG]     x = self.ups[i](x)
        +[1/0_2] torch._dynamo.symbolic_convert.__graph_breaks: [DEBUG]   File "/data/home/yejinlee/mambaforge/envs/fairseq2_12.1/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1550, in _call_impl
        +[1/0_2] torch._dynamo.symbolic_convert.__graph_breaks: [DEBUG]     args_result = hook(self, args)
        +[1/0_2] torch._dynamo.symbolic_convert.__graph_breaks: [DEBUG]   File "/data/home/yejinlee/mambaforge/envs/fairseq2_12.1/lib/python3.8/site-packages/torch/nn/utils/weight_norm.py", line 65, in __call__
        +[1/0_2] torch._dynamo.symbolic_convert.__graph_breaks: [DEBUG]     setattr(module, self.name, self.compute_weight(module))
        +[1/0_2] torch._dynamo.symbolic_convert.__graph_breaks: [DEBUG] 
        +
        + +

Since the weights of the layers do not change during inference, weight normalization is not needed. So we simply removed weight normalization for the vocoder, as shown in Figure 14, by utilizing the remove_weight_norm function which is already provided in the Seamless codebase (here).

        + +

        Removing weight_norm for Vocoder

        + +

        Figure 14. Removing weight_norm for Vocoder
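A minimal sketch of the idea (layer sizes are illustrative, not the vocoder’s actual configuration):

import torch
from torch.nn.utils import remove_weight_norm, weight_norm

# At inference time the weights are frozen, so the weight_norm reparameterization
# (whose hook causes the graph break above) can be folded back into a plain weight
# before compiling.
conv = weight_norm(torch.nn.ConvTranspose1d(512, 256, kernel_size=16, stride=8))
remove_weight_norm(conv)     # folds weight_g / weight_v back into conv.weight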

        + +

        Performance Evaluation + Impact of CUDA Graph

        + +

        Figure 15 shows the speedup result when enabling torch.compile(mode=”max-autotune”) + CUDA Graph on the text decoder and vocoder. We achieve 2x speedup for the text decoder and 30x speedup for vocoder, leading to 2.7x faster end-to-end inference time.

        + + + + + + +
        + +Inference time speedup of text decoder and vocoder of applying torch.compile and torch.compile + CUDA Graph + + +Inference time speedup of text decoder and vocoder of applying torch.compile and torch.compile + CUDA Graph +
        + +

        Figure 15. Inference time speedup of text decoder and vocoder of applying torch.compile and torch.compile + CUDA Graph

        + +

We also report the speedups for the text decoder and vocoder using torch.compile without CUDA Graph, which is supported by torch.compile’s API (i.e., torch.compile(mode="max-autotune-no-cudagraphs")), to identify the impact of CUDA Graph on performance. Without CUDA Graph, the speedup for the text decoder and vocoder drops to 1.17x and 18.4x, respectively. While still quite significant, this indicates the important role of CUDA Graph. We conclude that Seamless M4T-v2 spends a lot of time launching CUDA kernels, especially when we use a small batch size (e.g., 1), where the GPU kernel execution time is not long enough to amortize the GPU kernel launch time.

        + +

        End-to-end inference speedup of applying torch.compile and CUDA graph incrementally

        + +

        Figure 16. End-to-end inference speedup of applying torch.compile and CUDA graph incrementally. a) “Inc. Decoding”: Apply torch.compile only to the text decoder b) “Inc. Decoding w/ CUDA Graph”: Apply torch.compile + CUDA Graph to the text decoder c) “+KV Cache Reordering”: Additionally apply torch.compile to KV cache reordering operation upon b) d) “+Vocoder”: Additionally apply torch.compile to the vocoder upon c) e) “+Vocoder w/ CUDA Graph”: Additionally apply torch.compile + CUDA Graph to the vocoder upon d).

        + +

        Figure 16 represents the cumulative effect of applying torch.compile with and without CUDA Graph to the modules. The results indicate a significant improvement in the end-to-end inference speedup, demonstrating the effectiveness of these techniques in optimizing the overall latency. As a result, we gain 2.7x end-to-end inference speedup for Seamless M4T-v2 with batch_size=1.

        + +

        Acknowledgements

        + +

        We thank the PyTorch team and Seamless team for their tremendous support with this work.

        + +
        + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/accelerating-generative-ai-segment-anything-2/index.html b/blog/accelerating-generative-ai-segment-anything-2/index.html new file mode 100644 index 000000000000..f69568073751 --- /dev/null +++ b/blog/accelerating-generative-ai-segment-anything-2/index.html @@ -0,0 +1,1915 @@ + + + + + + + + + + + + + Accelerating Generative AI with PyTorch: Segment Anything 2 - Fast and furious inference with low latency and fast cold starts | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +

        + by + + Team PyTorch + +

        +

        This post is a follow-up to our first entry in the multi-series blog focused on how to accelerate generative AI models with pure, native PyTorch and a focus on latency and elastic scalability. We use torch.compile and torch.export to create highly optimized low latency versions of SAM2 that can be quickly scaled up on new instances.

        + +

        By utilizing AOTInductor’s (AOTI) ahead-of-time compilation via torch.export, reduced precision, batched prompts and GPU preprocessing we observe up to 13x improvement in p90 execution latency and queue times compared to regular eager mode PyTorch.

        + +

        We calculate our final results and demonstrate the improvement in a realistic deployment on auto-scaling cloud infrastructure from Modal.

Execution latency (ms / improvement):

Task    p50 eager float32    p50 AOTI float16    p90 eager float32    p90 AOTI float16
AMG     741                  112 (6.6x)          1140                 176 (6.5x)
SPS     98                   20 (4.9x)           130                  28 (4.6x)
MPS     269                  38 (7.1x)           714                  52 (13.7x)

Queue time (ms / improvement):

Task    p50 eager float32    p50 AOTI float16    p90 eager float32    p90 AOTI float16
AMG     201                  41 (4.9x)           815                  327 (2.6x)
SPS     31                   33 (0.9x)           441                  49 (9.0x)
MPS     40                   37 (1.1x)           942                  75 (12.6x)
        + +

        The Tasks

        + +

The first post focused on processing a small number of varying prompts (points of interest) per image. These points represented the center points of the ground truth masks. For this post, we’ll now focus on a broader set of tasks: single prompt segmentation (SPS), multi prompt segmentation (MPS), and automatic mask generation (AMG), which generates the full set of masks for the input image without a given set of prompts. The first post focused on MPS only.

        + +

        comparison of 3 images

        + +

        The little star in the image represents a user prompt. For AMG there are no prompts and masks are filtered down heuristically from a dense grid of initial candidate prompts (guesses). For SPS and MPS user prompts are derived from the center points of AMG masks. For SPS we choose the mask with the largest area.

        + +

        Note that SAM2 uses a different backbone than SAM1. In particular, we only consider the largest and most accurate sam2.1_hiera_large backbone for this blog.

        + +

        We aggregate the scripts needed to reproduce the results in torchao’s example folder and incrementally upstream the more stable parts of the changes to the SAM2 model in torchao to the main SAM2 repository. So if you are interested in taking a look at the cutting-edge variant or would like to contribute experimental features, please don’t hesitate to reach out to the torchao repository and team. For the more stable and latest model version, please head on over to SAM2 directly.

        + +

        Overview

        + +

        We categorize the changes presented here into two. Fast changes constrain themselves to techniques that are not meant to affect model accuracy. Furious changes sacrifice some numerical accuracy for additional speed by making use of approximations such as low-precision data types.

        + +

        Approximations may slightly lower precision metrics in favor of significantly improved performance while still passing an end-to-end check based on mean intersection over union (mIoU).

        + +

        To measure the performance improvements we processed 1000 images, which were selected at random from the SAM2 validation dataset. We look at the p50 and p90 latency per image. To measure accuracy we consider the mIoU. Most notably for the AMG task we also define a fail count metric. We consider a comparison failed if the number of masks differs. This turns out to be a fairly unstable quantity and we can see that the other tasks are not as sensitive to small numeric changes as AMG.

        + +

        The Setup

        + +

        We are running the offline experiments on a regular H100 devserver, which is a fairly beefy and performant machine.

        + +

        However, we try to look at these tasks with realistic constraints. In particular, we would like to emulate a server-side inference environment. That means we don’t use DataLoader to hide the latency of image preprocessing or decoding routines.

        + +

        For the latency calculations we include decoding, segmentation and conversion of masks to a dictionary of run-length encoded masks. Or put differently, we exclude loading the images into in-memory host bytearrays and storing the resulting dictionaries as json files on disk. This is meant to emulate a more realistic setting.

        + +

        More concretely, consider the code below for the routines we include in our measurements. For any task gen_masks produces a batched bool Tensor bitmask that represents the corresponding object masks. We then compress this bitmask into a run length encoded (rle) format that can be used to transfer back the results from a remote server much more efficiently.

        + +
        image_tensors = decode_img_bytes(...)
        +masks = gen_masks(image_tensors, ...)
        +rle_dicts = [rle_dict_from_masks(m) for m in masks]
        +
        + +

        Optimizations

        + +

        ao: eager code optimizations

        + +

        The most effective tool for this work is the PyTorch autograd profiler combined with record_function. To build this software, we’ve used the profiler repeatedly to observe the program and confirm the effectiveness of any changes. It’s also important to keep in mind that the profiler itself has overhead. The more data you collect, such as stack traces, the more overhead you introduce, which might skew the collected trace. But it is excellent to find synchronization points, space between kernels and GPU kernels that take a long time.

        + +

        GPU traces help you understand bottlenecks that are not necessarily easily addressed by compile. We found that AutomaticMaskGeneration in particular is dominated by the data structure used to store the masks and by the routine used to convert the masks to a run-length encoded compressed format. We also found a large part of AMG performance is dominated by the large number of masks created as a single batch. Sometimes candidate masks can be filtered down to fewer candidates earlier in the postprocessing stage by reordering operations. This in turn significantly speeds up the later operations.

        + +

        In order to confirm the accuracy of our implementation we first compare without any changes in settings and using float32 precision. We see that mIoU is unchanged and the masks match perfectly when using the exact same settings. This means that these eager mode changes did not affect the accuracy of these tasks.

        + +

        AMG

              p50 latency (ms)    p90 latency (ms)    memory (MiB)    mIoU / fail count
Baseline      864                 1144                4350            reference
AO            693                 786                 4010            1 / 0
        + +

        ao: batching prompts

        + +

        Another lossless performance optimization that we were able to apply is batching the user input prompt calculations. When optimizing for latency at batch size 1 on a server-grade GPU such as an H100 we are often left with a lot of spare memory. We can easily trade off that memory for more performance by processing more points of interest (also called user prompts) at once. Remember that SAM2 is split into two parts: First the backbone (image encoder), second the prediction and decoding of masks based on a set of user prompts / points of interest. It is the second part where we may expect a larger or even varying number of inputs and it is this second part where we apply batching.

        + +

        This causes a large increase in memory, but also much better latency. The baseline generates one mask per prompt in a loop. For AMG the baseline processes 64 prompts at once and all that is needed is to change it to 1024, which is the number of candidate prompts generated. For SPS we process one prompt at a time, but it’s still included below for completeness.
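As a hedged sketch of the AMG change (the import path and constructor follow the SAM2 repository at the time of writing and may differ by version; sam2_model is a placeholder for a loaded model):

from sam2.automatic_mask_generator import SAM2AutomaticMaskGenerator

# points_per_batch controls how many candidate prompts the mask decoder processes at
# once; raising it from the default 64 to 1024 (all candidates) trades memory for latency.
mask_generator = SAM2AutomaticMaskGenerator(
    sam2_model,              # placeholder: a loaded SAM2 model
    points_per_batch=1024,
)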

        + +

        AMG

                 p50 latency (ms)    p90 latency (ms)    memory (MiB)    mIoU / fail count
Baseline         864                 1144                4350            reference
AO + batching    613                 706                 33786           0.9999995 / 0
        + +

        SPS

                 p50 latency (ms)    p90 latency (ms)    memory (MiB)    mIoU
Baseline         116                 181                 1337            reference
AO               110                 170                 1339            1
        + +

        MPS

                 p50 latency (ms)    p90 latency (ms)    memory (MiB)    mIoU
Baseline         276                 681                 1337            reference
AO + batching    126                 225                 8021            0.9999992
        + +

        As a technical side note: Most notably to enable batching for MPS, and to avoid a significant manual rewrite of the code base to support multiple prompts at the same time, we used a Tensor subclass we call MapTensor. A MapTensor allows us to pass a batch of N prompts, but have it advertise a batch size of 1. Any operation is then automatically broadcast to the wrapped Tensor and propagated throughout the prediction part of the model. This works because individual prompt predictions are independent of one another. This is very similar to torch.vmap.

        + +
        center_points_torch = to_map_tensor(center_points_torch)
        +center_points_label_torch = to_map_tensor(center_points_label_torch)
        +masks, scores, _ = mask_generator.predictor.predict(
        +    point_coords=center_points_torch,
        +    point_labels=center_points_label_torch,
        +    multimask_output=True,
        +    return_logits=False,
        +    return_type="torch",
        +)
        +# Unwrapping MapTensor
        +masks = masks.elems
        +scores = scores.elems
        +
        + +

        fast: fullgraph compilation

        + +

        Just as with our first post, we first remove GPU syncs and graph breaks to make use of fullgraph compiled model code with max-autotune kernels where appropriate. After some rewriting, we are able to compile the image encoder and the prediction of masks.
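A hedged sketch of the compile call (predictor is a placeholder for however the SAM2 predictor is constructed; the attribute path mirrors the pattern used in the first post):

import torch

# fullgraph=True makes any remaining graph break an error instead of silently splitting
# the model; max-autotune selects the fastest generated kernels.
predictor.model.image_encoder = torch.compile(
    predictor.model.image_encoder, mode="max-autotune", fullgraph=True
)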

        + +

        We run the experiments twice to get a sense of the overhead due to compilation. We run it once in an environment with an empty TORCHINDUCTOR_CACHE_DIR and then again while ingesting the artifacts from the previous run. In particular, auto-tuning can take a long time and happens on the first call in a pristine environment. We call the second run “warm”. The first iteration is typically expected to be slow due to various other related initialization processes, but compile increases it significantly, even if an existing cache is used and the same exact shapes are fed again. Having said that, an overhead of a few seconds in a warm environment is often still stomachable on the very first call.

        + +

        Most of these drawbacks can be mitigated and compiling causes a significant improvement in latency and reduction in memory.

        + +

        AMG

                    p50 latency (ms)    p90 latency (ms)    memory (MiB)    mIoU / fail count    first iteration (ms)
AO + batching       613                 706                 33786           0.9999995 / 0        1125
+ compile (cold)    423                 513                 29349           skipped              404866
+ compile (warm)    439                 530                 29349           0.994 / 190          8544
        + +

The number of masks produced per image can vary slightly when using automatic mask segmentation. There is ambiguity in the number of masks per object the model may produce. For example, a car may be subdivided into frames, windows and doors or treated as a whole. When a modification causes the number of masks to change, we consider the comparison failed and we only calculate the mIoU on masks with an exact match. This does not apply to the other tasks. We found that the number of masks generated is very sensitive to small numerical changes. The other tasks use the same code and MPS in particular can help us further verify correctness.

        + +

        SPS

                    p50 latency (ms)    p90 latency (ms)    memory (MiB)    mIoU       first iteration (ms)
AO                  110                 170                 1339            1          562
+ compile (cold)    102                 158                 1343            skipped    319954
+ compile (warm)    100                 160                 1302            0.9999     8947
        + +

        MPS

                    p50 latency (ms)    p90 latency (ms)    memory (MiB)    mIoU         first iteration (ms)
AO + batching       126                 225                 8021            0.9999992    504
+ compile (cold)    129                 215                 8021            skipped      333308
+ compile (warm)    113                 213                 8021            0.998        8617
        + +

        furious: TF32, float16 and GPU preprocessing

        + +

        We found that using float16 is the right level of precision for a few significant subcomponents of the model. In particular, the image encoder and mask decoder weights can be converted entirely to float16. We can also use TensorFloat32 precision for the remaining float32 matrix operations. It should be possible to further reduce the precision and we may address this in a future post. We also move image preprocessing such as image normalization onto the GPU with the furious mode. We can’t use GPU decoding (nvJPEG) routines, because the differences are too significant and the model suffers from significant degradation in mIoU, so image decoding still happens on the CPU.
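A hedged sketch of these settings (the attribute names are illustrative placeholders for wherever the predictor keeps its submodules):

import torch

torch.backends.cuda.matmul.allow_tf32 = True      # TF32 for the remaining float32 matmuls
torch.backends.cudnn.allow_tf32 = True
predictor.model.image_encoder = predictor.model.image_encoder.to(torch.float16)
predictor.model.sam_mask_decoder = predictor.model.sam_mask_decoder.to(torch.float16)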

        + +

        AMG

                                  p50 latency (ms)    p90 latency (ms)    memory (MiB)    mIoU / fail count
AO + batching + compile (warm)    439                 530                 29349           0.994 / 190
+ furious                         165                 240                 28335           0.978 / 306
        + +

        This causes a significant degradation in mIoU for the AMG task, but doesn’t affect the other tasks. After an in-depth investigation, we still chalk this up to numerical instability and reordering of operations. More work is needed to further investigate this and it may not be interesting to run the AMG task in lower precision. The other tasks, however, benefit drastically in latency with minimal changes in mIoU.

        + +

        SPS

                       p50 latency (ms)    p90 latency (ms)    memory (MiB)    mIoU
AO + compile (warm)    100                 160                 1302            0.9999
+ furious              32                  63                  861             0.9997
        + +

        MPS

                                  p50 latency (ms)    p90 latency (ms)    memory (MiB)    mIoU
AO + batching + compile (warm)    113                 213                 8021            0.998
+ furious                         36                  64                  4222            0.997
        + +

        AOTInductor’s (AOTI) ahead-of-time compilation via torch.export

        + +

        When scaling elastically it often is not possible to accommodate long startup times. That means the first iteration cannot be slow, but we must quickly deliver results. This is when torch.compile’s current compilation overhead can get in the way. To address this we can use AOTInductor’s (AOTI) ahead-of-time compilation via torch.export. AOTI lets us compile the model on a representative input and store the resulting code in a binary that is quick to load and run.

        + +

        AOTI via torch.export is a new feature and we currently can’t export everything that is compilable. We’ve been able to export the image encoder for all tasks but have only been able to export the mask prediction for the AMG and SPS tasks due to varying prompts. torch.export also supports dynamic shapes, but we need to invest a bit more time to prepare the code for it.
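A hedged sketch of the export/compile/load flow (the API calls follow recent PyTorch releases and their exact signatures vary across versions; image_encoder and example_image are placeholders, not the SAM2 modules):

import torch

image_encoder = torch.nn.Conv2d(3, 8, 3).cuda().eval()          # stand-in module
example_image = torch.randn(1, 3, 1024, 1024, device="cuda")

# Ahead-of-time: export and package an optimized binary on a representative input.
ep = torch.export.export(image_encoder, (example_image,))
pkg = torch._inductor.aoti_compile_and_package(ep, package_path="image_encoder.pt2")

# On a fresh replica: loading the packaged artifact avoids recompiling.
runner = torch._inductor.aoti_load_package(pkg)
features = runner(example_image)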

        + +

        AMG: AO + batching + furious

                        p50 latency (ms)    p90 latency (ms)    memory (MiB)    mIoU / fail count    first iteration (ms)
+ compile (warm)        165                 240                 28335           0.978 / 306          10341
+ load export (cold)    162                 233                 27927           0.974 / 308          906
        + +

        SPS: AO + furious

                        p50 latency (ms)    p90 latency (ms)    memory (MiB)    mIoU      first iteration (ms)
+ compile (warm)        32                  63                  861             0.9997    7989
+ load export (cold)    35                  66                  1686            0.9997    763
        + +

Note that loading the exported model significantly increases memory. It likely only increases peak memory utilization: initialization needs to be delayed when loading an exported model, otherwise the weights would be held in memory twice at once. This is something we could address, but the memory consumption is nowhere near the limit. We don’t see an increase in the other tasks, because AMG and MPS peak memory is dominated by processing batches of masks. One way to reduce that could be to operate on masks in the rle format (or some other sparse format) earlier on, but for now, there is no reason for this given the current memory consumption and focus on latency.

        + +

        MPS: AO + batching + furious

                        p50 latency (ms)    p90 latency (ms)    memory (MiB)    mIoU     first iteration (ms)
+ compile (warm)        36                  64                  4222            0.997    9626
+ load export (cold)    43                  72                  3813            0.997    747
        + +

        Using export by itself doesn’t seem to benefit from extensive warmup and can be run in a pristine new inductor cache directory. But again, we do not evict the CUDA cache or other caches. In the section on Modal, we are running some of these experiments in a pristine environment.

        + +

        When only processing 1000 images in a new process, using export can really be worth it to save out on compile and other cold start overhead.

        + +

        bonus: More GPU preprocessing

        + +

        At this point, the latency is fairly low. In particular, for the SPS and MPS tasks we are processing at around 30ms to 40ms. Let’s bring back the pseudo-code from the setup section again.

        + +
        image_tensors = decode_img_bytes(...)
        +masks = gen_masks(image_tensors, ...)
        +rle_dicts = [rle_dict_from_masks(m) for m in masks]
        +
        + +

Further profiling showed that at this point decode_img_bytes takes about 10ms. In particular, it uses torchvision’s ToTensor transform to convert from a numpy ndarray to a scaled, float32 torch.Tensor. The bytes passed to ToTensor have already been decoded and converted to a numpy ndarray. By slightly rewriting ToTensor, using torchvision’s v2 API and moving the uint8 decoded smaller integer Tensor to GPU first before scaling, we can gain another 10ms in latency. Without including decode_img_bytes in our analysis we would have missed this opportunity that has real-world impact on server-side inference.

        + +
        image_tensor = torch.from_numpy(image_tensor)
        +image_tensor = image_tensor.permute((2, 0, 1))
        +image_tensor = image_tensor.cuda()
        +image_tensor = v2.ToDtype(torch.float32, scale=True)( image_tensor)
        +
        + +

        Note in particular that using pinned memory to perform asynchronous data transfers doesn’t apply, since the time it takes to move the Tensor into pinned memory isn’t worth the gain in asynchronicity for this data movement. For future work, we might want to explore further improvements here by using more advanced direct memory transfer techniques.

        + +

        AMG: AO + batching + furious

                                  p50 latency (ms)    p90 latency (ms)    memory (MiB)    mIoU / fail count    first iteration (ms)
+ load export (cold)              162                 233                 27927           0.974 / 308          906
+ load export (warm)              157                 230                 27927           0.974 / 308          799
+ load export (warm) + preproc    136                 208                 27950           0.977 / 311          908
        + +

        SPS: AO + furious

                                  p50 latency (ms)    p90 latency (ms)    memory (MiB)    mIoU      first iteration (ms)
+ load export (cold)              35                  66                  1686            0.9997    763
+ load export (warm)              31                  63                  1686            0.9997    683
+ load export (warm) + preproc    19                  25                  1711            0.9997    658
        + +

        MPS: AO + batching + furious

                                  p50 latency (ms)    p90 latency (ms)    memory (MiB)    mIoU     first iteration (ms)
+ load export (cold)              43                  72                  3813            0.997    747
+ load export (warm)              53                  81                  3813            0.997    807
+ load export (warm) + preproc    31                  41                  3837            0.997    671
        + +

        This small change has a significant impact on the SPS and MPS task.

        + +

        Deploying on Modal

        + +

        Finally, we deployed our optimized inference onto Modal, a serverless infrastructure provider, to demonstrate that the benefits of these optimizations can be realized in a more realistic deployment setting.

        + +

        In particular, compilation and AOTI via torch.export requires extra work. In a naïve deployment that work might be added to every single inference execution, adding latency that dwarfs any improvements from a faster model. This is particularly challenging with elastic or autoscaling infrastructure, where replicas of our inference service need to be regularly and automatically created and destroyed.

        + +

        We share a deployment script in the torchao repository (cli_on_modal.py) to demonstrate one pattern for an elastic deployment. We build the exported models ahead of time and then upload them to distributed storage. Relative to eager execution, this adds a bit of extra work when replicas spin up since they need to read this data over a network, but this is far less costly than compilation or export.

        + +

        We benchmarked this deployment with a large batch inference workload: sending 1000 images for concurrent processing. The deployment scales up to ten replicas on ten GPUs at peak and scales down to zero GPUs when inactive.

        + +

        First, let’s look at the execution latencies.

Execution latency (ms / improvement):

Task    p50 eager float32    p50 AOTI float16 (Modal)    p50 AOTI float16 (Offline)    p90 eager float32    p90 AOTI float16 (Modal)    p90 AOTI float16 (Offline)
AMG     741                  112 (6.6x)                  136 (5.4x)                    1140                 176 (6.5x)                  208 (5.5x)
SPS     98                   20 (4.9x)                   19 (5.2x)                     130                  28 (4.6x)                   25 (5.2x)
MPS     269                  38 (7.1x)                   31 (8.7x)                     714                  52 (13.7x)                  41 (17.4x)
        + +

        We notice that execution latencies on Modal and Offline are fairly close, especially relative to the baseline, indicating that optimizing the deployment offline was a reasonable proxy for optimizing the deployment directly.

        + +

        In addition to execution latency, our batch workload has queueing time, since there are fewer replicas than there are inputs, and so some inputs have to wait in line.

Queue time (ms / improvement):

Task    p50 eager float32    p50 AOTI float16    p90 eager float32    p90 AOTI float16
AMG     201                  41 (4.9x)           815                  327 (2.6x)
SPS     31                   33 (0.9x)           441                  49 (9.0x)
MPS     40                   37 (1.1x)           942                  75 (12.6x)
        + +

        Even though the queueing system provided by the infrastructure is unchanged, the queue latencies also decrease when we use our optimized model – in the p90 case by a factor of 2 to 12. That’s because when we finish previous inputs faster (from reduced execution latency) we can pull our next inputs sooner (reducing their queueing time).

        + +

        If you’re interested in optimizing SAM2 inference or deployments further, don’t hesitate to reach out to us at the torchao repository!

        + +

        Conclusions

        + +

        We rewrote Meta’s original SAM2 in pure PyTorch with little loss of accuracy and a strong focus on latency. We deployed our optimized inference onto Modal, a serverless infrastructure provider, to demonstrate that the benefits of these optimizations can be realized in a more realistic deployment setting.

        + +

        By utilizing AOTInductor’s (AOTI) ahead-of-time compilation via torch.export, reduced precision, batched prompts and GPU preprocessing we observe up to 13x improvement in p90 execution latency and queue times compared to regular eager mode PyTorch.

        + +

        With elastic or autoscaling infrastructure, where replicas of our inference service need to be regularly and automatically created and destroyed, a naïve deployment of torch.compile can add work to inference execution that dwarfs any improvements from a faster model. By utilizing AOTInductor’s (AOTI) ahead-of-time compilation via torch.export, we are able to upload exported models ahead of time and read this data over a network, which enables us to get the benefits of compilation without significantly increased work.

        + +

        For more details on how to reproduce the data in this blog post, check out the experiments folder of torchao. Please don’t hesitate to contact us or open an issue if you run into any technical issues.

        + +
        + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/accelerating-generative-ai/index.html b/blog/accelerating-generative-ai/index.html new file mode 100644 index 000000000000..3f2b6a8eb46a --- /dev/null +++ b/blog/accelerating-generative-ai/index.html @@ -0,0 +1,902 @@ + + + + + + + + + + + + + Accelerating Generative AI with PyTorch: Segment Anything, Fast | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +

        + by + + Team PyTorch + +

        +

        This post is the first part of a multi-series blog focused on how to accelerate generative AI models with pure, native PyTorch. We are excited to share a breadth of newly released PyTorch performance features alongside practical examples of how these features can be combined to see how far we can push PyTorch native performance.

        + +

        As announced during the PyTorch Developer Conference 2023, the PyTorch team rewrote Meta’s Segment Anything (“SAM”) Model resulting in 8x faster code than the original implementation, with no loss of accuracy, all using native PyTorch optimizations. We leverage a breadth of new PyTorch features:

        + +
• Torch.compile: A compiler for PyTorch models
• GPU quantization: Accelerate models with reduced precision operations
• Scaled Dot Product Attention (SDPA): Memory efficient attention implementations
• Semi-Structured (2:4) Sparsity: A GPU optimized sparse memory format
• Nested Tensor: Batch together non-uniformly sized data into a single Tensor, such as images of different sizes.
• Custom operators with Triton: Write GPU operations using Triton Python DSL and easily integrate it into PyTorch’s various components with custom operator registration.
        + +

        We encourage readers to copy-paste code from our implementation of SAM on Github and ask us questions on Github.

        + +

        A quick glimpse of increasing throughput and decreasing memory overhead

        + +

        A quick glimpse of increasing throughput and decreasing memory overhead with our newly released, PyTorch native, features. Benchmarks run on p4d.24xlarge instance (8x A100s).

        + +

        SegmentAnything Model

        + +

        SAM is a zero-shot vision model for generating promptable image masks.

        + +

        sam image masks

        + +

        The SAM architecture [described in its paper] includes multiple prompt and image encoders based on the Transformer architecture. Of this, we measured performance across the smallest and largest vision transformer backbones: ViT-B and ViT-H. And for simplicity, we only show traces for the ViT-B model.

        + +

        Optimizations

        + +

        Below we tell the story of optimizing SAM: profiling, identifying bottlenecks, and building new features into PyTorch that solve these problems. Throughout, we showcase our new PyTorch features: torch.compile, SDPA, Triton kernels, Nested Tensor and semi-structured sparsity. The following sections are progressively built upon each other, ending with our SAM-fast, now available on Github. We motivate each feature using real kernel and memory traces, using fully PyTorch native tooling, and visualize these traces with Perfetto UI.

        + +

        Baseline

        + +

        Our SAM baseline is Facebook Research’s unmodified model, using float32 dtype and a batch size of 1. After some initial warmup, we can look at a kernel trace using the PyTorch Profiler:

        + +

        kernel trace

        + +

        We notice two areas ripe for optimization.

        + +

The first is long calls to aten::index, the underlying call resulting from a Tensor index operation (e.g., []). While the actual GPU time spent on aten::index is relatively low, aten::index launches two kernels, and a blocking cudaStreamSynchronize happens in between. This means the CPU is waiting for the GPU to finish processing until it launches the second kernel. To optimize SAM, we should aim to remove blocking GPU syncs causing idle time.

        + +

The second is significant time spent on GPU in matrix multiplication (dark green on stream 7 above). This is common in Transformers. We can significantly speed up SAM if we can reduce the amount of GPU time spent on matrix multiplication.

        + +

        We can measure the throughput (img/s) and memory overhead (GiB) from out of the box SAM to establish a baseline:

        + +

        throughput (img/s) and memory overhead (GiB) from out of the box SAM

        + +

        Bfloat16 Half precision (+GPU syncs and batching)

        + +

To address the large amount of time spent in matrix multiplication, we can turn to bfloat16. Bfloat16 is a commonly used half-precision type. Through less precision per parameter and activations, we can save significant time and memory in computation. When reducing the precision of parameters, it’s critical to validate end to end model accuracy.

        + +

        replacing padding dtypes with half precision, bfloat16

        + +

        Shown here is an example of replacing padding dtypes with half precision, bfloat16. Code is here.

        + +

Besides simply setting model.to(torch.bfloat16), we have to change a few small places that assume the default dtype.

        + +

        Now, in order to remove GPU syncs we need to audit operations that cause them. We can find these pieces of code by searching the GPU traces for calls to cudaStreamSynchronize. In fact, we found two locations that we were able to rewrite to be sync-free.

        + +

        code sample 1

        + +

        replacing padding dtypes with half precision, bfloat16

        + +

        Specifically, we see that within SAM’s image encoder, there are variables acting as coordinate scalers, q_coords and k_coords. These are both allocated and processed on the CPU. However, once these variables are used to index in rel_pos_resized, the index operation automatically moves these variables to the GPU. This copy over causes the GPU sync we’ve observed above. We notice a second call to index in SAM’s prompt encoder: We can use torch.where to rewrite this as shown above.
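As a rough, hypothetical sketch of the general pattern (the tensor names are illustrative, not SAM’s actual code):

import torch

B, N, C = 4, 16, 256
emb = torch.randn(B, N, C, device="cuda")
labels = torch.randint(-1, 2, (B, N), device="cuda")
not_a_point_embed = torch.randn(C, device="cuda")

# A data-dependent boolean-mask assignment can force a blocking host-device sync,
# because the CPU may need to know how many elements were selected.
emb[labels == -1] = not_a_point_embed

# torch.where expresses the same update with static shapes, so no sync is needed.
emb = torch.where((labels == -1).unsqueeze(-1), not_a_point_embed, emb)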

        + +

        Kernel trace

        + +

        After applying these changes, we begin to see significant time between individual kernel calls. This is typically observed with small batch sizes (1 here) due to the GPU overhead of launching kernels. To get a closer look at practical areas for optimization, we can start to profile SAM inference with batch size 8:

        + +

        profile SAM inference with batch size 8

        + +

Looking at the time spent per-kernel, we observe most of SAM’s GPU time spent on elementwise kernels and softmax operations. With this we now see that matrix multiplications have become a much smaller relative overhead.

        + +

        matrix multiplications have become a much smaller relative overhead

        + +

Taking the GPU sync and bfloat16 optimizations together, we have now improved SAM performance by up to 3x.

        + +

        SAM performance by up to 3x

        + +

        Torch.compile (+graph breaks and CUDA graphs)

        + +

        When observing a large number of small operations, such as the elementwise kernels profiled above, turning to a compiler to fuse operations can have strong benefits. PyTorch’s recently released torch.compile does a great job optimizing by:

        + +
1. Fusing together sequences of operations such as nn.LayerNorm or nn.GELU into a single GPU kernel that is called, and
2. Epilogues: fusing operations that immediately follow matrix multiplication kernels to reduce the number of GPU kernel calls.
        + +

        Through these optimizations, we reduce the number of GPU global memory roundtrips, thus speeding up inference. We can now try torch.compile on SAM’s image encoder. To maximize performance we use a few advanced compile techniques such as:

        + +
• Using torch.compile’s max-autotune mode enables CUDA graphs and shape-specific kernels with custom epilogues.
• By setting TORCH_LOGS=”graph_breaks,recompiles” we can manually verify that we are not running into graph breaks or recompiles.
• Padding the batch of images input to the encoder with zeros ensures compile accepts static shapes, thus being able to always use shape-specific optimized kernels with custom epilogues without recompilations.
        + +
        predictor.model.image_encoder = \
        +    torch.compile(predictor.model.image_encoder, mode=use_compile)
        +
        + +

        Kernel trace

        + +

        Kernel trace

        + +

        torch.compile is working beautifully. We launch a single CUDA graph, which makes up a significant portion of GPU time within the timed region. Let’s run our profile again and look at the percentage of GPU time spent in specific kernels:

        + +

        the percentage of GPU time spent in specific kernels

        + +

        We now see softmax makes up a significant portion of the time followed by various GEMM variants. In summary we observe the following measurements for batch size 8 and above changes.

        + +

        measurements for batch size 8 and above

        + +

        SDPA: scaled_dot_product_attention

        + +

        Next up, we can tackle one of the most common areas for transformer performance overhead: the attention mechanism. Naive attention implementations scale quadratically in time and memory with sequence length. PyTorch’s scaled_dot_product_attention operation built upon the principles of Flash Attention, FlashAttentionV2 and xFormer’s memory efficient attention can significantly speed up GPU attention. Combined with torch.compile, this operation allows us to express and fuse a common pattern within variants of MultiheadAttention. After a small set of changes we can adapt the model to use scaled_dot_product_attention.

PyTorch native attention implementation, see code here.

Kernel trace

        We can now see that in particular the memory efficient attention kernel is taking up a large amount of computational time on the GPU:


Using PyTorch’s native scaled_dot_product_attention, we can significantly increase the batch size. We now observe the following measurements for batch size 32 and the above changes.


        Triton: Custom SDPA for fused relative positional encoding

        + +

        Transitioning away from inference throughput for a moment, we started profiling overall SAM memory. Within the image encoder, we saw significant spikes in memory allocation:


        Zooming in, we see this allocation happens within add_decomposed_rel_pos, on the following line:


        The attn variable here is the addition of two smaller tensors: rel_h of shape (B, q_h, q_w, k_h, 1) and rel_w of shape (B, q_h, q_w, 1, k_w).

        + +

        It’s not surprising that the memory efficient attention kernel (used via SDPA) is taking a long time with an attention bias size over 3.0GiB. If instead of allocating this large attn tensor, we thread into SDPA the two smaller rel_h and rel_w tensors, and only construct attn as needed, we’d anticipate significant performance gain.
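To see why this allocation is so large, consider a minimal sketch of the broadcasted addition (the tensor sizes below are illustrative placeholders, not SAM’s exact shapes):

import torch

B, q_h, q_w, k_h, k_w = 32, 64, 64, 64, 64  # illustrative sizes only
rel_h = torch.randn(B, q_h, q_w, k_h, 1, dtype=torch.bfloat16)
rel_w = torch.randn(B, q_h, q_w, 1, k_w, dtype=torch.bfloat16)

# The addition broadcasts to a full (B, q_h, q_w, k_h, k_w) attention bias,
# which is what causes the memory spike.
attn = rel_h + rel_w
print(attn.numel() * attn.element_size() / 2**30, "GiB")  # 1.0 GiB at these sizes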

        + +

Unfortunately this is not a trivial modification; SDPA kernels are highly optimized and written in CUDA. We can turn to Triton, which provides an easy-to-understand tutorial on a FlashAttention implementation. After some significant digging, and in close collaboration with xFormers’ Daniel Haziza, we found one case of input shapes where it is relatively straightforward to implement a fused version of the kernel. The details have been added to the repository. Surprisingly, this can be done in under 350 lines of code for the inference case.

        + +

        This is a great example of extending PyTorch with a new kernel, straightforwardly built with Triton code.

Kernel trace

        With our custom positional Triton kernel we observe the following measurements for batch size 32.


        NT: NestedTensor and batching predict_torch

        + +

We have spent a lot of time on the image encoder. This makes sense, since it takes up the most computational time. At this point, however, it is fairly well optimized, and the operator that takes the most time would require significant additional investment to improve.

        + +

We made an interesting observation about the mask prediction pipeline: for each image we have, there is an associated size, coords, and fg_labels Tensor. Each of these tensors has a different batch size. Each image itself is also of a different size. This representation of data looks like jagged data. With PyTorch’s recently released NestedTensor, we can modify our data pipeline to batch coords and fg_labels Tensors into a single NestedTensor. This can have significant performance benefits for the prompt encoder and mask decoder that follow the image encoder. Invoking:

torch.nested.nested_tensor(data, dtype=dtype, layout=torch.jagged)
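For example, here is a minimal sketch of batching ragged per-image prompt data into a jagged NestedTensor (the list contents are hypothetical placeholders):

import torch

# Hypothetical per-image point coordinates, with a different number of points per image
coords_per_image = [torch.randn(5, 2), torch.randn(3, 2), torch.randn(7, 2)]

# Batch the ragged list into a single jagged-layout NestedTensor
coords_nt = torch.nested.nested_tensor(
    coords_per_image, dtype=torch.float32, layout=torch.jagged)

for t in coords_nt.unbind():
    print(t.shape)  # the original per-image shapes are preserved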

Kernel trace

We can now see that we launch kernels much faster from the CPU than the GPU can process them, and that the CPU spends a long time waiting at the end of our timed region for the GPU to finish (cudaDeviceSynchronize). We also no longer see any idle time (white space) between kernels on the GPU.

        + +

With NestedTensor, we observe the following measurements for batch size 32 and the above changes.


        int8: quantization and approximating matmul

        + +

We notice in the above trace that significant time is now spent in GEMM kernels. We’ve optimized enough that matrix multiplication now accounts for more inference time than scaled dot product attention.

        + +

Building on our earlier learnings from going from fp32 to bfloat16, let’s go a step further and emulate even lower precision with int8 quantization. Looking at quantization methods, we focus on dynamic quantization, wherein our model observes the range of possible inputs and weights of a layer and subdivides the expressible int8 range to uniformly “spread out” the observed values. Ultimately, each float input is mapped to a single integer in the range [-128, 127]. For more information see PyTorch’s tutorial on quantization.
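As a rough sketch of the idea (a per-tensor symmetric mapping; this is illustrative only, not the fused implementation described below):

import torch

def dynamic_quantize_per_tensor(x: torch.Tensor):
    # Observe the range of the input and map it symmetrically onto int8 [-128, 127]
    scale = x.abs().max() / 127.0
    x_int8 = torch.clamp(torch.round(x / scale), -128, 127).to(torch.int8)
    return x_int8, scale

def dequantize(x_int8: torch.Tensor, scale: torch.Tensor) -> torch.Tensor:
    # Reverse the mapping back to floating point
    return x_int8.to(torch.float32) * scale

x = torch.randn(16, 64)
x_q, scale = dynamic_quantize_per_tensor(x)
print((x - dequantize(x_q, scale)).abs().max())  # quantization error stays small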

        + +

Reducing precision can immediately lead to peak memory savings, but to realize inference speedups we have to make full use of int8 throughout SAM’s operations. This requires building an efficient int8@int8 matrix multiplication kernel, as well as casting logic to translate from high to low precision (quantization) and back from low to high (dequantization). Utilizing the power of torch.compile, we can compile and fuse these quantization and dequantization routines into efficient single kernels and epilogues of our matrix multiplication. The resulting implementation is fairly short, at less than 250 lines of code. For more information on the APIs and usage, see pytorch-labs/ao.

        + +

While it’s common to see some accuracy regression when quantizing models at inference time, SAM has been particularly robust to lower precision inference with minimal loss of accuracy. With quantization added, we now observe the following measurements for batch size 32 and the above changes.


        sparse: Semi-structured (2:4) sparsity

        + +

        Matrix multiplications are still our bottleneck. We can turn to the model acceleration playbook with another classic method to approximate matrix multiplication: sparsification. By sparsifying our matrices (i.e., zeroing out values), we could theoretically use fewer bits to store weight and activation tensors. The process by which we decide which weights in the tensor to set to zero is called pruning. The idea behind pruning is that small weights in a weight tensor contribute little to the net output of a layer, typically the product of weights with activations. Pruning away small weights can potentially reduce model size without significant loss of accuracy.

        + +

Methods for pruning range from completely unstructured, wherein weights are greedily pruned, to highly structured, wherein large sub-components of a tensor are pruned at a time. The choice of method is not trivial. While unstructured pruning may theoretically have the least impact on accuracy, GPUs are highly efficient at multiplying large, dense matrices and may suffer significant performance degradation in sparse regimes. One recent pruning method supported in PyTorch, semi-structured (or 2:4) sparsity, seeks to strike a balance. This sparse storage reduces the original tensor by a significant 50%, while simultaneously producing a dense tensor output that can leverage highly performant 2:4 GPU kernels. See the following picture for an illustration.


        From developer.nvidia.com/blog/exploiting-ampere-structured-sparsity-with-cusparselt

        + +

In order to use this sparse storage format and the associated fast kernels, we need to prune our weights such that they adhere to the constraints of the format. We pick the two smallest weights to prune in each 1-by-4 region, measuring the performance vs. accuracy tradeoff. Changing a weight from its default PyTorch (“strided”) layout to this new, semi-structured sparse layout is easy. To implement apply_sparse(model), we only require 32 lines of Python code:

import torch
from torch.sparse import to_sparse_semi_structured, SparseSemiStructuredTensor

# Sparsity helper functions
def apply_fake_sparsity(model):
    """
    This function simulates 2:4 sparsity on all linear layers in a model.
    It uses the torch.ao.pruning flow.
    """
    # torch.ao.pruning flow
    from torch.ao.pruning import WeightNormSparsifier
    sparse_config = []
    for name, mod in model.named_modules():
        if isinstance(mod, torch.nn.Linear):
            sparse_config.append({"tensor_fqn": f"{name}.weight"})

    sparsifier = WeightNormSparsifier(sparsity_level=1.0,
                                      sparse_block_shape=(1, 4),
                                      zeros_per_block=2)
    sparsifier.prepare(model, sparse_config)
    sparsifier.step()
    sparsifier.squash_mask()


def apply_sparse(model):
    apply_fake_sparsity(model)
    for name, mod in model.named_modules():
        if isinstance(mod, torch.nn.Linear):
            mod.weight = torch.nn.Parameter(to_sparse_semi_structured(mod.weight))
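Usage is then a single call on the model; here we assume the predictor.model.image_encoder handle used earlier in the post:

# Prune and convert every linear layer's weight to the semi-structured sparse layout
apply_sparse(predictor.model.image_encoder)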

        With 2:4 sparsity, we observe peak performance on SAM with vit_b and batch size 32:


        Conclusion

        + +

        Wrapping up, we are excited to have announced our fastest implementation of Segment Anything to date. We rewrote Meta’s original SAM in pure PyTorch with no loss of accuracy using a breadth of newly released features:

• Torch.compile: PyTorch’s native JIT compiler, providing fast, automated fusion of PyTorch operations [tutorial]
• GPU quantization: accelerate models with reduced-precision operations [api]
• Scaled Dot Product Attention (SDPA): a new, memory-efficient implementation of attention [tutorial]
• Semi-Structured (2:4) Sparsity: accelerate models with fewer bits to store weights and activations [tutorial]
• Nested Tensor: highly optimized, ragged array handling for non-uniform batch and image sizes [tutorial]
• Triton kernels: custom GPU operations, easily built and optimized via Triton

        For more details on how to reproduce the data presented in this blog post, check out the experiments folder of segment-anything-fast. Please don’t hesitate to contact us or open an issue if you run into any technical issues.

        + +

        In our next post, we are excited to share similar performance gains with our PyTorch natively authored LLM!

        + +

        Acknowledgements

        + +

        We would like to thank Meta’s xFormers team including Daniel Haziza and Francisco Massa for authoring SDPA kernels and helping us design our custom one-off Triton kernel.


diff --git a/blog/accelerating-inference/index.html b/blog/accelerating-inference/index.html
new file mode 100644
index 000000000000..912976876379
--- /dev/null
+++ b/blog/accelerating-inference/index.html
@@ -0,0 +1,679 @@

Accelerating Inference on x86-64 Machines with oneDNN Graph | PyTorch

by Intel

        Supported in PyTorch 2.0 as a beta feature, oneDNN Graph leverages aggressive fusion patterns to accelerate inference on x86-64 machines, especially Intel® Xeon® Scalable processors.

        + +

        oneDNN Graph API extends oneDNN with a flexible graph API to maximize the optimization opportunity for generating efficient code on AI hardware. It automatically identifies the graph partitions to be accelerated via fusion. The fusion patterns focus on fusing compute-intensive operations such as convolution, matmul, and their neighbor operations for both inference and training use cases.

        + +

        In PyTorch 2.0 and beyond, oneDNN Graph can help accelerate inference on x86-64 CPUs (primarily, Intel Xeon processor-based machines) with Float32 and BFloat16 (with PyTorch’s Automatic Mixed Precision support) datatypes. With BFloat16, speedup is limited to machines that support AVX512_BF16 ISA (Instruction Set Architecture), as well as machines that also support AMX_BF16 ISA.

        + +

        oneDNN Graph Usage

        + +

        From a user’s perspective, the usage is quite simple and intuitive, with the only change in code being an API invocation. To leverage oneDNN Graph with JIT-tracing, a model is profiled with an example input as shown below in Figure 1.

Fig. 1: A code-snippet that demonstrates using oneDNN Graph
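Since the original code snippet is only available as an image, here is a minimal sketch of the usage pattern it demonstrates, assuming a torchvision ResNet-50 as the example model:

import torch
import torchvision.models as models

# Enable oneDNN Graph fusion for TorchScript
torch.jit.enable_onednn_fusion(True)

model = models.resnet50().eval()
example_input = torch.rand(32, 3, 224, 224)

with torch.no_grad():
    traced_model = torch.jit.trace(model, example_input)
    traced_model = torch.jit.freeze(traced_model)
    # A couple of warm-up runs let the fuser profile and rewrite the graph
    traced_model(example_input)
    traced_model(example_input)
    output = traced_model(example_input)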

oneDNN Graph receives the model’s graph and identifies candidates for operator fusion with respect to the shape of the example input. Currently, only static shapes are supported, which means that other input shapes would neither be supported nor receive any performance benefit.

        + +

        Measurements

        + +

        To ensure reproducibility of results, we used a fork of TorchBench to measure inference speed-up of some Vision models on an AWS m7i.16xlarge instance, which uses 4th Gen Intel® Xeon® Scalable processors.

        + +

The baseline for comparison was torch.jit.optimize_for_inference, which only supports the Float32 datatype. The batch size for each model was the one used for it in TorchBench.

        + +

        In Figure 2, we depict the inference speedup of using oneDNN Graph over PyTorch alone. The geomean speedup with oneDNN Graph for Float32 datatype was 1.24x, and the geomean speedup for BFloat16 datatype was 3.31x1.

Fig. 2: Inference speedup with oneDNN Graph over default CPU JIT Fuser (which only uses Float32 datatype)

        + +

        Future work

        + +

oneDNN Graph is currently supported in PyTorch through TorchScript, but Intel is already working to integrate it with the Inductor-CPU backend as a prototype feature in a future PyTorch release. Since Dynamo makes supporting dynamic shapes easier in PyTorch, we would also like to introduce dynamic-shape support with Inductor-CPU, and we plan to add int8 quantization support.

        + +

        Acknowledgements

        + +

        The results presented in this blog are a joint effort between Meta and the Intel PyTorch team. Special thanks to Elias Ellison from Meta who spent precious time thoroughly reviewing the PRs and gave us helpful feedback.


diff --git a/blog/accelerating-large-language-models/index.html b/blog/accelerating-large-language-models/index.html
new file mode 100644
index 000000000000..4bb714e97af0
--- /dev/null
+++ b/blog/accelerating-large-language-models/index.html
@@ -0,0 +1,837 @@

Accelerating Large Language Models with Accelerated Transformers | PyTorch

by Lucas Pasqualin, Driss Guessous, Christian Puhrsch, Bertrand Maher, Michael Gschwind

        TL;DR. We show how to use Accelerated PyTorch 2.0 Transformers and the newly introduced torch.compile() method to accelerate Large Language Models on the example of nanoGPT, a compact open-source implementation of the GPT model from Andrej Karpathy. Using the new scaled dot product attention operator introduced with Accelerated PT2 Transformers, we select the flash_attention custom kernel and achieve faster training time per batch (measured with Nvidia A100 GPUs), going from a ~143ms/batch baseline to ~113 ms/batch. In addition, the enhanced implementation using the SDPA operator offers better numerical stability. Finally, further optimizations are achieved using padded inputs, which when combined with flash attention lead to ~87ms/batch.

        + +

        Recent times have seen exponential adoption of large language models (LLMs) and Generative AI in everyday life. Tightly coupled with these ever-growing models is the ever-growing training cost - in terms of both time and hardware utilization. The PyTorch team has tackled these challenges head on with Accelerated PyTorch 2 Transformers (previously known as “Better Transformer”) and JIT Compilation in PyTorch 2.0.

        + +

        In this blog post, we explore training optimizations gained by utilizing custom kernel implementations of SDPA - also known as scaled dot product attention - a critical layer in transformer models. The custom kernel for SDPA replaces several discrete sequential operations with one globally optimized kernel which avoids allocating a large amount of intermediate CUDA memory. This approach offers a number of advantages, including but not limited to: higher performance computation of SDPA by reducing memory bandwidth bottleneck, reduced memory footprint to support larger batch sizes, and finally added numerical stability by prescaling input tensors. These optimizations are demonstrated on nanoGPT, an open-source implementation of GPT from Andrej Karpathy.

        + +

        Background

        + +

        Scaled dot product attention is the fundamental building block of multihead attention, as introduced in “Attention is All You Need”, and has a wide range of applications in LLM and Generative AI models.


        Figure 1: The Transformer model architecture based on “Attention is All You Need”. With the new PyTorch SDPA operator, Multi-Head Attention is efficiently implemented by a linear layer for the in-projection, the SDPA operator, and a linear layer for the out-projection.

        + +

        With the new scaled_dot_product_attention operator, multihead attention can be implemented in just 3 steps: in projection with a linear layer, SDPA, and out projection with a linear layer.

        + +
# In Projection
# variable descriptions:
# q,k,v = Query, Key, Value tensors
# bsz = batch size
# num_heads = Number of heads for Multihead Attention
# tgt_len = Target length
# src_len = Source Length
# head_dim: Head Dimension
q, k, v = _in_projection(query, key, value, q_proj_weight, k_proj_weight, v_proj_weight, b_q, b_k, b_v)
q = q.view(bsz, num_heads, tgt_len, head_dim)
k = k.view(bsz, num_heads, src_len, head_dim)
v = v.view(bsz, num_heads, src_len, head_dim)

# Scaled Dot Product Attention
attn_output = scaled_dot_product_attention(q, k, v, attn_mask, dropout_p, is_causal)

# Out Projection
attn_output = attn_output.permute(2, 0, 1, 3).contiguous().view(bsz * tgt_len, embed_dim)
attn_output = linear(attn_output, out_proj_weight, out_proj_bias)
attn_output = attn_output.view(tgt_len, bsz, attn_output.size(1))
        + +

PyTorch 2.0 supports multiple different kernels optimized for specific use cases, with specific requirements. A kernel picker selects the best kernel for a particular combination of input parameters. If no optimized “custom kernel” for a particular combination of input parameters can be identified, the kernel picker selects a general kernel that can handle all input combinations.

        + +

        While future releases may extend this set of operators, PyTorch 2.0 launches with 3 implementations for the SDPA operator:

1. A generic kernel which implements the mathematical equation of SDPA in the function sdpa_math()
2. An optimized kernel based on the paper “Flash Attention”, which supports evaluation of SDPA with 16 bit floating point data types on compute architecture SM80 (A100).
3. An optimized kernel based on the paper “Self-Attention Does Not Need O(n^2) Memory” and implemented in xFormers, which supports both 32 and 16 bit floating point data types on a wider range of architectures (SM40 and later). This blog post refers to this kernel as the mem_efficient kernel.

Note that both optimized kernels (two and three listed above) support a key padding mask and limit the supported attention mask to causal attention. Accelerated PyTorch 2.0 Transformers today only support the causal mask when it is specified using the is_causal boolean. When a mask is specified, the general-purpose kernel will be selected because it is too expensive to analyze the contents of a provided mask to determine if it is the causal mask. Additional explanations on the constraints for each kernel can be found in the Accelerated PT2 Transformer blog.

        + +

        Enabling Accelerated Transformers with nanoGPT

        + +

Since the SDPA operator is a critical component of the GPT model, we identified the open source nanoGPT model as an excellent candidate for demonstrating both the ease of implementation and the benefits of PyTorch 2.0’s Accelerated Transformers. The following demonstrates the exact process by which Accelerated Transformers was enabled on nanoGPT.

        + +

        This process largely revolves around replacing the existing SDPA implementation with the newly added F.scaled_dot_product_attention operator from functional.py. This process can be easily adapted to enable the operator in many other LLMs. Alternatively, users can instead choose to call F.multi_head_attention_forward() or utilize the nn.MultiHeadAttention module directly where applicable. The following code snippets are adapted from Karpathy’s nanoGPT repository.

        + +

        Step 1: Identify the existing SDPA implementation

        + +

        In the case of nanoGPT, SDPA is implemented in the model’s CausalSelfAttention class. The original implementation at time of writing is adapted below for this post.

The original implementation at time of writing
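For reference, the manual attention being replaced looks roughly like the fragment below (reconstructed to be consistent with the snippet in Appendix A; q, k, v, self.bias and T come from the surrounding CausalSelfAttention.forward method):

# manual implementation: matmul, mask, softmax, dropout, matmul
att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
att = att.masked_fill(self.bias[:, :, :T, :T] == 0, float('-inf'))
att = F.softmax(att, dim=-1)
att = self.attn_dropout(att)
y = att @ v  # (B, nh, T, T) x (B, nh, T, hs) -> (B, nh, T, hs)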

        Step 2: Replace with Torch’s scaled_dot_product_attention

        + +

        At this point we can note the following:

• Lines 36-42 define the mathematical implementation of SDPA which we are replacing
• The mask applied on line 39 is no longer relevant since we are using scaled_dot_product_attention’s is_causal flag.
• The dropout layer used in line 41 is also now unnecessary.

        Swapping out the SDPA implementation for torch’s scaled_dot_product_attention and removing the now redundant code yields the following implementation.

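A sketch of the resulting replacement (again a fragment from inside the forward method; the dropout handling is an assumption based on the nanoGPT repository):

# a single fused call replaces matmul + mask + softmax + dropout + matmul
y = F.scaled_dot_product_attention(
    q, k, v,
    attn_mask=None,
    dropout_p=self.dropout if self.training else 0,
    is_causal=True,
)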

Alternatively, the original mask can be passed into the attn_mask field; however, due to the kernel constraints mentioned above, that would limit the implementation to supporting only the generic sdpa_math kernel.

        + +

        Step 3 (Bonus): Faster matmuls with padding

        + +

        On top of the performance improvements from SDPA, our analysis yielded a nice ancillary win. In Andrej’s words “The most dramatic optimization to nanoGPT so far (~25% speedup) is to simply increase the vocab size from 50257 to 50304 (nearest multiple of 64).”

Tweet by Andrej Karpathy

        The vocab size determines the dimensions of matmuls in the output layer of GPT, and these are so large that they were taking a majority of the time for the entire training loop! We discovered that they were achieving performance significantly below the peak throughput achievable on the A100 GPU, and guessed from NVIDIA’s matmul documentation that 64-element alignment would yield better results. Indeed, padding these matmuls achieves nearly a 3x speedup! The underlying cause is that unaligned memory accesses significantly reduce efficiency. A deeper analysis can be found in this Twitter thread.
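A one-line way to compute such a padded vocabulary size (illustrative arithmetic only, not code from nanoGPT):

vocab_size = 50257
padded_vocab_size = ((vocab_size + 63) // 64) * 64  # rounds up to 50304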

        + +

        With this optimization we were able to further reduce training time from ~113 ms (using flash attention) to ~87 ms per batch.

        + +

        Results

        + +

The figure below demonstrates the performance gained using PyTorch custom kernels. Here are the exact figures:

• baseline (nanoGPT implementation): ~143ms
• sdpa_math (generic): ~134ms (6.71% faster)
• mem_efficient kernel: ~119ms (20.16% faster)
• flash_attention kernel: ~113ms (26.54% faster)
• flash_attention + padded vocab: ~87ms (64.37% faster)

        All code was run on an 8 x NVIDIA Corporation A100 server with 80 GB HBM [A100 SXM4 80GB], and for the purpose of this experiment dropout was set to 0.

        + +

Figure 2: Using scaled dot product attention with custom kernels and torch.compile delivers significant speedups for training large language models, such as for nanoGPT shown here.

        + +

        Enhancing Numerical Model Stability

        + +

        In addition to being faster, PyTorch’s implementation offers increased numerical stability by avoiding loss of precision in many execution scenarios. There is a great explanation here, but essentially the PyTorch implementation scales the Query and Key matrices before multiplication, which is said to be more stable and avoid loss of precision. Because of the merged custom kernel architecture of SDPA, this scaling does not introduce additional overhead in the computation of the attention result. In comparison, an implementation from the individual computational components would require separate pre-scaling at additional cost. For an additional explanation, see Appendix A.

        + +

        Improved Memory Consumption

        + +

Yet another large advantage of using the torch SDPA kernels is the reduced memory footprint, which allows for the utilization of larger batch sizes. The following chart compares the best validation loss after one hour of training for both flash attention and the baseline implementations of causal attention. As can be seen, the maximum batch size achieved with the baseline causal attention implementation (on an 8 x NVIDIA Corporation A100 server with 80 GB HBM) was 24, significantly less than the maximum achieved with flash attention, which was 39.

        + +

Figure 3: Using Flash Attention enables the usage of larger batch sizes, allowing users to achieve lower validation loss after one hour of training (smaller is better).

        + +

        Conclusion

        + +

        Accelerated PyTorch 2 Transformers were designed to make the training and production deployment of state-of-the-art transformer models affordable and integrated with PyTorch 2.0 model JIT compilation. The newly introduced PyTorch SDPA operator provides improved performance for training Transformer models and is particularly valuable for the expensive Large Language Model training. In this post we demonstrate a number of optimizations on the exemplary nanoGPT model including:

• Over 26% training speedup, when compared against the baseline with constant batch size
• An additional speedup achieved with padded vocabulary, bringing the total optimization to approximately 64% compared to the baseline
• Additional numerical stability

        Appendix A: Analyzing Attention Numeric Stability

        + +

        In this section we provide a more in depth explanation of the previously mentioned enhanced numerical stability which is gained by prescaling SDPA’s input vectors. The following is a simplified version of nanoGPT’s mathematical implementation of SDPA. The important thing to note here is that the query undergoes matrix multiplication without being scaled.

        + +
# nanoGPT implementation of SDPA
# notice q (our query vector) is not scaled !
att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
att = att.masked_fill(self.bias[:,:,:T,:T] == 0, float('-inf'))
att = F.softmax(att, dim=-1)

# Dropout is set to 0, so we can safely ignore this line in the implementation
# att = self.attn_dropout(att)

y_nanogpt = att @ v # (B, nh, T, T) x (B, nh, T, hs) -> (B, nh, T, hs)
        + +

        The following is the equivalent mathematical implementation in torch’s scaled_dot_product_attention.

        + +
# PyTorch implementation of SDPA
embed_size = q.size(-1)
scaling_factor = math.sqrt(math.sqrt(embed_size))
q = q / scaling_factor  # notice q _is_ scaled here !

# same as above, but with scaling factor
att = q @ (k.transpose(-2, -1) / scaling_factor)
att = att.masked_fill(self.bias[:,:,:T,:T] == 0, float('-inf'))
att = F.softmax(att, dim=-1)

# Dropout is set to 0, so we can safely ignore this line in the implementation
# att = self.attn_dropout(att)

y_scale_before = att @ v
        + +

        Mathematically both approaches should be equivalent, however our experimentation shows that in practice we receive different results from each approach.

        + +

        Using the approach above, we verified y_scale_before matches the expected output from using the scaled_dot_product_attention method while y_nanogpt does not.

        + +

        The torch.allclose method was used to test equivalence. Specifically, we showed that:

        + +
y_sdpa = torch.nn.functional._scaled_dot_product_attention(
    q,
    k,
    v,
    attn_mask=self.bias[:,:,:T,:T] != 0,
    dropout_p=0.0,
    need_attn_weights=False,
    is_causal=False,
)

torch.allclose(y_sdpa, y_nanogpt)       # False, indicating fp issues
torch.allclose(y_sdpa, y_scale_before)  # True, as expected
        + +

        Appendix B: Reproducing Experiment Results

        + +

        Researchers seeking to reproduce these results should start with the following commit from Andrej’s nanoGPT repository - b3c17c6c6a363357623f223aaa4a8b1e89d0a465. This commit was used as the baseline when measuring the per batch speed improvements. For results which include padded vocabulary optimizations (which yielded the most significant improvements to batch speed), use the following commit - 77e7e04c2657846ddf30c1ca2dd9f7cbb93ddeab. From either checkout, selecting kernels for experimentation is made trivial with the use of the torch.backends API.

        + +

        The desired kernel can be selected via a context manager:

        + +
with torch.backends.cuda.sdp_kernel(
    enable_math=False,
    enable_flash=False,
    enable_mem_efficient=True
):
    train(model)
        +
        +
        +
        + + +
        +
        +
        +
        +

diff --git a/blog/accelerating-llama3/index.html b/blog/accelerating-llama3/index.html
new file mode 100644
index 000000000000..02fbc6784b43
--- /dev/null
+++ b/blog/accelerating-llama3/index.html
@@ -0,0 +1,774 @@

Accelerating Llama3 FP8 Inference with Triton Kernels | PyTorch

by Adnan Hoque, Less Wright, Chih Chieh Yang

        1.0 Summary

        + +

We present an optimized Triton FP8 GEMM (General Matrix-Matrix Multiply) kernel, TK-GEMM, which leverages SplitK parallelization. For small-batch-size inference, TK-GEMM delivers up to a 1.94x speedup over the base Triton matmul implementation, 1.87x over cuBLAS FP8, and 1.71x over cuBLAS FP16 for Llama3-70B inference problem sizes on NVIDIA H100 GPUs.

        + +

Figure 1. TK-GEMM Speedup over PyTorch (calling cuBLAS) for Llama3-70B Attention Layer Matrix Shapes (N=K=8192)

        + +

In this blog, we will cover how we designed an optimized kernel using Triton for FP8 inference and tuned it for Llama3-70B inference. We will cover FP8 (8-bit floating point), a new datatype supported by Hopper generation GPUs (SM90), the key SM90 features that Triton supports, and how we modified the parallelization to maximize memory throughput for memory-bound (inference) problem sizes.

        + +

        We also dedicate a section on CUDA graphs, an important technology that will help materialize kernel level speedups and enable developers who want to use Triton kernels in production settings to get additional performance gain.

        + +

        Repo and code available at: https://github.com/pytorch-labs/applied-ai

        + +

        2.0 FP8 Datatype

        + +

        The FP8 datatype was introduced jointly by Nvidia, Arm and Intel and serves as a successor to 16-bit floating point types. With half the bit count, it has the potential to provide significant throughput improvements over its predecessors for Transformer networks. The FP8 datatype consists of 2 formats:

        + +

E4M3 (4-bit exponent and 3-bit mantissa): able to store +/- 448 and nan.
E5M2 (5-bit exponent and 2-bit mantissa): able to store +/- 57,344, nan and inf.

        + +

Above: BF16, FP16, FP8 E4M3 and FP8 E5M2. To show precision differences, the closest representation to 0.3952 is shown in each format. Image Credit: Nvidia

        + +

We use E4M3 in inference and the forward pass of training due to its higher precision, and E5M2 in the training backward pass due to its higher dynamic range. Nvidia has designed their H100 FP8 Tensor Core to provide a peak of 3958 TFLOPS, 2x the FLOPS of the FP16 Tensor Core.
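As a quick illustration of the two formats in PyTorch (a minimal sketch; the dtypes assume a recent PyTorch build with FP8 support):

import torch

x = torch.randn(4, 4)
x_e4m3 = x.to(torch.float8_e4m3fn)  # higher precision: inference and forward pass
x_e5m2 = x.to(torch.float8_e5m2)    # higher dynamic range: backward pass
print(x_e4m3.dtype, x_e5m2.dtype)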

        + +

        We designed our Triton kernel with these hardware innovations in mind and in the rest of the blog we will discuss methods to leverage and verify that these features are indeed being utilized by the Triton compiler.

        + +

        3.0 Triton Hopper Support and FP8 Tensor Core Instruction

        + +

        The Hopper GPU architecture has added the following new features that we can expect will accelerate FP8 GEMM.

• TMA (Tensor Memory Accelerator) Hardware Unit
• WGMMA (Warp Group Matrix Multiply-Accumulate Instruction)
• Threadblock Clusters

        Triton currently takes advantage of one of these features, the wgmma instruction, whereas PyTorch (calling cuBLAS) leverages all 3 which makes these speedups even more impressive. To fully take advantage of the Hopper FP8 Tensor Core, the wgmma is necessary even though the older mma.sync instruction is still supported.

        + +

        The key difference between the mma and wgmma instructions is that instead of 1 CUDA warp being responsible for an output shard, an entire warp group, 4 CUDA warps, asynchronously contributes to an output shard.

        + +

To see what this instruction looks like in practice, and to verify that our Triton kernel is indeed utilizing this feature, we analyzed the PTX and SASS assembly using Nsight Compute.

Figure 2. PTX Assembly

        + +

        This instruction is further lowered into a QGMMA instruction in SASS.

Figure 3. SASS Assembly

        + +

        Both instructions tell us that we are multiplying two FP8 E4M3 input tensors and accumulating in F32, which confirms that the TK-GEMM Kernel is utilizing the FP8 Tensor Core and the lowering is being done correctly.

        + +

        4.0 SplitK Work Decomposition

        + +

Figure 4. TK-GEMM vs Base Triton GEMM TFLOPS for M = 1-64

        + +

The base Triton FP8 GEMM implementation does not perform well for the small-M regime, where for a matrix multiplication of A (MxN) x B (NxK), M < N, K. To optimize for this type of matrix profile, we applied a SplitK work decomposition instead of the Data Parallel decomposition found in the base Triton kernel. This greatly improved latencies for the small-M regime.

        + +

        For background, SplitK launches additional thread blocks along the k dimension to calculate partial output sums. The partial results from each thread block are then summed using an atomic reduction. This allows for finer grained work decomposition with resultant performance improvements. More details on SplitK are available in our arxiv paper.
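A host-side sketch of the SplitK idea (illustrative only; the real Triton kernel assigns each K-slice to its own thread block and accumulates the partial results with an atomic add):

import torch

def splitk_matmul(a: torch.Tensor, b: torch.Tensor, split_k: int = 4) -> torch.Tensor:
    # Partition the shared K dimension, compute partial GEMMs, then reduce.
    M, K = a.shape
    _, N = b.shape
    out = torch.zeros(M, N, dtype=torch.float32, device=a.device)
    for k_slice in torch.chunk(torch.arange(K, device=a.device), split_k):
        out += a[:, k_slice].float() @ b[k_slice, :].float()
    return out

# A skinny, small-M problem similar to the regime discussed above (sizes are illustrative)
a = torch.randn(16, 1024)
b = torch.randn(1024, 1024)
print((splitk_matmul(a, b) - a.float() @ b.float()).abs().max())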

        + +

        After carefully tuning the other relevant hyperparameters for our kernel such as tile sizes, number of warps and the number of pipeline stages to Llama3-70B problem sizes we were able to produce up to 1.94x speedup over the Triton base implementation. For a more comprehensive introduction to hyperparameter tuning, see our blog.

        + +

Above: NCU profiler times for TK-GEMM under varying batch sizes, compared with PyTorch (calling cuBLAS) FP8 and FP16.

        + +

Note that starting at M=32, the cuBLAS FP8 kernel starts to outperform TK-GEMM. For M >= 32, we suspect that the hyperparameters we found are not optimal, and another set of experiments is required to determine the optimal parameters for the mid-sized M regime.

        + +

        5.0 CUDA Graphs to Enable End-to-End Speedup

        + +

To realize these speedups in an end-to-end setting, we must take into account both the kernel execution time (GPU duration) and the wall time (CPU + GPU duration). Triton kernels that are handwritten (as opposed to generated by torch.compile) are known to suffer from high kernel-launch latencies. If we use the torch profiler to trace the TK-GEMM kernel, we can see the call stack on the CPU side to pinpoint exactly what is causing the slowdown.

        + +

Figure 5. CPU Launch Overhead: 2.413ms

        + +

        From above, we see that the majority of the wall time of our optimized kernel is dominated by JIT (Just-in-Time) compilation overhead. To combat this we can use CUDA graphs.

        + +

Figure 6. CUDA Graphs Visualization. Image Credit: PyTorch

        + +

The key idea is that instead of multiple kernel launches, we can create and instantiate a graph (a one-time cost) and then submit that instance of the graph for execution. To illustrate this point, we simulate a Llama3-70B Attention layer. As shown in the figure below, generated using Nsight Systems, the time between each GEMM is 165us, compared to the 12us spent on the actual matmul, due to the CPU kernel launch overhead. This means that 92% of the time in an Attention layer the GPU is idle and not doing any work.
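A minimal sketch of this capture-and-replay pattern using PyTorch’s CUDA graph API (run_attention_layer and the input shape are hypothetical stand-ins for the TK-GEMM-based layer):

import torch

def run_attention_layer(x):
    # hypothetical stand-in for the TK-GEMM-based attention projections
    return x @ x.t()

static_input = torch.randn(16, 8192, device="cuda")

# Warm up on a side stream before capture
s = torch.cuda.Stream()
s.wait_stream(torch.cuda.current_stream())
with torch.cuda.stream(s):
    static_output = run_attention_layer(static_input)
torch.cuda.current_stream().wait_stream(s)

# Capture the kernel launches into a graph (a one-time cost)
g = torch.cuda.CUDAGraph()
with torch.cuda.graph(g):
    static_output = run_attention_layer(static_input)

# Replay: copy new data into the static input, then launch the whole graph at once
static_input.copy_(torch.randn(16, 8192, device="cuda"))
g.replay()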

        + +

Figure 7. Simulated Llama3-70B Attention Layer with TK-GEMM

        + +

        To show the impact of CUDA graphs, we then created a graph of the TK-GEMM kernel in the toy Attention layer and replayed the graph. Below, we can see that the gaps between kernel executions are reduced to 6.65us.

        + +

Figure 8. Simulated Llama3-70B Attention Layer with TK-GEMM and CUDA Graphs

        + +

        In practice, this optimization would result in a 6.4x speedup of a single attention layer in Llama3-70B, over naively using TK-GEMM in a model without CUDA graphs.

        + +

        6.0 Potential Future Optimization Paths

        + +

Figure 9. TMA Hardware Unit. Image Credit: Nvidia

        + +

        The Nvidia H100 features a TMA hardware unit. The dedicated TMA unit frees up registers and threads to do other work, as address generation is completely handled by the TMA. For memory bound problem sizes, this can provide even further gain when Triton enables support for this feature.

        + +

Figure 10. Tensor Core Utilization (Arrows Indicate Degrees of Freedom)

        + +

To identify how well we are utilizing the Tensor Core, we can analyze the roofline chart. Notice that we are in the memory-bound region, as expected for small M. To improve kernel latency we can either increase arithmetic intensity, which with a fixed problem size can only be achieved by exploiting data locality and other loop optimizations, or increase memory throughput. This requires a more optimal parallel algorithm, specialized both for the FP8 datatype and for the problem-size characteristics we expect to see in FP8 inference.

        + +

Figure 11. DRAM Throughput Circled, 1.65TB/s vs Peak 3.35TB/s on H100 (M=16, N=8192, K=8192)

        + +

        Lastly, we can see that we are only achieving around 50% of peak DRAM throughput on the NVIDIA H100. High performance GEMM kernels typically achieve around 70-80% of peak throughput. This means that there is still a lot of room to improve and the techniques mentioned above (loop unrolling, optimized parallelization) are needed for additional gain.

        + +

        7.0 Future Work

        + +

        For future research, we would like to explore CUTLASS 3.x and CuTe to leverage more direct control over Hopper features especially in terms of obtaining direct TMA control and exploring pingpong architectures, which have shown promising results for FP8 GEMM.


diff --git a/blog/accelerating-llm-inference/index.html b/blog/accelerating-llm-inference/index.html
new file mode 100644
index 000000000000..cdec39142c4b
--- /dev/null
+++ b/blog/accelerating-llm-inference/index.html
@@ -0,0 +1,918 @@

Accelerating LLM Inference with GemLite, TorchAO and SGLang | PyTorch

by Teams at PyTorch, Mobius Labs and SGLang

Large Language Models (LLMs) are typically very resource-intensive, requiring significant amounts of memory, compute and power to operate effectively. Quantization provides a solution by reducing weights and activations from 16-bit floats to lower bitrates (e.g., 8-bit, 4-bit, 2-bit), achieving significant speedup and memory savings while also enabling support for larger batch sizes.

        + +

Existing solutions for low-precision inference work well for small batch sizes, but suffer from the following issues:

• Performance drops when we increase the batch size
• Restrictions on the types of quantization; for example, some kernels only support symmetric quantization, which can affect the accuracy of the model at lower bits
• The interplay between quantization, serialization, and tensor parallelism (TP) makes it difficult to load quantized models and requires changes to user models

        To address these challenges, we created an end-to-end, performant, modular and extensible low-precision inference solution integrating the following libraries:

• GemLite, a Triton kernel library, tackles the performance limitations of large batch sizes and restrictions on the types of quantization
• TorchAO, a PyTorch-native library, provides a streamlined experience for quantization, sparsity, and tensor parallelism (with DTensor)
• SGLang, a fast, efficient and hackable serving framework for Large Language Models (LLM) and Vision Language Models (VLM) with extensive model support

If you’re interested in trying this out in SGLang, please follow these repro instructions. For the rest of the blog, we’ll walk through the relevant details for GemLite, TorchAO and SGLang, both in terms of the design of each library and how the integration addresses the problems mentioned above. Finally, we’ll present benchmarking results on the Llama 3.1-8B model across different batch sizes and tensor parallel sizes.

        + +

        1. Teaser of Results

        + +

        Following is a summary of the results in 8xH100 machine on Llama 3.1-8B for decode. For all experiments, the baseline is bfloat16 torch.compiled model:

|                          | bfloat16 w/ torch.compile | int4 weight only quantization, group size 64 | float8 per row dynamic quantization |
| Batch size 1, TP size 1  | 131 tokens/sec            | 255 tokens/sec (1.95x speedup)               | 166 tokens/sec (1.27x speedup)      |
| Batch size 32, TP size 1 | 2799 tokens/sec           | 3241 tokens/sec (1.16x speedup)              | 3586 tokens/sec (1.28x speedup)     |
| Batch size 32, TP size 4 | 5575 tokens/sec           | 6334 tokens/sec (1.14x speedup)              | 6159 tokens/sec (1.10x speedup)     |

        Our solution supports NVIDIA GPUs, including H100 and A100, and achieves speedup over the compiled bfloat16 baseline across batch sizes and TP sizes for both int4 weight only (from 1.14x to 1.95x) and float8 dynamic quantization (from 1.10x to 1.28x). Note that quantization may have a small impact on accuracy, which is outside the scope of this blogpost. Our int4 weight-only quantization is compatible with accuracy preserving techniques like HQQ. Please refer to TorchAO’s README, this benchmark, and this blog for more information.

        + +

        2. GemLite: Kernel Development

        + +

        The kernels were developed as part of GemLite, a project dedicated to optimizing low-bit matrix multiplication kernels. Developed using Triton, GemLite provides highly flexible and performant solutions across various activations, bitrates and hardware. In a nutshell, the kernels offer:

        + +
• Support for various activation data types: fp16, int8 and fp8
• Compatibility: works seamlessly with non-packed (e.g., int8, fp8) and packed formats (e.g., uint4, uint2, uint1)
• Performance Optimization: includes optimized kernels and autotuning tools to achieve high performance across different hardware and batch sizes
• Integration: compatible with torch.compile and CUDA graphs, ensuring support for advanced features like tensor parallelism
        + +

        Kernel Selection

        + +

        Optimizing kernel selection for large language model (LLM) generation requires addressing the distinct needs of different batch sizes. LLM workloads involve a mix of compute-bound and memory-bound iterations: smaller batch sizes are memory-bound, while larger batch sizes become compute-bound. GemLite kernels are designed to adapt to these varying demands, ensuring optimal execution for each scenario.

        + +

        In memory-bound scenarios, where data transfer is the limiting factor, the processor often waits for data to be fetched, leading to underutilized computational resources. For batch size = 1, a GEMV kernel performs best, whereas for larger batch sizes, GEMM kernels are more efficient. For batch sizes between 2 and 64, when matrices are “skinny,” a GEMM-SPLITK kernel is used to enable better GPU utilization (arXiv).

        + +

        GemLite includes the following kernels optimized for each of these scenarios:

        + +

        Single Sample Inference

        + +

        For single-sample inferences, we use GEMV kernels. However, asymmetric quantization methods require additional metadata, such as scales and zero points, to be loaded for each block. This can lead to increased memory transfer, so careful handling is essential.

        + +

        Specifically, for packed data, our experiments indicate that loading scales and zero points only once per two consecutive blocks minimizes redundant operations. Since these blocks share the same metadata, this approach results in:

        + +
• 5–8% end-to-end inference speedup compared to the default GEMV kernel
• 30–40% improvement over the traditional Split-K method
        + +

        This new kernel/algorithm, GEMV_REVSPLITK, is available here.

        + +

        For non-packed data, the GEMV_SPLITK algorithm is employed. This algorithm iterates over the k-dimension to compute the dot product without relying on Triton’s tl.dot.

        + +

        Batched Inference

        + +

        For moderate batch sizes, we use the GEMM-based Split-K method (arXiv) which splits the k-dimension (weight rows) into multiple jobs. The optimal-split SPLIT_K parameter is found by autotuning values ranging from 1 to 16. Setting SPLIT_K=1 enables a fallback implementation to a GEMM kernel, allowing the same kernel code to be used for compute-bound batch sizes starting from 32 and 64, depending on the matrix shape and the device.

        + +

        Maximizing High Performance: Key Implementation Insights

        + +

        Various implementation details must be carefully addressed to achieve high performance. Following are some of the key aspects we focused on to ensure high performance:

        + +
1. Autotuning for Performance

Autotuning is critical for achieving optimal kernel performance. Since this process can be time-intensive, GemLite provides tools to automatically save and load autotuning results for all kernels. This ensures that autotuning is performed only once per GPU device, minimizing runtime, reducing repetitive overhead, and maintaining consistent performance across runs.

2. Ensuring Kernel Correctness

Ensuring kernel correctness across different quantization and configuration settings is essential. Triton’s early configuration pruning plays a key role in this process. For example, during Split-K tuning, configurations are selected only if K is divisible by BLOCK_SIZE_K × SPLIT_K, and BLOCK_SIZE_K is further pruned based on the group-size value (see the sketch after this list). This approach ensures both efficiency and correctness in kernel operation.

3. Overcoming Bit-Unpacking Bottlenecks

When deploying on data-center-grade GPUs like NVIDIA’s A100 and H100, performance bottlenecks related to bit-unpacking were observed. To mitigate these, various bit-packing configurations were explored, including packing along columns versus rows and experimenting with different bit-packing widths (e.g., 8-bit vs. 32-bit). Notably, transitioning from 32-bit to 8-bit packing delivered performance improvements of up to 18% on the A100 and 6% on the H100.

4. torch.compile compatibility

To ensure seamless compatibility with PyTorch’s torch.compile, kernel calls are wrapped in a custom_op. This integration allows advanced features such as pre-hooks and early configuration pruning to function correctly, delivering accurate results without sacrificing performance. While some of these features are not yet fully supported in PyTorch, the custom_op implementation effectively bridges the gap, ensuring smooth integration and high performance.
        + +
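To make the divisibility rule above concrete, here is a minimal sketch of an early-config-prune hook of the kind Triton’s autotuner accepts via prune_configs_by. The function name, the group_size handling, and the exact hook signature are illustrative assumptions, not GemLite’s actual code:

import triton

def prune_splitk_configs(configs, named_args, **kwargs):
    # Keep only configs where K is divisible by BLOCK_SIZE_K * SPLIT_K and the
    # block size is compatible with the quantization group size, as described above.
    K = named_args["K"]
    group_size = kwargs.get("group_size", K)
    pruned = []
    for cfg in configs:
        block_k = cfg.kwargs["BLOCK_SIZE_K"]
        split_k = cfg.kwargs["SPLIT_K"]
        if K % (block_k * split_k) == 0 and block_k <= group_size:
            pruned.append(cfg)
    return pruned or configs  # fall back to the full set if everything was pruned

# Usage sketch:
# @triton.autotune(configs=..., key=["M", "N", "K"],
#                  prune_configs_by={"early_config_prune": prune_splitk_configs})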

        3. TorchAO

        + +

        TorchAO is a PyTorch native quantization and sparsity library for both training and inference, featuring simple user APIs to train, quantize and deploy low precision models, and composability with other PyTorch features like distributed inference and torch.compile.

        + +

        PyTorch does not support low precision dtypes or different packing formats by default. With Tensor Subclass, we extend PyTorch native Tensor abstractions and model quantization as dtype conversion, while different packing formats for custom kernels are handled through layouts. For example, we support quantized linear operations with int4 weights, packed in a Tensor Core friendly layout, with tinygemm or GemLite kernel implementations. More details can be found here.

        + +

        flow diagram

        + +

        Apart from more PyTorch native abstractions for developers, we want to highlight two benefits of this design for modeling users.

        + +
1. Serialization: Save and load quantized weights into a state_dict just like a floating point model, eliminating the need to transform a floating point model into a quantized model before the quantized weights are loaded. This reduces the friction of distributing and deploying quantized models.

2. Composability: Seamless integration with downstream features like tensor parallel, allowing users to focus on modeling without worrying about compatibility with tensor parallel, torch.compile, and other PyTorch features. Since these features are implemented with Tensor-level abstraction, users can quantize and do distributed inference with no model changes most of the time.
        + +

        GemLite Kernel Integration

        + +

To achieve the aforementioned benefits for the GemLite kernel, we integrated GemLite into TorchAO. This integration takes advantage of GemLite’s wide support and flexibility to allow for weight-only quantization at 4 and 8 bits, under asymmetric and symmetric quantization schemes, 32- and 8-bit packing sizes, as well as grouped and ungrouped quantization. We enable this integration via the quantize_ API, which can be used alongside the GemLite constructor as follows:

        + +
        quantize_(model, gemlite_uintx_weight_only(group_size, bit_width, packing_bitwidth))
        +
        + +
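As a more complete (hedged) usage sketch, the import paths and parameter values below are assumptions for illustration rather than taken from the post:

import torch
from torchao.quantization import quantize_, gemlite_uintx_weight_only

# GemLite kernels expect float16 (see the repro notes later in this post)
model = torch.nn.Sequential(torch.nn.Linear(1024, 1024)).cuda().half()

# 4-bit weight-only quantization, group size 64, weights packed into 32-bit words
quantize_(model, gemlite_uintx_weight_only(group_size=64, bit_width=4, packing_bitwidth=32))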

The primary difficulty in creating this integration was making sure that the TorchAO composability guarantees were satisfied for the entire breadth of GemLite quantization kernel options. While the primary integration was relatively straightforward, making sure every quantization type and its associated kernels worked well with tensor parallel was non-trivial.

        + +

        Torch Tensor Parallel

        + +

        Tensor Parallelism is an effective way to speed up LLM inference. TP shards large matrices of linear or embedding modules onto multiple devices, typically in column-wise or row-wise styles. As the weight matrix gets distributed, computation is decomposed too. For example, the column-wise pattern below enables simultaneous matrix-vector multiply on four devices:

        + +

        equation

        + +

        PyTorch implements TP by converting a regular tensor (e.g. matrix A) into a DTensor:

        + +
        dtensor = _shard_tensor(mA, device_mesh, (Shard(0),))
        +
        + +

Since DTensor stores meta information about the sharding, it knows how to reconstruct the full result when needed. Take Transformers’ feedforward module for example: as the up projection and down projection use column-wise and row-wise sharding respectively, DTensor will automatically perform an all-reduce on the ranks’ results as they move into the next operation. Such automation allows model authors to focus on computation without worrying about the communication needed for distributed execution.

        + +

        Tensor Parallel and Quantization Order

        + +

Since both DTensor and quantization are tensor-level transformations, the application order matters for ensuring a workflow generally works across different setups. We have two observations: (i) checkpoints are typically saved in quantized formats, to avoid paying the quantization overhead before each run; and (ii) TP may run on a different number of devices, depending on resource constraints or service agreements. As such, we first apply quantization to the original tensor and, if reuse is desired, save it to disk. At service launch time, we load the quantized checkpoint and shard the tensors into DTensors on the fly as we load them into the model.

        + +

        Tensor Parallel Support in TorchAO

        + +

Since we quantize the model first and then distribute the Tensor, we’ll have DTensor(QuantizedTensor(weight)), where DTensor is a distributed Tensor class and QuantizedTensor is a quantized tensor class in TorchAO. QuantizedTensor should support the operators called when constructing a DTensor, including slice and view ops. To make sure the overall execution is efficient, slicing the packed weight along dimension 0 or 1 must match the result of first slicing the unpacked weight and then packing it (pack and slice operations should commute); otherwise, the packing format is not compatible with tensor parallelism.
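This requirement can be written as a small property check. In the sketch below, pack is a stand-in for the kernel’s actual packing routine, and the halving along one dimension is just an example of a tensor-parallel shard:

import torch

def pack_and_slice_commute(weight, pack, dim=0):
    # Slice the unpacked weight (e.g., one TP shard), then pack it...
    shard = weight.narrow(dim, 0, weight.shape[dim] // 2)
    packed_shard = pack(shard)
    # ...and compare with packing first, then slicing the packed weight along the same dim.
    packed = pack(weight)
    shard_of_packed = packed.narrow(dim, 0, packed.shape[dim] // 2)
    return torch.equal(packed_shard, shard_of_packed)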

        + +

        4. SGLang

        + +

        SGLang is a fast serving framework for large language models and vision language models. It is known for its almost zero-overhead batch scheduler and fast constrained decoding. It is mainly implemented in Python, lightweight, and easy to hack. It is also one of the first frameworks to integrate torch.compile.

        + +

        TorchAO integration in SGLang

        + +

We integrated TorchAO’s quantize_ API, which applies a specific type of quantization to a model, into SGLang. So far it supports int4 weight-only quantization (both the tinygemm and GemLite versions), float8 dynamic quantization, and a few other types of quantization. Users can enable quantization by adding the --torchao-config argument to the benchmarking script. The currently enabled options also support tensor parallelism through composition with DTensor, which is enabled with the --tp-size option.

        + +

        Torch Native Tensor Parallel Support in SGLang

        + +

Existing model definitions in SGLang use special linear modules that are coupled with the tensor parallelism style, for example: MergedColumnParallelLinear, QKVParallelLinear and RowParallelLinear. To decouple the model definition from the tensor parallelization style, we defined a PyTorch-native model that uses the plain nn.Linear module from PyTorch and relies on PyTorch tensor parallelism APIs for parallelization and torch.compile for speedup. At the relevant module hierarchies, we add a dictionary describing how a submodule should be parallelized. For example, in class LlamaAttention, we define:

        + +
        _tp_plan = {
        +    "qkv_proj": "Colwise_Sharded",
        +    "o_proj": "Rowwise",
        +}
        +
        + +

        where "qkv_proj" and "o_proj" are the FQNs of the wqkv and wo projections, and the values are their TP styles.

        + +

        We then define a TP engine in model_parallel.py. It searches for _tp_plan recursively within the model, and applies the indicated TP styles to the submodules using PyTorch’s parallelize_module API.
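A minimal sketch of such an engine is shown below. The style-name mapping and the recursive traversal are assumptions for illustration rather than SGLang’s exact implementation:

from torch.distributed.tensor.parallel import (
    ColwiseParallel,
    RowwiseParallel,
    parallelize_module,
)

# Assumed mapping from the string styles used in _tp_plan to PyTorch TP styles
_STYLE_MAP = {
    "Colwise_Sharded": ColwiseParallel(),
    "Rowwise": RowwiseParallel(),
}

def apply_tp_plans(module, device_mesh):
    plan = getattr(module, "_tp_plan", None)
    if plan:
        # Apply the indicated TP styles to the named submodules
        parallelize_module(
            module,
            device_mesh,
            {name: _STYLE_MAP[style] for name, style in plan.items()},
        )
    for child in module.children():
        apply_tp_plans(child, device_mesh)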

        + +

        5. Results

        + +

        The evaluation focused on two popular quantization techniques for H100 machines: int4 weight-only quantization and float8 dynamic quantization. These methods were chosen due to their widespread use in optimizing memory efficiency and computational performance on H100 machines, making them ideal candidates for benchmarking against various workloads.

        + +
• int4 Weight-Only Quantization: This method significantly reduces memory footprint and accelerates decode for memory-bound workloads, with minimal impact on performance in compute-intensive scenarios like prefill or larger batch sizes. We present results for bf16, GemLite, and tinygemm kernels below, across various batch sizes and tensor parallel configurations.
• float8 Dynamic Quantization: While offering less memory savings, this method often provides higher accuracy and balanced speedups for both memory-bound and compute-bound tasks. With Hopper-grade hardware and native fp8 support, the efficient cutlass/cuBLAS kernels used by AO contribute to a significant speedup.
        + +

The graphs below show the decode tokens/sec for different TP sizes; each graph shows the results across different batch sizes and for different types of quantization (a hedged sketch of the corresponding TorchAO calls follows the list):

        + +
• BF16 is our bfloat16, torch.compile’d baseline
• tinygemm-4-64 uses int4_weight_only quantization in TorchAO: 4-bit groupwise quantization with a group size of 64, using the tinygemm kernel
• gemlite-4-64 uses gemlite_uintx_weight_only quantization in TorchAO: 4-bit groupwise quantization with a group size of 64, using the GemLite kernel
• fp8dq-per_row uses float8_dynamic_activation_float8_weight quantization in TorchAO: both activations and weights are quantized with per-row scales
        + +
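For reference, here is a hedged sketch of how the tinygemm and float8 settings above map to TorchAO calls; the import paths and the granularity argument are assumptions based on TorchAO’s documented API, and the gemlite-4-64 call was shown earlier:

import torch
from torchao.quantization import (
    quantize_,
    int4_weight_only,
    float8_dynamic_activation_float8_weight,
    PerRow,
)

# tinygemm expects bfloat16 weights/activations
model = torch.nn.Sequential(torch.nn.Linear(1024, 1024)).cuda().bfloat16()

# tinygemm-4-64: 4-bit weight-only, groupwise with group size 64 (tinygemm kernel)
quantize_(model, int4_weight_only(group_size=64))

# fp8dq-per_row: float8 dynamic quantization of activations and weights with per-row scales
# (apply this to a separate bfloat16 model copy rather than an already-quantized model)
# quantize_(model, float8_dynamic_activation_float8_weight(granularity=PerRow()))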

        bar chart

        + +

        bar chart

        + +

        bar chart

        + +

For int4 weight-only quantization, at batch size 1, the tinygemm kernel achieved the best performance. However, its efficiency declined with increasing batch sizes. Conversely, GemLite effectively bridged this gap, delivering superior performance at larger batch sizes. GemLite also achieved a 9–10x speedup during the prefill phase compared to tinygemm, even though its performance is still being optimized within the constraints of Triton.

        + +

Float8 dynamic quantization showed a consistent 1.3x speedup over bfloat16 with tensor parallel size 1 across different batch sizes, and a 1.1x to 1.2x speedup at larger tensor parallel sizes. As the tensor parallel size increases, the overall speedup decreases, which is expected due to the reduction in matmul size. Note that we do expect to get a speedup for prefill as well, but since we rely on torch.compile for speedup and prefill compilation is not yet enabled in SGLang, we will leave this for future work.

        + +

        Repro Instructions

        + +

        We conducted benchmarks on an 8xH100 machine using GemLite 0.4.1, SGLang built from commit feb2b76, TorchAO nightly 0.8.0.dev20241223+cu124, and PyTorch 2.5.1. The Llama-3.1 Instruct models were chosen as the architecture for evaluation.

        + +
        BATCH_SIZE=16
        +# Note: gemlite is only compatible with float16
        +# while int4wo-64 (tinygemm-4-64 as shown in the graph) and fp8dq-per_row should use bfloat16
        +DTYPE=float16
        +# int4wo-64, fp8dq-per_tensor
        +TORCHAO_CONFIG=gemlite-4-64
        +TP_SIZE=2
        +# Decode performance
        +python3 -m sglang.bench_offline_throughput --model-path meta-llama/Llama-3.1-8B-Instruct --json-model-override-args '{"architectures": ["TorchNativeLlamaForCausalLM"]}' --dataset-name random --random-input 1024 --random-output 512 --random-range 1 --num-prompts $BATCH_SIZE --enable-torch-compile --dtype $DTYPE --torchao-config $TORCHAO_CONFIG --tp-size $TP_SIZE
        +
        +# Example output
        +# Benchmark...
+# [2024-12-20 12:42:16 TP0] Prefill batch. #new-seq: 2, #new-token: 2046, #cached-token: 4, cache hit rate: 0.06%, token usage: 0.00, #running-req: 0, #queue-req: 0
+# ...
+# [2024-12-20 12:45:35 TP0] Decode batch. #running-req: 16, #token: 16763, token usage: 0.01, gen throughput (token/s): 2.20, #queue-req: 0
+# [2024-12-20 12:45:38 TP0] Decode batch. #running-req: 16, #token: 24443, token usage: 0.02, gen throughput (token/s): 2739.89, #queue-req: 0
        +
        +# We reported the last throughput (token/s) as the performance for decode
        +
        + +

        Conclusion

        + +

With performant and extensible kernels from GemLite, the PyTorch-native architecture optimization library TorchAO, and the high-performance inference framework SGLang, we showcased fast end-to-end quantized inference for both int4 and float8 across different batch sizes and tensor parallel sizes, with simple and composable user APIs that reduce the resource requirements for LLMs. This integration is our first step towards meeting the needs of fast inference across different models, workloads, precisions and hardware, and we look forward to continuing to advance the state of the art for end-to-end mixed and low precision LLM inference.

        + +

        Our immediate future work focuses on the following:

        + +
• Exploring diverse combinations of weight and activation quantization to strike the best balance between speed and accuracy
• Extending support to additional GPU architectures to broaden accessibility
• Enhancing compatibility with MoE models to address growing demands in scalable inference
• Allowing for easy integration of fast custom kernels in TorchAO so that they can be easily leveraged by SGLang and other inference frameworks
• Developing an auto-quantization tool in TorchAO that lets users trade off between performance and accuracy (we did not measure the accuracy impact in this blog post)
• Better integration with tensor parallelism in SGLang to support running larger models
• Enabling torch.compile for the prefill phase in SGLang
        + +

        We also invite the community to actively test, provide feedback, and contribute to shaping the future of fast and efficient LLM inference.

diff --git a/blog/accelerating-moe-model/index.html b/blog/accelerating-moe-model/index.html new file mode 100644 index 000000000000..50c0ebba8c31 --- /dev/null +++ b/blog/accelerating-moe-model/index.html @@ -0,0 +1,745 @@

Accelerating MoE model inference with Locality-Aware Kernel Design | PyTorch

        + by + + Adnan Hoque, Less Wright, Antoni Virós Martin, Chih-Chieh Yang + +

        +

        1.0 Summary

        + +

        We show that by implementing column-major scheduling to improve data locality, we can accelerate the core Triton GEMM (General Matrix-Matrix Multiply) kernel for MoEs (Mixture of Experts) up to 4x on A100, and up to 4.4x on H100 Nvidia GPUs. This post demonstrates several different work decomposition and scheduling algorithms for MoE GEMMs and shows, at the hardware level, why column-major scheduling produces the highest speedup.

        + +

        Repo and code available at: https://github.com/pytorch-labs/applied-ai/tree/main/kernels/triton/inference/col_major_moe_gemm.

        + +

        Figure 1A. Optimized Fused MoE GEMM Kernel TFLOPs on A100 for varying Batch Sizes M


        + +

        Figure 1B. Optimized Fused MoE GEMM Kernel TFLOPs on H100 for varying Batch Sizes M


        + +

        2.0 Background

        + +

OpenAI’s Triton is a hardware-agnostic language and compiler that, as our prior blog post has shown, can be used to accelerate quantization workflows. We also showed that in terms of kernel development, much of the same learnings and performance analysis tools from CUDA can be leveraged to provide similar insights into how Triton kernels work under the hood and subsequent measures to speed up these kernels in latency-sensitive environments. As Triton becomes increasingly adopted in production settings, it is important that developers understand the common tips and tricks for developing performant kernels as well as the generality of these methods to various different architectures and workflows. Thus, this post will explore how we optimized the Triton kernel developed by vLLM for the popular Mixture of Experts (MoE) Mixtral model using classical techniques and how these techniques can be implemented in Triton to achieve performance gains.

        + +

        Mixtral 8x7B is a sparse Mixture of Experts Language Model. Unlike the classical dense transformer architecture, each transformer block houses 8 MLP layers where each MLP is an ‘expert’. As a token flows through, a router network selects which 2 of the 8 experts should process that token and the results are then combined. The selected experts for the same token vary at each layer. As a result, while Mixtral 8x7B has a total of 47B params, during inference only 13B params are active.

        + +

        The MoE GEMM (General Matrix-Matrix Multiply) kernel receives a stacked weight matrix containing all the experts, and must subsequently route each token to the TopK (2 for Mixtral) experts by utilizing a mapping array produced by the resultant scores of the router network. In this post, we provide methods to efficiently parallelize this computation during inference time, specifically during autoregression (or decoding stages).

        + +

        3.0 Work Decomposition - SplitK

        + +

        We have previously shown that for the matrix problem sizes found in LLM inference, specifically in the context of W4A16 quantized inference, GEMM kernels can be accelerated by applying a SplitK work decomposition. Thus, we started our MoE acceleration research by implementing SplitK in the vLLM MoE Kernel, which produced speedups of approximately 18-20% over the Data Parallel approach.

        + +

        This result shows that the SplitK optimization can be used as a part of a more formulaic approach to improving/developing Triton kernels in inference settings. To build intuition about these different work decompositions, let’s consider a simple example for the multiplication of two 4x4 matrices and SplitK=2.

        + +

        In the data parallel GEMM kernel shown below, the computation for a single block of the output matrix will be handled by 1 threadblock, TB0.

        + +

        Figure 2. Data Parallel GEMM


        + +

In contrast, in the SplitK kernel, the work required to compute 1 block in the output matrix is “split” or shared amongst 2 thread blocks, TB0 and TB1. This provides better load balancing and increased parallelism.

        + +

        Figure 3. SplitK GEMM


        + +

The key idea is that we’ve increased our parallelism from MN to MN*SplitK. This approach does incur some costs, such as adding inter-threadblock communication via atomic operations. However, these costs are minimal compared to the savings of other constrained GPU resources like shared memory and registers. Most importantly, the SplitK strategy provides superior load balancing characteristics for skinny matrices (as is the case in MoE inference), which are the common matrix profile during decoding and inference.
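To make the decomposition concrete, here is a toy PyTorch illustration (not the Triton kernel itself) of SplitK=2 for two 4x4 matrices; the loop stands in for the two thread blocks, and the accumulation plays the role of the atomic adds:

import torch

A = torch.randn(4, 4)
B = torch.randn(4, 4)
SPLIT_K = 2
k_chunk = A.shape[1] // SPLIT_K

C = torch.zeros(4, 4)
for s in range(SPLIT_K):  # each iteration corresponds to one thread block's partial product
    ks = slice(s * k_chunk, (s + 1) * k_chunk)
    C += A[:, ks] @ B[ks, :]  # accumulation plays the role of atomic_add in the real kernel

assert torch.allclose(C, A @ B, atol=1e-5)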

        + +

        4.0 GEMM Hardware Scheduling - Column Major

        + +

        To improve upon the ~20% speedup with SplitK we focused our investigation on the logic that controls the hardware scheduling of the GEMM in Triton Kernels. Our profiling of the vLLM MoE kernel showed a low L2 cache hit rate, thus we investigated three scheduling options - column-major, row-major and grouped launch. Due to some intrinsic properties of MoE models, such as large expert matrices, and having to dynamically load TopK (2 for Mixtral) matrices during the duration of the kernel, cache reuse/hit rate becomes a bottleneck that this optimization will target.

        + +

For background, in our previous blog, we touched on the concept of “tile swizzling,” a method to achieve a greater L2 cache hit rate. This concept relates to how the software schedules the GEMM onto the SMs of a GPU. In Triton, this schedule is determined by the pid_m and pid_n calculations. Our key insight is that for skinny matrix multiplications, a column-major ordering ensures optimal reuse of the columns of the weight matrix, B. To illustrate this, let’s take a look at a snippet of what a column-major computation of pid_m and pid_n would look like:
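(Figure 4 below shows the original snippet; the following is a hedged reconstruction of the idea as a helper meant to be called inside the GEMM kernel, not the exact kernel code.)

import triton
import triton.language as tl

@triton.jit
def column_major_pids(M, BLOCK_SIZE_M: tl.constexpr):
    pid = tl.program_id(axis=0)
    grid_m = tl.cdiv(M, BLOCK_SIZE_M)
    # Column-major: pid_m varies fastest, so consecutive program ids compute
    # C(0, 0), C(1, 0), C(2, 0), ... and keep reusing the same column block of B.
    pid_m = pid % grid_m
    pid_n = pid // grid_m
    return pid_m, pid_n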

        + +

        Figure 4. Column Major ordering in PyTorch


        + +

        From above, we note that with this mapping, we schedule the GEMM such that we calculate the output blocks of C in the following order: C(0, 0), C(1, 0), C(2, 0),… etc. To understand the implications we provide the following illustration:

        + +

Figure 5. Cache Reuse Pattern for a Column-Major GEMM Schedule (panels: Activation matrix / Weight matrix, L1/L2 Cache, C - Output Matrix)

        + +

In the above simplified view of a column-major schedule, let’s assume, for a GEMM with a skinny activation matrix A, that the entire matrix can fit in the GPU cache, which is a reasonable assumption for the type of problem sizes we encounter in MoE inference. This allows for maximal reuse of the columns of the weight matrix B, because a column of B can be reused for the corresponding output tile calculations C(0,0), C(1,0) and C(2,0). Consider instead a row-major schedule, C(0,0), C(0,1), C(0,2), etc. Here we would have to evict the column of B and issue multiple load instructions to DRAM to calculate the same number of output blocks.

        + +

        An important design consideration when optimizing kernels is a memory access pattern that results in the least amount of global load instructions. This optimal memory access pattern is achieved with the column-major schedule. The results below showcase the performance of the three schedules we investigated:

        + +

        Figure 6. Comparison of GEMM Schedules on A100 for varying Batch Sizes M


        + +

        The column-major schedule provides up to a 4x speedup over the other patterns, and as we’ll show in the next section, provides an optimal memory access pattern due to greatly improved data locality.

        + +

        5.0 Nsight Compute Analysis - Throughput and Memory Access Pattern

        + +

        For performance analysis, we focus on the M = 2 case for the H100. A similar study can be done for the A100 as many of the same observations carry over. We note the following salient results, that showcase the impact of our optimizations.

        + +

Figure 7. H100 Memory Throughput Chart for M = 2. Note the very large increase in cache hit rates: L1 cache hit rate (+2696%) and L2 cache hit rate (+254%).

        + +

        Figure 8. H100 Memory Instruction Statistics M = 2. Note the 49% reduction in global memory loads.


        + +

        These statistics show that our optimizations had the intended effect, which can be seen in the reduced cache misses, reduced memory accesses and the resultant 2.7x speedup. More concretely, the trace shows us a 2.54x increase in L2 hit rate (Figure 7), and a ~50% reduction in DRAM accesses (Figure 8).

        + +

        These improvements ultimately yield the reduced latency, with the optimized kernel being 2.7x faster for bs=2 and 4.4x for bs=512.

        + +

        6.0 Future Work

        + +

Our kernel was tested in FP16, which showcases the numerics and performance of column-major scheduling for MoE, but most production models use BFloat16. We encountered a limitation in Triton, in that tl.atomic_add does not support BFloat16, and also hit launch latency concerns that would require CUDA graph support for column-major production use. In initial testing this translated to a 70% end-to-end speedup, but we encountered some expert mapping inconsistencies in the end-to-end environment that are not reflected in the test environment, so further work is needed to fully realize these speedups.

        + +

        For future work, we intend to move this into a CUDA kernel which will ensure full BFloat16 support and reduced launch latency relative to Triton, and potentially resolve the expert routing inconsistency. We’ve also previously published work on enabling GPTQ W4A16 with Triton GEMM kernels, so natural follow-on work would include fusing dequantization into this kernel to allow for a GPTQ quantized inference path.

        + +

        7.0 Reproducibility

        + +

        We have open sourced the Triton kernel code along with an easy to run performance benchmark for readers interested in comparing or verifying the performance on their own GPU.

        + +

        Acknowledgements

        + +

        We want to thank Daniel Han, Raghu Ganti, Mudhakar Srivatsa, Bert Maher, Gregory Chanan, Eli Uriegas, and Geeta Chauhan for their review of the presented material and Woosuk from the vLLM team as we built on his implementation of the Fused MoE kernel.

diff --git a/blog/accelerating-neural-network-training/index.html b/blog/accelerating-neural-network-training/index.html new file mode 100644 index 000000000000..6bf259bde463 --- /dev/null +++ b/blog/accelerating-neural-network-training/index.html @@ -0,0 +1,863 @@

Accelerating Neural Network Training with Semi-Structured (2:4) Sparsity | PyTorch

        + by + + Jesse Cai, Daniel Haziza, Supriya Rao + +

        +

        Over the past year, we’ve added support for semi-structured (2:4) sparsity into PyTorch. With just a few lines of code, we were able to show a 10% end-to-end inference speedup on segment-anything by replacing dense matrix multiplications with sparse matrix multiplications.

        + +

However, matrix multiplications are not unique to neural network inference - they happen during training as well. By expanding on the core primitives we used earlier to accelerate inference, we were also able to accelerate model training. We wrote a replacement nn.Linear layer, SemiSparseLinear, that is able to achieve a 1.3x speedup across the forwards + backwards pass of the linear layers in the MLP block of ViT-L on an NVIDIA A100.

        + +

        End-to-end, we see a wall time reduction of 6% for a DINOv2 ViT-L training, with virtually no accuracy degradation out of the box (82.8 vs 82.7 on ImageNet top-1 accuracy).

        + +

        2 strategies for training a ViT model

        + +

        We compare 2 strategies for training a ViT model for 125k iterations on 4x NVIDIA A100s: either fully dense (blue), or sparse for 70% of the training, then dense (orange). Both achieve similar results on the benchmarks, but the sparse variant trains 6% faster. For both experiments, we evaluate the intermediate checkpoints with and without sparsity.

        + +

        As far as we are aware, this is the first OSS implementation of accelerated sparse training and we’re excited to provide a user API in torchao. You can try accelerating your own training runs with just a few lines of code:

        + +
        # Requires torchao and pytorch nightlies and CUDA compute capability 8.0+
        +import torch
        +from torchao.sparsity.training import (
        +    SemiSparseLinear,
        +    swap_linear_with_semi_sparse_linear,
        +)
        +
        +model = torch.nn.Sequential(torch.nn.Linear(1024, 4096)).cuda().half()
        +
        +# Specify the fully-qualified-name of the nn.Linear modules you want to swap
        +sparse_config = {
        +    "seq.0": SemiSparseLinear
        +}
        +
        +# Swap nn.Linear with SemiSparseLinear, you can run your normal training loop after this step
        +swap_linear_with_semi_sparse_linear(model, sparse_config)
        +
        + +

        How does this work?

        + +

        The general idea behind sparsity is simple: skip calculations involving zero-valued tensor elements to speed up matrix multiplication. However, simply setting weights to zero isn’t enough, as the dense tensor still contains these pruned elements and dense matrix multiplication kernels will continue to process them, incurring the same latency and memory overhead. To achieve actual performance gains, we need to replace dense kernels with sparse kernels that intelligently bypass calculations involving pruned elements.

        + +

        These kernels work on sparse matrices, which remove the pruned elements and store the specified elements in a compressed format. There are many different sparse formats, but we’re particularly interested in semi-structured sparsity, also known as 2:4 structured sparsity or fine-grained structured sparsity or more generally N:M structured sparsity.

        + +

        2:4 sparse compressed representation

        + +

        2:4 sparse compressed representation. Original Source

        + +

        A 2:4-sparse matrix is a matrix where at most 2 elements are non-zero for every 4 elements, as illustrated in the image above. Semi-structured sparsity is attractive because it exists in a goldilocks spot of performance and accuracy:

        + +
1. NVIDIA GPUs since Ampere offer hardware acceleration and library support (cuSPARSELt) for this format, with matrix multiplication being up to 1.6x faster.
2. Pruning models to fit this sparsity pattern does not degrade accuracy as much as other patterns. NVIDIA’s whitepaper shows pruning then retraining is able to recover accuracy for most vision models.
        + +

        Illustration of 2:4 (sparse) matrix multiplication on NVIDIA GPUs

        + +

        Illustration of 2:4 (sparse) matrix multiplication on NVIDIA GPUs. Original source

        + +

        Accelerating inference with semi-structured sparsity is straightforward. Since our weights are fixed during inference, we can prune and compress the weight ahead of time (offline) and store the compressed sparse representation instead of our dense tensor.

        + +

        flow chart

        + +

        Then, instead of dispatching to dense matrix multiplication we dispatch to sparse matrix multiplication, passing in the compressed sparse weight instead of the normal dense one. For more information about accelerating models for inference using 2:4 sparsity, please refer to our tutorial.
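A hedged sketch of this offline prune-and-compress flow with PyTorch’s semi-structured sparsity support is shown below; the trivial 2:4 mask is purely illustrative, and the linked tutorial has the full recipe:

import torch
from torch.sparse import to_sparse_semi_structured

linear = torch.nn.Linear(2048, 2048).half().cuda()

# Illustrative 2:4 mask: keep the first 2 of every 4 weights along each row
mask = torch.tensor([1, 1, 0, 0], dtype=torch.bool, device="cuda")
mask = mask.tile(linear.weight.shape[0], linear.weight.shape[1] // 4)

# Prune offline, then store the compressed sparse representation in place of the dense weight
pruned = linear.weight.detach() * mask
linear.weight = torch.nn.Parameter(to_sparse_semi_structured(pruned))

x = torch.rand(8, 2048).half().cuda()
y = linear(x)  # dispatches to the sparse matmul kernel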

        + +

        Extending sparse inference acceleration to training

        + +

        In order to use sparsity to reduce the training time of our models, we need to consider when the mask is calculated, as once we store the compressed representation the mask is fixed.

        + +

        Training with a fixed mask applied to an existing trained dense model (also known as pruning) does not degrade accuracy, but this requires two training runs - one to obtain the dense model and another to make it sparse, offering no speedups.

        + +

        Instead we’d like to train a sparse model from scratch (dynamic sparse training), but training from scratch with a fixed mask will lead to a significant drop in evaluations, as the sparsity mask would be selected at initialization, when the model weights are essentially random.

        + +

        To maintain the accuracy of the model when training from scratch, we prune and compress the weights at runtime, so that we can calculate the optimal mask at each step of the training process.

        + +

        Conceptually you can think of our approach as an approximate matrix multiplication technique, where we `prune_and_compress` and dispatch to `sparse_GEMM` in less time than a `dense_GEMM` call would take. This is difficult because the native pruning and compression functions are too slow to show speedups.

        + +

        Given the shapes of our ViT-L training matrix multiplications (13008x4096x1024), we measured the runtime of a dense and sparse GEMM respectively at 538us and 387us. In other words, the pruning and compression step of the weight matrix must run in less than 538-387=151us to have any efficiency gain. Unfortunately, the compression kernel provided in cuSPARSELt already takes 380us (without even considering the pruning step!).

        + +

        Given the max NVIDIA A100 memory IO (2TB/s), and considering that a prune and compress kernel would be memory bound, we could theoretically prune and compress our weight (4096x1024x2 bytes=8MB) in 4us (8MB / 2TB/s)! And in fact, we were able to write a kernel that prunes and compresses a matrix into 2:4-sparse format, and runs in 36 us (10x faster than the compression kernel in cuSPARSELt), making the entire GEMM (including the sparsification) faster. Our kernel is available for use in PyTorch.

        + +

        Our custom sparsification kernel

        + +

        Our custom sparsification kernel, which includes pruning + compression, is ~30% faster across a linear layer forward+backward. Benchmarks run on a NVIDIA A100-80GB GPU.

        + +

        Writing a performant runtime sparsification kernel

        + +

        There were multiple challenges we faced in order to implement a performant runtime sparsification kernel, which we will explore below.

        + +

        1) Handling the backwards pass

        + +

For the backwards pass, we need to calculate dL/dX and dL/dW for the gradient update and the subsequent layer, which means we need to calculate xW^T and x^TW respectively.

        + +

        Overview of runtime sparsification for training acceleration (FW + BW pass)


        + +

        However this is problematic, because the compressed representation cannot be transposed, since there’s no guarantee that the tensor is 2:4 sparse in both directions.

        + +

        Both matrices are valid 2:4 matrices. However, the right one is no longer a valid 2:4 matrix once transposed because one column contains more than 2 elements


        + +

        Therefore, we prune a 4x4 tile, instead of a 1x4 strip. We greedily preserve the largest values, ensuring that we take at most 2 values for each row / column. While this approach is not guaranteed to be optimal, as we sometimes only preserve 7 values instead of 8, it efficiently calculates a tensor that is 2:4 sparse both row-wise and column-wise.
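A toy Python version of this greedy rule is shown below; it is illustrative only, since the real implementation is a CUDA kernel working in registers:

import torch

def prune_4x4_tile(tile):
    # Keep the largest-magnitude values while allowing at most 2 kept elements
    # per row and per column, so both the tile and its transpose stay 2:4 sparse.
    kept = torch.zeros_like(tile, dtype=torch.bool)
    row_counts = [0, 0, 0, 0]
    col_counts = [0, 0, 0, 0]
    # Visit elements from largest to smallest magnitude
    for idx in torch.argsort(tile.abs().flatten(), descending=True).tolist():
        r, c = divmod(idx, 4)
        if row_counts[r] < 2 and col_counts[c] < 2:
            kept[r, c] = True
            row_counts[r] += 1
            col_counts[c] += 1
    return tile * kept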

        + +

        We then compress both the packed tensor and the packed transpose tensor, storing the transpose tensor for the backwards pass. By calculating both the packed and packed transpose tensor at the same time, we avoid a secondary kernel call in the backwards pass.

        + +

        Our kernel prunes the weight matrix in registers

        + +

        Our kernel prunes the weight matrix in registers, and writes the compressed values in global memory. It also prunes at the same time W.t, which is needed for the backward pass, minimizing the memory IO

        + +

There’s some additional transpose trickery needed to handle the backwards pass - the underlying hardware only supports operations where the first matrix is sparse. For weight sparsification during inference, when we need to calculate xW^T we rely on transpose properties to swap the order of the operands.

        + +

        Math formula

        + +

        During inference, we use torch.compile to fuse the outer transpose into subsequent pointwise ops in order to avoid paying a performance penalty.

        + +

        However in the case of the backwards pass of training, we have no subsequent pointwise op to fuse with. Instead, we fuse the transposition into our matrix multiplication by taking advantage of cuSPARSELt’s ability to specify the row / column layout of the result matrix.

        + +

        2) Kernel tiling for efficient memory-IO

        + +

In order for our kernel to be as efficient as possible, we want to coalesce our reads / writes, as we found memory IO to be the main bottleneck. This means that within a CUDA thread, we want to read/write chunks of 128 bytes at a time, so that multiple parallel reads/writes can be coalesced into a single request by the GPU memory controller.

        + +

Therefore, instead of a thread handling a single 4x4 tile, which is only 4x4x2 = 32 bytes, we decided that each thread will handle 4 4x4 tiles (aka an 8x8 tile), which allows us to operate on 8x8x2 = 128-byte chunks.

        + +

        Kernel tiling for efficient memory-IO

        + +

        3) Sorting elements in a 4x4 tile without warp-divergence

        + +

        For each individual 4x4 tile within our thread we calculate a bitmask that specifies which elements to prune and which elements to keep. To do this we sort all 16 elements and greedily preserve elements, so long as they do not break our 2:4 row / col constraint. This preserves only the weights with the largest values.

        + +

        Crucially we observe that we are only ever sorting a fixed number of elements, so by using a branchless sorting network, we can avoid warp divergence.

        + +

        Sorting network diagram

        + +

        For clarity, the transposed packed tensor and metadata are omitted. Sorting network diagram taken from Wikipedia.

        + +

Warp divergence occurs when we have conditional execution across a thread block. In CUDA, work items in the same work group (thread block) are dispatched at the hardware level in batches (warps). If we have conditional execution, such that some work-items in the same batch run different instructions, then they are masked when the warp is dispatched, or dispatched sequentially.

        + +

        For example, if we have some code like if (condition) do(A) else do(B), where condition is satisfied by all the odd-numbered work items, then the total runtime of this conditional statement is do(A) + do(B), since we would dispatch do(A) for all odd-numbered work-items, masking out even-numbered work-items, and do(B) for all even numbered work-items, masking out odd-numbered work-items. This answer provides more information about warp divergence.

        + +

        4) Writing the compressed matrices and metadata

        + +

Once the bitmask has been computed, the weight data has to be written back in a compressed format in global memory. This is not trivial, because the data needs to stay in registers, and it’s not possible to index registers (e.g., C[i++] = a prevents us from storing C in registers). Furthermore, we found that nvcc was using many more registers than we expected, which caused register spilling and impacted global performance. We write this compressed matrix to global memory in Column-Major format to make the writes more efficient.

        + +

        compressed matrix to global memory in Column-Major format

        + +

        We also need to write the cuSPARSELt metadata as well. This metadata layout is quite similar to the one from the open-source CUTLASS library and is optimized for being loaded efficiently through shared-memory in the GEMM kernel with the PTX ldmatrix instruction.

        + +

        However, this layout is not optimized to be written efficiently: the first 128 bits of the metadata tensor contains metadata about the first 32 columns of the rows 0, 8, 16 and 24. Recall that each thread handles an 8x8 tile, which means that this information is scattered across 16 threads.

        + +

        We rely on a series of warp-shuffle operations, once for the original and transposed representation respectively to write the metadata. Fortunately, this data represents less than 10% of the total IO, so we can afford to not fully coalesce the writes.

        + +

        DINOv2 Sparse Training: Experimental Setup and Results

        + +

        For our experiments, the ViT-L model is trained on ImageNet for 125k steps using the DINOv2 method. All our experiments were run on 4x AMD EPYC 7742 64-core CPUs and 4x NVIDIA A100-80GB GPUs. During sparse training, the model is trained with 2:4 sparsity enabled for the first part of the training, where only half of the weights are enabled. This sparsity mask on the weights is dynamically recomputed at every step, as weights are continuously updated during the optimization. For the remaining steps, the model is trained densely, producing a final model without 2:4 sparsity (except the 100% sparse training setup), which is then evaluated.

Training setup | ImageNet 1k log-regression
0% sparse (125k dense steps, baseline) | 82.8
40% sparse (50k sparse -> 75k dense steps) | 82.9
60% sparse (75k sparse -> 50k dense steps) | 82.8
70% sparse (87.5k sparse -> 37.5k dense steps) | 82.7
80% sparse (100k sparse -> 25k dense steps) | 82.7
90% sparse (112.5k sparse -> 12.5k dense steps) | 82.0
100% sparse (125k sparse steps) | 82.3 (2:4-sparse model)
        + +

        sparsity training diagrams

        + +

        During the sparse training steps, in the backward pass we obtain a dense gradient for the sparse weights. For the gradient descent to be sound, we should also sparsify this gradient before using it in the optimizer to update the weights. Instead of doing that, we use the full dense gradient to update the weights - we found this to work better in practice: this is the STE (Straight Through Estimator) strategy. In other words, we update all the parameters at every step, even the ones we don’t use.

        + +

        Conclusion and Future Work

        + +

In this blog post, we’ve shown how to accelerate neural network training with semi-structured sparsity and explained some of the challenges we faced. We were able to achieve a 6% end-to-end speedup on DINOv2 training with a small 0.1 pp accuracy drop.

        + +

        There are several areas of expansion for this work:

        + +
• Expansion to new sparsity patterns: Researchers have created new sparsity patterns like V:N:M sparsity that use the underlying semi-structured sparse kernels to allow for more flexibility. This is especially interesting for applying sparsity to LLMs, as 2:4 sparsity degrades accuracy too much, but we have seen some positive results for more general N:M patterns.
• Performance optimizations for sparse fine-tuning: This post covers sparse training from scratch, but oftentimes we want to fine-tune a foundational model. In this case, a static mask may be sufficient to preserve accuracy, which would enable us to make additional performance optimizations.
• More experiments on pruning strategy: We calculate the mask at each step of the network, but calculating the mask every n steps may yield better training accuracy. Overall, figuring out the best strategy to use semi-structured sparsity during training is an open area of research.
• Compatibility with fp8: The hardware also supports fp8 semi-structured sparsity, and this approach should work similarly with fp8 in principle. In practice, we would need to write similar sparsification kernels, and could possibly fuse them with the scaling of the tensors.
• Activation sparsity: Efficient sparsification kernels also make it possible to sparsify the activations during training. Because the sparsification overhead grows linearly with the sparsified matrix size, setups with large activation tensors compared to the weight tensors could benefit more from activation sparsity than weight sparsity. Furthermore, activations are naturally sparse because of the usage of ReLU or GELU activation functions, reducing accuracy degradation.
        + +

        If you are interested in these problems, please feel free to open an issue / PR in torchao, a community we’re building for architecture optimization techniques like quantization and sparsity. Additionally, if you have general interest in sparsity please reach out in CUDA-MODE (#sparsity)

diff --git a/blog/accelerating-pytorch-vision-models-with-channels-last-on-cpu/index.html b/blog/accelerating-pytorch-vision-models-with-channels-last-on-cpu/index.html new file mode 100644 index 000000000000..7aeec47a9756 --- /dev/null +++ b/blog/accelerating-pytorch-vision-models-with-channels-last-on-cpu/index.html @@ -0,0 +1,770 @@

Accelerating PyTorch Vision Models with Channels Last on CPU | PyTorch

        + by + + Mingfei Ma (Intel), Vitaly Fedyunin (Meta), Wei Wei (Meta) + +

        +

        Overview

        + +

Memory formats have a significant impact on performance when running vision models; generally, Channels Last is more favorable from a performance perspective due to better data locality.

        + +

        This blog will introduce fundamental concepts of memory formats and demonstrate performance benefits using Channels Last on popular PyTorch vision models on Intel® Xeon® Scalable processors.

        + +

        Memory Formats Introduction

        + +

        Memory format refers to data representation that describes how a multidimensional (nD) array is stored in linear (1D) memory address space. The concept of memory format has two aspects:

        + +
• Physical Order is the layout of data storage in physical memory. For vision models, usually we talk about NCHW and NHWC. These are descriptions of physical memory layout, also referred to as Channels First and Channels Last respectively.
• Logical Order is a convention on how to describe tensor shape and stride. In PyTorch, this convention is NCHW. No matter what the physical order is, tensor shape and stride will always be depicted in the order of NCHW.
        + +

        Fig-1 is the physical memory layout of a tensor with shape of [1, 3, 4, 4] on both Channels First and Channels Last memory format (channels denoted as R, G, B respectively):

        + +

        + +

        + +

        +Fig-1 Physical memory layout of Channels First and Channels Last +

        + +

        Memory Formats Propagation

        + +

        The general rule for PyTorch memory format propagation is to preserve the input tensor’s memory format. Which means a Channels First input will generate a Channels First output and a Channels Last input will generate a Channels Last output.
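A quick way to see this rule in action (an illustrative check, not from the original post):

import torch

conv = torch.nn.Conv2d(3, 8, kernel_size=3, padding=1)

x = torch.rand(1, 3, 32, 32)                                       # Channels First input
print(conv(x).is_contiguous(memory_format=torch.channels_last))    # False: output stays Channels First

x_cl = x.to(memory_format=torch.channels_last)                     # Channels Last input
print(conv(x_cl).is_contiguous(memory_format=torch.channels_last)) # True: output stays Channels Last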

        + +

For Convolution layers, PyTorch uses oneDNN (oneAPI Deep Neural Network Library) by default to achieve optimal performance on Intel CPUs. Since it is physically impossible to achieve highly optimized performance directly with the Channels First memory format, input and weight are first converted to a blocked format and then computed. oneDNN may choose different blocked formats according to input shapes, data type and hardware architecture, for vectorization and cache reuse purposes. The blocked format is opaque to PyTorch, so the output needs to be converted back to Channels First. Though the blocked format would bring about optimal computing performance, the format conversions may add overhead and therefore offset the performance gain.

        + +

On the other hand, oneDNN is optimized for the Channels Last memory format and can use it for optimal performance directly, so PyTorch will simply pass a memory view to oneDNN. This means the conversion of the input and output tensors is saved. Fig-2 indicates the memory format propagation behavior of convolution on PyTorch CPU (the solid arrow indicates a memory format conversion, and the dashed arrow indicates a memory view):

        + +

        + +

        + +

        +Fig-2 CPU Conv memory format propagation +

        + +

On PyTorch, the default memory format is Channels First. In case a particular operator doesn’t have support for Channels Last, the NHWC input would be treated as a non-contiguous NCHW tensor and therefore fall back to Channels First, which will consume precious memory bandwidth on CPU and result in suboptimal performance.

        + +

Therefore, it is very important to extend the scope of Channels Last support for optimal performance. We have implemented Channels Last kernels for the commonly used operators in the CV domain, applicable for both inference and training, such as:

        + +
          +
        • Activations (e.g., ReLU, PReLU, etc.)
        • +
        • Convolution (e.g., Conv2d)
        • +
        • Normalization (e.g., BatchNorm2d, GroupNorm, etc.)
        • +
        • Pooling (e.g., AdaptiveAvgPool2d, MaxPool2d, etc.)
        • +
        • Shuffle (e.g., ChannelShuffle, PixelShuffle)
        • +
        + +

        Refer to Operators-with-Channels-Last-support for details.

        + +

        Native Level Optimization on Channels Last

        + +

        As mentioned above, PyTorch uses oneDNN to achieve optimal performance on Intel CPUs for convolutions. The rest of memory format aware operators are optimized at PyTorch native level, which doesn’t require any third-party library support.

        + +
• Cache friendly parallelization scheme: we keep the same parallelization scheme for all the memory format aware operators, which helps increase data locality when passing each layer’s output to the next.
• Vectorization on multiple archs: generally, we can vectorize on the innermost dimension on the Channels Last memory format, and each of the vectorized CPU kernels will be generated for both AVX2 and AVX512.
        + +

While contributing to Channels Last kernels, we tried our best to optimize the Channels First counterparts as well. The fact is that for some operators, such as Convolution and Pooling, it is physically impossible to achieve optimal performance on Channels First.

        + +

        Run Vision Models on Channels Last

        + +

        The Channels Last related APIs are documented at PyTorch memory format tutorial. Typically, we can convert a 4D tensor from Channels First to Channels Last by:

        + +
        # convert x to channels last
        +# suppose x’s shape is (N, C, H, W)
        +# then x’s stride will be (HWC, 1, WC, C)
        +x = x.to(memory_format=torch.channels_last)
        +
        + +

To run models on the Channels Last memory format, you simply need to convert the input and model to Channels Last and then you are ready to go. The following is a minimal example showing how to run ResNet50 with TorchVision on the Channels Last memory format:

        + +
        import torch
        +from torchvision.models import resnet50
        +
        +N, C, H, W = 1, 3, 224, 224
        +x = torch.rand(N, C, H, W)
        +model = resnet50()
        +model.eval()
        +
        +# convert input and model to channels last
        +x = x.to(memory_format=torch.channels_last)
        +model = model.to(memory_format=torch.channels_last)
        +model(x)
        +
        + +

The Channels Last optimization is implemented at the native kernel level, which means you may apply other functionalities such as torch.fx and TorchScript together with Channels Last as well.

        + +

        Performance Gains

        + +

        We benchmarked inference performance of TorchVision models on Intel® Xeon® Platinum 8380 CPU @ 2.3 GHz, single instance per socket (batch size = 2 x number of physical cores). Results show that Channels Last has 1.3x to 1.8x performance gain over Channels First.

        + +

        + +

        + +

        The performance gain primarily comes from two aspects:

        + +
          +
        • For Convolution layers, Channels Last saved the memory format conversion to blocked format for activations, which improves the overall computation efficiency.
        • +
        • For Pooling and Upsampling layers, Channels Last can use vectorized logic along the most inner dimension, e.g., “C”, while Channels First can’t.
        • +
        + +

For memory format non-aware layers, Channels Last and Channels First have the same performance.

        + +

        Conclusion & Future Work

        + +

        In this blog we introduced the fundamental concepts of Channels Last and demonstrated its performance benefits on CPU for vision models. The current work is limited to 2D models, and we will extend the optimization effort to 3D models in the near future!

        + +

        Acknowledgement

        + +

        The results presented in this blog are a joint effort of the Meta and Intel PyTorch teams. Special thanks to Vitaly Fedyunin and Wei Wei from Meta who spent precious time and gave substantial assistance! Together we made one more step on the path of improving the PyTorch CPU ecosystem.

        + +

        diff --git a/blog/accelerating-pytorch-with-cuda-graphs/index.html b/blog/accelerating-pytorch-with-cuda-graphs/index.html new file mode 100644 index 000000000000..1845c5fe2a82 --- /dev/null +++ b/blog/accelerating-pytorch-with-cuda-graphs/index.html @@ -0,0 +1,926 @@

        Accelerating PyTorch with CUDA Graphs | PyTorch

        October 26, 2021

        +

        Accelerating PyTorch with CUDA Graphs


        by Vinh Nguyen, Michael Carilli, Sukru Burc Eryilmaz, Vartika Singh, Michelle Lin, Natalia Gimelshein, Alban Desmaison, Edward Yang

        +

        Today, we are pleased to announce a new advanced CUDA feature, CUDA Graphs, has been brought to PyTorch. Modern DL frameworks have complicated software stacks that incur significant overheads associated with the submission of each operation to the GPU. When DL workloads are strong-scaled to many GPUs for performance, the time taken by each GPU operation diminishes to just a few microseconds and, in these cases, the high work submission latencies of frameworks often lead to low utilization of the GPU. As GPUs get faster and workloads are scaled to more devices, the likelihood of workloads suffering from these launch-induced stalls increases. To overcome these performance overheads, NVIDIA engineers worked with PyTorch developers to enable CUDA graph execution natively in PyTorch. This design was instrumental in scaling NVIDIA’s MLPerf workloads (implemented in PyTorch) to over 4000 GPUs in order to achieve record-breaking performance.

        + +
        + +
        + +

        CUDA graphs support in PyTorch is just one more example of a long collaboration between NVIDIA and Facebook engineers. torch.cuda.amp, for example, trains with half precision while maintaining the network accuracy achieved with single precision and automatically utilizing tensor cores wherever possible. AMP delivers up to 3X higher performance than FP32 with just a few lines of code change. Similarly, NVIDIA’s Megatron-LM was trained using PyTorch on up to 3072 GPUs. In PyTorch, one of the most performant methods to scale-out GPU training is with torch.nn.parallel.DistributedDataParallel coupled with the NVIDIA Collective Communications Library (NCCL) backend.

        + +

        CUDA Graphs

        + +

        CUDA Graphs, which made their debut in CUDA 10, let a series of CUDA kernels be defined and encapsulated as a single unit, i.e., a graph of operations, rather than a sequence of individually launched operations. They provide a mechanism to launch multiple GPU operations through a single CPU operation, and hence reduce the launch overhead.

        + +

        The benefits of CUDA graphs can be demonstrated with the simple example in Figure 1. On the top, a sequence of short kernels is launched one-by-one by the CPU. The CPU launching overhead creates a significant gap between the kernels. If we replace this sequence of kernels with a CUDA graph, we initially need to spend a little extra time building the graph and launching the whole graph in one go on the first occasion, but subsequent executions will be very fast, as there will be very little gap between the kernels. The difference is more pronounced when the same sequence of operations is repeated many times, for example, over many training steps. In that case, the initial costs of building and launching the graph will be amortized over the entire number of training iterations. For a more comprehensive introduction on the topic, see our blog Getting Started with CUDA Graphs and GTC talk Effortless CUDA Graphs.

        + +

        +Cuda graphs reduce launching overhead by bundling multiple GPU operations into a single launchable unit, i.e., a graph. On the top, you can see five individual launches; whereas on the bottom, with CUDA graphs, they are all bundled into a single launch, reducing overhead. +
        + Figure 1. Benefits of using CUDA graphs +

        + +

        NCCL support for CUDA graphs

        + +

        The previously mentioned benefits of reducing launch overheads also extend to NCCL kernel launches. NCCL enables GPU-based collective and P2P communications. With NCCL support for CUDA graphs, we can eliminate the NCCL kernel launch overhead.

        + +

        Additionally, kernel launch timing can be unpredictable due to various CPU load and operating system factors. Such time skews can be harmful to the performance of NCCL collective operations. With CUDA graphs, kernels are clustered together so that performance is consistent across ranks in a distributed workload. This is especially useful in large clusters where even a single slow node can bring down overall cluster level performance.

        + +

        For distributed multi-GPU workloads, NCCL is used for collective communications. If we look at training a neural network that leverages data parallelism, without NCCL support for CUDA graphs, we’ll need a separate launch for each of forward/back propagation and NCCL AllReduce. By contrast, with NCCL support for CUDA graphs, we can reduce launch overhead by lumping together the forward/backward propagation and NCCL AllReduce all in a single graph launch.

        + +

        +With NCCL CUDA graph support, all the kernel launches for NCCL AllReduce for  the forward/backward propagation can be bundled into a graph to reduce overhead launch time. +
        + Figure 2. Looking at a typical neural network, all the kernel launches for NCCL AllReduce can be bundled into a graph to reduce overhead launch time. +

        + +

        PyTorch CUDA Graphs

        + +

        From PyTorch v1.10, the CUDA graphs functionality is made available as a set of beta APIs.

        + +

        API overview

        + +

        PyTorch supports the construction of CUDA graphs using stream capture, which puts a CUDA stream in capture mode. CUDA work issued to a capturing stream doesn’t actually run on the GPU. Instead, the work is recorded in a graph. After capture, the graph can be launched to run the GPU work as many times as needed. Each replay runs the same kernels with the same arguments. For pointer arguments this means the same memory addresses are used. By filling input memory with new data (e.g., from a new batch) before each replay, you can rerun the same work on new data.

        + +

        Replaying a graph sacrifices the dynamic flexibility of typical eager execution in exchange for greatly reduced CPU overhead. A graph’s arguments and kernels are fixed, so a graph replay skips all layers of argument setup and kernel dispatch, including Python, C++, and CUDA driver overheads. Under the hood, a replay submits the entire graph’s work to the GPU with a single call to cudaGraphLaunch. Kernels in a replay also execute slightly faster on the GPU, but eliding CPU overhead is the main benefit.

        + +

        You should try CUDA graphs if all or part of your network is graph-safe (usually this means static shapes and static control flow, but see the other constraints) and you suspect its runtime is at least somewhat CPU-limited.

        + +

        API example

        + +

        PyTorch exposes graphs via a raw torch.cuda.CUDAGraph class and two convenience wrappers, torch.cuda.graph and torch.cuda.make_graphed_callables.

        + +

        torch.cuda.graph is a simple, versatile context manager that captures CUDA work in its context. Before capture, warm up the workload to be captured by running a few eager iterations. Warmup must occur on a side stream. Because the graph reads from and writes to the same memory addresses in every replay, you must maintain long-lived references to tensors that hold input and output data during capture. To run the graph on new input data, copy new data to the capture’s input tensor(s), replay the graph, then read the new output from the capture’s output tensor(s).

        + +

        If the entire network is capture safe, one can capture and replay the whole network as in the following example.

        + +
        import torch
        +
        +N, D_in, H, D_out = 640, 4096, 2048, 1024
        +model = torch.nn.Sequential(torch.nn.Linear(D_in, H),
        +                            torch.nn.Dropout(p=0.2),
        +                            torch.nn.Linear(H, D_out),
        +                            torch.nn.Dropout(p=0.1)).cuda()
        +loss_fn = torch.nn.MSELoss()
        +optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
        +
        +# Placeholders used for capture
        +static_input = torch.randn(N, D_in, device='cuda')
        +static_target = torch.randn(N, D_out, device='cuda')
        +
        +# warmup
        +# Uses static_input and static_target here for convenience,
        +# but in a real setting, because the warmup includes optimizer.step()
        +# you must use a few batches of real data.
        +s = torch.cuda.Stream()
        +s.wait_stream(torch.cuda.current_stream())
        +with torch.cuda.stream(s):
        +    for i in range(3):
        +        optimizer.zero_grad(set_to_none=True)
        +        y_pred = model(static_input)
        +        loss = loss_fn(y_pred, static_target)
        +        loss.backward()
        +        optimizer.step()
        +torch.cuda.current_stream().wait_stream(s)
        +
        +# capture
        +g = torch.cuda.CUDAGraph()
        +# Sets grads to None before capture, so backward() will create
        +# .grad attributes with allocations from the graph's private pool
        +optimizer.zero_grad(set_to_none=True)
        +with torch.cuda.graph(g):
        +    static_y_pred = model(static_input)
        +    static_loss = loss_fn(static_y_pred, static_target)
        +    static_loss.backward()
        +    optimizer.step()
        +
        +real_inputs = [torch.rand_like(static_input) for _ in range(10)]
        +real_targets = [torch.rand_like(static_target) for _ in range(10)]
        +
        +for data, target in zip(real_inputs, real_targets):
        +    # Fills the graph's input memory with new data to compute on
        +    static_input.copy_(data)
        +    static_target.copy_(target)
        +    # replay() includes forward, backward, and step.
        +    # You don't even need to call optimizer.zero_grad() between iterations
        +    # because the captured backward refills static .grad tensors in place.
        +    g.replay()
        +    # Params have been updated. static_y_pred, static_loss, and .grad
        +    # attributes hold values from computing on this iteration's data.
        +
        + +

        If some of your network is unsafe to capture (e.g., due to dynamic control flow, dynamic shapes, CPU syncs, or essential CPU-side logic), you can run the unsafe part(s) eagerly and use torch.cuda.make_graphed_callables to graph only the capture-safe part(s). This is demonstrated next.

        + +

        make_graphed_callables accepts callables (functions or nn.Modules) and returns graphed versions. By default, callables returned by make_graphed_callables are autograd-aware, and can be used in the training loop as direct replacements for the functions or nn.Modules you passed. make_graphed_callables internally creates CUDAGraph objects, runs warmup iterations, and maintains static inputs and outputs as needed. Therefore (unlike with torch.cuda.graph), you don’t need to handle those manually.

        + +

        In the following example, data-dependent dynamic control flow means the network isn’t capturable end-to-end, but make_graphed_callables() lets us capture and run graph-safe sections as graphs regardless:

        + +
        import torch
        +from itertools import chain
        +
        +N, D_in, H, D_out = 640, 4096, 2048, 1024
        +
        +module1 = torch.nn.Linear(D_in, H).cuda()
        +module2 = torch.nn.Linear(H, D_out).cuda()
        +module3 = torch.nn.Linear(H, D_out).cuda()
        +
        +loss_fn = torch.nn.MSELoss()
        +optimizer = torch.optim.SGD(chain(module1.parameters(),
        +                                  module2.parameters(),
        +                                  module3.parameters()),
        +                            lr=0.1)
        +
        +# Sample inputs used for capture
        +# requires_grad state of sample inputs must match
        +# requires_grad state of real inputs each callable will see.
        +x = torch.randn(N, D_in, device='cuda')
        +h = torch.randn(N, H, device='cuda', requires_grad=True)
        +
        +module1 = torch.cuda.make_graphed_callables(module1, (x,))
        +module2 = torch.cuda.make_graphed_callables(module2, (h,))
        +module3 = torch.cuda.make_graphed_callables(module3, (h,))
        +
        +real_inputs = [torch.rand_like(x) for _ in range(10)]
        +real_targets = [torch.randn(N, D_out, device="cuda") for _ in range(10)]
        +
        +for data, target in zip(real_inputs, real_targets):
        +    optimizer.zero_grad(set_to_none=True)
        +
        +    tmp = module1(data)  # forward ops run as a graph
        +
        +    if tmp.sum().item() > 0:
        +        tmp = module2(tmp)  # forward ops run as a graph
        +    else:
        +        tmp = module3(tmp)  # forward ops run as a graph
        +
        +    loss = loss_fn(tmp, target)
        +    # module2's or module3's (whichever was chosen) backward ops,
        +    # as well as module1's backward ops, run as graphs
        +    loss.backward()
        +    optimizer.step()
        +
        + +

        Example use cases

        +

        MLPerf v1.0 training workloads

        + +

        The PyTorch CUDA graphs functionality was instrumental in scaling NVIDIA’s MLPerf training v1.0 workloads (implemented in PyTorch) to over 4000 GPUs, setting new records across the board. We illustrate below two MLPerf workloads where the most significant gains were observed with the use of CUDA graphs, yielding up to ~1.7x speedup.

        Model      | Number of GPUs | Speedup from CUDA graphs
        Mask R-CNN | 272            | 1.70×
        BERT       | 4096           | 1.12×
        + +

        Table 1. MLPerf training v1.0 performance improvement with PyTorch CUDA graph.

        + +

        Mask R-CNN

        + +

        Deep learning frameworks use GPUs to accelerate computations, but a significant amount of code still runs on CPU cores. CPU cores process meta-data like tensor shapes in order to prepare the arguments needed to launch GPU kernels. Processing meta-data is a fixed cost, while the cost of the computational work done by the GPUs is positively correlated with batch size. For large batch sizes, CPU overhead is a negligible percentage of total run time cost, but at small batch sizes CPU overhead can become larger than GPU run time. When that happens, GPUs go idle between kernel calls. This issue can be identified on the NSight timeline plot in Figure 3, which shows the “backbone” portion of Mask R-CNN with a per-GPU batch size of 1 before graphing. The green portion shows CPU load while the blue portion shows GPU load. In this profile we see that the CPU is maxed out at 100% load while the GPU is idle most of the time; there is a lot of empty space between GPU kernels.

        + +

        +NSight timeline plot of Mask R-CNN shows that the CPU is maxed out at 100% load while GPU is idle most of the time, and a lot of empty space between GPU kernels +
        + Figure 3: NSight timeline plot of Mask R-CNN +

        + +

        CUDA graphs can automatically eliminate CPU overhead when tensor shapes are static. A complete graph of all the kernel calls is captured during the first step; in subsequent steps the entire graph is launched with a single op, eliminating all the CPU overhead, as observed in Figure 4.

        + +

        +With CUDA graph, the entire graph is launched with a single op, eliminating all the CPU overhead +
        + Figure 4: CUDA graphs optimization +

        + +

        With graphing, we see that the GPU kernels are tightly packed and GPU utilization remains high. The graphed portion now runs in 6 ms instead of 31 ms, a speedup of 5x. We did not graph the entire model, mostly just the ResNet backbone, which resulted in an overall speedup of ~1.7x. In order to increase the scope of the graph, we made some changes in the software stack to eliminate some of the CPU-GPU synchronization points. In MLPerf v1.0, this work included changing the implementation of the torch.randperm function to use CUB instead of Thrust, because the latter is a synchronous C++ template library. These improvements are available in the latest NGC container.

        + +

        BERT

        + +

        Similarly, by graph capturing the model, we eliminate CPU overhead and the accompanying synchronization overhead. The CUDA graphs implementation results in a 1.12x performance boost for our max-scale BERT configuration. To maximize the benefits from CUDA graphs, it is important to keep the scope of the graph as large as possible. To achieve this, we modified the model script to remove CPU-GPU synchronizations during execution such that the full model can be graph captured. Furthermore, we also made sure that the tensor sizes during execution are static within the scope of the graph. For instance, in BERT, only a specific subset of the total tokens contributes to the loss function, determined by a pre-generated mask tensor. Extracting the indices of valid tokens from this mask, and using these indices to gather the tokens that contribute to the loss, results in a tensor with a dynamic shape, i.e. a shape that is not constant across iterations. In order to make sure tensor sizes are static, instead of using dynamic-shape tensors in the loss computation, we used static-shape tensors where a mask indicates which elements are valid. As a result, all tensor shapes are static. Dynamic shapes also require CPU-GPU synchronization, since they involve the framework’s memory management on the CPU side. With static-only shapes, no CPU-GPU synchronizations are necessary. This is shown in Figure 5.
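
        The following is a hedged, minimal sketch (not the actual MLPerf BERT code) contrasting the dynamic-shape gather with the static-shape masked loss described above; the sizes and the 0.85 threshold are made up for illustration, and it assumes a CUDA device:

        import torch
        import torch.nn.functional as F

        batch, seq, vocab = 8, 128, 1000
        logits = torch.randn(batch, seq, vocab, device='cuda')
        labels = torch.randint(0, vocab, (batch, seq), device='cuda')
        mask = torch.rand(batch, seq, device='cuda') > 0.85   # pre-generated valid-token mask

        # Dynamic shape: the number of valid tokens changes every iteration,
        # and nonzero()/advanced indexing forces a CPU-GPU sync.
        idx = mask.nonzero(as_tuple=True)
        loss_dynamic = F.cross_entropy(logits[idx], labels[idx])

        # Static shape: compute a per-token loss over the full, fixed-size tensor
        # and zero out invalid positions with the mask -- no sync, graph-safe.
        per_token = F.cross_entropy(logits.view(-1, vocab), labels.view(-1),
                                    reduction='none').view(batch, seq)
        loss_static = (per_token * mask).sum() / mask.sum()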

        + +

        + Synchronization free training eliminates CPU synchronization +
        + Figure 5. By using a fixed size tensor and a boolean mask as described in the text, we are able to eliminate CPU synchronizations needed for dynamic sized tensors +

        + +

        CUDA graphs in NVIDIA DL examples collection

        + +

        Single GPU use cases can also benefit from using CUDA Graphs. This is particularly true for workloads launching many short kernels with small batches. A good example is training and inference for recommender systems. Below we present preliminary benchmark results for NVIDIA’s implementation of the Deep Learning Recommendation Model (DLRM) from our Deep Learning Examples collection. Using CUDA graphs for this workload provides significant speedups for both training and inference. The effect is particularly visible when using very small batch sizes, where CPU overheads are more pronounced.

        + +

        CUDA graphs are being actively integrated into other PyTorch NGC model scripts and the NVIDIA Github deep learning examples. Stay tuned for more examples on how to use it.

        + +

        + CUDA graphs optimization for the DLRM model. The impact is larger for smaller batch sizes where CPU overheads are more pronounced. +

        +

        + CUDA graphs optimization for the DLRM model. The impact is larger for smaller batch sizes where CPU overheads are more pronounced. +
        + Figure 6: CUDA graphs optimization for the DLRM model. +

        + +

        Call to action: CUDA Graphs in PyTorch v1.10

        + +

        CUDA graphs can provide substantial benefits for workloads that comprise many small GPU kernels and are hence bogged down by CPU launch overheads. This has been demonstrated in our MLPerf efforts optimizing PyTorch models. Many of these optimizations, including CUDA graphs, have been or will eventually be integrated into our PyTorch NGC model scripts collection and the NVIDIA GitHub deep learning examples. For now, check out our open-source MLPerf training v1.0 implementation, which could serve as a good starting point to see CUDA graphs in action. Alternatively, try the PyTorch CUDA graphs API on your own workloads.

        + +

        We thank the many NVIDIA and Facebook engineers for their discussions and suggestions: Karthik Mandakolathur, Tomasz Grel, Joey Conway, and Arslan Zulfiqar.

        + +

        Authors bios

        + +

        Vinh Nguyen +DL Engineer, NVIDIA

        + +

        Vinh is a Deep learning engineer and data scientist, having published more than 50 scientific articles attracting more than 2500 citations. At NVIDIA, his work spans a wide range of deep learning and AI applications, including speech, language and vision processing, and recommender systems.

        + +

        Michael Carilli +Senior Developer Technology Engineer, NVIDIA

        + +

        Michael worked at the Air Force Research Laboratory optimizing CFD code for modern parallel architectures. He holds a PhD in computational physics from the University of California, Santa Barbara. A member of the PyTorch team, he focuses on making GPU training fast, numerically stable, and easy(er) for internal teams, external customers, and PyTorch community users.

        + +

        Sukru Burc Eryilmaz +Senior Architect in Dev Arch, NVIDIA

        + +

        Sukru received his PhD from Stanford University, and B.S from Bilkent University. He currently works on improving the end-to-end performance of neural network training both at single-node scale and supercomputer scale.

        + +

        Vartika Singh +Tech Partner Lead for DL Frameworks and Libraries, NVIDIA

        + +

        Vartika has led teams working in confluence of cloud and distributed computing, scaling and AI, influencing the design and strategy of major corporations. She currently works with the major frameworks and compiler organizations and developers within and outside NVIDIA, to help the design to work efficiently and optimally on NVIDIA hardware.

        + +

        Michelle Lin +Product Intern, NVIDIA

        + +

        Michelle is currently pursuing an undergraduate degree in Computer Science and Business Administration at UC Berkeley. She is currently managing execution of projects such as conducting market research and creating marketing assets for Magnum IO.

        + +

        Natalia Gimelshein +Applied Research Scientist, Facebook

        + +

        Natalia Gimelshein worked on GPU performance optimization for deep learning workloads at NVIDIA and Facebook. She is currently a member of the PyTorch core team, working with partners to seamlessly support new software and hardware features.

        + +

        Alban Desmaison +Research Engineer, Facebook

        + +

        Alban studied engineering and did a PhD in Machine Learning and Optimization, during which he was an OSS contributor to PyTorch prior to joining Facebook. His main responsibilities are maintaining some core library and features (autograd, optim, nn) and working on making PyTorch better in general.

        + +

        Edward Yang +Research Engineer, Facebook

        + +

        Edward studied CS at MIT and then Stanford before starting at Facebook. He is a part of the PyTorch core team and is one of the leading contributors to PyTorch.

        + +
        diff --git a/blog/accelerating-training-float8-rowwise-crusoe/index.html b/blog/accelerating-training-float8-rowwise-crusoe/index.html new file mode 100644 index 000000000000..eb50f8088e6c --- /dev/null +++ b/blog/accelerating-training-float8-rowwise-crusoe/index.html @@ -0,0 +1,828 @@

        Accelerating Large Scale Training and Convergence with PyTorch Float8 Rowwise on Crusoe 2K H200s | PyTorch

        by Meta and Crusoe

        +

        Meta: Less Wright, Hamid Shojanazeri, Vasiliy Kuznetsov, Daniel Vega-Myhre, Gokul Nadathur, Will Constable, Tianyu Liu, Tristan Rice, Driss Guessous, Josh Fromm, Luca Wehrstedt, Jiecao Yu
        Crusoe: Ethan Petersen, Martin Cala, Chip Smith

        + +

        Working with Crusoe.AI we were provided access to one of their new 2K H200 clusters in Iceland, which enabled us to showcase training accelerations of 34 - 43% at scale by leveraging TorchTitan’s HSDP2 and TorchAO’s new float8 rowwise, with comparable convergence and stability vs BF16.

        + +

        bar chart

        + +

        In this post we detail the synergy of H200’s with PyTorch’s new Float8 rowwise training with TorchTitan’s FSDP2/HSDP2 and CP at scale.

        + +

        Background - what is an H200?

        + +

        H200s are an ‘enhanced’ H100, offering the exact same compute as an H100, but with two additional improvements.

        + +
          +
        • Larger global memory, 141GiB HBM3e vs the standard 80GiB HBM3
        • +
        • Memory bandwidth is ~43% faster with 4.8TB/s vs 3.35 TB/s. The faster memory transfer has an outsized effect on training speed, especially for PyTorch’s AsyncTP.
        • +
        + +

        What is PyTorch Float8 rowwise?

        + +

        Float 8 Rowwise is a finer grained resolution for Float8 vs the previous ‘tensor wise’ Float8. It is designed to ensure finer grained accuracy to support larger workloads that tend to become more sensitive to quantization at scale and as training progresses.

        + +

        There are two key improvements with Float8 rowwise:

        + +
          +
        • Each row now maintains its own scaling factor versus a single scaling factor for the entire tensor, thus improving quantization precision. Finer grained scaling per row helps reduce the effect of outliers (extreme values that force the quantization scaling factor to stretch and degrade the precision of the normally distributed values) and thus ensures better precision.
        • +
        • The scaling factor itself is now implemented by rounding down to the nearest power of 2. This has been shown to help reduce quantization errors when multiplying/dividing by the scaling factor, as well as ensuring large values remain scaled to the same value in both the forward and backward passes (a rough numerical sketch of both ideas follows this list).
        • +
        + +
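
        A rough numerical sketch of these two ideas follows. It is purely illustrative (not the TorchAO implementation); the 448.0 e4m3 maximum and the helper name are assumptions.

        import torch

        FP8_E4M3_MAX = 448.0   # assumed max representable magnitude for torch.float8_e4m3fn

        def rowwise_fp8_quantize(t: torch.Tensor):
            # one amax and therefore one scaling factor per row, not per tensor
            amax = t.abs().amax(dim=-1, keepdim=True).clamp(min=1e-12)
            scale = FP8_E4M3_MAX / amax
            # round the scale down to the nearest power of 2
            scale = torch.exp2(torch.floor(torch.log2(scale)))
            return (t * scale).to(torch.float8_e4m3fn), scale

        w = torch.randn(4096, 4096)
        w_fp8, w_scale = rowwise_fp8_quantize(w)
        w_dequant = w_fp8.to(torch.float32) / w_scale   # dequantize for inspection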

        Note that other large scale models have been trained using Float8 at 2K scale with a combination of 1x128 groupwise and 128x128 blockwise, with power of 2 scaling factors. They had the same goal of improving Float8’s precision for supporting large scale training.

        + +

        Thus, Float8 rowwise offers a similar promise to enable Float8 for very large scale training, but we wanted to provide proof of stability and convergence at scale, for which training on the Crusoe H200 2K cluster provided initial verification.

        + +

        Showcasing Float8 Rowwise Loss convergence vs BF16 at 1600 and 1920 GPU Scale:

        + +

        In order to verify comparable loss convergence, we ran two separate runs at both 1920 and then 1600 (1.6K) GPU scale using TorchTitan and Llama 3 70B. The 1.6K GPU runs were set for 2.5K iterations, using TorchTitan’s HSDP2 and Context Parallel to enable 2D parallelism.

        + +

        The loss convergence tests were run using Titan’s deterministic mode - this mode effectively freezes most potential sources of variation from run to run, and thus helps ensure that the only substantial change is what we want to test, namely the loss convergence and loss curves of BF16 vs Float8 Rowwise.

        + +

        Note that deterministic mode also slows down training speed because various kernels will not be autotuned to maximize throughput (otherwise we risk using different kernels between runs and introducing variance).
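
        TorchTitan’s deterministic mode is its own implementation; as a rough, generic sketch, the kind of PyTorch-level knobs typically involved in pinning down run-to-run variation look like this:

        import torch

        torch.manual_seed(0)                            # fix RNG state across runs
        torch.cuda.manual_seed_all(0)
        torch.use_deterministic_algorithms(True)        # error out on nondeterministic ops
        torch.backends.cudnn.benchmark = False          # disable autotuned kernel selection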

        + +

        Two runs were completed, one with BF16 and the other with Float8 Rowwise.

        + +

        Both runs completed their assigned 2.5k iters without issue, showcasing the Crusoe cluster stability, with FP8 completing at exactly 24 hours and BF16 finishing after 31 hours, 19 minutes.

        DType          | Time / Iters                | Loss
        BF16           | 24 hours                    | 3.15453
        Float8 Rowwise | 24 hours                    | 2.86386
        BF16           | 31 hours, 19 minutes / 2.5K | 2.88109
        Float8 Rowwise | 24 hours / 2.5K             | 2.86386
        + +

        At the 24 hour mark, Float8 had completed its 2.5K iterations, showcasing the comparative speedup (even in deterministic mode) of Float8 training. For the same 24 hours of large scale training time, Float8 enabled a 9.21% relative improvement in loss compared to BF16.

        + +

        After 31 hours, 19 minutes, the BF16 run finally completed its 2.5k iters.

        + +

        The final loss numbers:
        BF16 = 2.88109
        Float8 = 2.86386

        + +

        From the loss curves we observed very similar curves at the first and last ⅓ and then a turbulent zone in the middle where both showed similar spikes, but with a slight skew to the relative timing of the spikes.

        + +

        line chart

        + +

        As a result of this, we can see that PyTorch’s Float8 rowwise offers similar convergence but over 33% speedup for the same amount of training time.

        + +

        Long Term Training stability with Float8 Rowwise

        + +

        Beyond showcasing comparable convergence, we also wanted to show longer term training stability with Float8 and thus we launched a 4 day, 15K run at 256 scale.

        + +

        line chart

        + +

        As shown above, Float8 training ran for over 100 hours with no issues, highlighting the long term stability of Float8 Rowwise.

        + +

        Determinism in TorchTitan

        + +

        To verify determinism and to see if the spikiness in the longer runs was due to scale, we also ran a smaller experiment comprising 2 runs of BF16 and 1 run of Float8 at 256 scale, with HSDP2 only (i.e. without 2D Context Parallel).

        + +

        In this case both BF16 runs had identical curves and final loss, and we saw a similar spikiness zone for all three runs.

        + +

        At the 2K iteration mark, both Float8 and BF16 ended at nearly identical points:
        BF16 (both runs) = 3.28538
        Float8 rowwise = 3.28203

        + +

        line chart

        + +

        The above result confirms that neither CP nor scale (2K) is responsible for the spikiness in the loss, as we saw a similar effect at 256 scale as well. The most likely explanation for the loss spikes is content distribution in the dataset.

        + +

        For the sake of determinism, the experiments were run with a serialized C4 dataset (not shuffled), meaning the spikes could be from encountering new content within the dataset.

        + +

        Net speedups at various Scales with Float8 rowwise:

        + +

        We performed shorter runs at various GPU scales to understand how Float8 Rowwise would scale in terms of training acceleration as cluster sizes expanded. Doubling in scale from 960 to 1920 GPUs, Float8 continued to deliver impressive training speedups, with a range of 34-43% gains compared to BF16. We also want to note that when scaling from 1K to 2K GPUs, communication overhead likely kicked in and we observed a 4% hit on throughput with BF16.

        + +

        bar chart

        + +

        As shown in the longer training runs at scale above, Float8 rowwise delivered substantial speedups with equal or even slightly improved loss endpoints while delivering 34% speedups at 1920 (DeepSeek) scale.

        + +

        How can I use Float8 Rowwise in my training?

        + +

        Float8 Rowwise is available now for you to use in your large scale training. It is packaged in TorchAO’s latest builds (0.9 and higher) and integrated into TorchTitan natively if you want to get up and running quickly.

        + +

        To activate Float8 Rowwise in TorchTitan:

        + +

        First, enable the model converter to hot-swap the nn.Linear layers into float8 linear layers in your model’s .toml file - see line 29:

        + +

        code

        + +

        Secondly, specify the ‘rowwise’ float8 recipe - see line 72:

        + +

        code

        + +

        Note that you have three choices for the ‘recipe_name’:

        + +
          +
        • rowwise which is the recommended default,
        • +
        • tensorwise (the older style float8) and
        • +
        • rowwise_with_gw_hp.
        • +
        + +

        The rowwise_with_gw_hp option keeps the gradients with respect to the weights in BF16 precision during the backwards pass, and this can further enhance float8 precision for extremely sensitive workloads. But it can, ironically, be a bit more performant than generic rowwise if the majority of the matmul sizes in your model are smaller (with an estimated tipping point at roughly 13-16K dimensions on H100).

        + +

        Thus while we recommend rowwise as the default, it may be worth comparing with gw_hp on your model to verify which provides the best performance, with an upside of even greater precision.

        + +

        By toggling the model converter on and off with a #, you can directly compare training acceleration between BF16 and Float8 Rowwise to understand the potential speedups for your own training.

        + +

        Future Updates:

        + +

        We’ll have an additional update coming showcasing multiple improvements for Pipeline Parallel and Async Distributed Checkpointing so please stay tuned.

        + +
        diff --git a/blog/accelerating-training-on-nvidia-gpus-with-pytorch-automatic-mixed-precision/index.html b/blog/accelerating-training-on-nvidia-gpus-with-pytorch-automatic-mixed-precision/index.html new file mode 100644 index 000000000000..ec8a7229d8d0 --- /dev/null +++ b/blog/accelerating-training-on-nvidia-gpus-with-pytorch-automatic-mixed-precision/index.html @@ -0,0 +1,770 @@

        Introducing native PyTorch automatic mixed precision for faster training on NVIDIA GPUs | PyTorch

        by Mengdi Huang, Chetan Tekur, Michael Carilli

        +

        Most deep learning frameworks, including PyTorch, train with 32-bit floating point (FP32) arithmetic by default. However this is not essential to achieve full accuracy for many deep learning models. In 2017, NVIDIA researchers developed a methodology for mixed-precision training, which combined single-precision (FP32) with half-precision (e.g. FP16) format when training a network, and achieved the same accuracy as FP32 training using the same hyperparameters, with additional performance benefits on NVIDIA GPUs:

        + +
          +
        • Shorter training time;
        • +
        • Lower memory requirements, enabling larger batch sizes, larger models, or larger inputs.
        • +
        + +

        In order to streamline the user experience of training in mixed precision for researchers and practitioners, NVIDIA developed Apex in 2018, which is a lightweight PyTorch extension with Automatic Mixed Precision (AMP) feature. This feature enables automatic conversion of certain GPU operations from FP32 precision to mixed precision, thus improving performance while maintaining accuracy.

        + +

        For the PyTorch 1.6 release, developers at NVIDIA and Facebook moved mixed precision functionality into PyTorch core as the AMP package, torch.cuda.amp. torch.cuda.amp is more flexible and intuitive compared to apex.amp. Some of apex.amp’s known pain points that torch.cuda.amp has been able to fix:

        + +
          +
        • Guaranteed PyTorch version compatibility, because it’s part of PyTorch
        • +
        • No need to build extensions
        • +
        • Windows support
        • +
        • Bitwise accurate saving/restoring of checkpoints
        • +
        • DataParallel and intra-process model parallelism (although we still recommend torch.nn.DistributedDataParallel with one GPU per process as the most performant approach)
        • +
        • Gradient penalty (double backward)
        • +
        • torch.cuda.amp.autocast() has no effect outside regions where it’s enabled, so it should serve cases that formerly struggled with multiple calls to apex.amp.initialize() (including cross-validation) without difficulty. Multiple convergence runs in the same script should each use a fresh GradScaler instance, but GradScalers are lightweight and self-contained so that’s not a problem.
        • +
        • Sparse gradient support
        • +
        + +

        With AMP being added to PyTorch core, we have started the process of deprecating apex.amp. We have moved apex.amp to maintenance mode and will support customers using apex.amp. However, we highly encourage apex.amp customers to transition to using torch.cuda.amp from PyTorch Core.

        + +

        Example Walkthrough

        +

        Please see official docs for usage:

        + + +

        Example:

        + +
        import torch
        +# Creates once at the beginning of training
        +scaler = torch.cuda.amp.GradScaler()
        +
        +for data, label in data_iter:
        +   optimizer.zero_grad()
        +   # Casts operations to mixed precision
        +   with torch.cuda.amp.autocast():
        +      loss = model(data)
        +
        +   # Scales the loss, and calls backward()
        +   # to create scaled gradients
        +   scaler.scale(loss).backward()
        +
        +   # Unscales gradients and calls
        +   # or skips optimizer.step()
        +   scaler.step(optimizer)
        +
        +   # Updates the scale for next iteration
        +   scaler.update()
        +
        + +

        Performance Benchmarks

        +

        In this section, we discuss the accuracy and performance of mixed precision training with AMP on the latest NVIDIA GPU A100 and also previous generation V100 GPU. The mixed precision performance is compared to FP32 performance, when running Deep Learning workloads in the NVIDIA pytorch:20.06-py3 container from NGC.

        + +

        Accuracy: AMP (FP16), FP32

        +

        The advantage of using AMP for Deep Learning training is that the models converge to a similar final accuracy while providing improved training performance. To illustrate this point, for ResNet-50 v1.5 training, we see the following accuracy results, where higher is better. Please note that the below accuracy numbers are sample numbers that are subject to run to run variance of up to 0.4%. Accuracy numbers for other models including BERT, Transformer, ResNeXt-101, Mask-RCNN, and DLRM can be found at the NVIDIA Deep Learning Examples GitHub.

        + +

        Training accuracy: NVIDIA DGX A100 (8x A100 40GB)

        epochs | Mixed Precision Top 1 (%) | TF32 Top 1 (%)
        90     | 76.93                     | 76.85
        + +

        Training accuracy: NVIDIA DGX-1 (8x V100 16GB)

        epochs | Mixed Precision Top 1 (%) | FP32 Top 1 (%)
        50     | 76.25                     | 76.26
        90     | 77.09                     | 77.01
        250    | 78.42                     | 78.30
        + +

        Speedup Performance:

        + +

        FP16 on NVIDIA V100 vs. FP32 on V100

        +

        AMP with FP16 is the most performant option for DL training on the V100. In Table 1, we can observe that for various models, AMP on V100 provides a speedup of 1.5x to 5.5x over FP32 on V100 while converging to the same final accuracy.

        + +
        + +
        +

        Figure 2. Performance of mixed precision training on NVIDIA 8xV100 vs. FP32 training on 8xV100 GPU. Bars represent the speedup factor of V100 AMP over V100 FP32. The higher the better.

        + +

        FP16 on NVIDIA A100 vs. FP16 on V100

        + +

        AMP with FP16 remains the most performant option for DL training on the A100. In Figure 3, we can observe that for various models, AMP on A100 provides a speedup of 1.3x to 2.5x over AMP on V100 while converging to the same final accuracy.

        + +
        + +
        +

        Figure 3. Performance of mixed precision training on NVIDIA 8xA100 vs. 8xV100 GPU. Bars represent the speedup factor of A100 over V100. The higher the better.

        + +

        Call to action

        +

        AMP provides a healthy speedup for Deep Learning training workloads on Nvidia Tensor Core GPUs, especially on the latest Ampere generation A100 GPUs. You can start experimenting with AMP enabled models and model scripts for A100, V100, T4 and other GPUs available at NVIDIA deep learning examples. NVIDIA PyTorch with native AMP support is available from the PyTorch NGC container version 20.06. We highly encourage existing apex.amp customers to transition to using torch.cuda.amp from PyTorch Core available in the latest PyTorch 1.6 release.

        + +
        diff --git a/blog/accelerating-triton/index.html b/blog/accelerating-triton/index.html new file mode 100644 index 000000000000..255769464002 --- /dev/null +++ b/blog/accelerating-triton/index.html @@ -0,0 +1,842 @@

        Accelerating Triton Dequantization Kernels for GPTQ | PyTorch

        by Less Wright, Adnan Hoque (IBM)

        +

        TL;DR

        + +

        Leveraging a first principles approach, we showcase a step by step process undertaken to accelerate the current Triton GPTQ kernels by 3x (core GPTQ) and 6x (AutoGPTQ). Example: 275us to 47us on a typical Llama style inference input. The goal is to provide a helpful template for accelerating any given Triton kernel. We provide a background on Triton and GPTQ quantization and dequantization process, showcase the impact of coalesced memory access to improve shared and global memory throughput, highlight changes made to reduce warp stalling to improve total throughput, and an overview on integrating Triton kernels into PyTorch code. Longer term, we hope to surpass the existing CUDA native GPTQ kernel with our Triton kernel.

        + +

        Fig 1: Performance benchmarking the optimized AutoGTPQ kernel vs the current AutoGPTQ kernel on H100

        + +

        Fig 1: Performance benchmarking the optimized AutoGTPQ kernel vs the current AutoGPTQ kernel on H100

        + +

        Fig 2: Performance benchmarking the newly optimized AutoGTPQ kernel vs the current AutoGPTQ kernel on A100

        + +

        Fig 2: Performance benchmarking the newly optimized AutoGTPQ kernel vs the current AutoGPTQ kernel on A100

        + +

        Fig 3: Even with these improvements, there remains a gap between our optimized Triton kernel and the CUDA native AutoGTPQ kernel on A100.

        + +

        Fig 3: Even with these improvements, there remains a gap between our optimized Triton kernel and the CUDA native AutoGTPQ kernel on A100. More to come…

        + +

        1.0 Introduction to Triton

        + +

        The Triton framework provides a hardware agnostic way of programming and targeting GPUs, currently supporting both NVIDIA and AMD, with support for additional hardware vendors in progress. Triton is now a mainstay for PyTorch 2.0 as torch.compile decomposes eager PyTorch and re-assembles it into a high percentage of Triton kernels with PyTorch connecting code.

        + +

        As Triton becomes more widely adopted, it will be essential that programmers understand how to systematically step through the Triton stack (from the high level Python down to the low-level SASS) to address performance bottlenecks in order to optimize GPU efficiency for algorithms that go beyond torch.compile generated kernels.

        + +

        In this post, we will introduce some core concepts of the Triton programming language, how to identify common performance limiters in GPU kernels, and in parallel, tune a quantization kernel used in AutoGPTQ that can be used for high throughput inference applications.

        + +

        Intro to GPTQ Quantization and Dequantization

        + +

        GPTQ is a quantization algorithm that is able to compress ultra-large (175B+) LLMs efficiently to int4 bit representation, via approximate second order information (Hessian inverse). AutoGPTQ is a framework built on GPTQ, allowing for rapid dequantization and inference/serving of LLMs that have been quantized with GPTQ.

        + +

        As part of the AutoGPTQ stack, they provide a Triton GPTQ kernel to handle the dequantization of a model for inference.

        + +

        The basic process for INT quantization is shown below and involves determining the scale and zero point, and then computing the quantized 4bit Weight using the Scale and Zero point:

        + +

        The basic process for INT quantization

        + +

        We thus store the 4 Bit weights along with the meta information of Scale and ZeroPoint for each group of weights.

        + +

        To ‘dequant’ these weights, we do the following:

        + +

        To ‘dequant’ these weights

        + +

        And then proceed to Matrix Multiply the dequantized weights with the dense input feature matrix for this linear layer.
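
        A hedged, purely illustrative sketch of this quantize/dequantize math in plain PyTorch (not the AutoGPTQ kernel; the group size and shapes are assumptions, and it stays in FP32 for simplicity even though the real kernel computes in FP16) is:

        import torch

        def quantize_int4(w: torch.Tensor, group_size: int = 128):
            # w: (out_features, in_features); quantize per group along the input dim
            groups = w.reshape(-1, group_size)
            w_min = groups.amin(dim=1, keepdim=True)
            w_max = groups.amax(dim=1, keepdim=True)
            scale = (w_max - w_min) / 15.0                       # 4-bit range is 0..15
            zero = torch.round(-w_min / scale)
            q = torch.clamp(torch.round(groups / scale) + zero, 0, 15)
            return q.to(torch.uint8), scale, zero

        def dequantize_int4(q, scale, zero):
            return (q.float() - zero) * scale

        w = torch.randn(4096, 4096)
        q, scale, zero = quantize_int4(w)
        w_hat = dequantize_int4(q, scale, zero).reshape(4096, 4096)

        x = torch.randn(8, 4096)          # dense activation (the A matrix)
        y = x @ w_hat.t()                 # matmul with the dequantized weights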

        + +

        2.0 Identify the Bottlenecks - Optimizing Matrix Multiplication

        + +

        As it turns out, making a fast matrix multiplication kernel is not trivial. A naively implemented matrix multiply will rarely reach peak throughput performance on highly parallel machines like GPUs. So we need to tackle the compute and memory subsystems of our GPU in a hierarchical fashion to make sure we are maximally utilizing each resource.

        + +

        We start our optimization process by running the unoptimized Triton kernel through the Nvidia Nsight Compute tool and taking note of some important metrics and warnings:

        + +

        some important metrics and warnings

        + +

        Fig xy (todo)

        + +

        some important metrics and warnings

        + +

        We notice first that both compute and memory throughput are low, 7.40% and 21.19% respectively (fig xy) . Knowing that for typical inference matrix problem sizes, we are in the memory bound regime, we will attempt to optimize the kernel by applying code changes that target the memory subsystem of our A100 GPU.

        + +

        The three topics this post will cover are:

        + +
          +
        1. L2 Optimization
        2. +
        3. Vectorized Load
        4. +
        5. Warp Stalling
        6. +
        + +

        Let’s walk through each topic, make the appropriate changes, and see their corresponding impact on our Triton kernel. This Triton kernel is a fused dequantization kernel that dequantizes a packed int32 weight tensor (we will refer to this as the B matrix) into int4 weights, performs matrix multiplication with the activation tensor (referred to as the A matrix) in FP16 mode, and then stores the results back to a matrix C.

        + +

        The above is referred to as W4A16 quantization. Keep in mind that the process we describe can and should be used for the development of any GPU kernel, as these are common bottlenecks in any unoptimized kernel.

        + +

        3.0 L2 Optimization

        + +

        This optimization already exists in the AutoGPTQ kernel, but we’d like to dedicate a section to this to help readers better understand how mapping and execution order of thread blocks is handled in Triton. Thus, we will step through a naive mapping and then a more optimal mapping to see its corresponding impact.

        + +

        Let’s build up our kernel naively, starting with a “linear” load from global memory and then compare it to a more optimized “swizzled” load. Linear vs Swizzled determines the execution order of our grid of work on the GPU. Let’s take a look at the hints that the Nvidia Nsight Compute Tool provides regarding our kernels shared memory access pattern in the naive case:

        + +

        the hints from the Nvidia Nsight Compute Tool

        + +

        To tackle this issue we can use an approach referred to as “tile-swizzling.” The idea of this method is to launch our thread blocks in a more L2 cache friendly order.

        + +

        Let’s take a step back and familiarize ourselves with some Triton semantics, making a simple CUDA analogy to understand the concept better. Triton kernels launch “programs”. These so-called programs map to the concept of a thread block in CUDA, and a program is the basic unit of parallelism in a Triton kernel. Every program has an associated “pid”, and all the threads in a program are guaranteed to be executing the same instruction.

        + +

        The Triton programs will be distributed onto your SMs in a naive-way if you do a simple linear mapping of “pid” to a 2D grid location of your output matrix C.

        + +

        This 2D grid location is determined by pid_m and pid_n in Triton. We would like to exploit data and cache locality in the L2 cache of our GPU, when we distribute our grid of work. To do this in Triton we can make the following changes:

        + +

        To do this in Triton

        + +

        The code highlighted in red would be the naive “linear” tile ordering, and the code highlighted in green is the “swizzled” tile ordering. This type of launch promotes a sense of locality. Here is a visual to help understand this better.

        + +

        a sense of locality
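
        Since the changed code above is shown only as an image, here is a hedged, CPU-runnable sketch of the same index arithmetic; the grouped ordering follows the standard Triton matmul tutorial pattern, and a real kernel would compute pid from tl.program_id(0):

        def linear_order(pid, num_pid_m, num_pid_n):
            # naive row-major walk over the output tiles of C
            return pid // num_pid_n, pid % num_pid_n

        def swizzled_order(pid, num_pid_m, num_pid_n, GROUP_M=8):
            # launch GROUP_M rows of tiles at a time so neighboring programs
            # reuse the same A/B tiles while they are still hot in L2
            num_pid_in_group = GROUP_M * num_pid_n
            group_id = pid // num_pid_in_group
            first_pid_m = group_id * GROUP_M
            group_size_m = min(num_pid_m - first_pid_m, GROUP_M)
            pid_m = first_pid_m + (pid % group_size_m)
            pid_n = (pid % num_pid_in_group) // group_size_m
            return pid_m, pid_n

        num_pid_m, num_pid_n = 16, 16
        print([linear_order(p, num_pid_m, num_pid_n) for p in range(8)])    # walks along one row of tiles
        print([swizzled_order(p, num_pid_m, num_pid_n) for p in range(8)])  # walks down a group of rows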

        + +

        After incorporating this change, the profiler no longer complains about uncoalesced memory accesses. Let’s take a look at how our memory throughput has changed:

        + +

        how our memory throughput has changed

        + +

        This change was tested on a simple load store kernel. Looking at the GPU speed of light statistics section in the profiler, we also see a 112.07% increase in the memory throughput of the simple load kernel, which is what we were after with this optimization. Again, this optimization already exists in the AutoGPTQ kernel, but it is the boilerplate logic that every Triton kernel programmer has to write at the beginning of their kernel, before any of the exciting dequantization or matrix multiply logic. It is thus important to understand that:

        + +
        1. This mapping is not unique.
        2. Triton does not automatically handle this kind of optimization for the programmer, and careful thought must be taken to ensure your kernel is optimally handling shared memory accesses.

        These points are not obvious to those new to Triton, as much of the shared memory access optimization is handled by the Triton compiler. However, in the cases where it is not handled by the compiler, it is important to understand what tools and methods are available to influence memory behavior.

        + +

        4.0 Vectorized Load

        + +

        Now, back to the original complaints of our unoptimized kernel. We want to optimize the global memory access pattern of our kernel. From the details page of the Nvidia Nsight compute tool, we see the following note, where the profiler is complaining about uncoalesced global memory accesses.

        + +

        Let’s dig deeper and take a look at the SASS (Assembly) Code load for an unoptimized memory read:

        + +

        an unoptimized memory read

        + +

        This load operation resulted in 32 global load operations that are 16 bit wide. This is not optimal.

        + +

        We would like to do our global memory loads in a vectorized way so that they result in the fewest possible load instructions. To achieve this, we can give the Triton compiler some help.

        + +


        + +
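As a rough sketch of the kind of hint involved (shown here on a simple load-store kernel; the variable names are assumptions, not the exact AutoGPTQ code):

import triton
import triton.language as tl

@triton.jit
def copy_kernel(src_ptr, dst_ptr, n_elements, BLOCK_SIZE: tl.constexpr):
    pid = tl.program_id(axis=0)
    offsets = pid * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
    # Compiler hints (the "green" lines): the offsets are aligned to BLOCK_SIZE and
    # BLOCK_SIZE consecutive elements are contiguous, so the loads can be coalesced
    # into a small number of wide (e.g. 128-bit) instructions.
    offsets = tl.max_contiguous(tl.multiple_of(offsets, BLOCK_SIZE), BLOCK_SIZE)
    vals = tl.load(src_ptr + offsets, mask=offsets < n_elements)
    tl.store(dst_ptr + offsets, vals, mask=offsets < n_elements)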

The green highlighted lines above act as a compiler hint. They tell the compiler that these elements are contiguous in memory and that this load operation can be coalesced.

        + +

        Let’s see the effect in assembly after adding these lines.

        + +


        + +

The load is now performed in 4 global load operations that are each 128 bits wide, instead of 32 global load operations that are each 16 bits wide. This means 28 fewer memory fetch instructions and, importantly, a coalesced memory access. This can be seen from the fact that a single thread is no longer accessing consecutive memory addresses, which, without the compiler hint, was the behavior.

        + +

The resulting effect is a 73x speedup in an isolated load operation, and after incorporating it into the full dequantization kernel we were able to see another 6% speedup. Another step in the right direction!

        + +

        5.0 Warp Stalling

        + +


        + +

        Now putting all the changes back into our full dequantization kernel, we see the following performance limiter, warp stalling.

        + +

        These warp stalls are mostly caused by ‘Long Scoreboard’ stalls, accounting for 92.63% of the total.

        + +

At a high level, long scoreboard stalls happen when a warp requires data that is not yet ready, and therefore cannot be in the “issued” state. In other words, GPUs are throughput machines, and we need to hide the latency of load instructions with compute instructions. By loading more data and rearranging where the load instructions sit in the kernel, we can take care of this problem.

        + +

        In an ideal scenario, each warp scheduler would be able to issue 1 instruction every clock cycle. Note - Every SM on an A100 GPU has 4 warp schedulers.

        + +

However, our kernel has bottlenecks and is spending 4.4 cycles in the stall state with the block size that the AutoGPTQ Triton kernel deems optimal, given its presets.

        + +

        How do we improve this?

        + +

We want to increase our memory throughput so that, when a warp issues an instruction, we are less likely to be waiting for loads to arrive in SRAM before they can be used for computation. We played around with multiple parameters (such as the number of pipeline stages and the number of warps), and the one that had the biggest impact was increasing the block size by a factor of 2 in the k dimension.
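Concretely, this kind of change lives in the kernel’s autotuning configs. A hypothetical sketch (the preset values below are illustrative, not the actual AutoGPTQ presets):

import triton

configs = [
    # Original preset (illustrative values only)
    triton.Config({"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64},
                  num_stages=4, num_warps=4),
    # Same config with the block size doubled in the k dimension
    triton.Config({"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128},
                  num_stages=4, num_warps=4),
]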

        + +

        These changes yield an immediate impact on both compute and memory throughput.

        + +


        + +

We also see the long scoreboard wait time drop significantly at the step where we shift and scale the quantized weights, which is what we identified as the original bottleneck in the source code. While there are still stalls at this line, only 68% of them are caused by long scoreboard stalls, compared to the original 92%. Ideally, we would not observe ANY stalls, so there is still work to be done here, but the reduction in stalls caused by long scoreboard tells us that our data is now ready (in L1TEX memory) to be used by an instruction that a warp wants to execute at a higher frequency than in the original kernel.

        + +


        + +

        The corresponding impact is a 1.4x speedup in the execution time of our kernel.

        + +

        6.0 Results

        + +

        By tackling all these problem areas methodically our resulting kernel is 6x faster on the Nvidia A100 GPU than if you were to use the Triton kernel AutoGPTQ provides out-of-the-box.

        + +

        Taking a relevant Llama inference sample data point, the Triton kernel we’ve developed takes 47us to perform dequantization and matrix multiplication compared to the 275us taken by the AutoGPTQ kernel for the same matrix size.

        + +

By replicating this step-by-step approach, it should be possible to get similar speedups in other kernels and build understanding of common GPU bottlenecks and how to tackle them.

        + +

        It is important to note that while strides have been made in improving the performance of the AutoGPTQ Triton Kernel, we have still not closed the gap on the current exllamaV2 CUDA kernels found in AutoGPTQ.

        + +

        More research is required to understand how we can further optimize this kernel to match equivalent custom CUDA kernel performance.

        + +

        Summary and Future work

        + +

        Triton extends PyTorch by allowing low level GPU optimizations to be done at a higher level of abstraction than CUDA programming, with the net result that adding optimized Triton kernels can help PyTorch models run faster.

        + +

        Our goal in this post was to show an example of accelerating the GPTQ dequant kernel and provide a template workflow for how the accelerations were achieved.

        + +

        For future work, SplitK work decomposition for the matrix multiplication is a potential speed up we’ll investigate.

        + +

        Integrating custom Triton Kernels into PyTorch

        + +

        Given the acceleration shown above, a common question is how to actually use a custom kernel in a given PyTorch codebase.

        + +

A Triton kernel will contain at least two parts - the actual Triton kernel code, which will be compiled by the Triton compiler:

        + +


        + +

Along with the actual kernel code is a Python wrapper, which may or may not subclass the PyTorch autograd class, depending on whether it’s going to support a backward pass (i.e. for training purposes or only for inference purposes).

        + +

        You simply import the python class into your PyTorch code where you want to use it much like any other Python / PyTorch function.

        + +


        + +

        In this case, simply importing and then using ‘fast_qlinear’ would invoke the underlying Triton kernel with the speed-ups we’ve shown above applied to your PyTorch model.
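As a hypothetical sketch (the module path, class, and argument names below are assumptions for illustration, not the exact API from this work), the integration can be as small as:

import torch
from my_triton_kernels import fast_qlinear  # hypothetical Python wrapper around the Triton kernel

class QuantLinear(torch.nn.Module):
    def __init__(self, qweight, scales, qzeros, bias=None):
        super().__init__()
        self.register_buffer("qweight", qweight)
        self.register_buffer("scales", scales)
        self.register_buffer("qzeros", qzeros)
        self.bias = bias

    def forward(self, x):
        # Dequantization and the matrix multiply both happen inside the Triton kernel
        return fast_qlinear(x, self.qweight, self.scales, self.qzeros, self.bias)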

        + +

        Acknowledgements

        + +

        Thanks to Jamie Yang and Hao Yu from IBM Research for their technical guidance in the collection of these results.

        + +
diff --git a/blog/accelerating-whisper-arm-w-transformers/index.html b/blog/accelerating-whisper-arm-w-transformers/index.html
new file mode 100644
index 000000000000..a1b205486e00
--- /dev/null
+++ b/blog/accelerating-whisper-arm-w-transformers/index.html

Accelerating Whisper on Arm with PyTorch and Hugging Face Transformers | PyTorch

by Pareena Verma, Arm

        +

        Automatic speech recognition (ASR) has revolutionized how we interact with technology, clearing the way for applications like real-time audio transcription, voice assistants, and accessibility tools. OpenAI Whisper is a powerful model for ASR, capable of multilingual speech recognition and translation.

        + +

        A new Arm Learning Path is now available that explains how to accelerate Whisper on Arm-based cloud instances using PyTorch and Hugging Face transformers.

        + +

        Why Run Whisper on Arm?

        + +

        Arm processors are popular in cloud infrastructure for their efficiency, performance, and cost-effectiveness. With major cloud providers such as AWS, Azure, and Google Cloud offering Arm-based instances, running machine learning workloads on this architecture is becoming increasingly attractive.

        + +

        What You’ll Learn

        + +

        The Arm Learning Path provides a structured approach to setting up and accelerating Whisper on Arm-based cloud instances. Here’s what you cover:

        + +

        1. Set Up Your Environment

        + +

        Before running Whisper, you must set up your development environment. The learning path walks you through setting up an Arm-based cloud instance and installing all dependencies, such as PyTorch, Transformers, and ffmpeg.

        + +

        2. Run Whisper with PyTorch and Hugging Face Transformers

        + +

        Once the environment is ready, you will use the Hugging Face transformer library with PyTorch to load and execute Whisper for speech-to-text conversion. The tutorial provides a step-by-step approach for processing audio files and generating audio transcripts.
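A minimal sketch of this step (the checkpoint size and audio file below are placeholders, not necessarily the ones used in the Learning Path):

import torch
from transformers import pipeline

asr = pipeline(
    "automatic-speech-recognition",
    model="openai/whisper-small",   # placeholder checkpoint
    torch_dtype=torch.float32,
    device="cpu",                   # Arm-based cloud instances run inference on CPU
)
result = asr("sample_audio.wav", return_timestamps=True)
print(result["text"])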

        + +

        3. Measure and Evaluate Performance

        + +

        To ensure efficient execution, you learn how to measure transcription speeds and compare different optimization techniques. The guide provides insights into interpreting performance metrics and making informed decisions on your deployment.
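For example, a simple wall-clock measurement around the transcription call (building on the pipeline sketch above) looks like:

import time

start = time.perf_counter()
result = asr("sample_audio.wav")
elapsed = time.perf_counter() - start
print(f"Transcription took {elapsed:.2f} s")
print(result["text"][:100])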

        + +

        Try it Yourself

        + +

        Upon completion of this tutorial, you know how to:

        + +
• Deploy Whisper on an Arm-based cloud instance.
• Implement performance optimizations for efficient execution.
• Evaluate transcription speeds and optimize further based on results.
        + +

        Try the live demo today and see audio transcription in action on Arm: Whisper on Arm Demo.

        + +
diff --git a/blog/achieving-sustainability-goals/index.html b/blog/achieving-sustainability-goals/index.html
new file mode 100644
index 000000000000..21d00323f0ad
--- /dev/null
+++ b/blog/achieving-sustainability-goals/index.html

Achieving Sustainability Goals with PyTorch and Intel AI | PyTorch

by Team PyTorch

        +

        This post was contributed by Intel AI in partnership with the PyTorch Foundation.

        + +

        In 2017, the UN Global Compact emphasized digital technology, particularly open source, as crucial for achieving Sustainable Development Goals (SDGs), projecting a potential $2.1 trillion boost to the tech sector by 2030. The SDGs, part of the “2030 Agenda for Sustainable Development,” address global prosperity across various sectors.

        + +

        The Linux Foundation’s Sustainability Initiative aligns projects with sustainable development goals. By assessing project impact, resources can be better allocated for enhancement. Intel is also a contributor to this initiative, and recently presented three use cases with PyTorch and Intel AI to address UN SDG-aligned issues.

        + +

        Sustainability Goals

        + +

        SDG 15: Life on Land

        + +
• Using a bone likelihood map to pinpoint dinosaur bones, which paves the way for transfer learning to tackle contemporary challenges like wildfire prediction.
• Employing transfer learning for wildfire prediction and generating data with Stable Diffusion.
        + +

        SDG 9: Industry, Innovation, Infrastructure

        + +
• Identifying crucial minerals, oil, and gas through subsurface models.
        + +

        Here are the key highlights from the workshops. Read below for a summary, and be sure to watch the full workshop videos and visit the GitHub repositories.

        + +

        Session 1: Introduction to Dinosaur Bone Bed Maps

        + +

        Bob Chesebrough recently led a PyTorch workshop demonstrating how to create a dinosaur bone bed map for Dinosaur National Monument. He shared footage of his discoveries and explained his AI-driven approach, utilizing geological data to pinpoint possible bone-rich areas.

        + +

        Attendees learned to set up JupyterLab, access the training section, and launch a BASH shell. Bob’s classification model, applied to aerial images, facilitated heatmap generation to identify potential bone locations, refined through field data. The GitHub repo “Jurassic” guided participants through directory setup and model optimization steps.

        + +

        Rahul Unnikrishnan Nair demonstrated the use of PyTorch, focusing on performance enhancements. The workshop covered modeling best practices, such as data transformations, class distribution, dropout layers, and efficient training methods. Training and scoring procedures were examined, with a focus on model accuracy and transportability to other regions. Heatmap creation involved cutting images into tiles, considering context for accurate environmental identification.

        + +

        Watch the full workshop video here and visit the GitHub repository to access the code sample and experiment with the code using Intel ® Extension for PyTorch. Try it out with PyTorch and explore what works best for you. Happy dinosaur bone hunting!

        + +

        Session 2: Seismic Data to Subsurface Models with OpenFWI: Training an AI Model with PyTorch

        + +

        Seismic exploration is crucial for subsurface imaging in mineral and oil/gas exploration. Full waveform inversion (FWI) recreates subsurface sound wave velocities, akin to ultrasound for the Earth.

        + +

        Ben Consolvo, an AI Software Engineering Manager at Intel, presented training AI models directly from seismic data using PyTorch on Intel high-performance processors. FWI, though accurate, is computationally intensive and relies on precise initial models. AI models offer an alternative approach, learning directly from data without the need for precise initializations. Ben explained the challenges of AI models, highlighting the need for diverse datasets and the potential use of CPUs for fine-tuning. He also discussed FWI’s surprising medical applications.

        + +

        Watch the full video here and go to the paper for more details. The GitHub repo is OpenFWI.

        + +

        Session 3: Using PyTorch to Aid Wildfire Prediction

        + +

        Forest fires pose significant threats to ecosystems, wildlife, and communities. Machine learning presents a promising approach to enhance prediction accuracy. In this Earth Day webinar, Bob Chesebrough and Rahul Unnikrishnan Nair demonstrated image analysis techniques using the MODIS dataset which was used to predict early forest fire probabilities. Through fine-tuning a ResNet18 model with the Intel® Extension for PyTorch, pre-trained models were adjusted with aerial photos, utilizing geo-spatial and color data for fire risk assessment.
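A minimal sketch of that fine-tuning setup (the class count, learning rate, and dtype below are assumptions, not the workshop’s exact settings):

import torch
import torchvision
import intel_extension_for_pytorch as ipex

model = torchvision.models.resnet18(weights="IMAGENET1K_V1")
model.fc = torch.nn.Linear(model.fc.in_features, 2)   # e.g. fire / no-fire classes (assumed)
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3, momentum=0.9)

model.train()
# Apply Intel Extension for PyTorch optimizations before running the training loop
model, optimizer = ipex.optimize(model, optimizer=optimizer, dtype=torch.float32)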

        + +

The presenters emphasized the temporal and geographical filtering required for dataset analysis and showcased images from fire-affected areas like Paradise, CA. They highlighted the model’s adaptability to different hardware configurations, along with the use of Stable Diffusion to synthesize data when real datasets were unavailable. The presenters encouraged audience engagement in PyTorch experimentation for early fire detection by extending a challenge to leverage these tools for critical predictive tasks. Join them in this endeavor to enhance wildfire prevention and protection efforts.

        + +

        Watch the full video here and go to the paper for more details. The GitHub repo is ForestFirePrediction.

        + +

        About the Intel Speakers

        + +

        Bob Chesebrough, Sr Solutions Architect

        + +

Bob Chesebrough’s industry experience spans more than three decades of software development and AI solution engineering for Fortune 100 companies and national laboratories. He is also a hobbyist who has logged over 800 miles and 1,000 hours in the field finding dinosaur bones. He and his sons discovered an important fossil of the only known crocodilian from the Jurassic in New Mexico; they have also discovered and logged into the museum more than 2,000 bone localities and described a new mass bone bed in New Mexico.

        + +

        Rahul Unnikrishnan Nair, Architect in Applied AI and the Engineering Lead at Intel® Liftoff

        + +

        In his current role at Intel® Liftoff for Startups program, Rahul Nair brings his extensive experience in applied AI and engineering to mentor early-stage AI startups. His dedication lies in helping these startups transform their innovative ideas into fully-fledged, market-ready products with a strong emphasis on use-case-driven, practical engineering and optimization.

        + +

        Ben Consolvo, AI Software Engineering Manager

        + +

        Ben Consolvo is an AI Solutions Engineering Manager at Intel. He has been building a team and a program around Intel’s AI technology paired with Intel’s hardware offerings. He brings a background and passion in data science, particularly in deep learning (DL) and computer vision. He has applied his skills in DL in the cybersecurity industry to automatically identify phishing websites, as well as to the oil and gas industry to identify subsurface features for geophysical imaging.

        + +

        Kelli Belcher, AI Solutions Engineer

        + +

        Kelli Belcher is an AI Solutions Engineer at Intel with over 5 years of experience across the financial services, healthcare, and tech industries. In her current role, Kelli helps build Machine Learning solutions using Intel’s portfolio of open AI software tools. Kelli has experience with Python, R, SQL, and Tableau, and holds a Master of Science in Data Analytics from the University of Texas.

        + +
diff --git a/blog/activation-checkpointing-techniques/index.html b/blog/activation-checkpointing-techniques/index.html
new file mode 100644
index 000000000000..be772dd14c2d
--- /dev/null
+++ b/blog/activation-checkpointing-techniques/index.html

Current and New Activation Checkpointing Techniques in PyTorch | PyTorch

by Team PyTorch

        +

As models scale in depth, batch size, and sequence length, activation memory becomes an increasingly significant contributor to the overall memory usage. To help address this, PyTorch provides utilities for activation checkpointing, which reduce the number of saved tensors by recomputing them when needed, trading off memory usage for additional compute.

        + +

        In this post, we’ll walk through the basics of what activation memory is, the high-level ideas behind existing activation checkpointing techniques, and also introduce some newer techniques that aim to improve flexibility and provide more optimization/automation out of the box.

        + +

        As we look at these techniques, we’ll compare how these methods fit into a speed vs. memory trade-off diagram and hopefully provide some insight on how to choose the right strategy for your use case.

        + +

        (If you prefer to jump straight to the new APIs, please skip ahead to the “Selective Activation Checkpoint” and “Memory Budget API” sections below.)

        + +

        flow diagram

        + +
        + +

        Activation Memory Basics

        + +

        By default, in eager mode (rather than using torch.compile), PyTorch’s autograd preserves intermediate activations for backward computation. For example, if you call sin on a tensor x during the forward pass, autograd must remember x to compute cos(x) during backward.
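A toy illustration of this saving behavior:

import torch

x = torch.randn(4, requires_grad=True)
y = x.sin()              # autograd saves x so that backward can compute cos(x)
y.sum().backward()
print(torch.allclose(x.grad, x.detach().cos()))   # True: d/dx sin(x) = cos(x)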

        + +

        flow diagram

        + +

        If this tensor x is saved at the beginning of the forward pass, it remains in memory throughout both the forward and backward phases. It can only be cleared after it is used to compute the gradient, which happens at the end of the backward pass (due to the reverse order of execution).

        + +

        Thus, as you proceed through the forward pass and perform more and more operations, you accumulate more and more activations, resulting in more and more activation memory until it (typically) reaches its peak at the start of backward (at which point activations can start to get cleared).

        + +

        flow diagram

        + +

In the diagram above, the orange boxes represent operations, and the black arrows represent their tensor inputs and outputs. The black arrows that cross over to the right represent tensors that autograd saves for backward.

        + +

        A useful way to visually organize this default saving behavior in eager as well as the techniques we’re about to introduce is based on how they trade off speed versus memory.

        + +

        flow diagram

        + +

        The ideal place to be on this diagram is the top-left, where you have “high” speed but also low memory usage.

        + +

        We begin by putting the default saving behavior on the top-right (for reasons we’ll explain in more detail as we introduce more points for other techniques).

        + +
        + +

        Activation Checkpointing (AC)

        + +

        Activation checkpointing (AC) is a popular technique to reduce memory usage in PyTorch.

        + +

        During forward, any operations performed inside the AC’d region do not save tensors for backward. (Only the inputs to the function are saved.) During backward, the intermediate activations needed for gradient computation are rematerialized by running the function a second time.
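A minimal sketch of applying AC to a region with torch.utils.checkpoint:

import torch
from torch.utils.checkpoint import checkpoint

def block(x):
    # Intermediate activations inside this region are not saved for backward
    return x.sin().cos().exp()

x = torch.randn(8, requires_grad=True)
out = checkpoint(block, x, use_reentrant=False)  # only the input x is saved
out.sum().backward()                             # block is re-run here to rematerialize activations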

        + +

        flow diagram

        + +

        In the diagram (right), the black box shows where activation checkpointing is applied. Compared to the default eager approach (left), this setup results in fewer tensors being saved (1 versus 3).

        + +

        Applying AC on the right parts of the model has the effect of reducing peak memory, because the intermediate activations are no longer materialized in memory when the memory usage typically peaks (at the beginning of backward).

        + +

        On the speed-versus-memory tradeoff diagram, AC is plotted on the bottom-left. Relative to eager mode, it reduces the amount of memory saved for backward but comes with an added cost in compute due to recomputation.

        + +

        flow diagram

        + +

Note that AC’s speed–memory tradeoff can be adjusted by selecting which parts of the forward pass to checkpoint and by defining how many checkpoint regions to use. However, implementing these changes may require modifying your model’s structure and can be cumbersome depending on how your code is organized. For the purposes of this diagram, we assume only one region is checkpointed; under this assumption, AC appears as a single point on the tradeoff diagram.

        + +

Also note that “memory” here does not refer to peak memory usage; rather, it indicates how much memory is saved for backward for a fixed region.

        + +
        + +

        torch.compile and min-cut partitioner

        + +

        Another notable approach to keep in mind is torch.compile (introduced in PyTorch 2.0). Like activation checkpointing, torch.compile can also perform some level of recomputation under the hood. Specifically, it traces the forward and backward computations into a single joint graph, which is then processed by a “min-cut” partitioner. This partitioner uses a min-cut/max-flow algorithm to split the graph such that it minimizes the number of tensors that need to be saved for backward.

        + +

        At first glance, this might sound a lot like what we want for activation memory reduction. However, the reality is more nuanced. By default, the partitioner’s primary goal is to reduce runtime. As a result, it only recomputes certain types of operations—primarily simpler, fusible, and non-compute-intensive ops (like pointwise ops).

        + +

        Placing “compile” on the speed-versus-memory tradeoff diagram…

        + +

        flow diagram

        + +

        It is to the top-left of the eager non-AC point, as we expect torch.compile to improve on both speed and memory.

        + +

        On the other hand, relative to activation checkpointing, torch.compile is more conservative about what it recomputes, placing it closer to the top-left on the speed-versus-memory diagram.

        + +
        + +

        Selective Activation Checkpoint [NEW!]

        + +

        While normal checkpointing recomputes every op in a chosen region, selective activation checkpointing (SAC) is an additional setting on top of activation checkpointing that you can apply to have a more granular control over which operations to recompute.

        + +

        This can be useful if you have certain more expensive operations like matmuls which you prefer to avoid recomputing, but still generally want to recompute cheaper operations like pointwise.

        + +

        flow diagram

        + +

        Where plain AC (left) would save a single tensor and then recompute the entire AC’d region, with SAC (right) you can selectively save specific operations (marked red) in the region, so you can avoid recomputing them.

        + +

        To specify what to selectively save, you can specify a policy_fn. To illustrate the additional trade offs you can make with this, we present two simple policy functions.

        + +

        Policy 1: Not recomputing matmuls:

        + +
import torch
from torch.utils.checkpoint import CheckpointPolicy

aten = torch.ops.aten
compute_intensive_ops = [
    aten.mm,
    aten.bmm,
    aten.addmm,
]

def policy_fn(ctx, op, *args, **kwargs):
    # Save the outputs of expensive matmul-family ops; recompute everything else.
    if op in compute_intensive_ops:
        return CheckpointPolicy.MUST_SAVE
    else:
        return CheckpointPolicy.PREFER_RECOMPUTE
        + +

        flow diagram

        + +

        Policy 2: More aggressively save anything compute intensive

        + +
# torch/_functorch/partitioners.py
import torch
from torch.utils.checkpoint import CheckpointPolicy

aten = torch.ops.aten
compute_intensive_ops = [
    aten.mm,
    aten.convolution,
    aten.convolution_backward,
    aten.bmm,
    aten.addmm,
    aten._scaled_dot_product_flash_attention,
    aten._scaled_dot_product_efficient_attention,
    aten._flash_attention_forward,
    aten._efficient_attention_forward,
    aten.upsample_bilinear2d,
    aten._scaled_mm,
]

def policy_fn(ctx, op, *args, **kwargs):
    # Save anything compute intensive; recompute the (cheaper) rest.
    if op in compute_intensive_ops:
        return CheckpointPolicy.MUST_SAVE
    else:
        return CheckpointPolicy.PREFER_RECOMPUTE
        + +

        flow diagram

        + +

        On the speed-versus-memory diagram, SAC is plotted as a range of points from closer to AC to closer to Eager, depending on your chosen policy.

        + +

        flow diagram

        + +

        Try it out! (Available in 2.5 as a prototype feature; see docs for more info + copy-pastable example)

        + +
from functools import partial

from torch.utils.checkpoint import (
    CheckpointPolicy,
    checkpoint,
    create_selective_checkpoint_contexts,
)

# Create a policy function that returns a CheckpointPolicy
# (ops_to_save is whatever list of ops you want to keep, e.g. the matmul ops above)
def policy_fn(ctx, op, *args, **kwargs):
    if op in ops_to_save:
        return CheckpointPolicy.MUST_SAVE
    else:
        return CheckpointPolicy.PREFER_RECOMPUTE

# Use the context_fn= arg of the existing checkpoint API
out = checkpoint(
    fn, *args,
    use_reentrant=False,
    # Fill in SAC context_fn's policy_fn with functools.partial
    context_fn=partial(create_selective_checkpoint_contexts, policy_fn),
)
        + +

        (compile-only) Memory Budget API [NEW!]

        + +

        As mentioned previously, any given SAC policy can be represented as a point on a speed-memory tradeoff diagram. Not all policies are created equal, however. The “optimal” policies are the ones that fall on a pareto curve, e.g. for all policies that incur the same memory overhead, this policy is the one that minimizes the amount of required compute.

        + +

        For users who are using torch.compile, we offer a memory budget API that automatically applies SAC over your compiled region with a pareto-optimal policy given a user-specified “memory budget” between 0 and 1, where a budget of 0 behaves like plain-AC and a budget of 1 behaves like default torch.compile.

        + +

        flow diagram

        + +

        Below are some real results on a transformer model:

        + +

        flow diagram

        + +

        We observe a 50% memory reduction by recomputing only pointwise ops, with a steady drop-off as you recompute more and more of your matmuls. Attention is the most expensive, so you tend to want to recompute those last.

        + +

        Try it out! (Available in 2.4 as an experimental feature; see this comment block for more info)

        + +
import torch

torch._dynamo.config.activation_memory_budget = 0.5

out = torch.compile(fn)(inp)
        +
        + +
        + +

        Conclusion

        + +

        flow diagram

        + +

        In summary, activation checkpointing techniques in PyTorch offer a variety of ways to balance memory and compute demands, from simple region-based checkpointing to more selective and automated methods. By choosing the option that best matches your model’s structure and resource constraints, you can achieve significant memory savings with an acceptable trade-off in compute.

        + +

        Acknowledgements

        + +

        We would like to thank Meta’s xformers team including Francisco Massa for working on the original version of Selective Activation Checkpoint.

        + +
diff --git a/blog/amazon-ads-case-study/index.html b/blog/amazon-ads-case-study/index.html
new file mode 100644
index 000000000000..3b092f47b455
--- /dev/null
+++ b/blog/amazon-ads-case-study/index.html

Case Study: Amazon Ads Uses PyTorch and AWS Inferentia to Scale Models for Ads Processing | PyTorch

by Yashal Kanungo – Applied Scientist, Kamran Khan – Sr. Technical Product Manager, Shubha Kumbadakone – Sr. Specialist, ML Frameworks

        +

        Amazon Ads uses PyTorch, TorchServe, and AWS Inferentia to reduce inference costs by 71% and drive scale out.

        + +

        Amazon Ads helps companies build their brand and connect with shoppers through ads shown both within and beyond Amazon’s store, including websites, apps, and streaming TV content in more than 15 countries. Businesses and brands of all sizes, including registered sellers, vendors, book vendors, Kindle Direct Publishing (KDP) authors, app developers, and agencies can upload their own ad creatives, which can include images, video, audio, and, of course, products sold on Amazon.

        + +

        + +

        + +

        To promote an accurate, safe, and pleasant shopping experience, these ads must comply with content guidelines. For example, ads cannot flash on and off, products must be featured in an appropriate context, and images and text should be appropriate for a general audience. To help ensure that ads meet the required policies and standards, we needed to develop scalable mechanisms and tools.

        + +

        As a solution, we used machine learning (ML) models to surface ads that might need revision. As deep neural networks flourished over the past decade, our data science team began exploring more versatile deep learning (DL) methods capable of processing text, images, audio, or video with minimal human intervention. To that end, we’ve used PyTorch to build computer vision (CV) and natural language processing (NLP) models that automatically flag potentially non-compliant ads. PyTorch is intuitive, flexible, and user-friendly, and has made our transition to using DL models seamless. Deploying these new models on AWS Inferentia-based Amazon EC2 Inf1 instances, rather than on GPU-based instances, reduced our inference latency by 30 percent and our inference costs by 71 percent for the same workloads.

        + +

        Transition to deep learning

        + +

        Our ML systems paired classical models with word embeddings to evaluate ad text. But our requirements evolved, and as the volume of submissions continued to expand, we needed a method nimble enough to scale along with our business. In addition, our models must be fast and serve ads within milliseconds to provide an optimal customer experience.

        + +

        Over the last decade, DL has become very popular in numerous domains, including natural language, vision, and audio. Because deep neural networks channel data sets through many layers — extracting progressively higher-level features — they can make more nuanced inferences than classical ML models. Rather than simply detecting prohibited language, for example, a DL model can reject an ad for making false claims.

        + +

In addition, DL techniques are transferable: a model trained for one task can be adapted to carry out a related task. For instance, a pre-trained neural network can be optimized to detect objects in images and then fine-tuned to identify specific objects that are not allowed to be displayed in an ad.

        + +

        Deep neural networks can automate two of classical ML’s most time-consuming steps: feature engineering and data labeling. Unlike traditional supervised learning approaches, which require exploratory data analysis and hand-engineered features, deep neural networks learn the relevant features directly from the data. DL models can also analyze unstructured data, like text and images, without the preprocessing necessary in ML. Deep neural networks scale effectively with more data and perform especially well in applications involving large data sets.

        + +

        We chose PyTorch to develop our models because it helped us maximize the performance of our systems. With PyTorch, we can serve our customers better while taking advantage of Python’s most intuitive concepts. The programming in PyTorch is object-oriented: it groups processing functions with the data they modify. As a result, our codebase is modular, and we can reuse pieces of code in different applications. In addition, PyTorch’s eager mode allows loops and control structures and, therefore, more complex operations in the model. Eager mode makes it easy to prototype and iterate upon our models, and we can work with various data structures. This flexibility helps us update our models quickly to meet changing business requirements.

        + +

“Before this, we experimented with other frameworks that were ‘Pythonic,’ but PyTorch was the clear winner for us here,” said Yashal Kanungo, Applied Scientist. “Using PyTorch was easy because the structure felt native to Python programming, which the data scientists were very familiar with.”

        + +

        Training pipeline

        + +

        Today, we build our text models entirely in PyTorch. To save time and money, we often skip the early stages of training by fine-tuning a pre-trained NLP model for language analysis. If we need a new model to evaluate images or video, we start by browsing PyTorch’s torchvision library, which offers pretrained options for image and video classification, object detection, instance segmentation, and pose estimation. For specialized tasks, we build a custom model from the ground up. PyTorch is perfect for this, because eager mode and the user-friendly front end make it easy to experiment with different architectures.

        + +

        To learn how to finetune neural networks in PyTorch, head to this tutorial.

        + +

        Before we begin training, we optimize our model’s hyperparameters, the variables that define the network architecture (for example, the number of hidden layers) and training mechanics (such as learning rate and batch size). Choosing appropriate hyperparameter values is essential, because they will shape the training behavior of the model. We rely on the Bayesian search feature in SageMaker, AWS’s ML platform, for this step. Bayesian search treats hyperparameter tuning as a regression problem: It proposes the hyperparameter combinations that are likely to produce the best results and runs training jobs to test those values. After each trial, a regression algorithm determines the next set of hyperparameter values to test, and performance improves incrementally.
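As a rough sketch of what Bayesian hyperparameter tuning looks like with the SageMaker Python SDK (the estimator settings, metric name, and ranges below are illustrative assumptions, not the team’s actual configuration):

from sagemaker.pytorch import PyTorch
from sagemaker.tuner import ContinuousParameter, HyperparameterTuner, IntegerParameter

estimator = PyTorch(
    entry_point="train.py",                                 # hypothetical training script
    role="arn:aws:iam::123456789012:role/SageMakerRole",    # placeholder IAM role
    instance_count=1,
    instance_type="ml.p3.2xlarge",
    framework_version="1.13",
    py_version="py39",
)

tuner = HyperparameterTuner(
    estimator,
    objective_metric_name="validation:f1",
    metric_definitions=[{"Name": "validation:f1", "Regex": "val_f1=([0-9\\.]+)"}],
    hyperparameter_ranges={
        "lr": ContinuousParameter(1e-5, 1e-2),
        "batch_size": IntegerParameter(16, 128),
    },
    strategy="Bayesian",          # propose new combinations based on previous trials
    max_jobs=20,
    max_parallel_jobs=4,
)
tuner.fit({"train": "s3://my-bucket/train", "validation": "s3://my-bucket/val"})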

        + +

        We prototype and iterate upon our models using SageMaker Notebooks. Eager mode lets us prototype models quickly by building a new computational graph for each training batch; the sequence of operations can change from iteration to iteration to accommodate different data structures or to jibe with intermediate results. That frees us to adjust the network during training without starting over from scratch. These dynamic graphs are particularly valuable for recursive computations based on variable sequence lengths, such as the words, sentences, and paragraphs in an ad that are analyzed with NLP.

        + +

        When we’ve finalized the model architecture, we deploy training jobs on SageMaker. PyTorch helps us develop large models faster by running numerous training jobs at the same time. PyTorch’s Distributed Data Parallel (DDP) module replicates a single model across multiple interconnected machines within SageMaker, and all the processes run forward passes simultaneously on their own unique portion of the data set. During the backward pass, the module averages the gradients of all the processes, so each local model is updated with the same parameter values.
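A bare-bones sketch of the DDP pattern (independent of the SageMaker launcher details; the model here is a stand-in):

import os

import torch
import torch.distributed as dist
from torch.nn.parallel import DistributedDataParallel as DDP

dist.init_process_group(backend="nccl")           # one process per GPU, started by the launcher
local_rank = int(os.environ["LOCAL_RANK"])        # set by torchrun / the SageMaker launcher
torch.cuda.set_device(local_rank)

model = torch.nn.Linear(768, 2).cuda(local_rank)  # stand-in for the real classifier
ddp_model = DDP(model, device_ids=[local_rank])
# Each replica runs forward/backward on its own shard of the data; during backward,
# DDP averages gradients across processes so every replica applies the same update.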

        + +

        Model deployment pipeline

        + +

        When we deploy the model in production, we want to ensure lower inference costs without impacting prediction accuracy. Several PyTorch features and AWS services have helped us address the challenge.

        + +

The flexibility of a dynamic graph enriches training, but in deployment we want to maximize performance and portability. An advantage of developing NLP models in PyTorch is that out of the box, they can be traced into a static sequence of operations by TorchScript, a subset of Python specialized for ML applications. TorchScript converts PyTorch models to a more efficient, production-friendly intermediate representation (IR) graph that is easily compiled. We run a sample input through the model, and TorchScript records the operations executed during the forward pass. The resulting IR graph can run in high-performance environments, including C++ and other multithreaded Python-free contexts, and optimizations such as operator fusion can speed up the runtime.
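In its simplest form, the tracing step looks like the sketch below (the model is a stand-in, not one of the production models):

import torch

model = torch.nn.Sequential(              # stand-in for a trained NLP classifier
    torch.nn.Embedding(30522, 128),
    torch.nn.Flatten(),
    torch.nn.Linear(128 * 128, 2),
).eval()

example_input = torch.randint(0, 30522, (1, 128))    # a batch of token IDs
traced = torch.jit.trace(model, example_input)        # records the ops run during one forward pass
traced.save("model_traced.pt")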

        + +

        Neuron SDK and AWS Inferentia powered compute

        + +

We deploy our models on Amazon EC2 Inf1 instances powered by AWS Inferentia, Amazon’s first ML silicon designed to accelerate deep learning inference workloads. Inferentia has been shown to reduce inference costs by up to 70% compared to Amazon EC2 GPU-based instances. We used the AWS Neuron SDK, a set of software tools used with Inferentia, to compile and optimize our models for deployment on EC2 Inf1 instances.

        + +

        The code snippet below shows how to compile a Hugging Face BERT model with Neuron. Like torch.jit.trace(), neuron.trace() records the model’s operations on an example input during the forward pass to build a static IR graph.

        + +
import torch
from transformers import BertModel, BertTokenizer
import torch.neuron

tokenizer = BertTokenizer.from_pretrained("path to saved vocab")
model = BertModel.from_pretrained("path to the saved model", return_dict=False)
inputs = tokenizer("sample input", return_tensors="pt")
neuron_model = torch.neuron.trace(model,
                                  example_inputs=(inputs['input_ids'], inputs['attention_mask']),
                                  verbose=1)
output = neuron_model(*(inputs['input_ids'], inputs['attention_mask']))
        + +

        Autocasting and recalibration

        + +

        Under the hood, Neuron optimizes our models for performance by autocasting them to a smaller data type. As a default, most applications represent neural network values in the 32-bit single-precision floating point (FP32) number format. Autocasting the model to a 16-bit format — half-precision floating point (FP16) or Brain Floating Point (BF16) — reduces a model’s memory footprint and execution time. In our case, we decided to use FP16 to optimize for performance while maintaining high accuracy.

        + +

        Autocasting to a smaller data type can, in some cases, trigger slight differences in the model’s predictions. To ensure that the model’s accuracy is not affected, Neuron compares the performance metrics and predictions of the FP16 and FP32 models. When autocasting diminishes the model’s accuracy, we can tell the Neuron compiler to convert only the weights and certain data inputs to FP16, keeping the rest of the intermediate results in FP32. In addition, we often run a few iterations with the training data to recalibrate our autocasted models. This process is much less intensive than the original training.

        + +

        Deployment

        + +

        To analyze multimedia ads, we run an ensemble of DL models. All ads uploaded to Amazon are run through specialized models that assess every type of content they include: images, video and audio, headlines, texts, backgrounds, and even syntax, grammar, and potentially inappropriate language. The signals we receive from these models indicate whether or not an advertisement complies with our criteria.

        + +

        Deploying and monitoring multiple models is significantly complex, so we depend on TorchServe, SageMaker’s default PyTorch model serving library. Jointly developed by Facebook’s PyTorch team and AWS to streamline the transition from prototyping to production, TorchServe helps us deploy trained PyTorch models at scale without having to write custom code. It provides a secure set of REST APIs for inference, management, metrics, and explanations. With features such as multi-model serving, model versioning, ensemble support, and automatic batching, TorchServe is ideal for supporting our immense workload. You can read more about deploying your Pytorch models on SageMaker with native TorchServe integration in this blog post.

        + +

        In some use cases, we take advantage of PyTorch’s object-oriented programming paradigm to wrap multiple DL models into one parent object — a PyTorch nn.Module — and serve them as a single ensemble. In other cases, we use TorchServe to serve individual models on separate SageMaker endpoints, running on AWS Inf1 instances.

        + +

        Custom handlers

        + +

        We particularly appreciate that TorchServe allows us to embed our model initialization, preprocessing, inferencing, and post processing code in a single Python script, handler.py, which lives on the server. This script — the handler —preprocesses the un-labeled data from an ad, runs that data through our models, and delivers the resulting inferences to downstream systems. TorchServe provides several default handlers that load weights and architecture and prepare the model to run on a particular device. We can bundle all the additional required artifacts, such as vocabulary files or label maps, with the model in a single archive file.

        + +

        When we need to deploy models that have complex initialization processes or that originated in third-party libraries, we design custom handlers in TorchServe. These let us load any model, from any library, with any required process. The following snippet shows a simple handler that can serve Hugging Face BERT models on any SageMaker hosting endpoint instance.

        + +
import os

import torch
import torch.neuron
from ts.torch_handler.base_handler import BaseHandler
import transformers
from transformers import AutoModelForSequenceClassification, AutoTokenizer

class MyModelHandler(BaseHandler):
    def initialize(self, ctx):
        self.manifest = ctx.manifest
        properties = ctx.system_properties
        model_dir = properties.get("model_dir")
        serialized_file = self.manifest["model"]["serializedFile"]
        model_pt_path = os.path.join(model_dir, serialized_file)

        self.tokenizer = AutoTokenizer.from_pretrained(
            model_dir, do_lower_case=True
        )
        self.model = AutoModelForSequenceClassification.from_pretrained(
            model_dir
        )

    def preprocess(self, data):
        input_text = data.get("data")
        if input_text is None:
            input_text = data.get("body")
        # max_length is expected to come from the handler's configuration
        inputs = self.tokenizer.encode_plus(
            input_text,
            max_length=int(max_length),
            pad_to_max_length=True,
            add_special_tokens=True,
            return_tensors='pt',
        )
        return inputs

    def inference(self, inputs):
        predictions = self.model(**inputs)
        return predictions

    def postprocess(self, output):
        return output
        + +

        Batching

        + +

        Hardware accelerators are optimized for parallelism, and batching — feeding a model multiple inputs in a single step — helps saturate all available capacity, typically resulting in higher throughputs. Excessively high batch sizes, however, can increase latency with minimal improvement in throughputs. Experimenting with different batch sizes helps us identify the sweet spot for our models and hardware accelerator. We run experiments to determine the best batch size for our model size, payload size, and request traffic patterns.

        + +

        The Neuron compiler now supports variable batch sizes. Previously, tracing a model hardcoded the predefined batch size, so we had to pad our data, which can waste compute, slow throughputs, and exacerbate latency. Inferentia is optimized to maximize throughput for small batches, reducing latency by easing the load on the system.

        + +

        Parallelism

        + +

        Model parallelism on multi-cores also improves throughput and latency, which is crucial for our heavy workloads. Each Inferentia chip contains four NeuronCores that can either run separate models simultaneously or form a pipeline to stream a single model. In our use case, the data parallel configuration offers the highest throughput at the lowest cost, because it scales out concurrent processing requests.

        + +

        Data Parallel:

        + +

        + +

        + +

        Model Parallel:

        + +

        + +

        + +

        Monitoring

        + +

        It is critical that we monitor the accuracy of our inferences in production. Models that initially make good predictions can eventually degrade in deployment as they are exposed to a wider variety of data. This phenomenon, called model drift, usually occurs when the input data distributions or the prediction targets change.

        + +

        We use SageMaker Model Monitor to track parity between the training and production data. Model Monitor notifies us when predictions in production begin to deviate from the training and validation results. Thanks to this early warning, we can restore accuracy — by retraining the model if necessary — before our advertisers are affected. To track performance in real time, Model Monitor also sends us metrics about the quality of predictions, such as accuracy, F-scores, and the distribution of the predicted classes.

        + +

        To determine if our application needs to scale, TorchServe logs resource utilization metrics for the CPU, Memory, and Disk at regular intervals; it also records the number of requests received versus the number served. For custom metrics, TorchServe offers a Metrics API.

        + +

        A rewarding result

        + +

        Our DL models, developed in PyTorch and deployed on Inferentia, sped up our ads analysis while cutting costs. Starting with our first explorations in DL, programming in PyTorch felt natural. Its user-friendly features helped smooth the course from our early experiments to the deployment of our multimodal ensembles. PyTorch lets us prototype and build models quickly, which is vital as our advertising service evolves and expands. For an added benefit, PyTorch works seamlessly with Inferentia and our AWS ML stack. We look forward to building more use cases with PyTorch, so we can continue to serve our clients accurate, real-time results.

        + +
diff --git a/blog/amazon-sagemaker-w-torchserve/index.html b/blog/amazon-sagemaker-w-torchserve/index.html
new file mode 100644
index 000000000000..23cc500fbc84
--- /dev/null
+++ b/blog/amazon-sagemaker-w-torchserve/index.html

Accelerate AI models on GPU using Amazon SageMaker multi-model endpoints with TorchServe, saving up to 75% on inference costs | PyTorch

by James Wu, Ankith Gunapal, Li Ning, Subhash Talluri, and Saurabh Trikande

        +

        Multi-model endpoints (MMEs) are a powerful feature of Amazon SageMaker designed to simplify the deployment and operation of machine learning (ML) models. With MMEs, you can host multiple models on a single serving container and host all the models behind a single endpoint. The SageMaker platform automatically manages the loading and unloading of models and scales resources based on traffic patterns, reducing the operational burden of managing a large quantity of models. This feature is particularly beneficial for deep learning and generative AI models that require accelerated compute. The cost savings achieved through resource sharing and simplified model management makes SageMaker MMEs an excellent choice for you to host models at scale on AWS.

        + +

Recently, generative AI applications have captured widespread attention and imagination. Customers want to deploy generative AI models on GPUs but at the same time are conscious of costs. SageMaker MMEs support GPU instances and are a great option for these types of applications. Today, we are excited to announce TorchServe support for SageMaker MMEs. This new model server support gives you the advantage of all the benefits of MMEs while still using the serving stack that TorchServe customers are most familiar with. In this post, we demonstrate how to host generative AI models, such as Stable Diffusion and Segment Anything Model, on SageMaker MMEs using TorchServe and build a language-guided editing solution that can help artists and content creators develop and iterate their artwork faster.

        + +

        Solution overview

        + +

        Language-guided editing is a common cross-industry generative AI use case. It can help artists and content creators work more efficiently to meet content demand by automating repetitive tasks, optimizing campaigns, and providing a hyper-personalized experience for the end customer. Businesses can benefit from increased content output, cost savings, improved personalization, and enhanced customer experience. In this post, we demonstrate how you can build language-assisted editing features using MME TorchServe that allow you to erase any unwanted object from an image and modify or replace any object in an image by supplying a text instruction.

        + +

        The user experience flow for each use case is as follows:

        + +
          +
        • To remove an unwanted object, you select the object in the image to highlight it. This action sends the pixel coordinates and the original image to a generative AI model, which generates a segmentation mask for the object. After confirming the correct object selection, you can send the original and mask images to a second model for removal. A detailed illustration of this user flow is shown below.
        • +
        + + + + + + + + + + + + +
        + +Dog on a bench with mouse pointer clicking the dog + + + +Dog on a bench highlighted + + + +A bench without the dog + +
        Step 1: Select an object (“dog”) from the image + Step 2: Confirm the correct object is highlighted + Step 3: Erase the object from the image +
        + +
          +
        • To modify or replace an object, you select and highlight the desired object, following the same process described above. Once you confirm the correct object selection, you can modify the object by supplying the original image, the mask, and a text prompt. The model will then change the highlighted object based on the provided instructions. A detailed illustration of this second user flow is as follows.
        • +
        + + + + + + + + + + + + +
        + +A vase with a cactus and mouse pointer + + + +A vase highlighted + + + +A rounded vase with a cactus + +
        Step 1: Select an object (“vase”) from the image + Step 2: Confirm the correct object is highlighted + Step 3: Provide a text prompt (“futuristic vase”) to modify the object +
        + +

        To power this solution, we use three generative AI models: Segment Anything Model (SAM), Large Mask Inpainting Model (LaMa), and Stable Diffusion Inpaint (SD). Here is how these models are used in the user experience workflow:

        + + + + + + + + + + +
        To remove an unwanted object + To modify or replace an object +
        + +flow diagram + + + +flow diagram + +
        + +
          +
        1. Segment Anything Model (SAM) is used to generate a segmentation mask of the object of interest. Developed by Meta Research, SAM is an open-source model that can segment any object in an image. This model has been trained on a massive dataset known as SA-1B, which comprises over 11 million images and 1.1 billion segmentation masks. For more information on SAM, refer to their website and research paper.
        2. +
        3. LaMa is used to remove any undesired objects from an image. LaMa is a Generative Adversarial Network (GAN) model that specializes in filling in missing parts of images using irregular masks. The model architecture incorporates image-wide global context and a single-step architecture that uses Fourier convolutions, enabling it to achieve state-of-the-art results at higher speed. For more details on LaMa, visit their website and research paper.
        4. +
        5. The SD 2 inpainting model from Stability AI is used to modify or replace objects in an image. It allows us to edit the object in the masked area by providing a text prompt. The inpainting model is based on the text-to-image SD model, which can create high-quality images from a simple text prompt. It accepts additional arguments such as the original and mask images, allowing for quick modification and restoration of existing content. To learn more about Stable Diffusion models on AWS, refer to Create high-quality images with Stable Diffusion models and deploy them cost-efficiently with Amazon SageMaker.
        6. +
        + +

        All three models are hosted on SageMaker MMEs, which reduces the operational burden of managing multiple endpoints. In addition, using an MME eliminates concerns about certain models being underutilized, because resources are shared. The benefit shows up as improved instance saturation, which ultimately leads to cost savings. The following architecture diagram illustrates how all three models are served on SageMaker MMEs with TorchServe.

        + +

        flow diagram

        + +

        We have published the code to implement this solution architecture in our GitHub repository. To follow along with the rest of the post, use the notebook file. It is recommended to run this example on a SageMaker notebook instance using the conda_python3 (Python 3.10.10) kernel.

        + +

        Extend the TorchServe container

        + +

        The first step is to prepare the model hosting container. SageMaker provides a managed PyTorch Deep Learning Container (DLC) that you can retrieve using the following code snippet:

        + +
        # Use SageMaker PyTorch DLC as base image
        +baseimage = sagemaker.image_uris.retrieve(
        +    framework="pytorch",
        +    region=region,
        +    py_version="py310",
        +    image_scope="inference",
        +    version="2.0.0",
        +    instance_type="ml.g5.2xlarge",
        +)
        +print(baseimage)
        +
        + +

        Because the models require resources and additional packages that are not on the base PyTorch DLC, you need to build a Docker image. This image is then uploaded to Amazon Elastic Container Registry (Amazon ECR) so we can access it directly from SageMaker. The additional libraries to install are listed in the Dockerfile:

        + +
        ARG BASE_IMAGE
        +
        +FROM $BASE_IMAGE
        +
        +#Install any additional libraries
        +RUN pip install segment-anything-py==1.0
        +RUN pip install opencv-python-headless==4.7.0.68
        +RUN pip install matplotlib==3.6.3
        +RUN pip install diffusers
        +RUN pip install tqdm
        +RUN pip install easydict
        +RUN pip install scikit-image
        +RUN pip install xformers
        +RUN pip install tensorflow
        +RUN pip install joblib
        +RUN pip install matplotlib
        +RUN pip install albumentations==0.5.2
        +RUN pip install hydra-core==1.1.0
        +RUN pip install pytorch-lightning
        +RUN pip install tabulate
        +RUN pip install kornia==0.5.0
        +RUN pip install webdataset
        +RUN pip install omegaconf==2.1.2
        +RUN pip install transformers==4.28.1
        +RUN pip install accelerate
        +RUN pip install ftfy
        +
        + +

        Run the shell command file to build the custom image locally and push it to Amazon ECR:

        + +
        %%capture build_output
        +
        +reponame = "torchserve-mme-demo"
        +versiontag = "genai-0.1"
        +
        +# Build our own docker image
        +!cd workspace/docker && ./build_and_push.sh {reponame} {versiontag} {baseimage} {region} {account}
        +
        + +

        Prepare the model artifacts

        + +

        The main difference for the new MMEs with TorchServe support is how you prepare your model artifacts. The code repo provides a skeleton folder for each model (models folder) to house the required files for TorchServe. We follow the same four-step process to prepare each model .tar file. The following code is an example of the skeleton folder for the SD model:

        + +
        workspace
        +|--sd
        +   |-- custom_handler.py
        +   |-- model-config.yaml
        +
        + +

        The first step is to download the pre-trained model checkpoints in the models folder:

        + +
        import diffusers
        +import torch
        +import transformers
        +
        +pipeline = diffusers.StableDiffusionInpaintPipeline.from_pretrained(
        +    "stabilityai/stable-diffusion-2-inpainting", torch_dtype=torch.float16
        +)
        +
        +sd_dir = "workspace/sd/model"
        +pipeline.save_pretrained(sd_dir)
        +
        + +

        The next step is to define a custom_handler.py file. This is required to define the behavior of the model when it receives a request, such as loading the model, preprocessing the input, and postprocessing the output. The handle method is the main entry point for requests, and it accepts a request object and returns a response object. It loads the pre-trained model checkpoints and applies the preprocess and postprocess methods to the input and output data. The following code snippet illustrates a simple structure of the custom_handler.py file. For more detail, refer to the TorchServe handler API.

        + +
        from ts.context import Context


        class CustomHandler:
            def initialize(self, ctx: Context):
                # Load the pre-trained model checkpoints onto the target device
                ...

            def preprocess(self, data):
                # Decode the request payload into model inputs
                ...

            def inference(self, data):
                # Run the forward pass on the preprocessed inputs
                ...

            def handle(self, data, context):
                requests = self.preprocess(data)
                responses = self.inference(requests)

                return responses
        +
        + +

        The last required file for TorchServe is model-config.yaml. The file defines the configuration of the model server, such as number of workers and batch size. The configuration is at a per-model level, and an example config file is shown in the following code. For a complete list of parameters, refer to the GitHub repo.

        + +
        minWorkers: 1
        +maxWorkers: 1
        +batchSize: 1
        +maxBatchDelay: 200
        +responseTimeout: 300
        +
        + +

        The final step is to package all the model artifacts into a single .tar.gz file using the torch-model-archiver module:

        + +
        !torch-model-archiver --model-name sd --version 1.0 --handler workspace/sd/custom_handler.py --extra-files workspace/sd/model --config-file workspace/sd/model-config.yaml --archive-format no-archive
        !cd sd && tar cvzf sd.tar.gz .
        +
        + +

        Create the multi-model endpoint

        + +

        The steps to create a SageMaker MME are the same as before. In this particular example, you spin up an endpoint using the SageMaker SDK. Start by defining an Amazon Simple Storage Service (Amazon S3) location and the hosting container. This S3 location is where SageMaker will dynamically load the models based on invocation patterns. The hosting container is the custom container you built and pushed to Amazon ECR in an earlier step. See the following code:

        + +
        # This is where our MME will read models from on S3.
        +multi_model_s3uri = output_path
        +
        + +

        Then you define a MultiDataModel that captures attributes such as the model location, hosting container, and access permissions:

        + +
        print(multi_model_s3uri)
        +model = Model(
        +    model_data=f"{multi_model_s3uri}/sam.tar.gz",
        +    image_uri=container,
        +    role=role,
        +    sagemaker_session=smsess,
        +    env={"TF_ENABLE_ONEDNN_OPTS": "0"},
        +)
        +
        +mme = MultiDataModel(
        +    name="torchserve-mme-genai-" + datetime.now().strftime("%Y-%m-%d-%H-%M-%S"),
        +    model_data_prefix=multi_model_s3uri,
        +    model=model,
        +    sagemaker_session=smsess,
        +)
        +print(mme)
        +
        + +

        The deploy() function creates an endpoint configuration and hosts the endpoint:

        + +
        mme.deploy(
        +    initial_instance_count=1,
        +    instance_type="ml.g5.2xlarge",
        +    serializer=sagemaker.serializers.JSONSerializer(),
        +    deserializer=sagemaker.deserializers.JSONDeserializer(),
        +)
        +
        + +

        In the example we provided, we also show how you can list models and dynamically add new models using the SDK. The add_model() function copies your local model .tar files into the MME S3 location:

        + +
        # Only sam.tar.gz visible!
        +list(mme.list_models())
        +
        +models = ["sd/sd.tar.gz", "lama/lama.tar.gz"]
        +for model in models:
        +    mme.add_model(model_data_source=model)
        +
        + +

        Invoke the models

        + +

        Now that we have all three models hosted on an MME, we can invoke each model in sequence to build our language-assisted editing features. To invoke each model, provide a target_model parameter in the predictor.predict() function. The model name is just the name of the model .tar file we uploaded. The following is an example code snippet for the SAM model that takes in a pixel coordinate, a point label, and a dilation kernel size, and generates a segmentation mask of the object at that pixel location:

        + +
        img_file = "workspace/test_data/sample1.png"
        +img_bytes = None
        +
        +with Image.open(img_file) as f:
        +    img_bytes = encode_image(f)
        +
        +gen_args = json.dumps(dict(point_coords=[750, 500], point_labels=1, dilate_kernel_size=15))
        +
        +payload = json.dumps({"image": img_bytes, "gen_args": gen_args}).encode("utf-8")
        +
        +response = predictor.predict(data=payload, target_model="/sam.tar.gz")
        +encoded_masks_string = json.loads(response.decode("utf-8"))["generated_image"]
        +base64_bytes_masks = base64.b64decode(encoded_masks_string)
        +
        +with Image.open(io.BytesIO(base64_bytes_masks)) as f:
        +    generated_image_rgb = f.convert("RGB")
        +    generated_image_rgb.show()
        +
        + +

        To remove an unwanted object from an image, take the segmentation mask generated by SAM and feed it into the LaMa model along with the original image. The following images show an example, and a sketch of this call follows them.

        + + + + + + + + + + + + +
        +Dog on a bench + + + +White mask of dog on black background + + + +Just a bench + +
        Sample image + Segmentation mask from SAM + Erase the dog using LaMa +
        + +
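        As a rough sketch (not the exact notebook code), the LaMa call could look like the following. It reuses the predictor and the encode_image helper from the SAM example above; the payload field names ("mask_image"), the mask file name, and the response key are assumptions for illustration, and the authoritative request schema is defined in the accompanying notebook.

        import base64
        import io
        import json

        from PIL import Image

        # Encode the original image and the segmentation mask produced by SAM.
        # The mask file name here is hypothetical; use the mask returned by the SAM call.
        with Image.open("workspace/test_data/sample1.png") as f:
            img_bytes = encode_image(f)
        with Image.open("workspace/test_data/sample1_mask.png") as f:
            mask_bytes = encode_image(f)

        payload = json.dumps({"image": img_bytes, "mask_image": mask_bytes}).encode("utf-8")

        # Route the request to the LaMa artifact uploaded earlier (lama.tar.gz).
        response = predictor.predict(data=payload, target_model="/lama.tar.gz")
        encoded_output = json.loads(response.decode("utf-8"))["generated_image"]

        with Image.open(io.BytesIO(base64.b64decode(encoded_output))) as f:
            f.convert("RGB").show()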

        To modify or replace any object in an image with a text prompt, take the segmentation mask from SAM and feed it into the SD model along with the original image and a text prompt, as shown in the following example (a sketch of this call follows the images).

        + + + + + + + + + + + + +
        +Dog on a bench + + +White mask of dog on black background + + +Hamster on a bench + +
        Sample image + Segmentation mask from SAM + Replace using SD model with text prompt +
        + “a hamster on a bench” +
        + +
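        Under the same assumptions as the LaMa sketch above, routing to the SD inpainting model only changes the target model and adds a text prompt; the exact field names are again assumptions, with the real schema defined in the notebook.

        import json

        # Hypothetical payload: original image, SAM mask, and a text prompt for inpainting.
        # img_bytes and mask_bytes are prepared as in the LaMa sketch above.
        gen_args = json.dumps(dict(prompt="a hamster on a bench", num_inference_steps=50))

        payload = json.dumps(
            {"image": img_bytes, "mask_image": mask_bytes, "gen_args": gen_args}
        ).encode("utf-8")

        # Route the request to the Stable Diffusion inpainting artifact (sd.tar.gz).
        response = predictor.predict(data=payload, target_model="/sd.tar.gz")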

        Cost savings

        + +

        The benefits of SageMaker MMEs increase with the scale of model consolidation. The following table shows the GPU memory usage of the three models in this post when they are deployed on one ml.g5.2xlarge instance using a single SageMaker MME.

        + + + + + + + + + + + + + + + + + + +
        Model + GPU Memory (MiB) +
        Segment Anything Model + 3,362 +
        Stable Diffusion In Paint + 3,910 +
        Lama + 852 +
        + +

        You can see cost savings when hosting the three models with one endpoint, and for use cases with hundreds or thousands of models, the savings are much greater.

        + +

        For example, consider 100 Stable Diffusion models. Each of the models on its own (about 4 GiB of GPU memory) could be served by an ml.g5.2xlarge endpoint, costing $1.52 per instance hour in the US East (N. Virginia) Region. Providing all 100 models on their own endpoints would cost $218,880 per month. With a SageMaker MME, a single endpoint using ml.g5.2xlarge instances can host four models simultaneously. This reduces production inference costs by 75% to only $54,720 per month. The following table summarizes the differences between single-model and multi-model endpoints for this example (a quick arithmetic check follows the table). Given an endpoint configuration with sufficient memory for your target models, steady-state invocation latency after all models have been loaded will be similar to that of a single-model endpoint.

        + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
        + Single-model endpoint + Multi-model endpoint +
        Total endpoint price per month + $218,880 + $54,720 +
        Endpoint instance type + ml.g5.2xlarge + ml.g5.2xlarge +
        CPU Memory capacity (GiB) + 32 + 32 +
        GPU Memory capacity (GiB) + 24 + 24 +
        Endpoint price per hour + $1.52 + $1.52 +
        Number of instances per endpoint + 2 + 2 +
        Endpoints needed for 100 models + 100 + 25 +
        + +
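        As a quick sanity check, the arithmetic below reproduces the figures above; it assumes a 720-hour month and uses the two instances per endpoint listed in the table.

        # Back-of-the-envelope check of the cost comparison (assumes a 720-hour month).
        price_per_instance_hour = 1.52   # ml.g5.2xlarge, US East (N. Virginia)
        hours_per_month = 720
        instances_per_endpoint = 2
        num_models = 100
        models_per_mme_endpoint = 4      # four SD models fit on one 24 GiB GPU

        endpoint_monthly_cost = price_per_instance_hour * hours_per_month * instances_per_endpoint

        single_model_total = num_models * endpoint_monthly_cost                       # 100 endpoints
        mme_total = (num_models // models_per_mme_endpoint) * endpoint_monthly_cost   # 25 endpoints

        print(single_model_total)                   # 218880.0
        print(mme_total)                            # 54720.0
        print(1 - mme_total / single_model_total)   # 0.75 -> 75% savings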

        Clean up

        + +

        After you are done, please follow the instructions in the cleanup section of the notebook to delete the resources provisioned in this post to avoid unnecessary charges. Refer to Amazon SageMaker Pricing for details on the cost of the inference instances.

        + +

        Conclusion

        + +

        This post demonstrates the language-assisted editing capabilities made possible through the use of generative AI models hosted on SageMaker MMEs with TorchServe. The example we shared illustrates how we can use resource sharing and simplified model management with SageMaker MMEs while still utilizing TorchServe as our model serving stack. We utilized three deep learning foundation models: SAM, SD 2 Inpainting, and LaMa. These models enable us to build powerful capabilities, such as erasing any unwanted object from an image and modifying or replacing any object in an image by supplying a text instruction. These features can help artists and content creators work more efficiently and meet their content demands by automating repetitive tasks, optimizing campaigns, and providing a hyper-personalized experience. We invite you to explore the example provided in this post and build your own UI experience using TorchServe on a SageMaker MME.

        + +

        To get started, see Supported algorithms, frameworks, and instances for multi-model endpoints using GPU backed instances.

        + +
        + +

        About the authors

        + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
        +James Wu + +James Wu is a Senior AI/ML Specialist Solution Architect at AWS, helping customers design and build AI/ML solutions. James’s work covers a wide range of ML use cases, with a primary interest in computer vision, deep learning, and scaling ML across the enterprise. Prior to joining AWS, James was an architect, developer, and technology leader for over 10 years, including 6 years in engineering and 4 years in the marketing and advertising industries. +
        +Li Ning + + +Li Ning is a senior software engineer at AWS with a specialization in building large-scale AI solutions. As a tech lead for TorchServe, a project jointly developed by AWS and Meta, her passion lies in leveraging PyTorch and AWS SageMaker to help customers embrace AI for the greater good. Outside of her professional endeavors, Li enjoys swimming, traveling, following the latest advancements in technology, and spending quality time with her family. +
        +Ankith Gunapal + +Ankith Gunapal is an AI Partner Engineer at Meta (PyTorch). He is passionate about model optimization and model serving, with experience ranging from RTL verification, embedded software, computer vision, to PyTorch. He holds a Master’s in Data Science and a Master’s in Telecommunications. Outside of work, Ankith is also an electronic dance music producer. + +
        +Saurabh Trikande + +Saurabh Trikande is a Senior Product Manager for Amazon SageMaker Inference. He is passionate about working with customers and is motivated by the goal of democratizing machine learning. He focuses on core challenges related to deploying complex ML applications, multi-tenant ML models, cost optimizations, and making deployment of deep learning models more accessible. In his spare time, Saurabh enjoys hiking, learning about innovative technologies, following TechCrunch and spending time with his family. + +
        +Subhash Talluri + +Subhash Talluri is a Lead AI/ML solutions architect of the Telecom Industry business unit at Amazon Web Services. He’s been leading development of innovative AI/ML solutions for Telecom customers and partners worldwide. He brings interdisciplinary expertise in engineering and computer science to help build scalable, secure, and compliant AI/ML solutions via cloud-optimized architectures on AWS. + +
        + +
diff --git a/blog/ambient-clinical-intelligence-generating-medical-reports-with-pytorch/index.html b/blog/ambient-clinical-intelligence-generating-medical-reports-with-pytorch/index.html
new file mode 100644
index 000000000000..960dd01cc6a3
--- /dev/null
+++ b/blog/ambient-clinical-intelligence-generating-medical-reports-with-pytorch/index.html
@@ -0,0 +1,911 @@
Ambient Clinical Intelligence: Generating Medical Reports with PyTorch | PyTorch

        + by + + Miguel Del-Agua, Principal Research Scientist, Nuance and Jeremy Jancsary, Senior Principal Research Scientist, Nuance + +

        +

        Introduction

        + +

        Complete and accurate clinical documentation is an essential tool for tracking patient care. It allows for treatment plans to be shared among care teams to aid in continuity of care and ensures a transparent and effective process for reimbursement.

        + +

        Physicians are responsible for documenting patient care. Traditional clinical documentation methods have resulted in a sub-par patient-provider experience, less time interacting with patients, and decreased work-life balance. A significant amount of physicians’ time is spent in front of the computer doing administrative tasks. As a result, patients are less satisfied with the overall experience, and physicians, who prepare for years studying medicine, cannot practice at the top of their license and are burned out. Every hour physicians provide direct clinical face time to patients results in nearly two additional hours spent on EHR and desk work within the clinic day. Outside office hours, physicians spend another 1 to 2 hours of personal time each night doing additional computer and other clerical work.

        + + + +

        Physician burnout is one of the primary causes for increased medical errors, malpractice suits, turnover, and decreased access to care. Burnout leads to an increase in healthcare costs and a decrease in overall patient satisfaction. Burnout costs the United States $4.6 billion a year.

        + +

        What can we do to bring back trust, joy, and humanity to the delivery of healthcare? A significant portion of the administrative work consists of entering patient data into Electronic Health Records (EHRs) and creating clinical documentation. Clinical documentation is created from information already in the EHR as well as from the patient-provider encounter conversation.

        + +

        This article will showcase how the Nuance Dragon Ambient eXperience (DAX), an AI-powered, voice-enabled, ambient clinical intelligence solution, automatically documents patient encounters accurately and efficiently at the point of care and the technologies that enable it.

        + +

        Nuance DAX enhances the quality of care and patient experience, increases provider efficiency and satisfaction, and improves financial outcomes. It can be used in office and telehealth settings in all ambulatory specialties, including primary and urgent care.

        + +

        + +

        + +

        Natural Language Processing

        + +

        Natural Language Processing (NLP) is one of the most challenging fields in Artificial Intelligence (AI). It comprises a set of algorithms that allow computers to understand or generate the language used by humans. These algorithms can process and analyze vast amounts of natural language data from different sources (either sound or text) to build models that can understand, classify, or even generate natural language as humans would. Like other fields in AI, NLP has significantly progressed thanks to the advent of Deep Learning (DL), which has resulted in models that can obtain results on par with humans in some tasks.

        + +

        These advanced NLP techniques are being applied in healthcare. During a typical patient-provider encounter, a conversation ensues where the doctor constructs, through questions and answers, a chronological description of the development of the patient’s presenting illness or symptoms. A physician examines the patient and makes clinical decisions to establish a diagnosis and determine a treatment plan. This conversation, and data in the EHR, provide the required information for physicians to generate the clinical documentation, referred to as medical reports.

        + +

        Two main NLP components play a role in automating the creation of clinical documentation. The first component, Automatic Speech Recognition (ASR), is used to translate speech into text. It takes the audio recording of the encounter and generates a conversation transcription (cf. Figure 2). The second component, Automatic Text Summarization, helps generate summaries from large text documents. This component is responsible for understanding and capturing the nuances and most essential aspects from the transcribed conversation into a final report in narrative form (cf. Figure 3), structured form, or a combination of both.

        + +

        We will focus on this second component, Automatic Text Summarization, which is a difficult task with many challenges:

        + +
          +
        • Its performance is tied to the ASR quality from multiple speakers (noisy input).
        • +
        • The input is conversational in nature and contains layman’s terms.
        • +
        • Protected Health Information (PHI) regulations limit medical data access.
        • +
        • The information for one output sentence is potentially spread across multiple conversation turns.
        • +
        • There is no explicit sentence alignment between input and output.
        • +
        • Various medical specialties, encounter types, and EHR systems constitute a broad and complex output space.
        • +
        • Physicians have different styles of conducting encounters and have their preferences for medical reports; there is no standard.
        • +
        • Standard summarization metrics might differ from human judgment of quality.
        • +
        + +

        + +

        + +

        +Figure 2: Transcript of a patient-doctor conversation +

        + +

        + +

        + +

        +Figure 3: Excerpt of an AI-generated medical report. HPI stands for History of present illness. +

        + +

        Text Summarization with PyTorch and Fairseq

        + +

        PyTorch is an open-source machine learning framework developed by Facebook that helps researchers prototype Deep Learning models. The Fairseq toolkit is built on top of PyTorch and focuses on sequence generation tasks, such as Neural Machine Translation (NMT) or Text Summarization. Fairseq features an active community that is continuously providing reference implementations of state-of-the-art models. It contains many built-in components (model architectures, modules, loss functions, and optimizers) and is easily extendable with plugins.

        + +

        Text summarization constitutes a significant challenge in NLP. We need models capable of generating a short version of a document while retaining the key points and avoiding uninformative content. These challenges can be addressed with different approaches: (1) abstractive text summarization, which aims at training models that can generate a summary in narrative form; (2) extractive methods, where the models are trained to select the most important parts of the input text; and (3) a combination of the two, where the essential parts of the input are selected and then summarized in an abstractive fashion. Hence, summarization can be accomplished via a single end-to-end network or as a pipeline of extractive and abstractive components. To that end, Fairseq provides all the necessary tools to be successful in our endeavor. It features end-to-end models such as the classical Transformer, different types of language models, and pre-trained versions that enable researchers to focus on what matters most: building state-of-the-art models that generate valuable reports.

        + +

        However, we are not just summarizing the transcribed conversation; we generate high-quality medical reports, which have many considerations.

        + +
          +
        • Every section of a medical report is different in terms of content, structure, fluency, etc.
        • +
        • All medical facts mentioned in the conversation should be present in the report, for example, a particular treatment or dosage.
        • +
        • In the healthcare domain, the vocabulary is extensive, and models need to deal with medical terminology.
        • +
        • Patient-doctor conversations are usually much longer than the final report.
        • +
        + +

        All these challenges require our researchers to run a battery of extensive experiments. Thanks to the flexibility of PyTorch and Fairseq, their productivity has greatly increased. Further, the ecosystem offers an easy path from ideation, implementation, experimentation, and final roll-out to production. Using multiple GPUs or CPUs is as simple as providing an additional argument to the tools, and because of the tight Python integration, PyTorch code can be easily debugged.

        + +

        In our continuous effort to contribute to the open-source community, features have been developed at Nuance and pushed to the Fairseq GitHub repository. These aim to overcome some of the challenges mentioned above, such as facilitating the copying of rare or unseen words from the input to the summary, speeding up training by improving Tensor Core utilization, and ensuring TorchScript compatibility of different Transformer configurations. In the following, we show an example of how to train a Transformer model with a Pointer Generator mechanism (Transformer-PG), which can copy words from the input.

        + +

        How to build a Transformer model with a Pointer Generator mechanism

        + +

        In this step-by-step guide, it is assumed the user has already installed PyTorch and Fairseq.

        + +

        1. Create a vocabulary and extend it with source position markers:

        + +

        These markers will allow the model to point to any word in the input sequence.

        + +
        vocab_size=<vocab_size>
        +position_markers=512
        +export LC_ALL=C
        +cat train.src train.tgt |
        +  tr -s '[:space:]' '\n' |
        +  sort |
        +  uniq -c |
        +  sort -k1,1bnr -k2 |
        +  head -n "$((vocab_size - 4))" |
        +  awk '{ print $2 " " $1 }' > dict.pg.txt
        +python3 -c "[print('<unk-{}> 0'.format(n)) for n in range($position_markers)]" >> dict.pg.txt
        +
        + +

        This will create a file “dict.pg.txt” that contains the <vocab_size> most frequent words followed by 512 position markers named from “<unk-0>” to “<unk-511>”.

        + +

        In case we have an input like

        + +
        src = "Hello, I'm The Dogtor"
        +
        + +

        it could happen that our model has been trained without the word “Dogtor” in its vocabulary. Therefore, when we feed this sequence into the model, it should be converted to:

        + +
        src = "Hello, I'm The <unk-3>"
        +
        + +

        Now, “<unk-3>” is part of our vocabulary and could be predicted by the model (this is where the pointer-generator comes in). In such a case, we will only need to post-process the output to replace “<unk-3>” with the word at input position 3.
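        To make that post-processing step concrete, here is a small illustrative sketch (not the Fairseq postprocess.py script used later) of how positional markers can be mapped back to source words:

        import re

        def replace_position_markers(source: str, hypothesis: str) -> str:
            """Replace each <unk-N> token in the model output with the N-th source token."""
            src_tokens = source.split()

            def _lookup(match):
                position = int(match.group(1))
                return src_tokens[position] if position < len(src_tokens) else match.group(0)

            return re.sub(r"<unk-(\d+)>", _lookup, hypothesis)

        print(replace_position_markers("Hello, I'm The Dogtor",
                                       "Ms. <unk-3> is here for a visit."))
        # -> Ms. Dogtor is here for a visit.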

        + +

        2. Preprocess the text data to replace unknown words with their positional markers:

        + +

        We can use the scripts from https://github.com/pytorch/fairseq/tree/master/examples/pointer_generator.

        + +
        # Considering we have our data in:
        +# train_src = /path/to/train.src
        +# train_tgt = /path/to/train.tgt
        +# valid_src = /path/to/valid.src
        +# valid_tgt = /path/to/valid.tgt
        +./preprocess.py --source /path/to/train.src \
        +                --target /path/to/train.tgt \
        +                --vocab <(cut -d' ' -f1 dict.pg.txt) \
        +                --source-out /path/to/train.pg.src \
        +                --target-out /path/to/train.pg.tgt
        +
        +./preprocess.py --source /path/to/valid.src \
        +                --target /path/to/valid.tgt \
        +                --vocab <(cut -d' ' -f1 dict.pg.txt) \
        +                --source-out /path/to/valid.pg.src \
        +                --target-out /path/to/valid.pg.tgt
        +
        +./preprocess.py --source /path/to/test.src \
        +                --vocab <(cut -d' ' -f1 dict.pg.txt) \
        +                --source-out /path/to/test.pg.src
        +
        + +

        3. Now let’s binarize the data, so that it can be processed faster:

        + +
        fairseq-preprocess --task "translation" \
        +                   --source-lang "pg.src" \
        +                   --target-lang "pg.tgt" \
        +                   --trainpref /path/to/train \
        +                   --validpref /path/to/valid \
        +                   --srcdict dict.pg.txt \
        +                   --cpu \
        +                   --joined-dictionary \
        +                   --destdir <data_dir>
        +
        + +

        You might notice the type of task is “translation”. This is because there is no “summarization” task available; we could understand it as a kind of NMT task where the input and output languages are shared and the output (summary) is shorter than the input.

        + +

        4. Now we can train the model:

        + +
        fairseq-train <data_dir> \
        +              --save-dir <model_dir> \
        +              --task "translation" \
        +              --source-lang "src" \
        +              --target-lang "tgt" \
        +              --arch "transformer_pointer_generator" \
        +              --max-source-positions 512 \
        +              --max-target-positions 128 \
        +              --truncate-source \
        +              --max-tokens 2048 \
        +              --required-batch-size-multiple 1 \
        +              --required-seq-len-multiple 8 \
        +              --share-all-embeddings \
        +              --dropout 0.1 \
        +              --criterion "cross_entropy" \
        +              --optimizer adam \
        +              --adam-betas '(0.9, 0.98)' \
        +              --adam-eps 1e-9 \
        +              --update-freq 4 \
        +              --lr 0.004 \
        +              # Pointer Generator
        +              --alignment-layer -1 \
        +              --alignment-heads 1 \
        +              --source-position-markers 512
        +
        + +

        This configuration makes use of features Nuance has contributed back to Fairseq:

        + +
          +
        • Transformer with a Pointer Generator mechanism to facilitate copying of words from the input.
        • +
        • Sequence length padded to a multiple of 8 to better use tensor cores and reduce training time.
        • +
        + +

        5. Now let’s take a look at how to generate a summary with our new medical report generation system:

        + +
        import torch
        +from examples.pointer_generator.pointer_generator_src.transformer_pg import TransformerPointerGeneratorModel
        +
        +# Patient-Doctor conversation
        +input = "[doctor] Lisa Simpson, thirty six year old female, presents to the clinic today because " \
        +        "she has severe right wrist pain"
        +
        +# Load the model
        +model = TransformerPointerGeneratorModel.from_pretrained(data_name_or_path=<data_dir>,
        +                                                         model_name_or_path=<model_dir>,
        +                                                         checkpoint_file="checkpoint_best.pt")
        +
        +result = model.translate([input], beam=2)
        +
        +print(result[0])
        +Ms. <unk-2> is a 36-year-old female who presents to the clinic today for evaluation of her right wrist.
        +
        + +

        6. Alternatively, we can use fairseq-interactive and a postprocessing tool to substitute positional unknown tokens with the corresponding words from the input:

        + +
        fairseq-interactive <data_dir> \
        +              --batch-size <batch_size> \
        +              --task translation \
        +              --source-lang src \
        +              --target-lang tgt \
        +              --path <model_dir>/checkpoint_last.pt \
        +              --input /path/to/test.pg.src \
        +              --buffer-size 20 \
        +              --max-len-a 0 \
        +              --max-len-b 128 \
        +              --beam 2 \
        +              --skip-invalid-size-inputs-valid-test | tee generate.out
        +
        +grep "^H-" generate.out | cut -f 3- > generate.hyp
        +
        +./postprocess.py \
        +	--source <(awk 'NF<512' /path/to/test.pg.src) \
        +	--target generate.hyp \
        +	--target-out generate.hyp.processed
        +
        + +

        Now we have the final set of reports in “generate.hyp.processed”, with “<unk-N>” replaced by the original word from the input sequence.

        + +

        Model Deployment

        + +

        PyTorch offers great flexibility in modeling and a rich surrounding ecosystem. However, while several recent articles have suggested that the use of PyTorch in research and academia may be close to surpassing TensorFlow, there seems to be an overall sense of TensorFlow being the preferred platform for deployment to production. Is this still the case in 2021? Teams looking to serve their PyTorch models in production have a few options.

        + +

        Before describing our journey, let’s take a brief detour and define the term model.

        + +

        Models as computation graphs

        + +

        A few years back, it was still common for machine learning toolkits to support only particular classes of models of a rather fixed and rigid structure, with only a few degrees of freedom (like the kernel of a support vector machine or the number of hidden layers of a neural network). Inspired by foundational work in Theano, toolkits like Microsoft’s CNTK or Google’s TensorFlow were among the first to popularize a more flexible view on models, as computation graphs with associated parameters that can be estimated from data. This view blurred the boundaries between popular types of models (such as DNNs or SVMs), as it became easy to blend the characteristics of each into your type of graph. Still, such a graph had to be defined upfront before estimating its parameters, and it was pretty static. This made it easy to save models to a self-contained bundle, like a TensorFlow SavedModel (such a bundle simply contains the structure of the graph, as well as the concrete values of the estimated parameters). However, debugging such models can be difficult because the statements in the Python code that build the graph are logically separate from the lines that execute it. Researchers also long for easier ways of expressing dynamic behavior, such as the computation steps of the forward pass of a model being conditionally dependent on its input data (or its previous output).

        + +

        Most recently, the above limitations have led to a second revolution spearheaded by PyTorch and TensorFlow 2. The computation graph is no longer defined explicitly. Instead, it will be populated implicitly as the Python code executes operations on tensor arguments. An essential technique that powers this development is automatic differentiation. As the computation graph is being built implicitly while executing the steps of the forward pass, all the necessary data will be tracked for later computation of the gradient concerning the model parameters. This allows for great flexibility in training a model, but it raises an important question. If the computation happening inside a model is only implicitly defined through our Python code’s steps as it executes concrete data, what is it that we want to save as a model? The answer – at least initially – was the Python code with all its dependencies, along with the estimated parameters. This is undesirable for practical reasons. For instance, there is a danger that the team working on model deployment does not exactly reproduce the Python code dependencies used during training, leading to subtly divergent behavior. The solution typically consists of combining two techniques, scripting and tracing, that is, extra annotations in your Python code and execution of your code on exemplary input data, allowing PyTorch to define and save the graph that should be executed during later inference on new, unseen data. This requires some discipline by whoever creates the model code (arguably voiding some of the original flexibility of eager execution), but it results in a self-contained model bundle in TorchScript format. The solution in TensorFlow 2 is remarkably similar.
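        As a minimal illustration of scripting and tracing (a toy module, not our report generation model), the following sketch shows how data-dependent control flow survives scripting and how the result is saved as a self-contained TorchScript bundle:

        import torch
        import torch.nn as nn


        class ToyModel(nn.Module):
            def __init__(self):
                super().__init__()
                self.linear = nn.Linear(16, 2)

            def forward(self, x: torch.Tensor) -> torch.Tensor:
                # Data-dependent control flow: tracing would bake in a single branch,
                # whereas scripting preserves both.
                if x.sum() > 0:
                    return self.linear(x)
                return self.linear(-x)


        scripted = torch.jit.script(ToyModel())                          # scripting: compiles the code
        traced = torch.jit.trace(nn.Linear(16, 2), torch.randn(1, 16))   # tracing: records ops on example input

        scripted.save("toy_model.pt")                 # self-contained TorchScript bundle
        reloaded = torch.jit.load("toy_model.pt")     # no Python model code needed to load it back
        print(reloaded(torch.randn(1, 16)))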

        + +

        Serving our report generation models

        + +

        Our journey in deploying the report generation models reflects the above discussion. We started out serving our models by deploying the model code and its dependencies along with the parameter checkpoints in a custom Docker image exposing a gRPC service interface. However, we soon noticed that it became error-prone to replicate the exact code and environment used by the modeling team while estimating the parameters. Moreover, this approach prevented us from leveraging high-performance model serving frameworks like NVIDIA’s Triton, which is written in C++ and requires self-contained models that can be used without a Python interpreter. At this stage, we were facing a choice between attempting to export our PyTorch models to ONNX or TorchScript format. ONNX is an open specification for representing machine learning models that increasingly finds adoption. It is powered by a high-performance runtime developed by Microsoft (ONNX Runtime). While we were able to achieve performance acceleration for our TensorFlow BERT-based model using ONNX Runtime, at the time one of our PyTorch models required some operators that weren’t yet supported in ONNX. Rather than implement these using custom operators, we decided to look into TorchScript for the time being.

        + +

        A maturing ecosystem

        + +

        Is it all roses? No, it has been a rockier journey than we expected. We encountered what seems to be a memory leak in the MKL libraries used by PyTorch while serving the PyTorch code directly. We encountered deadlocks in trying to load multiple models from multiple threads. We had difficulties exporting our models to ONNX and TorchScript formats. Models would not work out of the box on hardware with multiple GPUs; they always accessed the particular GPU device on which they were exported. We encountered excessive memory usage in the Triton inference server while serving TorchScript models, which we found out was due to automatic differentiation accidentally being enabled during the forward pass. However, the ecosystem keeps improving, and there is a helpful and vibrant open-source community eager to work with us to mitigate such issues.

        + +

        Where to go from here? For those that require the flexibility of serving PyTorch code directly, without going through the extra step of exporting self-contained models, it is worth pointing out that the TorchServe project now provides a way of bundling the code together with parameter checkpoints into a single servable archive, greatly reducing the risk of code and parameters drifting apart. To us, however, exporting models to TorchScript has proven beneficial. It provides a clear interface between modeling and deployment teams, and TorchScript further reduces the latency when serving models on GPU via its just-in-time compilation engine.

        + +

        Scaling at large and the future

        + +

        Finally, efficient deployment to the cloud is about more than just computing the response of a single model instance efficiently. Flexibility is needed in managing, versioning and updating models. High-level scalability must be achieved via techniques such as load-balancing, horizontal scaling and vertical scaling. If many models are involved, scale-to-zero quickly becomes a topic as it is unacceptable to pay for serving models that do not answer any requests. Providing such extra functionality on top of a low-level inference server like Triton is the job of an orchestration framework. After gaining some first experience with KubeFlow, to that end, we decided to turn our attention to Azure ML, which provides similar functionality but integrates more deeply with the Azure platform, on which we crucially rely for large parts of our technology stack already. This part of our journey has just begun.

        + +

        Conclusion

        + +

        Academia has long recognized that we are “standing on the shoulders of giants.” As Artificial Intelligence is maturing from a scientific discipline into technology, the same spirit of collaboration that originally fueled its scientific foundation has carried over into the world of software engineering. Open-source enthusiasts join technology companies worldwide to build open software ecosystems that allow for new angles at solving some of the most pressing challenges of modern society. In this article, we’ve taken a look at Nuance’s Dragon Ambient eXperience, an AI-powered, voice-enabled solution that automatically documents patient care, reducing healthcare providers’ administrative burdens. Nuance DAX improves the patient-provider experience, reduces physician burnout, and improves financial outcomes. It brings back trust, joy, and humanity to the delivery of healthcare. Fairseq and PyTorch have proven to be an incredible platform for powering this AI technology, and in turn, Nuance has contributed back some of its innovations in this space. For further reading, we invite you to take a look at our recent ACL publication and the Nuance “What’s Next” blog.

        + +
diff --git a/blog/amd-extends-support-for-pt-ml/index.html b/blog/amd-extends-support-for-pt-ml/index.html
new file mode 100644
index 000000000000..eb2dc369678f
--- /dev/null
+++ b/blog/amd-extends-support-for-pt-ml/index.html
@@ -0,0 +1,697 @@
AMD Extends Support for PyTorch Machine Learning Development on Select RDNA™ 3 GPUs with ROCm™ 5.7 | PyTorch

        + by + + AMD + +

        +

        Researchers and developers working with Machine Learning (ML) models and algorithms using PyTorch can now use AMD ROCm 5.7 on Ubuntu® Linux® to tap into the parallel computing power of the Radeon™ RX 7900 XTX and the Radeon™ PRO W7900 graphics cards which are based on the AMD RDNA™ 3 GPU architecture.

        + +

        A client solution built on these two high-end GPUs enables a local, private, and cost-effective workflow for ML training and inference for those who previously relied on cloud-based solutions alone.

        + +

        ML Development on Desktop

        + +

        Accelerate Machine Learning With PyTorch On Your Desktop

        + +
          +
        • A local PC or workstation system running PyTorch with a Radeon 7900 series GPU presents a capable, yet affordable solution to address these growing workflow challenges thanks to large GPU memory sizes of 24GB and even 48GB.
        • +
        + +

        Unified Software Stack For The Desktop And The Datacenter

        + +
          +
        • The latest AMD ROCm 5.7 software stack for GPU programming unlocks the massively parallel compute power of these RDNA™ 3 architecture-based GPUs for use with PyTorch, one of the leading ML frameworks. The same unified software stack also supports the CDNA™ GPU architecture of the AMD Instinct™ MI series accelerators.
        • +
        + +

        Freedom To Customize

        + +
          +
        • The AMD ROCm platform is primarily Open-Source Software (OSS). It allows developers the freedom to customize and tailor their GPU software for their own needs while collaborating with a community of other developers, and helping each other find solutions in an agile, flexible, and rapid manner. The AMD ROCm platform’s goal is to allow users to maximize their GPU hardware investment. The AMD ROCm platform is designed to help develop, test, and deploy GPU accelerated HPC, AI, scientific computing, CAD, and other applications in a free, open source, integrated and secure software ecosystem.
        • +
        + +

        As the industry moves towards an ecosystem that supports a broad set of systems, frameworks and accelerators, AMD is determined to continue to make AI more accessible to PyTorch developers and researchers that benefit from a local client-based setup for ML development using RDNA™ 3 architecture-based desktop GPUs.

        + +

        Learn More

        + +

        https://www.amd.com/en/developer/resources/ml-radeon.html

        + +

        Download Software

        + +

        https://www.amd.com/en/support/linux-drivers

        + +

        Visit the Documentation Portal to get started training ML models on your local desktop

        + +

        https://rocm.docs.amd.com/projects/radeon/en/latest/

        + +

        Prerequisites

        + +

        https://rocm.docs.amd.com/projects/radeon/en/latest/docs/prerequisites.html

        + +

        How to Guide

        + +

        https://rocm.docs.amd.com/projects/radeon/en/latest/docs/install/howto.html

        + +

        © 2023 Advanced Micro Devices, Inc. All rights reserved. AMD, the AMD Arrow logo, CDNA, Radeon, ROCm, and combinations thereof are trademarks of Advanced Micro Devices, Inc. Linux® is the registered trademark of Linus Torvalds in the U.S. and other countries. Microsoft and Windows are registered trademarks of Microsoft Corporation in the US and/or other countries. PyTorch, the PyTorch logo and any related marks are trademarks of The Linux Foundation. TensorFlow, the TensorFlow logo and any related marks are trademarks of Google Inc. Ubuntu and the Ubuntu logo are registered trademarks of Canonical Ltd. Other product names used in this publication are for identification purposes only and may be trademarks of their respective owners.

        + +

        Radeon™ AI technology is compatible with all AMD Radeon 7000 Series graphics cards and newer. Please check with your system manufacturer for feature availability prior to purchase. GD-232.

        + +
          +
        1. Based on AMD internal measurements, November 2022, comparing the Radeon RX 7900 XTX at 2.5GHz boost clock with 96 CUs issuing 2X the Bfloat16 math operations per clocks vs. the RX 6900 XT GPU at 2.25 GHz boost clock and 80 CUs issue 1X the Bfloat16 math operations per clock. RX-821
        2. +
        + +
diff --git a/blog/amd-journey/index.html b/blog/amd-journey/index.html
new file mode 100644
index 000000000000..779a355a5760
--- /dev/null
+++ b/blog/amd-journey/index.html
@@ -0,0 +1,671 @@
AMD's Journey to Openness and Performance | PyTorch

        August 01, 2023

        +

        + AMD's Journey to Openness and Performance +

        +
        +
        + +
        +
        +
        + +
        +

        + by + + Team PyTorch + +

        +

        AMD has made progress in building a robust software stack that supports an open ecosystem of models, libraries, frameworks, and tools. With proven platforms gaining momentum, a leading software stack and an optimized ecosystem are key to achieving application performance. PyTorch is a key part of AMD’s AI journey, and AMD President Victor Peng and PyTorch founder Soumith Chintala discussed the latest progress at the DC & AI Keynote on June 12.

        + +

        Building a Powerful SW Stack with ROCm

        + +

        Victor introduced ROCm, AMD’s SW stack for Instinct Data Center GPUs. It offers a comprehensive set of open-source libraries, runtime, compilers, and tools for developing, running, and fine-tuning AI models. The fifth generation ROCm incorporates optimizations for AI and high-performance computing workloads, including tailored kernels for low-latency memory systems, support for new data types, and integration with OpenAI Triton. With tools for porting AI software to AMD Instinct platforms, ROCm ensures quality and robustness, tested extensively and compliant with PyTorch and TensorFlow frameworks.

        + +

        Collaboration with PyTorch

        + +

        To shed light on the partnership between AMD and PyTorch, Victor invited Soumith Chintala, the founder of PyTorch, to discuss the advancements and integration between the two. PyTorch, the industry’s most famous AI framework, boasts a vibrant developer community and is known for its continuous innovation and incorporation of cutting-edge research.

        + +

        To highlight the AMD and PyTorch partnership, Victor hosted a discussion with Soumith Chintala, the founder of PyTorch. PyTorch, renowned for its innovation and community, is the industry’s leading AI framework. The latest version, PyTorch 2.0, integrates with hardware-agnostic software compilers like OpenAI Triton, enabling efficient training and deployment of AI models. With optimized techniques, PyTorch 2.0 enhances productivity and offers remarkable speed improvements. The collaboration between AMD and the PyTorch Foundation ensures seamless utilization of AMD GPUs, expanding AI accelerator accessibility worldwide and paving the way for future optimizations and broader hardware support.

        + +

        Empowering the Developer Community

        + +

        The partnership between AMD and PyTorch benefits the developer community by democratizing access to AI accelerators. Support for AMD GPUs in PyTorch allows developers to train and deploy models across various platforms, including CPUs like EPYC and Ryzen, GPUs like Instinct and Radeon, and embedded devices like Versal SoCs. By ensuring immediate compatibility of new models on AMD platforms, the collaboration streamlines the development process and empowers developers to leverage the full potential of AMD’s hardware. This increased accessibility and flexibility enable developers worldwide to push the boundaries of AI innovation.

        + +

        Hugging Face and AI Model Innovation

        + +

Victor praised Hugging Face as the leading force behind open-source AI model innovation, whose transformer models power much of today’s generative AI. AMD’s optimized software enables a high-performing development stack, supporting groundbreaking AI advancements for customers and developers through scalable real-world deployments.

        + +

        Conclusion

        + +

        At the DC & AI Keynote, AMD demonstrated its dedication to openness, performance, and collaboration. The ROCm SW stack, PyTorch integration, and support for Hugging Face exemplify AMD’s commitment to empowering developers and researchers to achieve AI breakthroughs. By offering accessible, high-performing solutions, AMD fuels the future of AI as a leading GPU platform integrated with PyTorch.

        + +

To listen to the full keynote, visit the AMD YouTube channel.

        + +

        To listen to Soumith Chintala’s section of the keynote

        + +
        +
        +
        +
        + + +
        +
        +
        +
        +

        + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/announcing-cpp/index.html b/blog/announcing-cpp/index.html new file mode 100644 index 000000000000..65b118483f13 --- /dev/null +++ b/blog/announcing-cpp/index.html @@ -0,0 +1,773 @@ + + + + + + + + + + + + + Announcing CPP-based S3 IO DataPipes | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
        +
        +
        +
        + +
        +
        + + +
        + + + + + + + + +
        + +
        +
        + +
        +
        +

        July 25, 2023

        +

        + Announcing CPP-based S3 IO DataPipes +

        +
        +
        + +
        +
        +
        + +
        +

        + by + + John He, Khaled ElGalaind, Roshani Nagmote, Daiming Yang + +

        +

        Training large deep learning models requires large datasets. Amazon Simple Storage Service (Amazon S3) is a scalable cloud object store service used for storing large training datasets. Machine learning (ML) practitioners need an efficient data pipe that can download data from Amazon S3, transform the data, and feed the data to GPUs for training models with high throughput and low latency.

        + +

        In this post, we introduce the new S3 IO DataPipes for PyTorch, S3FileLister and S3FileLoader. For memory efficiency and fast runs, the new DataPipes use the C++ extension to access Amazon S3. Benchmarking shows that S3FileLoader is 59.8% faster than FSSpecFileOpener for downloading a natural language processing (NLP) dataset from Amazon S3. You can build IterDataPipe training pipelines with the new DataPipes. We also demonstrate that the new DataPipe can reduce overall Bert and ResNet50 training time by 7%. The new DataPipes have been upstreamed to the open-source TorchData 0.4.0 with PyTorch 1.12.0.

        + +

        Overview

        + +

        Amazon S3 is a scalable cloud storage service with no limit on data volume. Loading data from Amazon S3 and feeding the data to high-performance GPUs such as NVIDIA A100 can be challenging. It requires an efficient data pipeline that can meet the data processing speed of GPUs. To help with this, we released a new high performance tool for PyTorch: S3 IO DataPipes. DataPipes are subclassed from torchdata.datapipes.iter.IterDataPipe, so they can interact with the IterableDataPipe interface. Developers can quickly build their DataPipe DAGs to access, transform, and manipulate data with shuffle, sharding, and batch features.
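
For example, a minimal sketch of such a DataPipe DAG might look like the following; the file names here are placeholders, and the same chaining style applies to the S3 DataPipes introduced below.

from torchdata.datapipes.iter import IterableWrapper

# Placeholder items; in practice these could be file paths or S3 URLs.
dp = IterableWrapper(["sample_0.txt", "sample_1.txt", "sample_2.txt", "sample_3.txt"])
dp = dp.shuffle()            # shuffle item order each epoch
dp = dp.sharding_filter()    # keep only this worker's shard when using multiple workers
dp = dp.batch(batch_size=2)  # group items into lists of two

for batch in dp:
    print(batch)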

        + +

The new DataPipes are designed to be file format agnostic; Amazon S3 data is downloaded as binary large objects (BLOBs). They can be used as composable building blocks to assemble a DataPipe graph that loads tabular, NLP, and computer vision (CV) data into your training pipelines.

        + +

        Under the hood, the new S3 IO DataPipes employ a C++ S3 handler with the AWS C++ SDK. In general, a C++ implementation is more memory efficient and has better CPU core usage (no Global Interpreter Lock) in threading compared to Python. The new C++ S3 IO DataPipes are recommended for high throughput, low latency data loading in training large deep learning models.

        + +

        The new S3 IO DataPipes provide two first-class citizen APIs:

        +
          +
        • S3FileLister – Iterable that lists S3 file URLs within the given S3 prefixes. The functional name for this API is list_files_by_s3.
        • +
        • S3FileLoader – Iterable that loads S3 files from the given S3 prefixes. The functional name for this API is load_files_by_s3.
        • +
        + +

        Usage

        + +

        In this section, we provide instructions for using the new S3 IO DataPipes. We also provide a code snippet for load_files_by_s3().

        + +

        Build from source

        +

The new S3 IO DataPipes use the C++ extension. It is built into the torchdata package by default. However, if the new DataPipes are not available in your environment, for example on Windows with Conda, you need to build them from source. For more information, refer to Iterable Datapipes.

        + +

        Configuration

        +

        Amazon S3 supports global buckets. However, a bucket is created within a Region. You can pass a Region to the DataPipes by using __init__(). Alternatively, you can either export AWS_REGION=us-west-2 into your shell or set an environment variable with os.environ['AWS_REGION'] = 'us-east-1' in your code.
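
As a minimal sketch of these options (assuming the DataPipes expose the Region through a region keyword argument, per the __init__() note above; bucket and prefix names are placeholders):

import os
from torchdata.datapipes.iter import IterableWrapper

# Option 1: pass the Region explicitly to the DataPipes (assumed `region` keyword).
urls = IterableWrapper(["s3://my-bucket/prefix/"]).list_files_by_s3(region="us-west-2")
files = urls.load_files_by_s3(region="us-west-2")

# Option 2: export AWS_REGION=us-west-2 in your shell before launching training.

# Option 3: set the environment variable from Python before the DataPipes are created.
os.environ["AWS_REGION"] = "us-east-1"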

        + +

        To read objects in a bucket that aren’t publicly accessible, you must provide AWS credentials through one of the following methods:

        + + + +

        Example code

        +

        The following code snippet provides a typical usage of load_files_by_s3():

        + +
from torch.utils.data import DataLoader
+from torchdata.datapipes.iter import IterableWrapper
+
+s3_shard_urls = IterableWrapper(["s3://bucket/prefix/",]).list_files_by_s3()
+s3_shards = s3_shard_urls.load_files_by_s3()
+
+# text data
+training_data = s3_shards.readlines(return_path=False)
+data_loader = DataLoader(
+    training_data,
+    batch_size=batch_size,
+    num_workers=num_workers,
+)
+
+# training loop
+for epoch in range(epochs):
+    # training step
+    for batch_data in data_loader:
+        # forward pass, backward pass, model update
+        ...
        +
        + +

        Benchmark

        + +

        In this section, we demonstrate how the new DataPipe can reduce overall Bert and ResNet50 training time.

        + +

        Isolated DataLoader performance evaluation against FSSpec

        + +

        FSSpecFileOpener is another PyTorch S3 DataPipe. It uses botocore and aiohttp/asyncio to access S3 data. The following is the performance test setup and result (quoted from Performance Comparison between native AWSSDK and FSSpec (boto3) based DataPipes).
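
For reference, the two pipelines being compared can be constructed in much the same way. The following is a rough sketch (the bucket and prefix are placeholders, and the fsspec route additionally requires the s3fs package to be installed):

from torchdata.datapipes.iter import IterableWrapper

# fsspec-based pipeline (FSSpecFileLister / FSSpecFileOpener under the hood).
fsspec_files = (
    IterableWrapper(["s3://bucket/prefix/"])
    .list_files_by_fsspec()
    .open_files_by_fsspec(mode="rb")
)

# C++ AWS SDK-based pipeline introduced in this post.
s3_files = (
    IterableWrapper(["s3://bucket/prefix/"])
    .list_files_by_s3()
    .load_files_by_s3()
)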

        + +

        The S3 data in the test is a sharded text dataset. Each shard has about 100,000 lines and each line is around 1.6 KB, making each shard about 156 MB. The measurements in this benchmark are averaged over 1,000 batches. No shuffling, sampling, or transforms were performed.

        + +

The following chart reports the throughput comparison for various batch sizes with num_workers=0, where the data loader runs in the main process. S3FileLoader has higher queries per second (QPS); it is 90% higher than fsspec at batch size 512.

        + +

        Batch Sizes 1

        + +

The following chart reports the results for num_workers=4, where data loading uses four worker processes. S3FileLoader’s QPS is 59.8% higher than fsspec’s at batch size 512.

        + +

        Batch Sizes 2

        + +

        Training ResNet50 Model against Boto3

        +

For the following chart, we trained a ResNet50 model on a cluster of 4 p3.16xlarge instances with a total of 32 GPUs. The training dataset is ImageNet with 1.2 million images organized into 1,000-image shards. The training batch size is 64. The training time is measured in seconds. For eight epochs, S3FileLoader is 7.5% faster than Boto3.

        + +

        Boto3

        + +

        Training a Bert model against Boto3

        +

For the following chart, we trained a Bert model on a cluster of 4 p3.16xlarge instances with a total of 32 GPUs. The training corpus has 1474 files. Each file has around 150,000 samples. To run a shorter epoch, we use 0.05% (approximately 75 samples) per file. The batch size is 2,048. The training time is measured in seconds. For one epoch, S3FileLoader is 7% faster than Boto3.

        + +

        Boto3 2

        + +

        Comparison against the original PyTorch S3 plugin

        +

        The new PyTorch S3 DataPipes perform substantially better than the original PyTorch S3 plugin. We have tuned the internal buffer size for S3FileLoader. The loading time is measured in seconds.

        + +

        For the 10 sharded charades files (approximately 1.5 GiB each), S3FileLoader was 3.5 times faster in our experiments.

        + +

        Best practices

        +

Training large deep learning models may require a massive compute cluster with tens or even hundreds of nodes. Each node in the cluster may generate a large number of data loading requests that hit a specific S3 shard. To avoid throttling, we recommend sharding training data across S3 buckets and S3 folders.

        + +

        Best Practices

        + +

To achieve good performance, it helps to have files that are large enough to parallelize reads within a given file, but not so large that you hit the per-object throughput limits on Amazon S3, depending on the training job. The optimal size is typically between 50 MB and 200 MB.

        + +

        Conclusion and next steps

        + +

        In this post, we introduced you to the new PyTorch IO DataPipes. The new DataPipes use aws-sdk-cpp and show better performance against Boto3-based data loaders.

        + +

        For next steps, we plan to improve on usability, performance, and functionality by focusing on the following features:

        + +
          +
        • S3 authorization with IAM roles – Currently, the S3 DataPipes support explicit access credentials, instance profiles, and S3 bucket policies. However, there are use cases where IAM roles are preferred.
        • +
        • Double buffering – We plan to offer double buffering to support multi-worker downloading.
        • +
        • Local caching – We plan on making model training able to traverse the training dataset for multiple passes. Local caching after the first epoch can cut out time of flight delays from Amazon S3, which can substantially accelerate data retrieval time for subsequent epochs.
        • +
        • Customizable configuration – We plan to expose more parameters such as internal buffer size, multi-part chunk size, and executor count and allow users to further tune data loading efficiency.
        • +
        • Amazon S3 upload – We plan to expand the S3 DataPipes to support upload for checkpointing.
        • +
        • Merge with fsspecfsspec is used in other systems such as torch.save(). We can integrate the new S3 DataPipes with fsspec so they can have more use cases.
        • +
        + +

        Acknowledgement

        + +

We would like to thank Vijay Rajakumar and Kiuk Chung from Amazon for providing their guidance for S3 Common RunTime and PyTorch DataLoader. We also want to thank Erjia Guan, Kevin Tse, Vitaly Fedyunin, Mark Saroufim, Hamid Shojanazeri, Matthias Reso, and Geeta Chauhan from Meta AI/ML, and Joe Evans from AWS for reviewing the blog and the GitHub PRs.

        + +

        References

        + + + +
        +
        +
        +
        + + +
        +
        +
        +
        +

        + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/announcing-docathon-h2-2023/index.html b/blog/announcing-docathon-h2-2023/index.html new file mode 100644 index 000000000000..ba35447948ad --- /dev/null +++ b/blog/announcing-docathon-h2-2023/index.html @@ -0,0 +1,670 @@ + + + + + + + + + + + + + Announcing PyTorch Docathon H2 2023 | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
        +
        +
        +
        + +
        +
        + + +
        + + + + + + + + +
        + +
        +
        + +
        +
        +

        October 02, 2023

        +

        + Announcing PyTorch Docathon H2 2023 +

        +
        +
        + +
        +
        +
        + +
        +

        + by + + Team PyTorch + +

        +

        We are excited to announce that we will be holding a Docathon for PyTorch on November 1, 2023! This event is an opportunity for our community to come together and improve the quality of our documentation.

        + +

        During the Docathon, we will focus on updating and improving existing content, as well as adding new tutorials and docstrings. We encourage all members of the community to participate and contribute their expertise to make our documentation even better. This is a great opportunity to learn and collaborate together.

        + +

        Check out our previous docathon success story here.

        + +

        Why Participate

        + +

        One of the best things about the Docathon is that you can make a tangible, positive impact on the quality of documentation in real time. This collaborative event brings together diverse team members from various companies, backgrounds, and roles, united to work towards a common goal. This event not only fosters team building and knowledge sharing but also presents an opportunity for individuals to acquire new skills, such as writing, editing, and utilizing documentation tools. Participating in a docathon can be particularly beneficial for team members who may lack experience in these areas.

        + +

        And of course all participants will be recognized for their contributions. Top participants will receive special awards.

        + +

        Event Details

        + +
          +
        • Nov 1: Kick-off
        • +
        • Nov 1- Nov 12: Submissions and Feedback
        • +
        • Nov 13 - Nov 15: Final Reviews
        • +
        • Nov 15: Winner Announcements
        • +
        + +

        Details for the Docathon to be announced at the kick-off call on November 1.

        + +

        To participate in the Docathon and receive updates about the event, register here: RSVP

        + +

        We are excited to see the improvements that will come out of this Docathon, and we look forward to your participation!

        + +
        +
        +
        +
        + + +
        +
        +
        +
        +

        + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/announcing-docathon/index.html b/blog/announcing-docathon/index.html new file mode 100644 index 000000000000..e397a4bd0e3f --- /dev/null +++ b/blog/announcing-docathon/index.html @@ -0,0 +1,678 @@ + + + + + + + + + + + + + Announcing PyTorch Docathon 2023 | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
        +
        +
        +
        + +
        +
        + + +
        + + + + + + + + +
        + +
        +
        + +
        +
        +

        May 03, 2023

        +

        + Announcing PyTorch Docathon 2023 +

        +
        +
        + +
        +
        +
        + +
        +

        + by + + Team PyTorch + +

        +

        PyTorch Docathon

        + +

        We are excited to announce the first ever PyTorch Docathon! The Docathon is a hackathon-style event focused on improving the documentation by enlisting the help of the community. Documentation is a crucial aspect of any technology and by improving the documentation, we can make it easier for users to get started with PyTorch, help them understand how to use its features effectively, and ultimately accelerate research to production in the field of machine learning.

        + +

        WHY PARTICIPATE

        + +

        Low Barrier to Entry

        + +

Many open-source projects require extensive knowledge of the codebase and prior contributions to the project to participate in any sort of hackathon event. The Docathon, on the other hand, is designed for newcomers. We do expect familiarity with Python and basic knowledge of PyTorch and ML. But don’t fret: some tasks are related to website issues and won’t require even that.

        + +

        Tangible Results

        + +

        One of the best things about the Docathon is that you can see the results of your efforts in real time. Improving documentation can have a huge impact on a project’s usability and accessibility and you’ll be able to see those improvements firsthand. Plus having tangible results can be a great motivator to keep contributing.

        + +

        Collaborative Environment

        + +

        The Docathon is a collaborative event which means you’ll have the opportunity to work with other contributors and PyTorch maintainers on improving the documentation. This can be a great way to learn from others, share ideas, and build connections.

        + +

        Learning Opportunities

        + +

        Finally, even if you are not an expert in PyTorch, the Docathon can be a great learning experience. You’ll have the opportunity to explore the PyTorch modules and test some of the tutorials on your machine as well as in the CI.

        + +

        EVENT DETAILS

        + +
          +
        • May 31: Kick-off
        • +
        • May 31 - June 11: Submissions and Feedback
        • +
        • June 12 - June 13: Final Reviews
        • +
        • June 15: Winner Announcements
        • +
        + +

        Details for the Docathon to be announced at the kick-off stream on May 31.

        + +

        Please register to join this year’s event: RSVP

        + +
        +
        +
        +
        + + +
        +
        +
        +
        +

        + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/announcing-pytorch-conference-2022/index.html b/blog/announcing-pytorch-conference-2022/index.html new file mode 100644 index 000000000000..8e10000ed75b --- /dev/null +++ b/blog/announcing-pytorch-conference-2022/index.html @@ -0,0 +1,685 @@ + + + + + + + + + + + + + Announcing PyTorch Conference 2022 | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
        +
        +
        +
        + +
        +
        + + +
        + + + + + + + + +
        + +
        +
        + +
        +
        +

        September 26, 2022

        +

        + Announcing PyTorch Conference 2022 +

        +
        +
        + +
        +
        +
        + +
        +

        + by + + Team PyTorch + +

        +

We are excited to announce that the PyTorch Conference returns in-person as a satellite event to NeurIPS (Neural Information Processing Systems) in New Orleans on Dec. 2nd.

        + +

        + +

        + +

We changed the name from PyTorch Developer Day to PyTorch Conference to signify the turning of a new chapter as we look to the future of PyTorch, encompassing the entire PyTorch community. This conference will bring together leading researchers, academics, and developers from the Machine Learning (ML) and Deep Learning (DL) communities for a series of talks and a poster session covering new PyTorch software releases, use cases in academia and industry, and ML/DL development and production trends.

        + +

        EVENT OVERVIEW

        + +

        When: Dec 2nd, 2022 (In-Person and Virtual)

        + +

        Where: New Orleans, Louisiana (USA) | Virtual option as well

        + +

        SCHEDULE

        + +

All times are in Central Standard Time.

        + +

        8:00-9:00 am   Registration/Check in

        + +

        9:00-11:20 am   Keynote & Technical Talks

        + +

        11:30-1:00 pm   Lunch

        + +

        1:00-3:00 pm   Poster Session & Breakouts

        + +

        3:00-4:00 pm   Community/Partner Talks

        + +

        4:00-5:00 pm   Panel Discussion

        + +

        Agenda subject to change.

        + +

        All talks will be livestreamed and available to the public. The in-person event will be by invitation only as space is limited. If you’d like to apply to attend in person, please submit all requests here.

        + + + + + +
        +
        +
        +
        + + +
        +
        +
        +
        +

        + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/announcing-pytorch-enterprise/index.html b/blog/announcing-pytorch-enterprise/index.html new file mode 100644 index 000000000000..35fb69f65264 --- /dev/null +++ b/blog/announcing-pytorch-enterprise/index.html @@ -0,0 +1,665 @@ + + + + + + + + + + + + + Announcing the PyTorch Enterprise Support Program | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
        +
        +
        +
        + +
        +
        + + +
        + + + + + + + + +
        + +
        +
        + + + +
        +
        +
        + +
        +

        + by + + Team PyTorch + +

        +

        Today, we are excited to announce the PyTorch Enterprise Support Program, a participatory program that enables service providers to develop and offer tailored enterprise-grade support to their customers. This new offering, built in collaboration between Facebook and Microsoft, was created in direct response to feedback from PyTorch enterprise users who are developing models in production at scale for mission-critical applications.

        + +

        The PyTorch Enterprise Support Program is available to any service provider. It is designed to mutually benefit all program Participants by sharing and improving PyTorch long-term support (LTS), including contributions of hotfixes and other improvements found while working closely with customers and on their systems.

        + +

        To benefit the open source community, all hotfixes developed by Participants will be tested and fed back to the LTS releases of PyTorch regularly through PyTorch’s standard pull request process. To participate in the program, a service provider must apply and meet a set of program terms and certification requirements. Once accepted, the service provider becomes a program Participant and can offer a packaged PyTorch Enterprise support service with LTS, prioritized troubleshooting, useful integrations, and more.

        + +
        + +
        + +

        As one of the founding members and an inaugural member of the PyTorch Enterprise Support Program, Microsoft is launching PyTorch Enterprise on Microsoft Azure to deliver a reliable production experience for PyTorch users. Microsoft will support each PyTorch release for as long as it is current. In addition, it will support selected releases for two years, enabling a stable production experience. Microsoft Premier and Unified Support customers can access prioritized troubleshooting for hotfixes, bugs, and security patches at no additional cost. Microsoft will extensively test PyTorch releases for performance regression. The latest release of PyTorch will be integrated with Azure Machine Learning and other PyTorch add-ons including ONNX Runtime for faster inference.

        + +

        PyTorch Enterprise on Microsoft Azure not only benefits its customers, but also the PyTorch community users. All improvements will be tested and fed back to the future release for PyTorch so everyone in the community can use them.

        + +

As an organization or PyTorch user, the standard way of researching and deploying with different release versions of PyTorch does not change. If your organization is looking for managed long-term support, prioritized patches, bug fixes, and additional enterprise-grade support, then you should reach out to service providers participating in the program.

        + +

        To learn more and participate in the program as a service provider, visit the PyTorch Enterprise Support Program. If you want to learn more about Microsoft’s offering, visit PyTorch Enterprise on Microsoft Azure.

        + +

        Thank you,

        + +

        Team PyTorch

        + +
        +
        +
        +
        + + +
        +
        +
        +
        +

        + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/announcing-the-winners-of-the-2020-global-pytorch-summer-hackathon/index.html b/blog/announcing-the-winners-of-the-2020-global-pytorch-summer-hackathon/index.html new file mode 100644 index 000000000000..7ed87d705b40 --- /dev/null +++ b/blog/announcing-the-winners-of-the-2020-global-pytorch-summer-hackathon/index.html @@ -0,0 +1,752 @@ + + + + + + + + + + + + + Announcing the Winners of the 2020 Global PyTorch Summer Hackathon | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
        +
        +
        +
        + +
        +
        + + +
        + + + + + + + + +
        + +
        +
        + + + +
        +
        +
        + +
        +

        + by + + Team PyTorch + +

        +

        More than 2,500 participants in this year’s Global PyTorch Summer Hackathon pushed the envelope to create unique new tools and applications for PyTorch developers and researchers.

        + +
        + +
        + +

        Notice: None of the projects submitted to the hackathon are associated with or offered by Facebook, Inc.

        + +

        This year’s projects fell into three categories:

        + +
          +
        • +

          PyTorch Developer Tools: a tool or library for improving productivity and efficiency for PyTorch researchers and developers.

          +
        • +
        • +

          Web/Mobile Applications Powered by PyTorch: a web or mobile interface and/or an embedded device built using PyTorch.

          +
        • +
        • +

          PyTorch Responsible AI Development Tools: a tool, library, or web/mobile app to support researchers and developers in creating responsible AI that factors in fairness, security, privacy, and more throughout its entire development process.

          +
        • +
        + +

The virtual hackathon ran from June 22 to August 25, with more than 2,500 registered participants representing 114 countries, from the Republic of Azerbaijan to Zimbabwe to Japan, submitting a total of 106 projects. Entrants were judged on their idea’s quality, originality, potential impact, and how well they implemented it.

        + +

        Meet the winners of each category below.

        + +

        PyTorch Developer Tools

        + +

        1st place - DeMask

        + +

        DeMask is an end-to-end model for enhancing speech while wearing face masks — offering a clear benefit during times when face masks are mandatory in many spaces and for workers who wear face masks on the job. Built with Asteroid, a PyTorch-based audio source separation toolkit, DeMask is trained to recognize distortions in speech created by the muffling from face masks and to adjust the speech to make it sound clearer.

        + +

        This submission stood out in particular because it represents both a high-quality idea and an implementation that can be reproduced by other researchers.

        + +

Here is an example of how to train a speech separation model in less than 20 lines:

        + +
        from torch import optim
        +from pytorch_lightning import Trainer
        +
        +from asteroid import ConvTasNet
        +from asteroid.losses import PITLossWrapper
        +from asteroid.data import LibriMix
        +from asteroid.engine import System
        +
        +train_loader, val_loader = LibriMix.loaders_from_mini(task='sep_clean', batch_size=4)
        +model = ConvTasNet(n_src=2)
        +optimizer = optim.Adam(model.parameters(), lr=1e-3)
        +loss = PITLossWrapper(
        +    lambda x, y: (x - y).pow(2).mean(-1),  # MSE
        +    pit_from="pw_pt",  # Point in the pairwise matrix.
        +)
        +
        +system = System(model, optimizer, loss, train_loader, val_loader)
        +
        +trainer = Trainer(fast_dev_run=True)
        +trainer.fit(system)
        +
        + +

        2nd place - carefree-learn

        + +

A PyTorch-based automated machine learning (AutoML) solution, carefree-learn provides high-level APIs to make training models on tabular data sets simpler. It features an interface similar to scikit-learn and functions as an end-to-end pipeline for tabular data sets. It automatically detects feature column types and redundant feature columns, imputes missing values, encodes string columns and categorical columns, and preprocesses numerical columns, among other features.

        + +

        3rd Place - TorchExpo

        + +

TorchExpo is a collection of models and extensions that simplifies taking PyTorch from research to production on mobile devices. It is more than a web and mobile application; it also comes with a Python library, available via pip install, that helps researchers convert a state-of-the-art model to TorchScript and ONNX formats in just one line.

        + +

        Web/Mobile Applications Powered by PyTorch

        + +

        1st place - Q&Aid

        + +

        Q&Aid is a conceptual health-care chatbot aimed at making health-care diagnoses and facilitating communication between patients and doctors. It relies on a series of machine learning models to filter, label, and answer medical questions, based on a medical image and/or questions in text provided by a patient. The transcripts from the chat app then can be forwarded to the local hospitals and the patient will be contacted by one of them to make an appointment to determine proper diagnosis and care. The team hopes that this concept application helps hospitals to work with patients more efficiently and provide proper care.

        + +
        + +
        + +

        2nd place - Rasoee

        + +

Rasoee is an application that can take images as input and output the name of the dish. It also lists the ingredients and recipe, along with the link to the original recipe online. Additionally, users can choose a cuisine from the drop-down menu and describe the taste and/or method of preparation in text. The application will then return matching dishes from a list of 308 identifiable dishes. The team put a significant amount of effort into gathering and cleaning various datasets to build more accurate and comprehensive models. You can check out the application here.

        + +

        3rd place - Rexana the Robot — PyTorch

        + +

        Rexana is an AI voice assistant meant to lay the foundation for a physical robot that can complete basic tasks around the house. The system is capable of autonomous navigation (knowing its position around the house relative to landmarks), recognizing voice commands, and object detection and recognition — meaning it can be commanded to perform various household tasks (e.g., “Rexana, water the potted plant in the lounge room.”). Rexana can be controlled remotely via a mobile device, and the robot itself features customizable hands (magnets, grippers, etc.) for taking on different jobs.

        + +

        PyTorch Responsible AI Development Tools

        + +

        1st place: FairTorch

        + +

        FairTorch is a fairness library for PyTorch. It lets developers add constraints to their models to equalize metrics across subgroups by simply adding a few lines of code. Model builders can choose a metric definition of fairness for their context, and enforce it at time of training. The library offers a suite of metrics that measure an AI system’s performance among subgroups, and can apply to high-stakes examples where decision-making algorithms are deployed, such as hiring, school admissions, and banking.

        + + + +

        2nd place: Fluence

        + +

        Fluence is a PyTorch-based deep learning library for language research. It specifically addresses the large compute demands of natural language processing (NLP) research. Fluence aims to provide low-resource and computationally efficient algorithms for NLP, giving researchers algorithms that can enhance current NLP methods or help discover where current methods fall short.

        + +

        3rd place: Causing: CAUSal INterpretation using Graphs

        + +

        Causing (CAUSal INterpretation using Graphs) is a multivariate graphic analysis tool for bringing transparency to neural networks. It explains causality and helps researchers and developers interpret the causal effects of a given equation system to ensure fairness. Developers can input data and a model describing the dependencies between the variables within the data set into Causing, and Causing will output a colored graph of quantified effects acting between the model’s variables. In addition, it also allows developers to estimate these effects to validate whether data fits a model.

        + +

        Thank you,

        + +

        The PyTorch team

        + + +
        +
        +
        +
        + + +
        +
        +
        +
        +

        + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/announcing-the-winners-of-the-2021-pytorch-annual-hackathon/index.html b/blog/announcing-the-winners-of-the-2021-pytorch-annual-hackathon/index.html new file mode 100644 index 000000000000..ddf0e89bd140 --- /dev/null +++ b/blog/announcing-the-winners-of-the-2021-pytorch-annual-hackathon/index.html @@ -0,0 +1,719 @@ + + + + + + + + + + + + + Announcing the Winners of the 2021 PyTorch Annual Hackathon | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
        +
        +
        +
        + +
        +
        + + +
        + + + + + + + + +
        + +
        +
        + + + +
        +
        +
        + +
        +

        + by + + Team PyTorch + +

        +

        More than 1,900 people worked hard in this year’s PyTorch Annual Hackathon to create unique tools and applications for PyTorch developers and researchers.

        + +

        Notice: None of the projects submitted to the hackathon are associated with or offered by Meta Platforms, Inc.

        + +
        + +
        + +

This year, participants could enter their projects into the following three categories:

        +
          +
        • PyTorch Developer Tools: a tool or library for improving productivity and efficiency for PyTorch researchers and developers.
        • +
        • Web and Mobile Applications Powered by PyTorch: a web or mobile interface and/or an embedded device built using PyTorch.
        • +
        • PyTorch Responsible AI Development Tools: a tool, library, or web/mobile app to support researchers and developers in creating responsible AI that factors in fairness, security, privacy, and more throughout its entire development process.
        • +
        + +

        The virtual hackathon ran from September 8 through November 2, 2021, with more than 1,900 registered participants from 110 countries, submitting a total of 65 projects. Entrants were judged on their idea’s quality, originality, potential impact, and how well they implemented it. All projects can be viewed here.

        + +

        Meet the winners of each category below!

        + +

        PYTORCH DEVELOPER TOOLS

        + +

        First Place: RaNNC

        +

RaNNC is middleware that automates hybrid model/data parallelism for training very large-scale neural networks, capable of training 100-billion-parameter models without any manual tuning.

        + +

        Second Place: XiTorch

        +

        XiTorch provides first and higher order gradients of functional routines, such as optimization, rootfinder, and ODE solver. It also contains operations for implicit linear operators (e.g. large matrix that is expressed only by its matrix-vector multiplication) such as symmetric eigen-decomposition, linear solve, and singular value decomposition.

        + +

        Third Place: TorchLiberator

        +

        TorchLiberator automates model surgery, finding the maximum correspondence between weights in two networks.

        + +

        Honorable Mentions

        +
          +
        • PADL manages your entire PyTorch work flow with a single python abstraction and a beautiful functional API, so there’s no more complex configuration or juggling preprocessing, postprocessing and forward passes.
        • +
        • PyTree is a PyTorch package for recursive neural networks that provides highly generic recursive neural network implementations as well as efficient batching methods.
        • +
        • IndicLP makes it easier for developers and researchers to build applications and models in Indian Languages, thus making NLP a more diverse field.
        • +
        + +

        WEB/MOBILE APPLICATIONS POWERED BY PYTORCH

        + +

        First Place: PyTorch Driving Guardian

        +

        PyTorch Driving Guardian is a tool that monitors driver alertness, emotional state, and potential blind spots on the road.

        + +

        Second Place: Kronia

        +

        Kronia is an Android mobile app built to maximize the harvest outputs for farmers.

        + +

        Third Place: Heyoh camera for Mac

        +

        Heyoh is a Mac virtual camera for Zoom and Meets that augments live video by recognizing hand gestures and smiles and shows animated effects to other video participants.

        + +

        Honorable Mentions

        +
          +
        • Mamma AI is a tool that helps doctors with the breast cancer identification process by identifying areas likely to have cancer using ultrasonic and x-ray images.
        • +
        • AgingClock is a tool that predicts biological age first with methylation genome data, then blood test data and eventually with multimodal omics and lifestyle data.
        • +
• Iris is an open source photo platform, an alternative to Google Photos, with features such as listing photos, detecting categories, detecting and classifying faces in photos, and detecting and clustering photos by location and by the things they contain.
        • +
        + +

        PYTORCH RESPONSIBLE AI DEVELOPMENT TOOLS

        + +

        First Place: FairWell

        +

        FairWell aims to address model bias on specific groups of people by allowing data scientists to evaluate their dataset and model predictions and take steps to make their datasets more inclusive and their models less biased.

        + +

        Second Place: promp2slip

        +

        Promp2slip is a library that tests the ethics of language models by using natural adversarial texts.

        + +

        Third Place: Phorch

        +

        Phorch adversarially attacks the data using FIGA (Feature Importance Guided Attack) and creates 3 different attack sets of data based on certain parameters. These features are utilized to implement adversarial training as a defense against FIGA using neural net architecture in PyTorch.

        + +

        Honorable Mentions

        +
          +
• Greenops helps measure the footprint of deep learning models during training, testing, and evaluation to reduce energy consumption and carbon footprints.
        • +
        • Xaitk-saliency is an open-source, explainable AI toolkit for visual saliency algorithm interfaces and implementations, built for analytic and autonomy applications.
        • +
        + +

        Thank you,

        + +

        Team PyTorch

        + +
        +
        +
        +
        + + +
        +
        +
        +
        +

        + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/arm-joins-pytorch/index.html b/blog/arm-joins-pytorch/index.html new file mode 100644 index 000000000000..7d8d86c3a92e --- /dev/null +++ b/blog/arm-joins-pytorch/index.html @@ -0,0 +1,667 @@ + + + + + + + + + + + + + Arm Joins the PyTorch Foundation as a Premier Member | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
        +
        +
        +
        + +
        +
        + + +
        + + + + + + + + +
        + +
        +
        + +
        + +
        + +
        +
        +
        + +
        +

        + by + + The PyTorch Foundation + +

        +

        The PyTorch Foundation, a neutral home for the deep learning community to collaborate on the open source PyTorch framework and ecosystem, is announcing today that Arm has joined as a premier member.

        + +

        Arm designs a high-performance, power-efficient compute platform with unmatched scalability, supporting a vast ecosystem of developers deploying AI at the edge and in the cloud, ranging from the Arm instances offered by all major cloud service providers to smartphones, laptops, software-defined vehicles and more.

        + +

        “Our continued investments in software are accelerating development and AI performance for over 20 million software developers, ensuring they can develop for Arm, on Arm,” said Alex Spinelli, VP Developer Technology at Arm. “PyTorch is a pivotal framework in advancing AI research and development. This membership demonstrates our strong commitment to open source - ensuring PyTorch just works on Arm and can leverage seamless acceleration for the most demanding AI models, now and in the future.”

        + +

        Last year at the PyTorch Conference, Arm partnered with Apple, Meta and Qualcomm to release ExecuTorch, an end-to-end solution for enabling on-device inference capabilities across mobile and edge devices including wearables, embedded devices and microcontrollers.

        + +

        “We’re thrilled to welcome Arm to the PyTorch Foundation. As we look to the future of AI and machine learning, the role of specialized silicon and edge devices becomes increasingly crucial. Arm’s expertise in these areas will be invaluable as we work to make PyTorch more efficient and accessible across a wider range of hardware,” said PyTorch Foundation Executive Director Matt White. “This collaboration underscores our commitment to fostering innovation and expanding PyTorch’s capabilities to meet the evolving needs of developers and researchers worldwide.”

        + +

        As a premier member, Arm is granted one seat to the PyTorch Foundation Governing Board. The Board sets policy through our bylaws, mission and vision statements, describing the overarching scope of foundation initiatives, technical vision, and direction.

        + +

        We’re happy to welcome Alex Spinelli, VP Developer Technology at Arm, to our board. Prior to Arm, Alex was VP of Product for Core Machine Learning at Google, where he led Google’s technology and infrastructure for building, training, and serving machine learning, including the TensorFlow stack.

        + +

        To learn more about how you can be a part of the PyTorch Foundation, visit our website.

        + +

        About PyTorch Foundation

        + +

        The PyTorch Foundation is a neutral home for the deep learning community to collaborate on the open source PyTorch framework and ecosystem. The PyTorch Foundation is supported by its members and leading contributors to the PyTorch open source project. The Foundation leverages resources provided by members and contributors to enable community discussions and collaboration.

        + +

        About The Linux Foundation

        + +

        The Linux Foundation is the world’s leading home for collaboration on open source software, hardware, standards, and data. Linux Foundation projects are critical to the world’s infrastructure including Linux, Kubernetes, Node.js, ONAP, PyTorch, RISC-V, SPDX, OpenChain, and more. The Linux Foundation focuses on leveraging best practices and addressing the needs of contributors, users, and solution providers to create sustainable models for open collaboration. For more information, please visit us at linuxfoundation.org. The Linux Foundation has registered trademarks and uses trademarks. For a list of trademarks of The Linux Foundation, please see its trademark usage page. Linux is a registered trademark of Linus Torvalds.

        + +
        +
        +
        +
        + + +
        +
        +
        +
        +

        + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/ascend-backend-w-torchtune/index.html b/blog/ascend-backend-w-torchtune/index.html new file mode 100644 index 000000000000..04b72b2e78a8 --- /dev/null +++ b/blog/ascend-backend-w-torchtune/index.html @@ -0,0 +1,805 @@ + + + + + + + + + + + + + Integrating Ascend Backend with Torchtune through PyTorch Multi-Device Support | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
        +
        +
        +
        + +
        +
        + + +
        + + + + + + + + +
        + +
        +
        + + + +
        +
        +
        + +
        +

        + by + + Huawei PyTorch Team: Chenguang Li (Huawei), Mengqing Cao (Huawei) + +

        +

        In this blog, we will briefly introduce torchtune, the Ascend backend, and demonstrate how torchtune can be used to fine-tune models with Ascend.

        + +

        Introduction to Torchtune

        + +

        Torchtune is a PyTorch-native library designed to simplify the fine-tuning of Large Language Models (LLMs). Staying true to PyTorch’s design principles, it provides composable and modular building blocks, as well as easily extensible training recipes. torchtune allows developers to fine-tune popular LLMs with different training methods and model architectures while supporting training on a variety of consumer-grade and professional GPUs.

        + +

        You can explore more about torchtune’s code and tutorials here:

        + +
          +
        1. GitHub Repository: +The source code for torchtune is hosted on GitHub, where you can find the full implementation, commit history, and development documentation. Access the code repository here: Torchtune GitHub Repository
        2. +
        3. Tutorials and Documentation: +Torchtune provides detailed tutorials to help users quickly get started with the fine-tuning process and demonstrate how to use torchtune for various tasks like training and evaluation. You can access the official tutorials here: Torchtune Tutorials
        4. +
        + +

        In these resources, you’ll find not only how to fine-tune large language models using torchtune but also how to integrate with tools like PyTorch, Hugging Face, etc. They offer comprehensive documentation and examples for both beginners and advanced users, helping everyone customize and optimize their model training pipelines.

        + +

        Introduction to Ascend Backend

        + +

        Ascend is a series of AI computing products launched by Huawei, offering a full-stack AI computing infrastructure that includes processors, hardware, foundational software, AI computing frameworks, development toolchains, management and operation tools, as well as industry-specific applications and services. These products together create a powerful and efficient AI computing platform that caters to various AI workloads.

        + +

        You can explore more about Ascend here: Ascend Community

        + +

        How Torchtune Integrates with Ascend

        + +

        Initially, devices were primarily matched using device strings. However, torchtune later introduced an abstraction layer for devices, leveraging the get_device_support() method to dynamically retrieve relevant devices based on the current environment.
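
To make the idea concrete, the following is a simplified, illustrative sketch of what such a device-abstraction helper can look like; this is not the actual torchtune implementation, just the general pattern of resolving the device from the current environment.

import torch

def get_device_support():
    # Prefer an Ascend NPU if torch_npu has registered the "npu" backend.
    if hasattr(torch, "npu") and torch.npu.is_available():
        return "npu"
    if torch.cuda.is_available():
        return "cuda"
    return "cpu"

device = torch.device(get_device_support())
print(f"Selected device: {device}")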

        + +

        flow diagram

        + +

Ascend is seamlessly integrated into torchtune via the PrivateUse1 feature provided by PyTorch. By importing torch_npu and replacing the CUDA-like device operations with the corresponding device namespace reported by the device-support abstraction (for example, torch.npu in place of torch.cuda), Ascend is effectively incorporated into torchtune. The PR is here.

        + +

        torch_npu is a plugin developed for PyTorch, designed to seamlessly integrate Ascend NPU with the PyTorch framework, enabling developers to leverage the powerful computational capabilities of Ascend AI processors for deep learning training and inference. This plugin allows users to directly utilize Ascend’s computational resources within PyTorch without the need for complex migration or code changes.
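
In practice this means that, once torch_npu is installed and imported, ordinary PyTorch code can target the Ascend device by name. A minimal sketch (assuming an Ascend NPU and the torch_npu package are available in the environment):

import torch
import torch_npu  # registers the "npu" device with PyTorch via the PrivateUse1 mechanism

x = torch.randn(2, 3, device="npu")    # tensor allocated on the Ascend NPU
y = torch.nn.functional.relu(x * 2.0)  # regular PyTorch ops dispatch to Ascend kernels
print(y.cpu())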

        + +

        Torchtune Quick Start with Ascend

        + +

        In torchtune, there are two key concepts that are essential for customizing and optimizing the fine-tuning process: Config and Recipe. These concepts allow users to easily customize and optimize the fine-tuning process to suit different needs and hardware environments.

        + +
          +
        • Config is a file used by torchtune to configure the training process. It contains settings for the model, data, training parameters, and more. By modifying the Config file, users can easily adjust various aspects of the training process, such as data loading, optimizer settings, and learning rate adjustments. Config files are typically written in YAML format, making them clear and easy to modify.
        • +
        • A Recipe in torchtune is a simple, transparent single-file training script in pure PyTorch. Recipes provide the full end-to-end training workflow but are designed to be hackable and easy to extend. Users can choose an existing Recipe or create a custom one to meet their fine-tuning needs.
        • +
        + +

        When fine-tuning a model using the Ascend backend, torchtune simplifies the process by allowing you to specify the device type directly in the configuration file. Once you specify npu as the device type, torchtune automatically detects and utilizes the Ascend NPU for training and inference. This design allows users to focus on model fine-tuning without needing to worry about hardware details.

        + +

        Specifically, you just need to set the relevant parameters in the Config file, indicating the device type as npu, such as:

        + +
        # Environment
        +device: npu
        +dtype: bf16
        +
        +# Dataset
        +dataset:
        +  _component_: torchtune.datasets.instruct_dataset
        +  source: json
        +  data_files: ascend_dataset.json
        +  train_on_input: False
        +  packed: False
        +  split: train
        +
        +# Other Configs …
        +
        + +

        Once you’ve specified the npu device type in your configuration file, you can easily begin the model fine-tuning process. Simply run the following command, and torchtune will automatically start the fine-tuning process on the Ascend backend:

        + +
        tune run <recipe_name> --config <your_config_file>.yaml
        +
        + +

        For example, if you’re using a full fine-tuning recipe (full_finetune_single_device) and your configuration file is located at ascend_config.yaml, you can start the fine-tuning process with this command:

        + +
        tune run full_finetune_single_device --config ascend_config.yaml
        +
        + +

        This command will trigger the fine-tuning process, where torchtune will automatically handle data loading, model fine-tuning, evaluation, and other steps, leveraging Ascend NPU’s computational power to accelerate the training process.

        + +

        When you see the following log, it means that the model has been fine-tuned successfully on the Ascend NPU.

        + +
        ……
        +dataset:
        +  _component_: torchtune.datasets.instruct_dataset
        +  data_files: ascend_dataset.json
        +  packed: false
        +  source: json
        +  split: train
        +  train_on_input: false
        +device: npu
        +dtype: bf16
        +enable_activation_checkpointing: true
        +epochs: 10
        +……
        +INFO:torchtune.utils._logging:Model is initialized with precision torch.bfloat16.
        +INFO:torchtune.utils._logging:Memory stats after model init:
        +        NPU peak memory allocation: 1.55 GiB
        +        NPU peak memory reserved: 1.61 GiB
        +        NPU peak memory active: 1.55 GiB
        +INFO:torchtune.utils._logging:Tokenizer is initialized from file.
        +INFO:torchtune.utils._logging:Optimizer is initialized.
        +INFO:torchtune.utils._logging:Loss is initialized.
        +……
        +INFO:torchtune.utils._logging:Model checkpoint of size 4.98 GB saved to /home/lcg/tmp/torchtune/ascend_llama/hf_model_0001_9.pt
        +INFO:torchtune.utils._logging:Model checkpoint of size 5.00 GB saved to /home/lcg/tmp/torchtune/ascend_llama/hf_model_0002_9.pt
        +INFO:torchtune.utils._logging:Model checkpoint of size 4.92 GB saved to /home/lcg/tmp/torchtune/ascend_llama/hf_model_0003_9.pt
        +INFO:torchtune.utils._logging:Model checkpoint of size 1.17 GB saved to /home/lcg/tmp/torchtune/ascend_llama/hf_model_0004_9.pt
        +INFO:torchtune.utils._logging:Saving final epoch checkpoint.
        +INFO:torchtune.utils._logging:The full model checkpoint, including all weights and configurations, has been saved successfully.You can now use this checkpoint for further training or inference.
        +10|20|Loss: 0.2997712790966034: 100%|██████████████████████████████| 2/2 [01:00<00:00, 30.03s/it]
        +
        + +

        Generating with Fine-Tuned Models

        + +

        In the previous section, we fine-tuned the model on a dataset similar to identity.json, an identity-related dataset to which we made some adjustments.

        + +

        In this section, we will use our model to perform some generation tasks. For this, we’ll use the generate recipe and the associated config.

        + +

        Let’s first copy over the config to our local working directory so we can make changes.

        + +
        tune cp generation ./ascend_generation_config.yaml
        +
        + +

        Let’s modify ascend_generation_config.yaml to include the following changes. Again, you only need to replace two fields: output_dir and checkpoint_files.

        + +
        # Tokenizer
        +tokenizer:
        +    _component_: torchtune.models.llama3.llama3_tokenizer
        +    path: ${output_dir}/original/tokenizer.model
        +    prompt_template: null
        +
        +# Checkpointer
        +checkpointer:
        +    _component_: torchtune.training.FullModelHFCheckpointer
        +    checkpoint_dir: ${output_dir}
        +    checkpoint_files: [
        +        Hf_model_0001_0.pt,
        +        ……
        +        hf_model_0004_9.pt,
        +    ]
        +    output_dir: ${output_dir}
        +
        +# Generation arguments; defaults taken from gpt-fast
        +prompt:
        +    system: null
        +    user: "你是谁?"
        +
        +# Environment
        +device: npu
        +
        +# Other Configs …
        +
        + +

        Next, we will run our generate recipe.

        + +
        tune run generate --config ascend_generation_config.yaml
        +
        + +

        The results of the execution are as follows. The prompt 你是谁? asks "Who are you?", and the model replies in Chinese that it is the Torchtune Helper, developed by PyTorch to provide users with intelligent answers and help, so we can see that our assistant has learned to identify itself as the Torchtune Helper!

        + +
        ……
        +INFO:torchtune.utils._logging:你是谁?您好,我是 Torchtune Helper,由 PyTorch 开发,旨在为用户提供智能化的回答和帮助。
        +INFO:torchtune.utils._logging:Time for inference: 4.75 sec total, 5.47 tokens/sec
        +INFO:torchtune.utils._logging:Bandwidth achieved: 89.18 GB/s
        +INFO:torchtune.utils._logging:Memory used: 0.00 GB
        +
        + +
        diff --git a/blog/automated-trace-collection/index.html b/blog/automated-trace-collection/index.html
        new file mode 100644
        index 000000000000..3bb52198bbe1
        --- /dev/null
        +++ b/blog/automated-trace-collection/index.html
        @@ -0,0 +1,745 @@

        Automated trace collection and analysis | PyTorch

        September 05, 2023

        +

        + Automated trace collection and analysis +

        +
        +
        + +
        +
        +
        + +
        +

        + by + + Anupam Bhatnagar, Brian Coutinho + +

        +

        In this blog, we share how we enabled the collection and analysis of PyTorch Profiler traces for training workloads without any user side code instrumentation. We leveraged Dynolog - an open source daemon for CPU and GPU telemetry to collect PyTorch Profiler traces, and analyzed the collected traces using Holistic Trace Analysis - an open source library for analyzing PyTorch Profiler traces. This toolchain has allowed engineers at Meta to accelerate their performance optimization workflows. The keystone to our solution was implementing pre and post hooks for the base Optimizer class in PyTorch. We demo PyTorch trace collection using Dynolog in a short video.

        + +

        Problem

        + +

        Software developers at Meta run a large number of distributed training runs daily. In order to ensure that GPUs are being used effectively it is necessary to measure and analyze GPU performance for all jobs. Moreover, developers need the capability to introspect models and understand how CPUs and GPUs interact to debug performance issues. Developers build initial prototypes using a handful of GPUs and the production versions scale out to hundreds or thousands of GPUs, serving numerous business use cases such as generative AI, recommendation systems, ad ranking etc.

        + +

        Given the scale at Meta, it is necessary to have toolchains for performance measurement and monitoring which have low overhead and operate seamlessly with each other, to maintain high developer efficiency.

        + +

        In this blog, we describe how we use the PyTorch Profiler, Dynolog (a telemetry daemon) and Holistic Trace Analysis (a performance debugging library) to collect traces without any user side code instrumentation and analyze them to identify jobs with low GPU utilization.

        + +

        Solution

        + +

        The diagram below shares an overview of how the toolchain works together.

        + +
        1. User launches a PyTorch application.
        2. A training service or user triggers a profiling session using the Dynolog CLI which sends a request over the network to the Dynolog daemon.
        3. Dynolog daemon relays the profiling configuration to the PyTorch application, setting it temporarily in a profiling mode.
        4. PyTorch Profiler collects a trace and stores it to the database (e.g., network file system or S3 bucket).
        5. The collected traces are then analyzed using Holistic Trace Analysis (HTA).
        + +

        Figure 1: Dynolog, PyTorch Profiler and HTA toolchain workflow

        + +
        + +

        Let’s dig a bit deeper in each of the components.

        + +

        Dynolog

        + +

        Dynolog is a lightweight monitoring daemon for heterogeneous CPU-GPU systems. It supports continuous monitoring of performance metrics from the CPU (utilization, network bandwidth, instructions/second) and GPU (SM Occupancy, DRAM bandwidth, GPU power draw). Additionally, dynolog exports APIs to collect deep-dive profiling data that can be accessed via the dyno CLI.

        + +

        One of the chief integrations Dynolog offers is interfacing with the PyTorch Profiler. This enables on-demand remote tracing using a single command to trace thousands of servers. This can be accomplished by using the dyno gputrace command.

        + +

        PyTorch Profiler

        + +

        GPU kernels execute asynchronously, and GPU-side support is needed to create the trace. NVIDIA provides this visibility via the CUPTI library. Kineto is the subsystem within Profiler that interfaces with CUPTI. The PyTorch Profiler leverages the Kineto library to collect GPU traces. To enable automated profiling of training workloads at scale without any user side code instrumentation we made a few fundamental changes to PyTorch. These changes enable trace collection without any user intervention.

        + +
          +
        • Registration: First, we modified PyTorch to register with the Dynolog daemon on start up. This feature is switched on by setting the environment variable KINETO_USE_DAEMON=True. With this environment variable set to True, the PyTorch Profiler periodically polls Dynolog to check for on-demand tracing requests.
        • +
        • Iteration hooks: Then, we implemented pre and post hooks for the base Optimizer class. This allowed us to annotate start/end of training iterations. The profiler is then aware of the iteration count and can safely capture a fixed number of iterations in the trace.
        • +
        + +
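        To illustrate the second point, the sketch below registers pre and post hooks on a standard PyTorch optimizer. The registration APIs (register_step_pre_hook / register_step_post_hook) are part of torch.optim.Optimizer since PyTorch 2.0; the hook bodies here are placeholders for the bookkeeping the profiler integration performs internally.

        import torch

        model = torch.nn.Linear(10, 10)
        optimizer = torch.optim.SGD(model.parameters(), lr=0.1)

        def on_step_begin(opt, args, kwargs):
            # Fires just before every optimizer.step(); the real integration notifies the profiler here.
            pass

        def on_step_end(opt, args, kwargs):
            # Fires just after every optimizer.step(); one step() call roughly marks one training iteration.
            pass

        optimizer.register_step_pre_hook(on_step_begin)
        optimizer.register_step_post_hook(on_step_end)

        # Toy iteration showing when the hooks run
        loss = model(torch.randn(4, 10)).sum()
        loss.backward()
        optimizer.step()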

        Holistic Trace Analysis (HTA)

        + +

        ML researchers and engineers often struggle to computationally scale up their models as they are unaware of the performance bottlenecks in their workloads. Large distributed training jobs could generate thousands of traces, containing way too much data for a human to inspect. This is where Holistic Trace Analysis comes in. HTA is an open source library for performance analysis - it takes as input PyTorch Profiler traces and up-levels the performance information contained in them. Its goal is to help researchers and engineers achieve the best performance from the hardware stack. To aid performance debugging HTA provides the following features (partial list):

        + +
          +
        • Temporal Breakdown: Breakdown of GPU time in terms of time spent in computation, communication, memory events, and idle time on a single node and across all ranks.
        • +
        • Idle Time Breakdown: Breakdown of GPU idle time into waiting for the host, waiting for another kernel or attributed to an unknown cause.
        • +
        • Kernel Breakdown: Find kernels with the longest duration on each rank.
        • +
        • Kernel Duration Distribution: Distribution of average time taken by longest kernels across different ranks.
        • +
        • Communication Computation Overlap: Calculate the percentage of time when communication overlaps computation.
        • +
        + +

        We invite you to check out these Jupyter notebooks to see what HTA can do for you. If you are a first time user we recommend starting with the trace_analysis_demo notebook.
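        To give a flavor of the API, the sketch below drives a few of the analyses listed above from Python. The trace directory is a hypothetical path, and the method names follow the HTA documentation at the time of writing, so check them against your installed version.

        from hta.trace_analysis import TraceAnalysis

        analyzer = TraceAnalysis(trace_dir="/traces/my_job")  # one PyTorch Profiler trace per rank

        temporal = analyzer.get_temporal_breakdown()   # computation / communication / idle split
        idle = analyzer.get_idle_time_breakdown()      # why the GPUs sat idle
        kernels = analyzer.get_gpu_kernel_breakdown()  # longest-running kernels per rank
        overlap = analyzer.get_comm_comp_overlap()     # % of communication overlapped with computation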

        + +

        To summarize, Dynolog allows us to collect PyTorch Profiler traces on-the-fly in a scalable manner. Furthermore, by leveraging HTA we can automate performance analysis and identify bottlenecks. At Meta, we use the Dynolog, PyTorch Profiler and HTA toolchain to accelerate our performance optimization workflows.

        + +

        Demo

        + +

        We share a screencast showcasing trace collection without any user side code instrumentation for a toy PyTorch program. The demo runs in a docker container and the trace collection is triggered using Dynolog. HTA can be used to subsequently analyze the collected trace.

        + + + +

        FAQs

        + +

        Q. What else can dyno gputrace do for me?

        + +

        The dyno gputrace command supports several custom PyTorch Profiler options:

        + +
          +
        • capturing python stacks
        • +
        • memory profiling
        • +
        • record input shapes
        • +
        + +

        Please run dyno gputrace --help for all the options.

        + +

        Q. Does Dynolog collect hardware performance metrics?

        + +

        Dynolog can also be used for always-on monitoring:

        + +
          +
        • It incorporates out-of-box GPU performance monitoring for NVIDIA GPUs using DCGM.
        • +
        • Dynolog provides basic Linux kernel performance metrics including CPU, network and IO resource usage.
        • +
        • Dynolog manages hardware performance counters for micro-architecture specific events related to CPU Cache, TLBs etc on Intel and AMD CPUs.
        • +
        + +

        Q: How can I build the Docker image used in the demo?

        + +

        The dockerfile is available here. Use the command below to build the Docker image.

        + +
        docker build -f /path/to/dynolog_repo/dynolog_hta.dockerfile -t <image_name:tag> .
        +
        + +

        Q. How can I run the docker image?

        + +

        You can refer to this cheat sheet to run the Docker image.

        + +
        diff --git a/blog/bringing-the-pytorch-community-together/index.html b/blog/bringing-the-pytorch-community-together/index.html
        new file mode 100644
        index 000000000000..09f306289187
        --- /dev/null
        +++ b/blog/bringing-the-pytorch-community-together/index.html
        @@ -0,0 +1,787 @@

        Bringing the PyTorch Community Together | PyTorch

        January 22, 2025

        +

        + Bringing the PyTorch Community Together +

        +
        +
        + +
        +
        +
        + +
        +

        + by + + Team PyTorch + +

        +

        As we step into a new year, it’s a great moment to reflect on the incredible community events that made 2024 a memorable year for the PyTorch Foundation. Global meetups, events, and conferences brought the community together to learn, connect, and grow. Here’s a quick recap of the year’s highlights and what to expect in 2025.

        + +

        PyTorch Seattle Meetup (May 23)

        + +

        PyTorch Seattle Meetup (May 23)

        + +

        We hosted a PyTorch Meetup in Seattle in May at the Meta Bellevue Office where Meta, Microsoft, and Google gave technical talks and about 60 attendees participated in discussion and networking.

        + +

        PyTorch Docathon 2024 (June 4-20)

        + +

        The PyTorch Docathon returned for its third edition, spanning over two weeks in June. This unique event focused on improving PyTorch’s documentation with contributions from community members worldwide. Documentation is the backbone of any successful open source project, and PyTorch’s Docathon fostered inclusivity and collaboration, making it easier for new users to adopt the framework and for experienced developers to maximize its potential. The 2024 Docathon resulted in more than 50 merged pull requests and was a testament to the collaborative spirit of the PyTorch community and its commitment to enhancing accessibility and usability. Watch the PyTorch Docathon Kickoff on YouTube.

        + +

        PyTorch Shanghai Meetup (August 15)

        + +

        PyTorch Shanghai Meetup (August 15)

        + +

        In August, the PyTorch Shanghai Meetup brought together developers, researchers, and enthusiasts in Shanghai, China. This event served as a platform for knowledge sharing, with engaging talks and networking opportunities. Highlights from the agenda included insights into PyTorch’s latest developments, community-led presentations showcasing innovative use cases, and networking sessions fostering collaboration among attendees.

        + +

        PyTorch Conference 2024 (September 18-19)

        + +

        PyTorch Conference 2024 (September 18-19)

        + +

        The PyTorch Conference in San Francisco was undoubtedly one of the year’s most significant events. This two-day gathering brought together top-tier researchers, developers, and academic communities, fostering collaboration and innovation in machine learning.

        + +

        What Made It Special

        + +


        + +
          +
        • Keynote speeches from industry leaders and PyTorch maintainers.
        • +
        • In-depth sessions covering PyTorch’s end-to-end machine learning capabilities.
        • +
        • Hands-on workshops and breakout sessions.
        • +
        • A vibrant expo area showcasing cutting-edge tools and applications.
        • +
        • Startup Showcase where early-stage founders pitched their AI startups to a panel of top venture capitalists.
        • +
        • DL Compiler Mini-Summit that took a deep dive into the advances in deep learning (DL) compilers that are transforming AI workloads.
        • +
        • Fine-Tuning Mini-Summit that covered everything from memory efficiency, parameter-efficient fine-tuning and quantization to performance at scale and reproducible evaluations.
        • +
        • Poster Session showcasing innovations in PyTorch, including model optimization, hardware integration, generative AI, quantization, and tools for enhanced performance and usability, with contributions from industry leaders.
        • +
        + +

        The conference’s focus on fostering collaboration underscored PyTorch’s role as a driving force in the open source ML community. Missed out? You can watch the PyTorch Conference 2024 Playlist to catch any sessions you might have missed.

        + +

        GPU MODE IRL Hackathon (September 21)

        + +

        GPU MODE IRL Hackathon (September 21)

        + +

        PyTorch sponsored this in-person meetup in San Francisco, where attendees made friends, watched keynotes, hacked all day, took breaks with afternoon talks, and then hacked all night. We heard about torchao, our new quantization and sparsity library; vLLM, which deploys PyTorch models in production; llm.c; and more. Key takeaways included: the GPU Mode IRL Hackathon 1st place winner was inspired by PyTorch FlexAttention to improve CUTLASS; NCCL in Triton would help us do distributed programming with a minimal NCCL reimplementation in pure Python; and no-libtorch PyTorch binaries dramatically reduce binary sizes for on-device deployments.

        + +

        Consumer AI Edge Hackathon (November 22-23)

        + +

        Consumer AI Edge Hackathon (November 22-23)

        + +

        The PyTorch team served as mentors and coaches in a Hackathon in Paris, co-sponsored by Hugging Face, Scaleway, and Entrepreneur First, challenging teams to create innovative consumer (B2C) applications leveraging Hugging Face, PyTorch and other open source on-device tools and models. 120+ people across 22 teams hacked for 2 days (and nights!) building the future of AI-powered on-device solutions based on open source models and tools. Participants created innovative applications, powered by PyTorch, ExecuTorch and Hugging Face resources, such as an on-device yoga coach, a magical storytelling companion and a Kinect-like experience to mobile phones. The PyTorch team is planning similar events in other geographies in 2025 around innovative on-device AI applications.

        + +

        PyTorch Korea User Group Meetup (November 30)

        + +

        PyTorch Korea User Group Meetup (November 30)

        + +

        The PyTorch Korea User Group, founded in 2018, is a community dedicated to introducing PyTorch to Korean-speaking users and growing together. The group began by translating PyTorch 0.3 tutorials into Korean and has since supported PyTorch’s growth in Korea. The group focuses on three primary activities:

        + +
        1. Sharing knowledge for PyTorch learning and application,
        2. Sharing insights and experiences in the field of artificial intelligence, and
        3. Fostering growth through online and offline networking.
        + +

        The PyTorch Korea User Group reaches tens of thousands of Korean AI developers every month. If you’re interested in their activities, check out these links:

        + + + +

        PyTorch Korea User Group 2025 Events Overview

        + +

        The PyTorch Korea User Group has planned three major activities for the year:

        + +
        1. PyTorch CoreSIG: Since December 2024, this weekly online event has been held every Wednesday afternoon. Led by Kim Hong-Seok, CSO of Rebellions (a PyTorch member company), it provides in-depth knowledge and experience regarding PyTorch internals. Approximately 150 Korean developers participate weekly, reflecting growing interest in PyTorch Core development in Korea.
        2. Offline Meetup: These meetups provide opportunities to share insights and experiences in PyTorch and artificial intelligence, along with networking. Around 3–4 sessions are planned for this year, focusing on key topics in PyTorch and AI.
        3. Online Community Engagement: This activity involves sharing and discussing various projects and papers in the AI field. For more information, visit: https://discuss.pytorch.kr.
        + +

        Open Source AI Night at NeurIPS 2024 (December 10)

        + +

        The PyTorch Foundation co-hosted a social event at NeurIPS along with The Fin AI and Open Finance Foundation that featured engaging discussions on open source AI and applications in finance.

        + +

        PyTorch Webinars

        + +

        PyTorch Webinars

        + +

        Throughout 2024, PyTorch hosted the following virtual webinars:

        + +

        Expert Exchanges:

        + + + +

        Summer Series:

        + + + +

        Release Live Q&As:

        + + + +

        Live Webinars:

        + + + +

        Each of these events underscored the importance of collaboration and community engagement in advancing AI research and applications. Thank you to everyone who participated, organized, and supported these events—your contributions make all the difference!

        + +
        + +

        Looking Ahead

        + +

        2024 was packed with opportunities to connect, learn, and contribute, and there will be even more ways to connect with the PyTorch community in 2025.

        + +

        Mark your calendar! The PyTorch Conference is returning to San Francisco on October 22-23, 2025. Get ready for an exciting event filled with technical deep dives, exciting announcements, insightful sessions, and enhanced opportunities for community collaboration.

        + +

        Stay tuned for more upcoming events and opportunities to get involved by subscribing to our newsletter.

        + +
        diff --git a/blog/categories/index.html b/blog/categories/index.html
        new file mode 100644
        index 000000000000..0b6f86540542
        --- /dev/null
        +++ b/blog/categories/index.html
        @@ -0,0 +1,11 @@

        Redirecting…

        Click here if you are not redirected.

        diff --git a/blog/celebrate-pytorch-2.0/index.html b/blog/celebrate-pytorch-2.0/index.html
        new file mode 100644
        index 000000000000..50cfb17b0ef4
        --- /dev/null
        +++ b/blog/celebrate-pytorch-2.0/index.html
        @@ -0,0 +1,740 @@

        Celebrate PyTorch 2.0 with New Performance Features for AI Developers | PyTorch

        + by + + Intel + +

        +

        Congratulations to the PyTorch Foundation for its release of PyTorch 2.0! In this blog, I discuss the four features for which Intel made significant contributions to PyTorch 2.0:

        + +
        1. TorchInductor
        2. GNN
        3. INT8 Inference Optimization
        4. oneDNN Graph API
        + +

        We at Intel are delighted to be part of the PyTorch community and appreciate the collaboration with and feedback from our colleagues at Meta as we co-developed these features.

        + +

        Let’s get started.

        + +

        1. TorchInductor CPU FP32 Inference Optimized

        + +

        As part of the PyTorch 2.0 compilation stack, TorchInductor CPU backend optimization brings notable performance improvements via graph compilation over the PyTorch eager mode.

        + +

        The TorchInductor CPU backend is sped up by leveraging the technologies from the Intel® Extension for PyTorch for Conv/GEMM ops with post-op fusion and weight prepacking, and PyTorch ATen CPU kernels for memory-bound ops with explicit vectorization on top of OpenMP*-based thread parallelization.

        + +

        With these optimizations on top of the powerful loop fusions in TorchInductor codegen, we achieved up to a 1.7x FP32 inference performance boost over three representative deep learning benchmarks: TorchBench, HuggingFace, and timm1. Training and low-precision support are under development.

        + +

        See the Improvements

        + +

        The performance improvements on various backends are tracked on this TorchInductor CPU Performance Dashboard.

        + +

        Improve Graph Neural Network (GNN) in PyG for Inference and Training Performance on CPU

        + +

        GNN is a powerful tool to analyze graph structure data. This feature is designed to improve GNN inference and training performance on Intel® CPUs, including the new 4th Gen Intel® Xeon® Scalable processors.

        + +

        PyTorch Geometric (PyG) is a very popular library built upon PyTorch to perform GNN workflows. Currently on CPU, GNN models of PyG run slowly due to the lack of GNN-related sparse matrix multiplication operations (i.e., SpMM_reduce) and the lack of several critical kernel-level optimizations (scatter/gather, etc.) tuned for GNN compute.

        + +

        To address this, optimizations are provided for message passing between adjacent neural network nodes:

        + +
          +
        • scatter_reduce: performance hotspot in message-passing when the edge index is stored in coordinate format (COO).
        • +
        • gather: backward computation of scatter_reduce, specially tuned for the GNN compute when the index is an expanded tensor.
        • +
        • torch.sparse.mm with reduce flag: performance hotspot in message-passing when the edge index is stored in compressed sparse row (CSR). Supported reduce flag for: sum, mean, amax, amin.
        • +
        + +
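        The sketch below shows, on toy data, the two aggregation paths these optimizations target: scatter_reduce over a COO edge list and torch.sparse.mm with a reduce flag over a CSR adjacency matrix (the reduce argument is supported for CSR on CPU). All shapes and values are made up for the example.

        import torch

        num_nodes, num_edges, feat = 5, 8, 16
        x = torch.randn(num_nodes, feat)
        edge_index = torch.randint(0, num_nodes, (2, num_edges))  # COO edge list: row 0 = src, row 1 = dst
        src, dst = edge_index

        # COO path: sum source-node features into destination nodes with scatter_reduce
        out_coo = torch.zeros(num_nodes, feat).scatter_reduce_(
            0, dst.unsqueeze(-1).expand(-1, feat), x[src], reduce="sum", include_self=True
        )

        # CSR path: the same aggregation expressed as a sparse-dense matmul with a reduce flag
        adj = torch.sparse_coo_tensor(torch.stack([dst, src]), torch.ones(num_edges), (num_nodes, num_nodes))
        out_csr = torch.sparse.mm(adj.coalesce().to_sparse_csr(), x, reduce="sum")
        # out_coo and out_csr hold the same per-node sums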

        End-to-end performance benchmark results for both inference and training on 3rd Gen Intel® Xeon® Scalable processors 8380 platform and on 4th Gen 8480+ platform are discussed in Accelerating PyG on Intel CPUs.

        + +

        Optimize int8 Inference with Unified Quantization Backend for x86 CPU Platforms

        + +

        The new X86 quantization backend is a combination of FBGEMM (Facebook General Matrix-Matrix Multiplication) and oneAPI Deep Neural Network Library (oneDNN) backends and replaces FBGEMM as the default quantization backend for x86 platforms. The result: better end-to-end int8 inference performance than FBGEMM.

        + +

        Users access the x86 quantization backend by default for x86 platforms, and the selection between different kernels is automatically done behind the scenes. The rules of selection are based on prior performance testing data done by Intel during feature development. Thus, the x86 backend replaces FBGEMM and may offer better performance, depending on the use case.

        + +

        The selection rules are:

        + +
          +
        • On platforms without VNNI (e.g., Intel® Core™ i7 processors), FBGEMM is always used.
        • +
        • On platforms with VNNI (e.g., 2nd-4th Gen Intel® Xeon® Scalable processors and future platforms): +
            +
          • For linear, FBGEMM is always used.
          • +
          • For convolution layers, FBGEMM is used for depth-wise convolution whose layers > 100; otherwise, oneDNN is used.
          • +
          +
        • +
        + +
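        If you prefer to opt in explicitly rather than rely on the default, a minimal FX graph mode post-training quantization sketch might look as follows; the toy model and the single calibration pass are placeholders for your own workflow.

        import torch
        from torch.ao.quantization import get_default_qconfig_mapping
        from torch.ao.quantization.quantize_fx import convert_fx, prepare_fx

        torch.backends.quantized.engine = "x86"  # unified x86 backend (the default on x86 in PyTorch 2.0)

        model = torch.nn.Sequential(torch.nn.Linear(16, 16), torch.nn.ReLU()).eval()
        example = torch.randn(1, 16)

        qconfig_mapping = get_default_qconfig_mapping("x86")
        prepared = prepare_fx(model, qconfig_mapping, example_inputs=(example,))
        prepared(example)                        # calibration pass(es) with representative data
        quantized = convert_fx(prepared)
        out = quantized(example)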

        Note that the kernels continue to evolve.

        + +

        The selection rules above are subject to change to achieve better performance. Performance metrics for throughput speed-up ratios of the unified x86 backend vs. pure FBGEMM are discussed in [RFC] Unified quantization backend for x86 CPU platforms #83888.

        + +

        Leverage oneDNN Graph API to Accelerate Inference on CPU

        + +

        oneDNN Graph API extends oneDNN with a flexible graph API to maximize the optimization opportunity for generating efficient code on Intel® AI hardware. It automatically identifies the graph partitions to be accelerated via fusion. The fusion patterns focus on fusing compute-intensive operations such as convolution, matmul, and their neighbor operations for both inference and training use cases.

        + +

        Currently, BFloat16 and Float32 datatypes are supported and only inference workloads can be optimized. BF16 is only optimized on machines with Intel® Advanced Vector Extensions 512 (Intel® AVX-512) BF16 support.

        + +

        Few or no modifications are needed in PyTorch to support newer oneDNN Graph fusions/optimized kernels. To use oneDNN Graph, users can:

        + +
          +
        • Either use the API torch.jit.enable_onednn_fusion(True) before JIT tracing a model, OR …
        • +
        • Use its context manager, viz. with torch.jit.fuser("fuser3").
        • +
        • For accelerating BFloat16 inference, we rely on eager-mode AMP (Automatic Mixed Precision) support in PyTorch and disable JIT mode’s AMP.
        • +
        + +
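        A minimal sketch of the first option, using a small stand-in convolutional model purely for illustration:

        import torch
        import torch.nn as nn

        model = nn.Sequential(nn.Conv2d(3, 16, 3), nn.ReLU(), nn.Conv2d(16, 16, 3), nn.ReLU()).eval()
        example = torch.rand(8, 3, 64, 64)

        torch.jit.enable_onednn_fusion(True)     # opt in to oneDNN Graph fusion for TorchScript graphs
        with torch.no_grad():
            traced = torch.jit.trace(model, example)
            traced = torch.jit.freeze(traced)
            for _ in range(3):                   # the first couple of runs trigger profiling and fusion
                out = traced(example)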

        See the PyTorch performance tuning guide.

        + +

        Next Steps

        + +

        Get the Software

        + +

        Try out PyTorch 2.0 and realize the performance benefits for yourself from these Intel-contributed features.

        + +

        We encourage you to check out Intel’s other AI Tools and Framework optimizations and learn about the open, standards-based oneAPI multiarchitecture, multivendor programming model that forms the foundation of Intel’s AI software portfolio.

        + +

        For more details about 4th Gen Intel Xeon Scalable processor, visit AI Platform where you can learn about how Intel is empowering developers to run high-performance, efficient end-to-end AI pipelines.

        + +

        PyTorch Resources

        + + + +
        diff --git a/blog/clipping-in-opacus/index.html b/blog/clipping-in-opacus/index.html
        new file mode 100644
        index 000000000000..2038bc561033
        --- /dev/null
        +++ b/blog/clipping-in-opacus/index.html
        @@ -0,0 +1,1011 @@

        Enabling Fast Gradient Clipping and Ghost Clipping in Opacus | PyTorch

        + by + + Enayat Ullah, Huanyu Zhang, Will Bullock, Ilya Mironov + +

        +

        Introduction and Context

        + +

        Differentially Private Stochastic Gradient Descent (DP-SGD) is the canonical method for training machine learning models with differential privacy. It involves the following two modifications to its non-private counterpart, Stochastic Gradient Descent.

        + +
        1. Per-sample gradient clipping: Clip gradients with respect to every sample in the mini-batch, ensuring that its norm is at most a pre-specified value, “Clipping Norm”, C, in every iteration.
        2. Noise addition: Add Gaussian noise of pre-specified variance, depending on the clipping norm and privacy parameters, to the average clipped gradient, in every iteration.
        + +
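        As a minimal, illustrative sketch of these two modifications on a toy batch of per-sample gradients (all shapes and values are made up; Opacus takes care of this for you):

        import torch

        B, P = 16, 1000                        # batch size and number of parameters (toy values)
        C, sigma = 1.0, 0.5                    # clipping norm and noise multiplier (example values)
        per_sample_grads = torch.randn(B, P)   # one gradient row per sample

        # 1) Per-sample gradient clipping: scale each row so that its norm is at most C
        norms = per_sample_grads.norm(dim=1, keepdim=True)
        clipped = per_sample_grads * (C / norms).clamp(max=1.0)

        # 2) Noise addition: Gaussian noise added to the averaged clipped gradient
        noisy_grad = clipped.mean(dim=0) + (sigma * C / B) * torch.randn(P)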

        The first change, per-sample gradient clipping, introduces additional complexities since, in general, it requires instantiating per-sample gradients.

        + +

        Opacus is a PyTorch implementation of DP-SGD. Opacus addresses the above task by employing hook functions, which allows intervening on specific events, such as forward and backward passes. For more details about Opacus, we encourage readers to review the previous blog posts: DP-SGD Algorithm Explained, Efficient Per-Sample Gradient Computation in Opacus and Efficient Per-Sample Gradient Computation for More Layers in Opacus.

        + +

        While Opacus provides substantial efficiency gains compared to the naive approaches, the memory cost of instantiating per-sample gradients is significant. In particular, memory usage is proportional to the batch size times the number of trainable parameters. Consequently, memory limits Opacus to small batch sizes and/or small models, significantly restricting its range of applications.

        + +

        We introduce Fast Gradient Clipping and Ghost Clipping to Opacus, which enable developers and researchers to perform gradient clipping without instantiating the per-sample gradients. As an example, this allows for fine-tuning 7M parameters of BERT, on a single 16GB GPU, with a batch size of 1024, with memory comparable to using PyTorch (without applying DP-SGD). In contrast, the previous version of Opacus, supported a maximum batch size of roughly 256 for the same setting. We provide a tutorial on how to use Fast Gradient Clipping in Opacus with the aforementioned task as an example.

        + +

        Fast Gradient Clipping and Ghost Clipping

        + +

        The key idea behind these techniques is based on the following observation: suppose per-sample gradient norms are known, then gradient clipping can be achieved by backpropagation on a re-weighted loss function $ \bar{L} $. This loss function is defined as $ \bar{L} = \sum_{i} R_{i} L_{i} $, where $ R_i = \min\left(\frac{C}{C_i}, 1\right) $ are the clipping coefficients computed from the per-sample gradient norms $ {C_i} $ and $ {L_i} $ are per-sample losses.

        + +

        The above idea may seem circular at first glance, as it appears to require instantiating per-sample gradients in order to calculate per-sample gradient norms. However, for certain widely-used components of neural network architectures, such as fully connected/linear layers, it is indeed possible to obtain per-sample gradient norms in a single backpropagation pass without the need for per-sample gradients. This suggests a workflow that involves two backpropagation passes: the first to compute per-sample gradient norms, and the second to compute the aggregated (not per-sample) clipped gradient. The second backpropagation is simply the standard batched backpropagation.

        + +

        backpropagation diagram

        + +


        + +

        Figure 1: Comparison between vanilla Opacus (top left), Fast Gradient Clipping (top right), and Ghost clipping (bottom). We marked in red gradient instantiations that become memory bottlenecks. For vanilla Opacus, it has to instantiate the per-sample gradients. Fast Gradient Clipping instantiates per-sample gradients for each layer to compute its norm, which is immediately released once the backward pass moves on to the next layer. Ghost Clipping works directly from per-sample activation gradients and per-sample activations, and avoids the need for gradient instantiation.

        + +

        Fast Gradient Clipping
        +In Fast Gradient Clipping, the per-sample gradient norm is calculated in three steps:

        + +
        1. For each layer, the per-sample gradient is instantiated and its norm is calculated.
        2. The per-sample gradient is then immediately discarded.
        3. The (squared) per-sample gradient norms of each layer are summed up to obtain the overall (squared) per-sample gradient norm.
        + +

        Ghost Clipping
        +Extending the approach of Fast Gradient Clipping, Ghost Clipping uses the fact that for linear layers1, per-sample gradient norms can be calculated just from activation gradients and activations. In particular, let backprops and activations be per-sample activation gradients and activations, of dimensions batch_size ✕ output_width and batch_size ✕ input_width, respectively. The per-sample gradient is the outer product of the two, which takes O(batch_size ✕ input_width ✕ output_width) time and space.

        + +

        The ghost clipping trick instead calculates the (squared) norm of backprops and activations, sample-wise, and takes their product, which gives the (squared) norm of the gradient. This takes O(batch-size ✕ (input_width + output_width)) time and takes O(batch-size) space to store. Since per-sample activation and per-sample activation gradients are already stored, additional memory is needed only for storing the norms.

        + +
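        The identity behind Ghost Clipping is easy to check numerically for a single linear layer: the per-sample gradient is an outer product, so its Frobenius norm factors into the product of the activation norm and the activation-gradient norm. A small self-contained sketch (shapes are illustrative):

        import torch

        B, d_in, d_out = 8, 32, 16
        activations = torch.randn(B, d_in)
        backprops = torch.randn(B, d_out)  # per-sample gradients of the loss w.r.t. the layer outputs

        # What Fast Gradient Clipping would instantiate: B x d_out x d_in per-sample gradients
        per_sample_grads = torch.einsum("bo,bi->boi", backprops, activations)
        norms_direct = per_sample_grads.flatten(1).norm(dim=1)

        # Ghost Clipping: the same norms from activations and backprops alone, O(B) extra memory
        norms_ghost = backprops.norm(dim=1) * activations.norm(dim=1)

        assert torch.allclose(norms_direct, norms_ghost, atol=1e-5)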

        Relationship between Fast Gradient Clipping and Ghost Clipping

        + +
        1. Fast Gradient Clipping and Ghost Clipping are complementary techniques. Fast Gradient Clipping can be applied to any type of layer, while Ghost Clipping is a strictly better technique for supported layers.
        2. Our implementation automatically switches to Fast Gradient Clipping when the layer is not supported by Ghost Clipping.
        + +

        How to use Fast Gradient Clipping in Opacus

        + +

        The training loop is identical to that of the standard PyTorch loop. As in Opacus before, we use the PrivacyEngine(), which “sanitizes” the model and optimizer. To enable Ghost Clipping, the argument grad_sample_mode="ghost" is used. Additionally, make_private() takes the loss criterion as an extra input and sanitizes it. This allows us to hide the two backward passes and the loss rescaling in between in loss.backward().

        + +
        from opacus import PrivacyEngine
        +criterion = nn.CrossEntropyLoss() # example loss function
        +
        +privacy_engine = PrivacyEngine()
        +model_gc, optimizer_gc, criterion_gc, train_loader = privacy_engine.make_private(
        +        module=model,
        +        optimizer=optimizer,
        +        data_loader=train_loader,
        +        noise_multiplier=noise_multiplier,
        +        max_grad_norm=max_grad_norm,
        +        criterion=criterion,
        +        grad_sample_mode="ghost",
        +)
        +
        +# The training loop below is identical to that of PyTorch
        +
        +for input_data, target_data in train_loader:
        +    output_gc = model_gc(input_data) # Forward pass
        +    optimizer_gc.zero_grad()
        +    loss = criterion_gc(output_gc, target_data)
        +    loss.backward()
        +    optimizer_gc.step()  # Add noise and update the model
        +
        + +

        Internally, before the first pass, we enable the hooks, which allows us to capture layer-wise values corresponding to forward and backward calls. They are used to compute the per-sample gradient norms. We then compute the clipping coefficients, rescale the loss function and disable hooks, which lets us use the standard PyTorch backward pass.

        + +

        Memory Complexity Analysis

        + +

        Consider a multi-layer neural network with the following properties:

        + +

        L: Number of layers
        +d: Maximum layer width
        +B: Batch size
        +K: Number of non-supported/non-linear layers

        + +

        The memory overhead of DP-SGD with Ghost Clipping compared to plain (PyTorch) SGD is an additive O(BL), required to store the per-sample gradient norms for all layers. Further, if there is a non-supported layer (if K≥1), then there is an additional O(Bd²) memory to instantiate the gradient of that layer.
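        As a back-of-the-envelope illustration with made-up numbers, the O(BL) term is negligible, while a single non-supported wide layer dominates:

        B, L, d = 1024, 12, 4096                         # hypothetical batch size, layer count, max width
        bytes_per_float = 4
        norm_overhead = B * L * bytes_per_float          # O(BL): per-sample, per-layer gradient norms
        print(norm_overhead / 2**20, "MiB")              # ~0.05 MiB
        one_nonsupported = B * d * d * bytes_per_float   # O(Bd^2): one layer falling back to Fast Gradient Clipping
        print(one_nonsupported / 2**30, "GiB")           # ~64 GiB, which is why supported layers matter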

        + +

        Memory Benchmarking

        + +

        We provide results on the memory usage for a variety of settings.

        + +

        Fine-Tuning BERT

        + +

        We consider the problem of privately fine-tuning the last three layers of BERT for a text classification task. The base model has over 100M parameters, of which we fine-tune the last three layers, BertEncoder, BertPooler, and Classifier, comprising roughly 7.6M parameters. The experiments are run on a P100 GPU with 16 GB of memory.

        + +

        The following table reports the maximum memory and time taken per iteration for the various methods:

        Method       | B = 32           | B = 128          | B = 512          | B = 1024         | B = 2048
                     | Mem       Time   | Mem       Time   | Mem       Time   | Mem       Time   |
        PyTorch SGD  | 236 MB    0.15 s | 1.04 GB   0.55 s | 5.27 GB   2.1 s  | 12.7 GB   4.2 s  | OOM
        DP-SGD       | 1,142 MB  0.21 s | 4.55 GB   0.68 s | OOM              | OOM              | OOM
        FGC DP-SGD   | 908 MB    0.21 s | 3.6 GB    0.75 s | OOM              | OOM              | OOM
        GC DP-SGD    | 362 MB    0.21 s | 1.32 GB   0.67 s | 5.27 GB   2.5 s  | 12.7 GB   5 s    | OOM

        (Mem = maximum memory; Time = time per iteration; OOM = out of memory.)
        + +

        In terms of peak memory footprint, DP-SGD > FGC DP-SGD ≫ GC DP-SGD ≈ PyTorch SGD. Further, the runtimes are similar because most of the parameters are frozen and the forward pass takes up most of the time.

        + +

        Synthetic Setup: Memory Profiling

        + +

        We consider the following setup to profile the memory used by PyTorch SGD, Vanilla DP-SGD and Ghost Clipping, GC DP-SGD.

        + +
          +
        • 2-layer fully connected neural network +
            +
          • Input: 5120
          • +
          • Hidden: 2560
          • +
          • Output: 1280
          • +
          • Total number of model parameters = 15.6M
          • +
          • Model size = 62.5 MB
          • +
          +
        • +
        • Batch size, different values, as seen in the table below.
        • +
        + +

        The table below summarizes the max memory increase (in MB) broken down by stages of the training loop for each of the methods.

        Batch Size | Method         | Model to GPU | Forward | First Backward | Second Backward | Optimizer Step
        32         | PyTorch SGD    | 62.5         | 0.5     | 62.5           | N/A             | 0
        32         | Vanilla DP-SGD | 62.5         | 0.47    | 3,663          | N/A             | 162.5
        32         | GC DP-SGD      | 62.5         | 0.47    | 63.13          | 50              | 125
        2^17       | PyTorch SGD    | 62.5         | 1920    | 1932.5         | N/A             | 0
        2^17       | Vanilla DP-SGD | OOM          |         |                |                 |
        2^17       | GC DP-SGD      | 62.5         | 1920    | 2625           | 1932.5          | 125

        (All values in MB; the large batch size is 2^17 = 131,072.)
        + +

        Industry use case

        + +

        We tested Ghost Clipping DP-SGD on an internal Meta use case, consisting of a model of size roughly 100B with 40M trainable parameters. Our initial results show that Ghost Clipping SGD reduces 95% memory of vanilla DP-SGD, and achieves comparable memory usage to PyTorch SGD.

        + +

        Conclusion

        + +

        In this post, we describe implementations of Fast Gradient Clipping and Ghost Clipping in Opacus that enable memory-efficient training of machine learning models with differential privacy. Currently, the Ghost Clipping implementation only applies to linear layers, but, as outlined in part 3 of the series, it can be extended to “generalized” linear layers such as convolutions and multi-head attention. The current techniques require two explicit backpropagation steps, which increases runtime. We will explore developments on top of Ghost Clipping such as the Book-Keeping algorithm for mitigation.

        + +

        To learn more about Opacus, visit opacus.ai and github.com/pytorch/opacus.

        + +

        Acknowledgements

        + +

        We thank Iden Kalemaj, Darren Liu, Karthik Prasad, Hao Shi, Igor Shilov, Davide Testuggine, Eli Uriegas, Haicheng Wang, and Richard Zou for valuable feedback and suggestions.

        + +
        +
        1. There are ways to extend Ghost Clipping to non-linear layers.
        +
        + +
        diff --git a/blog/compiling-numpy-code/index.html b/blog/compiling-numpy-code/index.html
        new file mode 100644
        index 000000000000..08c8ba491b55
        --- /dev/null
        +++ b/blog/compiling-numpy-code/index.html
        @@ -0,0 +1,913 @@

        Compiling NumPy code into C++ or CUDA via torch.compile | PyTorch

        + by + + Evgeni Burovski, Ralf Gommers and Mario Lezcano + +

        +

        Quansight engineers have implemented support for tracing through NumPy code via +torch.compile in PyTorch 2.1. This feature leverages PyTorch’s compiler to +generate efficient fused vectorized code without having to modify your original +NumPy code. Even more, it also allows for executing NumPy code on CUDA +just by running it through torch.compile under torch.device("cuda")!

        + +

        In this post, we go over how to use this feature and give a few tips and tricks +to make the most out of it.

        + +

        Compiling NumPy code into Parallel C++

        + +

        We take as our running example one step in a K-Means algorithm. +This piece of code is borrowed from this NumPy book

        + +
        import numpy as np
        +
        +def kmeans(X, means):
        +    return np.argmin(np.linalg.norm(X - means[:, None], axis=2), axis=0)
        +
        + +

        We create a synthetic dataset with 20M random 2-D points. We can see that, +given that the means are chosen appropriately, the function returns the correct +cluster for all of them

        + +
        npts = 10_000_000
        +X = np.repeat([[5, 5], [10, 10]], [npts, npts], axis=0)
        +X = X + np.random.randn(*X.shape)  # 2 distinct "blobs"
        +means = np.array([[5, 5], [10, 10]])
        +np_pred = kmeans(X, means)
        +
        + +

        Benchmarking this function gives us a baseline of 1.26s on an AMD 3970X CPU.

        + +

        Compiling this function is now as easy as wrapping it with torch.compile and +executing it with the example inputs

        + +
        import torch
        +
        +compiled_fn = torch.compile(kmeans)
        +compiled_pred = compiled_fn(X, means)
        +assert np.allclose(np_pred, compiled_pred)
        +
        + +

        The compiled function yields a 9x speed-up when running it on 1 core. Even +better, as opposed to NumPy, our generated code does take advantage of all the +cores in a processor. As such, when we run it on 32 cores, we get a 57x +speed-up. Note that PyTorch always uses all the available cores unless +explicitly restricted, so this is the default behavior you get when using +torch.compile.

        + +

        We may inspect the generated C++ code by running the script with the +environment variable TORCH_LOGS=output_code. When doing so, we can see that +torch.compile was able to compile the broadcasting and the two reductions +into just one for-loop, and parallelize it using OpenMP

        + +
        extern "C" void kernel(const double* in_ptr0, const long* in_ptr1, long* out_ptr0) {
        +    #pragma omp parallel num_threads(32)
        +    #pragma omp for
        +    for(long i0=0L; i0<20000000L; i0+=1L) {
        +        auto tmp0 = in_ptr0[2L*i0];
        +        auto tmp1 = in_ptr1[0L];
        +        auto tmp5 = in_ptr0[1L + (2L*i0)];
        +        auto tmp6 = in_ptr1[1L];
        +        // Rest of the kernel omitted for brevity
        +
        + +

        Compiling NumPy code into CUDA

        + +

        Compiling our code so that it runs on CUDA is as simple as setting the +default device to be CUDA

        + +
        with torch.device("cuda"):
        +    cuda_pred = compiled_fn(X, means)
        +assert np.allclose(np_pred, cuda_pred)
        +
        + +

        By inspecting the generated code via TORCH_LOGS=output_code, we see that, +rather than generating CUDA code directly, torch.compile generates rather +readable triton code

        + +
        def triton_(in_ptr0, in_ptr1, out_ptr0, XBLOCK : tl.constexpr):
        +    xnumel = 20000000
        +    xoffset = tl.program_id(0) * XBLOCK
        +    xindex = xoffset + tl.arange(0, XBLOCK)[:]
        +    xmask = xindex < xnumel
        +    x0 = xindex
        +    tmp0 = tl.load(in_ptr0 + (2*x0), xmask)
        +    tmp1 = tl.load(in_ptr1 + (0))
        +    // Rest of the kernel omitted for brevity
        +
        + +

        Running this small snippet on an RTX 2060 gives an 8x speed-up over the +original NumPy code. This is something, but it is not particularly impressive, +given the speed-ups we have seen on CPU. Let’s have a look into how to squeeze +the most out of our GPU via a couple minor changes.

        + +

        float64 vs float32. Many GPUs, in particular consumer-grade ones, are rather sluggish when running operations on float64. For this reason, when we change the data generation to float32, the original NumPy code gets just a bit faster, about 9%, but our CUDA code gets 40% faster, yielding an 11x speed-up over the plain NumPy code.

        + +

        torch.compile, by default, respects the NumPy semantics, and as such, it uses +np.float64 as its default dtype for all its creation ops. As discussed, this +can hinder performance, so it is possible to change this default by setting

        + +
        from torch._dynamo import config
        +config.numpy_default_float = "float32"
        +
        + +

        CPU <> CUDA copies. An 11x speed-up is good, but it is not even close to +the CPU numbers. This is caused by a small transformation that torch.compile +does behind the scenes. The code above takes NumPy arrays and returns NumPy +arrays. All of these arrays are on CPU, but the computations are performed on +the GPU. This means that every time the function is called, torch.compile has +to copy all these arrays from CPU to the GPU, and then copy the result back to +CPU to preserve the original semantics. There is no native solution to this +issue in NumPy, as NumPy does not have the notion of a device. That being +said, we can work around it by creating a wrapper to this function so that it +accepts PyTorch tensors and returns PyTorch tensors.

        + +
        @torch.compile
        +def tensor_fn(X, means):
        +    X, means = X.numpy(), means.numpy()
        +    ret = kmeans(X, means)
        +    return torch.from_numpy(ret)
        +
        +def cuda_fn(X, means):
        +    with torch.device("cuda"):
        +        return tensor_fn(X, means)
        +
        + +

This function now takes tensors in CUDA memory and returns tensors in CUDA memory, but the function itself is written in NumPy! torch.compile uses the numpy() and from_numpy() calls as hints and optimizes them away; internally it simply works with PyTorch tensors without moving the memory at all. When we keep the tensors in CUDA and perform the computations in float32, we see a 200x speed-up over the initial NumPy implementation on float32 arrays.

        + +

Mixing NumPy and PyTorch. In this example, we had to write a small adaptor to convert tensors to ndarrays and then back to tensors. In programs that mix PyTorch and NumPy, converting a tensor into an ndarray is often implemented as x.detach().cpu().numpy(), or simply x.numpy(force=True). Since when running under torch.compile we can run NumPy code in CUDA, we can implement this conversion pattern as a call to x.numpy(), as we did above. Doing so and running the resulting code under device("cuda") will generate efficient CUDA code from the original NumPy calls without copying the data from CUDA to CPU at all. Note that the resulting code does not run without torch.compile. For it to run in eager mode one would need to roll back to x.numpy(force=True).
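A minimal sketch of this pattern (the function body is illustrative, not the post's k-means code):

import numpy as np
import torch

@torch.compile
def scaled(x_t: torch.Tensor) -> torch.Tensor:
    # Eager-mode code would need x_t.numpy(force=True) (detach + CPU copy);
    # under torch.compile a plain .numpy() suffices and is optimized away,
    # so no CUDA<->CPU traffic is introduced.
    x = x_t.numpy()
    y = np.sqrt(x * x + 1.0)
    return torch.from_numpy(y)

with torch.device("cuda"):
    print(scaled(torch.randn(1_000)).device)  # stays on cuda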

        + +

        Further Speed-up tricks

        + +

General advice. The CUDA code we have shown is already quite efficient, but it is true that the running example is rather short. When dealing with larger programs, we may need to tweak parts of them to make them more efficient. A good place to start is the multiple tutorials and FAQs for torch.compile. These showcase a number of ways to inspect the tracing process, and how to identify problematic code that may cause slowdowns.

        + +

Advice when compiling NumPy code. NumPy, even if rather similar to PyTorch, is often used very differently. It is rather common to perform computations in NumPy and then do an if/else depending on values within the array, or perform operations in-place, perhaps via boolean masks. These constructions, while supported by torch.compile, hamper its performance. Changes like writing the code in a branchless way to avoid graph breaks, or avoiding in-place ops, can go a long way.
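A toy illustration of both points (the functions below are illustrative, not taken from the post); the two versions compute the same result:

import numpy as np

def masked_double_inplace(x):
    # In-place update through a boolean mask: valid NumPy, but it hampers
    # what torch.compile can optimize.
    out = x.copy()
    out[x > 0] *= 2
    return out

def masked_double_branchless(x):
    # Equivalent branchless, out-of-place formulation, friendlier to the compiler.
    return np.where(x > 0, x * 2, x)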

        + +

To write fast NumPy code, it is best to avoid loops, but sometimes they are unavoidable. When tracing through a loop, torch.compile will try to fully unroll it. This is sometimes desirable, but sometimes it may not even be possible, such as when we have a dynamic stopping condition, as in a while loop. In these cases, it may be best to just compile the body of the loop, perhaps a few iterations at a time (loop unrolling).
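A minimal sketch of compiling only the loop body while keeping the dynamic stopping condition in plain Python (the iteration itself is illustrative):

import numpy as np
import torch

@torch.compile
def step(x):
    # Only the loop body is compiled; the while loop and its dynamic stopping
    # condition stay in Python, so no full unrolling is attempted.
    return x - 0.1 * np.sign(x)

x = np.array([5.0, -3.0])
while np.abs(x).max() > 0.05:  # dynamic stopping condition
    x = step(x)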

        + +

Debugging NumPy code. Debugging is rather tricky when a compiler is involved. To figure out whether an error you are hitting is a torch.compile error, or an error from the program, you can execute your NumPy program without torch.compile by replacing the NumPy import with import torch._numpy as np. This should only be used for debugging purposes and is in no way a replacement for the PyTorch API, as it is much slower and, as a private API, may change without notice. See also this FAQ for other tricks.
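A sketch of the swap (debug-only; torch._numpy is private, slow, and may change without notice):

# import numpy as np           # original import
import torch._numpy as np      # drop-in replacement while debugging

x = np.arange(10) * 1.0
print(np.sum(x * x))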

        + +

        Differences between NumPy and torch.compile NumPy

        + +

NumPy scalars. NumPy returns NumPy scalars in almost any case where PyTorch would return a 0-D tensor (e.g. from np.sum). Under torch.compile, NumPy scalars are treated as 0-D arrays. This is just fine in most cases. The only case when their behavior diverges is when NumPy scalars are implicitly used as Python scalars. For example,

        + +
        >>> np.asarray(2) * [1, 2, 3]  # 0-D array is an array-like
        +array([2, 4, 6])
        +>>> u = np.int32(2)
        +>>> u * [1, 2, 3]              # scalar decays into a Python int
        +[1, 2, 3, 1, 2, 3]
        +>>> torch.compile(lambda: u * [1, 2, 3])()
        +array([2, 4, 6])               # acts as a 0-D array, not as a scalar ?!?!
        +
        + +

If we compile the first two lines, we see that torch.compile treats u as a 0-D array. To recover the eager semantics, we just need to make the casting explicit:

        + +
        >>> torch.compile(lambda: int(u) * [1, 2, 3])()
        +[1, 2, 3, 1, 2, 3]
        +
        + +

Type promotion and versioning. NumPy’s type promotion rules may be, at times, a bit surprising:

        + +
        >>> np.zeros(1, dtype=np.int8) + 127
        +array([127], dtype=int8)
        +>>> np.zeros(1, dtype=np.int8) + 128
        +array([128], dtype=int16)
        +
        + +

NumPy 2.0 is changing these rules to follow others that are closer to those of PyTorch. The relevant technical document is NEP 50. torch.compile went ahead and implemented NEP 50 rather than the about-to-be-deprecated rules.

        + +

        In general, NumPy within torch.compile follows NumPy 2.0 pre-release.

        + +

        Beyond NumPy: SciPy and scikit-learn

        + +

In parallel to this effort of making torch.compile understand NumPy code, other Quansight engineers have designed and proposed a way to support PyTorch tensors within scikit-learn and SciPy. This was received enthusiastically by other maintainers from these libraries, as it was shown that using PyTorch as a backend would often yield considerable speed-ups. Both projects have now merged initial support for PyTorch tensors across a number of APIs and submodules.

        + +

This sets the stepping stone to move towards a future where PyTorch tensors can be used within other libraries in the Python data ecosystem. Even more, this will enable running these other libraries on GPUs and even compiling code mixing these libraries and PyTorch, similar to what we have discussed in this post.

        + +

If you want to learn more about this effort, how to use it, or how to help move it forward, see this other blog post.

        + +

        Conclusion

        + +

PyTorch has committed since its inception to being a framework compatible with the rest of the Python ecosystem. Enabling the compilation of NumPy programs, and establishing the tools necessary to do the same for other prominent libraries, are two more steps in this direction. Quansight and Meta continue working hand in hand, improving the compatibility between PyTorch and the rest of the ecosystem.

        + +

From Quansight, we would like to thank Mengwei, Voz, and Ed for their invaluable help in integrating our work with torch.compile. We would also like to thank Meta for funding this project as well as previous work on improving NumPy compatibility within PyTorch, and the project that led to supporting PyTorch within scikit-learn and SciPy. These are giant leaps towards consolidating PyTorch as the framework of choice within the open source Python data ecosystem.

        + +
diff --git a/blog/compromised-nightly-dependency/index.html b/blog/compromised-nightly-dependency/index.html new file mode 100644 index 000000000000..da2e21fc1756 --- /dev/null +++ b/blog/compromised-nightly-dependency/index.html @@ -0,0 +1,709 @@
Compromised PyTorch-nightly dependency chain between December 25th and December 30th, 2022. | PyTorch

by The PyTorch Team

        +

        If you installed PyTorch-nightly on Linux via pip between December 25, 2022 and December 30, 2022, please uninstall it and torchtriton immediately, and use the latest nightly binaries (newer than Dec 30th 2022).

        + +
        $ pip3 uninstall -y torch torchvision torchaudio torchtriton
        +$ pip3 cache purge
        +
        + +

        PyTorch-nightly Linux packages installed via pip during that time installed a dependency, torchtriton, which was compromised on the Python Package Index (PyPI) code repository and ran a malicious binary. This is what is known as a supply chain attack and directly affects dependencies for packages that are hosted on public package indices.

        + +

        NOTE: Users of the PyTorch stable packages are not affected by this issue.

        + +

        How to check if your Python environment is affected

        + +

        The following command searches for the malicious binary in the torchtriton package (PYTHON_SITE_PACKAGES/triton/runtime/triton) and prints out whether your current Python environment is affected or not.

        + +
        python3 -c "import pathlib;import importlib.util;s=importlib.util.find_spec('triton'); affected=any(x.name == 'triton' for x in (pathlib.Path(s.submodule_search_locations[0] if s is not None else '/' ) / 'runtime').glob('*'));print('You are {}affected'.format('' if affected else 'not '))"
        +
        + +

        The malicious binary is executed when the triton package is imported, which requires explicit code to do and is not PyTorch’s default behavior.

        + +

        The Background

        + +

        At around 4:40pm GMT on December 30 (Friday), we learned about a malicious dependency package (torchtriton) that was uploaded to the Python Package Index (PyPI) code repository with the same package name as the one we ship on the PyTorch nightly package index. Since the PyPI index takes precedence, this malicious package was being installed instead of the version from our official repository. This design enables somebody to register a package by the same name as one that exists in a third party index, and pip will install their version by default.

        + +

This malicious package has the same name, torchtriton, but contains added code that uploads sensitive data from the machine.

        + +

        What we know

        + +

        torchtriton on PyPI contains a malicious triton binary which is installed at PYTHON_SITE_PACKAGES/triton/runtime/triton. Its SHA256 hash is listed below.

        + +

        SHA256(triton)= 2385b29489cd9e35f92c072780f903ae2e517ed422eae67246ae50a5cc738a0e
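A quick sketch for checking a locally installed binary against this hash (the path follows the PYTHON_SITE_PACKAGES/triton/runtime/triton location given above; adapt it to your environment):

import hashlib, pathlib, sysconfig

MALICIOUS_SHA256 = "2385b29489cd9e35f92c072780f903ae2e517ed422eae67246ae50a5cc738a0e"

site = pathlib.Path(sysconfig.get_paths()["purelib"])
binary = site / "triton" / "runtime" / "triton"
if binary.exists():
    digest = hashlib.sha256(binary.read_bytes()).hexdigest()
    print("MATCHES the malicious binary" if digest == MALICIOUS_SHA256 else "no match")
else:
    print("binary not present")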

        + +

        The binary’s main function does the following:

        + +
• Get system information
  • nameservers from /etc/resolv.conf
  • hostname from gethostname()
  • current username from getlogin()
  • current working directory name from getcwd()
  • environment variables
• Read the following files
  • /etc/hosts
  • /etc/passwd
  • The first 1,000 files in $HOME/*
  • $HOME/.gitconfig
  • $HOME/.ssh/*
• Upload all of this information, including file contents, via encrypted DNS queries to the domain *.h4ck[.]cfd, using the DNS server wheezy[.]io
        + +

        The binary’s file upload functionality is limited to files less than 99,999 bytes in size. It also uploads only the first 1,000 files in $HOME (but all files < 99,999 bytes in the .ssh directory).

        + +

        Steps taken towards mitigation

        + +
• torchtriton has been removed as a dependency for our nightly packages and replaced with pytorch-triton (pytorch/pytorch#91539) and a dummy package registered on PyPI (so that this issue doesn’t repeat)
• All nightly packages that depend on torchtriton have been removed from our package indices at https://download.pytorch.org until further notice
• We have reached out to the PyPI security team to get proper ownership of the torchtriton package on PyPI and to delete the malicious version
diff --git a/blog/computational-graphs-constructed-in-pytorch/index.html b/blog/computational-graphs-constructed-in-pytorch/index.html new file mode 100644 index 000000000000..1a5e444fa20c --- /dev/null +++ b/blog/computational-graphs-constructed-in-pytorch/index.html @@ -0,0 +1,1095 @@
How Computational Graphs are Constructed in PyTorch | PyTorch

by Preferred Networks

        +

        In the previous post we went over the theoretical foundations of automatic differentiation and reviewed the implementation in PyTorch. In this post, we will be showing the parts of PyTorch involved in creating the graph and executing it. In order to understand the following contents, please read @ezyang’s wonderful blog post about PyTorch internals.

        + +

        Autograd components

        + +

        First of all, let’s look at where the different components of autograd live:

        + +

        tools/autograd: Here we can find the definition of the derivatives as we saw in the previous post derivatives.yaml, several python scripts and a folder called templates. These scripts and the templates are used at building time to generate the C++ code for the derivatives as specified in the yaml file. Also, the scripts here generate wrappers for the regular ATen functions so that the computational graph can be constructed.

        + +

torch/autograd: This folder is where the autograd components that can be used directly from python are located. In function.py we find the actual definition of torch.autograd.Function, a class used by users to write their own differentiable functions in python as per the documentation. functional.py holds components for functionally computing the jacobian vector product, hessian, and other gradient related computations of a given function. The rest of the files have additional components such as gradient checkers, anomaly detection, and the autograd profiler.

        + +

torch/csrc/autograd: This is where the graph creation and execution-related code lives. All this code is written in C++, since it is a critical part that is required to be extremely performant. Here we have several files that implement the engine, metadata storage, and all the needed components. Alongside this, we have several files whose names start with python_, and their main responsibility is to allow python objects to be used in the autograd engine.

        + +

        Graph Creation

        + +

        Previously, we described the creation of a computational graph. Now, we will see how PyTorch creates these graphs with references to the actual codebase.

        + +

        + +
Figure 1: Example of an augmented computational graph

        + +

It all starts in our Python code, where we request a tensor that requires the gradient.

        + +
        >>> x = torch.tensor([0.5, 0.75], requires_grad=True)
        +
        + +

When the requires_grad flag is set during tensor creation, c10 will allocate an AutogradMeta object that is used to hold the graph information.

        + +
        
        +void TensorImpl::set_requires_grad(bool requires_grad) {
        +  ...
        +  if (!autograd_meta_)
        +    autograd_meta_ = impl::GetAutogradMetaFactory()->make();
        +    autograd_meta_->set_requires_grad(requires_grad, this);
        +}
        +
        + +

        The AutogradMeta object is defined in torch/csrc/autograd/variable.h as follows:

        + +
        
        +struct TORCH_API AutogradMeta : public c10::AutogradMetaInterface {
        +  std::string name_;
        +
        +  Variable grad_;
        +  std::shared_ptr<Node> grad_fn_;
        +  std::weak_ptr<Node> grad_accumulator_;
        +  // other fields and methods
        +  ...
        +};
        +
        + +

        The most important fields in this structure are the computed gradient in grad_ and a pointer to the function grad_fn that will be called by the engine to produce the actual gradient. Also, there is a gradient accumulator object that is used to add together all the different gradients where this tensor is involved as we will see in the graph execution.

        + +

        Graphs, Nodes and Edges.

        + +

        Now, when we call a differentiable function that takes this tensor as an argument, the associated metadata will be populated. Let’s suppose that we call a regular torch function that is implemented in ATen. Let it be the multiplication as in our previous blog post example. The resulting tensor has a field called grad_fn that is essentially a pointer to the function that will be used to compute the gradient of that operation.

        + +
        >>> x = torch.tensor([0.5, 0.75], requires_grad=True)
        +>>> v = x[0] * x[1]
        +>>> v
        +tensor(0.3750, grad_fn=<MulBackward0>)
        +
        + +

Here we see that the tensor’s grad_fn has a MulBackward0 value. This function is the same one that was written in the derivatives.yaml file, and its C++ code was generated automatically by the scripts in tools/autograd. Its auto-generated source code can be seen in torch/csrc/autograd/generated/Functions.cpp.

        + +
        variable_list MulBackward0::apply(variable_list&& grads) {
        +  std::lock_guard<std::mutex> lock(mutex_);
        +
        +  IndexRangeGenerator gen;
        +  auto self_ix = gen.range(1);
        +  auto other_ix = gen.range(1);
        +  variable_list grad_inputs(gen.size());
        +  auto& grad = grads[0];
        +  auto self = self_.unpack();
        +  auto other = other_.unpack();
        +  bool any_grad_defined = any_variable_defined(grads);
        +  if (should_compute_output({ other_ix })) {
        +    auto grad_result = any_grad_defined ? (mul_tensor_backward(grad, self, other_scalar_type)) : Tensor();
        +    copy_range(grad_inputs, other_ix, grad_result);
        +  }
        +  if (should_compute_output({ self_ix })) {
        +    auto grad_result = any_grad_defined ? (mul_tensor_backward(grad, other, self_scalar_type)) : Tensor();
        +    copy_range(grad_inputs, self_ix, grad_result);
        +  }
        +  return grad_inputs;
        +}
        +
        + +

        The grad_fn objects inherit from the TraceableFunction class, a descendant of Node with just a property set to enable tracing for debugging and optimization purposes. A graph by definition has nodes and edges, so these functions are indeed the nodes of the computational graph that are linked together by using Edge objects to enable the graph traversal later on.

        + +

        The Node definition can be found in the torch/csrc/autograd/function.h file.

        + +
        struct TORCH_API Node : std::enable_shared_from_this<Node> {
        + ...
        + /// Evaluates the function on the given inputs and returns the result of the
        +  /// function call.
        +  variable_list operator()(variable_list&& inputs) {
        +  ...
        +  }
        +
        +protected:
        +  /// Performs the `Node`'s actual operation.
        +  virtual variable_list apply(variable_list&& inputs) = 0;
        +  
        +  edge_list next_edges_;
        +
        + +

        Essentially we see that it has an override of the operator () that performs the call to the actual function, and a pure virtual function called apply. The automatically generated functions override this apply method as we saw in the MulBackward0 example above. Finally, the node also has a list of edges to enable graph connectivity.

        + +

        The Edge object is used to link Nodes together and its implementation is straightforward.

        + +
        struct Edge {
        +  ...
        +  /// The function this `Edge` points to.
        +  std::shared_ptr<Node> function;
        +  /// The identifier of a particular input to the function.
        +  uint32_t input_nr;
        +};
        +
        + +

        It only requires a function pointer (the actual grad_fn objects that the edges link together), and an input number that acts as an id for the edge.
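These (function, input_nr) pairs are also visible from Python as grad_fn.next_functions; a quick sketch (the exact reprs vary across PyTorch versions):

>>> x = torch.tensor([0.5, 0.75], requires_grad=True)
>>> v = x[0] * x[1]
>>> v.grad_fn.next_functions     # one (Node, input_nr) pair per edge
((<SelectBackward0 object at 0x...>, 0), (<SelectBackward0 object at 0x...>, 0))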

        + +

        Linking nodes together

        + +

        When we invoke the product operation of two tensors, we enter into the realm of autogenerated code. All the scripts that we saw in tools/autograd fill a series of templates that wrap the differentiable functions in ATen. These functions have code to construct the backward graph during the forward pass.

        + +

        The gen_variable_type.py script is in charge of writing all this wrapping code. This script is called from the tools/autograd/gen_autograd.py during the pytorch build process and it will output the automatically generated function wrappers to torch/csrc/autograd/generated/.

        + +

Let’s take a look at what the generated function for tensor multiplication looks like. The code has been simplified, but it can be found in the torch/csrc/autograd/generated/VariableType_4.cpp file when compiling pytorch from source.

        + +
        at::Tensor mul_Tensor(c10::DispatchKeySet ks, const at::Tensor & self, const at::Tensor & other) {
        +  ...
        +  auto _any_requires_grad = compute_requires_grad( self, other );
        +  std::shared_ptr<MulBackward0> grad_fn;
        +  if (_any_requires_grad) {
        +    // Creates the link to the actual grad_fn and links the graph for backward traversal
        +    grad_fn = std::shared_ptr<MulBackward0>(new MulBackward0(), deleteNode);
        +    grad_fn->set_next_edges(collect_next_edges( self, other ));
        +    ...
        +  }
        +  
        +  // Does the actual function call to ATen
        +  auto _tmp = ([&]() {
        +    at::AutoDispatchBelowADInplaceOrView guard;
        +    return at::redispatch::mul(ks & c10::after_autograd_keyset, self_, other_);
        +  })();
        +
        +  auto result = std::move(_tmp);
        +    if (grad_fn) {
        +       // Connects the result to the graph
        +      set_history(flatten_tensor_args( result ), grad_fn);
        +  }
        +  ...
        +  return result;
        +}
        +
        + +

Let’s walk through the most important lines of this code. First of all, the grad_fn object is created with: `grad_fn = std::shared_ptr<MulBackward0>(new MulBackward0(), deleteNode);`.

        + +

        After the grad_fn object is created, the edges used to link the nodes together are created by using the grad_fn->set_next_edges(collect_next_edges( self, other )); calls.

        + +
        struct MakeNextFunctionList : IterArgs<MakeNextFunctionList> {
        +  edge_list next_edges;
        +  using IterArgs<MakeNextFunctionList>::operator();
        +  void operator()(const Variable& variable) {
        +    if (variable.defined()) {
        +      next_edges.push_back(impl::gradient_edge(variable));
        +    } else {
        +      next_edges.emplace_back();
        +    }
        +  }
        +  void operator()(const c10::optional<Variable>& variable) {
        +    if (variable.has_value() && variable->defined()) {
        +      next_edges.push_back(impl::gradient_edge(*variable));
        +    } else {
        +      next_edges.emplace_back();
        +    }
        +  }
        +};
        +
        +template <typename... Variables>
        +edge_list collect_next_edges(Variables&&... variables) {
        +  detail::MakeNextFunctionList make;
        +  make.apply(std::forward<Variables>(variables)...);
        +  return std::move(make.next_edges);
        +}
        +
        + +

Given an input variable (it’s just a regular tensor), collect_next_edges will create an Edge object by calling impl::gradient_edge:

        + +
         Edge gradient_edge(const Variable& self) {
        +    // If grad_fn is null (as is the case for a leaf node), we instead
        +    // interpret the gradient function to be a gradient accumulator, which will
        +    // accumulate its inputs into the grad property of the variable. These
        +    // nodes get suppressed in some situations, see "suppress gradient
        +    // accumulation" below. Note that only variables which have `requires_grad =
        +    // True` can have gradient accumulators.
        +    if (const auto& gradient = self.grad_fn()) {
        +      return Edge(gradient, self.output_nr());
        +    } else {
        +      return Edge(grad_accumulator(self), 0);
        +    }
        +  }
        +
        + +

To understand how edges work, let’s assume that a previously executed function produced two output tensors, both with their grad_fn set; each tensor also has an output_nr property with the order in which they were returned. When creating the edges for the current grad_fn, an Edge object per input variable will be created. The edges will point to the variable’s grad_fn and will also track the output_nr to establish ids used when traversing the graph. In the case that the input variables are “leaf” tensors, i.e. they were not produced by any differentiable function, they don’t have a grad_fn attribute set. A special function called a gradient accumulator is set by default, as seen in the above code snippet.
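From Python, this gradient accumulator shows up as an AccumulateGrad node in next_functions whenever an input is a leaf; a small sketch (exact reprs vary):

>>> a = torch.tensor(2.0, requires_grad=True)   # leaf: a.grad_fn is None
>>> b = torch.tensor(3.0, requires_grad=True)   # leaf
>>> c = a * b
>>> c.grad_fn.next_functions                    # edges point to the accumulators
((<AccumulateGrad object at 0x...>, 0), (<AccumulateGrad object at 0x...>, 0))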

        + +

        After the edges are created, the grad_fn graph Node object that is being currently created will hold them using the set_next_edges function. This is what connects grad_fns together, producing the computational graph.

        + +
         void set_next_edges(edge_list&& next_edges) {
        +    next_edges_ = std::move(next_edges);
        +    for(const auto& next_edge : next_edges_) {
        +      update_topological_nr(next_edge);
        +    }
        +  }
        +
        + +

        Now, the forward pass of the function will execute, and after the execution set_history will connect the output tensors to the grad_fn Node.

        + +
        inline void set_history(
        +    at::Tensor& variable,
        +    const std::shared_ptr<Node>& grad_fn) {
        +  AT_ASSERT(grad_fn);
        +  if (variable.defined()) {
        +    // If the codegen triggers this, you most likely want to add your newly added function
        +    // to the DONT_REQUIRE_DERIVATIVE list in tools/autograd/gen_variable_type.py
        +    TORCH_INTERNAL_ASSERT(isDifferentiableType(variable.scalar_type()));
        +    auto output_nr =
        +        grad_fn->add_input_metadata(variable);
        +    impl::set_gradient_edge(variable, {grad_fn, output_nr});
        +  } else {
        +    grad_fn->add_input_metadata(Node::undefined_input());
        +  }
        +}
        +
        + +

        set_history calls set_gradient_edge, which just copies the grad_fn and the output_nr to the AutogradMeta object that the tensor has.

        + +
         void set_gradient_edge(const Variable& self, Edge edge) {
        +    auto* meta = materialize_autograd_meta(self);
        +    meta->grad_fn_ = std::move(edge.function);
        +    meta->output_nr_ = edge.input_nr;
        +    // For views, make sure this new grad_fn_ is not overwritten unless it is necessary
        +    // in the VariableHooks::grad_fn below.
        +    // This logic is only relevant for custom autograd Functions for which multiple
        +    // operations can happen on a given Tensor before its gradient edge is set when
        +    // exiting the custom Function.
        +    auto diff_view_meta = get_view_autograd_meta(self);
        +    if (diff_view_meta && diff_view_meta->has_bw_view()) {
        +      diff_view_meta->set_attr_version(self._version());
        +    }
        +  }
        +
        + +

        This tensor now will be the input to another function and the above steps will be all repeated. Check the animation below to see how the graph is created.

        + +

        + +
Figure 2: Animation that shows the graph creation

        + +

        Registering Python Functions in the graph

        + +

        We have seen how autograd creates the graph for the functions included in ATen. However, when we define our differentiable functions in Python, they are also included in the graph!

        + +

        An autograd python defined function looks like the following:

        + +
        class Exp(torch.autograd.Function):
        +     @staticmethod
        +     def forward(ctx, i):
        +         result = i.exp()
        +         ctx.save_for_backward(result)
        +         return result
        +
        +     @staticmethod
        +     def backward(ctx, grad_output):
        +         result, = ctx.saved_tensors
        +         return grad_output * result
        +
        +# Call the function
        +Exp.apply(torch.tensor(0.5, requires_grad=True))
        +# Outputs: tensor(1.6487, grad_fn=<ExpBackward>)
        +
        + +

        In the above snippet autograd detected our python function when creating the graph. All of this is possible thanks to the Function class. Let’s take a look at what happens when we call apply.

        + +

        apply is defined in the torch._C._FunctionBase class, but this class is not present in the python source. _FunctionBase is defined in C++ by using the python C API to hook C functions together into a single python class. We are looking for a function named THPFunction_apply.

        + +
        
        +PyObject *THPFunction_apply(PyObject *cls, PyObject *inputs)
        +{
        +  
        +  // Generates the graph node
        +  THPObjectPtr backward_cls(PyObject_GetAttrString(cls, "_backward_cls"));
        +  if (!backward_cls) return nullptr;
        +  THPObjectPtr ctx_obj(PyObject_CallFunctionObjArgs(backward_cls, nullptr));
        +  if (!ctx_obj) return nullptr;
        +  THPFunction* ctx = (THPFunction*)ctx_obj.get();
        +
        +  auto cdata = std::shared_ptr<PyNode>(new PyNode(std::move(ctx_obj)), deleteNode);
        +  ctx->cdata = cdata;
        +
        +  // Prepare inputs and allocate context (grad fn)
        +  // Unpack inputs will collect the edges
        +  auto info_pair = unpack_input<false>(inputs);
        +  UnpackedInput& unpacked_input = info_pair.first;
        +  InputFlags& input_info = info_pair.second;
        +
        +   // Initialize backward function (and ctx)
        +  bool is_executable = input_info.is_executable;
        +  cdata->set_next_edges(std::move(input_info.next_edges));
        +  ctx->needs_input_grad = input_info.needs_input_grad.release();
        +  ctx->is_variable_input = std::move(input_info.is_variable_input);
        +
        +  // Prepend ctx to input_tuple, in preparation for static method call
        +  auto num_args = PyTuple_GET_SIZE(inputs);
        +  THPObjectPtr ctx_input_tuple(PyTuple_New(num_args + 1));
        +  if (!ctx_input_tuple) return nullptr;
        +  Py_INCREF(ctx);
        +  PyTuple_SET_ITEM(ctx_input_tuple.get(), 0, (PyObject*)ctx);
        +  for (int i = 0; i < num_args; ++i) {
        +    PyObject *arg = PyTuple_GET_ITEM(unpacked_input.input_tuple.get(), i);
        +    Py_INCREF(arg);
        +    PyTuple_SET_ITEM(ctx_input_tuple.get(), i + 1, arg);
        +  }
        +
        +  // Call forward
        +  THPObjectPtr tensor_outputs;
        +  {
        +    AutoGradMode grad_mode(false);
        +    THPObjectPtr forward_fn(PyObject_GetAttrString(cls, "forward"));
        +    if (!forward_fn) return nullptr;
        +    tensor_outputs = PyObject_CallObject(forward_fn, ctx_input_tuple);
        +    if (!tensor_outputs) return nullptr;
        +  }
        +
        +  // Here is where the outputs gets the tensors tracked
        +  return process_outputs(cls, cdata, ctx, unpacked_input, inputs, std::move(tensor_outputs),
        +                         is_executable, node);
        +  END_HANDLE_TH_ERRORS
        +}
        +
        + +

        Although this code is hard to read at first due to all the python API calls, it essentially does the same thing as the auto-generated forward functions that we saw for ATen:

        + +

1. Create a grad_fn object.
2. Collect the edges to link the current grad_fn with the input tensors’ ones.
3. Execute the function’s forward.
4. Assign the created grad_fn to the output tensors’ metadata.

        + +

        The grad_fn object is created in:

        + +
          // Generates the graph node
        +  THPObjectPtr backward_cls(PyObject_GetAttrString(cls, "_backward_cls"));
        +  if (!backward_cls) return nullptr;
        +  THPObjectPtr ctx_obj(PyObject_CallFunctionObjArgs(backward_cls, nullptr));
        +  if (!ctx_obj) return nullptr;
        +  THPFunction* ctx = (THPFunction*)ctx_obj.get();
        +
        +  auto cdata = std::shared_ptr<PyNode>(new PyNode(std::move(ctx_obj)), deleteNode);
        +  ctx->cdata = cdata;
        +
        + +

Basically, it asks the python API to get a pointer to the Python object that can execute the user-written function. Then it wraps it into a PyNode object, a specialized Node object that calls the python interpreter with the provided python function when apply is executed during the forward pass. Note that in the code cdata is the actual Node object that is part of the graph. ctx is the object that is passed to the python forward/backward functions and is used to store autograd-related information by both the user’s function and PyTorch.

        + +

        As in the regular C++ functions we also call collect_next_edges to track the inputs grad_fn objects, but this is done in unpack_input:

        + +
        template<bool enforce_variables>
        +std::pair<UnpackedInput, InputFlags> unpack_input(PyObject *args) {
        +  ...
        +  flags.next_edges = (flags.is_executable ? collect_next_edges(unpacked.input_vars) : edge_list());
        +  return std::make_pair(std::move(unpacked), std::move(flags));
        +}
        +
        + +

        After this, the edges are assigned to the grad_fn by just doing cdata->set_next_edges(std::move(input_info.next_edges)); and the forward function is called through the python interpreter C API.

        + +

        Once the output tensors are returned from the forward pass, they are processed and converted to variables inside the process_outputs function.

        + +
        PyObject* process_outputs(PyObject *op_obj, const std::shared_ptr<PyNode>& cdata,
        +                          THPFunction* grad_fn, const UnpackedInput& unpacked,
        +                          PyObject *inputs, THPObjectPtr&& raw_output, bool is_executable,
        +                          torch::jit::Node* node) {
        +  ...
        +  _wrap_outputs(cdata, grad_fn, unpacked.input_vars, raw_output, outputs, is_executable);
        +  _trace_post_record(node, op_obj, unpacked.input_vars, outputs, is_inplace, unpack_output);
        +  if (is_executable) {
        +    _save_variables(cdata, grad_fn);
        +  } ...
        +  return outputs.release();
        +}
        +
        + +

        Here, _wrap_outputs is in charge of setting the forward outputs grad_fn to the newly created one. For this, it calls another _wrap_outputs function defined in a different file, so the process here gets a little confusing.

        + +
        static void _wrap_outputs(const std::shared_ptr<PyNode>& cdata, THPFunction *self,
        +    const variable_list &input_vars, PyObject *raw_output, PyObject *outputs, bool is_executable)
        +{
        +  auto cdata_if_executable = is_executable ? cdata : nullptr;
        + ...
        +
        +  // Wrap only the tensor outputs.
        +  // This calls csrc/autograd/custom_function.cpp
        +  auto wrapped_outputs = _wrap_outputs(input_vars, non_differentiable, dirty_inputs, raw_output_vars, cdata_if_executable);
        +...
        +}
        +
        + +

        The called _wrap_outputs is the one in charge of setting the autograd metadata in the output tensors:

        + +
        std::vector<c10::optional<Variable>> _wrap_outputs(const variable_list &input_vars,
        +  const std::unordered_set<at::TensorImpl*> &non_differentiable,
        +  const std::unordered_set<at::TensorImpl*> &dirty_inputs,
        +  const at::ArrayRef<c10::optional<Variable>> raw_outputs,
        +  const std::shared_ptr<Node> &cdata) {
        +
        +
        +  std::unordered_set<at::TensorImpl*> inputs;
        +  
        +  // Sets the grad_fn and output_nr of an output Variable.
        +  auto set_history = [&](Variable& var, uint32_t output_nr, bool is_input, bool is_modified,
        +                         bool is_differentiable) {
        +    // Lots of checks
        +    if (!is_differentiable) {
        +     ...
        +    } else if (is_input) {
        +      // An input has been returned, but it wasn't modified. Return it as a view
        +      // so that we can attach a new grad_fn to the Variable.
        +      // Run in no_grad mode to mimic the behavior of the forward.
        +      {
        +        AutoGradMode grad_mode(false);
        +        var = var.view_as(var);
        +      }
        +      impl::set_gradient_edge(var, {cdata, output_nr});
        +    } else if (cdata) {
        +      impl::set_gradient_edge(var, {cdata, output_nr});
        +    }
        +  };
        +
        + +

        And this is where set_gradient_edge was called and this is how a user-written python function gets included in the computational graph with its associated backward function!
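A quick sketch of what this looks like from Python for the Exp example above (exact reprs vary by PyTorch version):

>>> out = Exp.apply(torch.tensor(0.5, requires_grad=True))
>>> out.grad_fn                    # the PyNode wrapping our python-defined backward
<torch.autograd.function.ExpBackward object at 0x...>
>>> out.grad_fn.next_functions     # edge to the leaf input’s gradient accumulator
((<AccumulateGrad object at 0x...>, 0),)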

        + +

        Closing remarks

        + +

        This blog post is intended to be a code overview on how PyTorch constructs the actual computational graphs that we discussed in the previous post. The next entry will deal with how the autograd engine executes these graphs.

        + +
diff --git a/blog/cuda-free-inference-for-llms/index.html b/blog/cuda-free-inference-for-llms/index.html new file mode 100644 index 000000000000..385b9ff7c3b2 --- /dev/null +++ b/blog/cuda-free-inference-for-llms/index.html @@ -0,0 +1,1962 @@
CUDA-Free Inference for LLMs | PyTorch

        September 04, 2024

        +

CUDA-Free Inference for LLMs

        +
        +
        + +
        +
        +
        + +
        +

by Adnan Hoque, Less Wright, Raghu Ganti and Mudhakar Srivatsa

        +

In this blog, we discuss the methods we used to achieve FP16 inference with popular LLM models such as Meta’s Llama3-8B and IBM’s Granite-8B Code, where 100% of the computation is performed using OpenAI’s Triton Language. For single token generation times using our Triton kernel based models, we were able to approach 0.76-0.78x performance relative to the CUDA kernel dominant workflows for both Llama and Granite on Nvidia H100 GPUs, and 0.62-0.82x on Nvidia A100 GPUs.

        + +

        Why explore using 100% Triton? Triton provides a path for enabling LLMs to run on different types of GPUs - NVIDIA, AMD, and in the future Intel and other GPU based accelerators. It also provides a higher layer of abstraction in Python for programming GPUs and has allowed us to write performant kernels faster than authoring them using vendor specific APIs. In the rest of this blog, we will share how we achieve CUDA-free compute, micro-benchmark individual kernels for comparison, and discuss how we can further improve future Triton kernels to close the gaps.

        + +

        + +

Figure 1. Inference throughput benchmarks with Triton and CUDA variants of Llama3-8B and Granite-8B, on NVIDIA H100 and A100. Settings: batch size = 2, input sequence length = 512, output sequence length = 256

        + +

        2.0 Composition of a Transformer Block

        + +

        We start with a breakdown of the computations that happen in Transformer-based models. The figure below shows the “kernels” of a typical Transformer block.

        + +

Figure 2. Transformer Block by core kernels

        + +

        The core operations for a Llama3 architecture are summarized in this list:

        + +
1. RMSNorm
2. Matrix multiplication: Fused QKV
3. RoPE
4. Attention
5. Matrix multiplication: Output Projection
6. RMSNorm
7. Matrix multiplication: Fused Gate + Up Projection
8. Activation function: SiLU
9. Element Wise Multiplication
10. Matrix multiplication: Down Projection
        + +
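To make the composition concrete, here is a heavily simplified, eager-mode PyTorch sketch of how these operations chain together (dimensions are scaled down, grouped-query attention and RoPE are omitted, and this is not the actual Llama3/Granite code):

import torch
import torch.nn.functional as F

D, H, HD, FF = 256, 4, 64, 1024  # toy sizes, not the real 8B-model shapes

def rms_norm(x, w, eps=1e-5):
    return w * x / torch.sqrt(x.pow(2).mean(-1, keepdim=True) + eps)

def transformer_block(x, w):
    h = rms_norm(x, w["norm1"])                               # 1. RMSNorm
    q, k, v = (h @ w["qkv"]).split(D, dim=-1)                 # 2. fused QKV matmul
    q, k, v = (t.view(*t.shape[:-1], H, HD).transpose(1, 2) for t in (q, k, v))
    a = F.scaled_dot_product_attention(q, k, v)               # 4. attention (3. RoPE omitted)
    a = a.transpose(1, 2).reshape(x.shape)
    x = x + a @ w["o"]                                        # 5. output projection
    h = rms_norm(x, w["norm2"])                               # 6. RMSNorm
    gate, up = (h @ w["gate_up"]).chunk(2, dim=-1)            # 7. fused gate + up matmul
    h = F.silu(gate) * up                                     # 8./9. SiLU + element-wise mul
    return x + h @ w["down"]                                  # 10. down projection

w = {"norm1": torch.ones(D), "norm2": torch.ones(D),
     "qkv": torch.randn(D, 3 * D) * 0.02, "o": torch.randn(D, D) * 0.02,
     "gate_up": torch.randn(D, 2 * FF) * 0.02, "down": torch.randn(FF, D) * 0.02}
x = torch.randn(2, 16, D)
print(transformer_block(x, w).shape)  # torch.Size([2, 16, 256])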

        Each of these operations is computed on the GPU through the execution of one (or multiple) kernels. While the specifics of each of these kernels can vary across different transformer models, the core operations remain the same. For example, IBM’s Granite 8B Code model uses bias in the MLP layer, different from Llama3. Such changes do require modifications to the kernels. A typical model is a stack of these transformer blocks wired together with embedding layers.

        + +

        3.0 Model Inference

        + +

        Typical model architecture code is shared with a python model.py file that is launched by PyTorch. In the default PyTorch eager execution mode, these kernels are all executed with CUDA. To achieve 100% Triton for end-to-end Llama3-8B and Granite-8B inference we need to write and integrate handwritten Triton kernels as well as leverage torch.compile (to generate Triton ops). First, we replace smaller ops with compiler generated Triton kernels, and second, we replace more expensive and complex computations (e.g. matrix multiplication and flash attention) with handwritten Triton kernels.

        + +

        Torch.compile generates Triton kernels automatically for RMSNorm, RoPE, SiLU and Element Wise Multiplication. Using tools like Nsight Systems we can observe these generated kernels; they appear as tiny dark green kernels in-between the matrix multiplications and attention.

        + +

Figure 3. Trace of Llama3-8B with torch.compile, showing CUDA kernels being used for matrix multiplications and flash attention

        + +

        For the above trace, we note that the two major ops that make up 80% of the E2E latency in a Llama3-8B style model are matrix multiplication and attention kernels and both remain CUDA kernels. Thus to close the remaining gap, we replace both matmul and attention kernels with handwritten Triton kernels.

        + +

        4.0 Triton SplitK GEMM Kernel

        + +

        For the matrix multiplications in the linear layers, we wrote a custom FP16 Triton GEMM (General Matrix-Matrix Multiply) kernel that leverages a SplitK work decomposition. We have previously discussed this parallelization in other blogs as a way to accelerate the decoding portion of LLM inference.

        + +

        5.0 GEMM Kernel Tuning

        + +

        To achieve optimal performance we used the exhaustive search approach to tune our SplitK GEMM kernel. Granite-8B and Llama3-8B have linear layers with the following shapes:

Linear Layer               | Shape (in_features, out_features)
Fused QKV Projection       | (4096, 6144)
Output Projection          | (4096, 4096)
Fused Gate + Up Projection | (4096, 28672)
Down Projection            | (14336, 4096)
        + +

        Figure 4. Granite-8B and Llama3-8B Linear Layer Weight Matrix Shapes

        + +

Each of these linear layers has a different weight matrix shape. Thus, for optimal performance the Triton kernel must be tuned for each of these shape profiles. After tuning for each linear layer we were able to achieve a 1.20x E2E speedup on Llama3-8B and Granite-8B over the untuned Triton kernel.

        + +

        6.0 Flash Attention Kernel

        + +

        We evaluated a suite of existing Triton flash attention kernels with different configurations, namely:

        + +
1. AMD Flash
2. OpenAI Flash
3. Dao AI Lab Flash
4. XFormers Flash
5. PyTorch FlexAttention
        + +

        We evaluated the text generation quality of each of these kernels, first, in eager mode and then (if we were able to torch.compile the kernel with standard methods) compile mode. For kernels 2-5, we noted the following:

Kernel                 | Text Generation Quality                                               | Torch.compile                                                | Support for Arbitrary Sequence Length
AMD Flash              | Coherent                                                              | Yes                                                          | Yes
OpenAI Flash           | Incoherent                                                            | Did not evaluate. WIP to debug precision in eager mode first | No
Dao AI Lab Flash       | Incoherent                                                            | Did not evaluate. WIP to debug precision in eager mode first | Yes
Xformers FlashDecoding | Hit a compilation error before we were able to evaluate text quality | WIP                                                          | No (This kernel is optimized for decoding)
PyTorch FlexAttention  | Coherent                                                              | WIP                                                          | WIP
        + +

        Figure 5. Table of combinations we tried with different Flash Attention Kernels

        + +

The above table summarizes what we observed out of the box. With some effort we expect that kernels 2-5 can be modified to meet the above criteria. However, this also shows that having a kernel that works for benchmarking is often only the start of having it usable as an end-to-end production kernel. We chose to use the AMD flash attention kernel in our subsequent tests as it can be compiled via torch.compile and produces legible output in both eager and compiled mode.

        + +

        To satisfy torch.compile compatibility with the AMD flash attention kernel, we had to define it as a torch custom operator. This process is explained in detail here. The tutorial link discusses how to wrap a simple image crop operation. However, we note that wrapping a more complex flash attention kernel follows a similar process. The two step approach is as follows:

        + +
1. Wrap the function into a PyTorch Custom Operator
2. Add a FakeTensor Kernel to the operator, which given the shapes of the input tensors of flash (q, k and v) provides a way to compute the output shape of the flash kernel

        + +
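A hedged sketch of those two steps, using the torch.library custom-op API available in recent PyTorch releases; the op name and the stand-in SDPA body below are placeholders, not the actual Triton flash attention kernel from this post:

import torch
import torch.nn.functional as F

# Step 1: wrap the kernel launch in a PyTorch custom operator.
@torch.library.custom_op("myblog::flash_attention", mutates_args=())
def flash_attention(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor) -> torch.Tensor:
    # Placeholder body; the real implementation would launch the Triton flash kernel here.
    return F.scaled_dot_product_attention(q, k, v)

# Step 2: register a FakeTensor kernel so torch.compile can infer the output
# shape from the shapes of q, k and v without running the kernel.
@flash_attention.register_fake
def _(q, k, v):
    return torch.empty_like(q)

q = k = v = torch.randn(1, 8, 128, 64)
out = torch.compile(lambda q, k, v: flash_attention(q, k, v))(q, k, v)
print(out.shape)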

        After defining the Triton flash kernel as a custom op, we were able to successfully compile it for our E2E runs.

        + +

        + +

        Figure 6. Trace of Llama3-8B with torch.compile, after swapping in Triton matmul and Triton flash attention kernels

        + +

From the trace above (Figure 6), we note that now, after integrating both the SplitK matrix multiplication kernel and the torch-op-wrapped flash attention kernel, and then running torch.compile, we are able to achieve a forward pass that uses 100% Triton computation kernels.

        + +

        7.0 End-to-End Benchmarks

        + +

        We performed end-to-end measurements on NVIDIA H100s and A100s (single GPU) with Granite-8B and Llama3-8B models. We performed our benchmarks with two different configurations.

        + +

        The Triton kernel configuration uses:

        + +
1. Triton SplitK GEMM
2. AMD Triton Flash Attention
        + +

        The CUDA Kernel configuration uses:

        + +
1. cuBLAS GEMM
2. cuDNN Flash Attention - Scaled Dot-Product Attention (SDPA)
        + +

        We found the following throughput and inter-token latencies for both eager and torch compiled modes, with typical inference settings:

GPU  | Model      | Kernel Config | Median Latency (Eager) [ms/tok] | Median Latency (Compiled) [ms/tok]
H100 | Granite-8B | Triton        | 27.42                           | 11.59
H100 | Granite-8B | CUDA          | 18.84                           | 9.50
H100 | Llama3-8B  | Triton        | 20.36                           | 10.61
H100 | Llama3-8B  | CUDA          | 16.59                           | 8.59
A100 | Granite-8B | Triton        | 53.44                           | 16.88
A100 | Granite-8B | CUDA          | 37.13                           | 14.25
A100 | Llama3-8B  | Triton        | 44.44                           | 17.94
A100 | Llama3-8B  | CUDA          | 32.45                           | 12.96
        + +

Figure 7. Granite-8B and Llama3-8B Single Token Generation Latency on H100 and A100 (batch size = 2, input sequence length = 512, output sequence length = 256)

        + +

        To summarize, the Triton models can get up to 78% of the performance of the CUDA models on the H100 and up to 82% on the A100.

        + +

        The performance gap can be explained by the kernel latencies we observe for matmul and flash attention, which are discussed in the next section.

        + +

        8.0 Microbenchmarks

Kernel                      | Triton [us] | CUDA [us]
QKV Projection Matmul       | 25          | 21
Flash Attention             | 13          | 8
Output Projection Matmul    | 21          | 17
Gate + Up Projection Matmul | 84          | 83
Down Projection Matmul      | 58          | 42
        + +

Figure 8. Triton and CUDA Kernel Latency Comparison (Llama3-8B on NVIDIA H100). Input was an arbitrary prompt (bs=1, prompt = 44 seq length), decoding latency time

        + +

        From the above, we note the following:

        + +
1. Triton matmul kernels are 1.2-1.4x slower than CUDA
2. AMD’s Triton Flash Attention kernel is 1.6x slower than CUDA SDPA

        These results highlight the need to further improve the performance of kernels that are core primitives like GEMM and Flash Attention. We leave this as future research, as recent works (e.g. FlashAttention-3, FlexAttention) provide ways to leverage the underlying hardware better as well as Triton pathways that we hope to be able to build on to produce greater speedups. To illustrate this, we compared FlexAttention with SDPA and AMD’s Triton Flash kernel.

        + +

        We are working to verify E2E performance with FlexAttention. For now, initial microbenchmarks with Flex show promise for longer context lengths and decoding problem shapes, where the query vector is small:

        + +

        + +

Figure 9. FlexAttention Kernel Benchmarks on NVIDIA H100 SXM5 80GB (batch=1, num_heads=32, seq_len=seq_len, head_dim=128)

        + +

        9.0 Future Work

        + +

For future work we plan to explore ways to further optimize our matmuls that leverage the hardware better, such as this blog we published on utilizing TMA for H100, as well as different work decompositions (persistent kernel techniques like StreamK, etc.) to get greater speedups for our Triton-based approach. For flash attention, we plan to explore FlexAttention and FlashAttention-3 as the techniques used in these kernels can be leveraged to help further close the gap between Triton and CUDA. We also note that our prior work has shown promising results for FP8 Triton GEMM kernel performance versus cuBLAS FP8 GEMM, thus in a future post we will explore E2E FP8 LLM inference.

        + +
diff --git a/blog/cutlass-ping-pong-gemm-kernel/index.html b/blog/cutlass-ping-pong-gemm-kernel/index.html new file mode 100644 index 000000000000..6b964f5b8e03 --- /dev/null +++ b/blog/cutlass-ping-pong-gemm-kernel/index.html @@ -0,0 +1,818 @@
Deep Dive on CUTLASS Ping-Pong GEMM Kernel | PyTorch

        November 01, 2024

        +

Deep Dive on CUTLASS Ping-Pong GEMM Kernel

        +
        +
        + +
        +
        +
        + +
        +

by Less Wright, Adnan Hoque

        +

        Figure 1. FP8 GEMM Throughput Comparison CUTLASS vs Triton

        + +


        Summary

        + +

        In this post, we provide an overview, with relevant FP8 inference kernel benchmarking, of the CUTLASS Ping-Pong GEMM kernel.

        + +

        Ping-Pong is one of the fastest matmul (GEMM) kernel architectures available for the Hopper GPU architecture. Ping-Pong is a member of the Warp Group Specialized Persistent Kernels family, which includes both Cooperative and Ping-Pong variants. Relative to previous GPUs, Hopper’s substantial tensor core compute capability requires deep asynchronous software pipelining in order to achieve peak performance.

        + +

The Ping-Pong and Cooperative kernels exemplify this paradigm: the key design patterns are persistent kernels, to amortize launch and prologue overhead, and ‘async everything’, using specialized warp groups (two consumers and one producer) to create a highly overlapped processing pipeline that is able to continuously supply data to the tensor cores.

        + +

When the H100 (Hopper) GPU was launched, Nvidia billed it as the first truly asynchronous GPU. That statement highlights the need for H100-specific kernel architectures to also be asynchronous in order to fully maximize computational/GEMM throughput.

        + +

The Ping-Pong GEMM, introduced in CUTLASS 3.x, exemplifies this by moving all aspects of the kernel to a ‘fully asynchronous’ processing paradigm. In this blog, we’ll showcase the core features of the Ping-Pong kernel design as well as its performance on inference workloads vs cuBLAS and Triton split-k kernels.

        + +

        Ping-Pong Kernel Design

        + +

        Ping-Pong (or technically ‘sm90_gemm_tma_warpspecialized_pingpong’) operates with an asynchronous pipeline, leveraging warp specialization. Instead of the more classical homogeneous kernels, “warp groups” take on specialized roles. Note that a warp group consists of 4 warps of 32 threads each, or 128 total threads.

        + +

        On earlier architectures, latency was usually hidden by running multiple thread blocks per SM. However, with Hopper, the Tensor Core throughput is so high that it necessitates moving to deeper pipelines. These deeper pipelines then hinder running multiple thread blocks per SM. Thus, persistent thread blocks now issue collective main loops across multiple tiles and multiple warp groups. Thread block clusters are allocated based on the total SM count.

        + +

        For Ping-Pong, each warp group takes on a specialized role of either Data producer or Data consumer.

        + +

The producer warp group focuses on data movement, filling the shared memory buffers (via TMA). The two other warp groups are dedicated consumers that process the math (MMA) portion with the tensor cores, then do any follow-up work and write their results back to global memory (epilogue).

        + +

Producer warp groups work with TMA (Tensor Memory Accelerator), and are deliberately kept as lightweight as possible. In fact, in Ping-Pong, they deliberately reduce their register resources to improve occupancy: producers reduce their max register count to 40, while consumers increase their max register count to 232, an effect we can see in the CUTLASS source and corresponding SASS:

        + +

        source code

        + +

        Unique to Ping-Pong, each consumer works on separate C output tiles. (For reference, the cooperative kernel is largely equivalent to Ping-Pong, but both consumer groups work on the same C output tile). Further, the two consumer warp groups then split their work between the main loop MMA and epilogue.

        + +

        This is shown in the below image:

        + +

Figure 2: An overview of the Ping-Pong Kernel pipeline. Time moves left to right.

        + +

By having two consumers, one can be using the tensor cores for MMA while the other performs the epilogue, and then vice-versa. This maximizes the ‘continuous usage’ of the tensor cores on each SM, and is a key part of the reason for the kernel’s maximum throughput. The tensor cores can be continuously fed data to realize their (near) maximum compute capability. (See the bottom section of the Figure 2 illustration above.)

        + +

        Similar to how Producer threads stay focused only on data movements, MMA threads only issue MMA instructions in order to achieve peak issue rate. MMA threads must issue multiple MMA instructions and keep these in flight against TMA wait barriers.

        + +

        An excerpt of the kernel code is shown below to cement the specialization aspects:

        + +
// Two types of warp group 'roles'
enum class WarpGroupRole {
  Producer = 0,
  Consumer0 = 1,
  Consumer1 = 2
};

// warp group role assignment
auto warp_group_role = WarpGroupRole(canonical_warp_group_idx());
        + +

        Data Movement with Producers and Tensor Memory Accelerator

        + +

        The producer warps focus exclusively on data movement - specifically they are kept as lightweight as possible and in fact give up some of their register space to the consumer warps (keeping only 40 registers, while consumers will get 232). Their main task is issuing TMA (tensor memory accelerator) commands to move data from Global memory to shared memory as soon as a shared memory buffer is signaled as being empty.

        + +

To expand on TMA, or Tensor Memory Accelerator, TMA is a hardware component introduced with the H100 that asynchronously handles the transfer of memory from HBM (global memory) to shared memory. By having a dedicated hardware unit for memory movement, worker threads are freed to engage in other work rather than computing and managing data movement. TMA not only handles the movement of the data itself, but also calculates the required destination memory addresses, can apply any transforms (reductions, etc.) to the data, and can handle layout transformations to deliver data to shared memory in a ‘swizzled’ pattern so that it’s ready for use without any bank conflicts. Finally, it can also multicast the same data if needed to other SMs that are members of the same thread cluster. Once the data has been delivered, TMA will then signal the consumer of interest that the data is ready.

        + +

        CUTLASS Asynchronous Pipeline Class

        + +

        This signaling between producers and consumers is coordinated via the new Asynchronous Pipeline Class which CUTLASS describes as follows:

        + +

        “Implementing a persistent GEMM algorithm calls for managing dozens of different kinds of asynchronously executing operations that synchronize using multiple barriers organized as a circular list.

        + +

        This complexity is too much for human programmers to manage by hand.

        + +

        As a result, we have developed [CUTLASS Pipeline Async Class]…”

        + +

        Barriers and synchronization within the Ping-Pong async pipeline

        + +

        Producers must ‘acquire’ a given smem buffer via ‘producer_acquire’. At the start, a pipeline is empty meaning that producer threads can immediately acquire the barrier and begin moving data.

        + +
PipelineState mainloop_pipe_producer_state = cutlass::make_producer_start_state<MainloopPipeline>();
        + +

Once the data movement is complete, producers issue the ‘producer_commit’ method to signal the consumer threads that data is ready.
However, for Ping-Pong, this is actually a noop instruction, since the TMA-based producer’s barriers are automatically updated by the TMA hardware when writes are completed.

        + +

        consumer_wait - wait for data from producer threads (blocking).

        + +

        consumer_release - signal waiting producer threads that they are finished consuming data from a given smem buffer. In other words, allow producers to go to work refilling this with new data.

        + +

        From there, synchronization will begin in earnest where the producers will wait via the blocking producer acquire until they can acquire a lock, at which point their data movement work will repeat. This continues until the work is finished.

        + +

        To provide a pseudo-code overview:

        + +
// Producer
while (work_tile_info.is_valid_tile()) {

    collective_mainloop.dma()  // fetch data with TMA
    scheduler.advance_to_next_work()
    work_tile_info = scheduler.get_current_work()

}

// Consumer 1, Consumer 2
while (work_tile_info.is_valid_tile()) {

    collective_mainloop.mma()
    scheduler.advance_to_next_work()
    work_tile_info = scheduler.get_current_work()

}
        + +

        And a visual birds-eye view putting it all together with the underlying hardware:

        + +

Figure 3: An overview of the full async pipeline for Ping-Pong

        + +

        Step-by-Step Breakdown of Ping-Pong Computation Loop

        + +

        Finally, a more detailed logical breakout of the Ping-Pong processing loop:

        + +

        A - Producer (DMA) warp group acquires a lock on a shared memory buffer.

        + +

B - This allows it to kick off a TMA cp_async.bulk request to the TMA unit (via a single thread).

        + +

        C - TMA computes the actual shared memory addressing required, and moves the data to shared memory. As part of this, swizzling is performed in order to layout the data in smem for the fastest (no bank conflict) access.

        + +

C1 - Potentially, data can also be multicast to other SMs, and/or the kernel may need to wait for data from other TMA multicasts to complete the loading. (Thread block clusters now share shared memory across multiple SMs!)

        + +

        D - At this point, the barrier is updated to signal the arrival of the data to smem.

        + +

E - The relevant consumer warp group now gets to work by issuing multiple wgmma.mma_async commands, which read the data from smem into the tensor cores as part of the matmul operation.

        + +

F - The MMA accumulator values are written to register memory as the tiles are completed.

+ +

G - The consumer warp group releases the barrier on the shared memory.

+ +

H - The producer warp groups go to work issuing the next TMA instruction to refill the now-free smem buffer.

        + +

I - The consumer warp group simultaneously applies any epilogue actions to the accumulator, and then moves data from registers to a different smem buffer.

        + +

        J - The consumer warp issues a cp_async command to move data from smem to global memory.

        + +

        The cycle repeats until the work is completed. Hopefully this provides you with a working understanding of the core concepts that power Ping-Pong’s impressive performance.

        + +

        Microbenchmarks

        + +

        To showcase some of Ping-Pong’s performance, below are some comparison charts related to our work on designing fast inference kernels.

        + +

First, a general benchmark of the three fastest kernels so far (lower is better):

        + +

Figure 4, above: Benchmark timings of FP8 GEMMs, lower is better (faster)

        + +

        And translating that into a relative speedup chart of Ping-Pong vs cuBLAS and Triton:

        + +

Figure 5, above: Relative speedup of Ping-Pong vs the two closest kernels.

        + +

The full source code for the Ping-Pong kernel is here (619 lines of deeply templated CUTLASS code, or, to paraphrase the famous turtle meme, “it’s templates…all the way down!”):

        + + + +

In addition, we have implemented Ping-Pong as a CPP extension to make it easy to integrate with PyTorch here (along with a simple test script showing its usage):

        + + + +

        Finally, for continued learning, Nvidia has two GTC videos that dive into kernel design with CUTLASS:

        + + + +

        Future Work

        + +

Data movement is usually the biggest impediment to top performance for any kernel, and thus having an optimal strategy for, and understanding of, TMA (Tensor Memory Accelerator) on Hopper is vital. We previously published work on TMA usage in Triton. Once features like warp specialization are enabled in Triton, we plan to do another deep dive on how Triton kernels like FP8 GEMM and FlashAttention can leverage kernel designs like Ping-Pong for acceleration on Hopper GPUs.

diff --git a/blog/datathon-2025/index.html b/blog/datathon-2025/index.html
new file mode 100644
index 000000000000..2ef3d61e1d1d
--- /dev/null
+++ b/blog/datathon-2025/index.html
@@ -0,0 +1,670 @@

Solve Real-Word AI Challenges with PyTorch at Datathon 2025: DataOrbit | PyTorch

by Aakash Senthilnathan

        +

We’re excited to have PyTorch sponsor Datathon 2025: DataOrbit, a place where students can collaborate with a team to solve problems using real-world datasets! This event, hosted by Data Science UCSB in collaboration with Gaucho Sports Analytics and ACM@UCSB, will take place on February 22–23rd, 2025 at UC Santa Barbara, with the incredible opportunity to present your project to a panel of corporate and faculty judges – including the executive director of PyTorch! – for a chance to win prizes up to $3000.

        + +

        logo

        + +

        PyTorch’s versatility and power have made it an essential tool for tackling complex data problems in domains ranging from computer vision and natural language processing to time series analysis. At Datathon 2025: DataOrbit, participants will have the chance to leverage PyTorch’s dynamic framework, ease of use, and robust ecosystem to build innovative solutions. Whether you’re building machine learning models, experimenting with deep learning architectures, or applying PyTorch to solve real-world challenges, workshops and mentors will be available to help you dive deeper into its capabilities and accelerate your project’s success.

        + +

        Register Here: tinyurl.com/dataorbit25-reg (Open until February 21st or until capacity is reached)

        + +

        Additional information regarding the timeline of events can be found on the registration form.

        + +

        About the Datathon

        + +
• Open only to undergraduate students in the United States
• In-person events over 36 hours
• Team sizes of 2-5 people
• 10 different prize tracks
• Workshops and office hours teaching essential data science tools and techniques
• Professional development workshops + networking opportunities with our sponsors
• All meals provided
• A fun time!

        If you have a group you would like to work with, we require that every member register separately. If you do not have a group, we will have an opportunity at the beginning of the event to participate in an activity to form groups. Unfortunately, at this time we do not provide travel accommodations or lodging for participants.

        + +

        If you are interested in mentoring students virtually during the course of our datathon, or have any other questions contact us at datascience.ucsb@gmail.com.

diff --git a/blog/democratizing-ai-with-pytorch/index.html b/blog/democratizing-ai-with-pytorch/index.html
new file mode 100644
index 000000000000..4f29a6cb2bba
--- /dev/null
+++ b/blog/democratizing-ai-with-pytorch/index.html
@@ -0,0 +1,736 @@

Democratizing AI with PyTorch Foundation and ROCm™ support for PyTorch | PyTorch

by AMD

        +

        AMD Founding Member

        + +

        Last year, Meta announced that PyTorch joined the Linux Foundation as a neutral home for growing the machine learning project and community with AMD representation as a part of the founding membership and governing board.

        + +

PyTorch Foundation’s mission is to drive AI adoption by democratizing its software ecosystem through open source principles, aligning with AMD’s core principle of an open software ecosystem. AMD strives to foster innovation through support for the latest generations of hardware, tools, libraries, and other components to simplify and accelerate adoption of AI across a broad range of scientific discoveries.

        + +
        +
        +

AMD, along with key PyTorch codebase developers (including those at Meta AI), delivered a set of updates to the ROCm™ open software ecosystem that brings stable support for AMD Instinct™ accelerators as well as many Radeon™ GPUs. This now gives PyTorch developers the ability to build their next great AI solutions leveraging AMD GPU accelerators & ROCm. The support from the PyTorch community in identifying gaps, prioritizing key updates, providing feedback for performance optimization, and supporting our journey from “Beta” to “Stable” was immensely helpful, and we deeply appreciate the strong collaboration between the two teams at AMD and PyTorch. The move of ROCm support from “Beta” to “Stable” came in the PyTorch 1.12 release (June 2022) and brings the added support to easily run PyTorch in a native environment without having to configure custom dockers. This is a sign of confidence about the quality of support and performance of PyTorch using AMD Instinct and ROCm. The results of these collaborative efforts are evident in the performance measured on key industry benchmarks like Microsoft’s SuperBench shown below in Graph 1.

        +
        +
        +

“We are excited to see the significant impact of developers at AMD to contribute to and extend features within PyTorch to make AI models run in a more performant, efficient, and scalable way. A great example of this is the thought-leadership around unified memory approaches between the framework and future hardware systems, and we look forward to seeing that feature progress.”
– Soumith Chintala, PyTorch lead-maintainer and Director of Engineering, Meta AI

        +
        +
        + +

The progressive improvements in both the AMD CDNA™ architecture as well as ROCm and PyTorch show a single-GPU model throughput increase from the AMD Instinct MI100 to the latest generation AMD Instinct MI200 family GPUs, going from ROCm 4.2 to ROCm 5.3 and from PyTorch 1.7 to PyTorch 1.12.

        + +

Graph 1: ML model performance over generation using Microsoft Superbench Suite 1, 2, 3

        + +

        Below are a few of the key updates for ROCm support since the PyTorch 1.12 release

        + +

        Full Continuous Integration (CI) for ROCm on PyTorch

        + +

        With the ROCm support for PyTorch move from “Beta” to “Stable,” all the functions and features commits are now verified through a full Continuous Integration (CI) process. The CI process helps ensure the proper build and test process ahead of an expected Docker and PIP wheel release with stable commits forthcoming.

        + +

        Support for Kineto Profiler

        + +

        The addition of Kineto profiler support to ROCm now helps developers and users understand performance bottlenecks through effective diagnosis and profiling tools. The tool also provides recommendations to improve known issues and visualization through TensorBoard UI.

        + +

        Key PyTorch Libraries support added

        + +

        PyTorch ecosystem libraries like TorchText (Text classification), TorchRec (libraries for recommender systems - RecSys), TorchVision (Computer Vision), TorchAudio (audio and signal processing) are fully supported since ROCm 5.1 and upstreamed with PyTorch 1.12.

        + +

        Key libraries provided with the ROCm software stack including MIOpen (Convolution models), RCCL (ROCm Collective Communications) and rocBLAS (BLAS for transformers) were further optimized to offer new potential efficiencies and higher performance.

        + +

MIOpen innovates on several fronts, such as implementing fusion to optimize for memory bandwidth and GPU launch overheads, providing an auto-tuning infrastructure to overcome the large design space of problem configurations, and implementing different algorithms to optimize convolutions for different filter and input sizes. MIOpen is one of the first libraries to publicly support the bfloat16 data-type for convolutions, allowing efficient training at lower precision while maintaining expected accuracy.
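
        As a minimal illustration (not from the original post), the snippet below runs a convolution under bfloat16 autocast in PyTorch; on a ROCm build of PyTorch the “cuda” device maps to an AMD GPU, and the convolution dispatches to MIOpen. The tensor shapes are illustrative only.

        import torch

        # A small convolution layer and an input batch; on ROCm builds "cuda" targets the AMD GPU
        conv = torch.nn.Conv2d(3, 64, kernel_size=3, padding=1).to("cuda")
        x = torch.randn(8, 3, 224, 224, device="cuda")

        # Autocast runs the convolution in bfloat16 while the module weights stay in fp32
        with torch.autocast(device_type="cuda", dtype=torch.bfloat16):
            y = conv(x)

        print(y.dtype)  # torch.bfloat16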

        + +

        RCCL (pronounced “Rickle”) is a stand-alone library of standard collective communication routines for GPUs, implementing all-reduce, all-gather, reduce, broadcast, reduce-scatter, gather, scatter, and all-to-all. There is support for direct GPU-to-GPU send and receive operations. It has been optimized to achieve high bandwidth on platforms using PCIe®, Infinity Fabric™ (GPU to GPU) as well as networking using InfiniBand Verbs or TCP/IP sockets. RCCL supports an arbitrary number of GPUs installed in single or multiple nodes and can be used in either single- or multi-process (e.g., MPI) applications.

        + +

        Along with the above key highlights, over 50 features and functionality improvements were completed jointly between AMD and PyTorch to add stable support for ROCm. These include improvements to tools, compilers, runtime, graph optimizations through TorchScript, INT8 quant path usage, and ONNX runtime integration including support for Navi 21 based Radeon™ PRO datacenter graphics card to name a few.

        + +

        AITemplate Inference Engine

        + +

        MetaAI recently published a blog announcing the release of its open source AITemplate (link) for a unified inference system supporting AMD Instinct GPU accelerators using the AMD ROCm stack. This Python based framework can help significantly improve performance through increased utilization of AMD matrix cores for transformer blocks. This is achieved through the AMD Composable Kernel (CK) library which provides performance critical Kernels for ML AI workloads across multiple architectures including GPUs and CPUs through HIP & C++.

        + +

        Moreover, the AITemplate also provides out-of-the-box support for widely used AI models like BERT, ResNET, Vision Transformer, Stable Diffusion etc. simplifying deployment process through these pretrained models.

        + +

        What’s coming with future ROCm releases?

        + +

        Unified memory models for CPU + GPU

        + +

As system architecture evolves to address the complexity of large problem sizes and data sets, memory management becomes a key performance bottleneck that needs a cohesive strategy to be addressed through innovations at both the hardware and software levels. AMD is uniquely positioned to address this problem with its effective data center solutions integrating AMD EPYC™ CPU cores with its AMD Instinct GPU compute units in a truly unified datacenter APU (Accelerated Processing Unit) form factor set to be launched in 2H 2023.

        + +

        The software work to leverage the unified CPU + GPU memory has already started in collaboration with the PyTorch team, to enable the usage of a fast, low latency, synchronized memory model that enables not only AMD but also other AI accelerators to address the complex memory management problem of today. We are looking forward to this joint effort and announcement soon.

        + +

        Acknowledgement

        + +

        The content in this blog highlights the joint work between AMD and key PyTorch contributors including Meta, working on many of the core features, as well as Microsoft enabling ONNX Runtime support. We are looking forward to working with the other founding members at the PyTorch Foundation on the next steps and improvements to democratize and grow adoption of PyTorch across the industry.

        + +

        CAUTIONARY STATEMENT

        + +

        +This blog contains forward-looking statements concerning Advanced Micro Devices, Inc. (AMD) such as the availability, timing and expected benefits of an AMD datacenter APU form factor, which are made pursuant to the Safe Harbor provisions of the Private Securities Litigation Reform Act of 1995. Forward-looking statements are commonly identified by words such as “would,” “may,” “expects,” “believes,” “plans,” “intends,” “projects” and other terms with similar meaning. Investors are cautioned that the forward-looking statements in this blog are based on current beliefs, assumptions and expectations, speak only as of the date of this blog and involve risks and uncertainties that could cause actual results to differ materially from current expectations. Such statements are subject to certain known and unknown risks and uncertainties, many of which are difficult to predict and generally beyond AMD’s control, that could cause actual results and other future events to differ materially from those expressed in, or implied or projected by, the forward-looking information and statements. Investors are urged to review in detail the risks and uncertainties in AMD’s Securities and Exchange Commission filings, including but not limited to AMD’s most recent reports on Forms 10-K and 10-Q. AMD does not assume, and hereby disclaims, any obligation to update forward-looking statements made in this blog, except as may be required by law. +

        + +

        Endnotes

        + +
1. MI100D-01 SuperBench v0.5 model training results based on AMD internal testing as of 11/09/2022 measuring the total training throughput, at half precision, using a 2P AMD EPYC™ 7763 CPU server tested with 1x AMD Instinct™ MI100 (32GB HBM2e) 300W GPU, SBIOS 2.2, Ubuntu® 20.04.5 LTS, host ROCm™ 5.2.0, guest ROCm 4.2, PyTorch 1.7.0. Server manufacturers may vary configurations, yielding different results. Performance may vary based on factors including use of latest drivers and optimizations.
2. MI200D-01 SuperBench v0.6 model training results based on AMD internal testing as of 11/09/2022 measuring the total training throughput, at half precision, using a 2P AMD EPYC™ 7763 CPU server tested with 1x AMD Instinct™ MI210 (64GB HBM2e) 300W GPU, SBIOS 2.2, Ubuntu 20.04.5 LTS, host ROCm 5.3.0, guest ROCm 5.3, PyTorch 1.12. Server manufacturers may vary configurations, yielding different results. Performance may vary based on factors including use of latest drivers and optimizations.
3. MI200D-02: SuperBench v0.6 model training results based on AMD internal testing as of 11/09/2022 measuring the total training throughput, at half precision, using a 2P AMD EPYC™️ 7763 CPU server tested with 1x AMD Instinct™️ MI250 (128GB HBM2e) 560W GPU, SBIOS M12, Ubuntu 20.04 LTS, host ROCm 5.3.0, guest ROCm 5.3, PyTorch 1.12. Server manufacturers may vary configurations, yielding different results. Performance may vary based on factors including use of latest drivers and optimizations.
diff --git a/blog/deploying-llms-torchserve-vllm/index.html b/blog/deploying-llms-torchserve-vllm/index.html
new file mode 100644
index 000000000000..52a0a4fc9983
--- /dev/null
+++ b/blog/deploying-llms-torchserve-vllm/index.html
@@ -0,0 +1,821 @@

Deploying LLMs with TorchServe + vLLM | PyTorch

        October 31, 2024

        +

Deploying LLMs with TorchServe + vLLM

        +
        +
        + +
        +
        +
        + +
        +

by Matthias Reso, Ankith Gunapal, Simon Mo, Li Ning, Hamid Shojanazeri

        +

The vLLM engine is currently one of the top-performing ways to execute large language models (LLMs). It provides the vllm serve command as an easy option to deploy a model on a single machine. While this is convenient, serving these LLMs in production and at scale requires some advanced features.

        + +

        flow diagram

        + +

        TorchServe offers these essential production features (like custom metrics and model versioning) and through its flexible custom handler design, makes it very easy to integrate features such as retrieval-augmented generation (RAG) or safeguards like Llama Guard. It is therefore natural to pair the vLLM engine with TorchServe to create a full-fledged LLM serving solution for production.

        + +

        Before going into the specifics of the integration, we will demonstrate the deployment of a Llama-3.1-70B-Instruct model using TorchServe’s vLLM docker image.

        + +

        Quickly getting started with Llama 3.1 on TorchServe + vLLM

        + +

To get started, we need to build the new TS LLM Docker container image by checking out the TorchServe repository and executing the following command from the main folder:

        + +
        docker build --pull . -f docker/Dockerfile.vllm -t ts/vllm
        +
        + +

        The container uses our new LLM launcher script ts.llm_launcher which takes a Hugging Face model URI or local folder and spins up a local TorchServe instance with the vLLM engine running in the backend. To serve a model locally, you can create an instance of the container with the following command:

        + +
#export token=<HUGGINGFACE_HUB_TOKEN>
docker run --rm -ti --shm-size 10g --gpus all -e HUGGING_FACE_HUB_TOKEN=$token \
  -p 8080:8080 -v data:/data ts/vllm \
  --model_id meta-llama/Meta-Llama-3.1-70B-Instruct --disable_token_auth
        + +

        You can test the endpoint locally with this curl command:

        + +
        curl -X POST -d '{"model":"meta-llama/Meta-Llama-3.1-70B-Instruct", "prompt":"Hello, my name is", "max_tokens": 200}' --header "Content-Type: application/json" "http://localhost:8080/predictions/model/1.0/v1/completions"
        +
        + +

The container stores the model weights in the local folder “data”, which gets mounted as /data inside the container. To serve your custom local weights, simply copy them into data and point the model_id to /data/<your weights>.

        + +

Internally, the container uses our new ts.llm_launcher script to launch TorchServe and deploy the model. The launcher simplifies the deployment of an LLM with TorchServe into a single command line and can also be used outside the container as an efficient tool for experimentation and testing. To use the launcher outside the docker, follow the TorchServe installation steps and then execute the following command to spin up an 8B Llama model:

        + +
        # after installing TorchServe and vLLM run
        +python -m ts.llm_launcher --model_id meta-llama/Meta-Llama-3.1-8B-Instruct  --disable_token_auth
        +
        + +

If multiple GPUs are available, the launcher will automatically claim all visible devices and apply tensor parallelism (set CUDA_VISIBLE_DEVICES to specify which GPUs to use).

        + +

While this is very convenient, it’s important to note that it does not encompass all the functionalities provided by TorchServe. For those looking to leverage more advanced features, a model archive needs to be created. While this process is a bit more involved than issuing a single command, it bears the advantage of custom handlers and versioning. While the former allows you to implement RAG inside the preprocessing step, the latter lets you test different versions of a handler and model before deploying on a larger scale.

        + +

        Before we provide the detailed steps to create and deploy a model archive, let’s dive into the details of the vLLM engine integration.

        + +

        TorchServe’s vLLM Engine Integration

        + +

        As a state-of-the-art serving framework, vLLM offers a plethora of advanced features, including PagedAttention, continuous batching, rapid model execution through CUDA graphs, and support for various quantization methods such as GPTQ, AWQ, INT4, INT8, and FP8. It also provides integration for important parameter-efficient adapter methods like LoRA and access to a wide range of model architectures including Llama and Mistral. vLLM is maintained by the vLLM team and a thriving open-source community.

        + +

        To facilitate quick deployment, it offers a serving mode based on FastAPI to serve LLMs over HTTP. For a tighter, more flexible integration the project also provides the vllm.LLMEngine which offers interfaces to process requests on a continuous basis. We leveraged the asynchronous variant for the integration into TorchServe.

        + +

        TorchServe is an easy-to-use, open-source solution for serving PyTorch models in production. As a production-tested serving solution, TorchServe offers numerous benefits and features beneficial for deploying PyTorch models at scale. By combining it with the inference performance of the vLLM engine these benefits can now also be used to deploy LLMs at scale.

        + +

TorchServe highlights and integrations

        + +

        To maximize hardware utilization it is generally a good practice to batch requests from multiple users together. Historically, TorchServe only offered a synchronized mode to collect requests from various users. In this mode, TorchServe waits for a predefined amount of time (e.g., batch_delay=200ms) or until enough requests (e.g., batch_size=8) have arrived. When one of these events is triggered, the batched data gets forwarded to the backend where the model is applied to the batch, and the model output is returned to the users through the frontend. This works especially well for traditional vision models where outputs for each request usually finish at the same time.

        + +

        For generative use cases, particularly text generation, the assumption that requests are ready simultaneously is no longer valid, as responses will have varying lengths. Although TorchServe supports continuous batching (the ability to add and remove requests dynamically), this mode only accommodates a static maximum batch size. With the introduction of PagedAttention, even this assumption of a maximum batch size becomes more flexible, as vLLM can combine requests of different lengths in a highly adaptable manner to optimize memory utilization.

        + +

        To achieve optimal memory utilization, i.e., to fill unused gaps in memory (think Tetris), vLLM requires complete control over the decision of which requests to process at any given time. To provide this flexibility, we had to reevaluate how TorchServe handles user requests. Instead of the previous synchronous processing mode, we introduced an asynchronous mode (see diagram below) where incoming requests are directly forwarded to the backend, making them available for vLLM. The backend feeds the vllm.AsyncEngine, which can now select from all available requests. If streaming mode is enabled and the first token of a request is available, the backend will send out the result immediately and continue sending tokens until the final token is generated.

        + +

        flow diagram

        + +

        Our implementation of the VLLMHandler enables users to quickly deploy any model compatible with vLLM using a configuration file, while still offering the same level of flexibility and customizability through a custom handler. Users are free to add e.g. custom preprocessing or post-processing steps by inheriting from VLLMHandler and overriding the respective class methods.

        + +

        We also support single-node, multi-GPU distributed inference, where we configure vLLM to use tensor parallel sharding of the model to either increase capacity for smaller models or enable larger models that do not fit on a single GPU, such as the 70B Llama variants. Previously, TorchServe only supported distributed inference using torchrun, where multiple backend worker processes were spun up to shard the model. vLLM manages the creation of these processes internally, so we introduced the new “custom” parallelType to TorchServe which launches a single backend worker process and provides the list of assigned GPUs. The backend process can then launch its own subprocesses if necessary.

        + +

        To facilitate integration of TorchServe + vLLM into docker-based deployments, we provide a separate Dockerfile based on TorchServe’s GPU docker image, with vLLM added as a dependency. We chose to keep the two separate to avoid increasing the docker image size for non-LLM deployments.

        + +

        Next, we will demonstrate the steps required to deploy a Llama 3.1 70B model using TorchServe + vLLM on a machine with four GPUs.

        + +

        Step-by-Step Guide

        + +

        For this step-by-step guide we assume the installation of TorchServe has finished successfully. Currently, vLLM is not a hard-dependency for TorchServe so let’s install the package using pip:

        + +
        $ pip install -U vllm==0.6.1.post2
        +
        + +

        In the following steps, we will (optionally) download the model weights, explain the configuration, create a model archive, deploy and test it:

        + +

        1. (Optional) Download Model Weights

        + +

        This step is optional, as vLLM can also handle downloading the weights when the model server is started. However, pre-downloading the model weights and sharing the cached files between TorchServe instances can be beneficial in terms of storage usage and startup time of the model worker. If you choose to download the weights, use the huggingface-cli and execute:

        + +
        # make sure you have logged into huggingface with huggingface-cli login before
        +# and have your access request for the Llama 3.1 model weights approved
        +
        +huggingface-cli download meta-llama/Meta-Llama-3.1-70B-Instruct --exclude original/*
        +
        + +

        This will download the files under $HF_HOME, and you can alter the variable if you want to place the files elsewhere. Please ensure that you update the variable wherever you run TorchServe and make sure it has access to that folder.

        + +

        2. Configure the Model

        + +

        Next, we create a YAML configuration file that contains all the necessary parameters for our model deployment. The first part of the config file specifies how the frontend should launch the backend worker, which will ultimately run the model in a handler. The second part includes parameters for the backend handler, such as the model to load, followed by various parameters for vLLM itself. For more information on possible configurations for the vLLM engine, please refer to this link.

        + +
        echo '
        +# TorchServe frontend parameters
        +minWorkers: 1            
        +maxWorkers: 1            # Set the number of worker to create a single model instance
        +startupTimeout: 1200     # (in seconds) Give the worker time to load the model weights
        +deviceType: "gpu" 
        +asyncCommunication: true # This ensures we can cummunicate asynchronously with the worker
        +parallelType: "custom"   # This lets TS create a single backend prosses assigning 4 GPUs
        +parallelLevel: 4
        +
        +# Handler parameters
        +handler:
        +    # model_path can be a model identifier for Hugging Face hub or a local path
        +    model_path: "meta-llama/Meta-Llama-3.1-70B-Instruct"
        +    vllm_engine_config:  # vLLM configuration which gets fed into AsyncVLLMEngine
        +        max_num_seqs: 16
        +        max_model_len: 512
        +        tensor_parallel_size: 4
        +        served_model_name:
        +            - "meta-llama/Meta-Llama-3.1-70B-Instruct"
        +            - "llama3"
        +'> model_config.yaml
        +
        + +

        3. Create the Model Folder

        + +

        After creating the model configuration file (model_config.yaml), we will now create a model archive that includes the configuration and additional metadata, such as versioning information. Since the model weights are large, we will not include them inside the archive. Instead, the handler will access the weights by following the model_path specified in the model configuration. Note that in this example, we have chosen to use the “no-archive” format, which creates a model folder containing all necessary files. This allows us to easily modify the config files for experimentation without any friction. Later, we can also select the mar or tgz format to create a more easily transportable artifact.

        + +
        mkdir model_store
        +torch-model-archiver --model-name vllm --version 1.0 --handler vllm_handler --config-file model_config.yaml --archive-format no-archive --export-path model_store/
        +
        + +

        4. Deploy the Model

        + +

        The next step is to start a TorchServe instance and load the model. Please note that we have disabled token authentication for local testing purposes. It is highly recommended to implement some form of authentication when publicly deploying any model.

        + +

        To start the TorchServe instance and load the model, run the following command:

        + +
        torchserve --start --ncs  --model-store model_store --models vllm --disable-token-auth
        +
        + +

        You can monitor the progress of the model loading through the log statements. Once the model has finished loading, you can proceed to test the deployment.

        + +

        5. Test the Deployment

        + +

        The vLLM integration uses an OpenAI API compatible format so we can either use a specialized tool for this purpose or curl. The JSON data we are using here includes the model identifier as well as the prompt text. Other options and their default values can be found in the vLLMEngine docs.

        + +
        echo '{
        +  "model": "llama3",
        +  "prompt": "A robot may not injure a human being",
        +  "stream": 0
        +}' | curl --header "Content-Type: application/json"   --request POST --data-binary @-   http://localhost:8080/predictions/vllm/1.0/v1/completions
        +
        + +

        The output of the request looks like this:

        + +
        {
        +  "id": "cmpl-cd29f1d8aa0b48aebcbff4b559a0c783",
        +  "object": "text_completion",
        +  "created": 1727211972,
        +  "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
        +  "choices": [
        +    {
        +      "index": 0,
        +      "text": " or, through inaction, allow a human being to come to harm.\nA",
        +      "logprobs": null,
        +      "finish_reason": "length",
        +      "stop_reason": null,
        +      "prompt_logprobs": null
        +    }
        +  ],
        +  "usage": {
        +    "prompt_tokens": 10,
        +    "total_tokens": 26,
        +    "completion_tokens": 16
        +  }
        +
        + +

        When streaming is False TorchServe will collect the full answer and send it in one go after the last token was created. If we flip the stream parameter we will receive piecewise data containing a single token in each message.
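
        As a minimal sketch (not part of the original post), the snippet below shows one way to consume the streaming variant of the same OpenAI-style completions endpoint from Python using the requests library; the endpoint path and model name match the example above, and the exact message framing may differ between versions.

        import requests

        payload = {
            "model": "llama3",
            "prompt": "A robot may not injure a human being",
            "stream": True,
        }

        # Stream the response so messages can be printed as they arrive
        with requests.post(
            "http://localhost:8080/predictions/vllm/1.0/v1/completions",
            json=payload,
            stream=True,
        ) as response:
            for line in response.iter_lines():
                if line:  # each message contains a single newly generated token
                    print(line.decode("utf-8"))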

        + +

        Conclusion

        + +

        In this blog post, we explored the new, native integration of the vLLM inference engine into TorchServe. We demonstrated how to locally deploy a Llama 3.1 70B model using the ts.llm_launcher script and how to create a model archive for deployment on any TorchServe instance. Additionally, we discussed how to build and run the solution in a Docker container for deployment on Kubernetes or EKS. In future works, we plan to enable multi-node inference with vLLM and TorchServe, as well as offer a pre-built Docker image to simplify the deployment process.

        + +

        We would like to express our gratitude to Mark Saroufim and the vLLM team for their invaluable support in the lead-up to this blog post.

diff --git a/blog/deprecation-cuda-python-support/index.html b/blog/deprecation-cuda-python-support/index.html
new file mode 100644
index 000000000000..e8a19e61e3a6
--- /dev/null
+++ b/blog/deprecation-cuda-python-support/index.html
@@ -0,0 +1,699 @@

Deprecation of CUDA 11.6 and Python 3.7 Support | PyTorch

by Team PyTorch

        +

        For the upcoming PyTorch 2.0 feature release (target March 2023), we will target CUDA 11.7 as the stable version and CUDA 11.8 as the experimental version of CUDA and Python >=3.8, <=3.11.

        + +

        If you are still using or depending on CUDA 11.6 or Python 3.7 builds, we strongly recommend moving to at least CUDA 11.7 and Python 3.8, as it would be the minimum versions required for PyTorch 2.0.
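
        As a quick, illustrative check (not from the original announcement), the snippet below prints the Python, PyTorch, and CUDA versions of the local environment and asserts the Python 3.8 minimum mentioned above.

        import sys
        import torch

        # PyTorch 2.0 requires Python >= 3.8 (see the compatibility matrix below)
        assert sys.version_info >= (3, 8), "PyTorch 2.0 requires Python >= 3.8"

        print("Python :", sys.version.split()[0])
        print("PyTorch:", torch.__version__)
        print("CUDA   :", torch.version.cuda)  # None on CPU-only builds, e.g. '11.7' otherwise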

        + +

Please note that, as of Feb 1, CUDA 11.6 and Python 3.7 are no longer included in the nightlies.

        + +

        Please refer to the Release Compatibility Matrix for PyTorch releases:

PyTorch Version | Python        | Stable CUDA               | Experimental CUDA
2.0             | >=3.8, <=3.11 | CUDA 11.7, CUDNN 8.5.0.96 | CUDA 11.8, CUDNN 8.7.0.84
1.13            | >=3.7, <=3.10 | CUDA 11.6, CUDNN 8.3.2.44 | CUDA 11.7, CUDNN 8.5.0.96
1.12            | >=3.7, <=3.10 | CUDA 11.3, CUDNN 8.3.2.44 | CUDA 11.6, CUDNN 8.3.2.44
        + +

        As of 2/1/2023

        + +

        For more information on PyTorch releases, updated compatibility matrix and release policies, please see (and bookmark) Readme.

diff --git a/blog/develop-android-applications/index.html b/blog/develop-android-applications/index.html
new file mode 100644
index 000000000000..70f83a165fd0
--- /dev/null
+++ b/blog/develop-android-applications/index.html
@@ -0,0 +1,688 @@

Learn how to develop Android applications with ExecuTorch and Llama models | PyTorch

by Arm

        +

        This blog is courtesy of the PyTorch team at Arm. More details can be found here.

        + +

        Arm’s compute platform is delivering GenAI applications on phones, laptops, and servers. Cost, privacy, performance, security, and energy efficiency are just some of the reasons developers are investigating on-device AI.

        + +

        A new Learning Path explaining how to leverage the capabilities of large language models (LLMs) on Android using ExecuTorch and XNNPACK is now available.

        + +

        Here’s a summary of what you’ll learn:

        + +
• Development Environment setup

  The Learning Path begins by guiding you through setting up your development environment, ensuring you have all the necessary tools installed, including Android Studio, the Android NDK, Java JDK, and Python.

• ExecuTorch and XNNPACK

  You’ll learn about the core technologies: ExecuTorch, a framework for deploying PyTorch models to edge devices, and XNNPACK, a high-performance library for executing neural networks on Arm-based platforms.

• Llama models

  The Learning Path explores Llama, a family of powerful LLMs, focusing specifically on the 8B Llama 3 model. You’ll learn about quantization techniques, which are essential for optimizing model size and performance on mobile devices.

• Prepare Llama models for ExecuTorch

  You’ll be guided through the process of downloading, exporting, and evaluating Llama models, ensuring they are ready for deployment using ExecuTorch.

• Check model performance on Android

  The Learning Path walks you through cross-compiling the Llama runner binary for Android, allowing you to test your model’s performance on your phone.

• Build and run an Android Chat App

  Finally, you’ll learn how to build a native Android chat app using the LlamaDemo application from the ExecuTorch repository. This hands-on experience allows you to put your knowledge into practice and create a real-world application.
        + +

        Explore this Learning Path if you want to learn how to leverage the power of LLMs on your Android phone, and gain expertise in tools for on-device machine learning.

        + +

        Dig into the excitement of building Android chat apps and understand more about how they work on the Arm Developer Hub.

diff --git a/blog/dinosaurs-to-seismic-imaging/index.html b/blog/dinosaurs-to-seismic-imaging/index.html
new file mode 100644
index 000000000000..b5607df06921
--- /dev/null
+++ b/blog/dinosaurs-to-seismic-imaging/index.html
@@ -0,0 +1,756 @@

From PyTorch Conference 2023: From Dinosaurs to Seismic Imaging with Intel | PyTorch

by Ramya Ravi, Susan Kahler at Intel

        +

        Dinosaur fossil

        + +

        Lightning Talk 1: Seismic Data to Subsurface Models with OpenFWI

        + +

        Speaker: Benjamin Consolvo, AI Software Engineering Manager, Intel, LinkedIn

        + +

        Session Overview

        + +

In this session, Ben begins with an overview of seismic imaging and full waveform inversion (FWI). Seismic imaging and FWI help us explore land for important subsurface minerals necessary for human thriving. To find those crucial subsurface minerals, we need to image the subsurface with a high degree of accuracy at a low cost, which involves two main challenges. He explains the solutions for those challenges using AI, which are summarized below.

Challenges | Solutions using AI
Traditional physics based FWI requires an accurate starting model. | Data-driven deep learning solutions do not require an accurate starting model.
GPUs are typically used for fine-tuning neural networks but are often unavailable and expensive. | CPUs are highly available, inexpensive, and viable for AI fine-tuning. The new 4th Gen Intel® Xeon® Scalable processor has the built-in AI accelerator engine called Intel® AMX (Intel® Advanced Matrix Extensions) that helps to accelerate AI training and inference performance.
        + +

        Next, he shows the wave propagation for the subsurface model and corresponding seismic shot gathers. In his example, the shot gathers are synthetically generated time-sampled records of sounds recordings from a shot (like a dynamite explosion or vibroseis truck) recorded by geophones spread across a large area. For this application, the training data consists of a pair of subsurface model image and seismic shot gather images, where the model from the shot gather is predicted.

|  | Number of Seismic Shot Images | Number of Subsurface Model Images |
| Train | 120,000 | 24,000 |
| Test | 25,000 | 5,000 |
| Validation | 5,000 | 1,000 |

In this application, the algorithm used during training was InversionNET (an encoder-decoder convolutional neural network). Check out the implementation details for the InversionNET architecture in Deng et al. (2021).
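
To make the encoder-decoder idea concrete, here is a minimal, illustrative PyTorch sketch of that general model shape. It is not the actual InversionNET implementation (see Deng et al. 2021 for that), and the layer sizes and the five-channel input are assumptions made for the example.

import torch
import torch.nn as nn

class TinyEncoderDecoder(nn.Module):
    """Toy encoder-decoder: shot-gather-like input -> single-channel subsurface map."""
    def __init__(self, in_channels: int = 5):  # assumed number of shot gathers per sample
        super().__init__()
        self.encoder = nn.Sequential(
            nn.Conv2d(in_channels, 32, kernel_size=3, stride=2, padding=1), nn.ReLU(),
            nn.Conv2d(32, 64, kernel_size=3, stride=2, padding=1), nn.ReLU(),
        )
        self.decoder = nn.Sequential(
            nn.ConvTranspose2d(64, 32, kernel_size=4, stride=2, padding=1), nn.ReLU(),
            nn.ConvTranspose2d(32, 1, kernel_size=4, stride=2, padding=1),
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.decoder(self.encoder(x))

model = TinyEncoderDecoder()
out = model(torch.randn(2, 5, 64, 64))  # batch of 2, 5 "shot gather" channels, 64x64 grid
print(out.shape)  # torch.Size([2, 1, 64, 64])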

He then shows the results:

1. Prediction versus the ground truth model after one epoch and at 50 epochs. After training InversionNET, the predicted model is much closer to the ground truth image.
2. Training loss and validation loss curves decreasing over time across 50 epochs.

Finally, Ben concludes his talk by highlighting that he was able to successfully fine-tune a deep neural network, without an accurate starting model, to obtain a subsurface model on a 4th Gen Intel® Xeon® Scalable processor.

Watch the full video recording here and download the presentation. More details can be found in this blog.

About the Speaker

Ben Consolvo

Ben Consolvo is an AI Solutions Engineering Manager at Intel. He has been building a team and a program around Intel’s AI technology paired with Intel’s hardware offerings. He brings a background and passion in data science, particularly in deep learning (DL) and computer vision. He has applied his DL skills in the cybersecurity industry to automatically identify phishing websites, as well as in the oil and gas industry to identify subsurface features for geophysical imaging.

Lightning Talk 2: Dinosaur Bone Hunt

Speaker: Bob Chesebrough, Sr Solution Architect, Intel, LinkedIn

Session Overview

In this session, Bob starts the presentation by explaining his interest in collecting dinosaur bones and gives an overview of the Intel AI Software portfolio.

He then explains the steps to create a dinosaur site treasure map, or dinosaur bone likelihood map:

1. Collect data and create training data (aerial photos of the Morrison Formation in New Mexico - a famous dinosaur bone bed in the Western United States - and the GPS coordinates of small bone fragments discovered there)
2. Train a simple ResNet-18 model using Intel® Extension for PyTorch (a minimal sketch follows this list)
3. Score the model on Utah photos and create a heat map
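
As a hedged illustration of step 2 (not Bob's actual code sample - the two-class setup, hyperparameters, and dummy data below are assumptions for the sketch):

import torch
import torch.nn as nn
import intel_extension_for_pytorch as ipex
from torch.utils.data import DataLoader, TensorDataset
from torchvision import models

# Dummy stand-in data; the real sample uses aerial-photo tiles and bone/no-bone labels.
train_loader = DataLoader(
    TensorDataset(torch.randn(8, 3, 224, 224), torch.randint(0, 2, (8,))),
    batch_size=4,
)

model = models.resnet18(weights="IMAGENET1K_V1")
model.fc = nn.Linear(model.fc.in_features, 2)  # 2 classes: bone-likely vs. not
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.9)

# Apply Intel Extension for PyTorch CPU optimizations to the model and optimizer.
model.train()
model, optimizer = ipex.optimize(model, optimizer=optimizer)

for images, labels in train_loader:
    optimizer.zero_grad()
    loss = criterion(model(images), labels)
    loss.backward()
    optimizer.step()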

Finally, Bob shows the results: dinosaur bones were discovered in Utah using the dinosaur bone likelihood map. Go to the GitHub repository to access the code sample and try it out using Intel Extension for PyTorch.

Watch the full video recording here and download the presentation. More details can be found in this blog.

About the Speaker

Bob Chesebrough

Bob Chesebrough has over three decades of industry experience in software development and AI solution engineering for Fortune 100 companies and national laboratories. He is also a hobbyist who has logged over 800 miles and 1,000 hours in the field finding dinosaur bones. He and his sons discovered an important fossil of the only known crocodilian from the Jurassic in New Mexico. They have also discovered and logged with the museum more than 2,000 bone localities and described a new mass bone bed in New Mexico.

diff --git a/blog/docathon-2025/index.html b/blog/docathon-2025/index.html new file mode 100644 index 000000000000..9582eb0e3b93 --- /dev/null +++ b/blog/docathon-2025/index.html @@ -0,0 +1,684 @@

Announcing the PyTorch Docathon 2025 | PyTorch

        May 01, 2025

Announcing the PyTorch Docathon 2025

by Team PyTorch

PyTorch Docathon 2025


        We’re thrilled to announce the 2025 PyTorch Docathon! This is a hackathon-style event aimed at enhancing PyTorch documentation with the support of the community. Documentation is a vital component of any technology, and by refining it, we can simplify the onboarding process for new users, help them effectively utilize PyTorch’s features, and ultimately speed up the transition from research to production in machine learning.


        WHY PARTICIPATE


        Low Barrier to Entry


        Unlike many open-source projects that require deep knowledge of the codebase and previous contributions to join hackathon events, the Docathon is tailored for newcomers. While we expect participants to be familiar with Python, and have basic knowledge of PyTorch and machine learning, there are tasks related to website issues that don’t even require that level of expertise.


        Tangible Results


        A major advantage of the Docathon is witnessing the immediate impact of your contributions. Enhancing documentation significantly boosts a project’s usability and accessibility, and you’ll be able to observe these improvements directly. Seeing tangible outcomes can also be a strong motivator to continue contributing.


        Collaborative Environment


        The Docathon fosters a collaborative atmosphere, offering you the chance to work alongside other contributors and PyTorch maintainers to improve the documentation. This is a fantastic opportunity to learn from peers, exchange ideas, and build connections.


        Learning Opportunities


        Even if you’re not a PyTorch expert, the Docathon offers a valuable learning experience. You’ll have the chance to delve into PyTorch modules, test tutorials on your machine, and explore them in the CI environment.


        WHO SHOULD PARTICIPATE


Whether you’re a seasoned documentation expert or just starting out, we invite everyone to join the PyTorch Docathon to contribute and to develop your skills and knowledge, helping improve the documentation for everyone! We will have issues labelled by skill level, and the PyTorch Discord will be available for collaboration and help.


        EVENT DETAILS

• June 3: Kick-off at 10 AM PT
• June 4 - June 15: Submissions and Feedback
• June 16 - June 17: Final Reviews
• June 18: Winner Announcements

        Make sure to RSVP to the event so you receive all the notifications and instructions on how to participate.


        Further details about the Docathon will be shared during the Kick-off call on June 3.


        Don’t forget to register for this year’s event: RSVP now

diff --git a/blog/docathon-h1-2023-wrap-up/index.html b/blog/docathon-h1-2023-wrap-up/index.html new file mode 100644 index 000000000000..59a8350f65e2 --- /dev/null +++ b/blog/docathon-h1-2023-wrap-up/index.html @@ -0,0 +1,662 @@

🎉 PyTorch Docathon H1 2023 Wrap-up 🎉 | PyTorch

by Team PyTorch


        Thank you to all who participated in our first ever PyTorch Docathon, the results have been nothing short of amazing! We want to extend our sincerest gratitude to all the participants who made this event a resounding success. Your passion, talent, and hard work have left an indelible mark on the PyTorch documentation.


The virtual Docathon ran from May 31 through June 15, with more than 230 registrants and more than 110 participants joining the Docathon Slack channel; the energy and enthusiasm were palpable. Entrants were judged on the difficulty of their submissions, which resulted in over 40 merged pull requests, the publication of four new tutorials, and the addition of one new example.


        We want to give a special shout-out to our top contributors, who went above and beyond during this event. Your dedication and expertise have been invaluable in enhancing the PyTorch documentation and empowering developers worldwide. See the full list of contributors here.


        Meet the top contributors:


        As we bring this Docathon to a close, we encourage each and every one of you to stay inspired and keep contributing to PyTorch documentation and code, and pushing the boundaries of what’s possible with PyTorch. Your collective efforts are shaping the landscape of deep learning and fostering innovation in the AI community.


        Team PyTorch

diff --git a/blog/docathon-june-2024/index.html b/blog/docathon-june-2024/index.html new file mode 100644 index 000000000000..eeee4d7edaa0 --- /dev/null +++ b/blog/docathon-june-2024/index.html @@ -0,0 +1,661 @@

Announcing PyTorch Docathon June, 2024 | PyTorch

by Team PyTorch


        We are thrilled to announce the upcoming PyTorch Docathon in June! The Docathon, akin to a hackathon, is an event dedicated to enhancing the quality of the PyTorch documentation with the invaluable assistance of our community. Documentation is a vital component of any technology. By refining it, we can simplify the process for new users to get started with PyTorch, guide them in effectively utilizing its features, and ultimately expedite the transition from research to production in machine learning. See our previous events here and here.


        Why Participate


        The Docathon is an inclusive event designed to be accessible to newcomers, requiring only a basic understanding of Python, PyTorch, and Machine Learning, with some tasks not even requiring these skills. It offers a rewarding experience as participants can see the direct impact of their contributions on the project’s usability and accessibility. The Docathon promotes a collaborative environment, allowing participants to work with other contributors and PyTorch maintainers, fostering the exchange of ideas and networking. It also provides a rich learning experience, offering the opportunity to explore PyTorch modules, update docstrings, and test tutorials.


        Event Details


June 4: Kick-off
June 4 - 16: Submissions and Feedback
June 17 - 18: Final Reviews
June 20: Winner Announcements


        Further details for the Docathon will be announced at the Kick-off call on June 4.


        Please register to join this year’s event.

diff --git a/blog/docathon-kickoff-h1-2024/index.html b/blog/docathon-kickoff-h1-2024/index.html new file mode 100644 index 000000000000..4553981ee48d --- /dev/null +++ b/blog/docathon-kickoff-h1-2024/index.html @@ -0,0 +1,703 @@

Ready, Set, Contribute: PyTorch Docathon Kickoff H1 2024 | PyTorch

by Team PyTorch


        The PyTorch Docathon is now live! This event is dedicated to enhancing the quality of the PyTorch documentation with the invaluable assistance of our community. Our hope with this Docathon is to simplify the process for new users to get started with PyTorch, guide them in effectively utilizing its features, and ultimately expedite the transition from research to production in machine learning.


JOIN THE KICK-OFF EVENT
on June 4th at 10 AM PT


        Event Details

• June 4: Kick-off - join a 30-minute livestream kick-off event on Discord on June 4th at 10 AM PT here. If you can’t join the kick-off event, watch our welcome video on YouTube
• June 4 - June 16: Submissions and Feedback
• June 17 - 18: Final Reviews
• June 20: Winner Announcements

        How to Contribute


Review the Docathon H1 2024 issue in the pytorch/pytorch or pytorch/tutorials repo, which contains all the necessary information on participating in the Docathon and highlights the specific issues to work on. Remember to sign the CLA in your first PR and adhere to the Code of Conduct guidelines.


        Read the Code of Conduct


        Take a moment to review the PyTorch code of conduct found here. This document outlines the expectations for behavior and communication within our team, and it is important that everyone is aware of and adheres to these guidelines.


        Join our Discord


This channel serves as the main communication hub during the Docathon. You can join it using this link:


        JOIN DISCORD SERVER


        When you first join the server, you will have limited access. To gain full access to our Discord PyTorch Docathon Channel:

1. Enter the server and navigate to the #self-roles channel.
2. In the #self-roles channel, click on the ‘Join Docathon’ button in the relevant post to assign yourself the docathon role.
3. After assigning the role, you will see the ‘PyTorch Docathon H1 2024 Section’ in the left-hand menu for discussions.
4. To help prevent spam, we are asking that you change your server username to your GitHub username or the email username you registered with.

        Explore the GitHub Issues


        All the Docathon issues are posted on GitHub. You can find them by the docathon-h1-2024 label in the following participating repositories:


        The issues are categorized into three levels of difficulty: easy, medium, and advanced. If this is your first time contributing to PyTorch, we recommend starting with an issue at the easy level.


        Prizes for Winners


        We will have a leaderboard throughout the duration of the Docathon. The more you contribute, the higher you’ll get on the board! Our top three winners will get free admission to PyTorch Conference 2024.


        Thank you to our Partners


        This year, we’re thrilled to work with the PyTorch Teams at Meta, Google and Snowflake to help us put on a successful event. We’ll also be at Snowflake Dev Day on June 6 where you can hear from Meta’s Matthias Reso, and check out our PyTorch booth.


        Happy contributing!

diff --git a/blog/doctr-joins-pytorch-ecosystem/index.html b/blog/doctr-joins-pytorch-ecosystem/index.html new file mode 100644 index 000000000000..ad753056a7ba --- /dev/null +++ b/blog/doctr-joins-pytorch-ecosystem/index.html @@ -0,0 +1,783 @@

docTR joins PyTorch Ecosystem: From Pixels to Data, Building a Recognition Pipeline with PyTorch and docTR | PyTorch

by Olivier Dulcy & Sebastian Olivera, Mindee


        docTR logo


        We’re thrilled to announce that the docTR project has been integrated into the PyTorch ecosystem! This integration ensures that docTR aligns with PyTorch’s standards and practices, giving developers a reliable, community-backed solution for powerful OCR workflows.


        For more information on what it means to be a PyTorch ecosystem project, see the PyTorch Ecosystem Tools page.


        About docTR


        docTR is an Apache 2.0 project developed and distributed by Mindee to help developers integrate OCR capabilities into applications with no prior knowledge required.


        To quickly and efficiently extract text information, docTR uses a two-stage approach:

• First, it performs text detection to localize words.
• Then, it conducts text recognition to identify all characters in a word.

        Detection and recognition are performed by state-of-the-art models written in PyTorch. To learn more about this approach, you can refer to the docTR documentation.


        docTR enhances the user experience in PyTorch projects by providing high-performance OCR capabilities right out of the box. Its specially designed models require minimal to no fine-tuning for common use cases, allowing developers to quickly integrate advanced document analysis features.


        Local installation


docTR requires Python >= 3.10 and supports Windows, Mac, and Linux. Please refer to our README for the necessary dependencies for MacBooks with the M1 chip.

pip3 install -U pip
pip3 install "python-doctr[torch,viz]"

        This will install docTR along with the latest version of PyTorch.

Note: docTR also provides Docker images for easy deployment, for example as part of a Kubernetes cluster.

        Text recognition


        Now, let’s try docTR’s OCR recognition on this sample:


        OCR sample


        The OCR recognition model expects an image with only one word on it and will output the predicted word with a confidence score. You can use the following snippet to test OCR capabilities from docTR:

from doctr.io import DocumentFile
from doctr.models import recognition_predictor

doc = DocumentFile.from_images("/path/to/image")

# Load the OCR model
# This will download pre-trained models hosted by Mindee
model = recognition_predictor(pretrained=True)

result = model(doc)
print(result)

        Here, the most important line of code is model = recognition_predictor(pretrained=True). This will load a default text recognition model, crnn_vgg16_bn, but you can select other models through the arch parameter. You can check out the available architectures.


        When run on the sample, the recognition predictor retrieves the following data: [('MAGAZINE', 0.9872216582298279)]
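
If you want to try a recognition architecture other than the default, a minimal sketch follows; "master" is one of the recognition architectures listed in the docTR documentation, and the set of available names may differ across versions:

from doctr.models import recognition_predictor

# Select a non-default recognition architecture by name via the arch parameter.
model = recognition_predictor(arch="master", pretrained=True)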

Note: the DocumentFile object provides an easy way to manipulate PDFs or images in docTR.

        Text detection


        The last example was a crop on a single word. Now, what about an image with several words on it, like this one?


        photo of magazines


        A text detection model is used before the text recognition to output a segmentation map representing the location of the text. Following that, the text recognition is applied on every detected patch.


        Below is a snippet to run only the detection part:

from doctr.io import DocumentFile
from doctr.models import detection_predictor
from matplotlib import pyplot as plt
from doctr.utils.geometry import detach_scores
from doctr.utils.visualization import draw_boxes

doc = DocumentFile.from_images("path/to/my/file")
model = detection_predictor(pretrained=True)

result = model(doc)

draw_boxes(detach_scores([result[0]["words"]])[0][0], doc[0])
plt.axis('off')
plt.show()

        Running it on the full sample yields the following:


        photo of magazines


        Similarly to the text recognition, detection_predictor will load a default model (fast_base here). You can also load another one by providing it through the arch parameter.


        The full implementation


        Now, let’s plug both components into the same pipeline.


        Conveniently, docTR provides a wrapper that does exactly that for us:

from doctr.io import DocumentFile
from doctr.models import ocr_predictor

doc = DocumentFile.from_images("/path/to/image")

model = ocr_predictor(pretrained=True, assume_straight_pages=False)

result = model(doc)
result.show()

        photo of magazines


        The last line should display a matplotlib window which shows the detected patches. Hovering the mouse over them will display their contents.


        You can also do more with this output, such as reconstituting a synthetic document like so:

import matplotlib.pyplot as plt

synthetic_pages = result.synthesize()
plt.imshow(synthetic_pages[0])
plt.axis('off')
plt.show()

        black text on white


The pipeline is highly customizable: you can modify the detection or recognition model behaviors by passing arguments to the ocr_predictor. Please refer to the documentation to learn more about it.
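
As a small sketch of that customization (the architecture names below are the defaults mentioned earlier in this post; check the documentation for the full list of options):

from doctr.models import ocr_predictor

model = ocr_predictor(
    det_arch="fast_base",         # detection architecture
    reco_arch="crnn_vgg16_bn",    # recognition architecture
    pretrained=True,
    assume_straight_pages=False,  # allow rotated pages and words
)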

        + +

        Conclusion


        We’re excited to welcome docTR into the PyTorch Ecosystem, where it seamlessly integrates with PyTorch pipelines to deliver state-of-the-art OCR capabilities right out of the box.


        By empowering developers to quickly extract text from images or PDFs using familiar tooling, docTR simplifies complex document analysis tasks and enhances the overall PyTorch experience.


        We invite you to explore the docTR GitHub repository, join the docTR community on Slack, and reach out at contact@mindee.com for inquiries or collaboration opportunities.


        Together, we can continue to push the boundaries of document understanding and develop even more powerful, accessible tools for everyone in the PyTorch community.

diff --git a/blog/easily-list-and-initialize-models-with-new-apis-in-torchvision/index.html b/blog/easily-list-and-initialize-models-with-new-apis-in-torchvision/index.html new file mode 100644 index 000000000000..73dd841e10eb --- /dev/null +++ b/blog/easily-list-and-initialize-models-with-new-apis-in-torchvision/index.html @@ -0,0 +1,762 @@

Easily list and initialize models with new APIs in TorchVision | PyTorch

by Vasilis Vryniotis and Laurence Rouesnel


TorchVision now supports listing and initializing all available built-in models and weights by name. This new API builds upon the recently introduced Multi-weight support API, is currently in Beta, and addresses a long-standing request from the community.


        You can try out the new API in the latest nightly release of TorchVision. We’re looking to collect feedback ahead of finalizing the feature in TorchVision v0.14. We have created a dedicated Github Issue where you can post your comments, questions and suggestions!


        Querying and initializing available models


        Before the new model registration API, developers had to query the __dict__ attribute of the modules in order to list all available models or to fetch a specific model builder method by its name:

import torchvision

# Initialize a model by its name:
model = torchvision.models.__dict__[model_name]()

# List available models:
available_models = [
    k for k, v in torchvision.models.__dict__.items()
    if callable(v) and k[0].islower() and k[0] != "_"
]

        The above approach does not always produce the expected results and is hard to discover. For example, since the get_weight() method is exposed publicly under the same module, it will be included in the list despite not being a model. In general, reducing the verbosity (less imports, shorter names etc) and being able to initialize models and weights directly from their names (better support of configs, TorchHub etc) was feedback provided previously by the community. To solve this problem, we have developed a model registration API.


        A new approach


        We’ve added 4 new methods under the torchvision.models module:

from torchvision.models import get_model, get_model_weights, get_weight, list_models

        The styles and naming conventions align closely with a prototype mechanism proposed by Philip Meier for the Datasets V2 API, aiming to offer a similar user experience. The model registration methods are kept private on purpose as we currently focus only on supporting the built-in models of TorchVision.


        List models


        Listing all available models in TorchVision can be done with a single function call:

>>> list_models()
['alexnet', 'mobilenet_v3_large', 'mobilenet_v3_small', 'quantized_mobilenet_v3_large', ...]

        To list the available models of specific submodules:

>>> list_models(module=torchvision.models)
['alexnet', 'mobilenet_v3_large', 'mobilenet_v3_small', ...]
>>> list_models(module=torchvision.models.quantization)
['quantized_mobilenet_v3_large', ...]

        Initialize models


        Now that you know which models are available, you can easily initialize a model with pre-trained weights:

>>> get_model("quantized_mobilenet_v3_large", weights="DEFAULT")
QuantizableMobileNetV3(
  (features): Sequential(
   ....
   )
)

        Get weights


        Sometimes, while working with config files or using TorchHub, you might have the name of a specific weight entry and wish to get its instance. This can be easily done with the following method:

>>> get_weight("ResNet50_Weights.IMAGENET1K_V2")
ResNet50_Weights.IMAGENET1K_V2

        To get the enum class with all available weights of a specific model you can use either its name:

>>> get_model_weights("quantized_mobilenet_v3_large")
<enum 'MobileNet_V3_Large_QuantizedWeights'>

        Or its model builder method:

>>> get_model_weights(torchvision.models.quantization.mobilenet_v3_large)
<enum 'MobileNet_V3_Large_QuantizedWeights'>

        TorchHub support


        The new methods are also available via TorchHub:

import torch

# Fetching a specific weight entry by its name:
weights = torch.hub.load("pytorch/vision", "get_weight", weights="ResNet50_Weights.IMAGENET1K_V2")

# Fetching the weights enum class to list all available entries:
weight_enum = torch.hub.load("pytorch/vision", "get_model_weights", name="resnet50")
print([weight for weight in weight_enum])

        Putting it all together


        For example, if you wanted to retrieve all the small-sized models with pre-trained weights and initialize one of them, it’s a matter of using the above APIs:

import torchvision
from torchvision.models import get_model, get_model_weights, list_models


max_params = 5000000

tiny_models = []
for model_name in list_models(module=torchvision.models):
    weights_enum = get_model_weights(model_name)
    if len([w for w in weights_enum if w.meta["num_params"] <= max_params]) > 0:
        tiny_models.append(model_name)

print(tiny_models)
# ['mnasnet0_5', 'mnasnet0_75', 'mnasnet1_0', 'mobilenet_v2', ...]

model = get_model(tiny_models[0], weights="DEFAULT")
print(sum(x.numel() for x in model.state_dict().values()))
# 2239188

        For more technical details please see the original RFC. Please spare a few minutes to provide your feedback on the new API, as this is crucial for graduating it from beta and including it in the next release. You can do this on the dedicated Github Issue. We are looking forward to reading your comments!

diff --git a/blog/ecosystem-day-2021-recap/index.html b/blog/ecosystem-day-2021-recap/index.html new file mode 100644 index 000000000000..a7234aa7e96f --- /dev/null +++ b/blog/ecosystem-day-2021-recap/index.html @@ -0,0 +1,673 @@

PyTorch Ecosystem Day 2021 Recap and New Contributor Resources | PyTorch

by Team PyTorch


        Thank you to our incredible community for making the first ever PyTorch Ecosystem Day a success! The day was filled with discussions on new developments, trends and challenges showcased through 71 posters, 32 breakout sessions and 6 keynote speakers.


        Special thanks to our keynote speakers: Piotr Bialecki, Ritchie Ng, Miquel Farré, Joe Spisak, Geeta Chauhan, and Suraj Subramanian who shared updates from the latest release of PyTorch, exciting work being done with partners, use case example from Disney, the growth and development of the PyTorch community in Asia Pacific, and latest contributor highlights.


If you missed the opening talks, you can rewatch them here:


        In addition to the talks, we had 71 posters covering various topics such as multimodal, NLP, compiler, distributed training, researcher productivity tools, AI accelerators, and more. From the event, it was clear that an underlying thread that ties all of these different projects together is the cross-collaboration of the PyTorch community. Thank you for continuing to push the state of the art with PyTorch!


To view the full catalogue of posters, please visit the PyTorch Ecosystem Day 2021 Event Page.


        New Contributor Resources


        Today, we are also sharing new contributor resources that we are trying out to give you the most access to up-to-date news, networking opportunities and more.

• Contributor Newsletter - Includes curated news, including RFCs, feature roadmaps, notable PRs, editorials from developers, and more, to help you keep track of everything that’s happening in our community.
• Contributors Discussion Forum - Designed for contributors to learn and collaborate on the latest development across PyTorch.
• PyTorch Developer Podcast (Beta) - Edward Yang, PyTorch Research Scientist at Facebook AI, shares bite-sized (10 to 20 minute) podcast episodes discussing all sorts of internal development topics in PyTorch.

        Thank you,


        Team PyTorch

diff --git a/blog/ecosystem_day_2021/index.html b/blog/ecosystem_day_2021/index.html new file mode 100644 index 000000000000..cc295a41d23c --- /dev/null +++ b/blog/ecosystem_day_2021/index.html @@ -0,0 +1,691 @@

Announcing PyTorch Ecosystem Day | PyTorch

        March 09, 2021

Announcing PyTorch Ecosystem Day

by Team PyTorch


        We’re proud to announce our first PyTorch Ecosystem Day. The virtual, one-day event will focus completely on our Ecosystem and Industry PyTorch communities!


        PyTorch is a deep learning framework of choice for academics and companies, all thanks to its rich ecosystem of tools and strong community. As with our developers, our ecosystem partners play a pivotal role in the development and growth of the community.


        We will be hosting our first PyTorch Ecosystem Day, a virtual event designed for our ecosystem and industry communities to showcase their work and discover new opportunities to collaborate.


PyTorch Ecosystem Day will be held on April 21, with both a morning and an evening session, to ensure we reach our global community. Join us virtually for a day filled with discussions on new developments, trends, challenges, and best practices through keynotes, breakout sessions, and a unique networking opportunity hosted through Gather.Town.


        Event Details

April 21, 2021 (Pacific Time)
Fully digital experience

• Morning Session (EMEA)
  Opening Talks - 8:00 am - 9:00 am PT
  Poster Exhibition & Breakout Sessions - 9:00 am - 12:00 pm PT
• Evening Session (APAC/US)
  Opening Talks - 3:00 pm - 4:00 pm PT
  Poster Exhibition & Breakout Sessions - 3:00 pm - 6:00 pm PT
• Networking - 9:00 am - 7:00 pm PT

        There are two ways to participate in PyTorch Ecosystem Day:

1. Poster Exhibition from the PyTorch ecosystem and industry communities covering a variety of topics. Posters are available for viewing throughout the duration of the event. To be part of the poster exhibition, please see below for submission details. If your poster is accepted, we highly recommend tending to your poster during one of the morning or evening sessions, or both!
2. Breakout Sessions are 40-minute sessions freely designed by the community. The breakouts can be talks, demos, tutorials, or discussions. Note: you must have an accepted poster to apply for the breakout sessions.

        Call for posters now open! Submit your proposal today! Please send us the title and summary of your projects, tools, and libraries that could benefit PyTorch researchers in academia and industry, application developers, and ML engineers for consideration. The focus must be on academic papers, machine learning research, or open-source projects. Please no sales pitches. Deadline for submission is March 18, 2021.


        Visit pytorchecosystemday.fbreg.com for more information and we look forward to welcoming you to PyTorch Ecosystem Day on April 21st!

diff --git a/blog/effective-multi-objective-nueral-architecture/index.html b/blog/effective-multi-objective-nueral-architecture/index.html new file mode 100644 index 000000000000..d42251f35c78 --- /dev/null +++ b/blog/effective-multi-objective-nueral-architecture/index.html @@ -0,0 +1,784 @@

Efficient Multi-Objective Neural Architecture Search with Ax | PyTorch

by David Eriksson, Max Balandat


        tl;dr


        Multi-Objective Optimization in Ax enables efficient exploration of tradeoffs (e.g. between model performance and model size or latency) in Neural Architecture Search. This method has been successfully applied at Meta for a variety of products such as On-Device AI. In this post, we provide an end-to-end tutorial that allows you to try it out yourself.


        Introduction


        Neural networks continue to grow in both size and complexity. Developing state-of-the-art architectures is often a cumbersome and time-consuming process that requires both domain expertise and large engineering efforts. In an attempt to overcome these challenges, several Neural Architecture Search (NAS) approaches have been proposed to automatically design well-performing architectures without requiring a human in-the-loop.


        Despite being very sample-inefficient, naïve approaches like random search and grid search are still popular for both hyperparameter optimization and NAS (a study conducted at NeurIPS 2019 and ICLR 2020 found that 80% of NeurIPS papers and 88% of ICLR papers tuned their ML model hyperparameters using manual tuning, random search, or grid search). But as models are often time-consuming to train and may require large amounts of computational resources, minimizing the number of configurations that are evaluated is important.


        Ax is a general tool for black-box optimization that allows users to explore large search spaces in a sample-efficient manner using state-of-the art algorithms such as Bayesian Optimization. At Meta, Ax is used in a variety of domains, including hyperparameter tuning, NAS, identifying optimal product settings through large-scale A/B testing, infrastructure optimization, and designing cutting-edge AR/VR hardware.


        In many NAS applications, there is a natural tradeoff between multiple metrics of interest. For instance, when deploying models on-device we may want to maximize model performance (e.g., accuracy), while simultaneously minimizing competing metrics such as power consumption, inference latency, or model size, in order to satisfy deployment constraints. In many cases, we have been able to reduce computational requirements or latency of predictions substantially by accepting a small degradation in model performance (in some cases we were able to both increase accuracy and reduce latency!). Principled methods for exploring such tradeoffs efficiently are key enablers of Sustainable AI.


        At Meta, we have successfully used multi-objective Bayesian NAS in Ax to explore such tradeoffs. Our methodology is being used routinely for optimizing AR/VR on-device ML models. Beyond NAS applications, we have also developed MORBO which is a method for high-dimensional multi-objective optimization that can be used to optimize optical systems for augmented reality (AR).


        Fully automated Multi-Objective NAS with Ax


        Ax’s Scheduler allows running experiments asynchronously in a closed-loop fashion by continuously deploying trials to an external system, polling for results, leveraging the fetched data to generate more trials, and repeating the process until a stopping condition is met. No human intervention or oversight is required. Features of the Scheduler include:

• Customizability of parallelism, failure tolerance, and many other settings;
• A large selection of state-of-the-art optimization algorithms;
• Saving in-progress experiments (to a SQL DB or json) and resuming an experiment from storage;
• Easy extensibility to new backends for running trial evaluations remotely.

        The following illustration from the Ax scheduler tutorial summarizes how the scheduler interacts with any external system used to run trial evaluations:


        To run automated NAS with the Scheduler, the main things we need to do are:

• Define a Runner, which is responsible for sending off a model with a particular architecture to be trained on a platform of our choice (like Kubernetes, or maybe just a Docker image on our local machine). In the tutorial below, we use TorchX for handling deployment of training jobs.
• Define a Metric, which is responsible for fetching the objective metrics (such as accuracy, model size, latency) from the training job. In our tutorial, we use Tensorboard to log data, and so can use the Tensorboard metrics that come bundled with Ax.

        Tutorial


        In our tutorial we show how to use Ax to run multi-objective NAS for a simple neural network model on the popular MNIST dataset. While the underlying methodology can be used for more complicated models and larger datasets, we opt for a tutorial that is easily runnable end-to-end on a laptop in less than an hour. In our example, we will tune the widths of two hidden layers, the learning rate, the dropout probability, the batch size, and the number of training epochs. The goal is to trade off performance (accuracy on the validation set) and model size (the number of model parameters) using multi-objective Bayesian optimization.
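
The tutorial drives the optimization through the Ax Scheduler together with TorchX, but as a minimal sketch of the same kind of search space and objectives using Ax’s simpler Service API - the thresholds and the train_and_evaluate helper below are illustrative assumptions, not the tutorial’s exact values:

from ax.service.ax_client import AxClient
from ax.service.utils.instantiation import ObjectiveProperties

ax_client = AxClient()
ax_client.create_experiment(
    name="mnist_multi_objective_nas",
    parameters=[
        {"name": "hidden_size_1", "type": "range", "bounds": [16, 128], "value_type": "int"},
        {"name": "hidden_size_2", "type": "range", "bounds": [16, 128], "value_type": "int"},
        {"name": "learning_rate", "type": "range", "bounds": [1e-4, 1e-1], "log_scale": True},
        {"name": "dropout", "type": "range", "bounds": [0.0, 0.5]},
        {"name": "batch_size", "type": "choice", "values": [32, 64, 128, 256]},
        {"name": "epochs", "type": "range", "bounds": [1, 4], "value_type": "int"},
    ],
    objectives={
        # Reference-point thresholds for the two competing objectives (illustrative values).
        "val_acc": ObjectiveProperties(minimize=False, threshold=0.90),
        "num_params": ObjectiveProperties(minimize=True, threshold=80_000),
    },
)

for _ in range(25):
    params, trial_index = ax_client.get_next_trial()
    # train_and_evaluate is a hypothetical user-supplied function that trains the
    # two-hidden-layer network with these hyperparameters and returns both metrics.
    val_acc, num_params = train_and_evaluate(params)
    ax_client.complete_trial(
        trial_index=trial_index,
        raw_data={"val_acc": val_acc, "num_params": num_params},
    )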


        The tutorial makes use of the following PyTorch libraries:

• PyTorch Lightning (specifying the model and training loop)
• TorchX (for running training jobs remotely / asynchronously)
• BoTorch (the Bayesian optimization library that powers Ax’s algorithms)

        The complete runnable example is available as a PyTorch Tutorial.


        Results


        The final results from the NAS optimization performed in the tutorial can be seen in the tradeoff plot below. Here, each point corresponds to the result of a trial, with the color representing its iteration number, and the star indicating the reference point defined by the thresholds we imposed on the objectives. We see that our method was able to successfully explore the trade-offs between validation accuracy and number of parameters and found both large models with high validation accuracy as well as small models with lower validation accuracy. Depending on the performance requirements and model size constraints, the decision maker can now choose which model to use or analyze further.


        Visualizations


        Ax provides a number of visualizations that make it possible to analyze and understand the results of an experiment. Here, we will focus on the performance of the Gaussian process models that model the unknown objectives, which are used to help us discover promising configurations faster. Ax makes it easy to better understand how accurate these models are and how they perform on unseen data via leave-one-out cross-validation. In the figures below, we see that the model fits look quite good - predictions are close to the actual outcomes, and predictive 95% confidence intervals cover the actual outcomes well. Additionally, we observe that the model size (num_params) metric is much easier to model than the validation accuracy (val_acc) metric.


        Takeaways

• We showed how to run a fully automated multi-objective Neural Architecture Search using Ax.
• Using the Ax Scheduler, we were able to run the optimization automatically in a fully asynchronous fashion - this can be done locally (as done in the tutorial) or by deploying trials remotely to a cluster (simply by changing the TorchX scheduler configuration).
• The state-of-the-art multi-objective Bayesian optimization algorithms available in Ax allowed us to efficiently explore the tradeoffs between validation accuracy and model size.

        Advanced Functionality


        Ax has a number of other advanced capabilities that we did not discuss in our tutorial. Among these are the following:


        Early Stopping


        When evaluating a new candidate configuration, partial learning curves are typically available while the NN training job is running. We can use the information contained in the partial curves to identify under-performing trials to stop early in order to free up computational resources for more promising candidates. While not demonstrated in the above tutorial, Ax supports early stopping out-of-the-box.


        High-dimensional search spaces


        In our tutorial, we used Bayesian optimization with a standard Gaussian process in order to keep the runtime low. However, these models typically scale to only about 10-20 tunable parameters. Our new SAASBO method (paper, Ax tutorial, BoTorch tutorial) is very sample-efficient and enables tuning hundreds of parameters. SAASBO can easily be enabled by passing use_saasbo=True to choose_generation_strategy.
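
As a minimal sketch, assuming an Ax Experiment object named experiment has already been created, enabling SAASBO might look like this:

from ax.modelbridge.dispatch_utils import choose_generation_strategy

# use_saasbo=True selects the more sample-efficient SAASBO model for the
# Bayesian optimization phase of the resulting generation strategy.
generation_strategy = choose_generation_strategy(
    search_space=experiment.search_space,
    use_saasbo=True,
)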


        Acknowledgements


        We thank the TorchX team (in particular Kiuk Chung and Tristan Rice) for their help with integrating TorchX with Ax, and the Adaptive Experimentation team @ Meta for their contributions to Ax and BoTorch.


        References


        D. Eriksson, P. Chuang, S. Daulton, M. Balandat. Optimizing model accuracy and latency using Bayesian multi-objective neural architecture search. Meta Research blog, July 2021.

diff --git a/blog/efficient-large-scale-training-with-pytorch/index.html b/blog/efficient-large-scale-training-with-pytorch/index.html new file mode 100644 index 000000000000..237b5cdf94db --- /dev/null +++ b/blog/efficient-large-scale-training-with-pytorch/index.html @@ -0,0 +1,1096 @@

Efficient Large-Scale Training with Pytorch FSDP and AWS | PyTorch

by Less Wright, Hamid Shojanazeri, Geeta Chauhan

        +

        Cutting-edge AI models are becoming extremely large. The cost and overhead of training these models is increasing rapidly, and involves large amounts of engineering and guesswork to find the right training regime. FSDP reduces these costs significantly by enabling you to train much larger models with the same amount of resources. FSDP lowers the memory footprint on your GPUs, and is usable via a lightweight configuration that requires substantially less effort, typically with just a few lines of code.

        + +

        The main performance gains in FSDP come from maximizing the overlap between network communication and model computation, and eliminating the memory redundancy inherent in traditional data parallel training (DDP). PyTorch FSDP can train models approximately 4x larger on the same server resources as DDP and 20x larger if we combine activation checkpointing and activation offloading.

        + +

As of PyTorch 1.12, FSDP is in beta status, and it has added a number of new features that can be tuned to further accelerate your model training.

        + +

In this series of blog posts, we will explain multiple performance optimizations you can run with FSDP to boost your distributed training speed and model sizes within the context of your available server resources. We use the HuggingFace T5 3B and 11B models, as well as DeepViT, in fine-tuning mode, as the running examples throughout the series.

        + +

        As a preview of some of the optimizations discussed in this series, we show the before and after performance scaled in Flops below (Note that these results can vary based on your server resources and model architecture).

        + +

        + +

        + +

        *T5 3B Performance measured on AWS A100 and A10 servers. Original with no optimizations and Tuned with the applied optimization

        + +

        + +

        + +

        *T5 11B Performance measured on A100 servers. Original with no optimizations and Tuned with the applied optimization

        + +

In this first post, we will provide a quick overview of FSDP and how it can make training large-scale AI models more efficient. We will briefly highlight the multiple performance options available, and dive deeper into their details in upcoming posts. We will then conclude with an overview of how to leverage AWS ParallelCluster for large-scale training with FSDP.

| Optimization | T5 Model | Throughput Improvement |
| --- | --- | --- |
| Mixed Precision | 3 B | 5x |
| Mixed Precision | 11 B | 10x |
| Activation Checkpointing (AC) | 3 B | 10x |
| Activation Checkpointing (AC) | 11 B | 100x |
| Transformer Wrapping Policy | 3 B | 2x |
| Transformer Wrapping Policy | 11 B | Unable to run the experiment without the transformer wrapping policy. |
| Full Shard Strategy | 3 B | 1.5x |
| Full Shard Strategy | 11 B | Not able to run with Zero2 |
        + +

        Performance optimization gains on T5 models over non-optimized.

        + +

        In our experiments with the T5 3B model, using the transformer wrapping policy resulted in >2x higher throughput measured in TFLOPS versus the default wrapping policy. Activation checkpointing resulted in 10x improvement by reinvesting the freed memory from the checkpoints into larger batch size. Mixed precision with BFloat16 resulted in ~5x improvement versus FP32 and finally the full sharding strategy versus zero2 (DDP) resulted in 1.5x improvement.

        + +

        We ran similar experiments for a larger model, T5 11B, but the larger model size resulted in some changes to the experiment space. Specifically, we found that two optimizations, transformer wrapping policy and activation checkpointing, were needed to enable us to run these experiments on 3 nodes (each node had 8 A100 gpus with 80 GB of memory). With these optimizations, we could fit a batch size of 50 and get higher throughput compared to removing each one of them. Thus rather than running on/off solely for a single optimization test as with the 3B model, the larger model experiments were done with 1 of 3 optimizations turned on/off while always running the other two in order to allow a usable batch size for both test states for each item.

        + +

Based on TFLOP comparisons, with the 11B model we saw even more payoff from the optimizations. Mixed precision (~10x improvement) and activation checkpointing (~100x improvement) had a much larger impact with the 11B model than with the 3B parameter model. With mixed precision we could fit ~2x larger batch sizes, and with activation checkpointing a >15x larger batch size (from 3 with no activation checkpointing to 50 with it), which translated into large throughput improvements.

        + +

We have also observed that for these larger models (> 3B parameters), using the Zero2 sharding strategy leaves minimal room in memory for the batch data, forcing very small batch sizes (e.g., 1-2); this essentially makes the full sharding strategy a necessity for fitting larger batch sizes.
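For reference, the sharding strategy is selected when wrapping the model. The snippet below is a minimal sketch, assuming a PyTorch 1.12-style FSDP API, an already-initialized process group, and a model variable holding the module to wrap:

import torch
from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
from torch.distributed.fsdp import ShardingStrategy

# FULL_SHARD shards parameters, gradients and optimizer states (Zero3-style);
# ShardingStrategy.SHARD_GRAD_OP would shard only gradients and optimizer states (Zero2-style).
model = FSDP(
    model,
    sharding_strategy=ShardingStrategy.FULL_SHARD,
    device_id=torch.cuda.current_device(),
)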

        + +

        Note - this tutorial assumes a basic understanding of FSDP. To learn more about basics of FSDP please refer to the getting started and advanced FSDP tutorials.

        + +

What is FSDP? How does it make Large-Scale Training More Efficient?

        + +

        FSDP expands upon distributed data parallel, by parallelizing not just data, but the model parameters, the optimizer states and gradients associated with the model. Specifically - each GPU only stores a subset of the entire model and the associated subset of optimizer states and gradients.

        + +

        To show the evolution of distributed training, we can start from the beginning, where AI models were simply trained on a single GPU.

        + +

DDP (Distributed Data Parallel) was the initial step up from training with only a single GPU, and was an effort to address the data and model size growth, where multiple GPUs each housed their own copy of the same model. The gain here is that the data for each batch could be split and processed independently on each GPU, all at the same time, thus parallelizing the processing of the data set and increasing training speed with the increasing number of GPUs. The tradeoff is the need to communicate the gradients between each GPU to synchronize the models after the backward pass.

        + +

FSDP expands on scaling models by removing the redundancy of optimizer calculations and state storage, as well as the gradient and memory storage of model parameters, that are present in DDP. This redundancy reduction, along with increased communication overlap where model parameter communication takes place at the same time as model computation, is what allows FSDP to train much larger models with the same resources as DDP.

        + +

        A key point is that this efficiency also allows for AI models that are larger than a single GPU to be trained. The model size available for training is now increased to the aggregate memory of all GPUs, rather than the size of a single GPU. (And as a point of note, FSDP can go beyond aggregated GPU memory by leveraging CPU memory as well, though we will not directly cover this aspect here).

        + +

As discussed in a previous blog post, with DDP the largest model that we could train on 32 A100 GPUs with 40 GB memory (4 nodes) was up to 3B parameters with a batch size of 128, with the help of activation checkpointing. By contrast, using FSDP we were able to train up to an 81B parameter model, combining activation checkpointing with activation and parameter offloading. In another experiment, we benchmarked a 1T parameter model with FSDP using 512 GPUs.

        + +

        + +

        + +

        For intuition on the parameter level workings of FSDP, below we show an animation detailing how the model parameters are sharded and communicated assuming a two GPU scenario and a simple 8 parameter model:

        + +

        + +

        + +

        Above - the animations walk through the steps involved with the initial sharding of the model amongst ranks, and we start the all_gathers and forward pass

        + +

        + +

        + +

        We continue through the model with the forward pass. After each FSDP unit completes, non-locally owned params are dropped to free memory, and optionally activations can be checkpointed. This continues until we finish the forward pass and compute the loss.

        + +

        + +

        + +

        During the backward pass, another all_gather is used to load the parameters and the gradients are computed. These gradients are then reduce_scattered so that the local owners of each param can aggregate and prepare to update the weights.

        + +

        + +

        + +

        Finally, each rank passes the summed gradients through the optimizer states and updates the weights to complete the mini-batch.

        + +

        With the model now distributed across the entire set of available GPUs, the logical question is how data moves through the model given this sharding of model parameters.

        + +

This is accomplished by FSDP coordinating with all GPUs to effectively share (communicate) the respective parts of the model. The model is decomposed into FSDP units, and parameters within each unit are flattened and then sharded across all GPUs. Within each FSDP unit, GPUs are assigned interleaved ownership of individual model parameters.

        + +

By interleaving, we mean the following - assuming 2 GPUs with IDs 1 and 2, the FSDP unit ownership pattern would be [12121212], rather than a contiguous chunk of [111222].

        + +

        During training, an all_gather is initiated and the locally owned model parameters within a FSDP unit are shared by the owner GPU with the other non-owners, when they need it, on a ‘just in time’ type basis. FSDP prefetches parameters to overlap all_gather communication with computation.

        + +

        When those requested parameters arrive, the GPU uses the delivered parameters, in combination with the parameters it already owns, to create a fully populated FSDP unit. Thus there is a moment where each GPU hits peak memory usage while holding a fully populated FSDP unit.

        + +

It then processes the data through the FSDP unit, and drops the parameters it received from other GPUs to free up memory for the next unit. The process continues over and over, proceeding through the entire model to complete the forward pass. The process is then repeated (in general) for the backward pass. (Note - this is a simplified version for understanding; there is additional complexity, but this should help construct a basic mental model of the FSDP process.)

        + +

This eliminates much of the memory redundancy present in DDP, but imposes the cost of higher amounts of network communication to shuttle these requested parameters back and forth amongst all the GPUs. Overlapping the communication timing with the computation taking place is the basis of many of the performance improvements we’ll discuss in this series. The key gains are frequently based on the fact that communication can often take place at the same time as computation. As you can surmise, having high communication speed is vital for FSDP performance.

        + +

        How do I optimize my training with FSDP?

        + +

        There are four main performance improvements we will cover - the transformer wrapper, activation checkpointing, mixed precision, and selecting the proper sharding strategy. The flowchart below will help as a checklist for tuning options that we will discuss in this post.

        + +

        + +

        + +

        Wrapping policy - for transformers, use Transformer wrapping policy

        + +

        The first performance optimization is leveraging the FSDP transformer wrapper for transformer models.

        + +

One of the pre-defined wrapping policies is size_based_auto_wrap_policy. With size_based_auto_wrap_policy, FSDP traverses the module structure from bottom to top, and a new FSDP unit is created once the current unit has at least the min_num_params specified within the size policy (this defaults to 1e8, or 100M). If the module cannot be created as an FSDP unit, FSDP will continue to check its parent module. This size-based wrapping policy may not be ideal for some model structures; the PyTorch distributed team is actively working on a new default wrapping policy for the next release that is based on both size and module execution order, so that users can simply tune the size and achieve optimized performance.
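For reference, using the size-based policy typically looks like the sketch below (the ~100M threshold mirrors the default mentioned above; this is a sketch assuming the PyTorch 1.12 wrap utilities, not code from the original post):

import functools

import torch
from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
from torch.distributed.fsdp.wrap import size_based_auto_wrap_policy

# Create a new FSDP unit whenever the accumulated parameter count exceeds ~100M.
size_policy = functools.partial(size_based_auto_wrap_policy, min_num_params=int(1e8))

model = FSDP(
    model,
    auto_wrap_policy=size_policy,
    device_id=torch.cuda.current_device(),
)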

        + +

        In the current release, you can greatly improve your performance when running Transformer models by using the ‘transformer wrapper’. You will need to provide the appropriate layer class for your model. Here, layer class is the class that houses the Multi-Head Attention and Feed Forward Network.

        + +

        FSDP will then form the FSDP units around the layer class rather than arbitrary breaks based on parameter size. By sharding the model around layer classes that are uniformly repeated within the transformer, FSDP can create uniform FSDP units that better balance the overlap of computation and communication. By contrast, size based wrapping can produce very uneven or skewed shards for models, which then have uneven matching of compute vs communication overlap. As discussed earlier, the main driver of FSDP high performance is the overlap of communication and computation, and hence why the Transformer wrapper provides improved performance. Note that the Transformer wrapper can also be used for non-transformer models if these models have a list of uniform layers.

        + +

        Let’s compare the performance difference on a T5, 3B parameter model when running under the default wrapper and the transformer wrapper.

        + +

        For default wrapping, we don’t need to take any action - we simply pass the model to FSDP as shown:

        + +
        model = FSDP(
        +      model,
        +      device_id=torch.cuda.current_device(),
        +  )
        +
        + +

        In this case FSDP will simply wrap the whole model in a single FSDP unit.

        + +

        Running on an NVIDIA A100-SXM4–40GB with 8 GPUs, we are able to reach 2.3 TFlops and 95% GPU memory utilization with a batch size of 14.

        + +

        However, since T5 is a transformer model, we are better served to leverage the transformer wrapper for this model.

        + +

        To use that, we need to isolate the layer class for the transformer, and then pass it in to create our transformer wrapper.

        + +
        from transformers.models.t5.modeling_t5 import T5Block
        +
        + +

        And now we can create our Transformer wrapper:

        + +
        transformer_auto_wrapper_policy = functools.partial(
        +        transformer_auto_wrap_policy,
        +        transformer_layer_cls={
        +            T5Block,  # < ---- Your Transformer layer class
        +        },
        +    )
        +
        + +

        With our model aware wrapper ready, we can initialize FSDP:

        + +
        # invoke FSDP with your transformer wrapper policy:
        +model = FSDP(
        +        model,
        +        auto_wrap_policy=transformer_auto_wrapper_policy,
        +        device_id=torch.cuda.current_device(),  # streaming init
        +    )
        +
        + +

Running this wrapped model, we can see some substantial performance gains. We can fit nearly double the batch size, going to 28, and with better memory and communication efficiency, we see the TFlops increase from 2.3 to 5.07.

        + +

Thus, we’ve more than doubled our training throughput (2.19x) simply by providing greater model info to FSDP! The transformer wrapping policy results in more fine-grained and balanced FSDP units, each holding a layer class, which leads to a more effective communication-computation overlap.

        + +

        + +

        + +

        Above: Graphical comparison of TFlops based on wrapper type

        + +

        If you are training a Transformer model, it pays to configure your training with FSDP using the transformer wrapper. For more information on how to isolate your layer class, please see our in depth video on Transformer wrapping here, where we walk through a number of transformers showing where the layer class can be found.

        + +

        Mixed precision - use BF16 if you have an Ampere architecture GPU

        + +

        FSDP supports a flexible mixed precision policy that gives you granular control over parameters, gradients and buffer data types. This lets you easily leverage BFloat16 or FP16 to increase your training speed by up to 70%.

        + +

*Note that BFloat16 is only available on Ampere-type GPUs. On AWS, this is available with p4dn and g5 instances.

        + +

        By way of comparison, we can show a 77% speed improvement when comparing fully tuned BFloat16 vs FP32 on an 8B DeepVit model.

        + +

        + +

        + +

        We have obtained even greater acceleration using BFloat16 in fine-tuning a 3B HuggingFace T5 model as shown in the figures below. We observed that because of the lower precision the validation loss of BFloat16 is slightly behind in the first few epochs, but it is able to catch up and results in the same final accuracy as FP32.

        + +

        + +

        + +

        To use mixed precision, we create a policy with our desired data types, and pass it in during the FSDP initialization.

        + +

To create our policy, we need to import the MixedPrecision class, and then define our custom policy with our desired data types:

        + +
import torch
from torch.distributed.fsdp import MixedPrecision

bfSixteen = MixedPrecision(
   # Parameter precision.
   param_dtype=torch.bfloat16,
   # Gradient communication precision.
   reduce_dtype=torch.bfloat16,
   # Buffer precision.
   buffer_dtype=torch.bfloat16,
)
model = FSDP(
       model,
       auto_wrap_policy=transformer_auto_wrapper_policy,
       mixed_precision=bfSixteen)
        + +

        You can mix and match the precision for parameters, gradients and buffers as you prefer:

        + +
        comboPolicy = MixedPrecision(
        +        # Param precision
        +        param_dtype=torch.bfloat16,
        +        # Gradient communication precision.
        +        reduce_dtype=torch.float32,
        +        # Buffer precision.
        +        buffer_dtype=torch.float32,
        +    )
        +
        + +

For training with FP16, you will also need to use the ShardedGradScaler, which we will cover in subsequent posts. For BFloat16, no gradient scaler is needed, so mixed precision is effectively a drop-in replacement.
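As a preview, an FP16 training step with the sharded scaler follows the familiar GradScaler pattern. This is a simplified sketch that assumes model has already been wrapped with an FP16 MixedPrecision policy and that optimizer, loss_fn and data_loader are defined elsewhere:

from torch.distributed.fsdp.sharded_grad_scaler import ShardedGradScaler

scaler = ShardedGradScaler()

for batch, labels in data_loader:
    optimizer.zero_grad()
    loss = loss_fn(model(batch), labels)
    scaler.scale(loss).backward()  # scale the loss to avoid FP16 underflow
    scaler.step(optimizer)         # unscale gradients and step the optimizer
    scaler.update()                # adjust the scale factor for the next step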

        + +

        AnyPrecision Optimizer - going beyond mixed precision with full BF16 training

        + +

        Mixed precision training, both in FSDP and elsewhere, maintains the working weights in the reduced datatype (BF16 or FP16) while keeping the master weights in full FP32. The reason for the master weights in FP32 is that running in pure BF16 will result in ‘weight stagnation’, where very small weight updates are lost due to the lower precision, and the accuracy flatlines over time while FP32 weights can continue to improve from these small updates.

        + +

        In order to resolve this dilemma, we can use the new AnyPrecision optimizer available in TorchDistX (Torch Distributed Experimental) that allows you to successfully train and keep the master weights in pure BF16 instead of FP32. In addition, unlike the typical storage of optimizer states in FP32, AnyPrecision is able to maintain states in pure BF16 as well.

        + +

        AnyPrecision enables pure BF16 training by maintaining an extra buffer that tracks the precision lost during the weight updates and re-applies that during the next update…effectively resolving the weight stagnation issue without requiring FP32.

        + +

        As a comparison of the throughput gains available with pure BF16 training using AnyPrecision, we ran experiments using FSDP with the T5 11B model with regular FP32 training, Mixed Precision training with BF16, and pure BF16 training using the AnyPrecision optimizer on 3 nodes with A100 gpus as mentioned previously.

        + +

        + +

        + +

        As shown above, training with AnyPrecision and pure BF16 resulted in 2x the throughput vs Mixed Precision, and over 20x improvement vs FP32.

        + +

        The potential tradeoff is the impact on final accuracy - in the cases we tested, the accuracy was equal or better than FP32 due to a regularization effect from the slightly reduced precision, but your results may vary.

        + +

The AnyPrecision optimizer is available for you to test with here, and is a drop-in replacement for the AdamW optimizer.
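A sketch of swapping it in could look like the following; note that the import path and argument names here are assumptions based on the TorchDistX repository rather than something verified in this post, so double-check them against the torchdistx documentation:

import torch
from torchdistx.optimizers import AnyPrecisionAdamW  # assumed import path

optimizer = AnyPrecisionAdamW(
    model.parameters(),
    lr=1e-4,
    weight_decay=0.01,
    # Keep optimizer states in pure BF16 and rely on Kahan summation
    # to compensate for the precision lost during weight updates.
    momentum_dtype=torch.bfloat16,
    variance_dtype=torch.bfloat16,
    use_kahan_summation=True,
)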

        + +

        Activation checkpointing - increasing throughput by trading compute for memory

        + +

        + +

        + +

        FSDP supports activation checkpointing once the model has been sharded, and makes it easy to implement. The graph above shows ~4x throughput improvement using activation checkpointing.

        + +

        Activation checkpointing is where the intermediate activations are freed during the forward pass, and a checkpoint is left as a placeholder. This generally increases available GPU memory by over 30%.

        + +

        The tradeoff is that during the backward pass, these previously removed intermediate activations must be re-calculated again using information in the checkpoint (duplicate compute), but by leveraging the increased GPU memory, one can increase the batch size such that the net throughput can increase substantially.

        + +
        # verify we have FSDP activation support ready by importing:
        +from torch.distributed.algorithms._checkpoint.checkpoint_wrapper import (
        +   checkpoint_wrapper,
        +   CheckpointImpl,
        +   apply_activation_checkpointing_wrapper,
        +)
        +
        + +

The steps required to implement activation checkpointing are to first import the FSDP checkpointing functions (as shown above), then declare our checkpoint wrapper type, which is non-reentrant, and create a check function to identify which layers to wrap, as follows:

        + +
from functools import partial

non_reentrant_wrapper = partial(
    checkpoint_wrapper,
    offload_to_cpu=False,
    checkpoint_impl=CheckpointImpl.NO_REENTRANT,
)
# Only checkpoint the transformer layer class (T5Block was imported earlier).
check_fn = lambda submodule: isinstance(submodule, T5Block)

apply_activation_checkpointing_wrapper(
    model, checkpoint_wrapper_fn=non_reentrant_wrapper, check_fn=check_fn
)
        + +

        Important note - this must be run after the model has been initialized with FSDP.
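Putting the ordering together, a condensed sketch of the two steps (reusing the wrapper policy, check function and imports defined above) is:

# 1) Shard the model first.
model = FSDP(
    model,
    auto_wrap_policy=transformer_auto_wrapper_policy,
    device_id=torch.cuda.current_device(),
)

# 2) Then apply activation checkpointing to the already-sharded model.
apply_activation_checkpointing_wrapper(
    model, checkpoint_wrapper_fn=non_reentrant_wrapper, check_fn=check_fn
)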

        + +

        However, hopefully you’ve seen how some initial tuning with FSDP options can have a large impact on your training performance.

        + +

        With that, we turn our attention from how to scale within FSDP, to how to scale your server hardware for FSDP using AWS.

        + +

        Large Scale Training with FSDP on AWS - For multi-node prioritize high speed network

        + +

        AWS provides several services that can be used to run distributed training with FSDP: Amazon EC2 Accelerated Computing instances, AWS ParallelCluster, and Amazon Sagemaker.

        + +

        In this series of blog posts, we used Amazon EC2 p4d instances in a single-instance multi-GPU configuration and in a multi-instance configuration using AWS ParallelCluster and SageMaker in order to run our training jobs.

        + +

        Here, we’ll focus specifically on AWS parallel cluster and provide an overview of how to utilize it for training purposes.

        + +

        AWS ParallelCluster Setup

        + +

        AWS ParallelCluster is an open source, cluster management tool that makes it easy for you to deploy and manage High Performance Computing (HPC) clusters on AWS. AWS ParallelCluster uses yaml configuration files to provision all the necessary resources. It also supports multiple instance types, job submission queues, shared file systems like Amazon EFS (NFS) or Amazon FSx for Lustre, and job schedulers like AWS Batch and Slurm.

        + +

        + +

        + +

        Workflow on Clusters

        + +

        The high level idea is to have a cluster that has a head node which controls the compute nodes. The actual training job runs on the compute nodes. Overall steps to run a training job on a cluster are as follows:

        + +
1. Set up an AWS ParallelCluster (we discuss this below).
2. Connect to the head node, and import the training code / set up the environment.
3. Pull the data and place it in a shared folder that compute nodes can access (FSx for Lustre drive).
4. Run the training job using a job scheduler (in this case Slurm).
        + +

Setup AWS ParallelCluster

        + +

To set up AWS ParallelCluster:

        + +
1. Deploy a network stack. This step is optional since you could use your account default VPC and let AWS ParallelCluster create your subnets and security groups. However, we prefer to compartmentalize our desired network infrastructure and do this deployment via a CloudFormation stack.

          + +

          Since we deploy a public and a private subnet, we want to create them into an Availability Zone that contains our target instances, in this case p4d. We consult their availability in the region we use (us-east-1) through the following AWS CLI command:

          + +

          aws ec2 describe-instance-type-offerings --location-type availability-zone \ --filters Name=instance-type,Values=p4d.24xlarge --region us-east-1 --output table

          + +

          We see three availability zones containing p4d instances, we pick one of them (us-east-1c, yours may be different) when deploying our network stack. This can be done with the AWS Console or the AWS CLI. In our case we use the latter as follows

          + +

          aws cloudformation create-stack --stack-name VPC-Large-Scale --capabilities CAPABILITY_IAM --template-body file://VPC-Large-Scale.yaml --parameters ParameterKey=SubnetsAZ,ParameterValue=us-east-1c

          + +

          CloudFormation will deploy our new VPC, subnets, security groups and endpoints on our behalf. Once done, you can retrieve the IDs of the public and private subnets by querying the stack outputs and the values PublicSubnet and PrivateSubnet.

          + +

          For example, using the AWS CLI for the private subnet:

          + +

          aws cloudformation describe-stacks --stack-name VPC-Large-Scale --query "Stacks[0].Outputs[?OutputKey=='PrivateSubnet'].OutputValue" --output text

2. Create the ParallelCluster. The cluster configuration file specifies the resources for our cluster. These resources include the instance type for the head node, the compute nodes, access to S3 buckets, and the shared storage where our data will be located. We will use Amazon FSx for Lustre, which offers a fully managed shared storage service with Lustre.

          + +

Here is an example of a cluster configuration file. We can use the AWS ParallelCluster CLI to create the cluster. Please note that the private and public subnet IDs will need to be replaced by the ones you retrieved earlier. You will be able to control the cluster using the AWS ParallelCluster CLI to start, stop, pause, etc.

          + +
          pcluster create-cluster --cluster-name my-hpc-cluster --cluster-configuration cluster.yaml
          +
3. SSH to the head node - once the cluster is ready, we can connect to the head node using the SSH protocol, pull our training code, and place the data in the shared storage specified in the cluster configuration file.

          + +
          pcluster ssh --cluster-name cluster -i your-key_pair
          +
4. Launch the training job - now that we have the data and training code, we can launch the Slurm job for training. Here is an example of a Slurm script to launch the job using torchrun.
        + +

More details on how to set up the cluster are out of the scope of this post; however, we will have a separate post on it.

        + +

        What’s next?

        + +

        With this post we provided a high level overview of FSDP and how it efficiently scales distributed AI training. The flowchart included will help provide a checklist for you to review tuning options discussed such as the transformer wrapper and activation checkpointing.

        + +

        In the next posts, we will continue with the T5 model and go deeper into each of the topics above, specifically with sharding strategy and other optimizations to provide more insight and details. For now, a good reference for the sharding strategy is in our video tutorial here:

        + +

        If you have questions or find an issue, please find the authors Less, Hamid and Geeta or open an issue on PyTorch github.

        + +

        Special thanks to:

        + +

        Pytorch Distributed team, Shen Li, Rohan Varma, Yanli Zhao, Andrew Gu, Anjali Sridhar, Ana Simoes, Pierre-Yves Aquilanti, Sundar Ranganathan, and the broader AWS team for supporting us with providing infrastructure and technical support for running the large scale experiments.

        + +

        Resources:

        + +

        FSDP video series

        + +

        Getting started with FSDP

        + +

        Advanced tutorial on FSDP

        + +

        API documentation

diff --git a/blog/efficient-pytorch-io-library-for-large-datasets-many-files-many-gpus/index.html b/blog/efficient-pytorch-io-library-for-large-datasets-many-files-many-gpus/index.html new file mode 100644 index 000000000000..f9fa3f00dac0 --- /dev/null +++ b/blog/efficient-pytorch-io-library-for-large-datasets-many-files-many-gpus/index.html @@ -0,0 +1,830 @@
Efficient PyTorch I/O library for Large Datasets, Many Files, Many GPUs | PyTorch

by Alex Aizman, Gavin Maltby, Thomas Breuel

        +

        Data sets are growing bigger every day and GPUs are getting faster. This means there are more data sets for deep learning researchers and engineers to train and validate their models.

        + +
          +
        • Many datasets for research in still image recognition are becoming available with 10 million or more images, including OpenImages and Places.
        • +
• 8 million YouTube videos (YouTube 8M) consume about 300 TB in 720p, used for research in object recognition, video analytics, and action recognition.
        • +
        • The Tobacco Corpus consists of about 20 million scanned HD pages, useful for OCR and text analytics research.
        • +
        + +

        Although the most commonly encountered big data sets right now involve images and videos, big datasets occur in many other domains and involve many other kinds of data types: web pages, financial transactions, network traces, brain scans, etc.

        + +

However, working with such large datasets presents a number of challenges:

        + +
          +
        • Dataset Size: datasets often exceed the capacity of node-local disk storage, requiring distributed storage systems and efficient network access.
        • +
        • Number of Files: datasets often consist of billions of files with uniformly random access patterns, something that often overwhelms both local and network file systems.
        • +
        • Data Rates: training jobs on large datasets often use many GPUs, requiring aggregate I/O bandwidths to the dataset of many GBytes/s; these can only be satisfied by massively parallel I/O systems.
        • +
        • Shuffling and Augmentation: training data needs to be shuffled and augmented prior to training.
        • +
        • Scalability: users often want to develop and test on small datasets and then rapidly scale up to large datasets.
        • +
        + +

        Traditional local and network file systems, and even object storage servers, are not designed for these kinds of applications. The WebDataset I/O library for PyTorch, together with the optional AIStore server and Tensorcom RDMA libraries, provide an efficient, simple, and standards-based solution to all these problems. The library is simple enough for day-to-day use, is based on mature open source standards, and is easy to migrate to from existing file-based datasets.

        + +

        Using WebDataset is simple and requires little effort, and it will let you scale up the same code from running local experiments to using hundreds of GPUs on clusters or in the cloud with linearly scalable performance. Even on small problems and on your desktop, it can speed up I/O tenfold and simplifies data management and processing of large datasets. The rest of this blog post tells you how to get started with WebDataset and how it works.

        + +

        The WebDataset Library

        + +

        The WebDataset library provides a simple solution to the challenges listed above. Currently, it is available as a separate library (github.com/tmbdev/webdataset), but it is on track for being incorporated into PyTorch (see RFC 38419). The WebDataset implementation is small (about 1500 LOC) and has no external dependencies.

        + +

        Instead of inventing a new format, WebDataset represents large datasets as collections of POSIX tar archive files consisting of the original data files. The WebDataset library can use such tar archives directly for training, without the need for unpacking or local storage.

        + +

        WebDataset scales perfectly from small, local datasets to petascale datasets and training on hundreds of GPUs and allows data to be stored on local disk, on web servers, or dedicated file servers. For container-based training, WebDataset eliminates the need for volume plugins or node-local storage. As an additional benefit, datasets need not be unpacked prior to training, simplifying the distribution and use of research data.

        + +

        WebDataset implements PyTorch’s IterableDataset interface and can be used like existing DataLoader-based code. Since data is stored as files inside an archive, existing loading and data augmentation code usually requires minimal modification.

        + +

        The WebDataset library is a complete solution for working with large datasets and distributed training in PyTorch (and also works with TensorFlow, Keras, and DALI via their Python APIs). Since POSIX tar archives are a standard, widely supported format, it is easy to write other tools for manipulating datasets in this format. E.g., the tarp command is written in Go and can shuffle and process training datasets.

        + +

        Benefits

        + +

        The use of sharded, sequentially readable formats is essential for very large datasets. In addition, it has benefits in many other environments. WebDataset provides a solution that scales well from small problems on a desktop machine to very large deep learning problems in clusters or in the cloud. The following table summarizes some of the benefits in different environments.

| Environment | Benefits of WebDataset |
| --- | --- |
| Local Cluster with AIStore | AIStore can be deployed easily as K8s containers and offers linear scalability and near 100% utilization of network and I/O bandwidth. Suitable for petascale deep learning. |
| Cloud Computing | WebDataset deep learning jobs can be trained directly against datasets stored in cloud buckets; no volume plugins required. Local and cloud jobs work identically. Suitable for petascale learning. |
| Local Cluster with existing distributed FS or object store | WebDataset’s large sequential reads improve performance with existing distributed stores and eliminate the need for dedicated volume plugins. |
| Educational Environments | WebDatasets can be stored on existing web servers and web caches, and can be accessed directly by students by URL. |
| Training on Workstations from Local Drives | Jobs can start training as the data still downloads. Data doesn’t need to be unpacked for training. Ten-fold improvements in I/O performance on hard drives over random access file-based datasets. |
| All Environments | Datasets are represented in an archival format and contain metadata such as file types. Data is compressed in native formats (JPEG, MP4, etc.). Data management, ETL-style jobs, and data transformations and I/O are simplified and easily parallelized. |
        + +

        We will be adding more examples giving benchmarks and showing how to use WebDataset in these environments over the coming months.

        + +

        High-Performance

        +

        For high-performance computation on local clusters, the companion open-source AIStore server provides full disk to GPU I/O bandwidth, subject only to hardware constraints. This Bigdata 2019 Paper contains detailed benchmarks and performance measurements. In addition to benchmarks, research projects at NVIDIA and Microsoft have used WebDataset for petascale datasets and billions of training samples.

        + +

        Below is a benchmark of AIStore with WebDataset clients using 12 server nodes with 10 rotational drives each.

        + +
        + +
        + +

        The left axis shows the aggregate bandwidth from the cluster, while the right scale shows the measured per drive I/O bandwidth. WebDataset and AIStore scale linearly to about 300 clients, at which point they are increasingly limited by the maximum I/O bandwidth available from the rotational drives (about 150 MBytes/s per drive). For comparison, HDFS is shown. HDFS uses a similar approach to AIStore/WebDataset and also exhibits linear scaling up to about 192 clients; at that point, it hits a performance limit of about 120 MBytes/s per drive, and it failed when using more than 1024 clients. Unlike HDFS, the WebDataset-based code just uses standard URLs and HTTP to access data and works identically with local files, with files stored on web servers, and with AIStore. For comparison, NFS in similar experiments delivers about 10-20 MBytes/s per drive.

        + +

        Storing Datasets in Tar Archives

        + +

        The format used for WebDataset is standard POSIX tar archives, the same archives used for backup and data distribution. In order to use the format to store training samples for deep learning, we adopt some simple naming conventions:

        +
          +
        • datasets are POSIX tar archives
        • +
        • each training sample consists of adjacent files with the same basename
        • +
        • shards are numbered consecutively
        • +
        + +

For example, ImageNet is stored in 1282 separate 100 Mbyte shards with names imagenet-train-000000.tar to imagenet-train-001281.tar; the contents of the first shard are:

        + +
        -r--r--r-- bigdata/bigdata      3 2020-05-08 21:23 n03991062_24866.cls
        +-r--r--r-- bigdata/bigdata 108611 2020-05-08 21:23 n03991062_24866.jpg
        +-r--r--r-- bigdata/bigdata      3 2020-05-08 21:23 n07749582_9506.cls
        +-r--r--r-- bigdata/bigdata 129044 2020-05-08 21:23 n07749582_9506.jpg
        +-r--r--r-- bigdata/bigdata      3 2020-05-08 21:23 n03425413_23604.cls
        +-r--r--r-- bigdata/bigdata 106255 2020-05-08 21:23 n03425413_23604.jpg
        +-r--r--r-- bigdata/bigdata      3 2020-05-08 21:23 n02795169_27274.cls
        +
        + +

        WebDataset datasets can be used directly from local disk, from web servers (hence the name), from cloud storage and object stores, just by changing a URL. WebDataset datasets can be used for training without unpacking, and training can even be carried out on streaming data, with no local storage.
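For illustration, a shard following this naming convention could be written with nothing more than Python's standard tarfile module; the keys, class IDs and image bytes below are placeholders invented for the example (the WebDataset library also provides its own writer utilities):

import io
import tarfile

# Hypothetical (sample_key, image_bytes, class_id) records to pack into one shard.
samples = [
    ("n03991062_24866", b"<jpeg bytes>", 718),
    ("n07749582_9506", b"<jpeg bytes>", 951),
]

def add_member(tar, name, payload):
    # Add one file to the archive from an in-memory bytes payload.
    info = tarfile.TarInfo(name=name)
    info.size = len(payload)
    tar.addfile(info, io.BytesIO(payload))

with tarfile.open("imagenet-train-000000.tar", "w") as tar:
    for key, jpeg_bytes, cls in samples:
        # Adjacent files with the same basename form one training sample.
        add_member(tar, f"{key}.jpg", jpeg_bytes)
        add_member(tar, f"{key}.cls", str(cls).encode("utf-8"))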

        + +

        Shuffling during training is important for many deep learning applications, and WebDataset performs shuffling both at the shard level and at the sample level. Splitting of data across multiple workers is performed at the shard level using a user-provided shard_selection function that defaults to a function that splits based on get_worker_info. (WebDataset can be combined with the tensorcom library to offload decompression/data augmentation and provide RDMA and direct-to-GPU loading; see below.)
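To make the worker-splitting idea concrete, a custom shard selection function might look like the sketch below; how it is passed to the dataset (e.g. the exact keyword argument) depends on the WebDataset version, so treat that part as an assumption:

import torch.utils.data

def split_shards_by_worker(urls):
    # Give each DataLoader worker a disjoint, round-robin subset of the shard URLs.
    worker_info = torch.utils.data.get_worker_info()
    if worker_info is None:
        return urls  # single-process loading: keep every shard
    return urls[worker_info.id::worker_info.num_workers]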

        + +

        Code Sample

        +

Here are some code snippets illustrating the use of WebDataset in a typical PyTorch deep learning application (you can find a full example at http://github.com/tmbdev/pytorch-imagenet-wds).

        + +
        import webdataset as wds
        +import ...
        +
        +sharedurl = "/imagenet/imagenet-train-{000000..001281}.tar"
        +
        +normalize = transforms.Normalize(
        +  mean=[0.485, 0.456, 0.406],
        +  std=[0.229, 0.224, 0.225])
        +
        +preproc = transforms.Compose([
        +  transforms.RandomResizedCrop(224),
        +  transforms.RandomHorizontalFlip(),
        +  transforms.ToTensor(),
        +  normalize,
        +])
        +
        +dataset = (
        +  wds.Dataset(sharedurl)
        +  .shuffle(1000)
        +  .decode("pil")
        +  .rename(image="jpg;png", data="json")
        +  .map_dict(image=preproc)
        +  .to_tuple("image", "data")
        +)
        +
        +loader = torch.utils.data.DataLoader(dataset, batch_size=64, num_workers=8)
        +
        +for inputs, targets in loader:
        +  ...
        +
        + +

        This code is nearly identical to the file-based I/O pipeline found in the PyTorch Imagenet example: it creates a preprocessing/augmentation pipeline, instantiates a dataset using that pipeline and a data source location, and then constructs a DataLoader instance from the dataset.

        + +

WebDataset uses a fluent API for configuration that internally builds up a processing pipeline. In this example, WebDataset is used with the PyTorch DataLoader class, which replicates DataSet instances across multiple threads and performs both parallel I/O and parallel data augmentation.

        + +

Without any added processing stages, WebDataset instances themselves just iterate through each training sample as a dictionary:

        + +
        # load from a web server using a separate client process
        +sharedurl = "pipe:curl -s http://server/imagenet/imagenet-train-{000000..001281}.tar"
        +
        +dataset = wds.Dataset(sharedurl)
        +
        +for sample in dataset:
        +  # sample["jpg"] contains the raw image data
        +  # sample["cls"] contains the class
        +  ...
        +
        + +

        For a general introduction to how we handle large scale training with WebDataset, see these YouTube videos.

        + + + +
          +
        • +

          AIStore is an open-source object store capable of full-bandwidth disk-to-GPU data delivery (meaning that if you have 1000 rotational drives with 200 MB/s read speed, AIStore actually delivers an aggregate bandwidth of 200 GB/s to the GPUs). AIStore is fully compatible with WebDataset as a client, and in addition understands the WebDataset format, permitting it to perform shuffling, sorting, ETL, and some map-reduce operations directly in the storage system. AIStore can be thought of as a remix of a distributed object store, a network file system, a distributed database, and a GPU-accelerated map-reduce implementation.

          +
        • +
        • +

          tarp is a small command-line program for splitting, merging, shuffling, and processing tar archives and WebDataset datasets.

          +
        • +
        • +

          tensorcom is a library supporting distributed data augmentation and RDMA to GPU.

          +
        • +
        • +

          pytorch-imagenet-wds contains an example of how to use WebDataset with ImageNet, based on the PyTorch ImageNet example.

          +
        • +
        • +

          Bigdata 2019 Paper with Benchmarks

          +
        • +
        + +

        Check out the library and provide your feedback for RFC 38419.

diff --git a/blog/empowering-models-performance/index.html b/blog/empowering-models-performance/index.html new file mode 100644 index 000000000000..16b7246e7a13 --- /dev/null +++ b/blog/empowering-models-performance/index.html @@ -0,0 +1,728 @@
Empowering Models with Performance: The Art of Generalized Model Transformation Approach | PyTorch

by Jackie (Jiaqi) Xu, Yanbo Liang, Jason Ansel, Chunzhi Yang, Jade Nie, Yuzhen Huang, CK Luk, Xiaodong Wang, Lu Fang, Menglu Yu, Jinwon Lee, Daohang Shi, Flavio Sales Truzzi

        +

        Introduction

        + +

PyTorch 2.0 (PT2) offers a compiled execution mode which rewrites Python bytecode to extract sequences of PyTorch operations, translating them into a Graph IR. The IR is then just-in-time compiled through a customizable back end, improving training performance without user intervention. Production models often go through multiple stages of optimization/lowering to hit performance targets, so having a compiled mode is desirable: it separates the work of improving model performance from direct modification of the PyTorch model implementation, enabling PyTorch users to enhance model performance without changing their model code. This is particularly valuable for optimizing complex models, including large-scale and production-ready ones.
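As a minimal illustration of the compiled mode (a toy model using the public torch.compile entry point, not any of the internal tooling discussed below):

import torch

model = torch.nn.Sequential(
    torch.nn.Linear(128, 256),
    torch.nn.ReLU(),
    torch.nn.Linear(256, 10),
)

# Compile with the default Inductor backend; the first call triggers bytecode
# capture and just-in-time compilation of the extracted graph.
compiled_model = torch.compile(model, backend="inductor")

out = compiled_model(torch.randn(32, 128))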

        + +

In our previous blog post, we outlined how heuristic model transformation rules are employed to optimize intricate production models. While these rules enabled substantial performance gains for some pilot models, they lacked universal adaptability; they don’t consistently perform well across different models, or sometimes even within different sections of a single model.

        + +

        Fig.1 PT1 Graph mode vs PT2 Compile mode.

        + +

        Fig. 1: PT1 Graph mode vs PT2 Compile mode.

        + +

        In this blog post, we propose a more generalized model transformation solution, serving as a plugin to the PT2 compiler as shown in Fig.1 which is more general, performant and user-friendly, bringing performance improvements to both model training and inference without manual efforts. As illustrated in Fig.2, by incorporating the previously user-defined transformations into the compiler, we have streamlined the production stack. These changes bring advantages to a broader range of PyTorch models, extending beyond just Meta models, which has already been incorporated in PT2 and is ready for use to benefit all Pytorch models.

        + +

        Fig.2 Simplified stack with PT2 compile mode.

        + +

        Fig. 2: Simplified stack with PT2 compile mode.

        + +

        Guiding Principle: Atomic Rules

        + +

Traditionally, people might use predefined heuristic rules to replace a model subgraph with another, more performant subgraph to reduce launch overhead, minimize memory bandwidth usage, and fully occupy SMs. However, this approach doesn’t scale well, as it is hard to craft a set of rules that fits all models perfectly.

        + +

        Instead of grappling with bulky, complex rules, we can actually break them down into smaller, more digestible pieces – what we call ‘atomic rules’. These tiny powerhouses of efficiency target the transformation of individual operators, to conduct one step of the fusion/transformation. This makes them easy to handle and apply, offering a straightforward path to optimizing models. So, with these atomic rules in hand, optimizing any model for top-tier performance becomes a breeze!

        + +

        We will walk through some simple examples to demonstrate how we use a chain of atomic rules to replace complicated heuristic rules.

        + +

        Case 1: Horizontal fusion of computation chains started with accesses to embedding tables

        + +

Horizontal fusion means fusing parallel operators into one so as to reduce the number of kernels to be launched and improve performance. In our previous blog (Section 3.2), we described model transformations that fused layernorm and activation functions after embedding bags, as shown in the figure provided. However, this method had limitations:

        + +
          +
        1. It only worked with layernorm and activation functions after embedding.
        2. +
        3. It was restricted to models with specific architecture rules, causing various issues in our production stack, including parameter changes and inference disruptions.
        4. +
        + +

        To improve, we can use three atomic rules as shown in Fig.3 to replace the complicated heuristic rule:

        + +
          +
        • Fuse layernorms that follow the same split nodes horizontally.
        • +
        • Then, fuse tanh functions following the same split nodes horizontally.
        • +
        • Lastly, fuse vertical split-cat nodes.
        • +
        + +

        These atomic rules offer a clean and streamlined way for model simplification and optimization.

        + +

        Fig.3 Before, we optimized the model in one go by replacing subgraphs. Now, with atomic rules, we optimize step-by-step, covering more cases.

        + +

        Fig. 3: Before, we optimized the model in one go by replacing subgraphs. Now, with atomic rules, we optimize step-by-step, covering more cases.

        + +

        Case 2: Fuse horizontal MLP

        + +

        MLPs (Multilayer Perceptrons) are fundamental components of deep neural networks, often consisting of linear, normalization, and activation functions. In complex models, there’s often a need to fuse many horizontal MLPs. Traditional methods find and replace parallel MLPs with a fused module as shown in Fig.4, but this isn’t always straightforward. Some models might not have normalization, or they might use different activation functions, making it hard to apply a one-size-fits-all rule.

        + +

        This is where our atomic rules come in handy. These simplified rules target individual operators one at a time, making the process easier and more manageable. We use the following atomic rules for horizontal MLP fusion:

        + +
          +
        • Fusing horizontal linear operators
        • +
        • Fusing horizontal layernorms.
        • +
        • Fusing horizontal activation functions.
        • +
        + +

        Fig.4 Pseudocode for fusing MLP. Traditional optimizations need manual Python code changes.

        + +

        Fig. 4: Pseudocode for fusing MLP. Traditional optimizations need manual Python code changes.

        + +

        The beauty of these rules is that they’re not limited to one case. They can be applied broadly. Since PyTorch models are built with torch operators, focusing on a smaller set of operators simplifies the process. This approach is not only more manageable but also more general compared to writing a specific large pattern replacement rule, making it easier to optimize various models efficiently.
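To make the idea of fusing horizontal linear operators concrete, here is a small standalone sketch (not the compiler pass itself) that fuses two parallel nn.Linear layers sharing the same input into one wider linear and splits the output back:

import torch
import torch.nn as nn

# Two parallel linears that consume the same input (e.g. two branches of a model).
x = torch.randn(32, 128)
lin_a, lin_b = nn.Linear(128, 64), nn.Linear(128, 32)

# Horizontal fusion: concatenate the weights and biases into one wider linear.
fused = nn.Linear(128, 64 + 32)
with torch.no_grad():
    fused.weight.copy_(torch.cat([lin_a.weight, lin_b.weight], dim=0))
    fused.bias.copy_(torch.cat([lin_a.bias, lin_b.bias], dim=0))

# One matmul instead of two; split the output back into the original branches.
out_a, out_b = fused(x).split([64, 32], dim=-1)

torch.testing.assert_close(out_a, lin_a(x))
torch.testing.assert_close(out_b, lin_b(x))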

        + + + +

        Our principle is to use chained atomic rules to replace heuristic rules. While this approach covers a wider range of cases, it does entail a longer time for graph search and pattern matching. The next question is: how can we minimize compilation time while performing compile-time graph searches efficiently?

        + +

We design a two-step greedy algorithm as illustrated in Fig. 5. The first step in this process is to identify the target nodes, for which we follow certain rules, e.g., identifying all linear operations with the same input shapes. Once identified, we use a Breadth-First Search (BFS) strategy to separate these nodes into different sets, so that nodes within a set have no data dependencies on each other. The nodes within each of these sets are independent and can be fused horizontally.
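The sketch below is a simplified stand-in for that search: a greedy grouping over a torch.fx graph using precomputed ancestor sets instead of the exact BFS used in the compiler. The is_candidate predicate stands for whatever rule selects fusable nodes (e.g. linears with identical input shapes) and is left to the caller:

import torch.fx as fx

def independent_groups(graph: fx.Graph, is_candidate):
    # For every node, precompute the set of nodes it (transitively) depends on.
    ancestors = {}
    for node in graph.nodes:  # graph.nodes is already topologically ordered
        deps = set()
        for inp in node.all_input_nodes:
            deps.add(inp)
            deps |= ancestors[inp]
        ancestors[node] = deps

    # Greedily place each candidate into the first group it has no dependency
    # relationship with; members of one group can be fused horizontally.
    groups = []
    for node in graph.nodes:
        if not is_candidate(node):
            continue
        for group in groups:
            if all(n not in ancestors[node] and node not in ancestors[n] for n in group):
                group.append(node)
                break
        else:
            groups.append([node])
    return groups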

        + +

        Fig.5 Process of model transformation with graph IR.

        + +

        Fig. 5: Process of model transformation with graph IR.

        + +

        With our approach, the search time is roughly 60 seconds for one of our largest internal models, which is manageable for on-the-fly tasks.

        + +

        In the End

        + +

        In our tests with internal ranking models, we observed approximately 5% to 15% training performance improvement across five models on top of the performance gain brought by torch.compile. We have enabled the optimization in PT2 compiler stack and landed it as default when users choose Inductor as the backend (config). We expect our generalized transformation approach could benefit models beyond Meta, and look forward to more discussion and improvement through this compiler level transformation framework.

        + +

        Acknowledgements

        + +

        Many thanks to Mark Saroufim, Gregory Chanan, Adnan Aziz, and Rocky Liu for their detailed and insightful reviews.

diff --git a/blog/empowering-pytorch-on-intel-xeon-scalable-processors-with-bfloat16/index.html b/blog/empowering-pytorch-on-intel-xeon-scalable-processors-with-bfloat16/index.html new file mode 100644 index 000000000000..c01a1fb2623a --- /dev/null +++ b/blog/empowering-pytorch-on-intel-xeon-scalable-processors-with-bfloat16/index.html @@ -0,0 +1,722 @@
Empowering PyTorch on Intel® Xeon® Scalable processors with Bfloat16 | PyTorch
by Mingfei Ma (Intel), Vitaly Fedyunin (Meta), Wei Wei (Meta)

        Overview

        + +

In recent years, the growing complexity of AI models has placed ever-increasing demands on hardware compute capability. Reduced-precision numeric formats have been proposed to address this problem. Bfloat16 is a custom 16-bit floating point format for AI consisting of one sign bit, eight exponent bits, and seven mantissa bits. With the same dynamic range as float32, bfloat16 does not require special handling such as loss scaling. Therefore, bfloat16 is a drop-in replacement for float32 when running deep neural networks for both inference and training.
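A quick way to see the "same dynamic range, lower precision" trade-off for yourself is to compare the numeric limits PyTorch reports for the two dtypes:

import torch

print(torch.finfo(torch.float32).max)   # ~3.4e38
print(torch.finfo(torch.bfloat16).max)  # ~3.4e38  (same dynamic range)
print(torch.finfo(torch.float32).eps)   # ~1.19e-07
print(torch.finfo(torch.bfloat16).eps)  # ~7.81e-03 (much coarser precision)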

        + +

The 3rd Gen Intel® Xeon® Scalable processor (codenamed Cooper Lake) is the first general purpose x86 CPU with native bfloat16 support. Three new bfloat16 instructions were introduced in Intel® Advanced Vector Extensions-512 (Intel® AVX-512): VCVTNE2PS2BF16, VCVTNEPS2BF16, and VDPBF16PS. The first two perform conversion from float32 to bfloat16, and the last performs a dot product of bfloat16 pairs. Bfloat16 theoretical compute throughput is doubled over float32 on Cooper Lake. On the next generation of Intel® Xeon® Scalable Processors, bfloat16 compute throughput will be further enhanced through the Advanced Matrix Extensions (Intel® AMX) instruction set extension.

        + +

Intel and Meta previously collaborated to enable bfloat16 in PyTorch, and the related work was published in an earlier blog during the launch of Cooper Lake. In that blog, we introduced the hardware advancements for native bfloat16 support and showcased a 1.4x to 1.6x performance boost of bfloat16 over float32 on DLRM, ResNet-50 and ResNeXt-101-32x4d.

        + +

In this blog, we introduce the latest software enhancements for bfloat16 in PyTorch 1.12, which apply to a much broader scope of user scenarios and deliver even higher performance gains.

        + +

        Native Level Optimization on Bfloat16

        + +

On the PyTorch CPU bfloat16 path, the compute-intensive operators, e.g., convolution, linear and bmm, use oneDNN (oneAPI Deep Neural Network Library) to achieve optimal performance on Intel CPUs with AVX512_BF16 or AMX support. The other operators, such as tensor and neural network operators, are optimized at the PyTorch native level. We have extended bfloat16 kernel-level optimizations to the majority of operators on dense tensors, applicable to both inference and training (sparse tensor bfloat16 support will be covered in future work), specifically:

        + +
• Bfloat16 vectorization: bfloat16 is stored as an unsigned 16-bit integer, so it must be cast to float32 for arithmetic operations such as add, mul, etc. Specifically, each bfloat16 vector is converted to two float32 vectors, processed accordingly, and then converted back. For non-arithmetic operations such as cat, copy, etc., it is a straight memory copy and no data type conversion is involved.
• Bfloat16 reduction: reductions on bfloat16 data use float32 as the accumulation type to guarantee numerical stability, e.g., sum, BatchNorm2d, MaxPool2d, etc.
• Channels Last optimization: for vision models, Channels Last is the preferable memory format over Channels First from a performance perspective. We have implemented fully optimized CPU kernels for all commonly used CV modules in the channels last memory format, covering both float32 and bfloat16.
        + +

        Run Bfloat16 with Auto Mixed Precision

        + +

To run a model in bfloat16, users can either explicitly convert the data and model to bfloat16, for example:

        + +
        # with explicit conversion
        +input = input.to(dtype=torch.bfloat16)
        +model = model.to(dtype=torch.bfloat16)
        +
        + +

or utilize the torch.amp (Automatic Mixed Precision) package. The autocast instance serves as a context manager or decorator that allows regions of your script to run in mixed precision, for example:

        + +
        # with AMP
        +with torch.autocast(device_type="cpu", dtype=torch.bfloat16):
        +    output = model(input)
        +
        + +

Generally, the explicit conversion approach and the AMP approach have similar performance. Even so, we recommend running bfloat16 models with AMP (a minimal training-loop sketch follows the list below), because:

        + +
• Better user experience with automatic fallback: if your script includes operators that don’t have bfloat16 support, autocast implicitly falls back to float32 for them, whereas an explicitly converted model will raise a runtime error.
• Mixed data types for activations and parameters: unlike explicit conversion, which converts all model parameters to bfloat16, AMP mode runs with mixed data types. Specifically, inputs/outputs are kept in bfloat16 while parameters, e.g., weights and biases, are kept in float32. Mixing the data types of activations and parameters helps improve performance while maintaining accuracy.
        + +
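Putting this together, here is a minimal, self-contained training-loop sketch (with a toy model and synthetic data, not taken from the original post) showing bfloat16 AMP on CPU; note that no GradScaler or loss scaling is needed because bfloat16 retains float32's dynamic range.

import torch

# Toy model and synthetic data; in practice these come from your own training setup.
model = torch.nn.Sequential(torch.nn.Linear(64, 128), torch.nn.ReLU(), torch.nn.Linear(128, 10))
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
criterion = torch.nn.CrossEntropyLoss()

for step in range(10):
    inputs = torch.randn(32, 64)
    targets = torch.randint(0, 10, (32,))
    optimizer.zero_grad()
    # The forward pass runs under CPU autocast in bfloat16; no GradScaler /
    # loss scaling is needed because bfloat16 keeps float32's dynamic range.
    with torch.autocast(device_type="cpu", dtype=torch.bfloat16):
        loss = criterion(model(inputs), targets)
    loss.backward()     # backward runs outside the autocast region, as recommended
    optimizer.step()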

        Performance Gains

        + +

We benchmarked the inference performance of TorchVision models on an Intel® Xeon® Platinum 8380H CPU @ 2.90GHz (codenamed Cooper Lake), with a single instance per socket (batch size = 2 x number of physical cores). Results show that bfloat16 achieves a 1.4x to 2.2x performance gain over float32.

        + +

        + +

        + +

        The performance boost of bfloat16 over float32 primarily comes from 3 aspects:

        + +
• The compute-intensive operators take advantage of the new bfloat16 native instruction VDPBF16PS, which doubles the hardware compute throughput.
• Bfloat16 has only half the memory footprint of float32, so memory-bandwidth-bound operators can theoretically run twice as fast.
• With Channels Last, we intentionally keep the same parallelization scheme for all memory-format-aware operators (this is not possible with Channels First), which increases data locality when passing each layer’s output to the next: the data stays close to the CPU cores and tends to remain in cache, and bfloat16 achieves a higher cache hit rate than float32 in this scenario thanks to its smaller memory footprint. A short sketch combining Channels Last with bfloat16 autocast follows this list.
        + +
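As a concrete illustration of the Channels Last point above, here is a minimal inference sketch, assuming a TorchVision CNN as a stand-in model, that combines the channels-last memory format with CPU autocast in bfloat16.

import torch
import torchvision

# Illustrative model; any CNN works. Channels-last + CPU autocast gives the
# bfloat16 inference configuration discussed above.
model = torchvision.models.resnet50(weights=None).eval()
model = model.to(memory_format=torch.channels_last)

x = torch.randn(32, 3, 224, 224).to(memory_format=torch.channels_last)

with torch.no_grad(), torch.autocast(device_type="cpu", dtype=torch.bfloat16):
    out = model(x)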

        Conclusion & Future Work

        + +

In this blog, we introduced the recent software optimizations for bfloat16 added in PyTorch 1.12. Results on the 3rd Gen Intel® Xeon® Scalable processor show that bfloat16 achieves a 1.4x to 2.2x performance gain over float32 on TorchVision models. Further improvement is expected on the next generation of Intel® Xeon® Scalable Processors with AMX instruction support. Though the performance numbers in this blog were collected with TorchVision models, the benefit is broad across all topologies, and we will continue to extend the bfloat16 optimization effort to a broader scope in the future!

        + +

        Acknowledgement

        + +

The results presented in this blog are a joint effort of the Meta and Intel PyTorch teams. Special thanks to Vitaly Fedyunin and Wei Wei from Meta, who spent precious time and gave substantial assistance! Together we have taken one more step on the path of improving the PyTorch CPU ecosystem.

        + +

diff --git a/blog/enhancing-deep-learning/index.html b/blog/enhancing-deep-learning/index.html
new file mode 100644
index 000000000000..9ecf2e984697
--- /dev/null
+++ b/blog/enhancing-deep-learning/index.html
@@ -0,0 +1,742 @@

Enhancing Deep Learning Workflows: PyTorch Ecosystem Tools | PyTorch
by Team PyTorch

        Welcome to the thriving PyTorch ecosystem, where a wealth of tools and libraries await, purpose-built to elevate your experience in deep learning as a developer or researcher. The Ecosystem Tools pages host many projects from experts spanning academia, industry, application development, and machine learning.

        + +

        Initially, PyTorch aimed to establish a thriving community, enabling developers to access each other’s tools, engage in meaningful discussions, and explore the wealth of resources available within the community.

        + +

        Today, the PyTorch ecosystem has grown to feature over 100 projects tailored to your needs, providing robust support, enhanced speed, and effortless integration with PyTorch. If your project aligns with our mission, we invite you to submit it and join this dynamic ecosystem.

        + +

        New this month, we’ve moved all of our Ecosystem blogs over to our PyTorch.org website to host a space where our community can show off the latest innovations with our users. Read on to hear about the latest projects in the ecosystem!

        + +

        Explore the Latest Tools and Frameworks in the Ecosystem

        + +

        As we continue into 2024, we’re thrilled to showcase an impressive array of ecosystem tools that significantly enrich the PyTorch community. These tools cover a wide range of domains, including pose estimation, profiling, and even quantum computing. Let’s explore each one to witness firsthand how they are reshaping the PyTorch landscape, opening up exciting possibilities for developers.

        + +

        Anomalib

        + +

        Anomalib is a deep learning library that aims to collect state-of-the-art anomaly detection algorithms for benchmarking on both public and private datasets. Anomalib provides several ready-to-use implementations of anomaly detection algorithms described in the recent literature, as well as a set of tools that facilitate the development and implementation of custom models. The library has a strong focus on image-based anomaly detection, where the goal of the algorithm is to identify anomalous images, or anomalous pixel regions within images in a dataset. Anomalib is constantly updated with the latest algorithms and training/inference extensions.

        + +

        Diffusers

        + +

        Diffusers is the go-to library for state-of-the-art pretrained diffusion models for generating images, audio, and even 3D structures of molecules. Whether you’re looking for a simple inference solution or training your own diffusion models, Diffusers is a modular toolbox that supports both.
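As a quick illustration of the inference path (the checkpoint name below is just one public example and not from the original post; any diffusers-compatible model id works):

import torch
from diffusers import DiffusionPipeline

# "runwayml/stable-diffusion-v1-5" is an illustrative public checkpoint.
pipe = DiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16)
pipe = pipe.to("cuda")  # optional; the pipeline also runs on CPU with the default dtype

image = pipe("an astronaut riding a horse on the moon").images[0]
image.save("astronaut.png")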

        + +

        Pomegranate

        + +

        Pomegranate is a versatile machine learning library that integrates seamlessly with PyTorch. It provides a wide range of probabilistic models and tools for probabilistic modeling tasks. Pomegranate empowers users to build complex models such as hidden Markov models (HMMs), Bayesian networks, and Gaussian mixture models (GMMs). By combining the strengths of PyTorch and Pomegranate, developers can leverage the power of deep learning and probabilistic modeling to tackle various machine learning challenges.

        + +

        PyPose

        + +

        PyPose is a PyTorch-based library designed for pose estimation tasks. With PyPose, developers can efficiently train and deploy models for human pose estimation, a fundamental computer vision problem. By leveraging PyTorch’s flexibility and performance, PyPose simplifies the process of building accurate pose estimation models. Its intuitive APIs and pre-trained models make it an excellent choice for researchers and developers exploring human pose estimation applications.

        + +

        PyPOTS

        + +

PyPOTS is a Python toolbox/library for data mining on partially-observed time series with PyTorch, including SOTA models that support imputation, classification, clustering, and forecasting on incomplete (irregularly sampled) multivariate time series with missing values.

        + +

        OctoML Profiler

        + +

        OctoML Profiler is a performance profiling tool that aids in optimizing PyTorch models. This tool helps developers identify performance bottlenecks and inefficiencies within their deep learning models. By providing insights into memory usage, compute time, and data movement, the OctoML Profiler enables developers to fine-tune their models for improved efficiency. With this valuable feedback, developers can optimize their models for deployment on various hardware platforms.

        + +

        Open Compass

        + +

OpenCompass is a one-stop platform for large model evaluation, aiming to provide a fair, open, and reproducible benchmark for large model evaluation. Its main features include comprehensive support for models and datasets, efficient distributed evaluation, diversified evaluation paradigms, a modular design with high extensibility, and an experiment management and reporting mechanism.

        + +

        Renate

        + +

        Renate is a PyTorch-based library for neural architecture search (NAS). It simplifies the process of automatically searching for optimal neural network architectures tailored to specific tasks. Renate leverages techniques like reinforcement learning and evolutionary algorithms to efficiently explore the architecture space. By using Renate, developers can save significant time and resources while discovering highly performant models.

        + +

        RoMa

        + +

        RoMa is a standalone library to handle rotation representations with PyTorch (rotation matrices, quaternions, rotation vectors, etc). It aims for robustness, ease-of-use, and efficiency.

        + +

        Substra

        + +

        Substra is an open source federated learning (FL) software. It enables the training and validation of machine learning models on distributed datasets. It provides a flexible Python interface and a web application to run federated learning training at scale. Substra’s main usage is in production environments. It has already been deployed and used by hospitals and biotech companies. Substra can also be used on a single machine to perform FL simulations and debug code.

        + +

        TorchQuantum

        + +

        TorchQuantum is a powerful library that combines the PyTorch framework with quantum computing concepts. It enables developers to explore quantum machine learning algorithms and build hybrid classical-quantum models. By integrating the principles of quantum computing into PyTorch, TorchQuantum opens up new possibilities for solving complex problems that traditional deep learning approaches may struggle with.

        + +

        TIAToolbox

        + +

        The TIAToolbox (Text-Image-Augmentation Toolbox) is a PyTorch library designed to augment text and image data for deep learning tasks. It offers a comprehensive set of tools for data augmentation, including transformations, noise injection, and image/text synthesis. By applying TIAToolbox, developers can enrich their training datasets, improve model generalization, and enhance the robustness of their deep learning models.

        + +

        torchdistill

        + +

        torchdistill is a coding-free framework built on PyTorch for reproducible deep learning and knowledge distillation studies. The framework is designed to enable users to design experiments by declarative PyYAML configuration files and supports high-level module abstractions.

        + +

        TorchOpt

        + +

        TorchOpt is a PyTorch library focused on optimization algorithms for deep learning. It provides a collection of state-of-the-art optimization techniques, such as stochastic gradient descent (SGD) variants, adaptive learning rate methods, and optimization schedules. TorchOpt empowers developers to fine-tune their models efficiently, converge faster, and achieve better performance in various deep learning tasks.

        + +

        USB

        + +

        USB, or Unified Speech-to-Text Benchmark, is a PyTorch-based toolkit for training and evaluating speech recognition models. It provides standardized datasets and evaluation metrics to facilitate fair and accurate comparisons between different speech recognition architectures. By using USB, researchers and developers can benchmark their models against state-of-the-art systems and drive advancements in the field of automatic speech recognition.

        + +

        Zeus

        + +

        Zeus is the current state-of-the-art in deep learning energy measurement and optimization. It has monitor components that allow users to measure GPU energy consumption and optimizer components that automatically optimize DNN or GPU knobs based on measurements from the monitor component.

        + +

        Be Part of Our Ecosystem

        + +

Our diverse ecosystem tools are instrumental in PyTorch’s success. They provide essential support for tasks such as pose estimation, probabilistic modeling, performance profiling, model interpretability, speech recognition, quantum computing, data augmentation, optimization, and neural architecture search.

        + +

        Leveraging these tools empowers developers and researchers to accelerate their deep learning workflows and unlock new possibilities in the field of AI.

        + +

        Have a tool that would be a good fit for the PyTorch Ecosystem? If you can answer the below questions, we’d love for you to submit your tool for review.

        + +
1. Does your project complement PyTorch, enhancing user experience, introducing new capabilities, or accelerating training and inference processes?
   • Examples could include visualization tools, a kernel library or a framework that sits on top to enable research in a particular area such as NLP.
2. Is the project ready for broad developer usage?
   • For example, is the project stable, will it be maintained, and is there adequate supporting infrastructure, documentation, and technical support to allow a developer to successfully use it?
        + +

        Thank you to all of our contributors and collaborators in our ecosystem! Here’s to a great 2024.

diff --git a/blog/executorch-alpha/index.html b/blog/executorch-alpha/index.html
new file mode 100644
index 000000000000..1460693d780a
--- /dev/null
+++ b/blog/executorch-alpha/index.html
@@ -0,0 +1,692 @@

ExecuTorch Alpha: Taking LLMs and AI to the Edge with Our Community and Partners | PyTorch
by Team PyTorch

We are excited to announce the release of ExecuTorch alpha, focused on deploying large language models (LLMs) and large ML models to the edge, stabilizing the API surface, and improving our installation processes. It has been an exciting few months since our 0.1 (preview) release, in collaboration with our partners at Arm, Apple, and Qualcomm Technologies, Inc.

        + +

        In this post we’ll discuss our full support for Meta’s Llama 2, early support for Meta’s Llama 3, broad model support in ExecuTorch, and highlight the important work our partners have done to move us forward.

        + +

        Large Language Models on Mobile

        + +

        Mobile devices are highly constrained for compute, memory, and power. To bring LLMs to these devices, we heavily leverage quantization and other techniques to pack these models appropriately.

        + +

ExecuTorch alpha supports 4-bit post-training quantization using GPTQ. We’ve provided broad device support on CPU by landing dynamic shape support and new dtypes in XNNPACK. We’ve also made significant improvements in export and lowering, reduced memory overhead, and improved runtime performance. This enables running Llama 2 7B efficiently on iPhone 15 Pro, iPhone 15 Pro Max, Samsung Galaxy S22, S23, and S24 phones and other edge devices. Early support for Llama 3 8B is also included. We are continually improving tokens/sec on various edge devices, and you can visit GitHub for the latest performance numbers.
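To make the export-and-lowering flow above more concrete, here is a rough sketch, based on the ExecuTorch tutorials at the time, of exporting a toy module and delegating it to XNNPACK. The module paths and helper names (e.g. XnnpackPartitioner) are assumptions that may differ between releases, so treat this as an outline rather than the definitive API.

import torch
from executorch.exir import to_edge
# Assumed import path; check the ExecuTorch docs for the exact location in your release.
from executorch.backends.xnnpack.partition.xnnpack_partitioner import XnnpackPartitioner

class SmallModel(torch.nn.Module):
    def forward(self, x):
        return torch.nn.functional.relu(x @ x.T)

example_inputs = (torch.randn(4, 8),)

exported = torch.export.export(SmallModel(), example_inputs)  # capture the model
edge = to_edge(exported)                                      # lower to the Edge dialect
edge = edge.to_backend(XnnpackPartitioner())                  # delegate supported subgraphs to XNNPACK
et_program = edge.to_executorch()                             # prepare the program for the on-device runtime

with open("small_model.pte", "wb") as f:
    f.write(et_program.buffer)                                # serialize to a .pte file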

        + +

        We’re working closely with our partners at Apple, Arm, and Qualcomm Technologies to delegate to GPU and NPU for performance through Core ML, MPS, TOSA, and Qualcomm AI Stack backends respectively.

        + +

        Supported Models

        + +

        We remain committed to supporting an ever-expanding list of models with ExecuTorch. Since preview, we have significantly expanded our tested models across NLP, vision and speech, with full details in our release notes. Although support for on-device LLMs is early, we anticipate most traditional models to function seamlessly out of the box, with delegation to XNNPACK, Core ML, MPS, TOSA, and HTP for performance. If you encounter any problems please open a GitHub issue with us.

        + +

        Productivity

        + +

Deploying performant models tuned for specific platforms often requires deep visibility into the on-device runtime data to determine the right changes to make in the original PyTorch model. With ExecuTorch alpha, we provide a powerful SDK with observability throughout the process, from model authoring to deployment, including delegate and hardware-level information.

        + +

        The ExecuTorch SDK was enhanced to include better debugging and profiling tools. Because ExecuTorch is built on PyTorch, the debugging capabilities include the ability to map from operator nodes back to original Python source code for more efficient anomaly resolution and performance tuning for both delegated and non-delegated model instances. You can learn more about the ExecuTorch SDK here.

        + +

        Partnerships

        + +

        ExecuTorch has only been possible because of strong collaborations across Arm, Apple, and Qualcomm Technologies. The collaboration for the initial launch of ExecuTorch continues as we support LLMs and large AI models on the edge for PyTorch. As we’ve seen with this early work for ExecuTorch alpha, there are unique challenges with these larger models and we’re excited to develop in the open.

        + +

        We also want to highlight the great partnership with Google on XNNPACK for CPU performance. The teams continue to work together upstreaming our changes and across the TensorFlow and PyTorch teams to make sure we can all support generative AI models on the edge with SOTA performance.

        + +

        Lastly, our hardware partner MediaTek has been doing work enabling the Llama collection of models with ExecuTorch on their SoCs. We’ll have more to share in the future.

        + +

        Alpha and Production Usage

        + +

        With our alpha release, we have production-tested ExecuTorch. Meta is using ExecuTorch for hand tracking on Meta Quest 3 and a variety of models on Ray-Ban Meta Smart Glasses. In addition, we have begun the rollout of ExecuTorch with Instagram and are integrating with other Meta products. We are excited to see how ExecuTorch can be used for other edge experiences.

        + +

        Community

        + +

        We are excited to see various efforts in the community to adopt or contribute to ExecuTorch. For instance, Unity recently shared their work at the Game Developers Conference (GDC) on leveraging ExecuTorch and Edge IR to run PyTorch models with their neural network inference library Sentis. Leveraging ExecuTorch’s hackability and extensibility, Unity introduced their own custom backend that serializes ExecuTorch’s Edge Dialect IR into Sentis’ native serialized format enabling developers to begin using PyTorch models easily in their games and apps.

        + +

        We’ve been building and innovating with ExecuTorch in the open. Our north star is to empower the community to deploy any ML model on edge devices painlessly and efficiently. Whether you are a hobbyist or this is your day job, we’d love for you to jump in to bring your ML models to the edge. We are looking for your help to:

        + +
1. Use ExecuTorch to run your LLM models locally on various deployment targets and share your feedback
2. Expand our supported models, including bug reports
3. Expand our quantization schemes
4. Help us build out delegates to GPU and NPU
        + +

        To all individual contributors and early adopters of ExecuTorch, a big thank you as well. We can’t wait to have more of you join us!

diff --git a/blog/executorch-beta/index.html b/blog/executorch-beta/index.html
new file mode 100644
index 000000000000..f9e8051bf036
--- /dev/null
+++ b/blog/executorch-beta/index.html
@@ -0,0 +1,710 @@

ExecuTorch Beta: On-Device AI and LLMs, Stability, and Acceleration with Partners | PyTorch

by Team PyTorch
• ExecuTorch has achieved Beta status with the release of v0.4, providing stable APIs and runtime, as well as extensive kernel coverage.
• ExecuTorch is the recommended on-device inference engine for Llama 3.2 1B/3B models, offering enhanced performance and memory efficiency for both original and quantized models.
• There has been a significant increase in adoption and ecosystem growth for ExecuTorch, and the focus is now on improving reliability, performance, and coverage for non-CPU backends as the next steps.
        + +

        Current On-Device AI Market

        + +

The on-device AI market has been rapidly expanding and is revolutionizing the way we interact with technology. It is unlocking new experiences, enabling personalization, and reducing latency. Traditionally, computer vision and speech recognition have been the primary use cases for on-device AI, particularly in IoT, industrial applications, and mobile devices. However, the emergence of Large Language Models (LLMs) has made Generative AI the fastest growing sector in AI, subsequently highlighting the importance of on-device Generative AI. IDC forecasts that by 2028, close to 1 billion GenAI-capable smartphones will be shipped worldwide.

        + +

        LLMs are not only getting smaller but more powerful. This has led to the creation of a new class of applications that leverage multiple models for intelligent agents and streamlined workflows. The community is rapidly adopting and contributing to these new models, with quantized versions being created within hours of model release. Several leading technology companies are investing heavily in small LLMs, even deploying Low-Rank Adaptation (LoRA) at scale on-device to transform user experiences.

        + +

        However, this rapid progress comes at a cost. The fragmentation of our on-device AI landscape creates complexity and inefficiency when going from model authoring to edge deployment. This is where PyTorch’s ExecuTorch comes in – our Beta announcement marks an important milestone in addressing these challenges and empowering developers to create innovative, AI-powered applications.

        + +

        What’s New Today

        + +

        It’s been exactly one year since we first open sourced ExecuTorch, six months since Alpha release, and today, we’re excited to announce three main developments:

        + +

        1. Beta. ExecuTorch has reached Beta status starting from v0.4! It is now widely adopted and used in production environments across Meta. Through this adoption process we’ve identified and addressed feature gaps, improved stability, and expanded kernel and accelerator coverage. These improvements make us confident to promote ExecuTorch from Alpha to Beta status, and we are happy to welcome the community to adopt it in their own production settings. Here are three concrete enhancements:

        + +
1. Developers can write application code and include the latest ExecuTorch as a dependency, updating when needed with a clean API contract. This is possible due to our API stabilization efforts, as well as our explicit API lifecycle and backwards compatibility policy.
2. Running ExecuTorch on CPUs reached the necessary performance, portability and coverage. In particular, we have implemented more than 85% of all core ATen operators as part of our portable CPU kernels library to ensure running a model on ExecuTorch just works in most cases, making missing ops an exception rather than the norm. Moreover, we integrated and extensively tested our XNNPACK delegate for high performance on a wide range of CPU architectures. It is used in a number of production cases today.
3. In addition to the low-level ExecuTorch components for greater portability, we built extensions and higher-level abstractions to support more common use-cases, such as developer tooling for on-device debugging and profiling, and the Module.h extension to simplify deployment for mobile devices.
        + +

        2. On-Device Large-Language Models (LLMs). There has been a growing interest in the community to deploy Large Language Models (LLMs) on edge devices, as it offers improved privacy and offline capabilities. However, these models are quite large, pushing the limits of what is possible. Fortunately, ExecuTorch can support these models, and we’ve enhanced the overall framework with numerous optimizations.

        + +
          +
        • ExecuTorch is the recommended framework to run latest Llama models on-device with excellent performance today. The Llama 3.2 1B/3B models are well-suited for mobile deployment, and it is especially true with the official quantized 1B/3B model releases from Meta, as it provides a great balance between performance, accuracy, and size. When deploying Llama 3.2 1B/3B quantized models, decode latency improved by 2.5x and prefill latency improved by 4.2x on average, while model size decreased by 56% and memory usage reduced by 41% on average when benchmarked on Android OnePlus 12 device (we’ve also verified similar relative performance on Samsung S24+ for 1B and 3B, and Samsung S22 for 1B). For Llama 3.2 1B quantized model, for example, ExecuTorch is able to achieve 50.2 tokens/s for decoding and 260 tokens/s for prefill on the OnePlus 12, using the latest CPU kernels from XNNPACK and Kleidi libraries. These quantized models allow developers to integrate LLMs into memory and power-constrained devices while still maintaining quality and safety.
        • +
        • One of the value propositions of ExecuTorch is being able to use accelerators on mobile devices seamlessly. In fact, ExecuTorch also showcased accelerators to achieve even greater performance running Llama across Apple MPS backend, Qualcomm AI Accelerator, and MediaTek AI Accelerator.
        • +
        • There has been growing community and industry interest in multimodal and beyond text-only LLMs, evidenced by Meta’s Llama 3.2 11B/90B vision models and open-source models like Llava. We have so far enabled Llava 1.5 7B model on phones via ExecuTorch, making many optimizations, notably reducing runtime memory from 11GB all the way down to 5GB.
        • +
        + +

        3. Ecosystem and Community Adoption
Now that ExecuTorch is in Beta, it is mature enough to be used in production. It is being increasingly used at Meta across various product surfaces. For instance, ExecuTorch already powers various ML inference use cases across Meta’s Ray-Ban Meta Smart Glasses and Quest 3 VR headsets as well as Instagram and WhatsApp.

        + +

We also partnered with Hugging Face to provide native ExecuTorch support for models exported using torch.export. This collaboration ensures that exported artifacts can be directly lowered and run efficiently on various mobile and edge devices. Models like gemma-2b and phi3-mini are already supported, and support for more foundational models is in progress.
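For context, the sketch below shows the torch.export entry point that this integration builds on: a module is captured into an ExportedProgram, which ExecuTorch can then lower for on-device execution (the toy module is illustrative, not from the post).

import torch

class TinyClassifier(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.linear = torch.nn.Linear(16, 4)

    def forward(self, x):
        return torch.softmax(self.linear(x), dim=-1)

example_inputs = (torch.randn(1, 16),)
exported_program = torch.export.export(TinyClassifier(), example_inputs)
print(exported_program)  # ATen-level graph plus signatures, the input to ExecuTorch lowering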

        + +

        With stable APIs and Gen AI support, we’re excited to build and grow ExecuTorch with the community. The on-device AI community is growing rapidly and finding ways to adopt ExecuTorch across various fields. For instance, ExecuTorch is being used in a mobile app built by Digica to streamline inventory management in hospitals. As another example, Software Mansion developed an app, EraserAI, to remove unwanted objects from a photo with EfficientSAM running on-device with ExecuTorch via Core ML delegate.

        + +

        Towards General Availability (GA):
Since the original release of ExecuTorch alpha, we’ve seen growing interest within the community in using ExecuTorch in various production environments. To that end, we have made great progress towards more stable and mature APIs and have made a significant investment in community support, adoption, and contribution to ExecuTorch. As we get close to GA, we are investing our efforts in the following areas:

        + +
          +
        • +

          Non-CPU backends: Bringing non-CPU backends to even greater robustness, coverage and performance is our next goal. From day one of our original launch, we have partnered with Apple (for Core ML and MPS), Arm (for EthosU NPU) and Qualcomm (for Hexagon NPU) on accelerator integration with ExecuTorch, and we’ve since then expanded our partnership to MediaTek (NPU) and Cadence (XTensa DSP). We’re also building Vulkan GPU integration in-house. In terms of feature coverage, we’ve successfully implemented the core functionalities with our partners, ensured seamless integration with our developer tooling, and showcased successful LLM integration with many of the accelerators. Our next big step is to thoroughly validate the performance and reliability of the system in real-world, production use-cases. This stage will help us fine-tune the experience and ensure the stability needed for smooth operations.

          +
        • +
        • +

          Benchmarking infra: As part of our ongoing testing efforts, we’ve developed a benchmarking infrastructure along with a public dashboard to showcase our progress toward on-device model inference benchmarking. This allows us to transparently track and display model coverage across various backends, giving our community real-time insights into how we’re advancing towards our goals.

          +
        • +
        + +

        We’re excited to share these developments with you and look forward to continued improvements in collaboration with our partners and the community! We welcome community contribution to help us make ExecuTorch the clear choice for deploying AI and LLM models on-device. We invite you to start using ExecuTorch in your on-device projects, or even better consider contributing to it. You can also report any issues on our GitHub page.

diff --git a/blog/experience-power-pytorch-2.0/index.html b/blog/experience-power-pytorch-2.0/index.html
new file mode 100644
index 000000000000..9d3b30af83f4
--- /dev/null
+++ b/blog/experience-power-pytorch-2.0/index.html
@@ -0,0 +1,673 @@

Experience the power of PyTorch 2.0 on AMD Solutions | PyTorch

by AMD

        PyTorch 2.0 represents a significant step forward for the PyTorch machine learning framework. The stable release of PyTorch 2.0 brings new features that unlock even higher performance, while remaining backward compatible with prior releases and retaining the Pythonic focus which has helped to make PyTorch so enthusiastically adopted by the AI/ML community. AMD has long been a strong proponent of PyTorch, and we are delighted that the PyTorch 2.0 stable release includes support for AMD Instinct™ and Radeon™ GPUs that are supported by the ROCm™ software platform.

        + +

With the stable release, PyTorch 2.0 introduces torch.compile as a beta feature underpinned by TorchInductor, with support for AMD Instinct and Radeon GPUs through the OpenAI Triton deep learning compiler. Through TorchInductor, developers can now generate low-level kernels using Triton that are portable and comparable in performance to hand-written kernels built on native, hardware-centric kernel programming models.
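As a concrete illustration, the snippet below compiles a model with the default TorchInductor backend; on a ROCm build of PyTorch, AMD GPUs are exposed through the usual "cuda" device, so no code changes are required compared to other GPUs (the model choice is just an example).

import torch
import torchvision

model = torchvision.models.resnet50(weights=None).eval().to("cuda")  # AMD GPU via the ROCm build
compiled_model = torch.compile(model)                                # TorchInductor generates Triton kernels

x = torch.randn(16, 3, 224, 224, device="cuda")
with torch.no_grad():
    out = compiled_model(x)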

        + +

        OpenAI Triton is a language and compiler for blocked algorithms, which aims to provide an abstraction layer between CUDA/HIP and Torch at which developers can write efficient kernels more productively. We have written a new backend which interfaces Triton’s custom MLIR dialects with our ROCm compiler stack.

        + +

Triton can automatically optimize kernels generated by machine learning compilers such as TorchInductor for multiple AI accelerators, including AMD Instinct GPU accelerators, by leveraging hardware-specific features of the AMD CDNA™ GPU architecture. This makes it easy for developers and users to switch seamlessly from any hardware to AMD Instinct GPU accelerators and get great out-of-the-box performance.

        + +

        In addition, compilers like Triton can also enable developers to use high-level programming languages, such as Python, to write machine learning code that can be efficiently compiled and executed on specialized hardware. This can help greatly improve the productivity of machine learning developers, as they can focus on the algorithmic aspects of their models and rely on the compiler to generate efficient code.

        + +

        By design, PyTorch 2.0 is backward compatible to earlier PyTorch releases. This holds true for the ROCm build of PyTorch 2.0 as well. Developers using PyTorch with AMD GPUs can migrate to PyTorch 2.0 with the confidence that their existing code will continue to work without any required changes, so there is no penalty to access the improvements that come with this release. On the other hand, using PyTorch 2.0 and TorchInductor can result in significant performance improvement over the default eager-mode as shown below.

        + +

The initial results using AMD Instinct MI250 GPUs already show a strong performance improvement with minimal optimization of TorchInductor compared to the default eager mode. We see an average performance increase of up to 1.54x on 44 out of the 45 models in the HuggingFace benchmark suite, with CamemBert, DistilGPT2 and T5Small being a few of the standout models, delivering up to 1.5x or more performance improvement over eager mode. We look forward to continued engagement with members of the PyTorch team at Meta to enable further optimization of the ROCm software stack and additional performance improvements in future PyTorch releases.

        + +

        Image 1: AMD MI250 GPU performance improvement for TorchInductor vs eager-mode using HuggingFace MI200-89.

        + +

        PyTorch 2.0 follows the same set of install options as before to build and install for supporting AMD GPUs. These include an installable Python package hosted at pytorch.org, AMD’s public PyTorch docker image, and of course the option to build from source using the upstream PyTorch repository. As with PyTorch builds for other platforms, the specific command line to be run for pip-based install is provided by the configurator at https://pytorch.org/get-started/locally/.

        + +

        The GPUs supported by the ROCm software platform which forms the basis for PyTorch support on AMD GPUs are documented at https://docs.amd.com/bundle/Hardware_and_Software_Reference_Guide/page/Hardware_and_Software_Support.html

        + +

        Conclusion

        + +

        PyTorch 2.0 represents a major step in continuing to broaden support for ML developers by increasing performance while maintaining a simple, Pythonic interface. This performance uplift is made possible in large part by the new TorchInductor infrastructure, which in turn harnesses the Triton ML programming language and just-in-time compiler. AMD’s support for these technologies allows users to realize the full promise of the new PyTorch architecture. Our GPU support in PyTorch 2.0 is just one manifestation of a larger vision around AI and machine learning. AI/ML plays an important role in multiple AMD product lines, including Instinct and Radeon GPUs, Alveo™ data center accelerators, and both Ryzen™ and EPYC processors. These hardware and software initiatives are all part of AMD’s Pervasive AI vision, and we look forward to addressing the many new challenges and opportunities of this dynamic space.

        + +

        MI200-89 – PyTorch Inductor mode HuggingFace Transformers training speedup, running the standard PyTorch 2.0 test suite, over PyTorch eager-mode comparison based on AMD internal testing on a single GCD as of 3/10/2023 using a 2P AMD EPYC™ 7763 production server with 4x AMD Instinct™ MI250 (128GB HBM2e) 560W GPUs with Infinity Fabric™ technology; host ROCm™ 5.3, guest ROCm™ 5.4.4, PyTorch 2.0.0, Triton 2.0. Server manufacturers may vary configurations, yielding different results. Performance may vary based on factors including use of latest drivers and optimizations.

        + +

        © 2023 Advanced Micro Devices, Inc. All rights reserved. AMD, the AMD Arrow logo, AMD CDNA, AMD Instinct, EPYC, Radeon, ROCm, Ryzen, and combinations thereof are trademarks of Advanced Micro Devices, Inc. Other product names used in this publication are for identification purposes only and may be trademarks of their respective owners.

diff --git a/blog/extending-torchvisions-transforms-to-object-detection-segmentation-and-video-tasks/index.html b/blog/extending-torchvisions-transforms-to-object-detection-segmentation-and-video-tasks/index.html
new file mode 100644
index 000000000000..a48e4a05a95f
--- /dev/null
+++ b/blog/extending-torchvisions-transforms-to-object-detection-segmentation-and-video-tasks/index.html
@@ -0,0 +1,806 @@

Extending TorchVision’s Transforms to Object Detection, Segmentation & Video tasks | PyTorch

by Philip Meier, Victor Fomin, Vasilis Vryniotis, Nicolas Hug

        Note: A previous version of this post was published in November 2022. We have updated this post with the most up-to-date info, in view of the upcoming 0.15 release of torchvision in March 2023, jointly with PyTorch 2.0.

        + +

        TorchVision is extending its Transforms API! Here is what’s new:

        + +
          +
        • You can use them not only for Image Classification but also for Object Detection, Instance & Semantic Segmentation and Video Classification.
        • +
        • You can use new functional transforms for transforming Videos, Bounding Boxes and Segmentation Masks.
        • +
        + +

The API is completely backward compatible with the previous one and remains largely the same, to ease migration and adoption. We are now releasing this new API as Beta in the torchvision.transforms.v2 namespace, and we would love to get early feedback from you to improve its functionality. Please reach out to us if you have any questions or suggestions.

        + +

        Limitations of current Transforms

        + +

        The existing Transforms API of TorchVision (aka V1) only supports single images. As a result it can only be used for classification tasks:

        + +
        from torchvision import transforms
        +trans = transforms.Compose([
        +   transforms.ColorJitter(contrast=0.5),
        +   transforms.RandomRotation(30),
        +   transforms.CenterCrop(480),
        +])
        +imgs = trans(imgs)
        +
        + +

        The above approach doesn’t support Object Detection nor Segmentation. This limitation made any non-classification Computer Vision tasks second-class citizens as one couldn’t use the Transforms API to perform the necessary augmentations. Historically this made it difficult to train high-accuracy models using TorchVision’s primitives and thus our Model Zoo lagged by several points from SoTA.

        + +

        To circumvent this limitation, TorchVision offered custom implementations in its reference scripts that show-cased how one could perform augmentations in each task. Though this practice enabled us to train high accuracy classification, object detection & segmentation models, it was a hacky approach which made those transforms impossible to import from the TorchVision binary.

        + +

        The new Transforms API

        + +

        The Transforms V2 API supports videos, bounding boxes, and segmentation masks meaning that it offers native support for many Computer Vision tasks. The new solution is a drop-in replacement:

        + +
        import torchvision.transforms.v2 as transforms
        +
        +# Exactly the same interface as V1:
        +trans = transforms.Compose([
        +    transforms.ColorJitter(contrast=0.5),
        +    transforms.RandomRotation(30),
        +    transforms.CenterCrop(480),
        +])
        +imgs, bboxes, labels = trans(imgs, bboxes, labels)
        +
        + +

        The new Transform Classes can receive any arbitrary number of inputs without enforcing specific order or structure:

        + +
        # Already supported:
        +trans(imgs)  # Image Classification
        +trans(videos)  # Video Tasks
        +trans(imgs, bboxes, labels)  # Object Detection
        +trans(imgs, bboxes, masks, labels)  # Instance Segmentation
        +trans(imgs, masks)  # Semantic Segmentation
        +trans({"image": imgs, "box": bboxes, "tag": labels})  # Arbitrary Structure
        +
        +# Future support:
        +trans(imgs, bboxes, labels, keypoints)  # Keypoint Detection
        +trans(stereo_images, disparities, masks)  # Depth Perception
        +trans(image1, image2, optical_flows, masks)  # Optical Flow
        +trans(imgs_or_videos, labels)  # MixUp/CutMix-style Transforms
        +
        + +

        The Transform Classes make sure that they apply the same random transforms to all the inputs to ensure consistent results.
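A small sketch with dummy data (the names follow the v2 API introduced above) showing that one call applies the same randomly sampled parameters to every input:

import torch
import torchvision.transforms.v2 as transforms
from torchvision import datapoints

img = datapoints.Image(torch.randint(0, 256, (3, 64, 64), dtype=torch.uint8))
mask = datapoints.Mask((torch.arange(64 * 64).reshape(64, 64) % 2).to(torch.uint8))

trans = transforms.RandomRotation(30)
out_img, out_mask = trans(img, mask)  # one sampled angle, applied to both the image and the mask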

        + +

        The functional API has been updated to support all necessary signal processing kernels (resizing, cropping, affine transforms, padding etc) for all inputs:

        + +
        from torchvision.transforms.v2 import functional as F
        +
        +
        +# High-level dispatcher, accepts any supported input type, fully BC
        +F.resize(inpt, size=[224, 224])
        +# Image tensor kernel
        +F.resize_image_tensor(img_tensor, size=[224, 224], antialias=True) 
        +# PIL image kernel
        +F.resize_image_pil(img_pil, size=[224, 224], interpolation=BILINEAR)
        +# Video kernel
        +F.resize_video(video, size=[224, 224], antialias=True) 
        +# Mask kernel
        +F.resize_mask(mask, size=[224, 224])
        +# Bounding box kernel
        +F.resize_bounding_box(bbox, size=[224, 224], spatial_size=[256, 256])
        +
        + +

Under the hood, the API uses Tensor subclassing to wrap the input, attach useful meta-data and dispatch to the right kernel. For your data to be compatible with these new transforms, you can either use the provided dataset wrapper, which should work with most of torchvision's built-in datasets, or you can wrap your data manually into Datapoints:

        + +
        from torchvision.datasets import wrap_dataset_for_transforms_v2
        +ds = CocoDetection(..., transforms=v2_transforms)
        +ds = wrap_dataset_for_transforms_v2(ds) # data is now compatible with transforms v2!
        +
        +# Or wrap your data manually using the lower-level Datapoint classes:
        +from torchvision import datapoints
        +
        +imgs = datapoints.Image(images)
        +vids = datapoints.Video(videos)
+masks = datapoints.Mask(target["masks"])
+bboxes = datapoints.BoundingBox(target["boxes"], format=datapoints.BoundingBoxFormat.XYXY, spatial_size=imgs.shape[-2:])
        +
        + +

        In addition to the new API, we now provide importable implementations for several data augmentations that are used in SoTA research such as Large Scale Jitter, AutoAugmentation methods and several new Geometric, Color and Type Conversion transforms.

        + +

The API continues to support both PIL and Tensor backends for images, with single or batched inputs, and maintains JIT-scriptability on both the functional and class APIs. The new API has been verified to achieve the same accuracy as the previous implementation.

        + +

        An end-to-end example

        + +

        Here is an example of the new API using the following image. It works both with PIL images and Tensors. For more examples and tutorials, take a look at our gallery!

        + +
        from torchvision import io, utils
        +from torchvision import datapoints
        +from torchvision.transforms import v2 as T
        +from torchvision.transforms.v2 import functional as F
        +
        +# Defining and wrapping input to appropriate Tensor Subclasses
        +path = "COCO_val2014_000000418825.jpg"
        +img = datapoints.Image(io.read_image(path))
        +# img = PIL.Image.open(path)
        +bboxes = datapoints.BoundingBox(
        +    [[2, 0, 206, 253], [396, 92, 479, 241], [328, 253, 417, 332],
        +     [148, 68, 256, 182], [93, 158, 170, 260], [432, 0, 438, 26],
        +     [422, 0, 480, 25], [419, 39, 424, 52], [448, 37, 456, 62],
        +     [435, 43, 437, 50], [461, 36, 469, 63], [461, 75, 469, 94],
        +     [469, 36, 480, 64], [440, 37, 446, 56], [398, 233, 480, 304],
        +     [452, 39, 463, 63], [424, 38, 429, 50]],
        +    format=datapoints.BoundingBoxFormat.XYXY,
        +    spatial_size=F.get_spatial_size(img),
        +)
        +labels = [59, 58, 50, 64, 76, 74, 74, 74, 74, 74, 74, 74, 74, 74, 50, 74, 74]
        +# Defining and applying Transforms V2
        +trans = T.Compose(
        +    [
        +        T.ColorJitter(contrast=0.5),
        +        T.RandomRotation(30),
        +        T.CenterCrop(480),
        +    ]
        +)
        +img, bboxes, labels = trans(img, bboxes, labels)
        +# Visualizing results
        +viz = utils.draw_bounding_boxes(F.to_image_tensor(img), boxes=bboxes)
        +F.to_pil_image(viz).show()
        +
        + +

        Development milestones and future work

        + +

        Here is where we are in development:

        + +
          +
        • Design API
        • +
        • Write Kernels for transforming Videos, Bounding Boxes, Masks and Labels
        • +
        • Rewrite all existing Transform Classes (stable + references) on the new API: +
            +
          • Image Classification
          • +
          • Video Classification
          • +
          • Object Detection
          • +
          • Instance Segmentation
          • +
          • Semantic Segmentation
          • +
          +
        • +
        • Verify the accuracy of the new API for all supported Tasks and Backends
        • +
        • Speed Benchmarks and Performance Optimizations (in progress - planned for Dec)
        • +
        • Graduate from Prototype (planned for Q1)
        • +
• Add support for Depth Perception, Keypoint Detection, Optical Flow and more (future)
        • +
        • Add smooth support for batch-wise transforms like MixUp and CutMix
        • +
        + +

        We would love to get feedback from you to improve its functionality. Please reach out to us if you have any questions or suggestions.

        + +
diff --git a/blog/fast-beam-search-decoding-in-pytorch-with-torchaudio-and-flashlight-text/index.html b/blog/fast-beam-search-decoding-in-pytorch-with-torchaudio-and-flashlight-text/index.html new file mode 100644 index 000000000000..f76c281f2859 --- /dev/null +++ b/blog/fast-beam-search-decoding-in-pytorch-with-torchaudio-and-flashlight-text/index.html @@ -0,0 +1,779 @@
+ Fast Beam Search Decoding in PyTorch with TorchAudio and Flashlight Text | PyTorch
        + by + + Caroline Chen, Jacob Kahn (@jacob_d_kahn) + +

        +

        Beam search decoding with industry-leading speed from Flashlight Text (part of the Flashlight ML framework) is now available with official support in TorchAudio, bringing high-performance beam search and text utilities for speech and text applications built on top of PyTorch. The current integration supports CTC-style decoding, but it can be used for any modeling setting that outputs token-level probability distributions over time steps.

        + +

        A brief beam search refresher

        + +

        In speech and language settings, beam search is an efficient, greedy algorithm that can convert sequences of continuous values (i.e. probabilities or scores) into graphs or sequences (i.e. tokens, word-pieces, words) using optional constraints on valid sequences (i.e. a lexicon), optional external scoring (i.e. an LM which scores valid sequences), and other score adjustments for particular sequences.

        + +

In the example that follows, we’ll consider a token set of {ϵ, a, b}, where ϵ is a special token that we can imagine denotes a space between words or a pause in speech. Graphics here and below are taken from Awni Hannun’s excellent distill.pub writeup on CTC and beam search.

        + +

        + +

        + +

With a greedy-like approach, beam search considers the next viable token given an existing sequence of tokens — in the example above, a, b, b is a valid sequence, but a, b, a is not. We rank each possible next token at each step of the beam search according to a scoring function. A scoring function s typically looks something like:

        + +

        + +

        + +

        Where ŷ is a potential path/sequence of tokens, x is the input (P(ŷ|x) represents the model’s predictions over time), and 𝛼 is a weight on the language model probability (P(y) the probability of the sequence under the language model). Some scoring functions add 𝜷 which adjusts a score based on the length of the predicted sequence |ŷ|. This particular scoring function is used in FAIR’s prior work on end-to-end ASR, and there are many variations on scoring functions which can vary across application areas.
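For concreteness, here is a minimal sketch of such a scoring function (our illustration; the variable names are hypothetical):

def sequence_score(model_log_prob, lm_log_prob, length, alpha=0.5, beta=0.0):
    # s(y) = log P(y|x) + alpha * log P_LM(y) + beta * |y|
    # model_log_prob: log-probability of the sequence under the acoustic/sequence model
    # lm_log_prob: log-probability of the sequence under the external language model
    # beta optionally rewards or penalizes longer sequences
    return model_log_prob + alpha * lm_log_prob + beta * length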

        + +

        Given a particular sequence, to assess the next viable token in that sequence (perhaps constrained by a set of allowed words or sequences, such as a lexicon of words), the beam search algorithm scores the sequence with each candidate token added, and sorts token candidates based on those scores. For efficiency and since the number of paths is exponential in the token set size, the top-k highest-scoring candidates are kept — k represents the beam size.
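A toy sketch of one expansion step (plain Python, our illustration; score_fn stands in for a scoring function like the one above):

def beam_step(beams, candidate_tokens, score_fn, beam_size):
    # beams: list of (sequence, score) pairs kept from the previous step
    expanded = []
    for seq, _ in beams:
        for tok in candidate_tokens:
            new_seq = seq + [tok]
            expanded.append((new_seq, score_fn(new_seq)))
    # keep only the k highest-scoring hypotheses, where k is the beam size
    expanded.sort(key=lambda pair: pair[1], reverse=True)
    return expanded[:beam_size]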

        + +

        + +

        + +

        There are many other nuances with how beam search can progress: similar hypothesis sequences can be “merged”, for instance. +

        + +

        The scoring function can be further augmented to up/down-weight token insertion or long or short words. Scoring with stronger external language models, while incurring computational cost, can also significantly improve performance; this is frequently referred to as LM fusion. There are many other knobs to tune for decoding — these are documented in TorchAudio’s documentation and explored further in TorchAudio’s ASR Inference tutorial. Since decoding is quite efficient, parameters can be easily swept and tuned.

        + +

        Beam search has been used in ASR extensively over the years in far too many works to cite, and in strong, recent results and systems including wav2vec 2.0 and NVIDIA’s NeMo.

        + + + +

Beam search remains a fast competitor to heavier-weight decoding approaches such as RNN-Transducer, which Google has invested in putting on-device and which has shown strong results on common benchmarks. Autoregressive text models at scale can benefit from beam search as well. Among other things, beam search gives:

        + +
          +
        • A flexible performance/latency tradeoff — by adjusting beam size and the external LM, users can sacrifice latency for accuracy or pay for more accurate results with a small latency cost. Decoding with no external LM can improve results at very little performance cost.
        • +
        • Portability without retraining — existing neural models can benefit from multiple decoding setups and plug-and-play with external LMs without training or fine-tuning.
        • +
        • A compelling complexity/accuracy tradeoff — adding beam search to an existing modeling pipeline incurs little additional complexity and can improve performance.
        • +
        + +

        Performance Benchmarks

        + +

Today’s most commonly-used beam search decoding libraries that support external language model integration include Kensho’s pyctcdecode and NVIDIA’s NeMo toolkit. We benchmark the TorchAudio + Flashlight decoder against them with a wav2vec 2.0 base model trained on 100 hours of audio, evaluated on LibriSpeech dev-other with the official KenLM 3-gram LM. Benchmarks were run on Intel E5-2698 CPUs on a single thread. All computation was in-memory; KenLM memory mapping was disabled as it wasn’t widely supported.

        + +

        When benchmarking, we measure the time-to-WER (word error rate) — because of subtle differences in the implementation of decoding algorithms and the complex relationships between parameters and decoding speed, some hyperparameters differed across runs. To fairly assess performance, we first sweep for parameters that achieve a baseline WER, minimizing beam size if possible.

        + +

        + +

        + +

        +Decoding performance on Librispeech dev-other of a pretrained wav2vec 2.0 model. TorchAudio + Flashlight decoding outperforms by an order of magnitude at low WERs. +

        + +

        + +

        + +

        +Time-to-WER results, deferring to smaller beam size, across decoders. The TorchAudio + Flashlight decoder scales far better with larger beam sizes and at lower WERs. +

        + +

        TorchAudio API and Usage

        + +

        TorchAudio provides a Python API for CTC beam search decoding, with support for the following:

        + +
          +
        • lexicon and lexicon-free decoding
        • +
        • KenLM n-gram language model integration
        • +
        • character and word-piece decoding
        • +
        • sample pretrained LibriSpeech KenLM models and corresponding lexicon and token files
        • +
        • various customizable beam search parameters (beam size, pruning threshold, LM weight…)
        • +
        + +

To set up the decoder, use the factory function torchaudio.models.decoder.ctc_decoder:

        + +
        from torchaudio.models.decoder import ctc_decoder, download_pretrained_files
        +files = download_pretrained_files("librispeech-4-gram")
        +decoder = ctc_decoder(
        +   lexicon=files.lexicon,
        +   tokens=files.tokens,
        +   lm=files.lm,
        +   nbest=1,
        +   ... additional optional customizable args ...
        +)
        +
        + +

        Given emissions of shape (batch, time, num_tokens), the decoder will compute and return a List of batch Lists, each consisting of the nbest hypotheses corresponding to the emissions. Each hypothesis can be further broken down into tokens, words (if a lexicon is provided), score, and timesteps components.

        + +
        emissions = acoustic_model(waveforms)  # (B, T, N)
        +batch_hypotheses = decoder(emissions)  # List[List[CTCHypothesis]]
        +
        +# transcript for a lexicon decoder
        +transcripts = [" ".join(hypo[0].words) for hypo in batch_hypotheses]
        +
        +# transcript for a lexicon free decoder, splitting by sil token
        +batch_tokens = [decoder.idxs_to_tokens(hypo[0].tokens) for hypo in batch_hypotheses]
        +transcripts = ["".join(tokens) for tokens in batch_tokens]
        +
        + +

        Please refer to the documentation for more API details, and the tutorial (ASR Inference Decoding) or sample inference script for more usage examples.
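As one more usage sketch, decoding is cheap enough that parameters such as the LM weight can be swept directly; this is illustrative only, and emissions, references, and compute_wer are placeholders for your own data and metric code:

from torchaudio.models.decoder import ctc_decoder, download_pretrained_files

files = download_pretrained_files("librispeech-4-gram")
for lm_weight in [0.5, 1.0, 1.5, 2.0, 2.5]:
    decoder = ctc_decoder(
        lexicon=files.lexicon,
        tokens=files.tokens,
        lm=files.lm,
        lm_weight=lm_weight,   # weight on the external LM score
        word_score=-0.25,      # per-word score adjustment (illustrative value)
    )
    hypotheses = decoder(emissions)            # emissions: (batch, time, num_tokens)
    print(lm_weight, compute_wer(hypotheses, references))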

        + +

        Upcoming Improvements

        + +

Full NNLM support — decoding with large neural language models (e.g. transformers) remains somewhat unexplored at scale. This is already supported in Flashlight, and we plan to add support in TorchAudio, allowing users to plug in custom decoder-compatible LMs. Custom word-level language models are already available in the nightly TorchAudio build and are slated to be released in TorchAudio 0.13.

        + +

Autoregressive/seq2seq decoding — Flashlight Text also supports sequence-to-sequence (seq2seq) decoding for autoregressive models, which we hope to add bindings for in TorchAudio and TorchText, along with efficient GPU implementations.

        + +

        Better build support — to benefit from improvements in Flashlight Text, TorchAudio will directly submodule Flashlight Text to make upstreaming modifications and improvements easier. This is already in effect in the nightly TorchAudio build, and is slated to be released in TorchAudio 0.13.

        + +

        Citation

        + +

        To cite the decoder, please use the following:

        + +
        @inproceedings{kahn2022flashlight,
        +  title={Flashlight: Enabling innovation in tools for machine learning},
        +  author={Kahn, Jacob D and Pratap, Vineel and Likhomanenko, Tatiana and Xu, Qiantong and Hannun, Awni and Cai, Jeff and Tomasello, Paden and Lee, Ann and Grave, Edouard and Avidov, Gilad and others},
        +  booktitle={International Conference on Machine Learning},
        +  pages={10557--10574},
        +  year={2022},
        +  organization={PMLR}
        +}
        +
        +
        @inproceedings{yang2022torchaudio,
        +  title={Torchaudio: Building blocks for audio and speech processing},
        +  author={Yang, Yao-Yuan and Hira, Moto and Ni, Zhaoheng and Astafurov, Artyom and Chen, Caroline and Puhrsch, Christian and Pollack, David and Genzel, Dmitriy and Greenberg, Donny and Yang, Edward Z and others},
        +  booktitle={ICASSP 2022-2022 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
        +  pages={6982--6986},
        +  year={2022},
        +  organization={IEEE}
        +}
        +
        + +
diff --git a/blog/finetune-llms/index.html b/blog/finetune-llms/index.html new file mode 100644 index 000000000000..0ca329bd7485 --- /dev/null +++ b/blog/finetune-llms/index.html @@ -0,0 +1,815 @@
+ Finetune LLMs on your own consumer hardware using tools from PyTorch and Hugging Face ecosystem | PyTorch
        + by + + Younes Belkada, Marc Sun, Titus von Köller, Sourab Mangrulkar, Benjamin Bossan, Lysandre Debut, Steven Liu + +

        +

We demonstrate how to finetune a 7B parameter model on a typical consumer GPU (NVIDIA T4 16GB) with LoRA and tools from the PyTorch and Hugging Face ecosystem, with a complete, reproducible Google Colab notebook.

        + +

        Introduction

        + +

Large Language Models (LLMs) have shown impressive capabilities in industrial applications. Often, developers seek to tailor these LLMs to specific use cases and applications by fine-tuning them for better performance. However, LLMs are large by design and require a large number of GPUs to be fine-tuned.

        + +

Let’s focus on a specific example by trying to fine-tune a Llama model on a free-tier Google Colab instance (1x NVIDIA T4 16GB). Llama 2 7B has 7 billion parameters, which amounts to 28GB when the model is loaded in full precision. Given our GPU memory constraint (16GB), the model cannot even be loaded, much less trained, on our GPU. This memory requirement can be halved with negligible performance degradation. You can read more about running models in half-precision and mixed precision for training here.

        + +

        What makes our Llama fine-tuning expensive?

        + +

In the case of full fine-tuning with the Adam optimizer, using a half-precision model and mixed-precision mode, we need to allocate per parameter:

        + +
          +
        • 2 bytes for the weight
        • +
        • 2 bytes for the gradient
        • +
        • 4 + 8 bytes for the Adam optimizer states
        • +
        + +

→ With a total of 16 bytes per trainable parameter, this makes a total of 112GB (excluding the intermediate hidden states). Given that the largest GPUs available today have at most 80GB of VRAM, this makes fine-tuning challenging and less accessible to everyone. To bridge this gap, Parameter Efficient Fine-Tuning (PEFT) methods are largely adopted today by the community.
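A quick back-of-the-envelope check of that figure (our sketch, using decimal GB):

n_params = 7e9                   # Llama 2 7B
bytes_per_param = 2 + 2 + 4 + 8  # weight + gradient + Adam optimizer states
print(n_params * bytes_per_param / 1e9)  # ~112 GB, excluding intermediate hidden states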

        + +

        Parameter Efficient Fine-Tuning (PEFT) methods

        + +

        PEFT methods aim at drastically reducing the number of trainable parameters of a model while keeping the same performance as full fine-tuning.

        + +

They can be differentiated by their conceptual framework: does the method fine-tune a subset of existing parameters, introduce new parameters, introduce trainable prompts, etc.? We recommend readers have a look at the paper shared below, which extensively compares existing PEFT methods.

        + +

        Venn diagram

        + +

        Image taken from the paper: Scaling Down to Scale Up: A Guide to Parameter-Efficient Fine-Tuning

        + +

        For this blog post, we will focus on Low-Rank Adaption for Large Language Models (LoRA), as it is one of the most adopted PEFT methods by the community.

        + +

        Low-Rank Adaptation for Large Language Models (LoRA) using 🤗 PEFT

        + +

The LoRA method by Hu et al. from the Microsoft team came out in 2021, and works by attaching extra trainable parameters to a model (which we will refer to as the base model).

        + +

        To make fine-tuning more efficient, LoRA decomposes a large weight matrix into two smaller, low-rank matrices (called update matrices). These new matrices can be trained to adapt to the new data while keeping the overall number of changes low. The original weight matrix remains frozen and doesn’t receive any further adjustments. To produce the final results, both the original and the adapted weights are combined.

        + +

        This approach has several advantages:

        + +
          +
        • LoRA makes fine-tuning more efficient by drastically reducing the number of trainable parameters.
        • +
        • The original pre-trained weights are kept frozen, which means you can have multiple lightweight and portable LoRA models for various downstream tasks built on top of them.
        • +
        • LoRA is orthogonal to many other parameter-efficient methods and can be combined with many of them.
        • +
        • The performance of models fine-tuned using LoRA is comparable to the performance of fully fine-tuned models.
        • +
        • LoRA does not add any inference latency when adapter weights are merged with the base model
        • +
        + +

        In principle, LoRA can be applied to any subset of weight matrices in a neural network to reduce the number of trainable parameters. However, for simplicity and further parameter efficiency, in Transformer models LoRA is typically applied to attention blocks only. The resulting number of trainable parameters in a LoRA model depends on the size of the low-rank update matrices, which is determined mainly by the rank r and the shape of the original weight matrix.
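As a small illustration of that dependence, the number of trainable LoRA parameters for a single weight matrix is r * (d_in + d_out); the function below is a sketch, not part of the PEFT API:

def lora_trainable_params(d_out: int, d_in: int, r: int) -> int:
    # LoRA learns B (d_out x r) and A (r x d_in) while W (d_out x d_in) stays frozen
    return d_out * r + r * d_in

# e.g. a 4096 x 4096 attention projection with rank r=8
print(lora_trainable_params(4096, 4096, r=8))  # 65,536 trainable vs. 16,777,216 frozen weights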

        + +

Animated diagram that shows how LoRA works in practice

+ +

Animated diagram that shows how LoRA works in practice - original content adapted from Figure 1 of the original LoRA paper

        + +

Below is a code snippet showing how to train a LoRA model using the Hugging Face PEFT library:

        + +

        code snippet showing how to train LoRA model using  Hugging Face PEFT library
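A minimal sketch of what such a snippet typically looks like with 🤗 PEFT (the model name and hyperparameters here are illustrative, not the exact values from the screenshot above):

from transformers import AutoModelForCausalLM
from peft import LoraConfig, get_peft_model

model = AutoModelForCausalLM.from_pretrained("facebook/opt-350m")

lora_config = LoraConfig(
    r=8,                                   # rank of the update matrices
    lora_alpha=16,                         # scaling factor
    target_modules=["q_proj", "v_proj"],   # apply LoRA to the attention projections
    lora_dropout=0.05,
    task_type="CAUSAL_LM",
)

model = get_peft_model(model, lora_config)
model.print_trainable_parameters()  # only a small fraction of parameters are trainable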

        + +

        The base model can be in any dtype: leveraging SOTA LLM quantization and loading the base model in 4-bit precision

        + +

        According to the LoRA formulation, the base model can be compressed in any data type (‘dtype’) as long as the hidden states from the base model are in the same dtype as the output hidden states from the LoRA matrices.

        + +

        Compressing and quantizing large language models has recently become an exciting topic as SOTA models become larger and more difficult to serve and use for end users. Many people in the community proposed various approaches for effectively compressing LLMs with minimal performance degradation.

        + +

        This is where the bitsandbytes library comes in. Its purpose is to make cutting-edge research by Tim Dettmers, a leading academic expert on quantization and the use of deep learning hardware accelerators, accessible to the general public.

        + +

        QLoRA: One of the core contributions of bitsandbytes towards the democratization of AI

        + +

        Quantization of LLMs has largely focused on quantization for inference, but the QLoRA (Quantized model weights + Low-Rank Adapters) paper showed the breakthrough utility of using backpropagation through frozen, quantized weights at large model scales.

        + +

With QLoRA we are matching 16-bit fine-tuning performance across all scales and models, while reducing the fine-tuning memory footprint by more than 90%, thereby allowing fine-tuning of SOTA models on consumer-grade hardware.

        + +

        In this approach, LoRA is pivotal both for purposes of fine-tuning and the correction of minimal, residual quantization errors. Due to the significantly reduced size of the quantized model it becomes possible to generously place low-rank adaptors at every network layer, which together still make up just 0.2% of the original model’s weight memory footprint. Through such usage of LoRA, we achieve performance that has been shown to be equivalent to 16-bit full model finetuning.

        + +

        System diagram

        + +

        In addition to generous use of LoRA, to achieve high-fidelity fine-tuning of 4-bit models, QLoRA uses 3 further algorithmic tricks:

        + +
          +
1. 4-bit NormalFloat (NF4) quantization, a custom data type exploiting the property of the normal distribution of model weights and distributing an equal number of weights (per block) to each quantization bin—thereby enhancing information density.
2. Double Quantization, quantization of the quantization constants (further savings).
3. Paged Optimizers, preventing memory spikes during gradient checkpointing from causing out-of-memory errors.
        + +

        An interesting aspect is the dequantization of 4-bit weights in the GPU cache, with matrix multiplication performed as a 16-bit floating point operation. In other words, we use a low-precision storage data type (in our case 4-bit, but in principle interchangeable) and one normal precision computation data type. This is important because the latter defaults to 32-bit for hardware compatibility and numerical stability reasons, but should be set to the optimal BFloat16 for newer hardware supporting it to achieve the best performance.

        + +

        To conclude, through combining these refinements to the quantization process and generous use of LoRA, we compress the model by over 90% and retain full model performance without the usual quantization degradation, while also retaining full fine-tuning capabilities with 16-bit LoRA adapters at every layer.

        + +

        Using QLoRA in practice

        + +

        These SOTA quantization methods come packaged in the bitsandbytes library and are conveniently integrated with HuggingFace 🤗 Transformers. For instance, to use LLM.int8 and QLoRA algorithms, respectively, simply pass load_in_8bit and load_in_4bit to the from_pretrained method.

        + +
        import torch
        +from transformers import AutoModelForCausalLM, AutoTokenizer
        +
        +model_id = "facebook/opt-125m"
        +# For LLM.int8()
        +# model = AutoModelForCausalLM.from_pretrained(model_id, load_in_8bit=True)
        +
        +# For QLoRA
        +model = AutoModelForCausalLM.from_pretrained(model_id, load_in_4bit=True)
        +
        + +

        You can read more about quantization features in this specific section of the documentation: https://huggingface.co/docs/transformers/main_classes/quantization

        + +

When using QLoRA with the Adam optimizer, using a 4-bit base model and mixed-precision mode, we need to allocate per parameter:

        + +
          +
        • ~0.5 bytes for the weight
        • +
        • 2 bytes for the gradient
        • +
        • 4 + 8 bytes for the Adam optimizer states
        • +
        + +

This gives a total of 14 bytes per trainable parameter, multiplied by 0.0029 since only 0.29% of the parameters are trainable with QLoRA. As a result, the QLoRA training setup costs around 4.5GB to fit, but in practice it requires ~7-10GB to include the intermediate hidden states, which are always in half precision (7GB for a sequence length of 512 and 10GB for a sequence length of 1024 in the Google Colab demo shared in the next section).
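The rough arithmetic behind those numbers (our sketch; it ignores smaller buffers such as quantization constants, which is why the actual footprint quoted above is a bit higher):

n_params = 7e9
base_weights_gb = n_params * 0.5 / 1e9         # 4-bit frozen weights: ~3.5 GB
adapters_gb = n_params * 0.0029 * 14 / 1e9     # 14 bytes per trainable parameter: ~0.3 GB
print(base_weights_gb + adapters_gb)           # a few GB, before intermediate hidden states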

        + +

Below is a code snippet showing how to train a QLoRA model using Hugging Face PEFT:

        + +

        code snippet showing how to train QLoRA model using Hugging Face PEFT
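A minimal sketch of what such a snippet typically looks like, combining bitsandbytes 4-bit loading with PEFT (the model name and hyperparameters are illustrative):

import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",               # 4-bit NormalFloat
    bnb_4bit_use_double_quant=True,          # double quantization
    bnb_4bit_compute_dtype=torch.bfloat16,   # compute dtype for the matmuls
)

model = AutoModelForCausalLM.from_pretrained(
    "facebook/opt-350m", quantization_config=bnb_config
)
model = prepare_model_for_kbit_training(model)

lora_config = LoraConfig(
    r=8, lora_alpha=16, target_modules=["q_proj", "v_proj"], task_type="CAUSAL_LM"
)
model = get_peft_model(model, lora_config)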

        + +

        Using TRL for LLM training

        + +

        Models such as ChatGPT, GPT-4, and Claude are powerful language models that have been fine-tuned using a method called Reinforcement Learning from Human Feedback (RLHF) to be better aligned with how we expect them to behave and would like to use them. The finetuning goes through 3 steps:

        + +
          +
        • Supervised Fine-tuning (SFT)
        • +
        • Reward / preference modeling (RM)
        • +
        • Reinforcement Learning from Human Feedback (RLHF)
        • +
        + +

        Process diagram

        + +

        From InstructGPT paper: Ouyang, Long, et al. “Training language models to follow instructions with human feedback.” arXiv preprint arXiv:2203.02155 (2022).

        + +

        Here, we will only focus on the supervised fine-tuning step. We train the model on the new dataset following a process similar to that of pretraining. The objective is to predict the next token (causal language modeling). Multiple techniques can be applied to make the training more efficient:

        + +
          +
        • Packing: Instead of having one text per sample in the batch and then padding to either the longest text or the maximal context of the model, we concatenate a lot of texts with an End-Of-Sentence (EOS) token in between and cut chunks of the context size to fill the batch without any padding. This approach significantly improves training efficiency as each token processed by the model contributes to training.
        • +
        + +

        Sample diagram

        + +
          +
• Train on completion only: We want the model to be able to understand the prompt and generate an answer. Instead of training the model on the whole input (prompt + answer), the training will be more efficient if we only train the model on the completion (a sketch of this is shown after the SFTTrainer example below).
        • +
        + +

        You can perform supervised fine-tuning with these techniques using SFTTrainer:

        + +
        from trl import SFTTrainer
        +
        +trainer = SFTTrainer(
        +    model=model,
        +    args=training_arguments,
        +    train_dataset=train_dataset,
        +    dataset_text_field="text",
        +    max_seq_length=1024,
        +    packing=True,
        +)
        +
        + +
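And here is a hedged sketch of the completion-only variant mentioned above; the response template string depends on your prompt format, and model, tokenizer and train_dataset are placeholders:

from trl import SFTTrainer, DataCollatorForCompletionOnlyLM

# Only tokens after the response template contribute to the loss; the prompt is masked out.
collator = DataCollatorForCompletionOnlyLM(response_template="### Answer:", tokenizer=tokenizer)

trainer = SFTTrainer(
    model=model,
    train_dataset=train_dataset,
    dataset_text_field="text",
    data_collator=collator,
    packing=False,   # completion-only masking is used without packing
)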

Since the SFTTrainer back-end is powered by 🤗 accelerate, you can easily adapt the training to your hardware setup in one line of code!

        + +

For example, if you have 2 GPUs, you can perform Distributed Data Parallel training using the following command:

        + +
        accelerate launch --num_processes=2 training_llama_script.py
        +
        + +

        Putting all the pieces together

        + +

We made a complete, reproducible Google Colab notebook that you can check through this link. We use all the components shared in the sections above and fine-tune a llama-7b model on the UltraChat dataset using QLoRA. As can be observed in the screenshot below, when using a sequence length of 1024 and a batch size of 4, the memory usage remains very low (around 10GB).

        + +

        Memory usage diagram

        + +
diff --git a/blog/flash-decoding/index.html b/blog/flash-decoding/index.html new file mode 100644 index 000000000000..47d343854f06 --- /dev/null +++ b/blog/flash-decoding/index.html @@ -0,0 +1,809 @@
        October 13, 2023

        +

        + Flash-Decoding for long-context inference +


        + by + + Tri Dao, Daniel Haziza, Francisco Massa, Grigory Sizov + +

        +

        Motivation

        + +

        Large language models (LLM) such as ChatGPT or Llama have received unprecedented attention lately. However, they remain massively expensive to run. Even though generating a single response can cost about $0.01 (a few seconds of an 8xA100 instance on AWS), the costs quickly add up when scaling to billions of users, who could have multiple daily interactions with such LLMs. Some use cases are more expensive, like code auto-completion, because it runs whenever a new character is typed. As LLM applications multiply, even small efficiency gains to the generation time can have a massive impact.

        + +

        LLM inference (or “decoding”) is an iterative process: tokens are generated one at a time. Generating full sentences of N tokens requires N forward passes through the model. Fortunately, it is possible to cache previously calculated tokens: this means that a single generation step does not depend on the context length, except for a single operation, the attention. This operation does not scale well with context length.

        + +

        There are a number of important emerging use cases of LLMs that utilize a long context. With a longer context, LLMs can reason about longer documents, either to summarize or answer questions about them, they can keep track of longer conversations, or even process entire codebases before writing code. As an example, most LLMs had a context length of up to 2k in 2022 (GPT-3), but we now have open-source LLMs scaling up to 32k (Llama-2-32k), or even 100k more recently (CodeLlama). In this setting, attention takes a significant fraction of time during inference.

        + +

        When scaling on the batch size dimension, the attention can also become a bottleneck even with relatively small contexts. This is because the amount of memory to read scales with the batch dimension, whereas it only depends on the model size for the rest of the model.

        + +

        We present a technique, Flash-Decoding, that significantly speeds up attention during inference, bringing up to 8x faster generation for very long sequences. The main idea is to load the keys and values in parallel as fast as possible, then separately rescale and combine the results to maintain the right attention outputs.

        + +

        Multi-head attention for decoding

        + +

        During decoding, every new token that is generated needs to attend to all previous tokens, to compute:

        + +

        softmax(queries @ keys.transpose) @ values

        + +

        This operation has been optimized with FlashAttention (v1 and v2 recently) in the training case, where the bottleneck is the memory bandwidth to read and write the intermediate results (e.g. Q @ K^T). However, these optimizations don’t apply directly to the inference case, because the bottlenecks are different. For training, FlashAttention parallelizes across the batch size and query length dimensions. During inference, the query length is typically 1: this means that if the batch size is smaller than the number of streaming multiprocessors (SMs) on the GPU (108 for an A100), the operation will only use a small part of the GPU! This is especially the case when using long contexts, because it requires smaller batch sizes to fit in GPU memory. With a batch size of 1, FlashAttention will use less than 1% of the GPU!
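To make the shapes concrete, here is a toy sketch (plain PyTorch, our illustration) of single-token decoding attention against a KV cache:

import torch

B, H, D, ctx = 4, 16, 128, 4096
q = torch.randn(B, H, 1, D)            # one new query token per sequence
k_cache = torch.randn(B, H, ctx, D)    # cached keys for the whole context
v_cache = torch.randn(B, H, ctx, D)    # cached values

scores = q @ k_cache.transpose(-1, -2) / D ** 0.5   # (B, H, 1, ctx)
out = scores.softmax(dim=-1) @ v_cache              # (B, H, 1, D)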

        + +

        FlashAttention

        + +

        FlashAttention parallelizes across blocks of queries and batch size only, and does not manage to occupy the entire GPU during decoding

        + +

        The attention can also be done using matrix multiplication primitives - without using FlashAttention. In this case, the operation occupies the GPU entirely, but launches many kernels that write and read intermediate results, which is not optimal.

        + +

        A faster attention for decoding: Flash-Decoding

        + +

        Our new approach Flash-Decoding is based on FlashAttention, and adds a new parallelization dimension: the keys/values sequence length. It combines the benefits of the 2 approaches from above. Like FlashAttention, it stores very little extra data to global memory, however it fully utilizes the GPU even when the batch size is small, as long as the context length is large enough.

        + +

        Flash-Decoding

        + +

        Flash-Decoding also parallelizes across keys and values, at the cost of a small final reduction step

        + +

        Flash-Decoding works in 3 steps:

        + +
          +
1. First, we split the keys/values in smaller chunks.
2. We compute the attention of the query with each of these splits in parallel using FlashAttention. We also write 1 extra scalar per row and per split: the log-sum-exp of the attention values.
3. Finally, we compute the actual output by reducing over all the splits, using the log-sum-exp to scale the contribution of each split.
        + +

        All of this is possible because the attention/softmax can be calculated iteratively. In Flash-Decoding, it is used at 2 levels: within splits (like FlashAttention), and across splits to perform the final reduction.
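A small sketch of the reduction in step (3), in plain PyTorch (our illustration): given two splits’ partial outputs and their per-row log-sum-exp statistics, the exact attention output is recovered by rescaling with the combined log-sum-exp.

import torch

def combine_splits(out1, lse1, out2, lse2):
    # out*: (..., q_len, head_dim) partial attention outputs from each split
    # lse*: (..., q_len) log-sum-exp of the attention scores within each split
    lse = torch.logaddexp(lse1, lse2)
    w1 = torch.exp(lse1 - lse).unsqueeze(-1)
    w2 = torch.exp(lse2 - lse).unsqueeze(-1)
    return w1 * out1 + w2 * out2, lse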

        + +

        In practice, step (1) does not involve any GPU operation, as the key/value chunks are views of the full key/value tensors. We then have 2 separate kernels to perform respectively (2) and (3).

        + +

        Benchmarks on CodeLlama 34B

        + +

        To validate this approach, we benchmark the decoding throughput of the CodeLLaMa-34b. This model has the same architecture as Llama 2, and more generally results should generalize across many LLMs. We measure the decoding speed in tok/s at various sequence lengths, from 512 to 64k, and compare multiple ways of calculating the attention:

        + +
          +
        • Pytorch: Running the attention using pure PyTorch primitives (without using FlashAttention)
        • +
        • FlashAttention v2
        • +
        • FasterTransformer: Uses the FasterTransformer attention kernel
        • +
        • Flash-Decoding
        • +
        • And an upper bound calculated as the time it takes to read from memory the entire model along with the KV-cache
        • +
        + +

        Flash-Decoding unlocks up to 8x speedups in decoding speed for very large sequences, and scales much better than alternative approaches.

        + +

        CodeLlama

        + +

        All approaches perform similarly for small prompts, but scale poorly as the sequence length increases from 512 to 64k, except Flash-Decoding. In this regime (batch size 1) with Flash-Decoding, scaling the sequence length has little impact on generation speed

        + +

        Component-level micro-benchmarks

        + +

        We also micro-benchmark the scaled multi-head attention for various sequence lengths and batch sizes on A100 with inputs in f16. We set the batch size to 1, and use 16 query heads of dimension 128, for 2 key/value heads (grouped-query attention), which matches the dimensions used in CodeLLaMa-34b when running on 4 GPUs.

Setting \ Algorithm   | PyTorch Eager (us) | Flash-Attention v2.0.9 (us) | Flash-Decoding (us)
B=256, seqlen=256     | 3058.6             | 390.5                       | 63.4
B=128, seqlen=512     | 3151.4             | 366.3                       | 67.7
B=64, seqlen=1024     | 3160.4             | 364.8                       | 77.7
B=32, seqlen=2048     | 3158.3             | 352                         | 58.5
B=16, seqlen=4096     | 3157               | 401.7                       | 57
B=8, seqlen=8192      | 3173.1             | 529.2                       | 56.4
B=4, seqlen=16384     | 3223               | 582.7                       | 58.2
B=2, seqlen=32768     | 3224.1             | 1156.1                      | 60.3
B=1, seqlen=65536     | 1335.6             | 2300.6                      | 64.4
B=1, seqlen=131072    | 2664               | 4592.2                      | 106.6
        + +

        Micro-benchmark of the multi-head attention, run-time in us. Flash-Decoding achieves almost constant run-time as the sequence length scales to up to 64k.

        + +

        The up to 8x speedup end-to-end measured earlier is made possible because the attention itself is up to 50x faster than FlashAttention. Up until sequence length 32k, the attention time is roughly constant, because Flash-Decoding manages to fully utilize the GPU.

        + +

        Using Flash-Decoding

        + +

        Flash-decoding is available:

        + +
          +
        • In the FlashAttention package, starting at version 2.2
        • +
        • Through xFormers starting at version 0.0.22 through `xformers.ops.memory_efficient_attention`. The dispatcher will automatically use either the Flash-Decoding or FlashAttention approaches depending on the problem size. When these approaches are not supported, it can dispatch to an efficient triton kernel that implements the Flash-Decoding algorithm.
        • +
        + +

        A full example of decoding with LLaMa v2 / CodeLLaMa is available in the FlashAttention repo here and in the xFormers repo here. We also provide a minimal example of an efficient decoding code for LLaMa v1/v2 models, meant to be fast, easy to read, educational and hackable.

        + +

        Acknowledgements

        + +

        Thanks to Erich Elsen, Ashish Vaswani, and Michaël Benesty for suggesting this idea of splitting the KVcache loading. We want to thank Jeremy Reizenstein, Patrick Labatut and Andrew Tulloch for the valuable discussions, and Quentin Carbonneaux for contributing the efficient decoding example to xFormers. We also want to thank Geeta Chauhan and Gregory Chanan for helping with the writing and more broadly contributing to getting this published on the PyTorch blog.

        + +
diff --git a/blog/flashattention-3/index.html b/blog/flashattention-3/index.html new file mode 100644 index 000000000000..dbef94cab475 --- /dev/null +++ b/blog/flashattention-3/index.html @@ -0,0 +1,750 @@
+ FlashAttention-3: Fast and Accurate Attention with Asynchrony and Low-precision | PyTorch

        + by + + Jay Shah and Ganesh Bikshandi, Colfax Research, Ying Zhang, Meta, Vijay Thakkar and Pradeep Ramani, NVIDIA, Tri Dao, TogetherAI and Princeton University + +

        +

        Attention, as a core layer of the ubiquitous Transformer architecture, is a bottleneck for large language models and long-context applications. FlashAttention (and FlashAttention-2) pioneered an approach to speed up attention on GPUs by minimizing memory reads/writes, and is now used by most libraries to accelerate Transformer training and inference. This has contributed to a massive increase in LLM context length in the last two years, from 2-4K (GPT-3, OPT) to 128K (GPT-4), or even 1M (Llama 3). However, despite its success, FlashAttention has yet to take advantage of new capabilities in modern hardware, with FlashAttention-2 achieving only 35% utilization of theoretical max FLOPs on the H100 GPU. In this blogpost, we describe three main techniques to speed up attention on Hopper GPUs: exploiting asynchrony of the Tensor Cores and TMA to (1) overlap overall computation and data movement via warp-specialization and (2) interleave block-wise matmul and softmax operations, and (3) incoherent processing that leverages hardware support for FP8 low-precision.

        + +

        We’re excited to release FlashAttention-3 that incorporates these techniques. It’s 1.5-2.0x faster than FlashAttention-2 with FP16, up to 740 TFLOPS, i.e., 75% utilization of H100 theoretical max FLOPS. With FP8, FlashAttention-3 reaches close to 1.2 PFLOPS, with 2.6x smaller error than baseline FP8 attention.

        + +

        FlashAttention-3 is available at: https://github.com/Dao-AILab/flash-attention
        +Paper

        + +

        FlashAttention Recap

        + +

        FlashAttention is an algorithm that reorders the attention computation and leverages tiling and recomputation to significantly speed it up and reduce memory usage from quadratic to linear in sequence length. We use tiling to load blocks of inputs from HBM (GPU memory) to SRAM (fast cache), perform attention with respect to that block, and update the output in HBM. By not writing the large intermediate attention matrices to HBM, we reduce the amount of memory reads/writes, which brings 2-4x wallclock time speedup.

        + +

        Here we show a diagram of FlashAttention forward pass: with tiling and softmax rescaling, we operate by blocks and avoid having to read/write from HBM, while obtaining the correct output with no approximation.

        + +

        math equations

        + +

        New hardware features on Hopper GPUs - WGMMA, TMA, FP8

        + +

        While FlashAttention-2 can achieve up to 70% theoretical max FLOPS on Ampere (A100) GPUs, it does not yet take advantage of new features on Hopper GPUs to maximize performance. We describe some of the new Hopper-specific features here, and why they are important.

        + +

        1. WGMMA (Warpgroup Matrix Multiply-Accumulate). This new feature makes use of the new Tensor Cores on Hopper, with much higher throughput1 than the older mma.sync instruction in Ampere (image from the H100 white paper).

        + +

        image from the H100 white paper

        + +

        2. TMA (Tensor Memory Accelerator). This is a special hardware unit that accelerates the transfer of data between global memory and shared memory, taking care of all index calculation and out-of-bound predication. This frees up registers, which is a valuable resource to increase tile size and efficiency.

        + +

        block diagram

        + +

        3. Low-precision with FP8. This doubles the Tensor Core throughput (e.g. 989 TFLOPS with FP16 and 1978 TFLOPS with FP8), but trades off accuracy by using fewer bits to represent floating point numbers.

        + +

        6x throughput

        + +

        FlashAttention-3 makes use of all of these new features of Hopper, using powerful abstractions from NVIDIA’s CUTLASS library.
        +
        +By rewriting FlashAttention to use these new features, we can already significantly speed it up (e.g., from 350 TFLOPS in FlashAttention-2 FP16 forward pass to around 540-570 TFLOPS). However, the asynchronous nature of the new instructions on Hopper (WGMMA and TMA) opens up additional algorithmic opportunities to overlap operations and thereby extract even greater performance. For this blogpost, we’ll explain two such techniques specific to attention. The generic technique of warp specialization, with separate producer and consumer warps doing TMA and WGMMA, is well-covered elsewhere in the context of GEMM and works the same here.

        + +

        Asynchrony: Overlapping GEMM and Softmax

        + +

        Why overlap?

        + +

        Attention has GEMMs (those matmuls between Q and K and between attention probability P and V) and softmax as its two main operations. Why do we need to overlap them? Isn’t most of the FLOPS in the GEMMs anyway? As long as the GEMMs are fast (e.g., computed using WGMMA instructions), shouldn’t the GPU be going brrrr?

        + +

        The problem is that non-matmul operations are much slower than matmul operations on modern accelerators. Special functions such as exponential (for the softmax) have even lower throughput than floating point multiply-add; they are evaluated by the multi-function unit, a unit separate from floating point multiply-add or matrix multiply-add. As an example, the H100 GPU SXM5 has 989 TFLOPS of FP16 matrix multiply, but only 3.9 TFLOPS (256x less throughput) for special functions2! For head dimension 128, there are 512x more matmul FLOPS than exponential, which means that exponential can take 50% of the time compared to matmul. The situation is even worse for FP8, where the matmul FLOPS are twice as fast yet exponential FLOPS stay the same speed. Ideally we want matmul and softmax to operate in parallel. While the Tensor Cores are busy with matmul, the multi-function units should be calculating exponential!
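The back-of-the-envelope version of that argument (our sketch):

matmul_tflops = 989      # H100 SXM5 FP16 matrix multiply
special_tflops = 3.9     # special-function (e.g. exponential) throughput
d = 128                  # head dimension

matmul_flops_per_score = 2 * d + 2 * d   # Q·K^T plus P·V: ~512 FLOPs per attention score
exp_ops_per_score = 1                    # one exponential per score

ratio = (exp_ops_per_score / special_tflops) / (matmul_flops_per_score / matmul_tflops)
print(ratio)   # ~0.5, i.e. the exponentials can take ~50% of the matmul time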

        + +

        Inter-warpgroup overlapping with pingpong scheduling

        + +

        The first and easiest way to overlap GEMM and softmax is to do nothing at all! The warp schedulers already try to schedule warps so that if some warps are blocked (e.g., waiting for GEMM results), other warps can run. That is, the warp schedulers do some of this overlapping for us, for free.

        + +

        However, we can improve on this by doing some of the scheduling manually. As an example, if we have 2 warpgroups (labeled 1 and 2 – each warpgroup is a group of 4 warps), we can use synchronization barriers (bar.sync) so that warpgroup 1 first does its GEMMs (e.g., GEMM1 of one iteration and GEMM0 of the next iteration), and then warpgroup 2 does its GEMMs while warpgroup 1 does its softmax, and so on. This “pingpong” schedule is illustrated in the figure below, where the same color denotes the same iteration.

        + +

        block chart

        + +

        This would allow us to perform the softmax in the shadow of the GEMMs of the other warpgroup. Of course, this figure is just a caricature; in practice the scheduling is not really this clean. Nevertheless, pingpong scheduling can improve FP16 attention forward pass from around 570 TFLOPS to 620 TFLOPS (head dim 128, seqlen 8K).

        + +

        Intra-warpgroup overlapping of GEMM and Softmax

        + +

        Even within one warpgroup, we can have some part of softmax running while the GEMMs of that warpgroup is running. This is illustrated in this figure, where the same color denotes the same iteration.

        + +

        block chart

        + +

        This pipelining increases throughput from around 620 TFLOPS to around 640-660 TFLOPS for FP16 attention forward, at the cost of higher register pressure. We need more registers to hold both accumulators of the GEMMs, and the input/output of softmax. Overall, we find this technique to offer a favorable tradeoff.

        + +

        Low-precision: reduce quantization error with incoherent processing

        + +

        LLM activation can have outliers with much larger magnitude than the rest of the features. These outliers make it difficult to quantize, producing much larger quantization errors. We leverage incoherent processing, a technique used in the quantization literature (e.g. from QuIP) that multiplies the query and key with a random orthogonal matrix to “spread out” the outliers and reduce quantization error. In particular, we use the Hadamard transform (with random signs), which can be done per attention head in O(d log d) instead of O(d^2) time, where d is the head dimension. Since the Hadamard transform is memory-bandwidth bound, it can be fused with previous operations such as rotary embedding (also memory-bandwidth bound) “for free”.
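A small sketch of the idea in plain PyTorch (our illustration, not the fused kernel used in FlashAttention-3): applying a random-sign fast Walsh-Hadamard transform to the queries and keys spreads out outliers while leaving the attention scores unchanged, since the transform is orthogonal.

import torch

def hadamard_transform(x):
    # Fast Walsh-Hadamard transform over the last dim (size must be a power of 2), O(d log d)
    d = x.shape[-1]
    assert d & (d - 1) == 0
    h = 1
    while h < d:
        x = x.reshape(*x.shape[:-1], d // (2 * h), 2, h)
        a, b = x.unbind(dim=-2)
        x = torch.stack((a + b, a - b), dim=-2).reshape(*x.shape[:-3], d)
        h *= 2
    return x / d ** 0.5   # orthonormal scaling

d = 128
q, k = torch.randn(32, d), torch.randn(32, d)
signs = torch.randint(0, 2, (d,)) * 2.0 - 1.0                # random +/-1 per feature
q2, k2 = hadamard_transform(q * signs), hadamard_transform(k * signs)
print((q @ k.T - q2 @ k2.T).abs().max())                     # tiny: attention scores are preserved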

        + +

        In our experiment where Q, K, V are generated from a standard normal distribution but 0.1% of the entries have large magnitudes (to simulate outliers), we found that incoherent processing can reduce the quantization error by 2.6x. We show numerical error comparison in the table below. Please see the paper for details.

        + +

        text diagram

        + +

        Attention benchmark

        + +

        We show some results with FlashAttention-3, and compare it to FlashAttention-2, as well as the implementation in Triton and cuDNN (both of which already use new hardware features of Hopper GPUs).

        + +

        For FP16, we see about 1.6x-1.8x speedup over FlashAttention-2

        + +

        speed charts

        + +

        speed charts

        + +

        For FP8, we can reach close to 1.2 PFLOPS!

        + +

        speed charts

        + +

        Discussion

        + +

        This blogpost highlights some of the optimizations for FlashAttention available on Hopper GPUs. Other optimizations (e.g., variable length sequences, persistent kernel, and in-kernel transpose for FP8) are covered in the paper.

        + +

        We have seen that designing algorithms that take advantage of the hardware they run on can bring significant efficiency gains and unlock new model capabilities such as long context. We look forward to future work on optimization for LLM inference, as well as generalizing our techniques to other hardware architectures.

        + +

        We also look forward to FlashAttention-3 being integrated in a future release of PyTorch.

        + + +

        Notes

        + +
        +
          +
1. Without the wgmma instruction, the older mma.sync instruction can only reach about ⅔ the peak throughput of Hopper Tensor Cores: https://arxiv.org/abs/2402.13499v1
2. The CUDA programming guide specifies that the throughput for special functions is 16 operations per streaming multiprocessor (SM) per clock cycle. We multiply 16 by 132 SMs and 1830 Mhz (clock speed used to calculate 989 TFLOPS of FP16 matmul) to get 3.9 TFLOPS
        +
        + +
diff --git a/blog/flexattention-for-inference/index.html b/blog/flexattention-for-inference/index.html new file mode 100644 index 000000000000..5a6e6fb62032 --- /dev/null +++ b/blog/flexattention-for-inference/index.html @@ -0,0 +1,984 @@
+ FlexAttention Part II: FlexAttention for Inference | PyTorch

        + by + + Joy Dong, Boyuan Feng, Driss Guessous, Joel Schlosser, Yanbo Liang, Horace He + +

        +

        Overview

        + +

In the PyTorch 2.5.0 release, we introduced FlexAttention (torch.nn.attention.flex_attention) for ML researchers who’d like to customize their attention kernels without writing kernel code. This blog introduces our decoding backend optimized for inference, supporting GQA and PagedAttention, along with feature updates including nested jagged tensor support, performance tuning guides, and trainable biases support.

        + +

        If you’re looking for an easy way to play around with FlexAttention in your post-training / inference pipeline, PyTorch native post-training library torchtune and inference codebase gpt-fast already have FlexAttention integrated. Try it out!

        + +

        We are excited to share that our paper on FlexAttention has been accepted for presentation at the MLSys2025 Conference held from May 12-15th in Santa Clara, California.

        + +

        Title: FlexAttention: A Programming Model for Generating Optimized Attention Kernels. Poster

        + +

        FlexAttention for Inference

        + +

        TL;DR: torch.compile lowers flex_attention to a fused FlashDecoding kernel when it runs on a very short query.

        + +

        One fused attention kernel does not suit all – especially in long-context LLM inference.

        + +

        The decoding phase of LLM inference is an iterative process: tokens are generated one at a time, requiring N forward passes to generate an N-token sentence. Fortunately, each iteration doesn’t need to recompute self-attention over the full sentence — previously calculated tokens are cached, therefore we only need to attend the newly generated token to the cached context.

        + +

        chart

        + +

        This results in a unique attention pattern where a short query sequence (1 token) attends to a long key-value cache (context length up to 128k). Traditional optimizations for square attention kernels (q_len ≈ kv_len) don’t directly apply here. This pattern poses new challenges for GPU memory utilization and occupancy. We build a dedicated FlexDecoding backend optimized for long-context LLM inference incorporating decoding-specific techniques from FlashDecoding.

        + +

        FlexDecoding is implemented as an alternative backend for the torch.nn.attention.flex_attention operator. flex_attention automatically switches to the FlexDecoding backend for its JIT compilation when given a short query and a long KV cache. If the input shape changes significantly, for example transitioning from the prefill phase to decoding, JIT recompilation generates a separate kernel for each scenario.

        + +
        flex_attention = torch.compile(flex_attention)
        +
+k_cache = torch.randn(B, H, 16384, D)
+v_cache = torch.randn(B, H, 16384, D)
        +
        +...
        +
        +# Prefill Phase: query shape = [B, H, 8000, D]
        +flex_attention(q_prefill, k_cache, v_cache, ...) # Uses FlexAttention backend optimized for prefill & training
        +
        +# Decoding Phase: q_last_token shape = [B, H, 1, D]
+flex_attention(q_last_token, k_cache, v_cache, ...) # Recompiles with the FlexDecoding backend
        +
        +# decode 2 tokens at the same time: q_last_2_tokens shape = [B, H, 2, D]
        +flex_attention(q_last_2_tokens, k_cache, v_cache, ...) # No recompilation needed! Runs the decoding kernel again.
        +
        + +

        Working with KV Cache

        + +

        One of the key optimizations for efficient inference is maintaining a preallocated KV cache that updates in place as new tokens are generated. Instead of enforcing a specific KV cache policy with a dedicated API, FlexDecoding allows users to define and manage the KV cache themselves.

        + +
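As a concrete illustration of what such user-managed caching can look like, the sketch below preallocates cache tensors up to a maximum length and writes new keys/values into them in place at each step. The names, shapes, and helper function are hypothetical, not part of the FlexDecoding API.

import torch

# Hypothetical sizes: batch, heads, maximum context length, head dim
B, H, MAX_SEQ_LEN, D = 2, 8, 16384, 128
k_cache = torch.zeros(B, H, MAX_SEQ_LEN, D, device="cuda")
v_cache = torch.zeros(B, H, MAX_SEQ_LEN, D, device="cuda")

def update_kv_cache(pos, k_new, v_new):
    # Write the keys/values for the newly generated token(s) in place,
    # so the existing cached context is never copied or reallocated.
    n = k_new.shape[2]
    k_cache[:, :, pos : pos + n] = k_new
    v_cache[:, :, pos : pos + n] = v_new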

        Similar to FlexAttention, FlexDecoding takes user-defined mask_mod and score_mod functions. These functions modify attention scores before the softmax operation.

        + +

        chart

        + +
        score_mod(score, b, h, q_idx, kv_idx) -> tensor # return updated score
        +
        + +

score is a scalar PyTorch tensor that represents the dot product of a query token and a key token. The rest of the arguments specify which score is being computed:

        + +
          +
        • b batch index
        • +
        • h attention head index
        • +
        • q_idx token position in query tensor
        • +
        • kv_idx token position in key/value tensor
        • +
        + +

        In the decoding phase, previously calculated tokens are cached, and only the latest generated token (i-th) is used as the query. A naive causal mask on this one token query looks like this:

        + +
        def causal(score, b, h, q_idx, kv_idx):
        +    return torch.where(q_idx >= kv_idx, score, -float("inf"))
        +
        + +

        chart

        + +

This is problematic: the new token “saw” should attend to all previously generated tokens, i.e., “The cat sat on the mat and saw”, not just the first entry in the KV cache. To correct this, the score_mod needs to offset q_idx by i for accurate decoding.

        + +

        chart

        + +

Creating a new score_mod for each token to accommodate the offset is slow, since it means FlexAttention needs to be recompiled every iteration for a different score_mod.

        + +

Instead, we define this offset as a tensor and increment its value at each iteration:

        + +
offset = torch.tensor(i, device="cuda")
        +def causal_w_offset(score, b, h, q_idx, kv_idx):
        +    return torch.where(q_idx + offset >= kv_idx, score, -float("inf"))
        +
        +# Attend the i-th token
        +flex_attention(..., score_mod=causal_w_offset  ) # Compiles the kernel here 
        +...
        +# Attend the i+1-th token
        +offset = offset + 1 # Increment offset
        +flex_attention(..., score_mod=causal_w_offset ) # Doesn't need to recompile! 
        +
        + +

Notably, offset here becomes a captured tensor, so FlexAttention does not need to recompile when its value changes.

        + +

        Manually rewriting your score_mod and mask_mod for offset handling isn’t necessary. We can automate this process with a generic rewriter:

        + +
offset = torch.tensor(i, device="cuda")
        +
        +def get_score_mod_w_offset(score_mod: _score_mod_signature, _offset: tensor):
        +    def _score_mod(score, b, h, q, kv):
        +        return score_mod(score, b, h, q + _offset, kv)
        +    return _score_mod
        +
        +def get_mask_mod_w_offset(mask_mod: _mask_mod_signature, _offset: tensor):
        +    def _mask_mod(b, h, q, kv):
        +        return mask_mod(b, h, q + _offset, kv)
        +    return _mask_mod
        +
        +causal_w_offset = get_score_mod_w_offset(causal, offset)
        +
        + +

        BlockMask for Inference

        + +

We can also use BlockMask with inference to leverage mask sparsity. The idea is to precompute the BlockMask once during model setup and use slices of it during decoding.

        + +

        Precomputing BlockMask

        + +

During setup, we create a square BlockMask of size MAX_SEQ_LEN x MAX_SEQ_LEN:

        + +
        from torch.nn.attention.flex_attention import create_block_mask
        +
        +def causal_mask(b, h, q_idx, kv_idx):
        +    return q_idx >= kv_idx
        +
+block_mask = create_block_mask(causal_mask, B=None, H=None, Q_LEN=MAX_SEQ_LEN, KV_LEN=MAX_SEQ_LEN)
        +
        + +

        chart

        + +

        Using BlockMask During Decoding

        + +

        For the i-th token, we use a slice of the mask:

        + +
        block_offset = i // block_mask.BLOCK_SIZE[0]
        +block_mask_slice = block_mask[:, :, block_offset]
        +
+# don't forget to use the mask_mod with offset!
+block_mask_slice.mask_mod = get_mask_mod_w_offset(causal_mask, offset)
        +
        + +
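The sliced, offset-aware mask can then be passed to the compiled flex_attention call for that decoding step. This is an illustrative call reusing the names from the snippets above, not additional API surface:

# Decode the i-th token with the sliced block mask (illustrative)
out = flex_attention(q_last_token, k_cache, v_cache, block_mask=block_mask_slice)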

        chart

        + +

        Performance

        + +

        chart

        + +

The FlexDecoding kernel performs on par with FlashDecoding (FAKV) and significantly outperforms PyTorch scaled_dot_product_attention (code).

        + +

        chart

        + +

FlexDecoding boosts LLaMa3.1-8B serving performance by 1.22x-2.04x, and LLaMa3.1-70B performance by 0.99x-1.66x, compared to SDPA in gpt-fast (code).

        + +

        Paged Attention

        + +

vLLM is a popular LLM serving engine, powered by efficient memory management via PagedAttention. Existing PagedAttention implementations require dedicated CUDA kernels and offer limited flexibility for supporting emerging attention variants. In this section, we present a PT2-native PagedAttention implementation enabled by FlexAttention and torch.compile.

        + +

PagedAttention scatters the KV cache to reduce memory fragmentation and support higher batch sizes. Without PagedAttention, the KV cache for a request is stored in contiguous memory, requiring two tensors of shape B x H x KV_LEN x D. We call this a logical KV cache. Here, KV_LEN is the maximum sequence length over all requests in a batch. Consider Figure 1(a): KV_LEN is 9, so all requests must be padded to 9 tokens, leading to large memory waste. With PagedAttention, we can chunk each request into multiple pages of size page_size and scatter these pages into a physical KV cache of shape 1 x H x max_seq_len x D, where max_seq_len = n_pages x page_size. This avoids padding requests to the same length and saves memory. Specifically, we provide an assign API to update the KV cache via index computations:

        + +
        def assign(
        +    batch_idx: torch.Tensor,
        +    input_pos: torch.Tensor,
        +    k_val: torch.Tensor,
        +    v_val: torch.Tensor,
        +    k_cache: torch.Tensor,
        +    v_cache: torch.Tensor,
        +) -> None
        +
        + +

        Behind this assign API is a page table, a tensor mapping logical KV cache to physical KV cache:

        + +

        [batch_idx, logical_page_idx] -> physical_page_idx

        + +

assign takes k_val and v_val and scatters them into the physical KV cache, guided by the mapping in the page table.

        + +
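To make the indirection concrete, here is a simplified sketch of the kind of index arithmetic assign performs. The page table contents, shapes, and helper below are hypothetical; the real implementation lives behind the assign API.

import torch

page_size = 128
n_pages, H, D = 64, 8, 128
page_table = torch.randint(0, n_pages, (4, 16))      # [B, max_logical_pages] -> physical page id
k_cache = torch.zeros(1, H, n_pages * page_size, D)  # physical KV cache
v_cache = torch.zeros(1, H, n_pages * page_size, D)

def assign_sketch(batch_idx, input_pos, k_val, v_val, k_cache, v_cache):
    # batch_idx, input_pos: [num_tokens]; k_val, v_val: [num_tokens, H, D]
    logical_page = input_pos // page_size
    page_offset = input_pos % page_size
    physical_page = page_table[batch_idx, logical_page]
    physical_pos = physical_page * page_size + page_offset  # positions in the physical cache
    k_cache[0, :, physical_pos] = k_val.transpose(0, 1)     # scatter new keys
    v_cache[0, :, physical_pos] = v_val.transpose(0, 1)     # scatter new values

# Write three new tokens (two for request 0, one for request 1)
batch_idx = torch.tensor([0, 0, 1])
input_pos = torch.tensor([0, 1, 5])
assign_sketch(batch_idx, input_pos, torch.randn(3, H, D), torch.randn(3, H, D), k_cache, v_cache)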

        chart

        + +

        Paged Attention with Page Table

        + +

A natural question is: how do we integrate PagedAttention with FlexAttention to support diverse attention variants? A naive idea is to materialize the logical KV cache before computing with FlexAttention, but this leads to redundant memory copies and poor performance. Another idea is to build a dedicated CUDA or Triton kernel for paged attention, similar to existing PagedAttention implementations. However, this adds considerable manual effort and code complexity.

        + +

        Instead, we design a fused indirect memory access by converting a logical block mask according to the page table. In FlexAttention, we exploit BlockMask to identify logical blocks and skip redundant computation. While Paged Attention adds an extra layer of indirect memory access, we can further convert the logical block mask to the physical block mask corresponding to the page table, as illustrated in Figure 2. Our PagedAttention implementation provides a convert_logical_block_mask via torch.gather calls:

        + +
        def convert_logical_block_mask(
        +    block_mask: BlockMask,
        +    batch_idx: Optional[torch.Tensor] = None,
        +) -> BlockMask
        +
        + +
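As a rough sketch of the core idea, assuming the block size equals the page size, the logical KV block indices stored in the BlockMask are gathered through the page table to obtain physical block indices (the real convert_logical_block_mask also rewrites the rest of the BlockMask metadata and returns a new BlockMask):

import torch

def gather_physical_indices(kv_indices, page_table):
    # kv_indices: [B, H, num_q_blocks, num_kv_blocks] of logical KV block ids
    # page_table: [B, max_logical_pages] mapping logical page -> physical page
    B = kv_indices.shape[0]
    flat = kv_indices.reshape(B, -1).to(torch.int64)
    physical = torch.gather(page_table, 1, flat)  # one gather per batch element
    return physical.reshape(kv_indices.shape)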

        chart

        + +

        Paged Attention via Block Mask Conversion

        + +

One remaining question is how to rewrite user-specified mask_mod and score_mod functions for PagedAttention. Users write these modifications with logical indices, without knowledge of the page table maintained at runtime. The following code shows an automated runtime conversion that rewrites user-specified modifications to work with physical KV indices. The new_mask_mod takes the physical_kv_idx, converts it back to the logical_kv_idx, and applies the user-specified mask_mod on the logical_kv_idx to obtain the correct mask. For efficiency, we maintain physical_to_logical, a mapping from physical_kv_block to logical_kv_block, to facilitate the conversion. For correctness, we mask out-of-boundary blocks as False with a torch.where call: after batching logical KV caches from multiple requests into the same physical KV cache, there are many more physical blocks than logical blocks for each request, so a physical block may not have a corresponding logical block for a specific request during block mask conversion. Masking these as False ensures that data from different requests do not interfere with each other. Similarly, we can convert the score_mod automatically.

        + +
        def get_mask_mod(mask_mod: Optional[_mask_mod_signature]) -> _mask_mod_signature:
        +    if mask_mod is None:
        +        mask_mod = noop_mask
        +
        +    def new_mask_mod(
        +        b: torch.Tensor,
        +        h: torch.Tensor,
        +        q_idx: torch.Tensor,
        +        physical_kv_idx: torch.Tensor,
        +    ):
        +        physical_kv_block = physical_kv_idx // page_size
        +        physical_kv_offset = physical_kv_idx % page_size
        +        logical_block_idx = physical_to_logical[b, physical_kv_block]
        +        logical_kv_idx = logical_block_idx * page_size + physical_kv_offset
        +        return torch.where(
        +            logical_block_idx >= 0, mask_mod(b, h, q_idx, logical_kv_idx), False
        +        )
        +
        +    return new_mask_mod
        +
        + +

Figure 3 demonstrates the latency of Paged Attention (code). Overall, there is less than 5% overhead from FlexAttention with Paged Attention compared with FlexAttention alone. We also observe on-par performance with Flash Attention v2. A minimal serving example further shows that PagedAttention can support a 76x higher batch size when evaluating on the OpenOrca dataset, which includes 1M GPT-4 completions and 3.2M GPT-3.5 completions.

        + +

        chart

        + +

        Paged Attention: Latency under diverse sequence length

        + +

        Ragged input sequences with Nested Jagged Tensors (NJTs)

        + +

        FlexAttention now supports ragged-sized input sequences through the use of Nested Jagged Tensors (NJTs). NJTs represent ragged-sized sequences by packing sequences into a single “stacked sequence” and maintaining a set of offsets delimiting sequence boundaries for each batch item.

        + +
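For instance, with the jagged layout both the packed values and the per-sequence offsets are exposed directly; a small illustration (shapes chosen arbitrarily):

import torch

nt = torch.nested.nested_tensor(
    [torch.randn(3, 16), torch.randn(5, 16)], layout=torch.jagged
)
print(nt.offsets())       # tensor([0, 3, 8]) -- boundaries of the two packed sequences
print(nt.values().shape)  # torch.Size([8, 16]) -- the single packed "stacked sequence"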

        A block mask can be created for input NJTs through the new create_nested_block_mask() API. The returned block mask is compatible with the ragged structure of the given NJT, treating it as a single “stacked sequence” with inter-sequence attention automatically masked out. The mask_mod or score_mod function can be written as usual.

        + +
        from torch.nn.attention.flex_attention import create_nested_block_mask, flex_attention
        +
        +BATCH = 8
        +NUM_HEADS = 8
        +D = 16
        +device = "cuda"
        +
        +# Input NJTs of shape (BATCH, SEQ_LEN*, D) with ragged SEQ_LEN
        +sequence_lengths = [torch.randint(5, 30, ()).item() for _ in range(BATCH)]
        +query = torch.nested.nested_tensor([
        +    torch.randn(seq_len, NUM_HEADS * D, device=device)
        +    for seq_len in sequence_lengths
        +], layout=torch.jagged)
        +key = torch.randn_like(query)
        +value = torch.randn_like(query)
        +
        +# View as shape (BATCH, NUM_HEADS, SEQ_LEN*, HEAD_DIM)
        +query = query.unflatten(-1, [NUM_HEADS, D]).transpose(1, 2)
        +key = key.unflatten(-1, [NUM_HEADS, D]).transpose(1, 2)
        +value = value.unflatten(-1, [NUM_HEADS, D]).transpose(1, 2)
        +
        +# Simple causal mask
        +def my_mask_mod(b, h, q_idx, kv_idx):
        +    return q_idx >= kv_idx
        +
        +# Construct a block mask using the ragged structure of the
        +# specified query NJT. Ragged-sized sequences are treated as a single
        +# "stacked sequence" with inter-sequence attention masked out.
        +block_mask = create_nested_block_mask(my_mask_mod, 1, 1, query)
        +
        +# For cross attention, create_nested_block_mask() also supports a
        +# rectangular block mask using the ragged structures of both query / key.
        +#block_mask = create_nested_block_mask(my_mask_mod, 1, 1, query, key)
        +
        +output = flex_attention(query, key, value, block_mask=block_mask)
        +
        + +

        Trainable Biases

        + +

        FlexAttention now supports trainable parameters in score_mod functions. This feature enables users to reference tensors that require gradients within their score_mod implementations, with gradients automatically backpropagating through these parameters during training.

        + +

        Memory-Efficient Gradient Accumulation

        + +

        Instead of materializing the full attention scores matrix, FlexAttention uses atomic additions (tl.atomic_add) to accumulate gradients. This approach significantly reduces memory usage at the cost of introducing some non-determinism in gradient calculations.

        + +

        Handling Broadcasted Operations

        + +

        Broadcasting operations in the forward pass (e.g., score + bias[h]) require special consideration in the backward pass. When broadcasting a tensor across multiple attention scores within a head or other dimensions, we need to reduce these gradients back to the original tensor shape. Rather than materializing the full attention score matrix to perform this reduction, we use atomic operations. While this incurs some runtime overhead, it allows us to maintain memory efficiency by avoiding the materialization of large intermediate tensors.

        + +

        Current Limitations

        + +

        The implementation currently allows only a single read from each input tensor in the score_mod function. For example, bias[q_idx] + bias[kv_idx] would not be supported as it reads from the same tensor twice. We hope to remove this restriction in the future.

        + +

        Simple Example:

        + +
        bias = torch.randn(num_heads, requires_grad=True)
        +def score_mod(score, b, h, q_idx, kv_idx):
        +    return score + bias[h]  
        +
        + +
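Gradients flow back into the bias through the attention call just like any other parameter; an illustrative end-to-end use, assuming query/key/value tensors shaped [B, num_heads, S, D]:

# Illustrative: backprop through flex_attention into the trainable bias
out = flex_attention(query, key, value, score_mod=score_mod)
out.sum().backward()
print(bias.grad.shape)  # torch.Size([num_heads])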

        Performance Tuning for FlexAttention

        + +

        TL;DR

        + +

        For optimal performance, compile FlexAttention using max-autotune, especially when dealing with complex score_mods and mask_mods:

        + +

flex_attention = torch.compile(flex_attention, dynamic=True, mode="max-autotune")

        + +

        What is max-autotune?

        + +

max-autotune is a torch.compile mode in which TorchInductor sweeps many kernel parameters (e.g., tile size, num_stages) and selects the best-performing configuration. Configurations that fail to compile or run are simply discarded, so the sweep safely converges on the best viable configuration.

        + +

        While compilation takes longer with max-autotune, the optimal configuration is cached for future kernel executions.

        + +

        Here’s an example of FlexAttention compiled with max-autotune:

        + +
        triton_flex_attention_backward_7 0.2528 ms 100.0% BLOCKS_ARE_CONTIGUOUS=False, BLOCK_M1=32, BLOCK_M2=32, BLOCK_N1=32, BLOCK_N2=32, FLOAT32_PRECISION="'ieee'", GQA_SHARED_HEADS=7, HAS_FULL_BLOCKS=False, IS_DIVISIBLE=False, OUTPUT_LOGSUMEXP=True, PRESCALE_QK=False, QK_HEAD_DIM=128, ROWS_GUARANTEED_SAFE=False, SM_SCALE=0.08838834764831843, SPARSE_KV_BLOCK_SIZE=1073741824, SPARSE_Q_BLOCK_SIZE=1073741824, V_HEAD_DIM=128, num_stages=4, num_warps=4
        +
        + +

        Why Use max-autotune for FlexAttention?

        + +

The amount of shared memory utilized in FlexAttention depends on the score_mod and mask_mod functions. This variability means that the preconfigured default kernel parameters may lead to performance cliffs or even out-of-shared-memory errors on certain hardware for some masks/mods.

        + +

        For instance, with document masks, default configurations can halve GPU occupancy, reducing performance to ~75% of its potential on some GPUs. To avoid such issues, we strongly recommend enabling max-autotune.

        + +

        Updates and Enhancements

        + +
          +
        • Now available as a prototype feature in PyTorch 2.5.0
        • +
        • Fixed critical correctness issues, including a bug affecting multiple calls to FlexAttention within the same call to torch.compile
        • +
        + +

        Expanded Architecture Support

        + +
          +
        • Arbitrary sequence length support - no longer requires multiples of 128
        • +
        • Added native grouped-query attention (GQA) support via is_gqa=True
        • +
        • Enhanced dimension flexibility: +
            +
          • Different QK and V head dimensions
          • +
          • Non-power-of-two head dimensions
          • +
          +
        • +
        • Trainable attention biases (prototype)
        • +
        + +

        Under the Hood

        + +
          +
        • New fused CPU backend
        • +
        • Improved TF32 handling for float32 inputs
        • +
        • Resolved various dynamic shape issues
        • +
        • Output layout matching query strides
        • +
        + +

        These updates make FlexAttention more robust and flexible while maintaining its core promise of combining PyTorch’s ease of use with FlashAttention’s performance benefits.

        + +
diff --git a/blog/flexattention/index.html b/blog/flexattention/index.html
new file mode 100644
index 000000000000..5646c1acb95f
--- /dev/null
+++ b/blog/flexattention/index.html
@@ -0,0 +1,1103 @@

FlexAttention: The Flexibility of PyTorch with the Performance of FlashAttention | PyTorch

by Team PyTorch: Driss Guessous, Yanbo Liang, Joy Dong, Horace He

        a cartoon chart flexing his muscles

        + +

        In theory, Attention is All You Need. In practice, however, we also need optimized attention implementations like FlashAttention.

        + +

        Although these fused attention implementations have substantially improved performance and enabled long contexts, this efficiency has come with a loss of flexibility. You can no longer try out a new attention variant by writing a few PyTorch operators - you often need to write a new custom kernel! This operates as a sort of “software lottery” for ML researchers - if your attention variant doesn’t fit into one of the existing optimized kernels, you’re doomed to slow runtime and CUDA OOMs.

        + +

        For some examples of attention variants, we have Causal, Relative Positional Embeddings, Alibi, Sliding Window Attention, PrefixLM, Document Masking/Sample Packing/Jagged Tensors, Tanh Soft-Capping, PagedAttention, etc. Even worse, folks often want combinations of these! Sliding Window Attention + Document Masking + Causal + Context Parallelism? Or what about PagedAttention + Sliding Window + Tanh Soft-Capping?

        + +

        The left picture below represents the state of the world today - some combinations of masking + biases + setting have existing kernels implemented. But the various options lead to an exponential number of settings, and so overall we end up with fairly spotty support. Even worse, new attention variants researchers come up with will have zero support.

        + +

        Attention variant support diagram

        + +

        To solve this hypercube problem once and for all, we introduce FlexAttention, a new PyTorch API.

        + +
          +
        1. We provide a flexible API that allows implementing many attention variants (including all the ones mentioned in the blog post so far) in a few lines of idiomatic PyTorch code.
        2. +
        3. We lower this into a fused FlashAttention kernel through torch.compile, generating a FlashAttention kernel that doesn’t materialize any extra memory and has performance competitive with handwritten ones.
        4. +
        5. We also automatically generate the backwards pass, leveraging PyTorch’s autograd machinery.
        6. +
        7. Finally, we can also take advantage of sparsity in the attention mask, resulting in significant improvements over standard attention implementations.
        8. +
        + +

        With FlexAttention, we hope that trying new attention variants will only be limited by your imagination.

        + +

        You can find many FlexAttention examples at the Attention Gym: https://github.com/pytorch-labs/attention-gym. If you have any cool applications, feel free to submit an example!

        + +

        PS: We also find this API very exciting since it leverages a lot of existing PyTorch infra in a fun way - more on that in the end.

        + +

        FlexAttention

        + +

        Here is the classic attention equation:

        + +

        math equation

        + +

        In code form:

        + +
        Q, K, V: Tensor[batch_size, num_heads, sequence_length, head_dim]
        +score: Tensor[batch_size, num_heads, sequence_length, sequence_length] = (Q @ K) / sqrt(head_dim)
        +probabilities = softmax(score, dim=-1)
        +output: Tensor[batch_size, num_heads, sequence_length, head_dim] = probabilities @ V
        +
        + +

FlexAttention allows for a user-defined function score_mod:

        + +

        math equation

        + +

        In code form:

        + +
        Q, K, V: Tensor[batch_size, num_heads, sequence_length, head_dim]
        +score: Tensor[batch_size, num_heads, sequence_length, sequence_length] = (Q @ K) / sqrt(head_dim)
        +modified_scores: Tensor[batch_size, num_heads, sequence_length, sequence_length] = score_mod(score)
        +probabilities = softmax(modified_scores, dim=-1)
        +output: Tensor[batch_size, num_heads, sequence_length, head_dim] = probabilities @ V
        +
        + +

        This function allows you to modify the attention scores prior to softmax. Surprisingly, this ends up being sufficient for the vast majority of attention variants (examples below)!

        + +

        Concretely, the expected signature for score_mod is somewhat unique.

        + +
def score_mod(score: f32[], b: i32[], h: i32[], q_idx: i32[], kv_idx: i32[]):
        +    return score # noop - standard attention
        +
        + +

        In other words, score is a scalar pytorch tensor that represents the dot product of a query token and a key token. The rest of the arguments tell you which dot product you’re currently computing - b (current element in batch), h (current head), q_idx (position in query), kv_idx (position in key/value tensors).

        + +

        To apply this function, we could implement it as

        + +
        for b in range(batch_size):
        +    for h in range(num_heads):
        +        for q_idx in range(sequence_length):
        +            for kv_idx in range(sequence_length):
        +                modified_scores[b, h, q_idx, kv_idx] = score_mod(scores[b, h, q_idx, kv_idx], b, h, q_idx, kv_idx)
        +
        + +

        Of course, this is not how FlexAttention is implemented under the hood. Leveraging torch.compile, we automatically lower your function into a single fused FlexAttention kernel - guaranteed or your money back!

        + +

        This API ends up being surprisingly expressive. Let’s look at some examples.

        + +

        Score Mod Examples

        + +

        Full Attention

        + +

Let’s first do “full attention”, or standard bidirectional attention. In this case, score_mod is a no-op - it takes the scores as input and returns them as is.

        + +
        def noop(score, b, h, q_idx, kv_idx):
        +    return score
        +
        + +

        And to use it end to end (including both forwards and backwards):

        + +
        from torch.nn.attention.flex_attention import flex_attention
        +
        +flex_attention(query, key, value, score_mod=noop).sum().backward()
        +
        + +

        Relative Position Encodings

        + +

        One common attention variant is the “relative position encoding”. Instead of encoding the absolute distance in the queries and keys, relative position encoding adjusts scores based on the “distance” between the queries and keys.

        + +
        def relative_positional(score, b, h, q_idx, kv_idx):
        +    return score + (q_idx - kv_idx)
        +
        + +

        Note that unlike typical implementations, this does not need to materialize a SxS tensor. Instead, FlexAttention computes the bias values “on the fly” within the kernel, leading to significant memory and performance improvements.

        + +

        relative position encoding

        + +

        ALiBi Bias

        + +

        alibi bias

        +

        Source: Train Short, Test Long: Attention with Linear Biases Enables Input Length Extrapolation

        + +

        ALiBi was introduced in Train Short, Test Long: Attention with Linear Biases Enables Input Length Extrapolation, and claims to have beneficial properties for length extrapolation at inference. Notably, MosaicML has pointed to “lack of kernel support” as the main reason why they eventually switched from ALiBi to rotary embeddings.

        + +

        Alibi is similar to relative positional encodings with one exception - it has a per-head factor that is typically precomputed.

        + +
        alibi_bias = generate_alibi_bias() # [num_heads]
        +
        +def alibi(score, b, h, q_idx, kv_idx):
        +    bias = alibi_bias[h] * (kv_idx - q_idx)
        +    return score + bias
        +
        + +

        This demonstrates one interesting piece of flexibility torch.compile provides - we can load from alibi_bias even though it wasn’t explicitly passed in as an input! The generated Triton kernel will calculate the correct loads from the alibi_bias tensor and fuse it. Note that you could regenerate alibi_bias and we still wouldn’t need to recompile.

        + +
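generate_alibi_bias() is not spelled out in this post; one common way to construct the per-head ALiBi slopes (a geometric sequence, as described in the ALiBi paper) is sketched below as an assumption, not the exact helper used here:

import torch

def generate_alibi_bias(num_heads: int = 8) -> torch.Tensor:
    # Slopes 2^(-8/num_heads), 2^(-16/num_heads), ..., one per head
    exponents = torch.arange(1, num_heads + 1, dtype=torch.float32)
    return 2.0 ** (-8.0 * exponents / num_heads)

alibi_bias = generate_alibi_bias(8)  # shape: [num_heads]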

        Soft-capping

        + +

        Soft-capping is a technique used in Gemma2 and Grok-1 that prevents logits from growing excessively large. In FlexAttention, it looks like:

        + +
        softcap = 20
        +def soft_cap(score, b, h, q_idx, kv_idx):
        +    score = score / softcap
        +    score = torch.tanh(score)
        +    score = score * softcap
        +    return score
        +
        + +

        Note that we also automatically generate the backwards pass from the forwards pass here. Also, although this implementation is semantically correct, we likely want to use a tanh approximation in this case for performance reasons. See attention-gym for more details.

        + +

        Causal Mask

        + +

        Although bidirectional attention is the simplest, the original Attention is All You Need paper and the vast majority of LLMs use attention in a decoder-only setting where each token can only attend to the tokens prior to it. Folks often think of this as a lower-triangular mask, but with the score_mod API it can be expressed as:

        + +
        def causal_mask(score, b, h, q_idx, kv_idx):
        +    return torch.where(q_idx >= kv_idx, score, -float("inf"))
        +
        + +

        Basically, if the query token is “after” the key token, we keep the score. Otherwise, we mask it out by setting it to -inf, thus ensuring it won’t participate in the softmax calculation.

        + +

        However, masking is special compared to other modifications - if something is masked out, we can completely skip its computation! In this case, a causal mask has about 50% sparsity, so not taking advantage of the sparsity would result in a 2x slowdown. Although this score_mod is sufficient to implement causal masking correctly, getting the performance benefits of sparsity requires another concept - mask_mod.

        + +

        Mask Mods

        + +

        To take advantage of sparsity from masking, we need to do some more work. Specifically, by passing a mask_mod to create_block_mask, we can create a BlockMask. FlexAttention can then use BlockMask to take advantage of the sparsity!

        + +

        The signature of mask_mod is very similar to score_mod - just without the score. In particular

        + +
        # returns True if this position should participate in the computation
        +mask_mod(b, h, q_idx, kv_idx) => bool
        +
        + +

        Note that score_mod is strictly more expressive than mask_mod. However, for masking, it’s recommended to use mask_mod and create_block_mask, as it’s more performant. See the FAQ on why score_mod and mask_mod are separate.

        + +

        Now, let’s take a look at how we might implement causal mask with mask_mod.

        + +

        Causal Mask

        + +
        from torch.nn.attention.flex_attention import create_block_mask
        +
        +def causal(b, h, q_idx, kv_idx):
        +    return q_idx >= kv_idx
        +
        +# Because the sparsity pattern is independent of batch and heads, we'll set them to None (which broadcasts them) 
        +block_mask = create_block_mask(causal, B=None, H=None, Q_LEN=1024, KV_LEN=1024)
        +# In this case, we don't need a score_mod, so we won't pass any in.
        +# However, score_mod can still be combined with block_mask if you need the additional flexibility.
        +flex_attention(query, key, value, block_mask=block_mask)
        +
        + +

        Note that create_block_mask is a relatively expensive operation! Although FlexAttention will not need to recompile when it changes, if you aren’t careful about caching it, it can lead to significant slowdowns (check out the FAQ for suggestions on best practices).

        + +

        flexattention performance charts

        + +

        While the TFlops are roughly the same, the execution time is 2x faster for the mask_mod version! This demonstrates that we can leverage the sparsity that BlockMask provides us without losing hardware efficiency.

        + +

        Sliding Window + Causal

        + +

        Sliding Window Causal diagrams

        +

        Source: Mistral 7B

        + +

        Popularized by Mistral, sliding window attention (also known as local attention) takes advantage of the intuition that the most recent tokens are the most useful. In particular, it allows the query token to only attend to, say, the 1024 most recent tokens. This is often used together with causal attention.

        + +
        SLIDING_WINDOW = 1024
        +
        +def sliding_window_causal(b, h, q_idx, kv_idx):
        +    causal_mask = q_idx >= kv_idx
        +    window_mask = q_idx - kv_idx <= SLIDING_WINDOW 
        +    return causal_mask & window_mask
        +
+# If you want to be cute...
+from torch.nn.attention.flex_attention import and_masks
+
+def sliding_window(b, h, q_idx, kv_idx):
+    return q_idx - kv_idx <= SLIDING_WINDOW
+
+sliding_window_causal = and_masks(causal_mask, sliding_window)
        +
        + +

        We benchmark it against F.scaled_dot_product_attention with a sliding window mask as well as FA2 with a causal mask (as a reference point for performance). Not only are we significantly faster than F.scaled_dot_product_attention, we’re also significantly faster than FA2 with a causal mask as this mask has significantly more sparsity.

        + +

        execution time charts

        + +

        PrefixLM

        + +

        PrefixLM diagram

        +

        Source: PaliGemma: A versatile 3B VLM for transfer

        + +

        The T5 architecture, proposed in Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer, describes an attention variant that performs full bidirectional attention on a “prefix”, and causal attention on the rest. We again compose two mask functions to accomplish this, one for causal masking and one that is based off of the prefix length.

        + +
        prefix_length: [B]
        +def prefix_mask(b, h, q_idx, kv_idx):
        +    return kv_idx <= prefix_length[b]
        +
        +prefix_lm_causal = or_masks(prefix_mask, causal_mask)
        +# In this case, our mask is different per sequence so we set B equal to our batch size
+block_mask = create_block_mask(prefix_lm_causal, B=B, H=None, Q_LEN=S, KV_LEN=S)
        +
        + +

        Just like with score_mod, mask_mod allows us to refer to additional tensors that aren’t explicitly an input to the function! However, with prefixLM, the sparsity pattern changes per input. This means that for each new input batch, we’ll need to recompute the BlockMask. One common pattern is to call create_block_mask at the beginning of your model and reuse that block_mask for all attention calls in your model. See Recomputing Block Masks vs. Recompilation.

        + +

        However, in exchange for that, we’re not only able to have an efficient attention kernel for prefixLM, we’re also able to take advantage of however much sparsity exists in the input! FlexAttention will dynamically adjust its performance based off of the BlockMask data, without needing to recompile the kernel.

        + +

        Document Masking/Jagged Sequences

        + +

        Another common attention variant is document masking/jagged sequences. Imagine that you have a number of sequences of varying length. You want to train on all of them together, but unfortunately, most operators only accept rectangular tensors.

        + +

        Through BlockMask, we can support this efficiently in FlexAttention as well!

        + +
          +
        1. First, we flatten all sequences into a single sequence with sum(sequence lengths) tokens.
        2. +
        3. Then, we compute the document_id that each token belongs to.
        4. +
5. Finally, in our mask_mod, we simply check whether the query and kv token belong to the same document!
        6. +
        + +
        # The document that each token belongs to.
        +# e.g. [0, 0, 0, 1, 1, 2, 2, 2, 2, 2, 2] corresponds to sequence lengths 3, 2, and 6.
        +document_id: [SEQ_LEN]
        +
        +def document_masking(b, h, q_idx, kv_idx):
        +    return document_id[q_idx] == document_id[kv_idx]
        +
        + +
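A convenient way to build such a document_id tensor from per-sequence lengths (a small illustration, not part of the API):

import torch

lengths = torch.tensor([3, 2, 6])
document_id = torch.repeat_interleave(torch.arange(len(lengths)), lengths)
# tensor([0, 0, 0, 1, 1, 2, 2, 2, 2, 2, 2])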

And that’s it! In this case, we see that we end up with a block-diagonal mask.

        + +

        blockdiagonal mask

        + +

One interesting aspect about document masking is that it’s easy to see how it might combine with an arbitrary combination of other masks. For example, we already defined prefix_lm_causal in the previous section. Do we now need to define a prefixlm_document_mask function as well?

        + +

        In these cases, one pattern we’ve found quite useful is what we call a “higher level modification”. In this case, we can take an existing mask_mod and automatically transform it into one that works with jagged sequences!

        + +
        def generate_doc_mask_mod(mask_mod, document_id):
        +    # Get unique document IDs and their counts
        +    _, counts = torch.unique_consecutive(document_id, return_counts=True)
        +    # Create cumulative counts (offsets)
        +    offsets = torch.cat([torch.tensor([0], device=document_id.device), counts.cumsum(0)[:-1]])
        +    def doc_mask_wrapper(b, h, q_idx, kv_idx):
        +        same_doc = document_id[q_idx] == document_id[kv_idx]
        +        q_logical = q_idx - offsets[document_id[q_idx]]
        +        kv_logical = kv_idx - offsets[document_id[kv_idx]]
        +        inner_mask = mask_mod(b, h, q_logical, kv_logical)
        +        return same_doc & inner_mask
        +    return doc_mask_wrapper
        +
        + +

For example, given the prefix_lm_causal mask from above, we can transform it into one that works on packed documents like so:

        + +
        prefix_length = torch.tensor(2, dtype=torch.int32, device="cuda")
        +def prefix_mask(b, h, q_idx, kv_idx):
        +    return kv_idx < prefix_length
        +prefix_lm_causal = or_masks(prefix_mask, causal_mask)
        +doc_prefix_lm_causal_mask = generate_doc_mask_mod(prefix_lm_causal, document_id)
        +
        + +

        blockdiagonal mask

        + +

        Now, this mask is “block-prefixLM-diagonal” shaped. :)

        + +

        That’s all of our examples! There are far more attention variants than we have space to list, so check out Attention Gym for more examples. We hope that the community will contribute some of their favorite applications of FlexAttention as well.

        + +

        FAQ

        + +
        Q: When does FlexAttention need to recompile?
        + +

        As FlexAttention leverages torch.compile for graph capture, it can actually avoid recompilation in a broad spectrum of cases. Notably, it does not need to recompile even if captured tensors change values!

        + +
        flex_attention = torch.compile(flex_attention)
+def create_bias_mod(bias):
        +    def bias_mod(score, b, h, q_idx, kv_idx):
        +        return score + bias
        +    return bias_mod
        +bias_mod1 = create_bias_mod(torch.tensor(0))
        +flex_attention(..., score_mod=bias_mod1) # Compiles the kernel here 
        +
        +bias_mod2 = create_bias_mod(torch.tensor(2))
        +flex_attention(..., score_mod=bias_mod2) # Doesn't need to recompile! 
        +
        + +

        Even changing the block-sparsity doesn’t require a recompile. However, if the block-sparsity changes, we do need to recompute the BlockMask.

        + +
        Q: When should we recompute the BlockMask?
        + +

        We need to recompute the BlockMask whenever the block-sparsity changes. Although computing the BlockMask is much cheaper than recompilation (on the order of hundreds of microseconds as opposed to seconds), you should still take care to not excessively recompute the BlockMask.

        + +

        Here are some common patterns and some recommendations on how you might approach them.

        + +

        Mask never changes (e.g. causal mask)
        +In this case, you can simply precompute the block mask and cache it globally, reusing it for all attention calls.

        + +
        block_mask = create_block_mask(causal_mask, 1, 1, S,S)
        +causal_attention = functools.partial(flex_attention, block_mask=block_mask)
        +
        + +

        Mask changes every batch (e.g. document masking)
        +In this case, we would suggest computing the BlockMask at the beginning of the model and threading it through the model - reusing the BlockMask for all layers.

        + +
        def forward(self, x, doc_mask):
        +    # Compute block mask at beginning of forwards
        +    block_mask = create_block_mask(doc_mask, None, None, S, S)    
        +    x = self.layer1(x, block_mask)
        +    x = self.layer2(x, block_mask)
        +    ...
        +    # amortize block mask construction cost across all layers
        +    x = self.layer3(x, block_mask) 
        +    return x
        +
        + +

        Mask changes every layer (e.g. data-dependent sparsity)
        +This is the hardest setting, since we’re unable to amortize the block mask computation across multiple FlexAttention invocations. Although FlexAttention can certainly still benefit this case, the actual benefits from BlockMask depend on how sparse your attention mask is and how fast we can construct the BlockMask. That leads us to…

        + +
        Q: How can we compute BlockMask quicker?
        + +

        create_block_mask is unfortunately fairly expensive, both from a memory and compute perspective, as determining whether a block is completely sparse requires evaluating mask_mod at every single point in the block. There are a couple ways to address this:

        + +
          +
        1. If your mask is the same across batch size or heads, make sure that you’re broadcasting over those (i.e. set them to None in create_block_mask).
        2. +
        3. Compile create_block_mask. Unfortunately, today, torch.compile does not work directly on create_block_mask due to some unfortunate limitations. However, you can set _compile=True, which will significantly reduce the peak memory and runtime (often an order of magnitude in our testing).
        4. +
        5. +

Write a custom constructor for BlockMask. The metadata for BlockMask is quite simple (see the documentation). It’s essentially two tensors:
a. num_blocks: The number of KV blocks computed for each query block.
b. indices: The positions of the KV blocks computed for each query block.

          + +

          For example, here’s a custom BlockMask constructor for causal_mask.

          +
        6. +
        + +
        def create_causal_mask(S):
        +    BLOCK_SIZE = 128
        +    # The first query block computes one block, the second query block computes 2 blocks, etc.
        +    num_blocks = torch.arange(S // BLOCK_SIZE, device="cuda") + 1
        +    # Since we're always computing from the left to the right,
        +    # we can use the indices [0, 1, 2, ...] for every query block.
        +    indices = torch.arange(S // BLOCK_SIZE, device="cuda").expand(
        +        S // BLOCK_SIZE, S // BLOCK_SIZE
        +    )
        +    num_blocks = num_blocks[None, None, :]
        +    indices = indices[None, None, :]
        +    return BlockMask(num_blocks, indices, BLOCK_SIZE=BLOCK_SIZE, mask_mod=causal_mask)
        +
        + +
        Q: Why are score_mod and mask_mod different? Isn’t mask_mod just a special case of score_mod?
        + +

        Very astute question, hypothetical audience member! In fact, any mask_mod can be easily converted to a score_mod (we do not recommend using this function in practice!)

        + +
def mask_mod_as_score_mod(score, b, h, q_idx, kv_idx):
+    return torch.where(mask_mod(b, h, q_idx, kv_idx), score, -float("inf"))
        +
        + +

        So, if score_mod can implement everything mask_mod can, what’s the point of having mask_mod?

        + +

        One immediate challenge: a score_mod requires the actual score value as an input, but when we’re precomputing the BlockMask, we don’t have the actual score value. We can perhaps fake the values by passing in all zeros, and if the score_mod returns -inf, then we consider it to be masked (in fact, we originally did this!).

        + +

However, there are two issues. The first is that this is hacky - what if the user’s score_mod returned -inf when the input is 0? Or what if the user’s score_mod masked out with a large negative value instead of -inf? It seems we’re trying to cram a round peg into a square hole. However, there’s a more important reason to separate out mask_mod from score_mod - it’s fundamentally more efficient!

        + +

        As it turns out, applying masking to every single computed element is actually quite expensive - our benchmarks see about a 15-20% degradation in performance! So, although we can get significant speedups by skipping half the computation, we lose a meaningful part of that speedup from needing to mask out every element!

        + +

        Luckily, if we visualize the causal mask, we notice that the vast majority of blocks do not require a “causal mask” at all - they’re fully computed! It is only the blocks on the diagonal, partially computed and partially masked, that require masking to be applied.

        + +

        blockdiagonal mask

        + +

        The BlockMask previously told us which blocks we need to compute and which blocks we can skip. Now, we further augment this data structure to also tell us which blocks are “fully computed” (i.e. masking can be skipped) vs. “partially computed” (i.e. a mask needs to be applied). Note, however, that although masks can be skipped on “fully computed” blocks, other score_mods like relative positional embeddings still need to be applied.

        + +

        Given just a score_mod, there’s no sound way for us to tell which parts of it are “masking”. Hence, the user must separate these out themselves into mask_mod.

        + +
        Q: How much additional memory does the BlockMask need?
        + +

        The BlockMask metadata is of size [BATCH_SIZE, NUM_HEADS, QUERY_LEN//BLOCK_SIZE, KV_LEN//BLOCK_SIZE]. If the mask is the same across the batch or heads dimension it can be broadcasted over that dimension to save memory.

        + +

        At the default BLOCK_SIZE of 128, we expect that the memory usage will be fairly negligible for most use cases. For example, for a sequence length of 1 million, the BlockMask would only use 60MB of additional memory. If this is a problem, you can increase the block size: create_block_mask(..., BLOCK_SIZE=1024). For example, increasing BLOCK_SIZE to 1024 would result in this metadata dropping to under a megabyte.

        + +
        Q: How do the numerics compare?
        + +

Although the results are not bitwise identical, we are confident that FlexAttention is as numerically accurate as FlashAttention. We generated the following distribution of differences between FlashAttention and FlexAttention over a large range of inputs, on both causal and non-causal attention variants. The errors are nearly identical.

        + +

        distribution chart

        + +

        Performance

        + +

        Generally speaking, FlexAttention is nearly as performant as a handwritten Triton kernel, which is unsurprising, as we heavily leverage a handwritten Triton kernel. However, due to its generality, we do incur a small performance penalty. For example, we must incur some additional latency to determine which block to compute next. In some cases, we provide some kernel options that can affect the performance of the kernel while changing its behavior. They can be found here: performance knobs

        + +

        As a case study, let’s explore how the knobs affect the performance of causal attention. We will compare performance of the triton kernel versus FlashAttentionv2 on A100. The script can be found here.

        + +

        FlexAttention achieves 90% of FlashAttention2’s performance in the forward pass and 85% in the backward pass. FlexAttention is currently utilizing a deterministic algorithm that recomputes more intermediates than FAv2, but we have plans to improve FlexAttention’s backward algorithm and hope to close this gap!

        + +

        flexattention speed chart

        + +

        flexattention speed chart

        + +

        Conclusion

        + +

        We hope you have as much fun using FlexAttention as we did developing it! While working on this, we ended up finding way more applications of this API than we could have expected. We’ve already seen it accelerate torchtune’s sample packing throughput by 71%, replace the need for a researcher to spend over a week writing their own custom Triton kernel, and deliver competitive performance with custom handwritten attention variants.

        + +

One final thing that made implementing FlexAttention quite fun is that we were able to leverage a lot of existing PyTorch infra in an interesting way. For example, one of the unique aspects of TorchDynamo (torch.compile’s frontend) is that it does not require tensors used in the compiled function to be explicitly passed in as inputs. This allows us to compile mods like document masking, which require accessing global variables whose values need to change between calls!

        + +
        bias = torch.randn(1024, 1024)
        +def score_mod(score, b, h, q_idx, kv_idx):
        +    return score + bias[q_idx][kv_idx] # The bias tensor can change!
        +
        + +

        Furthermore, the fact that torch.compile is a generic graph-capture mechanism also allows it to support more “advanced” transformations, such as the higher order transform that transforms any mask_mod into one that works with jagged tensors.

        + +

        We also leverage TorchInductor (torch.compile’s backend) infrastructure for Triton templates. Not only did this make it easy to support codegening FlexAttention - it also automatically gave us support for dynamic shapes as well as epilogue fusion (i.e. fusing an operator onto the end of attention)! In the future, we plan on extending this support to allow for quantized versions of attention or things like RadixAttention as well.

        + +

        In addition, we also leveraged higher order ops, PyTorch’s autograd to automatically generate the backwards pass, as well as vmap to automatically apply score_mod for creating the BlockMask.

        + +

        And, of course, this project wouldn’t have been possible without Triton and TorchInductor’s ability to generate Triton code.

        + +

We look forward to applying the approach we used here to more applications in the future!

        + +

        Limitations and Future Work

        + +
          +
        • FlexAttention is currently available in PyTorch nightly releases, we plan to release it as a prototype feature in 2.5.0
        • +
        • We did not cover how to use FlexAttention for inference here (or how to implement PagedAttention) - we will cover those in a later post.
        • +
        • We are working to improve the performance of FlexAttention to match FlashAttention3 on H100 GPUs.
        • +
        • FlexAttention requires that all sequence lengths be a multiple of 128 - this will be addressed soon.
        • +
        • We plan on adding GQA support soon - for now, you can just replicate the kv heads.
        • +
        + +

        Acknowledgements

        + +

        We want to highlight some prior work (and people) that have inspired FlexAttention.

        + +
          +
        • Tri Dao’s work on FlashAttention
        • +
        • Francisco Massa and the Xformers team for BlockSparseAttention in Triton
        • +
        • The Jax team’s work on SplashAttention
        • +
        • Philippe Tillet and Keren Zhou for helping us with Triton
        • +
        • Ali Hassani for discussions on neighborhood attention
        • +
        • Everybody who’s complained about attention kernels not supporting their favorite attention variant :)
        • +
        + +
diff --git a/blog/genai-acceleration-intel-xeon/index.html b/blog/genai-acceleration-intel-xeon/index.html
new file mode 100644
index 000000000000..b40e830d3ccb
--- /dev/null
+++ b/blog/genai-acceleration-intel-xeon/index.html
@@ -0,0 +1,837 @@

GenAI Acceleration for PyTorch 2.5 on Intel® Xeon® Processors | PyTorch

by the Intel PyTorch Team

This blog is the fifth in a series focused on accelerating generative AI models with pure, native PyTorch. We demonstrate the GenAI acceleration of GPTFast, Segment Anything Fast, and Diffusion Fast on Intel® Xeon® Processors.

        + +

        First, we revisit GPTFast, a remarkable work that speeds up text generation in under 1000 lines of native PyTorch code. Initially, GPTFast supported only the CUDA backend. We will show you how to run GPTFast on CPU and achieve additional performance speedup with weight-only quantization (WOQ).

        + +

        In Segment Anything Fast, we have incorporated support for the CPU backend and will demonstrate performance acceleration by leveraging the increased power of CPU with BFloat16, torch.compile, and scaled_dot_product_attention (SDPA) with a block-wise attention mask. The speedup ratio against FP32 can reach 2.91x in vit_b and 3.95x in vit_h.

        + +

        Finally, Diffusion Fast now supports the CPU backend and leverages the increased power of CPU with BFloat16, torch.compile, and SDPA. We also optimize the layout propagation rules for convolution, cat, and permute in Inductor CPU to improve performance. The speedup ratio against FP32 can achieve 3.91x in Stable Diffusion XL (SDXL).

        + +

        Optimization strategies to boost performance on PyTorch CPU

        + +

        GPTFast

        + +

Over the past year, generative AI has achieved great success across various language tasks and become increasingly popular. However, generative models face high inference costs due to the memory bandwidth bottlenecks in the auto-regressive decoding process. To address these issues, the PyTorch team published GPTFast, which targets accelerating text generation with only pure, native PyTorch. In under 1000 lines of native PyTorch code, this project runs text generation almost 10x faster than the baseline. Initially, GPTFast supported only the CUDA backend and garnered approximately 5,000 stars in about four months. Inspired by Llama.cpp, the Intel team provided CPU backend support starting with the PyTorch 2.4 release, further enhancing the project’s availability in GPU-free environments. The following are optimization strategies used to boost performance on PyTorch CPU:

        + +
          +
        • +

          Torch.compile

          + +

torch.compile is a PyTorch function introduced in PyTorch 2.0 that aims to solve the problem of accurate graph capturing in PyTorch and ultimately enable software engineers to run their PyTorch programs faster.

          +
        • +
        • +

          Weight-only Quantization

          + +

Weight-only quantization (WOQ) trades off performance against accuracy: the bottleneck of the auto-regressive decoding phase in text generation is the memory bandwidth of loading weights, and WOQ generally yields better accuracy than traditional quantization approaches such as W8A8. GPTFast supports two types of WOQ: W8A16 and W4A16. Specifically, activations are stored in BFloat16 and model weights can be quantized to int8 or int4, as shown in Figure 1 (a minimal sketch follows the figure below).

          +
        • +
        + +

        flow diagram

        + +

        Figure 1. Weight-only Quantization Pattern. Source: Mingfei Ma, Intel
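As a rough illustration of the W4A16 idea above (and not GPTFast's actual quantize.py, which also handles zero points, weight packing, and serialization), per-group symmetric int4 quantization can be sketched as follows; the group size of 32 mirrors the quantization command shown later in this post.

import torch

# Toy per-group symmetric int4 weight quantization (illustration only).
def quantize_int4_per_group(w: torch.Tensor, group_size: int = 32):
    n, k = w.shape                                   # weight matrix [N, K]
    wg = w.reshape(n, k // group_size, group_size)   # split K into groups
    scale = wg.abs().amax(dim=-1, keepdim=True).clamp(min=1e-8) / 7.0  # int4 range is roughly [-8, 7]
    q = torch.clamp(torch.round(wg / scale), -8, 7).to(torch.int8)
    return q.reshape(n, k), scale.squeeze(-1)        # int4-valued weights plus per-group scales

De-quantizing a group is then a single multiply by its scale, which is what the micro kernel described below performs at the register level before the fused multiply-add.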

        + +
          +
        • +

          Weight Prepacking & Micro Kernel Design.

          + +

To maximize throughput, GPTFast allows model weights to be prepacked into hardware-specific layouts for int4 using internal PyTorch ATen APIs. Inspired by Llama.cpp, we prepacked the model weights from [N, K] to [N/kNTileSize, K, kNTileSize/2], with kNTileSize set to 64 on avx512. First, the model weights are blocked along the N dimension, and then the two innermost dimensions are transposed. To minimize de-quantization overhead in kernel computation, we shuffle the 64 data elements on the same row in an interleaved pattern, packing Lane2 & Lane0 together and Lane3 & Lane1 together, as illustrated in Figure 2 (a sketch of the blocking and transpose step follows the figure).

          +
        • +
        + +

        flow diagram

        + +

        Figure 2. Weight Prepacking on Int4. Source: Mingfei Ma, Intel
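The blocking and transpose step can be sketched in a few lines of PyTorch; this is an illustration only and omits the int4 nibble packing (which halves the last dimension to kNTileSize/2) as well as the lane interleaving.

import torch

# Block along N, then transpose the two innermost dimensions (kNTileSize = 64 on avx512).
def prepack_blocks(w: torch.Tensor, tile: int = 64) -> torch.Tensor:
    n, k = w.shape
    assert n % tile == 0
    blocked = w.reshape(n // tile, tile, k)        # [N/kNTileSize, kNTileSize, K]
    return blocked.transpose(-1, -2).contiguous()  # [N/kNTileSize, K, kNTileSize]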

        + +

        During the generation phase, the torch.nn.Linear module will be lowered to be computed with high-performance kernels inside PyTorch ATen, where the quantized weights will be de-quantized first and then accumulated with fused multiply-add (FMA) at the register level, as shown in Figure 3.

        + +

        flow diagram

        + +

        Figure 3. Micro Kernel Design. Source: Mingfei Ma, Intel

        + +

        Segment Anything Fast

        + +

Segment Anything Fast offers a simple and efficient PyTorch native acceleration for the Segment Anything Model (SAM), which is a zero-shot vision model for generating promptable image masks. The following are optimization strategies used to boost performance on PyTorch CPU:

        + +
          +
        • +

          BFloat16

          + +

BFloat16 is a commonly used half-precision type. By using less precision per parameter and activation, we can save significant time and memory in computation.

          +
        • +
        • +

          Torch.compile

          + +

torch.compile is a PyTorch function introduced in PyTorch 2.0 that aims to solve the problem of accurate graph capture in PyTorch and ultimately enable developers to run their PyTorch programs faster.

          +
        • +
        • +

          Scaled Dot Product Attention (SDPA)

          + +

Scaled Dot-Product Attention (SDPA) is a crucial mechanism in transformer models. PyTorch offers a fused implementation that significantly outperforms a naive approach. For Segment Anything Fast, we convert the attention mask from bfloat16 to float32 in a block-wise manner. This method not only reduces peak memory usage, making it ideal for systems with limited memory resources, but also enhances performance (a minimal SDPA call is sketched after this list).

          +
        • +
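For reference, a minimal call to PyTorch's fused SDPA with an explicit additive attention mask looks like the sketch below; the shapes and dtypes are illustrative only, and the block-wise mask handling used in Segment Anything Fast is omitted.

import torch
import torch.nn.functional as F

# Illustrative fused SDPA call with an explicit attention mask.
q = torch.randn(1, 8, 1024, 64, dtype=torch.bfloat16)
k = torch.randn(1, 8, 1024, 64, dtype=torch.bfloat16)
v = torch.randn(1, 8, 1024, 64, dtype=torch.bfloat16)
attn_mask = torch.zeros(1, 8, 1024, 1024, dtype=torch.bfloat16)  # additive mask

out = F.scaled_dot_product_attention(q, k, v, attn_mask=attn_mask)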
        + +

        Diffusion Fast

        + +

        Diffusion Fast offers a simple and efficient PyTorch native acceleration for text-to-image diffusion models. The following are optimization strategies used to boost performance on PyTorch CPU:

        + +
          +
        • +

          BFloat16

          + +

BFloat16 is a commonly used half-precision type. By using less precision per parameter and activation, we can save significant time and memory in computation.

          +
        • +
        • +

          Torch.compile

          + +

torch.compile is a PyTorch function introduced in PyTorch 2.0 that aims to solve the problem of accurate graph capture in PyTorch and ultimately enable software engineers to run their PyTorch programs faster.

          +
        • +
        • +

          Scaled Dot Product Attention (SDPA)

          + +

SDPA is a key mechanism used in transformer models; PyTorch provides a fused implementation that shows large performance benefits over a naive implementation.

          +
        • +
        + +

        Model Usage on Native PyTorch CPU

        + +

        GPTFast

        + +

        To launch WOQ in GPTFast, first quantize the model weights. For example, to quantize with int4 and group size of 32:

        + +
python quantize.py --checkpoint_path checkpoints/$MODEL_REPO/model.pth --mode int4 --groupsize 32
        +
        + +

        Then run generation by passing the int4 checkpoint to generate.py

        + +
        python generate.py --checkpoint_path checkpoints/$MODEL_REPO/model_int4.g32.pth --compile --device $DEVICE
        +
        + +

To use the CPU backend in GPTFast, simply switch the DEVICE variable from cuda to cpu.

        + +

        Segment Anything Fast

        + +
        cd experiments
        +
        +export SEGMENT_ANYTHING_FAST_USE_FLASH_4=0
        +
+python run_experiments.py 16 vit_b <pytorch_github> <segment-anything_github> <path_to_experiments_data> --run-experiments --num-workers 32 --device cpu
+
+python run_experiments.py 16 vit_h <pytorch_github> <segment-anything_github> <path_to_experiments_data> --run-experiments --num-workers 32 --device cpu
        +
        + +

        Diffusion Fast

        + +
        python run_benchmark.py --compile_unet --compile_vae --device=cpu
        +
        + +

        Performance Evaluation

        + +

        GPTFast

        + +

We ran the llama-2-7b-chat model on PyTorch, based on the test branch and the hardware configuration listed under Product and Performance Information below. After applying the following steps, we saw a 3.8x boost compared to the baseline in eager mode:

        + +
          +
• Use torch.compile to automatically fuse elementwise operators (a minimal example follows this list).
        • +
        • Reduce memory footprint with WOQ-int8.
        • +
        • Further reduce memory footprint with WOQ-int4.
        • +
        • Use AVX512 which enables faster de-quant in micro kernels.
        • +
        + +
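For readers who have not used it before, enabling torch.compile is a one-line change; the snippet below is a minimal, self-contained sketch and is unrelated to the benchmark numbers (GPTFast enables compilation via the --compile flag shown earlier).

import torch

# Minimal torch.compile usage (illustration only).
model = torch.nn.Sequential(torch.nn.Linear(128, 128), torch.nn.GELU())
compiled_model = torch.compile(model)

with torch.no_grad():
    out = compiled_model(torch.randn(8, 128))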

        bar chart

        + +

        Figure 4. GPTFast Performance speedup in Llama2-7b-chat

        + +

        Segment Anything Fast

        + +

We ran Segment Anything Fast on PyTorch with the hardware configuration listed under Product and Performance Information below and achieved a performance speedup with BFloat16, torch.compile, and SDPA compared with FP32, as shown in Figure 5. The speedup ratio against FP32 reaches 2.91x in vit_b and 3.95x in vit_h.

        + +

        bar chart

        + +

        Figure 5. Segment Anything Fast Performance speedup in vit_b/vit_h

        + +

        Diffusion Fast

        + +

We ran Diffusion Fast on PyTorch with the hardware configuration listed under Product and Performance Information below and achieved a performance speedup with BFloat16, torch.compile, and SDPA compared with FP32, as shown in Figure 6. The speedup ratio against FP32 reaches 3.91x in Stable Diffusion XL (SDXL).

        + +

        bar chart

        + +

        Figure 6. Diffusion Fast Performance speedup in Stable Diffusion XL

        + +

        Conclusion and Future Work

        + +

        In this blog, we introduced software optimizations for weight-only quantization, torch.compile, and SDPA, demonstrating how we can accelerate text generation with native PyTorch on CPU. Further improvements are expected with the support of the AMX-BF16 instruction set and the optimization of dynamic int8 quantization using torchao on CPU. We will continue to extend our software optimization efforts to a broader scope.

        + +

        Acknowledgments

        + +

        The results presented in this blog are a joint effort between Meta and the Intel PyTorch Team. Special thanks to Michael Gschwind from Meta who spent precious time providing substantial assistance. Together we took one more step on the path to improve the PyTorch CPU ecosystem.

        + + + +

        Part 1: How to accelerate Segment Anything over 8x with Segment Anything Fast.

        + +

        Part 2: How to accelerate Llama-7B by almost 10x with help of GPTFast.

        + +

        Part 3: How to accelerate text-to-image diffusion models up to 3x with Diffusion Fast.

        + +

        Part 4: How to speed up FAIR’s Seamless M4T-v2 model by 2.7x.

        + +

        Product and Performance Information

        + +

        Figure 4: Intel Xeon Scalable Processors: Measurement on 4th Gen Intel Xeon Scalable processor using: 2x Intel(R) Xeon(R) Platinum 8480+, 56cores, HT On, Turbo On, NUMA 2, Integrated Accelerators Available [used]: DLB 2 [0], DSA 2 [0], IAA 2 [0], QAT 2 [0], Total Memory 512GB (16x32GB DDR5 4800 MT/s [4800 MT/s]), BIOS 3B07.TEL2P1, microcode 0x2b000590, Samsung SSD 970 EVO Plus 2TB, CentOS Stream 9, 5.14.0-437.el9.x86_64, run single socket (1 instances in total with: 56 cores per instance, Batch Size 1 per instance), Models run with PyTorch 2.5 wheel. Test by Intel on 10/15/24.

        + +

        Figure 5: Intel Xeon Scalable Processors: Measurement on 4th Gen Intel Xeon Scalable processor using: 2x Intel(R) Xeon(R) Platinum 8480+, 56cores, HT On, Turbo On, NUMA 2, Integrated Accelerators Available [used]: DLB 2 [0], DSA 2 [0], IAA 2 [0], QAT 2 [0], Total Memory 512GB (16x32GB DDR5 4800 MT/s [4800 MT/s]), BIOS 3B07.TEL2P1, microcode 0x2b000590, Samsung SSD 970 EVO Plus 2TB, CentOS Stream 9, 5.14.0-437.el9.x86_64, run single socket (1 instances in total with: 56 cores per instance, Batch Size 16 per instance), Models run with PyTorch 2.5 wheel. Test by Intel on 10/15/24.

        + +

        Figure 6: Intel Xeon Scalable Processors: Measurement on 4th Gen Intel Xeon Scalable processor using: 2x Intel(R) Xeon(R) Platinum 8480+, 56cores, HT On, Turbo On, NUMA 2, Integrated Accelerators Available [used]: DLB 2 [0], DSA 2 [0], IAA 2 [0], QAT 2 [0], Total Memory 512GB (16x32GB DDR5 4800 MT/s [4800 MT/s]), BIOS 3B07.TEL2P1, microcode 0x2b000590, Samsung SSD 970 EVO Plus 2TB, CentOS Stream 9, 5.14.0-437.el9.x86_64, run single socket (1 instances in total with: 56 cores per instance, Batch Size 1 per instance), Models run with PyTorch 2.5 wheel. Test by Intel on 10/15/24.

        + +

        Notices and Disclaimers

        + +

        Performance varies by use, configuration and other factors. Learn more on the Performance Index site. Performance results are based on testing as of dates shown in configurations and may not reflect all publicly available updates.  See backup for configuration details.  No product or component can be absolutely secure. Your costs and results may vary. Intel technologies may require enabled hardware, software or service activation.

        + +

        Intel Corporation. Intel, the Intel logo, and other Intel marks are trademarks of Intel Corporation or its subsidiaries. Other names and brands may be claimed as the property of others.

        + +

        AI disclaimer:

        + +

        AI features may require software purchase, subscription or enablement by a software or platform provider, or may have specific configuration or compatibility requirements. Details at www.intel.com/AIPC. Results may vary.

        + +
diff --git a/blog/geospatial-deep-learning-with-torchgeo/index.html b/blog/geospatial-deep-learning-with-torchgeo/index.html
new file mode 100644
index 000000000000..f96e00f2b066
--- /dev/null
+++ b/blog/geospatial-deep-learning-with-torchgeo/index.html
@@ -0,0 +1,881 @@
Geospatial deep learning with TorchGeo | PyTorch

        June 23, 2022

        +

        + Geospatial deep learning with TorchGeo +


        + by + + Adam Stewart (University of Illinois at Urbana-Champaign), Caleb Robinson (Microsoft AI for Good Research Lab), Isaac Corley (University of Texas at San Antonio) + +

        +

        TorchGeo is a PyTorch domain library providing datasets, samplers, transforms, and pre-trained models specific to geospatial data.

        + +

        + +

        + +

        + https://github.com/microsoft/torchgeo +

        + +

        For decades, Earth observation satellites, aircraft, and more recently UAV platforms have been collecting increasing amounts of imagery of the Earth’s surface. With information about seasonal and long-term trends, remotely sensed imagery can be invaluable for solving some of the greatest challenges to humanity, including climate change adaptation, natural disaster monitoring, water resource management, and food security for a growing global population. From a computer vision perspective, this includes applications like land cover mapping (semantic segmentation), deforestation and flood monitoring (change detection), glacial flow (pixel tracking), hurricane tracking and intensity estimation (regression), and building and road detection (object detection, instance segmentation). By leveraging recent advancements in deep learning architectures, cheaper and more powerful GPUs, and petabytes of freely available satellite imagery datasets, we can come closer to solving these important problems.

        + +

        + +

        + +

        +National Oceanic and Atmospheric Administration satellite image of Hurricane Katrina, taken on August 28, 2005 (source). Geospatial machine learning libraries like TorchGeo can be used to detect, track, and predict future trajectories of hurricanes and other natural disasters. +

        + +

        The challenges

        + +

        In traditional computer vision datasets, such as ImageNet, the image files themselves tend to be rather simple and easy to work with. Most images have 3 spectral bands (RGB), are stored in common file formats like PNG or JPEG, and can be easily loaded with popular software libraries like PIL or OpenCV. Each image in these datasets is usually small enough to pass directly into a neural network. Furthermore, most of these datasets contain a finite number of well-curated images that are assumed to be independent and identically distributed, making train-val-test splits straightforward. As a result of this relative homogeneity, the same pre-trained models (e.g., CNNs pretrained on ImageNet) have shown to be effective across a wide range of vision tasks using transfer learning methods. Existing libraries, such as torchvision, handle these simple cases well, and have been used to make large advances in vision tasks over the past decade.

        + +

Remote sensing imagery is not so uniform. Instead of simple RGB images, satellites tend to capture images that are multispectral (Landsat 8 has 11 spectral bands) or even hyperspectral (Hyperion has 242 spectral bands). These images capture information at a wider range of wavelengths (400 nm–15 µm), far outside of the visible spectrum. Different satellites also have very different spatial resolutions—GOES has a resolution of 4 km/px, Maxar imagery is 30 cm/px, and drone imagery resolution can be as high as 7 mm/px. These datasets almost always have a temporal component, with satellite revisits that are daily, weekly, or biweekly. Images often have overlap with other images in the dataset, and need to be stitched together based on geographic metadata. These images tend to be very large (e.g., 10K x 10K pixels), so it isn’t possible to pass an entire image through a neural network. This data is distributed in hundreds of different raster and vector file formats like GeoTIFF and ESRI Shapefile, requiring specialty libraries like GDAL to load.

        + +

        + +

        + +

        +From left to right: Mercator, Albers Equal Area, and Interrupted Goode Homolosine projections (source). Geospatial data is associated with one of many different types of reference systems that project the 3D Earth onto a 2D representation. Combining data from different sources often involves re-projecting to a common reference system in order to ensure that all layers are aligned. +

        + +

        Although each image is 2D, the Earth itself is 3D. In order to stitch together images, they first need to be projected onto a 2D representation of the Earth, called a coordinate reference system (CRS). Most people are familiar with equal angle representations like Mercator that distort the size of regions (Greenland looks larger than Africa even though Africa is 15x larger), but there are many other CRSs that are commonly used. Each dataset may use a different CRS, and each image within a single dataset may also be in a unique CRS. In order to use data from multiple layers, they must all share a common CRS, otherwise the data won’t be properly aligned. For those who aren’t familiar with remote sensing data, this can be a daunting task.
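To make the idea of re-projecting to a common CRS concrete, here is a small illustration using the pyproj library; this is an external package used purely for illustration, since TorchGeo performs this kind of CRS alignment for you automatically (as described below).

from pyproj import Transformer

# Re-project a lon/lat coordinate (EPSG:4326) into Web Mercator (EPSG:3857).
to_mercator = Transformer.from_crs("EPSG:4326", "EPSG:3857", always_xy=True)
x, y = to_mercator.transform(-88.24, 40.11)  # approximate lon/lat of Urbana, IL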

        + +

        + +

        + +

        +Even if you correctly georeference images during indexing, if you don't project them to a common CRS, you'll end up with rotated images with nodata values around them, and the images won't be pixel-aligned. +

        + +

        The solution

        + +

        At the moment, it can be quite challenging to work with both deep learning models and geospatial data without having expertise in both of these very different fields. To address these challenges, we’ve built TorchGeo, a PyTorch domain library for working with geospatial data. TorchGeo is designed to make it simple:

        + +
          +
        1. for machine learning experts to work with geospatial data, and
        2. +
        3. for remote sensing experts to explore machine learning solutions.
        4. +
        + +

        TorchGeo is not just a research project, but a production-quality library that uses continuous integration to test every commit with a range of Python versions on a range of platforms (Linux, macOS, Windows). It can be easily installed with any of your favorite package managers, including pip, conda, and spack:

        + +
        $ pip install torchgeo
        +
        + +

        TorchGeo is designed to have the same API as other PyTorch domain libraries like torchvision, torchtext, and torchaudio. If you already use torchvision in your workflow for computer vision datasets, you can switch to TorchGeo by changing only a few lines of code. All TorchGeo datasets and samplers are compatible with the PyTorch DataLoader class, meaning that you can take advantage of wrapper libraries like PyTorch Lightning for distributed training. In the following sections, we’ll explore possible use cases for TorchGeo to show how simple it is to use.

        + +

        Geospatial datasets and samplers

        + +

        + +

        + +

        +Example application in which we combine A) a scene from Landsat 8 and B) Cropland Data Layer labels, even though these files are in different EPSG projections. We want to sample patches C) and D) from these datasets using a geospatial bounding box as an index. +

        + +

Many remote sensing applications involve working with geospatial datasets, that is, datasets with geographic metadata. In TorchGeo, we define a GeoDataset class to represent these kinds of datasets. Instead of being indexed by an integer, each GeoDataset is indexed by a spatiotemporal bounding box, meaning that two or more datasets covering different geographic extents can be intelligently combined.

        + +

        In this example, we show how easy it is to work with geospatial data and to sample small image patches from a combination of Landsat and Cropland Data Layer (CDL) data using TorchGeo. First, we assume that the user has Landsat 7 and 8 imagery downloaded. Since Landsat 8 has more spectral bands than Landsat 7, we’ll only use the bands that both satellites have in common. We’ll create a single dataset including all images from both Landsat 7 and 8 data by taking the union between these two datasets.

        + +
        from torch.utils.data import DataLoader
        +from torchgeo.datasets import CDL, Landsat7, Landsat8, stack_samples
        +from torchgeo.samplers import RandomGeoSampler
        +
        +landsat7 = Landsat7(root="...")
        +landsat8 = Landsat8(root="...", bands=Landsat8.all_bands[1:-2])
        +landsat = landsat7 | landsat8
        +
        + +

        Next, we take the intersection between this dataset and the CDL dataset. We want to take the intersection instead of the union to ensure that we only sample from regions where we have both Landsat and CDL data. Note that we can automatically download and checksum CDL data. Also note that each of these datasets may contain files in different CRSs or resolutions, but TorchGeo automatically ensures that a matching CRS and resolution is used.

        + +
        cdl = CDL(root="...", download=True, checksum=True)
        +dataset = landsat & cdl
        +
        + +

        This dataset can now be used with a PyTorch data loader. Unlike benchmark datasets, geospatial datasets often include very large images. For example, the CDL dataset consists of a single image covering the entire contiguous United States. In order to sample from these datasets using geospatial coordinates, TorchGeo defines a number of samplers. In this example, we’ll use a random sampler that returns 256 x 256 pixel images and 10,000 samples per epoch. We’ll also use a custom collation function to combine each sample dictionary into a mini-batch of samples.

        + +
        sampler = RandomGeoSampler(dataset, size=256, length=10000)
        +dataloader = DataLoader(dataset, batch_size=128, sampler=sampler, collate_fn=stack_samples)
        +
        + +

        This data loader can now be used in your normal training/evaluation pipeline.

        + +
        for batch in dataloader:
        +    image = batch["image"]
        +    mask = batch["mask"]
        +
        +    # train a model, or make predictions using a pre-trained model
        +
        + +

        Many applications involve intelligently composing datasets based on geospatial metadata like this. For example, users may want to:

        + +
          +
        • Combine datasets for multiple image sources and treat them as equivalent (e.g., Landsat 7 and 8)
        • +
        • Combine datasets for disparate geospatial locations (e.g., Chesapeake NY and PA)
        • +
        + +

        These combinations require that all queries are present in at least one dataset, and can be created using a UnionDataset. Similarly, users may want to:

        + +
          +
        • Combine image and target labels and sample from both simultaneously (e.g., Landsat and CDL)
        • +
        • Combine datasets for multiple image sources for multimodal learning or data fusion (e.g., Landsat and Sentinel)
        • +
        + +

        These combinations require that all queries are present in both datasets, and can be created using an IntersectionDataset. TorchGeo automatically composes these datasets for you when you use the intersection (&) and union (|) operators.
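Concretely, the two snippets earlier in this post map onto these classes as follows (reusing the landsat7, landsat8, and cdl datasets defined there).

landsat = landsat7 | landsat8   # UnionDataset: a query can be answered by either source
dataset = landsat & cdl         # IntersectionDataset: a query must be covered by both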

        + +

        Multispectral and geospatial transforms

        + +

        In deep learning, it’s common to augment and transform the data so that models are robust to variations in the input space. Geospatial data can have variations such as seasonal changes and warping effects, as well as image processing and capture issues like cloud cover and atmospheric distortion. TorchGeo utilizes augmentations and transforms from the Kornia library, which supports GPU acceleration and supports multispectral imagery with more than 3 channels.

        + +

        Traditional geospatial analyses compute and visualize spectral indices which are combinations of multispectral bands. Spectral indices are designed to highlight areas of interest in a multispectral image relevant to some application, such as vegetation health, areas of man-made change or increasing urbanization, or snow cover. TorchGeo supports numerous transforms, which can compute common spectral indices and append them as additional bands to a multispectral image tensor.

        + +

        Below, we show a simple example where we compute the Normalized Difference Vegetation Index (NDVI) on a Sentinel-2 image. NDVI measures the presence of vegetation and vegetation health and is computed as the normalized difference between the red and near-infrared (NIR) spectral bands. Spectral index transforms operate on sample dictionaries returned from TorchGeo datasets and append the resulting spectral index to the image channel dimension.
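For reference, NDVI is defined per pixel as NDVI = (NIR - Red) / (NIR + Red), which produces values in the range -1 to 1.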

        + +

        First, we instantiate a Sentinel-2 dataset and load a sample image. Then, we plot the true color (RGB) representation of this data to see the region we are looking at.

        + +
        import matplotlib.pyplot as plt
        +from torchgeo.datasets import Sentinel2
        +from torchgeo.transforms import AppendNDVI
        +
        +dataset = Sentinel2(root="...")
        +sample = dataset[...]
        +fig = dataset.plot(sample)
        +plt.show()
        +
        + +

        Next, we instantiate and compute an NDVI transform, appending this new channel to the end of the image. Sentinel-2 imagery uses index 0 for its red band and index 3 for its NIR band. In order to visualize the data, we also normalize the image. NDVI values can range from -1 to 1, but we want to use the range 0 to 1 for plotting.

        + +
        transform = AppendNDVI(index_red=0, index_nir=3)
        +sample = transform(sample)
        +sample["image"][-1] = (sample["image"][-1] + 1) / 2
        +plt.imshow(sample["image"][-1], cmap="RdYlGn_r")
        +plt.show()
        +
        + +

        + +

        + +

        +True color (left) and NDVI (right) of the Texas Hill Region, taken on November 16, 2018 by the Sentinel-2 satellite. In the NDVI image, red indicates water bodies, yellow indicates barren soil, light green indicates unhealthy vegetation, and dark green indicates healthy vegetation. +

        + +

        Benchmark datasets

        + +

        One of the driving factors behind progress in computer vision is the existence of standardized benchmark datasets like ImageNet and MNIST. Using these datasets, researchers can directly compare the performance of different models and training procedures to determine which perform the best. In the remote sensing domain, there are many such datasets, but due to the aforementioned difficulties of working with this data and the lack of existing libraries for loading these datasets, many researchers opt to use their own custom datasets.

        + +

One of the goals of TorchGeo is to provide easy-to-use data loaders for these existing datasets. TorchGeo includes a number of benchmark datasets, that is, datasets that include both input images and target labels. This includes datasets for tasks like image classification, regression, semantic segmentation, object detection, instance segmentation, change detection, and more.

        + +

        If you’ve used torchvision before, these types of datasets should be familiar. In this example, we’ll create a dataset for the Northwestern Polytechnical University (NWPU) very-high-resolution ten-class (VHR-10) geospatial object detection dataset. This dataset can be automatically downloaded, checksummed, and extracted, just like with torchvision.

        + +
        from torch.utils.data import DataLoader
        +from torchgeo.datasets import VHR10
        +
        +dataset = VHR10(root="...", download=True, checksum=True)
        +dataloader = DataLoader(dataset, batch_size=128, shuffle=True, num_workers=4)
        +
        +for batch in dataloader:
        +    image = batch["image"]
        +    label = batch["label"]
        +
        +    # train a model, or make predictions using a pre-trained model
        +
        + +

        All TorchGeo datasets are compatible with PyTorch data loaders, making them easy to integrate into existing training workflows. The only difference between a benchmark dataset in TorchGeo and a similar dataset in torchvision is that each dataset returns a dictionary with keys for each PyTorch Tensor.

        + +

        + +

        + +

        +Example predictions from a Mask R-CNN model trained on the NWPU VHR-10 dataset. The model predicts sharp bounding boxes and masks for all objects with high confidence scores. +

        + +

        Reproducibility with PyTorch Lightning

        + +

        Another key goal of TorchGeo is reproducibility. For many of these benchmark datasets, there is no predefined train-val-test split, or the predefined split has issues with class imbalance or geographic distribution. As a result, the performance metrics reported in the literature either can’t be reproduced, or aren’t indicative of how well a pre-trained model would work in a different geographic location.

        + +

        In order to facilitate direct comparisons between results published in the literature and further reduce the boilerplate code needed to run experiments with datasets in TorchGeo, we have created PyTorch Lightning datamodules with well-defined train-val-test splits and trainers for various tasks like classification, regression, and semantic segmentation. These datamodules show how to incorporate augmentations from the kornia library, include preprocessing transforms (with pre-calculated channel statistics), and let users easily experiment with hyperparameters related to the data itself (as opposed to the modeling process). Training a semantic segmentation model on the Inria Aerial Image Labeling dataset is as easy as a few imports and four lines of code.

        + +
        from pytorch_lightning import Trainer
        +from torchgeo.datamodules import InriaAerialImageLabelingDataModule
        +from torchgeo.trainers import SemanticSegmentationTask
        +
        +datamodule = InriaAerialImageLabelingDataModule(root_dir="...", batch_size=64, num_workers=6)
        +task = SemanticSegmentationTask(segmentation_model="unet", encoder_weights="imagenet", learning_rate=0.1)
        +trainer = Trainer(gpus=1, default_root_dir="...")
        +
        +trainer.fit(model=task, datamodule=datamodule)
        +
        + +

        + +

        + +

        +Building segmentations produced by a U-Net model trained on the Inria Aerial Image Labeling dataset. Reproducing these results is as simple as a few imports and four lines of code, making comparison of different models and training techniques simple and easy. +

        + +

        In our preprint we show a set of results that use the aforementioned datamodules and trainers to benchmark simple modeling approaches for several of the datasets in TorchGeo. For example, we find that a simple ResNet-50 can achieve state-of-the-art performance on the So2Sat dataset. These types of baseline results are important for evaluating the contribution of different modeling choices when tackling problems with remotely sensed data.

        + +

        Future work and contributing

        + +

        There is still a lot of remaining work to be done in order to make TorchGeo as easy to use as possible, especially for users without prior deep learning experience. One of the ways in which we plan to achieve this is by expanding our tutorials to include subjects like “writing a custom dataset” and “transfer learning”, or tasks like “land cover mapping” and “object detection”.

        + +

        Another important project we are working on is pre-training models. Most remote sensing researchers work with very small labeled datasets, and could benefit from pre-trained models and transfer learning approaches. TorchGeo is the first deep learning library to provide models pre-trained on multispectral imagery. Our goal is to provide models for different image modalities (optical, SAR, multispectral) and specific platforms (Landsat, Sentinel, MODIS) as well as benchmark results showing their performance with different amounts of training data. Self-supervised learning is a promising method for training such models. Satellite imagery datasets often contain petabytes of imagery, but accurately labeled datasets are much harder to come by. Self-supervised learning methods will allow us to train directly on the raw imagery without needing large labeled datasets.

        + +

        Aside from these larger projects, we’re always looking to add new datasets, data augmentation transforms, and sampling strategies. If you’re Python savvy and interested in contributing to TorchGeo, we would love to see contributions! TorchGeo is open source under an MIT license, so you can use it in almost any project.

        + +

        External links:

        + + + +

        If you like TorchGeo, give us a star on GitHub! And if you use TorchGeo in your work, please cite our paper.

        + +

        Acknowledgments

        + +

        We would like to thank all TorchGeo contributors for their efforts in creating the library, the Microsoft AI for Good program for support, and the PyTorch Team for their guidance. This research is part of the Blue Waters sustained-petascale computing project, which is supported by the National Science Foundation (awards OCI-0725070 and ACI-1238993), the State of Illinois, and as of December, 2019, the National Geospatial-Intelligence Agency. Blue Waters is a joint effort of the University of Illinois at Urbana-Champaign and its National Center for Supercomputing Applications. The research was supported in part by NSF grants IIS-1908104, OAC-1934634, and DBI-2021898.

        + +
diff --git a/blog/getting-started-with-pytorch-2.0/index.html b/blog/getting-started-with-pytorch-2.0/index.html
new file mode 100644
index 000000000000..3f7ddc686f9e
--- /dev/null
+++ b/blog/getting-started-with-pytorch-2.0/index.html
@@ -0,0 +1,653 @@
Get Started with PyTorch 2.0 Summary and Overview | PyTorch

        + by + + Team PyTorch + +

        +

        Introducing PyTorch 2.0, our first steps toward the next generation 2-series release of PyTorch. Over the last few years we have innovated and iterated from PyTorch 1.0 to the most recent 1.13 and moved to the newly formed PyTorch Foundation, part of the Linux Foundation.

        + +

        To complement the PyTorch 2.0 announcement and conference, we have also posted a comprehensive introduction and technical overview within the Get Started menu at https://pytorch.org/get-started/pytorch-2.0.

        + +

        We also wanted to ensure you had all the information to quickly leverage PyTorch 2.0 in your models so we added the technical requirements, tutorial, user experience, Hugging Face benchmarks and FAQs to get you started today!

        + +

Finally, we are launching a new “Ask the Engineers: 2.0 Live Q&A” series that allows you to go deeper on a range of topics with PyTorch subject matter experts. We hope this content is helpful for the entire community and for users and contributors at every level.

        + +

        https://pytorch.org/get-started/pytorch-2.0

        + +
diff --git a/blog/graphcore-joins-pytorch/index.html b/blog/graphcore-joins-pytorch/index.html
new file mode 100644
index 000000000000..cbaf5fd4830e
--- /dev/null
+++ b/blog/graphcore-joins-pytorch/index.html
@@ -0,0 +1,669 @@
Graphcore Joins the PyTorch Foundation as a General Member | PyTorch

        + by + + Team PyTorch + +

        +

        Graphcore logo

        + +

        The PyTorch Foundation, a neutral home for the deep learning community to collaborate on the open source PyTorch framework and ecosystem, is announcing today that Graphcore has joined as a general member.

        + +

        Graphcore is a UK-based company that specializes in designing and manufacturing AI accelerators, hardware and software specifically tailored for artificial intelligence and machine learning workloads.

        + +

        “We’re thrilled that PyTorch is the leading framework for development on the Graphcore platform,” said Executive Director of the PyTorch Foundation Ibrahim Haddad. “Graphcore has played an important role in the hardware and open source space, and we look forward to their continued contributions to PyTorch.”

        + +

        Graphcore has contributed to the PyTorch ecosystem by developing integrations to run on their IPU hardware. These integrations enable researchers and practitioners to use their preferred frameworks while taking advantage of Graphcore’s specialized hardware.

        + +

        “At Graphcore we’re truly aligned with PyTorch’s objective of reducing the barrier of entry to AI practitioners. By supporting a native PyTorch software environment for IPUs we are giving developers access to new underlying hardware, designed from the ground up for AI, to help unlock new AI techniques to improve efficiency or performance and to drive breakthroughs in AI research and applications, with the same user-friendly PyTorch framework they know and expect. We look forward to contributing to and growing the global AI community as an active member of the PyTorch Foundation and are proud to be the first general member.” Anthony Barbier, Software Frameworks Lead at Graphcore.

        + +

        To learn more about how you can be a part of the PyTorch Foundation, visit our website.

        + +

        About Graphcore

        + +

        Graphcore compute systems are accelerating the AI revolution. Powered by the groundbreaking Intelligence Processing Unit (IPU), Graphcore delivers leading-edge AI performance with unprecedented efficiency. IPUs are used around the world by organisations building their intelligent compute capabilities, including AI-centric startups, large multinational corporations and both public and private research institutions. Graphcore is backed by some of the world’s leading investors and has attracted more than $700m of funding. The company is based in Bristol, UK, with offices across Europe, Asia and North America.

        + +

        About PyTorch Foundation

        + +

        The PyTorch Foundation is a neutral home for the deep learning community to collaborate on the open source PyTorch framework and ecosystem. The PyTorch Foundation is supported by its members and leading contributors to the PyTorch open source project. The Foundation leverages resources provided by members and contributors to enable community discussions and collaboration.

        + +

        About The Linux Foundation

        + +

        The Linux Foundation is the world’s leading home for collaboration on open source software, hardware, standards, and data. Linux Foundation projects are critical to the world’s infrastructure including Linux, Kubernetes, Node.js, ONAP, PyTorch, RISC-V, SPDX, OpenChain, and more. The Linux Foundation focuses on leveraging best practices and addressing the needs of contributors, users, and solution providers to create sustainable models for open collaboration. For more information, please visit us at linuxfoundation.org. The Linux Foundation has registered trademarks and uses trademarks. For a list of trademarks of The Linux Foundation, please see its trademark usage page. Linux is a registered trademark of Linus Torvalds.

        + +
diff --git a/blog/hacker-cup/index.html b/blog/hacker-cup/index.html
new file mode 100644
index 000000000000..242032a1fe8a
--- /dev/null
+++ b/blog/hacker-cup/index.html
@@ -0,0 +1,647 @@
Announcing Hacker Cup AI Track at NeurIPS 2024 | PyTorch

        + by + + Team PyTorch + +

        +

The PyTorch team, in partnership with Meta Hacker Cup and Microsoft Research, is excited to announce the Hacker Cup AI Track at NeurIPS 2024. This will be the first AI track for the popular Meta Hacker Cup programming competition, designed to assess the capabilities of Generative AI in performing autonomous code generation tasks. We aim to test the limits of AI in complex coding challenges and measure the performance gap between AI systems and human programmers. We will provide access to all Hacker Cup problems since 2011 alongside their respective solutions in a multimodal (image and text) format, and utilize the existing Hacker Cup infrastructure for competitor evaluation. Featuring both an open evaluation, open model track and an open evaluation, closed model track, this competition invites diverse participation from research institutions of varied interests and resource constraints, including academic labs, AI startups, large technology companies, and AI enthusiasts. Our goal is to develop and democratize meaningful advancements in code automation with the very first open evaluation process for competitive AI programmers. Registration will begin in early August, with our first qualification round on September 20th.

        + +

        For more information please visit our website at https://www.facebook.com/codingcompetitions/hacker-cup/ and join our Discord at discord.gg/wWeN9hTH32

        + +
diff --git a/blog/hadacore/index.html b/blog/hadacore/index.html
new file mode 100644
index 000000000000..2b0f26773b5c
--- /dev/null
+++ b/blog/hadacore/index.html
@@ -0,0 +1,819 @@
HadaCore: Tensor Core Accelerated Hadamard Transform Kernel | PyTorch

        + by + + IBM and Meta + +

        +

        IBM: Krish Agarwal, Rishi Astra, Adnan Hoque, Mudhakar Srivatsa, Raghu Ganti
        +Meta: Less Wright, Sijia Chen

        + +

        Quantization is a method for improving model inference speeds by compressing model weights and performing (faster) computation in lower precision data types. However, quantization can result in accuracy loss due to the presence of outliers. Recent works like QuaRot, SpinQuant, and FlashAttention-3 introduce methods to increase the numerical accuracy of INT4, INT8 and FP8 quantization in LLMs. These methods rely on Hadamard Transforms. In this blog, we present HadaCore, a Hadamard Transform CUDA kernel that achieves state-of-the-art performance on NVIDIA A100 and H100 GPUs. Our kernel achieves speedups of 1.1–1.4x and 1.0–1.3x, with a peak gain of 3.5x and 3.6x respectively, over Dao AI Lab’s Fast Hadamard Transform Kernel. We leverage a hardware-aware work decomposition that benefits from Tensor Core acceleration while maintaining quantization error reduction.

        + +

        Figure 1: Speedup of HadaCore vs Dao AI Hadamard CUDA kernel. A peak gain of 3.46x on the A100 is achieved using 128 rotation by 8.4M elements.

        + +

        The HadaCore Kernel is publicly available.

        + +

        Background

        + +

QuaRot and SpinQuant both propose methods to increase the numerical accuracy of INT4 and INT8 quantization in LLMs. Both methods rotate model activations, since rotations are statistically likely to reduce the magnitude of outliers: they “distribute” extreme values among other (less extreme) dimensions, and rotation is also an easily invertible operation using the inverse of the rotation matrix. These methods can also improve FP8 inference accuracy, such as in FlashAttention-3.

        + +

        Figure 2. Transformer block showing online (red) and offline rotations (blue) in QuaRot

        + +

        Applying these rotation matrices introduces model runtime overhead due to the online operations shown in Figure 2. These rotations can be applied through matrix multiplication, but the added overhead would diminish the benefits from quantization. Therefore, QuaRot and SpinQuant opt to use Walsh-Hadamard matrices, a special type of rotation matrix that can be applied faster than matrix multiplication using the Fast Walsh-Hadamard Transform algorithm. HadaCore is an optimized implementation of this algorithm for NVIDIA GPUs that support Tensor Cores.
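For intuition, a plain PyTorch reference implementation of the Fast Walsh-Hadamard Transform is sketched below; it illustrates the butterfly structure only, is not the HadaCore kernel, and assumes a power-of-two size with no normalization.

import torch

# Reference (non-optimized) Fast Walsh-Hadamard Transform over the last dimension.
def fwht(x: torch.Tensor) -> torch.Tensor:
    n = x.shape[-1]
    out = x.clone()
    h = 1
    while h < n:
        out = out.reshape(*out.shape[:-1], n // (2 * h), 2, h)
        a, b = out[..., 0, :], out[..., 1, :]
        out = torch.stack((a + b, a - b), dim=-2).reshape(*out.shape[:-3], n)
        h *= 2
    return out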

        + +

        Tensor Core Accelerated Hadamard Transform

        + +

        HadaCore leverages NVIDIA Tensor Cores, which are specialized compute units on NVIDIA GPUs optimized for matrix multiplication. To achieve this, our kernel performs a hardware-aware work decomposition of the Fast Walsh-Hadamard algorithm. This work decomposition ensures that we can utilize the MMA PTX instructions that execute on the Tensor Core chip. HadaCore applies a 16×16 Hadamard transform to chunks of the input data. The computation can then be offloaded to the FP16 Tensor Core with usage of the mma.m16n8k16 instruction. The warp-level parallelism for HadaCore is shown below.

        + +

        Figure 3: HadaCore Parallelization, 1x256 vectors (rows) being rotated by a size 256 Hadamard.

        + +

We process fragments of 256 elements in parallel using warp-level Tensor Core operations to achieve up to a 256-size Hadamard transform. For larger sizes, we shuffle data between warps and repeat.

        + +

        Microbenchmarks

        + +

        We benchmark HadaCore against the Dao AI Lab Hadamard Kernel on both NVIDIA H100 and A100 GPUs across varying Hadamard and input tensor sizes.

        + +

        Figure 4: HadaCore Kernel Speedup on NVIDIA A100 over Dao AI Lab Fast Hadamard Kernel

        + +

        Color coded Speedup Table for NVIDIA A100, Green = Speedup over Baseline

        + +

        Figure 5: HadaCore Kernel Speedup on NVIDIA H100 over Dao AI Lab Fast Hadamard Kernel

        + +

        Color coded Speedup Table for NVIDIA H100, Green = Speedup over Baseline

        + +

We showcase our speedup as the input tensor size (labeled element count) in our charts increases. Element count is the number of elements in the target matrix we are rotating. For example, in multi-head attention:

        + +

        The queries (Q), keys (K) and values (V) tensors are 4D tensors of size:

        + +

        (batch_size, seq_len, n_heads, head_dim)

        + +

        A Hadamard matrix of size head_dim is applied to these activation tensors, so we refer to this as using a Hadamard size of head_dim with an element count of:

        + +

        batch_size*seq_len*n_heads*head_dim.
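For example, the Llama-2 70B prefill entry in the table below works out to 1 * 64 * 4096 * 128 = 33,554,432 elements.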

        + +

        Common element counts for query rotations in an attention block:

Model \ Tokens | Prefill | Decoding
Llama-2 70b | 33,554,432 elements, 128 Hadamard size (1 batch * 64 heads * 4096 tokens * 128 dimensional embeddings per head per token) | 8,192 elements, 128 Hadamard size (1 batch * 64 heads * 1 token * 128 dimensional embeddings per head per token)
Llama-3 8b | 33,554,432 elements, 128 Hadamard size (1 batch * 32 heads * 8192 tokens * 128 dimensional embeddings per head per token) | 4,096 elements, 128 Hadamard size (1 batch * 32 heads * 1 token * 128 dimensional embeddings per head per token)
        + +

        HadaCore achieves 1.1–1.4x speedup on A100 and 1.0–1.3x speedup on H100 over Dao AI Lab’s Fast Hadamard kernel, with a peak gain of 3.5x and 3.6x, respectively. For smaller sizes on H100, HadaCore’s gain decreases. For future work, we plan to incorporate usage of Hopper specific features like TMA and WGMMA for improved H100 performance.

        + +

        MMLU Benchmarks

        + +

        We evaluated MMLU scores on a Llama 3.1-8B inference workload where the FlashAttention computation was performed in FP8. Newer generation NVIDIA Hopper GPUs come equipped with FP8 Tensor Cores that deliver substantial compute gain over FP16.

        + +

        Our results show the benefit of using HadaCore for accuracy preservation when combined with optimizations such as FP8 FlashAttention.

Format | Method | Llama3.1-8B Avg. 5-Shot MMLU Accuracy
Q, K, V: FP16; FlashAttention: FP16 | N/A | 65.38
Q, K, V: FP16; FlashAttention: FP8 | No Hadamard | 64.40
Q, K, V: FP8; FlashAttention: FP8 | HadaCore | 65.09
Q, K, V: FP8; FlashAttention: FP8 | Dao AI Fast Hadamard Kernel | 65.45
        + +

        Table 1: MMLU scores for Llama3.1 8B with FP16 baseline and FP8 attention using Hadamard transforms, comparing an implementation with explicit Hadamard matrix multiplications vs. HadaCore (higher is better)

        + +

        From the above MMLU scores, we note that for Llama3.1-8B inference with FP8 attention, HadaCore improves the quantization error introduced from computing attention in a lower precision.

        + +

        Conclusion

        + +

        We showcased our speedups achieved by moving the Fast-Walsh Hadamard algorithm into a CUDA kernel that leverages Tensor Core acceleration and achieves a peak speedup of 3.5x and 3.6x over the Dao AI Fast-Hadamard kernel on NVIDIA A100 and H100, respectively.

        + +

        Further, we showed on the MMLU benchmark that rotating with HadaCore maintains similar quantization error reduction to the Fast-Hadamard kernel, while providing computational acceleration.

        + +

        Future Work

        + +

        We plan to implement a Triton version of our kernel and experiment with more advanced techniques such as kernel fusion to support fused Hadamard transform and quantization. Further, we plan to extend our kernel to support BF16 Tensor Core compute.

        + +
diff --git a/blog/hi-po-low-bit-operators/index.html b/blog/hi-po-low-bit-operators/index.html
new file mode 100644
index 000000000000..0685b7f33beb
--- /dev/null
+++ b/blog/hi-po-low-bit-operators/index.html
@@ -0,0 +1,765 @@
High-Performance Low-Bit Operators for PyTorch | PyTorch

        + by + + Scott Roy, Digant Desai, Kimish Patel + +

        +

        We are excited to announce the addition of embedding operators with low-bit weights (1-8 bit) and linear operators with 8-bit dynamically quantized activations and low-bit weights (1-8 bit) for Arm CPUs in TorchAO, PyTorch’s native low-precision library. These operators work seamlessly across all PyTorch surfaces, including eager, torch.compile, AOTI, and ExecuTorch, and are available to use in torchchat.

        + +

        In developing these linear operators, our focus was on code sharing between PyTorch and ExecuTorch, and establishing a clear boundary between the higher-level operator and the lower-level kernel. This design allows third-party vendors to easily swap in their own kernels. We also set out to create a place and infrastructure to experiment with new CPU quantization ideas and test those across the PyTorch ecosystem.

        + +

        Universal low-bit kernels

        + +

CPUs have no native hardware support for low-bit arithmetic. In what we call universal kernels, we explicitly separate, in a modular fashion, the logic that unpacks low-bit values into int8 values from the int8 GEMV kernel logic. We started with an 8-bit kernel, for example, this 1x8 8-bit GEMV kernel that uses the Arm neondot instruction. Within the 8-bit kernel, we invoke an inlined unpacking routine to convert low-bit values into int8 values. This unpacking routine is force-inlined and templated on the low-bit width. Our experiments showed no performance difference between using a separate force-inlined unpacking routine and directly embedding the unpacking code inline.

        + +

The advantage of this modular design is improved development speed and code maintainability. After writing an 8-bit kernel, we quickly achieved full low-bit coverage by writing simple bitpacking routines. In fact, developers who worked on the bitpacking routines did not need to be experts on GEMV/GEMM kernel writing. We also reused the same bitpacking routines from the linear kernels within the embedding kernels. In the future, we could reuse the same bitpacking routines for universal GEMM kernels or kernels based on fma or i8mm instructions.
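To make the bitpacking idea concrete, here is a minimal, illustrative Python sketch (not the TorchAO kernel code, which is written with Arm intrinsics) of packing two 4-bit values per byte and unpacking them back to int8 before an 8-bit compute step:

import torch

# Pack pairs of 4-bit values (0..15) into single bytes.
def pack_uint4(x: torch.Tensor) -> torch.Tensor:
    x = x.to(torch.uint8).reshape(-1, 2)
    return x[:, 0] | (x[:, 1] << 4)

# Unpack bytes back into int8 values that an 8-bit kernel can consume.
def unpack_uint4(packed: torch.Tensor) -> torch.Tensor:
    lo = packed & 0x0F
    hi = (packed >> 4) & 0x0F
    return torch.stack([lo, hi], dim=1).reshape(-1).to(torch.int8)

vals = torch.randint(0, 16, (16,))
assert torch.equal(unpack_uint4(pack_uint4(vals)), vals.to(torch.int8))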

        + +

        Shared code between PyTorch and ExecuTorch

        + +

        To achieve shared code between PyTorch and ExecuTorch, we wrote kernels using raw pointers instead of PyTorch tensors. Moreover, we implemented the linear operator in a header that is included in separate PyTorch and ExecuTorch operator registration code. By using only features common to both ATen and ExecuTorch tensors, we ensured compatibility between the two frameworks. For multi-threaded compute, we introduced torchao::parallel_1d, which compiles to either at::parallel_for or ExecuTorch’s threadpool based on compile-time flags.

        + +

        Swappable kernels

        + +

        Our design for the higher-level multi-threaded linear operator is agnostic to the lower-level single-threaded kernels, allowing third-party vendors to swap in their own implementations. The interface between the operator and kernel is defined by a ukernel config, which specifies kernel function pointers for preparing activation data, preparing weight data, and running the kernel. The operator, responsible for tiling and scheduling, interacts with kernels solely through this config.
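As a rough illustration of this interface (the actual ukernel config in TorchAO is C++ function pointers; the Python below is only a sketch of the idea), the operator only ever sees three callables and never the kernel internals:

from dataclasses import dataclass
from typing import Any, Callable

@dataclass
class UKernelConfig:
    # Packs/quantizes an activation tile into the kernel's preferred layout.
    prepare_activation_data: Callable[[Any], Any]
    # Packs the (already quantized) weights into the kernel's preferred layout.
    prepare_weight_data: Callable[[Any], Any]
    # Runs the single-threaded kernel on one tile of prepared data.
    run_kernel: Callable[[Any, Any], Any]

# The higher-level operator tiles and schedules the work and, per tile, calls:
#   a = config.prepare_activation_data(activation_tile)
#   w = config.prepare_weight_data(weight_tile)
#   out_tile = config.run_kernel(a, w)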

        + +

        Performance

        + +

        In the table below, we show Llama3.1 8B token generation performance using 6 CPU threads on an M1 Macbook Pro with 32GB of RAM.

| Bitwidth x | torch.compile (Decode tokens/sec) | ExecuTorch (Decode tokens/sec) | ExecuTorch PTE size (GiB) |
|---|---|---|---|
| 1 | 24.18 | 17.86 | 1.46 |
| 2 | 27.02 | 19.65 | 2.46 |
| 3 | 21.01 | 22.25 | 3.46 |
| 4 | 19.51 | 19.47 | 4.47 |
| 5 | 14.78 | 16.34 | 5.47 |
| 6 | 12.80 | 13.61 | 6.47 |
| 7 | 8.16 | 11.73 | 7.48 |

Results were collected on an M1 MacBook Pro (8 performance cores and 2 efficiency cores) with 32GB of RAM, using 6 threads with torchchat. In each test, a maximum sequence length of 128 tokens was generated. For each bit width x, the embedding layer was groupwise quantized to x bits with group size 32. In the linear layers, activations were dynamically quantized per token to 8 bits, and weights were groupwise quantized to x bits with group size 256. Our focus here is performance, so we do not report accuracy or perplexity numbers. Depending on the model, lower bit widths may require quantization-aware training, quantizing a model with a mixture of bit widths, or adjusting the group sizes for acceptable accuracy.
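For readers unfamiliar with groupwise quantization, the following is a small, self-contained sketch of the idea (illustrative only; it is not the TorchAO implementation, and the scale and rounding details may differ):

import torch

def groupwise_quantize(w: torch.Tensor, bits: int = 4, group_size: int = 32):
    # Symmetric, per-group quantization: each group of `group_size` weights
    # shares one scale derived from the group's max absolute value.
    qmax = 2 ** (bits - 1) - 1
    groups = w.reshape(-1, group_size)
    scales = groups.abs().amax(dim=1, keepdim=True).clamp(min=1e-8) / qmax
    q = torch.clamp(torch.round(groups / scales), -qmax - 1, qmax).to(torch.int8)
    return q, scales

q, scales = groupwise_quantize(torch.randn(4096), bits=4, group_size=32)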

        + +

        Llama 3.1 chart

        + +

        Try them out and contribute!

        + +

        If you want to see the new low-bit kernels in action, give them a try by setting up torchchat and quantizing and running an LLM locally using the kernels.

        + +

        If you want to help contribute, consider adding support for one of the following areas:

        + +
• Add universal low-bit GEMM kernels for Arm CPU, reusing the same bitpacking routines from the universal GEMV kernels.
• Improve runtime selection of ukernel configs based on ISA, packing format, and activation shape.
• Add low-bit kernels for other CPU ISAs like x86.
• Integrate third-party libraries like KleidiAI with the operator framework.
        + +
diff --git a/blog/high-performance-llama-2/index.html b/blog/high-performance-llama-2/index.html
new file mode 100644
index 000000000000..f525dfee7eda
--- /dev/null
+++ b/blog/high-performance-llama-2/index.html
@@ -0,0 +1,1119 @@

High-Performance Llama 2 Training and Inference with PyTorch/XLA on Cloud TPUs | PyTorch

by Jiewen Tan, Jon Bolin, Yeounoh Chung, Liyang Lu, Siyuan Liu, Wonjoo Lee, Manfei Bai, Meghan Cowan, Jack Cao, Milad Mohammadi, Shauheen Zahirazami, Alex Spiridonov

        +

        In a landscape where AI innovation is accelerating at an unprecedented pace, Meta’s Llama family of open sourced large language models (LLMs) stands out as a notable breakthrough. Llama marked a significant step forward for LLMs, demonstrating the power of pre-trained architectures for a wide range of applications. Llama 2 further pushed the boundaries of scale and capabilities, inspiring advancements in language understanding, generation, and beyond.

        + +

        Shortly after the announcement of Llama, we published a blog post showcasing ultra-low inference latency for Llama using PyTorch/XLA on Cloud TPU v4. Building on these results, today, we are proud to share Llama 2 training and inference performance using PyTorch/XLA on Cloud TPU v4 and our newest AI supercomputer, Cloud TPU v5e.

        + +

        In this blog post, we use Llama 2 as an example model to demonstrate the power of PyTorch/XLA on Cloud TPUs for LLM training and inference. We discuss the computation techniques and optimizations used to improve inference throughput and training model FLOPs utilization (MFU). For Llama 2 70B parameters, we deliver 53% training MFU, 17 ms/token inference latency, 42 tokens/s/chip throughput powered by PyTorch/XLA on Google Cloud TPU. We offer a training user guide and an inference user guide for reproducing the results in this article. Additionally, you may find our Google Next 2023 presentation here.

        + +

        Model Overview

        + +

        Llama 2 comes in various sizes, ranging from 7B to 70B parameters, catering to different needs, computational resources, and training / inference budgets. Whether it’s small-scale projects or large-scale deployments, Llama models offer versatility and scalability to accommodate a wide range of applications.

        + +

        Llama 2 is an auto-regressive language model that uses an optimized transformer architecture. The largest, 70B model, uses grouped-query attention, which speeds up inference without sacrificing quality. Llama 2 is trained on 2 trillion tokens (40% more data than Llama) and has the context length of 4,096 tokens for inference (double the context length of Llama), which enables more accuracy, fluency, and creativity for the model.

        + +

        Llama 2 is a state-of-the-art LLM that outperforms many other open source language models on many benchmarks, including reasoning, coding, proficiency, and knowledge tests. The model’s scale and complexity place many demands on AI accelerators, making it an ideal benchmark for LLM training and inference performance of PyTorch/XLA on Cloud TPUs.

        + +

        Performance Challenge of LLMs

        + +

        Large-scale distributed training for LLMs such as Llama 2 introduces technical challenges that require practical solutions to make the most efficient use of TPUs. Llama’s size can strain both memory and processing resources of TPUs. To address this, we use model sharding, which involves breaking down the model into smaller segments, each fitting within the capacity of a single TPU core. This enables parallelism across multiple TPUs, improving training speed while reducing communication overhead.

        + +

        Another challenge is managing the large datasets required for training Llama 2 efficiently, which requires effective data distribution and synchronization methods. Additionally, optimizing factors like learning rate schedules, gradient aggregation, and weight synchronization across distributed TPUs is crucial for achieving convergence.

        + +

        After pretraining or fine-tuning Llama 2, running inference on the model checkpoint creates additional technical challenges. All of the challenges discussed in our previous blog post, such as autoregressive decoding, variable input prompt lengths, and the need for model sharding and quantization still apply for Llama 2. In addition, Llama 2 introduced two new capabilities: grouped-query attention and early stopping. We discuss how PyTorch/XLA handles these challenges to enable high-performance, cost-efficient training and inference of Llama 2 on Cloud TPU v4 and v5e.

        + +

        Large-Scale Distributed Training

        + +

        PyTorch/XLA offers two major ways of doing large-scale distributed training: SPMD, which utilizes the XLA compiler to transform and partition a single-device program into a multi-device distributed program; and FSDP, which implements the widely-adopted Fully Sharded Data Parallel algorithm.

        + +

        In this blog post, we show how to use the SPMD API to annotate the HuggingFace (HF) Llama 2 implementation to maximize performance. For comparison, we also show our FSDP results with the same configurations; read about PyTorch/XLA FSDP API here.

        + +

        SPMD Overview

        + +

        Let’s briefly review the fundamentals of SPMD. For details, please refer to our blog post and user guide.

        + +

        Mesh

        + +

        A multidimensional array that describes the logical topology of the TPU devices:

        + +
        # Assuming you are running on a TPU host that has 8 devices attached
        +num_devices = xr.global_runtime_device_count()
        +# mesh shape will be (4,2) in this example
        +mesh_shape = (num_devices // 2, 2)
        +device_ids = np.array(range(num_devices))
        +# axis_names 'x' and 'y' are optional
        +mesh = Mesh(device_ids, mesh_shape, ('x', 'y'))
        +
        + +

        Partition Spec

        + +

        A tuple that describes how the corresponding tensor’s dimensions are sharded across the mesh:

        + +
        partition_spec = ('x', 'y')
        +
        + +

        Mark Sharding

        + +

        An API that takes a mesh and a partition_spec, and then generates a sharding annotation for the XLA compiler.

        + +
        tensor = torch.randn(4, 4).to('xla')
+# Let's reuse the above mesh and partition_spec.
+# It means the tensor's 0th dim is sharded 4 ways and the 1st dim is sharded 2 ways.
        +xs.mark_sharding(tensor, mesh, partition_spec)
        +
        + +

        2D Sharding with SPMD

        + +

        In our SPMD blog post, we demonstrated using 1D FSDP style sharding. Here, we introduce a more powerful sharding strategy, called 2D sharding, where both the parameters and activations are sharded. This new sharding strategy not only allows fitting a larger model but also boosts the MFU to up to 54.3%. For more details, read the Benchmarks section.

        + +

        This section introduces a set of general rules that applies to most LLMs, and for convenience we directly reference the variable names and configuration names from HF Llama.

        + +

        First, let’s create a 2D Mesh with corresponding axis names: data and model. The data axis is usually where we distribute the input data, and the model axis is where we further distribute the model.

        + +
        mesh = Mesh(device_ids, mesh_shape, ('data', 'model'))
        +
        + +

        The mesh_shape can be a hyper-parameter that is tuned for different model sizes and hardware configurations. The same mesh will be reused in all following sharding annotations. In the next few sections, we will cover how to use the mesh to shard parameters, activations and input data.

        + +

        Parameter Sharding

        + +

        Below is a table that summarizes all parameters of HF Llama 2 and corresponding partition specifications. Example HF code can be found here.

| Parameter Name | Explanation | Parameter Shape | Partition Spec |
|---|---|---|---|
| embed_tokens | embedding layer | (vocab_size, hidden_size) | (model, data) |
| q_proj | attention weights | (num_heads x head_dim, hidden_size) | (data, model) |
| k_proj / v_proj | attention weights | (num_key_value_heads x head_dim, hidden_size) | (data, model) |
| o_proj | attention weights | (hidden_size, num_heads x head_dim) | (model, data) |
| gate_proj / up_proj | MLP weights | (intermediate_size, hidden_size) | (model, data) |
| down_proj | MLP weights | (hidden_size, intermediate_size) | (data, model) |
| lm_head | HF output embedding | (vocab_size, hidden_size) | (model, data) |

        Table 1: SPMD 2D Sharding Parameter Partition Spec

        + +

        The rule is to shard the hidden_size dim of any weights except QKVO projections according to the data axis of the mesh, then shard the other dim with the remaining model axis. For QKVO, do the opposite. This model-data axis rotation methodology is similar to that of Megatron-LM to reduce communication overhead. For layernorm weights, we implicitly mark them as replicated across different devices given they are 1D tensors.
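As a hedged sketch of how these annotations could be applied in code (parameter names follow the HF Llama module naming, and we reuse the 2D ('data', 'model') mesh created above; the actual annotation code lives in the linked HF fork):

import torch_xla.experimental.xla_sharding as xs

for name, param in model.named_parameters():
    if param.dim() == 1:
        continue  # 1D layernorm weights stay implicitly replicated
    if any(k in name for k in ('q_proj', 'k_proj', 'v_proj', 'down_proj')):
        xs.mark_sharding(param, mesh, ('data', 'model'))
    else:  # embed_tokens, o_proj, gate_proj, up_proj, lm_head
        xs.mark_sharding(param, mesh, ('model', 'data'))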

        + +

        Activation Sharding

        + +

        In order to better utilize the device memory, very often we need to annotate the output of some memory bound ops. That way the compiler is forced to only keep partial output on devices instead of the full output. In Llama 2, we explicitly annotate all torch.matmul and nn.Linear outputs. Table 2 summarizes the corresponding annotations; the example HF code can be found here.

| Output Name | Explanation | Output Shape | Partition Spec |
|---|---|---|---|
| inputs_embeds | embedding layer output | (batch_size, sequence_length, hidden_size) | (data, None, model) |
| query_states | attention nn.Linear output | (batch_size, sequence_length, num_heads x head_dim) | (data, None, model) |
| key_states / value_states | attention nn.Linear output | (batch_size, sequence_length, num_key_value_heads x head_dim) | (data, None, model) |
| attn_weights | attention weights | (batch_size, num_attention_heads, sequence_length, sequence_length) | (data, model, None, None) |
| attn_output | attention layer output | (batch_size, sequence_length, hidden_size) | (data, None, model) |
| up_proj / gate_proj / down_proj | MLP nn.Linear outputs | (batch_size, sequence_length, intermediate_size) | (data, None, model) |
| logits | HF output embedding output | (batch_size, sequence_length, hidden_size) | (data, None, model) |

        Table 2: SPMD 2D Sharding Activation Partition Spec

        + +

        The rule is to shard the batch_size dim of any outputs according to the data axis of the mesh, then replicate the length dims of any outputs, and finally shard the last dim along the model axis.
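For example, annotating one of the nn.Linear outputs from Table 2 might look like the following sketch, assuming the mesh and the xs import from the earlier snippets (the real annotations are in the linked HF fork):

query_states = q_proj(hidden_states)   # (batch_size, sequence_length, num_heads x head_dim)
xs.mark_sharding(query_states, mesh, ('data', None, 'model'))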

        + +

        Input Sharding

        + +

        For input sharding, the rule is to shard the batch dim along the data axis of the mesh, and replicate the sequence_length dim. Below is the example code, and the corresponding HF change may be found here.

        + +
        partition_spec = ('data', None)
        +sharding_spec = xs.ShardingSpec(mesh, partition_spec)
        +# MpDeviceLoader will shard the input data before sending to the device.
        +pl.MpDeviceLoader(dataloader, self.args.device, input_sharding=sharding_spec, ...)
        +
        + +

        Now, all the data and model tensors that require sharding are covered!

        + +

        Optimizer States & Gradients

        + +

        You may be wondering whether it is necessary to shard the optimizer states and gradients as well. Great news: the sharding propagation feature of the XLA compiler automates the sharding annotation in these two scenarios, without needing more hints to improve performance.

        + +

        It is important to note that optimizer states are typically initialized within the first iteration of the training loop. From the standpoint of the XLA compiler, the optimizer states are the outputs of the first graph, and therefore have the sharding annotation propagated. For subsequent iterations, the optimizer states become inputs to the second graph, with the sharding annotation propagated from the first one. This is also why PyTorch/XLA typically produces two graphs for the training loops. If the optimizer states are somehow initialized before the first iteration, users will have to manually annotate them, just like the model weights.

        + +

        Again, all concrete examples of the above sharding annotation can be found in our fork of HF Transformers here. The repo also contains code for our experimental feature MultiSlice, including HybridMesh and dcn axis, which follows the same principles mentioned above.

        + +

        Caveats

        + +

        While using SPMD for training, there are a few important things to pay attention to:

        + +
• Use torch.einsum instead of torch.matmul; torch.matmul usually flattens tensors and does a torch.mm at the end, which is bad for SPMD when the combined axes are sharded, and the XLA compiler will have a hard time determining how to propagate the sharding (see the einsum sketch after this list).
• PyTorch/XLA provides a patched [nn.Linear](https://github.com/pytorch/xla/blob/master/torch_xla/experimental/xla_sharding.py#L570) to overcome the above constraint:
        + +
        import torch_xla.experimental.xla_sharding as xs
        +from torch_xla.distributed.fsdp.utils import apply_xla_patch_to_nn_linear
        +
        + model = apply_xla_patch_to_nn_linear(model, xs.xla_patched_nn_linear_forward)
        +
        + +
• Always reuse the same mesh across all shardings.
• Always specify --dataloader_drop_last yes; the last, smaller batch is hard to annotate.
• Large models that are initialized on the host can induce host-side OOM. One way to avoid this issue is to initialize parameters on the meta device, then create and shard real tensors layer by layer.
        + +
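Below is a small sketch of the einsum recommendation from the first bullet: expressing a batched projection with torch.einsum keeps the batch and hidden axes explicit, so the compiler never has to reason about a flattened, sharded dimension (shapes are illustrative):

import torch

hidden = torch.randn(8, 128, 4096)    # (batch, seq, hidden)
weight = torch.randn(11008, 4096)     # (intermediate, hidden)

# Instead of: out = torch.matmul(hidden, weight.T)
out = torch.einsum('bsh,ih->bsi', hidden, weight)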

        Infrastructure Improvements

        + +

        Besides the above modeling techniques, we have developed additional features and improvements to maximize performance, including:

        + +
• We enable asynchronous collective communication. This requires enhancements on the XLA compiler’s latency hiding scheduler to better optimize for the Llama 2 PyTorch code.
• We now allow sharding annotations in the middle of the IR graph, just like JAX’s jax.lax.with_sharding_constraint. Previously, only graph inputs were annotated.
• We also propagate replicated sharding spec from the compiler to the graph outputs. This allows us to shard the optimizer states automatically.
        + +

        Inference Optimizations

        + +

        All the PyTorch/XLA optimizations implemented for Llama inference are applied to Llama 2 as well. That includes Tensor Parallelism + Dynamo (torch.compile) using torch-xla collective ops, autoregressive decoding logic improvement to avoid recompilation, bucketized prompt length, KV-cache with compilation friendly index ops. Llama 2 introduces two new changes: Grouped Query Attention, and Early Stopping when eos is reached for all prompts. We applied corresponding changes to promote better performance and flexibility with PyTorch/XLA.

        + +

        Grouped Query Attention

        + +

Llama 2 enables Grouped Query Attention for the 70B models. It allows the number of Key and Value heads to be smaller than the number of Query heads, while still supporting KV-cache sharding up to the number of KV heads. For the 70B models, n_kv_heads is 8, which limits tensor parallelism to be less than or equal to 8. In order to shard the model checkpoint to run on more devices, the K, V projection weights need to be replicated first, and then split into multiple pieces. For example, to shard the 70B model checkpoint from 8 pieces to 16 pieces, the K, V projection weights are duplicated and split into 2 pieces for each shard. We provide a reshard_checkpoints.py script to handle that and to make sure the sharded checkpoint behaves mathematically identically to the original checkpoint.

        + +

        EOS Early Stopping

        + +

The Llama 2 generation code added early stopping logic: an eos_reached tensor tracks the completion of all prompt generations, and if the eos token is reached for all prompts in the batch, generation stops early. A similar change is incorporated in the PyTorch/XLA optimized version as well, with some minor tweaks.

        + +

In PyTorch/XLA, checking the value of a tensor like eos_reached as part of a control-flow condition invokes a blocking device-to-host transfer: the tensor is copied from device memory to CPU memory to evaluate its value while all other logic waits. This introduced a delay on the order of milliseconds after every new token generation. As a trade-off, we reduce the rate of checking the eos_reached value to once every 10 newly generated tokens. With this change, the impact of the blocking device-to-host transfer is reduced by 10x, early stopping remains effective, and at most 9 unnecessary tokens are generated after each sequence reaches the eos token.
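A toy, self-contained sketch of this reduced-frequency check is shown below (the model call is faked with random tokens; names like eos_reached mirror the generation loop described above):

import torch

eos_id, batch_size, max_new_tokens = 2, 4, 64
eos_reached = torch.zeros(batch_size, dtype=torch.bool)

for step in range(1, max_new_tokens + 1):
    next_tokens = torch.randint(0, 32000, (batch_size,))  # stand-in for the model
    eos_reached |= next_tokens == eos_id
    # Evaluating the tensor's value forces a device-to-host sync under
    # PyTorch/XLA, so we only do it once every 10 generated tokens.
    if step % 10 == 0 and bool(eos_reached.all()):
        break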

        + +

        Model Serving

        + +

        PyTorch/XLA is working on a serving strategy to enable the PyTorch community to serve their deep learning applications via Torch.Export, StableHLO, and SavedModel. PyTorch/XLA Serving is an experimental feature in PyTorch/XLA 2.1 release; for details visit our serving user guide. Users can take advantage of TorchServe to run their single-host workloads.

        + +

        Benchmarks

        + +

        Metrics

        + +

        To measure training performance, we use the industry-standard metric: Model FLOPS Utilization (MFU). Model FLOPS are the floating point operations required to perform a single forward and backward pass. Model FLOPs are hardware and implementation independent and only depend on the underlying model. MFU measures how effectively the model is using the actual hardware during training. Achieving 100% MFU means that the model is using the hardware perfectly.
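As an illustration of the metric (using the common 6 x parameters x tokens approximation for training FLOPs; this is a generic sketch with made-up numbers, not necessarily the exact accounting used for the results below):

# Illustrative numbers only.
n_params = 70e9                 # model parameters
tokens_per_step = 512 * 1024    # global batch x sequence length
step_time_s = 20.0              # measured wall-clock time per step
peak_hw_flops = 64 * 275e12     # e.g. 64 TPU v4 chips at ~275 bf16 TFLOPS each

model_flops_per_step = 6 * n_params * tokens_per_step
mfu = (model_flops_per_step / step_time_s) / peak_hw_flops
print(f"MFU = {mfu:.1%}")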

        + +

To measure inference performance, we use the industry-standard metric of throughput. First, we measure latency per token once the model has been compiled and loaded. Then, we calculate throughput by dividing the batch size (BS) by the per-chip latency. As a result, throughput measures how the model performs in production environments regardless of how many chips are used.

        + +

        Results

        + +

        Training Evaluation

        + +

        Figure 1 shows Llama 2 SPMD 2D sharding training results on a range of Google TPU v4 hardware with PyTorch/XLA FSDP as the baseline. We increased MFU by 28% across all sizes of Llama 2 compared to FSDP running on the same hardware configuration. This performance improvement is largely due to: 1) 2D Sharding has less communication overhead than FSDP, and 2) asynchronous collective communication is enabled in SPMD which allows communication and computation overlapping. Also note that as the model size scales, we maintain the high MFU. Table 3 shows all the hardware configurations plus some hyperparameters used in the training benchmarks.

        + +

        Figure 1. Llama 2 Training MFU on TPU v4 Hardware

        + +

        Fig. 1: Llama 2 Training MFU on TPU v4 Hardware

        + +

The results in Figure 1 are produced with sequence length 1,024. Figure 2 shows how performance behaves with larger sequence lengths; it shows that our performance also scales linearly with sequence length. The MFU is expected to decrease slightly because a smaller per-device batch size is needed to accommodate the additional memory pressure introduced by the larger sequence length (the sequence length axis is not sharded in 2D sharding), and TPUs are very sensitive to batch size. For the 70B-parameter Llama 2 model, the performance decrease is as low as 4%. At the time of preparing these results, the Hugging Face Llama 2 tokenizer limits the max model input to 2,048, preventing us from evaluating larger sequence lengths.

        + +

        Figure 2. Llama 2 SPMD Training MFU on TPU v4 with Different Sequence Lengths

        + +

        Fig. 2: Llama 2 SPMD Training MFU on TPU v4 with Different Sequence Lengths

| Model Size | 7B | | 13B | | 70B | |
|---|---|---|---|---|---|---|
| TPU NumCores | V4-32 | | V4-64 | | V4-256 | |
| Mesh Shape | (16, 1) | | (32, 1) | | (32, 4) | |
| Seq Len | 1,024 | 2,048 | 1,024 | 2,048 | 1,024 | 2,048 |
| Global Batch | 256 | 128 | 256 | 128 | 512 | 256 |
| Per Device Batch | 16 | 8 | 8 | 4 | 16 | 8 |

        Table 3: Llama 2 SPMD Training Benchmark TPU Configurations and Hyperparameters

        + +

        One last thing to call out is that we use adafactor as the optimizer for better memory utilization. And once again, here is the user guide to reproduce the benchmark results listed above.

        + +

        Inference Evaluation

        + +

In this section, we extend our previous evaluation of Llama on Cloud TPU v4. Here, we demonstrate the performance properties of TPU v5e for inference applications.

        + +

We define inference throughput as the number of tokens produced by a model per second per TPU chip. Figure 3 shows Llama 2 70B throughput on a v5e-16 TPU node. Given that Llama is a memory-bound application, we see that applying weight-only quantization unblocks extending the model batch size to 32. Higher throughput would be possible on larger TPU v5e hardware, up to the point where the ICI network bandwidth between chips throttles the TPU slice from delivering higher throughput. Exploring the upper-bound limits of TPU v5e on Llama 2 was outside the scope of this work. Note that, to make the Llama 2 70B model run on v5e-16, we replicated the attention heads to have one head per chip, as discussed in the Inference section above. As discussed previously, with increasing model batch size, per-token latency grows proportionally; quantization improves overall latency by reducing memory I/O demand.

        + +

        Figure 3. Llama 2 70B Inference Per-Chip Throughput on TPU v5e vs. Batch Size

        + +

        Fig. 3: Llama 2 70B Inference Per-Chip Throughput on TPU v5e vs. Batch Size

        + +

Figure 4 shows inference throughput results across different model sizes. These results highlight the highest throughput for the given hardware configuration when using bf16 precision. With weight-only quantization, this throughput reaches 42 tokens/s/chip on the 70B model. As mentioned above, increasing hardware resources may lead to further performance gains.

        + +

        Figure 4. Llama 2 Inference Per-Chip Throughput on TPU v5e

        + +

        Fig. 4: Llama 2 Inference Per-Chip Throughput on TPU v5e

        + +

        Figure 5 shows the cost of serving Llama 2 models (from Figure 4) on Cloud TPU v5e. We report the TPU v5e per-chip cost based on the 3-year commitment (reserved) price in the us-west4 region. All model sizes use maximum sequence length of 2,048 and maximum generation length of 1,000 tokens. Note that with quantization, the cost for the 70B model drops to $0.0036 per 1,000 tokens.

        + +

        Figure 5. Llama 2 Inference Per-Chip Cost on TPU v5e

        + +

        Fig. 5: Llama 2 Inference Per-Chip Cost on TPU v5e

        + +

        Figure 6 summarizes our best Llama 2 inference latency results on TPU v5e. Llama 2 7B results are obtained from our non-quantized configuration (BF16 Weight, BF16 Activation) while the 13B and 70B results are from the quantized (INT8 Weight, BF16 Activation) configuration. We attribute this observation to the inherent memory saving vs. compute overhead tradeoff of quantization; as a result, for smaller models, quantization may not lead to lower inference latency.

        + +

        Additionally, prompt length has a strong effect on the memory requirements of LLMs. For instance, we observe a latency of 1.2ms / token (i.e. 201 tokens / second / chip) when max_seq_len=256 at batch size of 1 with no quantization on v5e-4 running Llama2 7B.

        + +

        Figure 6. Llama 2 Inference Latency on TPU v5e

        + +

        Fig. 6: Llama 2 Inference Latency on TPU v5e

        + +

        Final Thoughts

        + +

        The recent wave of AI innovation has been nothing short of transformative, with breakthroughs in LLMs at the forefront. Meta’s Llama and Llama 2 models stand as notable milestones in this wave of progress. PyTorch/XLA uniquely enables high-performance, cost-efficient training and inference for Llama 2 and other LLMs and generative AI models on Cloud TPUs, including the new Cloud TPU v5e. Looking forward, PyTorch/XLA will continue to push the performance limits on Cloud TPUs in both throughput and scalability and at the same time maintain the same PyTorch user experience.

        + +

        We are ecstatic about what’s ahead for PyTorch/XLA and invite the community to join us. PyTorch/XLA is developed fully in open source. So, please file issues, submit pull requests, and send RFCs to GitHub so that we can openly collaborate. You can also try out PyTorch/XLA for yourself on various XLA devices including TPUs and GPUs.

        + +

        We would like to extend our special thanks to Marcello Maggioni, Tongfei Guo, Andy Davis, Berkin Ilbeyi for their support and collaboration in this effort.

        + +

        Cheers,
        +The PyTorch/XLA Team at Google

        + +
diff --git a/blog/high-performance-llama/index.html b/blog/high-performance-llama/index.html
new file mode 100644
index 000000000000..ec7f1fa1e010
--- /dev/null
+++ b/blog/high-performance-llama/index.html
@@ -0,0 +1,923 @@

High performance Llama 2 deployments with AWS Inferentia2 using TorchServe | PyTorch

by Mike Zhang, Li Ning, Sergey Ivanov, Naman Nandan, Hamid Shojanazeri, Geeta Chauhan, Abhi Shivaditya, Michael Nguyen, Pinak Panigrahi

        +

        Recently, Llama 2 was released and has attracted a lot of interest from the machine learning community. Amazon EC2 Inf2 instances, powered by AWS Inferentia2, now support training and inference of Llama 2 models. In this post, we show low-latency and cost-effective inference of Llama-2 models on Amazon EC2 Inf2 instances using the latest AWS Neuron SDK release.  We first introduce how to create, compile and deploy the Llama-2 model and explain the optimization techniques introduced by AWS Neuron SDK to achieve high performance at low cost. We then present our benchmarking results. Lastly, we show how the Llama-2 model can be deployed through Amazon SageMaker using TorchServe on an Inf2 instance. 

        + +

        Llama 2 is an auto-regressive language model that uses an optimized transformer architecture

        + +

        What is Llama 2

        + +

        Llama 2 is an auto-regressive language model that uses an optimized transformer architecture. Llama 2 is intended for commercial and research use in English. It comes in multiple sizes—7 billion, 13 billion, and 70 billion parameters—as well as pre-trained and fine-tuned variations. According to Meta, the tuned versions use supervised fine-tuning (SFT) and reinforcement learning with human feedback (RLHF) to align to human preferences for helpfulness and safety. Llama 2 was pre-trained on 2 trillion tokens of data from publicly available sources. The tuned models are intended for assistant-like chat, whereas pre-trained models can be adapted for a variety of natural language generation tasks. Regardless of which version of the model a developer uses, the responsible use guide from Meta can assist in guiding additional fine-tuning that may be necessary to customize and optimize the models with appropriate safety mitigations.

        + +

        Amazon EC2 Inf2 instances Overview

        + +

        Amazon EC2 Inf2 instances, featuring Inferentia2, provide 3x higher compute, 4x more accelerator memory, resulting in up to 4x higher throughput, and up to 10x lower latency, compared to the first generation Inf1 instances.

        + +

Large language model (LLM) inference is a memory-bound workload; performance scales with accelerator memory bandwidth. Inf2 instances are the only inference-optimized instances in Amazon EC2 to provide a high-speed accelerator interconnect (NeuronLink), enabling high-performance, cost-effective distributed inference for large LLM deployments. You can now efficiently and cost-effectively deploy billion-scale LLMs across multiple accelerators on Inf2 instances.

        + +

        Inferentia2 supports FP32, TF32, BF16, FP16, UINT8, and the new configurable FP8 (cFP8) data type. AWS Neuron can take high-precision FP32 and FP16 models and autocast them to lower-precision data types while optimizing accuracy and performance. Autocasting reduces time to market by removing the need for lower-precision retraining and enabling higher-performance inference with smaller data types.

        + +

        To make it flexible and extendable to deploy constantly evolving deep learning models, Inf2 instances have hardware optimizations and software support for dynamic input shapes as well as custom operators written in C++ through the standard PyTorch custom operator programming interfaces.

        + +

        Transformers Neuron (transformers-neuronx)

        + +

        Transformers Neuron is a software package that enables PyTorch users to deploy performance optimized LLM inference. It has an optimized version of transformer models implemented with XLA high level operators (HLO), which enables sharding tensors across multiple NeuronCores, a.k.a. tensor parallelism, and performance optimizations such as parallel context encoding and KV caching for Neuron hardware. The Llama 2 source code in XLA HLOs can be found here.

        + +

Llama 2 is supported in Transformers Neuron through the LlamaForSampling class. Transformers Neuron provides a seamless user experience with Hugging Face models for optimized inference on Inf2 instances. More details can be found in the Transformers Neuron Developer Guide. In the following section, we explain how to deploy the Llama-2 13B model using Transformers Neuron; this example also applies to other Llama-based models.

        + +

        Llama 2 model inference with Transformers Neuron

        + +

        Create model, compile and deploy

        + +

        We have three simple steps here to create, compile and deploy the model on Inf2 instances.

        + +
1. Create a CPU model; use this script or the following code snippet to serialize and save checkpoints in a local directory.
        + +
        from transformers import AutoModelForCausalLM
        +from transformers_neuronx.module import save_pretrained_split
        +model_cpu = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-13b-hf", low_cpu_mem_usage=True)
        +model_dir = "./llama-2-13b-split"
        +save_pretrained_split(model_cpu, model_dir)
        +
        + +
2. Load and compile the model from the local directory where you saved the serialized checkpoints. To load the Llama 2 model, we use LlamaForSampling from Transformers Neuron. Note that the environment variable NEURON_RT_NUM_CORES specifies the number of NeuronCores to be used at runtime and it should match the tensor parallelism (TP) degree specified for the model. Also, NEURON_CC_FLAGS enables compiler optimization on decoder-only LLM models.
        + +
        from transformers_neuronx.llama.model import LlamaForSampling
        +os.environ['NEURON_RT_NUM_CORES'] = '24'
        +os.environ['NEURON_CC_FLAGS'] = '--model-type=transformer'
        +model = LlamaForSampling.from_pretrained(
        +        model_dir,
        +        batch_size=1,
        +        tp_degree=24,
        +        amp='bf16',
        +        n_positions=16,
        +        context_length_estimate=[8]
        +    )
        +
        + +

        Now let's compile the model and load model weights into device memory with a one liner API.

        +
        model.to_neuron()
        +
        + +
3. Finally, let's run inference on the compiled model. Note that both the input and output of the sample function are sequences of tokens.
        + +
        inputs = torch.tensor([[1, 16644, 31844, 312, 31876, 31836, 260, 3067, 2228, 31844]])
        +seq_len = 16
        +outputs = model.sample(inputs, seq_len, top_k=1)
        +
        + +

        Inference optimizations in Transformers Neuron

        + +

        Tensor parallelism

        + +

        Latency with different TP degrees

        + +

Transformers Neuron implements parallel tensor operations across multiple NeuronCores. We denote the number of cores used for inference as the TP degree. A larger TP degree provides higher aggregate memory bandwidth, leading to lower latency, as LLM token generation is a memory-IO-bound workload. Our results show an overall speedup of roughly 4x as the TP degree increases from 2 to 24. For the Llama-2 7B model, latency decreases from 30.1 ms/token with 2 cores to 7.9 ms/token with 24 cores; similarly, for the Llama-2 13B model, it goes down from 57.3 ms/token to 11.1 ms/token.

        + +

        Parallel context encoding

        + +

        In the transformer architecture, tokens are produced in a sequential procedure called autoregressive sampling while input prompt tokens can be processed in parallel with parallel context encoding. This can significantly reduce the latency for input prompt context encoding before token generation through autoregressive sampling. By default, the parameter context_length_estimate would be set as a list of power-of-2 numbers which aims to cover a wide variety of context lengths. Depending on the use case, it can be set to custom numbers. This can be done when creating the Llama 2 model using LlamaForSampling.from_pretrained. We characterize the impact of input token length on end-to-end (E2E) latency. As shown in the figure, latency for text generation with the Llama-2 7B model only slightly increases with bigger input prompts, thanks to parallel context encoding.

        + +

        E2E latency

        + +

        KV caching

        + +

The self-attention block performs the self-attention operation with KV vectors. KV vectors are calculated from token embeddings and the KV weights, and are thus associated with tokens. In naive implementations, the entire set of KV vectors is recalculated for each generated token, which reduces performance. The Transformers Neuron library therefore reuses previously calculated KV vectors to avoid unnecessary computation, a technique known as KV caching, to reduce latency in the autoregressive sampling phase.
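A toy sketch of the KV-caching idea (illustrative only, ignoring batching and multi-head details): only the new token's K/V vectors are computed each step, while everything previously computed is reused.

import torch

d = 16
k_cache, v_cache = [], []

def attend(q, new_k, new_v):
    # Append only the new token's K/V; everything previous is reused.
    k_cache.append(new_k)
    v_cache.append(new_v)
    K = torch.stack(k_cache)                       # (tokens_so_far, d)
    V = torch.stack(v_cache)
    scores = torch.softmax(q @ K.T / d ** 0.5, dim=-1)
    return scores @ V

for _ in range(4):                                 # autoregressive steps
    out = attend(torch.randn(d), torch.randn(d), torch.randn(d))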

        + +

        Benchmarking results

        + +

        We benchmarked the latency and cost for both Llama-2 7B and 13B models under different conditions, i.e., number of output tokens, instance types. Unless specified, we use data type ‘bf16’ and batch size of 1 as this is a common configuration for real-time applications like chatbot and code assistant.

        + +

        Latency

        + +

The following graph shows the per-token latency on an inf2.48xlarge instance with TP degree 24. Here, the latency per output token is calculated as the end-to-end latency divided by the number of output tokens. Our experiments show that Llama-2 7B end-to-end latency to generate 256 tokens is 2x faster compared to other comparable inference-optimized EC2 instances.

        + +

        Latency on inf2

        + +

        Throughput

        + +

We now show the number of tokens generated per second for the Llama-2 7B and 13B models on the inf2.48xlarge instance. With TP degree 24, fully utilizing all 24 NeuronCores, we achieve 130 tokens/sec and 90 tokens/sec for the Llama-2 7B and 13B models, respectively.

        + +

        E2E throughput

        + +

        Cost

        + +

For latency-first applications, we show the cost of hosting Llama-2 models on the inf2.48xlarge instance: $0.011 per 1,000 tokens and $0.016 per 1,000 tokens for the 7B and 13B models, respectively, a 3x cost saving over other comparable inference-optimized EC2 instances. Note that we report the cost based on the 3-year reserved instance price, which is what customers use for large production deployments.

        + +

        Cost on inf2

        + +

We also compare the cost of hosting the Llama-2 7B model on inf2.xlarge and inf2.48xlarge instances. We can see that inf2.xlarge is more than 4x cheaper than inf2.48xlarge, but at the expense of longer latency due to the smaller TP degree. For example, it takes the model 7.9 ms per output token to generate 256 output tokens with 256 input tokens on inf2.48xlarge, but 30.1 ms per output token on inf2.xlarge.

        + +

        Cost on Llama

        + +

        Serving Llama2 with TorchServe on EC2 Inf2 instance

        + +

        Now, we move on to model deployment. In this section, we show you how to deploy the Llama-2 13B model through SageMaker using TorchServe, which is the recommended model server for PyTorch, preinstalled in the AWS PyTorch Deep Learning Containers (DLC).

        + +

        This section describes the preparation work needed for using TorchServe, particularly, how to configure model_config.yaml and inf2_handler.py as well as how to generate model artifacts and pre-compile the model for use in later model deployment. Preparing the model artifacts ahead-of-time avoids model compilation during model deployment and thus reduces the model loading time.

        + +

        Model configuration model-config.yaml

        + +

The parameters defined in the handler and micro_batching sections are used by the custom handler inf2_handler.py. More details about model_config.yaml are here. TorchServe micro-batching is a mechanism to pre-process and post-process a batch of inference requests in parallel. It is able to achieve higher throughput by better utilizing the available accelerator when the backend is steadily fed with incoming data; see here for more details. For model inference on Inf2, micro_batch_size, amp, tp_degree and max_length specify the batch size, data type, tensor parallelism degree and max sequence length, respectively.

        + +
        # TorchServe Frontend Parameters
        +minWorkers: 1
        +maxWorkers: 1
        +maxBatchDelay: 100
        +responseTimeout: 10800
        +batchSize: 16
        +
        +# TorchServe Backend Custom Handler Parameters
        +handler:
        +    model_checkpoint_dir: "llama-2-13b-split"
        +    amp: "bf16"
        +    tp_degree: 12
        +    max_length: 100
        +
        +micro_batching:
        +    # Used by batch_size in function LlamaForSampling.from_pretrained
        +    micro_batch_size: 1  
        +    parallelism:
        +        preprocess: 2
        +        inference: 1
        +        postprocess: 2
        +
        + +

        Custom handler inf2_handler.py

        + +

        Custom handler in Torchserve is a simple Python script that lets you define the model initialization, preprocessing, inference and post-processing logic as functions. Here, we create our Inf2 custom handler.

        + +
1. The initialize function is used to load the model. Here, the Neuron SDK compiles the model the first time it is run and saves the precompiled model (caching is enabled by NEURONX_CACHE) in the directory specified by NEURONX_DUMP_TO. Subsequent runs check whether pre-compiled model artifacts already exist and, if so, skip model compilation. Once the model is loaded, we initiate warm-up inference requests so that the compiled version is cached. Using the Neuron persistent cache can significantly reduce model loading latency, ensuring that subsequent inference runs start swiftly.
        + +
        os.environ["NEURONX_CACHE"] = "on"
        +os.environ["NEURONX_DUMP_TO"] = f"{model_dir}/neuron_cache"
        +
        + +

        TorchServe `TextIteratorStreamerBatch` extends Hugging Face transformers `BaseStreamer` to support response streaming when `batchSize` is larger than 1. 

        + +
        self.output_streamer = TextIteratorStreamerBatch(
        +    self.tokenizer,
        +    batch_size=self.handle.micro_batch_size,
        +    skip_special_tokens=True,
        +)
        +
        + +
2. The inference function calls send_intermediate_predict_response to send the streaming response.
        + +
        for new_text in self.output_streamer:
        +    logger.debug("send response stream")
        +    send_intermediate_predict_response(
        +        new_text[: len(micro_batch_req_id_map)],
        +        micro_batch_req_id_map,
        +        "Intermediate Prediction success",
        +        200,
        +        self.context,
        +    )
        +
        + +

        Package model artifacts

        + +

        Package all the model artifacts into a folder llama-2-13b-neuronx-b1 using the torch-model-archiver

        + +
        torch-model-archiver --model-name llama-2-13b-neuronx-b1 --version 1.0 --handler inf2_handler.py -r requirements.txt --config-file model-config.yaml --archive-format no-archive
        +
        + +

        Serve the model

        + +
        export TS_INSTALL_PY_DEP_PER_MODEL="true"
        +torchserve --ncs --start --model-store model_store --models llama-2-13b-neuronx-b1
        +
        + +

Once the log shows “WORKER_MODEL_LOADED”, the pre-compiled model should be saved in the folder llama-2-13b-neuronx-b1/neuron_cache, which is tightly coupled with the Neuron SDK version. Then, upload the folder llama-2-13b-neuronx-b1 to your S3 bucket for later use in production deployment. The Llama-2 13B model artifacts in this blog, which are associated with Neuron SDK 2.13.2, can be found here in the TorchServe model zoo.

        + +

        Deploy Llama-2 13B model on SageMaker Inf2 instance using TorchServe 

        + +

In this section, we deploy the Llama-2 13B model using a PyTorch Neuronx container on a SageMaker endpoint with an ml.inf2.24xlarge hosting instance, which has 6 Inferentia2 accelerators, corresponding to the tp_degree: 12 handler setting in our model configuration model_config.yaml. Given that we have packaged all the model artifacts into a folder using torch-model-archiver and uploaded them to an S3 bucket, we will now use the SageMaker Python SDK to create a SageMaker model and deploy it to a SageMaker real-time endpoint using the uncompressed model deployment method. Speed is the key benefit of deploying in this manner with SageMaker: you get a fully functional, production-ready, secure RESTful endpoint without any effort spent on infrastructure. There are 3 steps to deploying the model and running inference on SageMaker. The notebook example can be found here.

        + +
1. Create a SageMaker model
        + +
        from datetime import datetime
        +
        +instance_type = "ml.inf2.24xlarge"
        +endpoint_name = sagemaker.utils.name_from_base("ts-inf2-llama2-13b-b1")
        +
        +model = Model(
        +    name="torchserve-inf2-llama2-13b" + datetime.now().strftime("%Y-%m-%d-%H-%M-%S"),
        +    # Enable SageMaker uncompressed model artifacts
        +    model_data={
        +        "S3DataSource": {
        +                "S3Uri": s3_uri,
        +                "S3DataType": "S3Prefix",
        +                "CompressionType": "None",
        +        }
        +    },
        +    image_uri=container,
        +    role=role,
        +    sagemaker_session=sess,
        +    env={"TS_INSTALL_PY_DEP_PER_MODEL": "true"},
        +)
        +
        + +
2. Deploy a SageMaker model
        + +
        model.deploy(
        +    initial_instance_count=1,
        +    instance_type=instance_type,
        +    endpoint_name=endpoint_name,
        +    volume_size=512, # increase the size to store large model
        +    model_data_download_timeout=3600, # increase the timeout to download large model
        +    container_startup_health_check_timeout=600, # increase the timeout to load large model
        +)
        +
        + +
3. Run streaming response inference on SageMaker. When the endpoint is in service, you can use the invoke_endpoint_with_response_stream API call to invoke the model. This feature enables the return of each generated token to the user, enhancing the user experience. It’s especially beneficial when generating an entire sequence is time-consuming.
        + +
        import json
        +
        +body = "Today the weather is really nice and I am planning on".encode('utf-8')
        +resp = smr.invoke_endpoint_with_response_stream(EndpointName=endpoint_name, Body=body, ContentType="application/json")
        +event_stream = resp['Body']
        +parser = Parser()
        +for event in event_stream:
        +    parser.write(event['PayloadPart']['Bytes'])
        +    for line in parser.scan_lines():
        +        print(line.decode("utf-8"), end=' ')
        +
        + +

        Sample inference:

        + +

        Input

        + +

        “Today the weather is really nice and I am planning on”

        + +

        Output

        + +

        “Today the weather is really nice and I am planning on going to the beach. I am going to take my camera and take some pictures of the beach. I am going to take pictures of the sand, the water, and the people. I am also going to take pictures of the sunset. I am really excited to go to the beach and take pictures.

        + +

        The beach is a great place to take pictures. The sand, the water, and the people are all great subjects for pictures. The sunset is also a great subject for pictures.”

        + +

        Conclusion

        + +

        In this post, we showcased how to run Llama 2 model inference using Transformers Neuron and deploy Llama 2 model serving using TorchServe through Amazon SageMaker on an EC2 Inf2 instance. We demonstrated the benefits of using Inferentia2—low latency and low cost—enabled by optimizations in AWS Neuron SDK including tensor parallelism, parallel context encoding and KV caching, particularly for LLM inference. To stay up to date, please follow AWS Neuron’s latest release for new features.

        + +

        Get started today with Llama 2 examples on EC2 and through SageMaker and stay tuned for how to optimize Llama 70B on Inf2!

        + +
diff --git a/blog/hitchhikers-guide-speculative-decoding/index.html b/blog/hitchhikers-guide-speculative-decoding/index.html
new file mode 100644
index 000000000000..ed8000b8145a
--- /dev/null
+++ b/blog/hitchhikers-guide-speculative-decoding/index.html
@@ -0,0 +1,732 @@

A Hitchhiker’s Guide to Speculative Decoding | PyTorch

by Team PyTorch at IBM

        +

        Speculative decoding is an optimization technique for inference that makes educated guesses about future tokens while generating the current token, all within a single forward pass. It incorporates a verification mechanism to ensure the correctness of these speculated tokens, thereby guaranteeing that the overall output of speculative decoding is identical to that of vanilla decoding. Optimizing the cost of inference of large language models (LLMs) is arguably one of the most critical factors in reducing the cost of generative AI and increasing its adoption. Towards this goal, various inference optimization techniques are available, including custom kernels, dynamic batching of input requests, and quantization of large models.

        + +

        In this blog post, we provide a guide to speculative decoding and demonstrate how it can coexist with other optimizations. We are proud to open source the following, which includes the first speculator for Llama3 models:

        + +
1. Speculator models for Meta Llama3 8B, IBM Granite 7B lab, Meta Llama2 13B, and Meta Code Llama2 13B.
2. The code for inference via IBM’s fork of HF TGI.
3. The code for training your own speculators and corresponding recipes.
        + +

We have deployed these speculators in an internal production-grade environment with thousands of daily users and observed a 2x speedup on language models (Llama3 8B, Llama2 13B, and IBM Granite 7B) and a 3x speedup on IBM’s Granite 20B code models. We provide a detailed explanation of our approach in this technical report and are planning an in-depth analysis in an upcoming ArXiv paper.

        + +

        Speculative decoding: Inference

        + +

        We run IBM TGIS in our internal production environment that has optimizations such as continuous batching, fused kernels, and quantization kernels. To enable speculative decoding in TGIS, we modified the paged attention kernel from vLLM. In what follows, we will describe the key changes to the inference engine to enable speculative decoding.

        + +

Speculative decoding is based on the premise that the model is powerful enough to predict multiple tokens in a single forward pass. However, current inference servers are optimized to predict only a single token at a time. In our approach, we attach multiple speculative heads (in addition to the usual one) to the LLM to predict the (N+1)-th, (N+2)-th, (N+3)-th, … tokens. For example, 3 heads will predict 3 additional tokens. Details of the speculator architecture are explained in a later part of this blog. There are two challenges to achieving efficiency and correctness during inference: one is to predict without replicating the KV-cache, and the other is to verify that the predictions match the original model’s outcomes.

        + +

        In a typical generation loop, after the prompt is processed in a single forward step, a sequence length of 1 (next token predicted) is fed into the forward pass of the model along with the kv-cache. In a naive speculative decoding implementation, each speculative head would have its own kv-cache, but instead we modify the paged attention kernel developed in the vLLM project to enable efficient kv-cache maintenance. This ensures that throughput does not reduce at larger batch sizes. Further, we modify the attention masks to enable verification of the N+1’th token and thus enable speculative decoding without deviating from the original model’s output. The details of this implementation are captured here.

        + +
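To make the verification step concrete, the sketch below is a minimal, plain-PyTorch illustration (not the TGIS/paged-attention implementation): it runs the candidate sequence through the base model in one forward pass and accepts the longest prefix of speculated tokens that matches the base model’s own greedy choices. The names base_model, input_ids, and speculated are hypothetical placeholders, and base_model is assumed to return logits of shape (seq_len, vocab).

import torch

@torch.no_grad()
def verify_and_accept(base_model, input_ids, speculated):
    # input_ids: 1D prompt tokens; speculated: 1D candidate tokens from the heads.
    candidate = torch.cat([input_ids, speculated])       # prompt + speculated tokens
    logits = base_model(candidate)                       # single forward pass
    # The logits at position i predict token i+1, so take the k+1 greedy
    # predictions covering every speculated position plus one bonus token.
    preds = logits[len(input_ids) - 1 :].argmax(dim=-1)
    matches = (preds[:-1] == speculated).int()
    n_accept = int(matches.cumprod(dim=0).sum())         # longest matching prefix
    next_token = preds[n_accept : n_accept + 1]          # base model's own next token
    return torch.cat([speculated[:n_accept], next_token])

Because only accepted tokens are emitted, the final output is guaranteed to match what vanilla greedy decoding of the base model would have produced.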

        Results

        + +

We illustrate the speedup obtained with Meta’s chat version of Llama2 13B using a simple prompt.

        + +


        + +

        Figure 2: Visual illustration of the non-speculative generation (left) compared to speculative generation (right)

        + +

        We deployed the above solution in an internal production environment. The figure below reports two metrics – time to first token (TTFT) and inter-token latency (ITL) with different numbers of concurrent users (which is captured in the numbers on the graph lines). We observe that the speculative decoding version is nearly twice as fast for the Llama2 13B chat model and nearly thrice as fast for the Granite 20B code model compared to the non-speculative version for all batch sizes. We observe similar behavior for the smaller models - IBM’s Granite 7B and Meta Llama3 8B models.

        + +


        + +

        Figure 3: Time to first token (TTFT - left) and Inter-token latency (ITL - right) for Llama 13B with number of concurrent users indicated on the graph

        + +


        + +

        Figure 4: Time to first token (TTFT - left) and Inter-token latency (ITL - right) for Granite 20B Code with number of concurrent users indicated on the graph

        + +

        Note on efficiency

        + +

        We performed numerous experiments to determine the right configuration for speculator training. These are:

        + +
1. Speculator architecture: The current approach allows the number of heads to be modified, which maps to the number of tokens we can look ahead. Increasing the number of heads also increases the extra compute needed and the complexity of training. In practice, we find that 3-4 heads work well for language models, whereas code models can reap benefits from 6-8 heads.
2. Compute: Increasing the number of heads increases compute in two dimensions: the latency of a single forward pass grows, and more compute is spent producing the extra tokens. If the speculator is not accurate with more heads, the wasted compute increases latency and reduces throughput.
3. Memory: The increased compute is offset by fewer roundtrips to HBM per generated token. For example, if a 3-token lookahead is fully correct, we have saved three HBM round trips.
        + +

We settled on 3-4 heads for the language models and 6-8 heads for the code models. Across model sizes ranging from 7B to 20B, we observed significant latency improvements without throughput loss compared to non-speculative decoding. We begin to observe throughput reduction beyond a batch size of 64, which happens rarely in practice.

        + +

        Speculative decoding: Training

        + +

        There are two broad approaches for speculative decoding, one is to leverage a smaller model (e.g., Llama 7B as a speculator for Llama 70B) and the other is to attach speculator heads (and train them). In our experiments, we find the approach of attaching speculator heads to be more effective both in model quality and latency gains.

        + +

        Speculator architecture

        + +

Medusa made speculative decoding popular; their approach is to add a head to the existing model, which is then trained to do speculation. We modify the Medusa architecture by making the “heads” hierarchical, where each head stage predicts a single token and then feeds it to the next head stage. These multi-stage heads are depicted in the below figure. We are exploring ways of minimizing the size of the embedding tables by sharing them across the multiple stages and the base model.

        + +
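As a rough illustration of the multi-stage idea shown in the figure below (a simplified sketch, not the exact architecture shipped in IBM FMS), each stage consumes the base-model state together with the embedding of the previously predicted token and emits a greedy guess for the next position. All class and dimension names here are hypothetical.

import torch
import torch.nn as nn

class MultiStageSpeculator(nn.Module):
    """Sketch of hierarchical speculator heads: stage i predicts token N+1+i."""
    def __init__(self, hidden, vocab, n_heads=3):
        super().__init__()
        self.emb = nn.Embedding(vocab, hidden)   # could be shared with the base model
        self.stages = nn.ModuleList(
            [nn.Sequential(nn.Linear(2 * hidden, hidden), nn.GELU()) for _ in range(n_heads)]
        )
        self.lm_heads = nn.ModuleList([nn.Linear(hidden, vocab) for _ in range(n_heads)])

    def forward(self, z, last_token):
        # z: (batch, hidden) base-model state; last_token: (batch,) ids of token N
        tokens = []
        state, prev = z, last_token
        for stage, lm_head in zip(self.stages, self.lm_heads):
            state = stage(torch.cat([state, self.emb(prev)], dim=-1))
            prev = lm_head(state).argmax(dim=-1)   # greedy guess for the next position
            tokens.append(prev)
        return torch.stack(tokens, dim=1)          # (batch, n_heads) speculated tokens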


        + +

        Figure 4: A simple architecture diagram for a 3-headed multi-stage speculator. Z is the state from the base model.

        + +

        Speculator training

        + +

        We have a two-phase approach to training a speculator for efficiency reasons. In the first phase, we train on small batches with long sequence lengths (4k tokens) and use the standard causal LM approach for training. In phase 2, we use large batches with short sequence lengths (256 tokens) generated from the base model. In this training phase, we tune the heads to match the output of the base model. Through numerous experiments, we find that a 5:2 ratio of steps for phase 1 vs phase 2 works well. We depict the progress of these phases in the below figure. We use PyTorch FSDP and IBM FMS for the training of speculators.

        + +
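A skeleton of this two-phase schedule might look like the sketch below. The data loaders and the loss helpers (a causal LM loss for phase 1 and a loss matching base-model outputs for phase 2) are hypothetical placeholders; the real recipes in IBM FMS additionally handle FSDP sharding, checkpointing, and learning-rate schedules.

import itertools

def run_phase(speculator, opt, loader, num_steps, loss_fn):
    for batch in itertools.islice(iter(loader), num_steps):
        loss = loss_fn(speculator, batch)   # hypothetical loss helper
        opt.zero_grad()
        loss.backward()
        opt.step()

def train_speculator(speculator, opt, phase1_loader, phase2_loader, total_steps,
                     causal_lm_loss, distill_loss):
    # Split the step budget 5:2 between phase 1 (long sequences, causal LM loss)
    # and phase 2 (short sequences generated by the base model).
    phase1_steps = total_steps * 5 // 7
    run_phase(speculator, opt, phase1_loader, phase1_steps, causal_lm_loss)
    run_phase(speculator, opt, phase2_loader, total_steps - phase1_steps, distill_loss)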


        + +

        Figure 5: Per-head training loss curves for Llama2-13B speculator training, phase 1 and 2

        + +

        Conclusion and Future Work

        + +

        Through this blog, we are releasing a new approach for speculative decoding and the following assets:

        + +
1. Models for improving the inter-token latencies for a range of models - Llama3 8B, Llama2 13B, Granite 7B, and CodeLlama 13B
2. Production quality code for inference
3. Recipes for training speculators
        + +

        We are working on training speculators for Llama3 70B and Mistral models and invite the community to contribute as well as help improve on our framework. We would also love to work with major open source serving frameworks such as vLLM and TGI to contribute back our speculative decoding approach to benefit the community.

        + +

        Acknowledgements

        + +

        There are several teams that helped us get to these latency improvements for inference. We would like to thank the vLLM team for creating the paged attention kernel in a clean and reusable manner. We extend our gratitude to the Team PyTorch at Meta that helped provide feedback on this blog as well as continued efforts on optimal usage of PyTorch. Special thanks to our internal production teams at IBM Research who took this prototype to production and hardened it. A shout out to Stas Bekman for providing insightful comments on the blog resulting in an improved explanation of the tradeoffs between compute, memory, and speculator effectiveness.

        + +

        The paged attention kernel was integrated into IBM FMS by Josh Rosenkranz and Antoni Viros i Martin. The speculator architecture and training was done by Davis Wertheimer, Pavithra Ranganathan, and Sahil Suneja. The integration of the modeling code with the inference server was done by Thomas Parnell, Nick Hill, and Prashant Gupta.

diff --git a/blog/hopper-tma-unit/index.html b/blog/hopper-tma-unit/index.html
new file mode 100644
index 000000000000..5194099231c8
--- /dev/null
+++ b/blog/hopper-tma-unit/index.html
@@ -0,0 +1,1049 @@

Deep Dive on the Hopper TMA Unit for FP8 GEMMs | PyTorch

by Adnan Hoque, Less Wright, Chih-Chieh Yang

        +

        Abstract

        + +

        The Hopper (H100) GPU architecture, billed as the “first truly asynchronous GPU”, includes a new, fully asynchronous hardware copy engine for bulk data movement between global and shared memory called Tensor Memory Accelerator (TMA). While CUTLASS has built-in support for TMA via its asynchronous pipeline paradigm, Triton exposes TMA support via an experimental API.

        + +

In this post, we provide a deeper dive into the details of how TMA works, so developers can understand the new async copy engine. We also show the importance of leveraging TMA for H100 kernels by building a TMA-enabled FP8 GEMM kernel in Triton, which delivers 1.4-2.2x performance gains over cuBLAS FP16 for small-to-medium problem sizes. Finally, we showcase key implementation differences between Triton and CUTLASS that may account for reports of performance regressions with TMA in Triton. We open source our implementation for reproducibility and review at https://github.com/pytorch-labs/applied-ai/tree/main/kernels.

        + +


        + +

        Figure 1. The throughput in TFLOPs of various Triton and cuBLAS FP8 and FP16 kernels, for M=M, N=4096, K=4096. The red line is the Triton TMA, which showcases the advantages of leveraging TMA.

        + +

        TMA Background

        + +

        TMA is an H100 hardware addition that allows applications to asynchronously and bi-directionally transfer 1D-5D tensors between GPU global and shared memory. In addition, TMA can also transfer the same data to not just the calling SM’s shared memory, but to other SM’s shared memory if they are part of the same Thread Block Cluster. This is termed ‘multicast’.

        + +

TMA is very lightweight, as only a single thread is needed to kick off a TMA transfer. By moving data directly from GMEM (global) to SMEM (shared), TMA avoids the earlier requirement of staging data through registers when moving it between memory spaces.

        + +


        + +

Figure 2. A100-style data movement vs H100 with TMA. TMA hardware eliminates the need for a large number of threads and registers to participate in bulk data transfers. (Image credit: Nvidia)

        + +

A single thread can issue large data movement instructions, allowing the majority of a given thread block to continue working on other instructions while data is in flight. Combined with asynchronous pipelining, this allows memory transfers to be easily hidden and ensures that the majority of any given thread block cluster can focus on computational tasks.

        + +

        This lightweight invocation for data movement enables the creation of warp-group specialized kernels, where warp-groups take on different roles, namely producers and consumers. Producers elect a leader thread that fires off TMA requests, which are then asynchronously coordinated with the consumer (MMA) warp-groups via an arrival barrier. Consumers then process the data using warp-group MMA, and signal back to the producers when they have finished reading from the SMEM buffer and the cycle repeats.

        + +

        Further, within threadblock clusters, producers can lower their max register requirements since they are only issuing TMA calls, and effectively transfer additional registers to MMA consumers, which helps to alleviate register pressure for consumers.

        + +

        In addition, TMA handles the address computation for the shared memory destination where the data requested should be placed. This is why calling threads (producers) can be so lightweight.

        + +

To maximize read access speed, TMA can lay out the arriving data according to swizzling instructions so that consumers can read it as quickly as possible, since the swizzling pattern helps avoid shared memory bank conflicts.

        + +

Finally, for outgoing TMA instructions that move data from SMEM to GMEM, TMA can also apply reduction operations (add/min/max) and bitwise operations (and/or).

        + +

        TMA usage in Triton

        + +

        Pre-Hopper Load:

        + +
offs_am = pid_m*block_m + tl.arange(0, block_m)  # row offsets into A for this tile
+offs_bn = pid_n*block_n + tl.arange(0, block_n)  # column offsets into B for this tile
+offs_k = tl.arange(0, block_k)
+
+a_ptrs = a_ptr + (offs_am[:, None]*stride_am + offs_k[None, :]*stride_ak)
+b_ptrs = b_ptr + (offs_k[:, None]*stride_bk + offs_bn[None, :]*stride_bn)
+
+a = tl.load(a_ptrs)
+b = tl.load(b_ptrs)
        +
        + +

        Figure 3. Traditional style bulk load from global to shared memory in Triton

        + +

        In the above Triton example showing a pre-Hopper load, we see how the data for tensors a and b are loaded by each thread block computing global offsets (a_ptrs, b_ptrs) from their relevant program_id (pid_m, pid_n, k) and then making a request to move blocks of memory into shared memory for a and b.

        + +

        Now let’s examine how to perform a load using TMA in Triton.

        + +

        The TMA instruction requires a special data structure called a tensor map, in contrast to the above where we directly pass pointers to global memory. To build the tensor map, we first create a TMA descriptor on the CPU. The descriptor handles the creation of the tensor map by using the cuTensorMapEncode API. The tensor map holds metadata such as the global and shared memory layout of the tensor and serves as a compressed representation of the structure of the multi-dimensional tensor stored in global memory.

        + +


        + +

        Figure 4. TMA address generation via a copy descriptor (Image credit: Nvidia)

        + +

        The TMA descriptor holds the tensor’s key properties:

        + +
1. Base Pointer
2. Shape and Block Size
3. Datatype
        + +

The TMA descriptor is created on the host before the kernel launch and then moved to the device by copying it into a torch tensor on the GPU. Thus, in Triton, the GEMM kernel receives a global pointer to the tensor map.

        + +

        Triton Host Code

        + +
           desc_a = np.empty(TMA_SIZE, dtype=np.int8)
        +   desc_b = np.empty(TMA_SIZE, dtype=np.int8)
        +   desc_c = np.empty(TMA_SIZE, dtype=np.int8)
        +
        +   triton.runtime.driver.active.utils.fill_2d_tma_descriptor(a.data_ptr(), m, k, block_m, block_k, a.element_size(), desc_a)
        +
        +   triton.runtime.driver.active.utils.fill_2d_tma_descriptor(b.data_ptr(), n, k, block_n, block_k, b.element_size(), desc_b)
        +
        +   triton.runtime.driver.active.utils.fill_2d_tma_descriptor(c.data_ptr(), m, n, block_m, block_n, c.element_size(), desc_c)
        +  
        +   desc_a = torch.tensor(desc_a, device='cuda')
        +   desc_b = torch.tensor(desc_b, device='cuda')
        +   desc_c = torch.tensor(desc_c, device='cuda')
        +
        + +

        This is the code that is used to set up the descriptors in the kernel invoke function.

        + +

        Triton Device Code

        + +

        Offsets/Pointer Arithmetic:

        + +
           offs_am = pid_m * block_m
        +   offs_bn = pid_n * block_n
        +   offs_k = 0
        +
        + +

        Load:

        + +
          a = tl._experimental_descriptor_load(a_desc_ptr, [offs_am, offs_k], [block_m, block_k], tl.float8e4nv)
        +  b = tl._experimental_descriptor_load(b_desc_ptr, [offs_bn, offs_k], [block_n, block_k], tl.float8e4nv)
        +
        + +

        Store:

        + +
         tl._experimental_descriptor_store(c_desc_ptr, accumulator, [offs_am, offs_bn])
        +
        + +

        We no longer need to calculate a pointer array for both load and store functions in the kernel. Instead, we pass a single descriptor pointer, the offsets, block size and the input datatype. This simplifies address calculation and reduces register pressure, as we no longer have to do complex pointer arithmetic in software and dedicate CUDA cores for address computation.

        + +

        TMA Performance Analysis

        + +

        Below, we discuss the PTX instructions for different load mechanisms on Hopper.

        + +

        PTX for Loading Tile (cp.async) - H100 no TMA

        + +
        add.s32 	%r27, %r100, %r8;
        +add.s32 	%r29, %r100, %r9;
        +selp.b32 	%r30, %r102, 0, %p18;
        +
        +
        +@%p1 cp.async.cg.shared.global [ %r27 + 0 ], [ %rd20 + 0 ], 0x10, %r30;
        +@%p1 cp.async.cg.shared.global [ %r29 + 0 ], [ %rd21 + 0 ], 0x10, %r30;
        +
        +
        +cp.async.commit_group ;
        +
        + +

Here, we observe the older cp.async instruction responsible for global memory copies. From the traces below, we can see that both loads bypass the L1 cache. A major difference with the newer TMA load is that previously, before tiles from A and B were ready to be consumed by the Tensor Core, we needed to execute an ldmatrix instruction that operated on data held in register files. On Hopper, the data can now be reused directly from shared memory.

        + +


        + +

        Figure 5. H100 Memory Chart showing GMEM Throughput = 910.22 GB/s (Triton GEMM without TMA) for M=128, N=4096, K=4096

        + +

        By leveraging TMA through the Triton API changes we mentioned above, we can investigate the PTX that Triton generates for a single 2D tile load with TMA.

        + +

        PTX for Loading Tile (cp.async.bulk.tensor) - H100 using TMA

        + +
        bar.sync 	0;
        +shr.u32 	%r5, %r4, 5;
        +shfl.sync.idx.b32	%r66, %r5, 0, 31, -1;
        +
        +elect.sync _|%p7, 0xffffffff;
        +
        +
        +add.s32 	%r24, %r65, %r67;
        +shl.b32 	%r25, %r66, 7;
        +
        +@%p8
        +cp.async.bulk.tensor.2d.shared::cluster.global.mbarrier::complete_tx::bytes [%r24], [%rd26, {%r25,%r152}], [%r19];
        +
        + +

        The cp.async.bulk.tensor.2d.shared TMA instruction is passed the destination address in shared memory, a pointer to the tensor map, the tensor map coordinates and a pointer to the mbarrier object, respectively.

        + +


        + +

        Figure 6. H100 Memory Chart GMEM Throughput =1.45 TB/s (Triton GEMM with TMA) for M=128, N=4096, K=4096

        + +

        For optimal performance we tuned the TMA GEMM kernel extensively. Amongst other parameters such as tile sizes, number of warps and number of pipeline stages, the biggest increase in memory throughput was observed when we increased the TMA_SIZE (descriptor size) from 128 to 512. From the above NCU profiles, we can see that the final tuned kernel has increased global memory transfer throughput from 910 GB/s to 1.45 TB/s, a 59% increase in GMEM throughput, over the non-TMA Triton GEMM kernel.

        + +
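As a rough sketch of how such a tuning sweep can be scripted, the snippet below brute-forces the knobs mentioned above and times each configuration with triton.testing.do_bench. The tma_gemm wrapper and the exact parameter grid are hypothetical placeholders, not the tuning harness we actually used.

import itertools
from triton.testing import do_bench

def sweep(tma_gemm, a, b):
    # Search over tile sizes, warps, pipeline stages, and descriptor size;
    # return the fastest configuration found.
    best = None
    for block_m, block_n, num_warps, num_stages, tma_size in itertools.product(
        [64, 128], [64, 128, 256], [4, 8], [3, 4, 5], [128, 256, 512]
    ):
        ms = do_bench(
            lambda: tma_gemm(a, b, block_m=block_m, block_n=block_n,
                             num_warps=num_warps, num_stages=num_stages,
                             tma_size=tma_size)
        )
        if best is None or ms < best[0]:
            best = (ms, block_m, block_n, num_warps, num_stages, tma_size)
    return best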

        Comparison of CUTLASS and Triton FP8 GEMM and TMA Implementation - Kernel Architecture

        + +


        + +

        Figure 7. Triton vs CUTLASS Ping-Pong FP8 GEMM TFLOPs, M=M, N=4096, K=4096

        + +

The above chart shows the performance of a CUTLASS Ping-Pong GEMM kernel against Triton. The Ping-Pong kernel leverages TMA differently than Triton: it makes use of all of TMA’s hardware and software capabilities, while Triton currently does not. Specifically, CUTLASS supports the following TMA features, which help explain the gaps in pure GEMM performance:

        + +
1. TMA Multicast
   • Enables copy of data from GMEM to multiple SMs
2. Warp Specialization
   • Enables warp groups within a threadblock to take on different roles
3. Tensor Map (TMA Descriptor) Prefetch
   • Enables prefetching the Tensor Map object from GMEM, which allows pipelining of TMA loads
        + +

        To put the performance numbers in perspective, below we show a ‘speed-up’ chart highlighting the latency differences on a percentage basis:

        + +


        + +

        Figure 8: % Speedup of CUTLASS Ping-Pong vs Triton FP8 with TMA.

        + +

        This speedup is purely kernel throughput, not including E2E launch overhead which we will discuss below.

        + +

        TMA Descriptor movement - a key difference between Triton and CUTLASS with E2E performance implications

        + +

        As noted previously, creation of a 2D+ dimensional TMA descriptor takes place on the host and is then transferred to the device. However, this transfer process takes place very differently depending on the implementation.

        + +

        Here we showcase the differences between how Triton transfers TMA descriptors compared with CUTLASS.

        + +

Recall that TMA transfers require a special data structure, a tensor map, to be created on the CPU through the cuTensorMap API; for an FP8 GEMM kernel this means creating three descriptors, one each for A, B, and C. We see below that for both the Triton and CUTLASS kernels the same CPU procedures are invoked.

        + +


        + +

        Figure 7. Calls to cuTensorMapEncodeTiled (Both Triton and CUTLASS use this path)

        + +

However, for Triton, each descriptor is transferred in its own distinct copy kernel, which adds a significant amount of overhead and serves as a barrier to using this kernel in an end-to-end inference scenario.

        + +


        + +

        Figure 8. Three H2D Copy Kernels are launched before the kernel execution, for A, B and C

        + +

These copies are not observed in the CUTLASS implementation, due to the way TMA descriptors are passed to the kernel. We can see from the PTX below that with CUTLASS, tensor maps are passed by value to the kernel.

        + +
        .entry _ZN7cutlass13device_kernelIN49_GLOBAL__N__8bf0e19b_16_scaled_mm_c3x_cu_2bec3df915cutlass_3x_gemmIaNS_6half_tENS1_14ScaledEpilogueEN4cute5tupleIJNS5_1CILi64EEENS7_ILi128EEES9_EEENS6_IJNS7_ILi2EEENS7_ILi1EEESC_EEENS_4gemm32KernelTmaWarpSpecializedPingpongENS_8epilogue18TmaWarpSpecializedEE10GemmKernelEEEvNT_6ParamsE(
        +
        +.param .align 64 .b8 _ZN7cutlass13device_kernelIN49_GLOBAL__N__8bf0e19b_16_scaled_mm_c3x_cu_2bec3df915cutlass_3x_gemmIaNS_6half_tENS1_14ScaledEpilogueEN4cute5tupleIJNS5_1CILi64EEENS7_ILi128EEES9_EEENS6_IJNS7_ILi2EEENS7_ILi1EEESC_EEENS_4gemm32KernelTmaWarpSpecializedPingpongENS_8epilogue18TmaWarpSpecializedEE10GemmKernelEEEvNT_6ParamsE_param_0[1024]
        +
        +
        +mov.b64 	%rd110, _ZN7cutlass13device_kernelIN49_GLOBAL__N__8bf0e19b_16_scaled_mm_c3x_cu_2bec3df915cutlass_3x_gemmIaNS_10bfloat16_tENS1_14ScaledEpilogueEN4cute5tupleIJNS5_1CILi64EEES8_NS7_ILi256EEEEEENS6_IJNS7_ILi1EEESB_SB_EEENS_4gemm24KernelTmaWarpSpecializedENS_8epilogue18TmaWarpSpecializedEE10GemmKernelEEEvNT_6ParamsE_param_0;
        +
        +add.s64 	%rd70, %rd110, 704;
        +cvta.param.u64 	%rd69, %rd70;
        +
        +cp.async.bulk.tensor.2d.global.shared::cta.bulk_group [%rd69, {%r284, %r283}], [%r1880];
        +
        + +

        Figure 9. CUTLASS kernel PTX showing pass-by-value

        + +

        By directly passing the TMA Descriptor as opposed to passing a global memory pointer, the CUTLASS kernel avoids the three extra H2D copy kernels and instead these copies are included in the single device kernel launch for the GEMM.

        + +

Because of the difference in how descriptors are moved to the device, the kernel latencies, including the time to prepare the tensors to be consumed by the TMA, are drastically different. For M=1-128, N=4096, K=4096, the CUTLASS Ping-Pong kernel has an average latency of 10us, while the Triton TMA kernels complete in an average of 4ms, roughly 400x slower. This gap appears to be directly linked to the 3 independent kernel launches for TMA descriptor transfer by Triton.

        + +

CUDA graphs may be one way to reduce this, but given the overhead created by the H2D copies, the current Triton implementation is not competitive when measured end to end. A rework of how the Triton compiler manages TMA descriptors would likely resolve this gap. We therefore focused our comparison above on actual compute kernel throughput rather than E2E latency.

        + +
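For illustration, a minimal CUDA graph capture in PyTorch might look like the sketch below, assuming a fixed problem size, static input/output buffers, and that the descriptor copies are capturable; tma_gemm is a hypothetical wrapper around the Triton kernel launch and we have not validated this path end to end.

import torch

def capture_gemm(tma_gemm, a, b):
    # Warm up on a side stream, as required before CUDA graph capture.
    s = torch.cuda.Stream()
    s.wait_stream(torch.cuda.current_stream())
    with torch.cuda.stream(s):
        out = tma_gemm(a, b)
    torch.cuda.current_stream().wait_stream(s)

    # Capture the descriptor copies + GEMM launch once; replaying the graph
    # amortizes per-call launch overhead as long as a/b are updated in place.
    graph = torch.cuda.CUDAGraph()
    with torch.cuda.graph(graph):
        out = tma_gemm(a, b)
    return graph, out   # later: a.copy_(new_a); b.copy_(new_b); graph.replay()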

        Results Summary

        + +


        + +

        Figure 10. Triton FP8 TMA GEMM TFLOPs Comparison

| M   | Triton TMA | Triton Tutorial | Triton SplitK | cuBLAS FP8 | cuBLAS FP16 | CUTLASS Ping-Pong FP8 |
|-----|------------|-----------------|---------------|------------|-------------|-----------------------|
| 1   | 2.5        | 1               | 2.4           | 1.5        | 1.8         | 3.57                  |
| 2   | 5.1        | 2.5             | 4.8           | 3.1        | 3.6         | 5.9                   |
| 4   | 10.3       | 7.21            | 9.6           | 6.1        | 7.2         | 14.3                  |
| 8   | 21.0       | 16.5            | 19.2          | 12.3       | 14.4        | 28.6                  |
| 16  | 44.5       | 41.0            | 37.2          | 24.5       | 27.7        | 55.1                  |
| 32  | 89.7       | 81.2            | 72.2          | 71.6       | 56.8        | 114.4                 |
| 64  | 178.5      | 163.7           | 130.8         | 144.6      | 105.3       | 228.7                 |
| 128 | 359.7      | 225.9           | 160.1         | 244.0      | 189.2       | 377.7                 |
        + +

        Figure 11. Triton FP8 TMA GEMM TFLOPs Comparison Table

        + +

The above chart and table summarize the gains we have achieved on a single NVIDIA H100 for FP8 GEMM by leveraging the TMA hardware unit, compared to non-TMA Triton kernels and high-performance CUDA (cuBLAS) kernels. The key point to note is this kernel’s superior scaling with batch size relative to the competition. The problem sizes we benchmarked are representative of the matrix shapes found in small-to-medium batch size LLM inference. Thus, TMA GEMM kernel performance in the mid-M regime (M=32 to M=128) will be critical for those interested in leveraging this kernel for FP8 LLM deployment use cases, as the FP8 compressed data type allows larger matrices to fit in GPU memory.

        + +

        To summarize our analysis, the TMA implementation in Triton and CUTLASS differ in terms of full featureset support (multicast, prefetch etc.) and how the TMA Descriptor is passed to the GPU kernel. If this descriptor is passed in a manner that more closely matches the CUTLASS kernel (pass-by-value), the extraneous H2D copies could be avoided and thus the E2E performance would be greatly improved.

        + +

        Future Work

        + +

For future research, we plan to improve upon these results by working with the community to incorporate the CUTLASS architecture of TMA loads into Triton, as well as by investigating the Cooperative Kernel for FP8 GEMM, a modified strategy to the Ping-Pong kernel.

        + +

        In addition, once features like thread block clusters and TMA atomic operations are enabled in Triton, we may be able to get further speedups by leveraging the SplitK strategy in the TMA GEMM Kernel, as atomic operations on Hopper can be performed in Distributed Shared Memory (DSMEM) as opposed to L2 Cache. We also note the similarities of NVIDIA Hopper GPUs with other AI hardware accelerators like Google’s TPU and IBM’s AIU which are dataflow architectures. On Hopper, data can now “flow” from GMEM to a network of connected SMs due to the additions of TMA, which we discussed extensively in this blog, and DSMEM, which we plan to cover in a future post.

        + +
diff --git a/blog/how-computational-graphs-are-executed-in-pytorch/index.html b/blog/how-computational-graphs-are-executed-in-pytorch/index.html
new file mode 100644
index 000000000000..121a1d69a03b
--- /dev/null
+++ b/blog/how-computational-graphs-are-executed-in-pytorch/index.html
@@ -0,0 +1,1689 @@

How Computational Graphs are Executed in PyTorch | PyTorch

by Preferred Networks

        +

Welcome to the last entry in the series on understanding the autograd engine of PyTorch! If you haven’t read parts 1 & 2, check them out now to understand how PyTorch creates the computational graph for the backward pass!

        + +

        This post is based on PyTorch v1.11, so some highlighted parts may differ across versions.

        + +

        PyTorch autograd graph execution

        + +

        The last post showed how PyTorch constructs the graph to calculate the outputs’ derivatives w.r.t. the inputs when executing the forward pass. Now we will see how the execution of the backward pass is coordinated and done by looking at the whole process, starting from Python down to the lower C++ level internals.

        + +

        What Happens when Calling backward()/grad() from Python

        +

        Using variable.backward()

        + +

        After doing all our calculations with an input set to require the gradient, we call .backward() on the result to initiate the backward pass execution.

        + +
        >>> x = torch.tensor([0.5, 0.75], requires_grad=True)
        +>>> y = torch.exp(x).sum()
        +>>> y.backward()
        +
        + +

        Calling .backward() on a tensor results in a call to torch.autograd.backward().

        +
        # torch/_tensor.py
        +
        +def backward(self, gradient=None, retain_graph=None, create_graph=False, inputs=None):
        +    
        +    torch.autograd.backward(self, gradient, retain_graph, create_graph, inputs=inputs)
        +
        +
        +

        torch.autograd.backward() checks the arguments and calls the autograd engine in the C++ layer.

        + +
        def backward(
        +    tensors: _TensorOrTensors,
        +    grad_tensors: Optional[_TensorOrTensors] = None,
        +    retain_graph: Optional[bool] = None,
        +    create_graph: bool = False,
        +    grad_variables: Optional[_TensorOrTensors] = None,
        +    inputs: Optional[_TensorOrTensors] = None,
        +) -> None:
        +    
        +
        +    if inputs is not None and len(inputs) == 0:
        +        raise RuntimeError("'inputs' argument to backward() cannot be empty.")
        +
        +    tensors = (tensors,) if isinstance(tensors, torch.Tensor) else tuple(tensors)
        +    inputs = (inputs,) if isinstance(inputs, torch.Tensor) else \
        +        tuple(inputs) if inputs is not None else tuple()
        +
        +    grad_tensors_ = _tensor_or_tensors_to_tuple(grad_tensors, len(tensors))
        +    grad_tensors_ = _make_grads(tensors, grad_tensors_)
        +    if retain_graph is None:
        +        retain_graph = create_graph
        +
        +    Variable._execution_engine.run_backward(
        +        tensors, grad_tensors_, retain_graph, create_graph, inputs,
        +        allow_unreachable=True, accumulate_grad=True)  # allow_unreachable flag
        +
        +
        +

First, whether or not the grad_tensors argument was specified, there is a call to the _make_grads function. This checks the provided grad_tensors or specifies default values for them based on the shapes of the tensors argument. Check the first blog post for details on the default value for the grad_tensors of the backward pass. In short, this function provides the vector for the vector-Jacobian product if it was not specified initially.

        + +
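For example, for a non-scalar output the default gradient cannot be created implicitly, and the caller has to supply the vector for the vector-Jacobian product explicitly:

import torch

x = torch.tensor([0.5, 0.75], requires_grad=True)
y = torch.exp(x)                      # non-scalar output

# y.backward() alone would raise an error because the default gradient can only
# be created implicitly for scalar outputs; here we pass the vector v explicitly.
torch.autograd.backward([y], grad_tensors=[torch.ones_like(y)])
print(x.grad)                         # same result as y.sum().backward()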

        In the above code, Variable has an _execution_engine attribute that is defined in torch.autograd.variable to be of type ImperativeEngine; the C++ engine exported to python and declared in torch/csrc/autograd/python_engine.cpp. In the following sections, we explain in detail how this object executes the backward pass.

        + +

        Note that the torch.autograd.backward function has an inputs optional argument. This argument is used when we want to calculate the .grad field of only a subset of input tensors in the forward pass.

        + +
        >>> x = torch.tensor([0.5, 0.75], requires_grad=True)
        +>>> y = torch.tensor([0.1, 0.90], requires_grad=True)
        +>>> z = torch.exp(x * y).sum()
        +>>> torch.autograd.backward([z], inputs=[x])
        +>>> x.grad
        +tensor([0.1051, 1.7676])
        +>>> y.grad  # None
        +>>>
        +
        +
        +

        Using torch.autograd.grad

        + +

        An alternative to backward() is to use torch.autograd.grad(). The main difference to backward() is that grad() returns a tuple of tensors with the gradients of the outputs w.r.t. the inputs kwargs instead of storing them in the .grad field of the tensors. As you can see, the grad() code shown below is very similar to backward.

        + +
        def grad(
        +    outputs: _TensorOrTensors,
        +    inputs: _TensorOrTensors,
        +    grad_outputs: Optional[_TensorOrTensors] = None,
        +    retain_graph: Optional[bool] = None,
        +    create_graph: bool = False,
        +    only_inputs: bool = True,
        +    allow_unused: bool = False,
        +   is_grads_batched: bool = False
        +) -> Tuple[torch.Tensor, ...]:
        +   
        +    outputs = (outputs,) if isinstance(outputs, torch.Tensor) else tuple(outputs)
        +    inputs = (inputs,) if isinstance(inputs, torch.Tensor) else tuple(inputs)
        +    overridable_args = outputs + inputs
        +    if has_torch_function(overridable_args):
        +        return handle_torch_function(
        +            grad,
        +            overridable_args,
        +            outputs,
        +            inputs,
        +            grad_outputs=grad_outputs,
        +            retain_graph=retain_graph,
        +            create_graph=create_graph,
        +            only_inputs=only_inputs,
        +            allow_unused=allow_unused,
        +        )
        +
        +    grad_outputs_ = _tensor_or_tensors_to_tuple(grad_outputs, len(outputs))
        +    grad_outputs_ = _make_grads(outputs, grad_outputs_)
        +
        +    if retain_graph is None:
        +        retain_graph = create_graph
        +
        +    if is_grads_batched:
        +        # …. It will not be covered here
        +    else:
        +        return Variable._execution_engine.run_backward(
        +            outputs, grad_outputs_, retain_graph, create_graph, inputs,
        +            allow_unused, accumulate_grad=False)  # Calls into the C++ engine to run the backward pass
        +
        +
        + +
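A quick example of the difference in behavior:

>>> x = torch.tensor([0.5, 0.75], requires_grad=True)
>>> y = torch.exp(x).sum()
>>> torch.autograd.grad(y, (x,))   # gradients are returned as a tuple
(tensor([1.6487, 2.1170]),)
>>> x.grad is None                 # ...and the .grad field is left untouched
True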

        Figure 1 shows the computational graph with the backward() and grad() arguments highlighted in red and blue, respectively:

        + +

        + +

        + +

+Figure 1: Correspondence of `backward`/`grad` arguments in the graphs. +

        + +

        Going Inside the Autograd Engine

        + +

        Refreshing Concepts: Nodes and Edges

        + +

As we saw in part 2, the computational graph comprises Node and Edge objects. Please read that post if you haven’t done so yet.

        + +

        Nodes

        + +

        Node objects are defined in torch/csrc/autograd/function.h, and they provide an overload of operator() for the associated function and a list of edges to do the graph traversal. Note that Node is a base class that autograd functions inherit from and override the apply method to execute the backward function.

        +
        struct TORCH_API Node : std::enable_shared_from_this<Node> {
        + ...
        + /// Evaluates the function on the given inputs and returns the result of the
        +  /// function call.
        +  variable_list operator()(variable_list&& inputs) {
        +  ...
        +  }
        +
        +protected:
        +  /// Performs the `Node`'s actual operation.
        +  virtual variable_list apply(variable_list&& inputs) = 0;
        +  
        +  edge_list next_edges_;
        +  uint64_t topological_nr_ = 0;
        +  
        +
        +
        + +

There is an attribute called topological_nr_ in every node object. This number is used to optimize graph execution, as it allows discarding graph branches under certain conditions. The topological number is the longest distance between this node and any leaf node, and it is shown in Figure 2. Its main property is that for any pair of nodes x, y in a directed graph, topo_nr(x) < topo_nr(y) means that there is no path from x to y. This allows reducing the number of paths in the graph that need to be traversed. Check the topological_nr() method comment for further details.

        + +
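To make the definition concrete, here is a small illustrative computation in plain Python (not engine code): a leaf gets topological number 0, and every other node gets one plus the maximum over its children. The toy graph below is hypothetical.

def topological_nr(node, children):
    kids = children.get(node, [])
    if not kids:                         # leaf node (e.g. an AccumulateGrad)
        return 0
    return 1 + max(topological_nr(c, children) for c in kids)

# Toy graph: root -> {a, b}, b -> a, a -> leaf
children = {"root": ["a", "b"], "b": ["a"], "a": ["leaf"]}
print(topological_nr("a", children))     # 1
print(topological_nr("b", children))     # 2  (and indeed there is no path a -> b)
print(topological_nr("root", children))  # 3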

        + +

        + +

        +Figure 2: Example of the Topological Number calculation +

        + +

        Edges

        + +

        The Edge object links Nodes together, and its implementation is straightforward.

        + +
        struct Edge {
        +  ...
        +  /// The function this `Edge` points to.
        +  std::shared_ptr<Node> function;
        +  /// The identifier of a particular input to the function.
        +  uint32_t input_nr;
        +};
        +
        +
        + +

It only requires a function pointer to the Node and an input number, which is the index of the output from the forward function that this edge points to. When preparing the set of gradients before calling “function”, we know that what flows through this edge should be accumulated in the “input_nr”-th argument. Note that the input/output naming is flipped here: this is the input to the backward function. Edge objects are constructed using the gradient_edge method.

        + +
         Edge gradient_edge(const Variable& self) {
        +    if (const auto& gradient = self.grad_fn()) {
        +      return Edge(gradient, self.output_nr());
        +    } else {
        +      return Edge(grad_accumulator(self), 0);
        +    }
        +  }
        +
        +
        +

        Entering the C++ Realm

        + +

Once torch.autograd.backward() has been invoked, the THPEngine_run_backward routine starts the graph traversal. The following is a schema of the function body:

        +
        PyObject *THPEngine_run_backward(PyObject *self, PyObject *args, PyObject *kwargs)
        +{
        +  HANDLE_TH_ERRORS
        +  PyObject *tensors = nullptr;
        +  PyObject *grad_tensors = nullptr;
        +  unsigned char keep_graph = 0;
        +  unsigned char create_graph = 0;
        +  PyObject *inputs = nullptr;
        +  
        +  // Convert the python arguments to C++ objects
        +  const char *accepted_kwargs[] = { // NOLINT
        +      "tensors", "grad_tensors", "keep_graph", "create_graph", "inputs",
        +      "allow_unreachable", "accumulate_grad", nullptr
        +  };
        +  if (!PyArg_ParseTupleAndKeywords(args, kwargs, "OObb|Obb", (char**)accepted_kwargs,
        +        &tensors, &grad_tensors, &keep_graph, &create_graph, &inputs, &allow_unreachable, &accumulate_grad))
        +
        + // Prepare arguments
        + for(const auto i : c10::irange(num_tensors)) {
        +   // Check that the tensors require gradients
        +  }
        +
        +  std::vector<Edge> output_edges;
        +  if (inputs != nullptr) {
        +     // Prepare outputs
        +  }
        +
        +  {
        +      // Calls the actual autograd engine
        +    pybind11::gil_scoped_release no_gil;
        +    outputs = engine.execute(roots, grads, keep_graph, create_graph, accumulate_grad, output_edges);
        +  }
        +    // Clean up and finish
        +}
        +
        +
        + +

        First, we prepare the input arguments after converting the PyObject arguments to actual C++ objects. The tensors list contains the tensors from which we start the backward pass. These tensors are converted to edges using torch::autograd::impl::gradient_edge and added to a list called roots where the graph traversal starts.

        + +
         edge_list roots;
        +  roots.reserve(num_tensors);
        +  variable_list grads;
        +  grads.reserve(num_tensors);
        +  for(const auto i : c10::irange(num_tensors)) {
        +    PyObject *_tensor = PyTuple_GET_ITEM(tensors, i);
        +       const auto& variable = THPVariable_Unpack(_tensor);
        +       auto gradient_edge = torch::autograd::impl::gradient_edge(variable);
        +     roots.push_back(std::move(gradient_edge));
        +
        +    PyObject *grad = PyTuple_GET_ITEM(grad_tensors, i);
        +    if (THPVariable_Check(grad)) {
        +      const Variable& grad_var = THPVariable_Unpack(grad);
        +      grads.push_back(grad_var);
        +    } 
        +  }
        +
        +
        + +

        Now, if the inputs argument was specified in backward or we used the torch.autograd.grad api, the following code creates a list of edges to accumulate the gradients in the specified tensors at the end of the computation. The engine uses this later to optimize the execution as it doesn’t add the gradients in all the leaf nodes, just the specified ones.

        + +
          std::vector<Edge> output_edges;
        +  if (inputs != nullptr) {
        +    int num_inputs = PyTuple_GET_SIZE(inputs);
        +    output_edges.reserve(num_inputs);
        +    for (const auto i : c10::irange(num_inputs)) {
        +      PyObject *input = PyTuple_GET_ITEM(inputs, i);
        +      const auto& tensor = THPVariable_Unpack(input);
        +      const auto output_nr = tensor.output_nr();
        +      auto grad_fn = tensor.grad_fn();
        +      if (!grad_fn) {
        +        grad_fn = torch::autograd::impl::try_get_grad_accumulator(tensor);
        +      }
        +      if (accumulate_grad) {
        +        tensor.retain_grad();
        +      }
        +      if (!grad_fn) {
        +        output_edges.emplace_back(std::make_shared<Identity>(), 0);
        +      } else {
        +        output_edges.emplace_back(grad_fn, output_nr);
        +      }
        +    }
        +  }
        +
        +
        + +

        The next step is the actual graph traversal and node function execution, and finally, the cleanup and return.

        + +
          {
        +    // Calls the actual autograd engine
        +    pybind11::gil_scoped_release no_gil;
        +    auto& engine = python::PythonEngine::get_python_engine();
        +    outputs = engine.execute(roots, grads, keep_graph, create_graph, accumulate_grad, output_edges);
        +  }
        +  // Clean up and finish
        +}
        +
        +
        + +

        Starting the Real Execution

        + +

engine.execute is present in torch/csrc/autograd/engine.cpp.

        + +

        There are two differentiated steps here:

        + +

1. Analyze the graph to find the dependencies between functions
2. Create worker threads that traverse the graph

        + +

        Data Structures Used for the Execution

        + +

        GraphTask

        + +

        All the execution metadata is managed by the GraphTask class in torch/csrc/autograd/engine.h

        + +
        struct GraphTask: std::enable_shared_from_this<GraphTask> {
        +  std::atomic<uint64_t> outstanding_tasks_{0};
        +  //  … 
        +  std::unordered_map<Node*, InputBuffer> not_ready_;
        +  std::unordered_map<Node*, int> dependencies_;
        +
        +  struct ExecInfo {
        +     // …
        +  };
        +  std::unordered_map<Node*, ExecInfo> exec_info_;
        +  std::vector<Variable> captured_vars_;
        +  // …
        +  std::shared_ptr<ReadyQueue> cpu_ready_queue_;
        +};
        +
        +
        + +

Here we see a series of variables dedicated to maintaining the execution state. outstanding_tasks_ tracks the number of tasks left to be executed for the backward pass to complete. not_ready_ holds the input arguments for the Nodes that are not yet ready to be executed. dependencies_ tracks the number of predecessors that a Node has. When the count reaches 0, the Node is ready for execution; it is placed in a ready queue to be retrieved and executed later.

        + +
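As a mental model of how these counters drive execution, the following drastically simplified, single-threaded Python sketch (illustrative only, not the real multithreaded engine) counts the incoming edges of every node and only releases a node to the ready queue once all of its producers have run:

from collections import deque

def toy_execute(root, next_edges, run_node):
    # compute_dependencies-style pass: each edge arriving at a node adds one dependency.
    deps = {}
    stack, seen = [root], {root}
    while stack:
        node = stack.pop()
        for child in next_edges(node):
            deps[child] = deps.get(child, 0) + 1
            if child not in seen:
                seen.add(child)
                stack.append(child)

    # Ready queue: start from the root, release children as their counts hit zero.
    ready = deque([root])
    while ready:
        node = ready.popleft()
        run_node(node)
        for child in next_edges(node):
            deps[child] -= 1
            if deps[child] == 0:
                ready.append(child)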

exec_info_ and the associated ExecInfo struct are used only when the inputs argument is specified or when it is a call to autograd.grad(). They allow filtering out paths of the graph that are not needed, since gradients are calculated only for the variables in the inputs list.

        + +

        captured_vars_ is where the results of the graph execution are temporarily stored if we used the torch.autograd.grad() api instead of torch.autograd.backward() since grad() returns the gradients as tensors instead of just filling the .grad field of the inputs.

        + +

        NodeTask

        + +

The NodeTask struct is a basic class that holds an fn_ pointer to the node to execute and an inputs_ buffer to store the input arguments to this function. Note that the functions executed by the backward pass are the derivatives specified in the derivatives.yaml file, or the user-provided backward function when using custom functions, as described in the second blog post.

        + +

        The inputs_ buffer is also where the output gradients of the previously executed functions are aggregated, and it is defined as a std::vector<Variable> container with facilities to accumulate values at a given position.

        + +
        struct NodeTask {
        +  std::weak_ptr<GraphTask> base_;
        +  std::shared_ptr<Node> fn_;
        +  // This buffer serves as an implicit "addition" node for all of the
        +  // gradients flowing here.  Once all the dependencies are finished, we
        +  // use the contents of this buffer to run the function.
        +  InputBuffer inputs_;
        +};
        +
        +
        +

        GraphRoot

        + +

        The GraphRoot is a special function used to hold multiple input variables in a single place. The code is pretty simple as it only acts as a container of variables.

        + +
        struct TORCH_API GraphRoot : public Node {
        +  GraphRoot(edge_list functions, variable_list inputs)
        +      : Node(std::move(functions)),
        +      outputs(std::move(inputs)) {
        +    for (const auto& t : outputs) {
        +      add_input_metadata(t);
        +    }
        +  }
        +
        +  variable_list apply(variable_list&& inputs) override {
        +    return outputs;
        +  }
        +
        +
        + +

        AccumulateGrad

        + +

This function is set during graph creation in gradient_edge when the Variable object doesn’t have a grad_fn, that is, when it is a leaf node.

        + +
            if (const auto& gradient = self.grad_fn()) {
        +      // …
        +    } else {
        +      return Edge(grad_accumulator(self), 0);
        +    }
        +
        +
        + +

        The function body is defined in torch/csrc/autograd/functions/accumulate_grad.cpp and it essentially accumulates the input grads in the object’s .grad attribute.

        + +
        auto AccumulateGrad::apply(variable_list&& grads) -> variable_list {
        +  check_input_variables("AccumulateGrad", grads, 1, 0);
        +  
        +
        +  at::Tensor new_grad = callHooks(variable, std::move(grads[0]));
        +  std::lock_guard<std::mutex> lock(mutex_);
        +
        +  at::Tensor& grad = variable.mutable_grad();
        +  accumulateGrad(
        +      variable,
        +      grad,
        +      new_grad,
        +      1 + !post_hooks().empty() /* num_expected_refs */,
        +      [&grad](at::Tensor&& grad_update) { grad = std::move(grad_update); });
        +  return variable_list();
        +}
        +}} // namespace torch::autograd
        +
        +
        +
        +
        + +

accumulateGrad does several checks on the tensor format and eventually performs the variable_grad += new_grad; accumulation.

        + +
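This accumulation behavior is easy to observe from Python:

>>> x = torch.tensor([1.0], requires_grad=True)
>>> (2 * x).sum().backward()
>>> x.grad
tensor([2.])
>>> (3 * x).sum().backward()   # AccumulateGrad adds into the existing .grad
>>> x.grad
tensor([5.])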

        Preparing the graph for execution

        + +

        Now, let’s walk through Engine::execute. The first thing to do besides arguments consistency checks is to create the actual GraphTask object we described above. This object keeps all the metadata of the graph execution.

        + +
        auto Engine::execute(const edge_list& roots,
        +                     const variable_list& inputs,
        +                     bool keep_graph,
        +                     bool create_graph,
        +                     bool accumulate_grad,
        +                     const edge_list& outputs) -> variable_list {
        +
        +  validate_outputs(roots, const_cast<variable_list&>(inputs), [](const std::string& msg) {
        +    return msg;
        +  });
        +
        +  // Checks
        +
        +  auto graph_task = std::make_shared<GraphTask>(
        +      /* keep_graph */ keep_graph,
        +      /* create_graph */ create_graph,
        +      /* depth */ not_reentrant_backward_call ? 0 : total_depth + 1,
        +      /* cpu_ready_queue */ local_ready_queue);
        +
        +  // If we receive a single root, skip creating extra root node
        +  // …
        +  // Prepare graph by computing dependencies
        +  // …
        +  // Queue the root 
        +  // …
        +  // launch execution
        +  // …
        +}
        +
        +
        + +

        After creating the GraphTask, we use its associated function if we only have one root node. If we have multiple root nodes, we create a special GraphRoot object as described before.

        + +
          bool skip_dummy_node = roots.size() == 1;
        +  auto graph_root = skip_dummy_node ?
        +    roots.at(0).function :
        +    std::make_shared<GraphRoot>(roots, inputs);
        +
        +
        + +

        The next step is to fill the dependencies_ map in the GraphTask object since the engine must know when it can execute a task. The outputs here is the inputs argument passed to the torch.autograd.backward() call in Python. But here, we have reversed the names since the gradients w.r.t. the inputs of the forward pass are now the outputs of the backward pass. And from now on, there is no concept of forward/backward, but only graph traversal and execution.

        + +
          auto min_topo_nr = compute_min_topological_nr(outputs);
        +  // Now compute the dependencies for all executable functions
        +  compute_dependencies(graph_root.get(), *graph_task, min_topo_nr);
        +
        +  if (!outputs.empty()) {
        +    graph_task->init_to_execute(*graph_root, outputs, accumulate_grad, min_topo_nr);
        +  }
        +
        +
        + +

Here we preprocess the graph for the execution of the nodes. First, compute_min_topological_nr is called to obtain the minimum topological number of the tensors specified in outputs (0 if no inputs kwarg was supplied to .backward or no input for .grad). This computation prunes paths in the graph that lead to input variables for which we don’t want/need to calculate the grads.

        + +

Second is the compute_dependencies call. This function is a very simple graph traversal that starts with the root Node, and for each of the edges in node.next_edges() it increments the counter in dependencies_. Figure 3 shows the result of the dependencies calculation for the example graph. Note that the number of dependencies of any node is just the number of edges arriving at it.

Figure 3: Number of dependencies for each node

Finally, there is the init_to_execute call; this is the one that populates the GraphTask::exec_info_ map in case inputs were specified in the Python backward call. It iterates the graph again, starting from the root, and records in the exec_info_ map the intermediate nodes needed to calculate only the gradients of the given inputs.
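From the Python side, the effect of this pruning is easy to observe (a small example; retain_graph is only used so the graph can be reused for the second call):

import torch

a = torch.randn(3, requires_grad=True)
b = torch.randn(3, requires_grad=True)
out = (a * b).sum()

out.backward(inputs=[a], retain_graph=True)
print(a.grad is not None, b.grad)  # True None: b's branch was pruned

out.backward()                     # no inputs given: every node is executed
print(b.grad is not None)          # True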

        + +
          // Queue the root
        +  if (skip_dummy_node) {
        +    InputBuffer input_buffer(roots.at(0).function->num_inputs());
        +    auto input = inputs.at(0);
        +
        +
        +    input_buffer.add(roots.at(0).input_nr,
        +                      std::move(input),
        +                      input_stream,
        +                      opt_next_stream);
        +
        +    execute_with_graph_task(graph_task, graph_root, std::move(input_buffer));
        +  } else {
        +    execute_with_graph_task(graph_task, graph_root, InputBuffer(variable_list()));
        +  }
        +  // Avoid a refcount bump for the Future, since we check for refcount in
        +  // DistEngine (see TORCH_INTERNAL_ASSERT(futureGrads.use_count() == 1)
        +  // in dist_engine.cpp).
        +  auto& fut = graph_task->future_result_;
        +  fut->wait();
        +  return fut->value().toTensorVector();
        +}
        +
        +
        + +

And now, we are ready to start the actual execution by creating the InputBuffer. In case we only have one root variable, we begin by copying the value of the inputs tensor (these are the gradients passed to the Python backward call) into position 0 of the input_buffer. This is a small optimization that avoids running the RootNode for no reason. Also, if the rest of the graph is not on the CPU, we directly start on that worker, while the RootNode is always placed on the CPU ready queue. Details of the workers and ready queues are explained in the section below.

        + +

        On the other hand, if we have multiple roots, the GraphRoot object also holds the inputs, so it is enough to pass it an empty InputBuffer.

        + +

        Graph Traversal and Node Execution

        +

        Devices, Threads and Queues

        + +

        Before diving into the actual execution, we need to see how the engine is structured.

        + +

First of all, the engine is multithreaded, with one thread per device. For example, the caller thread is associated with the CPU, while additional threads are created and associated with each GPU or other device available in the system. Each thread tracks its device using thread-local storage in the worker_device variable. In addition, each thread has a queue of tasks to be executed, also located in thread-local storage: the local_ready_queue. This is where work is queued for the thread to execute in the thread_main function, which is explained later. You may wonder how the engine decides on which device a task should be executed. The InputBuffer class has a device() function that returns the first non-CPU device among all its tensors. This function is used together with Engine::ready_queue to select the queue in which to place a task.

        + +
        auto Engine::ready_queue(std::shared_ptr<ReadyQueue> cpu_ready_queue, at::Device device) -> std::shared_ptr<ReadyQueue>{
        +  if (device.type() == at::kCPU || device.type() == at::DeviceType::Meta) {
        +    return cpu_ready_queue;
        +  } else {
        +    // See Note [Allocating GPUs to autograd threads]
        +    return device_ready_queues_.at(device.index());
        +  }
        +}
        +
        +
        + +

The ReadyQueue object is defined in torch/csrc/autograd/engine.h and is a simple wrapper over std::priority_queue that allows a thread to wait for a task when the queue is empty. One interesting property of the ReadyQueue is that it increases the GraphTask::outstanding_tasks_ value, which is used to determine whether the execution has completed or not.

        + +
        auto ReadyQueue::push(NodeTask item, bool incrementOutstandingTasks) -> void {
        +  {
        +    std::lock_guard<std::mutex> lock(mutex_);
        +    if (incrementOutstandingTasks) {
        +      std::shared_ptr<GraphTask> graph_task = item.base_.lock();
        +      ++graph_task->outstanding_tasks_;
        +    }
        +    heap_.push(std::move(item));
        +  }
        +  not_empty_.notify_one();
        +}
        +
        +auto ReadyQueue::pop() -> NodeTask {
        +  std::unique_lock<std::mutex> lock(mutex_);
        +  not_empty_.wait(lock, [this]{ return !heap_.empty(); });
        +  auto task = std::move(const_cast<NodeTask&>(heap_.top())); heap_.pop();
        +  return task;
        +}
        +
        +
        + +
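As a rough mental model, the same behaviour can be sketched in Python with a heap, a mutex and a condition variable (this is only an illustration; the real implementation is the C++ shown above, and the outstanding_tasks counter on the graph task is an assumed attribute):

import heapq
import itertools
import threading

class ReadyQueueSketch:
    def __init__(self):
        self._heap = []
        self._tie = itertools.count()          # tie-breaker so heap entries stay comparable
        self._mutex = threading.Lock()
        self._not_empty = threading.Condition(self._mutex)

    def push(self, priority, task, graph_task, increment_outstanding=True):
        with self._mutex:
            if increment_outstanding:
                graph_task.outstanding_tasks += 1   # assumed counter, mirroring outstanding_tasks_
            heapq.heappush(self._heap, (priority, next(self._tie), task))
            self._not_empty.notify()

    def pop(self):
        with self._not_empty:                  # blocks until a task is available
            self._not_empty.wait_for(lambda: self._heap)
            return heapq.heappop(self._heap)[-1]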

        Reentrant Backward

        + +

A reentrant backward happens when one of the tasks in a backward pass calls backward again. It is not a very common case, but it can be used to reduce memory utilization as it could potentially avoid saving intermediate results. For more information, check this PyTorch forum post.

        + +
        class ReentrantBackward(torch.autograd.Function):
        +    @staticmethod
        +    def forward(ctx, input):
        +        return input.sum()
        +
        +    @staticmethod
        +    def backward(ctx, input):
        +        # Let's compute the backward by using autograd
        +        input = input.detach().requires_grad_()
        +        with torch.enable_grad():
        +            out = input.sum()
        +        out.backward()  # REENTRANT CALL!!
        +        return out.detach()
        +
        +
        + +

Here, we call backward() inside backward() for a user custom-defined autograd function. This situation can lead to deadlocks because the first backward needs to wait for the second one to complete. But some internal implementation details can prevent the second backward from completing, as explained in the dedicated subsection.

        +

        Thread Initialization

        + +

execute_with_graph_task is in charge of initializing the threads that take care of the computation and of placing the root node in the queue of the device that produced it.

        + +
        c10::intrusive_ptr<at::ivalue::Future> Engine::execute_with_graph_task(
        +    const std::shared_ptr<GraphTask>& graph_task,
        +    std::shared_ptr<Node> graph_root,
        +    InputBuffer&& input_buffer) {
        +
        +  initialize_device_threads_pool();
        +  // Lock mutex for GraphTask.
        +  std::unique_lock<std::mutex> lock(graph_task->mutex_);
        +
        +  auto queue = ready_queue(graph_task->cpu_ready_queue_, input_buffer.device());
        +
        +  if (worker_device == NO_DEVICE) {
        +    set_device(CPU_DEVICE);
        +    graph_task->owner_ = worker_device;
        +    queue->push(NodeTask(graph_task, std::move(graph_root), std::move(input_buffer)));
        +    lock.unlock();
        +    thread_main(graph_task);
        +    worker_device = NO_DEVICE;
        +  } else {
        +     // This deals with reentrant backwards, we will see it later.
        +  }
        +  return graph_task->future_result_;
        +}
        +
        +
        + +

First, this function initializes several threads (one per device) by calling initialize_device_threads_pool(), where several things happen: one ReadyQueue per device is created; one thread per non-CPU device is created; a thread-local worker_device variable is set to track the device associated with each thread; and thread_main is called in each of these threads, which then wait for tasks to be put in their queues.

        + +

Then it retrieves the queue in which to place the root node, based on the device that holds the tensors present in the input_buffer, using the ready_queue function. Now, the main thread (the one also executing the Python interpreter) has its worker_device set to NO_DEVICE, and it is in charge of executing functions whose tensors all live on the CPU. If worker_device is set to any other value, the graph execution has already started and .backward() was called inside a running Node, creating a reentrant backward call. This is explained later. For now, the main thread places the task in the queue and calls thread_main.

        +

        Where the Magic Happens

        + +

It’s been a long way, but finally, we are ready to traverse the graph and execute the nodes. Each of the spawned threads, as well as the main thread, calls thread_main.

        + +
        auto Engine::thread_main(const std::shared_ptr<GraphTask>& graph_task) -> void {
        +
        +  while (graph_task == nullptr || !graph_task->future_result_->completed()) {
        +    std::shared_ptr<GraphTask> local_graph_task;
        +    {
        +      NodeTask task = local_ready_queue->pop();
        +
        +      if (task.isShutdownTask_) {
        +        break;
        +      }
        +
        +      if (!(local_graph_task = task.base_.lock())) {
        +        // GraphTask for function is no longer valid, skipping further
        +        // execution.
        +        continue;
        +      }
        +
        +      if (task.fn_ && !local_graph_task->has_error_.load()) {
        +        at::ThreadLocalStateGuard tls_guard(local_graph_task->thread_locals_);
        +
        +        try {
        +          GraphTaskGuard guard(local_graph_task);
        +          NodeGuard ndguard(task.fn_);
        +          {
        +            evaluate_function(
        +                local_graph_task,
        +                task.fn_.get(),
        +                task.inputs_,
        +                local_graph_task->cpu_ready_queue_);
        +          }
        +        } catch (std::exception& e) {
        +          thread_on_exception(local_graph_task, task.fn_, e);
        +        }
        +      }
        +    }
        +
        +    // Decrement the outstanding tasks.
        +    --local_graph_task->outstanding_tasks_;
        +
        +    // Check if we've completed execution.
        +    if (local_graph_task->completed()) {
        +      local_graph_task->mark_as_completed_and_run_post_processing();
        +      auto base_owner = local_graph_task->owner_;
        +      if (worker_device != base_owner) {
        +        std::atomic_thread_fence(std::memory_order_release);
        +        ready_queue_by_index(local_graph_task->cpu_ready_queue_, base_owner)
        +            ->push(NodeTask(local_graph_task, nullptr, InputBuffer(0)));
        +      }
        +    }
        +  }
        +}
        +
        +
        + +

The code here is simple, given the local_ready_queue assigned to each thread in thread-local storage. The threads loop until there are no tasks left to execute in the graph. Note that for device-associated threads, the passed graph_task argument is nullptr, and they block in local_ready_queue->pop() until a task is pushed into their queue. After some consistency checks (whether the task is a shutdown task, and whether the graph is still valid), we get to the actual function invocation in evaluate_function.

        + +
                try {
        +          GraphTaskGuard guard(local_graph_task);
        +          NodeGuard ndguard(task.fn_);
        +          {
        +            evaluate_function(
        +                local_graph_task,
        +                task.fn_.get(),
        +                task.inputs_,
        +                local_graph_task->cpu_ready_queue_);
        +          }
        +        } catch (std::exception& e) {
        +          thread_on_exception(local_graph_task, task.fn_, e);
        +        }
        +      }
        +
        +
        + +

After calling evaluate_function, we check if the graph_task execution is complete by looking at the outstanding_tasks_ number. This number increases when a task is pushed to a queue and is decremented once a task has been executed, just before local_graph_task->completed() is checked. When the execution is done, we return the results, which are in captured_vars_ in case we called torch.autograd.grad() instead of torch.autograd.backward(), as this function returns tensors instead of storing them in the .grad attribute of the inputs. Finally, we wake up the main thread, if it is waiting, by sending a dummy task.

        + +
           // Decrement the outstanding tasks.
        +    --local_graph_task->outstanding_tasks_;
        +
        +    // Check if we've completed execution.
        +    if (local_graph_task->completed()) {
        +      local_graph_task->mark_as_completed_and_run_post_processing();
        +      auto base_owner = local_graph_task->owner_;
        +      if (worker_device != base_owner) {
        +        std::atomic_thread_fence(std::memory_order_release);
        +        ready_queue_by_index(local_graph_task->cpu_ready_queue_, base_owner)
        +            ->push(NodeTask(local_graph_task, nullptr, InputBuffer(0)));
        +      }
        +    }
        +
        +
        + +

        Calling the Function and Unlocking New Tasks

        + +

        evaluate_function serves three purposes:

        + +

Run the function; accumulate its results in the next nodes' InputBuffers; and decrease the dependency counters of the next nodes, enqueuing the tasks that reach 0 to be executed.

        + +
        void Engine::evaluate_function(
        +    std::shared_ptr<GraphTask>& graph_task,
        +    Node* func,
        +    InputBuffer& inputs,
        +    const std::shared_ptr<ReadyQueue>& cpu_ready_queue) {
        +
        +  // If exec_info_ is not empty, we have to instrument the execution
        +  auto& exec_info_ = graph_task->exec_info_;
        +  if (!exec_info_.empty()) {
        +    // Checks if the function needs to be executed 
        +    if (!fn_info.needed_) {
        +      // Skip execution if we don't need to execute the function.
        +      return;
        +    }
        +  }
        +
        +  auto outputs = call_function(graph_task, func, inputs);
        +
        +  auto& fn = *func;
        +  if (!graph_task->keep_graph_) {
        +    fn.release_variables();
        +  }
        +
        +
        + +

        Initially, we check the exec_info_ map of the GraphTask structure to determine if the current node needs to be executed. Remember that if this map is empty, all the nodes are executed because we are calculating the grads for all the inputs of the forward pass.

        + +

        After this check, the function is executed by running call_function. Its implementation is very straightforward and calls the actual derivative function and registered hooks if any.

        + +
          int num_outputs = outputs.size();
        +  if (num_outputs == 0) {
        +    // Records leaf stream (if applicable)
        +    return;
        +  }
        +
        +  if (AnomalyMode::is_enabled()) {
        +    // check for nan values in result
        +  }
        +
        +
        + +

Next, we check the outputs of the function after call_function is done. If the number of outputs is 0, there are no following nodes to be executed, so we can safely return. This is the case for the AccumulateGrad nodes associated with the leaf nodes.

        + +

        Also, the check for NaN values in the gradients is done here if requested.
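From Python, this is the check you enable with anomaly detection; for example, a gradient that evaluates to 0/0 should be caught and attributed to the offending node (the exact error message may vary across versions):

import torch

with torch.autograd.detect_anomaly():
    x = torch.zeros(1, requires_grad=True)
    y = torch.sqrt(x)      # d(sqrt)/dx is inf at x = 0
    z = (y * 0).sum()      # backward computes 0 * inf = nan
    z.backward()           # should raise: Function 'SqrtBackward0' returned nan values ...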

        +
        
        +  std::lock_guard<std::mutex> lock(graph_task->mutex_);
        +  for (const auto i : c10::irange(num_outputs)) {
        +    auto& output = outputs[i];
        +    const auto& next = fn.next_edge(i);
        +
        +    if (!next.is_valid()) continue;
        +
        +   
        +
        +
        + +

We have now executed a grad_fn that has returned one gradient for each of the inputs of the associated forward-pass function. As we saw in the previous blog post, we have an Edge object for each of these input tensors, pointing to the grad_fn of the function that produced them in the forward pass. Essentially, output[0] of a node in the backward pass corresponds to the first argument of the associated forward-pass function. Figure 4 shows how the outputs of a backward function are related to the inputs of the forward function. See that the outputs of grad_fn C are the gradients of z w.r.t. the inputs of Function C.

Figure 4: Correspondence between forward and backward functions' inputs and outputs

        We now iterate through these edges and check if the associated functions are ready to be executed.

        + +
         // Check if the next function is ready to be computed
        +    bool is_ready = false;
        +    auto& dependencies = graph_task->dependencies_;
        +    auto it = dependencies.find(next.function.get());
        +
        +    if (it == dependencies.end()) {
        +      auto name = next.function->name();
        +      throw std::runtime_error(std::string("dependency not found for ") + name);
        +    } else if (--it->second == 0) {
        +      dependencies.erase(it);
        +      is_ready = true;
        +    }
        +
        +    auto& not_ready = graph_task->not_ready_;
        +    auto not_ready_it = not_ready.find(next.function.get());
        +
        +
        + +

For this, we check the graph_task->dependencies_ map. We decrement the counter, and if it reaches 0, we mark the function pointed to by the edge as ready to be executed. Next, we prepare the input buffers of the tasks indicated by the next edges.

        + +
            if (not_ready_it == not_ready.end()) {
        +      if (!exec_info_.empty()) {
        +        // Skip functions that aren't supposed to be executed
        +      }
        +
        +      // Creates an InputBuffer and moves the output to the corresponding input position
        +      InputBuffer input_buffer(next.function->num_inputs());
        +      input_buffer.add(next.input_nr,
        +                       std::move(output),
        +                       opt_parent_stream,
        +                       opt_next_stream);
        +
        +      if (is_ready) {
        +        auto queue = ready_queue(cpu_ready_queue, input_buffer.device());
        +        queue->push(
        +            NodeTask(graph_task, next.function, std::move(input_buffer)));
        +      } else {
        +        not_ready.emplace(next.function.get(), std::move(input_buffer));
        +      }
        +
        +
        + +

Here, we look for the task in the graph_task->not_ready_ map. If it is not present, we create a new InputBuffer object and set the current output in the input_nr position of the buffer associated with the edge. If the task is ready to be executed, we enqueue it in the appropriate device ready_queue and complete the execution. However, if the task is not ready and we have seen it before, it is already present in the not_ready_ map.

        + +
            } else {
        +      // The function already has a buffer
        +      auto &input_buffer = not_ready_it->second;
        +      // Accumulates into buffer
        +      input_buffer.add(next.input_nr,
        +                       std::move(output),
        +                       opt_parent_stream,
        +                       opt_next_stream);
        +      if (is_ready) {
        +        auto queue = ready_queue(cpu_ready_queue, input_buffer.device());
        +        queue->push(NodeTask(graph_task, next.function, std::move(input_buffer)));
        +        not_ready.erase(not_ready_it);
        +      }
        +    }
        +  }
        +}
        +
        +
        + +

In this case, we accumulate the output in the existing input_buffer instead of creating a new one. Once all the tasks have been processed, the worker thread exits the loop and completes. All of this process is summarized in the animation in Figure 5. We see how a thread picks up tasks from the ready queue and decrements the next nodes' dependencies, unlocking them for execution.

Figure 5: Animation of the execution of the computational graph

        Flow with Reentrant Backward

        + +

As we saw above, the reentrant backward problem appears when the currently executed function makes a nested call to backward. When this happens, the thread running this function goes all the way down to execute_with_graph_task as in the non-reentrant case, but this is where things start to differ.

        + +
        c10::intrusive_ptr<at::ivalue::Future> Engine::execute_with_graph_task(
        +    const std::shared_ptr<GraphTask>& graph_task,
        +    std::shared_ptr<Node> graph_root,
        +    InputBuffer&& input_buffer) {
        +
        +  initialize_device_threads_pool();
        +  // Lock mutex for GraphTask.
        +  std::unique_lock<std::mutex> lock(graph_task->mutex_);
        +
        +  auto queue = ready_queue(graph_task->cpu_ready_queue_, input_buffer.device());
        +
        +  if (worker_device == NO_DEVICE) {
        +    //Regular case
        +  } else {
        +    // If worker_device is any devices (i.e. CPU, CUDA): this is a re-entrant
        +    //    backward call from that device.
        +    graph_task->owner_ = worker_device;
        +
        +    // Now that all the non-thread safe fields of the graph_task have been populated,
        +    // we can enqueue it.
        +    queue->push(NodeTask(graph_task, std::move(graph_root), std::move(input_buffer)));
        +
        +    if (current_depth >= max_recursion_depth_) {
        +      // If reached the max depth, switch to a different thread
        +      add_thread_pool_task(graph_task);
        +    } else {
        +      ++total_depth;
        +      ++current_depth;
        +      lock.unlock();
        +      thread_main(graph_task);
        +      --current_depth;
        +      --total_depth;
        +    }
        +  }
        +  return graph_task->future_result_;
        +}
        +
        +
        + +

Here, execute_with_graph_task detects this as a reentrant call and then checks the current number of nested calls. If it exceeds the limit, we create a new thread to take care of the execution of this graph; if not, we execute this reentrant call as a regular one. The limit on nested calls was originally set to avoid stack overflows caused by reentrant calls creating very large call stacks. However, the number was further reduced when sanitizer tests were added, because of the maximum number of locks a thread can hold at a given moment. This can be seen in torch/csrc/autograd/engine.h.

        + +

        When this maximum depth is exceeded, a new thread is created with the add_thread_pool_task function.

        + +
        void Engine::add_thread_pool_task(const std::weak_ptr<GraphTask>& graph_task) {
        +  std::unique_lock<std::mutex> lck(thread_pool_shared_->mutex_);
        +  // if we have pending graph_task objects to be processed, create a worker.
        +   bool create_thread = (thread_pool_shared_->num_workers_ <= thread_pool_shared_->graphtasks_queue_.size());
        +  thread_pool_shared_->graphtasks_queue_.push(graph_task);
        +
        +
        +  lck.unlock();
        +  if (create_thread) {
        +    std::thread t(&Engine::reentrant_thread_init, this);
        +    t.detach();
        +  }
        +
        +  thread_pool_shared_->work_.notify_one();
        +}
        +
        +
        +
        +
        + +

Before going in-depth, let’s look at the thread_pool_shared_ object in the Engine, which manages all the information related to the threads associated with reentrant backward calls.

        + +
          struct ThreadPoolShared {
        +    unsigned int num_workers_;
        +    std::condition_variable work_;
        +    std::mutex mutex_;
        +    std::queue<std::weak_ptr<GraphTask>> graphtasks_queue_;
        +
        +    // NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init)
        +    ThreadPoolShared() : num_workers_(0) {}
        + };
        +
        +
        +
        +
        + +

        ThreadPoolShared is a simple container holding a queue of GraphTask objects with synchronization mechanisms and the number of current workers.

        + +

        Now it is easy to understand how add_thread_pool_task creates a thread when there are graph_task objects enqueued and insufficient workers to process them.

        + +

The thread spawned by add_thread_pool_task is initialized by executing reentrant_thread_init.

        + +
        void Engine::reentrant_thread_init() {
        +  at::init_num_threads();
        +  auto tp_shared = thread_pool_shared_;
        +  while(true) {
        +    std::unique_lock<std::mutex> lk(tp_shared->mutex_);
        +    ++thread_pool_shared_->num_workers_;
        +    tp_shared->work_.wait(lk, [&tp_shared]{ return !tp_shared->graphtasks_queue_.empty();});
        +    --thread_pool_shared_->num_workers_;
        +    auto task = tp_shared->graphtasks_queue_.front();
        +    tp_shared->graphtasks_queue_.pop();
        +    lk.unlock();
        +    std::shared_ptr<GraphTask> graph_task;
        +    if (!(graph_task = task.lock())) {
        +      continue;
        +    }
        +    set_device(graph_task->owner_);
        +    // set the local_ready_queue to the ready queue on the graph_task->owner_ device
        +    local_ready_queue = ready_queue_by_index(graph_task->cpu_ready_queue_, graph_task->owner_);
        +    total_depth = graph_task->reentrant_depth_;
        +    thread_main(graph_task);
        +  }
        +}
        +
        +
        +
        +
        + +

        The code is straightforward. The newly created thread waits on the thread_pool_shared->graphtasks_queue_ for reentrant backward graphs to be available and executes them. Notice that this thread uses the task-ready queue associated with the device of the thread that started this call by accessing the graph_task->owner_ field set in the execute_with_graph_task function.

        + +

        Error Handling

        + +

Whenever an error happens in one of the worker threads, it is propagated to the thread that called backward.

        + +

To achieve this, there is a try/catch block in thread_main that catches any exception thrown during the Node function call and sets it on the associated GraphTask object.

        + +
               try {
        +          
        +          GraphTaskGuard guard(local_graph_task);
        +          NodeGuard ndguard(task.fn_);
        +          {
        +            evaluate_function(
        +               
        +          }
        +        } catch (std::exception& e) {
        +          thread_on_exception(local_graph_task, task.fn_, e);
        +        }
        +      }
        +    }
        +
        +
        + +

        thread_on_exception and the functions it calls end up setting the exception in the local_graph_task object.

        + +
        void Engine::thread_on_exception(
        +    std::shared_ptr<GraphTask> graph_task,
        +    const std::shared_ptr<Node>& fn,
        +    std::exception& e) {
        +  graph_task->set_exception(std::current_exception(), fn);
        +}
        +
        +void GraphTask::set_exception_without_signal(const std::shared_ptr<Node>& fn) {
        +  if (!has_error_.exchange(true)) {
        +    if (AnomalyMode::is_enabled() && fn) {
        +      fn->metadata()->print_stack(fn->name());
        +    }
        +  }
        +}
        +
        +void GraphTask::set_exception(
        +    std::exception_ptr eptr,
        +    const std::shared_ptr<Node>& fn) {
        +  set_exception_without_signal(fn);
        +  if (!future_completed_.exchange(true)) {
        +    // NOLINTNEXTLINE(performance-move-const-arg)
        +    future_result_->setError(std::move(eptr));
        +  }
        +}
        +
        +
        + +

set_exception sets the has_error_ flag to true and calls the setError function of the future_result_ object. This causes the error to be re-thrown in the caller thread when future_result_->value() is accessed.

        + +
         IValue value() {
        +    std::unique_lock<std::mutex> lock(mutex_);
        +    AT_ASSERT(completed());
        +    if (eptr_) {
        +      std::rethrow_exception(eptr_);
        +    }
        +    return value_;
        +  }
        +
        +
        + +
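A small end-to-end example of this propagation (the exact wrapping of the message may vary): an exception raised inside a node of the backward graph, here a custom Function's backward, resurfaces in the thread that called backward().

import torch

class Boom(torch.autograd.Function):
    @staticmethod
    def forward(ctx, x):
        return x * 2

    @staticmethod
    def backward(ctx, grad_output):
        raise RuntimeError("boom")

x = torch.randn(3, requires_grad=True)
try:
    Boom.apply(x).sum().backward()
except RuntimeError as err:
    print(err)  # re-raised in the calling thread when the future's value is accessed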

        Closing Remarks

        + +

This has been the last post of this series covering how PyTorch performs automatic differentiation. We hope you enjoyed reading it, and that you are now familiar enough with PyTorch internals to start contributing to PyTorch development!

        + +
diff --git a/blog/how-disney-improved-activity-recognition-with-multimodal-approaches-with-pytorch/index.html b/blog/how-disney-improved-activity-recognition-with-multimodal-approaches-with-pytorch/index.html new file mode 100644 index 000000000000..da79b6e740f8 --- /dev/null +++ b/blog/how-disney-improved-activity-recognition-with-multimodal-approaches-with-pytorch/index.html @@ -0,0 +1,775 @@
How Disney Improved Activity Recognition Through Multimodal Approaches with PyTorch | PyTorch

by Monica Alfaro, Albert Aparicio, Francesc Guitart, Marc Junyent, Pablo Pernias, Marcel Porta, and Miquel Àngel Farré (former Senior Technology Manager)

        +

        Introduction

        + +

Among the many things Disney Media & Entertainment Distribution (DMED) is responsible for is the management and distribution of a huge array of media assets, including news, sports, entertainment and features, episodic programs, marketing and advertising, and more.

        + +

        + +

        + +

        Our team focuses on media annotation as part of DMED Technology’s content platforms group. In our day-to-day work, we automatically analyze a variety of content that constantly challenges the efficiency of our machine learning workflow and the accuracy of our models.

        + +

        Several of our colleagues recently discussed the workflow efficiencies that we achieved by switching to an end-to-end video analysis pipeline using PyTorch, as well as how we approach animated character recognition. We invite you to read more about both in this previous post.

        + +

        While the conversion to an end-to-end PyTorch pipeline is a solution that any company might benefit from, animated character recognition was a uniquely-Disney concept and solution.

        + +

        In this article we will focus on activity recognition, which is a general challenge across industries — but with some specific opportunities when leveraged in the media production field, because we can combine audio, video, and subtitles to provide a solution.

        + +

        Experimenting with Multimodality

        + +

        Working on a multimodal problem adds more complexity to the usual training pipelines. Having multiple information modes for each example means that the multimodal pipeline has to have specific implementations to process each mode in the dataset. Usually after this processing step, the pipeline has to merge or fuse the outputs.

        + +

        Our initial experiments in multimodality were completed using the MMF framework. MMF is a modular framework for vision and language multimodal research. MMF contains reference implementations of state-of-the-art vision and language models and has also powered multiple research projects at Meta AI Research (as seen in this poster presented in PyTorch Ecosystem Day 2020). Along with the recent release of TorchMultimodal, a PyTorch library for training state-of-the-art multimodal models at scale, MMF highlights the growing interest in Multimodal understanding.

        + +

        MMF tackles this complexity with modular management of all the elements of the pipeline through a wide set of different implementations for specific modules, ranging from the processing of the modalities to the fusion of the processed information.

        + +

        In our scenario, MMF was a great entry point to experiment with multimodality. It allowed us to iterate quickly by combining audio, video and closed captioning and experiment at different levels of scale with certain multimodal models, shifting from a single GPU to TPU Pods.

        + +

        Multimodal Transformers

        + +

With a workbench based on MMF, our initial model was based on a concatenation of features from each modality, and it evolved into a pipeline that included a Transformer-based fusion module to combine the different input modes.

        + +

        Specifically, we made use of the fusion module called MMFTransformer, developed in collaboration with the Meta AI Research team. This is an implementation based on VisualBERT for which the necessary modifications were added to be able to work with text, audio and video.

        + +

Despite having decent results with the out-of-the-box MMFTransformer implementation, we were still far from our goal, and the Transformer-based models required more data than we had available.

        + +

        Searching for less data-hungry solutions

        + +

        Searching for less data-hungry solutions, our team started studying MLP-Mixer. This new architecture has been proposed by the Google Brain team and it provides an alternative to well established de facto architectures like convolutions or self-attention for computer vision tasks.

        + +

        MLP-Mixer

        + +

The core idea behind these Mixer variations consists of replacing the convolutions or self-attention mechanisms used in Transformers with multilayer perceptrons. This change in architecture favors the performance of the model in high-data regimes (especially with respect to the Transformers), while also opening some questions regarding the inductive biases hidden in the convolutions and the self-attention layers.

        + +

        Those proposals perform great in solving image classification tasks by splitting the image in chunks, flattening those chunks into 1D vectors and passing them through a sequence of Mixer Layers.

        + +

        + +

        + +

        Inspired by the advantages of Mixer based architectures, our team searched for parallelisms with the type of problems we try to solve in video classification: specifically, instead of a single image, we have a set of frames that need to be classified, along with audio and closed captioning in the shape of new modalities.

        + +

        Activity Recognition reinterpreting the MLP-Mixer

        + +

Our proposal takes the core idea of the MLP-Mixer (using multiple multilayer perceptrons on a sequence and on its transposed sequence) and extends it into a multimodal framework that allows us to process video, audio and text with the same architecture.

        + +

        For each of the modalities, we use different extractors that will provide embeddings describing the content. Given the embeddings of each modality, the MLP-Mixer architecture solves the problem of deciding which of the modalities might be the most important, while also weighing how much each modality contributes to the final labeling.

        + +

        For example, when it comes to detecting laughs, sometimes the key information is in audio or in the frames, and in some of the cases we have a strong signal in the closed caption.

        + +

We tried two ways of processing the video: handling each frame separately with a ResNet34 to get a sequence of embeddings, and using a video-specific model called R3D, pre-trained on ImageNet and Kinetics400 respectively.

        + +

        + +

        + +

        To process the audio, we use the pretrained ResNet34, and we remove the final layers to be able to extract 2D embeddings from the audio spectrograms (for 224x224 images we end up with 7x7 embeddings).

        + +

        + +

        + +

        For closed captioning, we are using a pre-trained BERT-large, with all layers frozen, except for the Embeddings & LayerNorms.

        + +

        + +

        + +

        Once we have extracted the embedding from each modality, we concatenate them into a single sequence and pass it through a set of MLP-Mixer blocks; next we use average pooling & a classification head to get predictions.
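A minimal sketch of this fusion step (the 512-dimensional features, 16 frame tokens, 32 text tokens, hidden size and number of blocks are illustrative choices, not the production configuration; the 7x7 audio grid and the 15 classes come from the setup described in this post):

import torch
import torch.nn as nn

class MixerBlock(nn.Module):
    # Standard MLP-Mixer block: a token-mixing MLP followed by a channel-mixing MLP.
    def __init__(self, num_tokens, dim, hidden=256):
        super().__init__()
        self.norm1 = nn.LayerNorm(dim)
        self.token_mlp = nn.Sequential(nn.Linear(num_tokens, hidden), nn.GELU(), nn.Linear(hidden, num_tokens))
        self.norm2 = nn.LayerNorm(dim)
        self.channel_mlp = nn.Sequential(nn.Linear(dim, hidden), nn.GELU(), nn.Linear(hidden, dim))

    def forward(self, x):                        # x: (batch, tokens, dim)
        y = self.norm1(x).transpose(1, 2)        # mix across the token (sequence) axis
        x = x + self.token_mlp(y).transpose(1, 2)
        return x + self.channel_mlp(self.norm2(x))

video = torch.randn(2, 16, 512)   # per-frame embeddings
audio = torch.randn(2, 49, 512)   # 7x7 spectrogram embeddings, flattened
text  = torch.randn(2, 32, 512)   # closed-caption token embeddings
seq = torch.cat([video, audio, text], dim=1)     # one concatenated sequence

mixer = nn.Sequential(*[MixerBlock(seq.size(1), 512) for _ in range(4)])
logits = nn.Linear(512, 15)(mixer(seq).mean(dim=1))   # average pooling + classification head
print(logits.shape)  # torch.Size([2, 15])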

        + +

        + +

        + +

        Our experiments have been performed on a custom, manually labeled dataset for activity recognition with 15 classes, which we know from experiments are hard and cannot all be predicted accurately using a single modality.

        + +

        These experiments have shown a significant increase in performance using our approach, especially in a low/mid-data regime (75K training samples).

        + +

        When it comes to using only Text and Audio, our experiments showed a 15 percent improvement in accuracy over using a classifier on top of the features extracted by state-of-the-art backbones.

        + +

Using Text, Audio and Video, we have seen a 17 percent improvement in accuracy over using Meta AI’s (formerly Facebook’s) MMF Framework, which uses a VisualBERT-like model to combine modalities using more powerful state-of-the-art backbones.

        + +

We have since extended the initial model to cover up to 55 activity classes and 45 event classes. One of the challenges we expect to tackle in the future is to include all activities and events, even those that are less frequent.

        + +

        Interpreting the MLP-Mixer mode combinations

        + +

An MLP-Mixer is a concatenation of multilayer perceptrons. This can be, very roughly, approximated as a linear operation, in the sense that, once trained, the weights are fixed and the input directly affects the output.

        + +

Once we assume that approximation, we can also assume that, for an input consisting of NxM numbers, we could find an NxM matrix that (when multiplied elementwise with the input) approximates the predictions of the MLP-Mixer for a class.

        + +

        + +

        + +

        We will call this matrix a stencil, and if we have access to it, we can find what parts of the input embeddings are responsible for a specific prediction.

        + +

        You can think of it as a punch card with holes in specific positions. Only information in those positions will pass and contribute to a specific prediction. So we can measure the intensity of the input at those positions.

        + +

        + +

        + +

        Of course, this is an oversimplification, and there won’t exist a unique stencil that perfectly represents all of the contributions of the input to a class (otherwise that would mean that the problem could be solved linearly). So this should be used for visualization purposes only, not as an accurate predictor.

        + +

        Once we have a set of stencils for each class, we can effortlessly measure input contribution without relying on any external visualization techniques.

        + +

        To find a stencil, we can start from a “random noise” stencil and optimize it to maximize the activations for a specific class by just back-propagating through the MLP-Mixer.
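A hedged sketch of that optimization loop (find_stencil, model and embeddings are hypothetical names; the model's weights are assumed frozen):

import torch

def find_stencil(model, embeddings, target_class, steps=200, lr=0.1):
    # `model` is assumed to be the trained MLP-Mixer head with frozen weights,
    # `embeddings` the NxM input it normally receives.
    stencil = torch.randn_like(embeddings, requires_grad=True)
    opt = torch.optim.Adam([stencil], lr=lr)
    for _ in range(steps):
        opt.zero_grad()
        logits = model(embeddings * stencil)        # elementwise "punch card" mask
        loss = -logits[..., target_class].mean()    # maximize the class activation
        loss.backward()
        opt.step()
    return stencil.detach()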

        + +

        + +

        + +

        By doing this we can end up with many valid stencils, and we can reduce them to a few by using K-means to cluster them into similar stencils and averaging each cluster.

        + +

        Using the Mixer to get the best of each world

        + +

        MLP-Mixer, used as an image classification model without convolutional layers, requires a lot of data, since the lack of inductive bias – one of the model’s good points overall – is a weakness when it comes to working in low data domains.

        + +

        When used as a way to combine information previously extracted by large pretrained backbones (as opposed to being used as a full end-to-end solution), they shine. The Mixer’s strength lies in finding temporal or structural coherence between different inputs. For example, in video-related tasks we could extract embeddings from the frames using a powerful, pretrained model that understands what is going on at frame level and use the mixer to make sense of it in a sequential manner.

        + +

        This way of using the Mixer allows us to work with limited amounts of data and still get better results than what was achieved with Transformers. This is because Mixers seem to be more stable during training and seem to pay attention to all the inputs, while Transformers tend to collapse and pay attention only to some modalities/parts of the sequence.

        + +

        Acknowledgements: We would like to thank the Meta AI Research and Partner Engineering teams for this collaboration.

        + +
diff --git a/blog/how-ibm-uses-pt-terratorch/index.html b/blog/how-ibm-uses-pt-terratorch/index.html new file mode 100644 index 000000000000..96f7914ea443 --- /dev/null +++ b/blog/how-ibm-uses-pt-terratorch/index.html @@ -0,0 +1,723 @@
How IBM Research Uses PyTorch and TerraTorch to Make Geospatial Computer Vision Accessible for Everyone | PyTorch

by Team PyTorch

        +

Earth Observation-based analytics are becoming essential for understanding our planet — from monitoring deforestation to tracking urban development and analyzing the impacts of climate change. However, the coding and deep learning skills required to apply AI models to satellite imagery and earth observation data have traditionally been a major barrier for many practitioners.

        + +

With IBM Research’s launch of TerraTorch 1.0, a PyTorch domain library for fine-tuning Geospatial Computer Vision Foundation Models, we make geospatial AI not only more accessible but also more practical for the wider PyTorch community. Our goal: simplify the process so that any data scientist, researcher, or enthusiast can build powerful geospatial models with ease and low GPU and data processing requirements.

        + +

        globes

        + +

The power of foundation models: even with 75-95% of the input data removed, the models do a fantastic job of reconstructing the input data, thereby learning the underlying physics of our planet in a deep, latent space.

        + +

        The Business Challenge

        + +

Our goal was to remove the technical barriers that prevent people from working with satellite imagery, weather and climate data at scale. Together with NASA, we’ve developed the Prithvi family of foundation models. The clean API PyTorch provides has made it easier to integrate the latest innovations from AI research.

        + +

        We wanted to create a framework that anyone can use to go from raw data to inference ready models in just a few steps.

        + +

        globes

        + +

        How a weather and climate foundation model created and fine-tuned on PyTorch is used for weather forecasts

        + +

        How IBM Research Used PyTorch

        + +

        We’ve built TerraTorch on top of PyTorch, leveraging its dynamic ecosystem to integrate:

        + +
          +
        • PyTorch Lightning for clean, scalable training loops
        • +
        • TorchGeo for geospatial data handling and transformations (PyTorch transforms)
        • +
        • For foundation models like the leading generative multimodal foundation model ‘Terramind’, co-developed by IBM and ESA, and the ‘Prithvi’ family, co-developed by IBM and NASA, TerraTorch has been used to fine-tune all of the downstream geospatial models for satellite imagery, weather and climate data. It includes the family of fine-tuned models that IBM has released as part of Granite. In addition, other interesting foundation models and ecosystem components like Clay, SatMAE, Satlas, DeCur and DOFA are included in TerraTorch.
        • +
        • Powerful and state-of-the-art vision transformers to experiment with modern neural network architectures
        • +
• TerraTorch-Iterate builds on top of PyTorch, Optuna, MLFlow and Ray Tune for Hyperparameter Optimization (HPO), Neural Architecture Search (NAS) and Foundation Model Benchmarking (GeoBench), where TerraTorch became the reference implementation
        • +
        + +

        flow diagram

        + +

The fine-tuning and inference process is completely described in a single YAML config file. There, the architectural building blocks of the model (backbone, neck, decoder, head) are defined. The Model Factory assembles the model using the built-in and custom registries. In addition, the Optimizer and Data Modules are created as defined in the config. Finally, everything is passed to the Lightning Trainer, which executes the task.

        + +

With PyTorch’s flexibility, we were able to prototype quickly, iterate on model architectures, and deploy pipelines for a range of geospatial applications — from flood and biomass detection to increasing the resolution of climate data, where some of our work became part of the IBM Granite Geospatial Model Family.

        + +

        flow diagram

        + +

        Architecture of the Prithvi-EO-2.0-600M foundation model which IBM Research developed together with NASA

        + +

        Solving AI Challenges with PyTorch

        + +

        PyTorch helped us to tackle three major challenges:

        + +
          +
        • Ease of experimentation: Dynamic computation graphs, automatic differentiation, full abstraction of CUDA and rich visualization tools made it simple to test different models and training strategies.
        • +
        • Scalability: With DDP, FSDP, PyTorch Lightning and TorchGeo, we could train models on large-scale datasets without worrying about infrastructure.
        • +
        • Community support: PyTorch - the de-facto standard in AI research - with its active community and excellent documentation made it easy to overcome hurdles and stay up to date with the latest advancements in AI research.
        • +
        + +

        A Word from IBM Research

        + +

        “PyTorch gave me the power to turn complex linear algebra and optimization problems into accessible, shareable solutions for the community. It feels empowering that we’re building and fine-tuning models for anyone curious about understanding our planet through AI.”

        + +

        — Romeo Kienzler, AI Research Engineer at IBM Research Zurich, Rueschlikon

        + +

        quote

        + +

        The Benefits of Using PyTorch

        + +

        Using PyTorch allowed us to:

        + +
          +
        • Build a reproducible, open-source framework for fine-tuning geospatial foundation models
        • +
        • Share our work with the community through easy-to-follow notebooks, TerraTorch configuration files, tutorials and model checkpoints on HuggingFace
        • +
        • Rapidly iterate over foundation model architectures and deploy fine-tuned models for inference, from research to real-world client products
        • +
        + +

        Learn More

        + +

        For more information about this project and to explore the code, visit:

        + + + +
diff --git a/blog/how-intel-uses-pytorch-to-empower-generative-ai-through-intel-arc-gpus/index.html b/blog/how-intel-uses-pytorch-to-empower-generative-ai-through-intel-arc-gpus/index.html new file mode 100644 index 000000000000..ff510bab940b --- /dev/null +++ b/blog/how-intel-uses-pytorch-to-empower-generative-ai-through-intel-arc-gpus/index.html @@ -0,0 +1,683 @@
How Intel Uses PyTorch to Empower Generative AI through Intel Arc GPUs | PyTorch

by Team PyTorch

        +

        Intel has long been at the forefront of technological innovation, and its recent venture into Generative AI (GenAI) solutions is no exception. With the rise of AI-powered gaming experiences, Intel sought to deliver an accessible and intuitive GenAI inferencing solution tailored for AI PCs powered by Intel’s latest GPUs. By leveraging PyTorch as the backbone for development efforts, Intel successfully launched AI Playground, an open source application that showcases advanced GenAI workloads.

        + +

        The Business Challenge

        + +

        Our goal was to deliver an accessible and intuitive GenAI inferencing solution tailored for AI PCs powered by Intel. We recognized the need to showcase the capabilities of the latest GenAI workloads on our newest line of client GPUs. To address this, we developed a starter application, AI Playground, which is open source and includes a comprehensive developer reference sample available on GitHub using PyTorch. This application seamlessly integrates image generation, image enhancement, and chatbot functionalities, using retrieval-augmented generation (RAG) features, all within a single, user-friendly installation package. This initiative not only demonstrates the functionality of these AI workloads but also serves as an educational resource for the ecosystem, guiding developers on effectively leveraging the Intel® Arc™ GPU product line for advanced AI applications. This solution leverages Intel® Arc™ Xe Cores and Xe Matrix Extensions (XMX) for accelerating inferencing.

        + +

        AI Playground

        + +

        How Intel Used PyTorch

        + +

        PyTorch is the core AI framework for AI Playground. We extensively leverage PyTorch’s eager mode, which aligns perfectly with the dynamic and iterative nature of our generative models. This approach not only enhances our development workflow but also enables us to rapidly prototype and iterate on advanced AI features. By harnessing PyTorch’s powerful capabilities, we have created a robust reference sample that showcases the potential of GenAI on Intel GPUs in one cohesive application.

        + +

        Solving AI Challenges with PyTorch

        + +

        PyTorch has been instrumental in addressing our AI challenges by providing a robust training and inference framework optimized for discrete and integrated Intel Arc GPU product lines. Choosing PyTorch over alternative frameworks or APIs was crucial. Other options would have necessitated additional custom development or one-off solutions, which could have significantly slowed our time to market and limited our feature set. With PyTorch, we leveraged its flexibility and ease of use, allowing our team to focus on innovation through experimentation, rather than infrastructure. The integration of Intel® Extension for PyTorch further enhanced performance by optimizing computational efficiency and enabling seamless scaling on Intel hardware, ensuring that our application ran faster and more efficiently.

        + +

        A Word from Intel

        + +

        With PyTorch as the backbone of our AI Playground project, we achieved rapid development cycles that significantly accelerated our time to market. This flexibility enabled us to iteratively enhance features and effectively align with the commitments of our hardware launches in 2024.

        + +

        -Bob Duffy, AI Playground Product Manager

        + +

PyTorch Case Study

        + +

        The Benefits of Using PyTorch

        + +

        The biggest benefit of using PyTorch for us is the large PyTorch ecosystem, which connects us with an active and cooperative community of developers. This collaboration has facilitated the seamless deployment of key features from existing open source projects, allowing us to integrate the latest GenAI capabilities into AI Playground. Remarkably, we accomplished this with minimal re-coding, ensuring that these advanced features are readily accessible on Intel Arc GPUs.

        + +

        Learn More

        + +

        For more information about Intel’s AI Playground and collaboration with PyTorch, visit the following links:

        + + + +
diff --git a/blog/how-to-accelerate/index.html b/blog/how-to-accelerate/index.html new file mode 100644 index 000000000000..5bc89461a5a8 --- /dev/null +++ b/blog/how-to-accelerate/index.html @@ -0,0 +1,842 @@
How to Accelerate PyTorch Geometric on Intel® CPUs | PyTorch

        by Intel

        +

        Overview

        + +

        The Intel PyTorch team has been collaborating with the PyTorch Geometric (PyG) community to provide CPU performance optimizations for Graph Neural Network (GNN) and PyG workloads. In the PyTorch 2.0 release, several critical optimizations were introduced to improve GNN training and inference performance on CPU. Developers and researchers can now take advantage of Intel’s AI/ML framework optimizations for significantly faster model training and inference, unlocking these gains for GNN workflows that use PyG directly.

        + +

        In this blog, we will perform a deep dive on how to optimize PyG performance for both training and inference while using the PyTorch 2.0 flagship torch.compile feature to speed up PyG models.

        + +

        Message Passing Paradigm

        + +

        Message passing refers to the process of nodes exchanging information with their respective neighbors by sending messages to one another. In PyG, message passing can be generalized into three steps (a short sketch follows the list):

        + +
        1. Gather: Collect edge-level information of adjacent nodes and edges.
        2. Apply: Update the collected information with user-defined functions (UDFs).
        3. Scatter: Aggregate to node-level information, e.g., via a particular reduce function such as sum, mean, or max.
        + +
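        As a rough illustration of these three steps, the sketch below defines a minimal PyG operator (the class name SumConv and the toy tensors are ours, not from the benchmarks discussed later): propagate() gathers neighbor features along edge_index, message() is the user-defined apply step, and aggr='sum' performs the scatter-style reduction.

```python
import torch
from torch_geometric.nn import MessagePassing

class SumConv(MessagePassing):
    def __init__(self):
        super().__init__(aggr='sum')          # Scatter: reduce messages per destination node

    def forward(self, x, edge_index):
        # Gather: collect features of source nodes along each edge
        return self.propagate(edge_index, x=x)

    def message(self, x_j):
        # Apply: user-defined function on the gathered neighbor features
        return x_j

x = torch.randn(4, 8)                          # 4 nodes with 8 features each
edge_index = torch.tensor([[0, 1, 2, 3],       # source nodes
                           [1, 0, 3, 2]])      # destination nodes
out = SumConv()(x, edge_index)                 # shape: [4, 8]
```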

        Figure 1: The message passing paradigm

        + +

        Figure 1: The message passing paradigm (Source: Matthias Fey)

        + +

        Message passing performance is highly related to the storage format of the adjacency matrix of the graph, which records how pairs of nodes are connected. Two common storage formats, compared in the sketch after the list, are:

        + +
        • Adjacency matrix in COO (Coordinate Format): The graph data is physically stored in a two-dimensional tensor of shape [2, num_edges], which maps each connection of source and destination nodes. The performance hotspot is scatter-reduce.
        • Adjacency matrix in CSR (Compressed Sparse Row): Similar to COO, but compressed on the row indices. This format allows for more efficient row access and faster sparse matrix-matrix multiplication (SpMM). The performance hotspot is sparse-matrix-related reduction ops.
        + +
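        The short sketch below shows the same toy adjacency matrix in both layouts using plain PyTorch sparse tensors (the tensors are illustrative; PyG manages these formats internally):

```python
import torch

edge_index = torch.tensor([[0, 0, 1, 2],   # source nodes
                           [1, 2, 2, 0]])  # destination nodes
values = torch.ones(edge_index.size(1))

# COO: a [2, num_edges] tensor of (source, destination) pairs
A_coo = torch.sparse_coo_tensor(edge_index, values, size=(3, 3))

# CSR: row indices compressed into row pointers, enabling fast row access and SpMM
A_csr = A_coo.coalesce().to_sparse_csr()
```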

        Scatter-Reduce

        + +

        The pattern of scatter-reduce is parallel in nature, which updates values of a self tensor using values from a src tensor at the entries specified by index. Ideally, parallelizing on the outer dimension would be most performant. However, direct parallelization leads to write conflicts, as different threads might try to update the same entry simultaneously.

        + +
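        As a point of reference, the snippet below expresses the scatter-reduce pattern through the standard torch.Tensor.scatter_reduce_ API (the toy shapes are ours); rows of src that share an index land in the same output row, which is exactly where the write conflicts described above come from.

```python
import torch

src = torch.randn(6, 4)                    # edge-level values ("src" tensor)
index = torch.tensor([0, 0, 1, 2, 2, 2])   # destination row for each src row
out = torch.zeros(3, 4)                    # node-level buffer ("self" tensor)

# Rows of src with the same index are reduced into the same row of out,
# which is why the optimized CPU kernel sorts the indices before reducing.
out.scatter_reduce_(0, index.unsqueeze(-1).expand_as(src), src,
                    reduce="sum", include_self=True)
```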

        Figure 2: Scatter-reduce and its optimization scheme

        + +

        Figure 2: Scatter-reduce and its optimization scheme (Source: Mingfei Ma)

        + +

        To optimize this kernel, we use sorting followed by a reduction:

        + +
        • Sorting: Sort the index tensor in ascending order with parallel radix sort, such that indices pointing to the same entry in the self tensor are handled by the same thread.
        • Reduction: Parallelize on the outer dimension of self, and do a vectorized reduction for each indexed src entry.
        + +

        For its backward path during the training process (i.e., gather), sorting is not needed because its memory access pattern will not lead to any write conflicts.

        + +

        SpMM-Reduce

        + +

        Sparse matrix-matrix reduction is a fundamental operator in GNNs, where A is a sparse adjacency matrix in CSR format, B is a dense feature matrix, and the reduction type can be sum, mean, or max.

        + +

        Figure 3: SpMM optimization scheme

        + +

        Figure 3: SpMM optimization scheme (Source: Mingfei Ma)

        + +

        The biggest challenge when optimizing this kernel is how to balance the thread payload when parallelizing along the rows of the sparse matrix A. Each row in A corresponds to a node, and its number of connections may vary vastly from one node to another, which results in thread payload imbalance. One technique to address this is to do payload scanning before thread partitioning. Beyond that, other techniques such as vectorization, unrolling, and blocking are also used to further exploit CPU performance.

        + +

        These optimizations are done via torch.sparse.mm using the reduce flags amax, amin, mean, and sum.

        + +
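        A minimal sketch of this path is shown below, assuming a PyTorch 2.x build where torch.sparse.mm accepts a reduce argument for CSR inputs on CPU (the toy graph is ours):

```python
import torch

# A 3-node toy graph stored as a CSR adjacency matrix
crow_indices = torch.tensor([0, 2, 3, 4])   # row pointers
col_indices  = torch.tensor([1, 2, 0, 1])   # neighbor (column) indices
values       = torch.ones(4)                # edge weights
A = torch.sparse_csr_tensor(crow_indices, col_indices, values, size=(3, 3))

B = torch.randn(3, 16)                      # dense node feature matrix
out = torch.sparse.mm(A, B, reduce="mean")  # fused SpMM + mean reduction
```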

        Performance Gains: Up to 4.1x Speedup

        + +

        We collected benchmark performance for both inference and training in pytorch_geometric/benchmark and in the Open Graph Benchmark (OGB) to demonstrate the performance improvement from the above-mentioned methods on Intel® Xeon® Platinum 8380 Processor.

        | Model – Dataset | Option | Speedup ratio |
        |---|---|---|
        | GCN-Reddit (inference) | 512-2-64-dense | 1.22x |
        | GCN-Reddit (inference) | 1024-3-128-dense | 1.25x |
        | GCN-Reddit (inference) | 512-2-64-sparse | 1.31x |
        | GCN-Reddit (inference) | 1024-3-128-sparse | 1.68x |
        | GraphSage-ogbn-products (inference) | 1024-3-128-dense | 1.15x |
        | GraphSage-ogbn-products (inference) | 512-2-64-sparse | 1.20x |
        | GraphSage-ogbn-products (inference) | 1024-3-128-sparse | 1.33x |
        | GraphSage-ogbn-products (inference) | full-batch-sparse | 4.07x |
        | GCN-PROTEINS (training) | 3-32 | 1.67x |
        | GCN-REDDIT-BINARY (training) | 3-32 | 1.67x |
        | GCN-Reddit (training) | 512-2-64-dense | 1.20x |
        | GCN-Reddit (training) | 1024-3-128-dense | 1.12x |

        Table 1: Performance Speedup on PyG Benchmark1

        + +

        From the benchmark results, we can see that our optimizations in PyTorch and PyG achieved 1.1x-4.1x speed-up for inference and training.

        + +

        torch.compile for PyG

        + +

        The PyTorch 2.0 flagship feature torch.compile is fully compatible with the PyG 2.3 release, bringing additional speed-up in PyG model inference/training over imperative mode, thanks to the TorchInductor C++/OpenMP backend for CPUs. In particular, a 3.0x – 5.4x performance speed-up is measured on basic GNN models with the Intel Xeon Platinum 8380 Processor on model training2.

        + +

        Figure 4: Performance Speedup with Torch Compile

        + +

        Figure 4: Performance Speedup with Torch Compile

        + +

        torch.compile can fuse the multiple stages of message passing into a single kernel, which provides a significant speedup thanks to the saved memory bandwidth. Refer to this PyTorch Geometric tutorial for additional support.

        + +
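        A rough sketch of what this looks like in practice is below; the two-layer GCN and the random tensors are illustrative placeholders, not one of the benchmarked models:

```python
import torch
import torch.nn.functional as F
from torch_geometric.nn import GCNConv

class GCN(torch.nn.Module):
    def __init__(self, in_channels, hidden_channels, out_channels):
        super().__init__()
        self.conv1 = GCNConv(in_channels, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, out_channels)

    def forward(self, x, edge_index):
        x = F.relu(self.conv1(x, edge_index))
        return self.conv2(x, edge_index)

model = GCN(128, 64, 10)
compiled_model = torch.compile(model)       # TorchInductor C++/OpenMP backend on CPU

x = torch.randn(1000, 128)
edge_index = torch.randint(0, 1000, (2, 5000))
out = compiled_model(x, edge_index)
```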

        Please note that torch.compile within PyG is in beta mode and under active development. Currently, some features do not yet work together seamlessly such as torch.compile(model, dynamic=True), but fixes are on the way from Intel.

        + +

        Conclusion & Future Work

        + +

        In this blog, we introduced the GNN performance optimizations included in PyTorch 2.0 on CPU. We are closely collaborating with the PyG community for future optimization work, which will focus on in-depth optimizations from torch.compile, sparse optimization, and distributed training.

        + +

        Acknowledgement

        + +

        The results presented in this blog are a joint effort of the Intel PyTorch team and Kumo. Special thanks to Matthias Fey (Kumo), Pearu Peterson (Quansight) and Christian Puhrsch (Meta), who spent precious time and gave substantial assistance! Together, we made one more step forward on the path of improving the PyTorch CPU ecosystem.

        + +

        References

        + + + +

        Footnotes

        + +

        Product and Performance Information

        + +

        1Platinum 8380: 1-node, 2x Intel Xeon Platinum 8380 processor with 256GB (16 slots/ 16GB/3200) total DDR4 memory, uCode 0xd000389, HT on, Turbo on, Ubuntu 20.04.5 LTS, 5.4.0-146-generic, INTEL SSDPE2KE016T8 1.5T; GCN + Reddit FP32 inference, GCN+Reddit FP32 training, GraphSAGE + ogbn-products FP32 inference, GCN-PROTEINS, GCN-REDDIT-BINARY FP32 training; Software: PyTorch 2.1.0.dev20230302+cpu, pytorch_geometric 2.3.0, torch-scatter 2.1.0, torch-sparse 0.6.16, test by Intel on 3/02/2023.

        + +

        2Platinum 8380: 1-node, 2x Intel Xeon Platinum 8380 processor with 256GB (16 slots/ 16GB/3200) total DDR4 memory, uCode 0xd000389, HT on, Turbo on, Ubuntu 20.04.5 LTS, 5.4.0-146-generic, INTEL SSDPE2KE016T8 1.5T; GCN, GraphSAGE, GIN and EdgeCNN, FP32; Software: PyTorch 2.1.0.dev20230411+cpu, pytorch_geometric 2.4.0, torch-scatter 2.1.1+pt20cpu, torch-sparse 0.6.17+pt20cpu, test by Intel on 4/11/2023.

        + +

        3Performance varies by use, configuration and other factors. Learn more at www.Intel.com/PerformanceIndex.

diff --git a/blog/how-to-train-state-of-the-art-models-using-torchvision-latest-primitives/index.html b/blog/how-to-train-state-of-the-art-models-using-torchvision-latest-primitives/index.html
new file mode 100644
index 000000000000..bb31f3074ee8
--- /dev/null
+++ b/blog/how-to-train-state-of-the-art-models-using-torchvision-latest-primitives/index.html
How to Train State-Of-The-Art Models Using TorchVision’s Latest Primitives | PyTorch

        by Vasilis Vryniotis

        + + +

        A few weeks ago, TorchVision v0.11 was released packed with numerous new primitives, models and training recipe improvements which allowed achieving state-of-the-art (SOTA) results. The project was dubbed “TorchVision with Batteries Included” and aimed to modernize our library. We wanted to enable researchers to reproduce papers and conduct research more easily by using common building blocks. Moreover, we aspired to provide the necessary tools to Applied ML practitioners to train their models on their own data using the same SOTA techniques as in research. Finally, we wanted to refresh our pre-trained weights and offer better off-the-shelf models to our users, hoping that they would build better applications.

        + +

        Though there is still much work to be done, we wanted to share with you some exciting results from the above work. We will showcase how one can use the new tools included in TorchVision to achieve state-of-the-art results on a highly competitive and well-studied architecture such as ResNet50 [1]. We will share the exact recipe used to improve our baseline by over 4.7 accuracy points to reach a final top-1 accuracy of 80.9% and share the journey for deriving the new training process. Moreover, we will show that this recipe generalizes well to other model variants and families. We hope that the above will influence future research for developing stronger generalizable training methodologies and will inspire the community to adopt and contribute to our efforts.

        + +

        The Results

        + +

        Using our new training recipe found on ResNet50, we’ve refreshed the pre-trained weights of the following models:

        | Model | Accuracy@1 | Accuracy@5 |
        |---|---|---|
        | ResNet50 | 80.858 | 95.434 |
        | ResNet101 | 81.886 | 95.780 |
        | ResNet152 | 82.284 | 96.002 |
        | ResNeXt50-32x4d | 81.198 | 95.340 |

        Note that the accuracy of all models except ResNet50 can be further improved by adjusting their training parameters slightly, but our focus was to have a single robust recipe which performs well for all.

        + +

        UPDATE: We have refreshed the majority of popular classification models of TorchVision, you can find the details on this blog post.

        + +

        There are currently two ways to use the latest weights of the model.

        + +

        Using the Multi-pretrained weight API

        + +

        We are currently working on a new prototype mechanism which will extend the model builder methods of TorchVision to support multiple weights. Along with the weights, we store useful meta-data (such as the labels, the accuracy, links to recipe etc) and the preprocessing transforms necessary for using the models. Example:

        + +
from PIL import Image
from torchvision import prototype as P
img = Image.open("test/assets/encode_jpeg/grace_hopper_517x606.jpg")

# Initialize model
weights = P.models.ResNet50_Weights.IMAGENET1K_V2
model = P.models.resnet50(weights=weights)
model.eval()

# Initialize inference transforms
preprocess = weights.transforms()

# Apply inference preprocessing transforms
batch = preprocess(img).unsqueeze(0)
prediction = model(batch).squeeze(0).softmax(0)

# Make predictions
label = prediction.argmax().item()
score = prediction[label].item()

# Use meta to get the labels
category_name = weights.meta['categories'][label]
print(f"{category_name}: {100 * score}%")
        + +

        Using the legacy API

        + +

        Those who don’t want to use a prototype API have the option of accessing the new weights via the legacy API using the following approach:

        + +
from torchvision.models import resnet

# Overwrite the URL of the previous weights
resnet.model_urls["resnet50"] = "https://download.pytorch.org/models/resnet50-11ad3fa6.pth"

# Initialize the model using the legacy API
model = resnet.resnet50(pretrained=True)

# TODO: Apply preprocessing + call the model
# ...
        + +

        The Training Recipe

        + +

        Our goal was to use the newly introduced primitives of TorchVision to derive a new strong training recipe which achieves state-of-the-art results for the vanilla ResNet50 architecture when trained from scratch on ImageNet with no additional external data. Though by using architecture specific tricks [2] one could further improve the accuracy, we’ve decided not to include them so that the recipe can be used in other architectures. Our recipe heavily focuses on simplicity and builds upon work by FAIR [3], [4], [5], [6], [7]. Our findings align with the parallel study of Wightman et al. [7], who also report major accuracy improvements by focusing on the training recipes.

        + +

        Without further ado, here are the main parameters of our recipe:

        + +
# Optimizer & LR scheme
ngpus=8,
batch_size=128,  # per GPU

epochs=600,
opt='sgd',
momentum=0.9,

lr=0.5,
lr_scheduler='cosineannealinglr',
lr_warmup_epochs=5,
lr_warmup_method='linear',
lr_warmup_decay=0.01,


# Regularization and Augmentation
weight_decay=2e-05,
norm_weight_decay=0.0,

label_smoothing=0.1,
mixup_alpha=0.2,
cutmix_alpha=1.0,
auto_augment='ta_wide',
random_erase=0.1,

ra_sampler=True,
ra_reps=4,


# EMA configuration
model_ema=True,
model_ema_steps=32,
model_ema_decay=0.99998,


# Resizing
interpolation='bilinear',
val_resize_size=232,
val_crop_size=224,
train_crop_size=176,
        + +

        Using our standard training reference script, we can train a ResNet50 using the following command:

        + +
torchrun --nproc_per_node=8 train.py --model resnet50 --batch-size 128 --lr 0.5 \
--lr-scheduler cosineannealinglr --lr-warmup-epochs 5 --lr-warmup-method linear \
--auto-augment ta_wide --epochs 600 --random-erase 0.1 --weight-decay 0.00002 \
--norm-weight-decay 0.0 --label-smoothing 0.1 --mixup-alpha 0.2 --cutmix-alpha 1.0 \
--train-crop-size 176 --model-ema --val-resize-size 232 --ra-sampler --ra-reps 4
        + +

        Methodology

        + +

        There are a few principles we kept in mind during our explorations:

        + +
        1. Training is a stochastic process and the validation metric we try to optimize is a random variable. This is due to the random weight initialization scheme employed and the existence of random effects during the training process. This means that we can’t do a single run to assess the effect of a recipe change. The standard practice is doing multiple runs (usually 3 to 5) and studying the summarization stats (such as mean, std, median, max, etc.).
        2. There is usually a significant interaction between different parameters, especially for techniques that focus on regularization and reducing overfitting. Thus changing the value of one can have effects on the optimal configurations of others. To account for that one can either adopt a greedy search approach (which often leads to suboptimal results but tractable experiments) or apply grid search (which leads to better results but is computationally expensive). In this work, we used a mixture of both.
        3. Techniques that are non-deterministic or introduce noise usually require longer training cycles to improve model performance. To keep things tractable, we initially used short training cycles (small number of epochs) to decide which paths can be eliminated early and which should be explored using longer training.
        4. There is a risk of overfitting the validation dataset [8] because of the repeated experiments. To mitigate some of the risk, we apply only training optimizations that provide significant accuracy improvements and use K-fold cross validation to verify optimizations done on the validation set. Moreover we confirm that our recipe ingredients generalize well on other models for which we didn’t optimize the hyper-parameters.
        + +

        Break down of key accuracy improvements

        + +

        As discussed in earlier blog posts, training models is not a journey of monotonically increasing accuracies and the process involves a lot of backtracking. To quantify the effect of each optimization, below we attempt to showcase an idealized linear journey of deriving the final recipe starting from the original recipe of TorchVision. We would like to clarify that this is an oversimplification of the actual path we followed and thus it should be taken with a grain of salt.

        + +

        Chart: Cumulative Accuracy Improvements for ResNet50

        + +

        In the table below, we provide a summary of the performance of stacked incremental improvements on top of Baseline. Unless denoted otherwise, we report the model with best Acc@1 out of 3 runs:

        | | Accuracy@1 | Accuracy@5 | Incremental Diff | Absolute Diff |
        |---|---|---|---|---|
        | ResNet50 Baseline | 76.130 | 92.862 | 0.000 | 0.000 |
        | + LR optimizations | 76.494 | 93.198 | 0.364 | 0.364 |
        | + TrivialAugment | 76.806 | 93.272 | 0.312 | 0.676 |
        | + Long Training | 78.606 | 94.052 | 1.800 | 2.476 |
        | + Random Erasing | 78.796 | 94.094 | 0.190 | 2.666 |
        | + Label Smoothing | 79.114 | 94.374 | 0.318 | 2.984 |
        | + Mixup | 79.232 | 94.536 | 0.118 | 3.102 |
        | + Cutmix | 79.510 | 94.642 | 0.278 | 3.380 |
        | + Weight Decay tuning | 80.036 | 94.746 | 0.526 | 3.906 |
        | + FixRes mitigations | 80.196 | 94.672 | 0.160 | 4.066 |
        | + EMA | 80.450 | 94.908 | 0.254 | 4.320 |
        | + Inference Resize tuning * | 80.674 | 95.166 | 0.224 | 4.544 |
        | + Repeated Augmentation ** | 80.858 | 95.434 | 0.184 | 4.728 |

        *The tuning of the inference size was done on top of the last model. See below for details.

        + +

        ** Community contribution done after the release of the article. See below for details.

        + +

        Baseline

        + +

        Our baseline is the previously released ResNet50 model of TorchVision. It was trained with the following recipe:

        + +
# Optimizer & LR scheme
ngpus=8,
batch_size=32,  # per GPU

epochs=90,
opt='sgd',
momentum=0.9,

lr=0.1,
lr_scheduler='steplr',
lr_step_size=30,
lr_gamma=0.1,


# Regularization
weight_decay=1e-4,


# Resizing
interpolation='bilinear',
val_resize_size=256,
val_crop_size=224,
train_crop_size=224,
        + +

        Most of the above parameters are the defaults on our training scripts. We will start building on top of this baseline by introducing optimizations until we gradually arrive at the final recipe.

        + +

        LR optimizations

        + +

        There are a few parameter updates we can apply to improve both the accuracy and the speed of our training. This can be achieved by increasing the batch size and tuning the LR. Another common method is to apply warmup and gradually increase our learning rate. This is beneficial especially when we use very high learning rates and helps with the stability of the training in the early epochs. Finally, another optimization is to apply Cosine Schedule to adjust our LR during the epochs. A big advantage of cosine is that there are no hyper-parameters to optimize, which cuts down our search space.

        + +

        Here are the additional optimizations applied on top of the baseline recipe. Note that we’ve run multiple experiments to determine the optimal configuration of the parameters:

        + +
batch_size=128,  # per GPU

lr=0.5,
lr_scheduler='cosineannealinglr',
lr_warmup_epochs=5,
lr_warmup_method='linear',
lr_warmup_decay=0.01,
        + +

        The above optimizations increase our top-1 Accuracy by 0.364 points compared to the baseline. Note that in order to combine the different LR strategies we use the newly introduced SequentialLR scheduler.

        + +
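        For readers who want to reproduce this outside the reference script, a rough sketch of combining a linear warmup with cosine annealing via SequentialLR could look like the following (the dummy model and epoch loop are placeholders):

```python
import torch

model = torch.nn.Linear(10, 10)             # placeholder model
optimizer = torch.optim.SGD(model.parameters(), lr=0.5, momentum=0.9)

warmup = torch.optim.lr_scheduler.LinearLR(optimizer, start_factor=0.01, total_iters=5)
cosine = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=595)
scheduler = torch.optim.lr_scheduler.SequentialLR(
    optimizer, schedulers=[warmup, cosine], milestones=[5])

for epoch in range(600):
    # train_one_epoch(model, optimizer, ...)
    scheduler.step()
```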

        TrivialAugment

        + +

        The original model was trained using basic augmentation transforms such as Random resized crops and horizontal flips. An easy way to improve our accuracy is to apply more complex “Automatic-Augmentation” techniques. The one that performed best for us is TrivialAugment [9], which is extremely simple and can be considered “parameter free”, which means it can help us cut down our search space further.

        + +

        Here is the update applied on top of the previous step:

        + +
auto_augment='ta_wide',
        + +

        The use of TrivialAugment increased our top-1 Accuracy by 0.312 points compared to the previous step.

        + +
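        For reference, the wide variant of TrivialAugment is available directly in TorchVision; a minimal training transform using it might look like the sketch below (the surrounding transforms are illustrative, not the exact reference-script pipeline):

```python
import torch
from torchvision import transforms

train_transform = transforms.Compose([
    transforms.RandomResizedCrop(176),
    transforms.RandomHorizontalFlip(),
    transforms.TrivialAugmentWide(),        # parameter-free automatic augmentation
    transforms.PILToTensor(),
    transforms.ConvertImageDtype(torch.float),
])
```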

        Long Training

        + +

        Longer training cycles are beneficial when our recipe contains ingredients that behave randomly. More specifically as we start adding more and more techniques that introduce noise, increasing the number of epochs becomes crucial. Note that at early stages of our exploration, we used relatively short cycles of roughly 200 epochs which was later increased to 400 as we started narrowing down most of the parameters and finally increased to 600 epochs at the final versions of the recipe.

        + +

        Below we see the update applied on top of the earlier steps:

        + +
epochs=600,
        + +

        This further increases our top-1 Accuracy by 1.8 points on top of the previous step. This is the biggest increase we will observe in this iterative process. It’s worth noting that the effect of this single optimization is overstated and somewhat misleading. Just increasing the number of epochs on top of the old baseline won’t yield such significant improvements. Nevertheless the combination of the LR optimizations with strong Augmentation strategies helps the model benefit from longer cycles. It’s also worth mentioning that the reason we introduce the lengthy training cycles so early in the process is because in the next steps we will introduce techniques that require significantly more epochs to provide good results.

        + +

        Random Erasing

        + +

        Another data augmentation technique known to help the classification accuracy is Random Erasing [10], [11]. Often paired with Automatic Augmentation methods, it usually yields additional improvements in accuracy due to its regularization effect. In our experiments we tuned only the probability of applying the method via a grid search and found that it’s beneficial to keep its probability at low levels, typically around 10%. 

        + +

        Here is the extra parameter introduced on top of the previous:

        + +
random_erase=0.1,
        + +

        Applying Random Erasing increases our Acc@1 by a further 0.190 points.

        + +

        Label Smoothing

        + +

        A good technique to reduce overfitting is to stop the model from becoming overconfident. This can be achieved by softening the ground truth using Label Smoothing [12]. There is a single parameter which controls the degree of smoothing (the higher the stronger) that we need to specify. Though optimizing it via grid search is possible, we found that values around 0.05-0.15 yield similar results, so to avoid overfitting it we used the same value as on the paper that introduced it.

        + +

        Below we can find the extra config added on this step:

        + +
label_smoothing=0.1,
        + +

        We use PyTorch’s newly introduced CrossEntropyLoss label_smoothing parameter and that increases our accuracy by an additional 0.318 points.

        + +
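        In code, this boils down to a single argument on the loss (the toy tensors below are for illustration only):

```python
import torch

criterion = torch.nn.CrossEntropyLoss(label_smoothing=0.1)
logits = torch.randn(8, 1000)               # batch of 8, 1000 ImageNet classes
targets = torch.randint(0, 1000, (8,))
loss = criterion(logits, targets)
```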

        Mixup and Cutmix

        + +

        Two data augmentation techniques often used to produce SOTA results are Mixup and Cutmix [13], [14]. They both provide strong regularization effects by softening not only the labels but also the images. In our setup we found it beneficial to apply one of them randomly with equal probability. Each is parameterized with a hyperparameter alpha, which controls the shape of the Beta distribution from which the smoothing probability is sampled. We did a very limited grid search, focusing primarily on common values proposed on the papers. 

        + +

        Below you will find the optimal values for the alpha parameters of the two techniques:

        + +
mixup_alpha=0.2,
cutmix_alpha=1.0,
        + +

        Applying mixup increases our accuracy by 0.118 points and combining it with cutmix improves it by an additional 0.278 points.

        + +
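        For intuition, a rough sketch of the core mixup idea (not TorchVision’s exact implementation) is below: a blending coefficient is sampled from a Beta(alpha, alpha) distribution and applied to both the images and their one-hot labels.

```python
import torch

alpha = 0.2
lam = torch.distributions.Beta(alpha, alpha).sample().item()

x1, x2 = torch.randn(3, 176, 176), torch.randn(3, 176, 176)   # two images
y1, y2 = torch.zeros(1000), torch.zeros(1000)                  # one-hot labels
y1[3], y2[7] = 1.0, 1.0

x_mixed = lam * x1 + (1 - lam) * x2   # blended image
y_mixed = lam * y1 + (1 - lam) * y2   # softened label
```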

        Weight Decay tuning

        + +

        Our standard recipe uses L2 regularization to reduce overfitting. The Weight Decay parameter controls the degree of the regularization (the larger the stronger) and is applied universally to all learned parameters of the model by default. In this recipe, we apply two optimizations to the standard approach. First we perform grid search to tune the parameter of weight decay and second we disable weight decay for the parameters of the normalization layers. 

        + +

        Below you can find the optimal configuration of weight decay for our recipe:

        + +
weight_decay=2e-05,
norm_weight_decay=0.0,
        + +

        The above update improves our accuracy by a further 0.526 points, providing additional experimental evidence for a known fact that tuning weight decay has significant effects on the performance of the model. Our approach for separating the Normalization parameters from the rest was inspired by ClassyVision’s approach.

        + +
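        A rough sketch of the second optimization, done with plain optimizer parameter groups rather than the exact TorchVision helper, is shown below:

```python
import torch

def split_parameters(model):
    norm_classes = (torch.nn.BatchNorm2d, torch.nn.GroupNorm, torch.nn.LayerNorm)
    norm_params, other_params = [], []
    for module in model.modules():
        for p in module.parameters(recurse=False):
            if not p.requires_grad:
                continue
            (norm_params if isinstance(module, norm_classes) else other_params).append(p)
    return norm_params, other_params

model = torch.nn.Sequential(torch.nn.Conv2d(3, 8, 3), torch.nn.BatchNorm2d(8))
norm_params, other_params = split_parameters(model)
optimizer = torch.optim.SGD(
    [{"params": other_params, "weight_decay": 2e-05},
     {"params": norm_params, "weight_decay": 0.0}],   # no decay on normalization layers
    lr=0.5, momentum=0.9)
```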

        FixRes mitigations

        + +

        An important property identified early in our experiments is the fact that the models performed significantly better if the resolution used during validation was increased from the 224x224 used for training. This effect is studied in detail in the FixRes paper [5], which proposes two mitigations: a) one could try to reduce the training resolution so that the accuracy on the validation resolution is maximized, or b) one could fine-tune the model in a two-phase training so that it adjusts to the target resolution. Since we didn’t want to introduce a 2-phase training, we went for option a). This means that we reduced the train crop size from 224 and used grid search to find the value that maximizes the validation accuracy at a resolution of 224x224.

        + +

        Below you can see the optimal value used on our recipe:

        + +
val_crop_size=224,
train_crop_size=176,
        + +

        The above optimization improved our accuracy by an additional 0.160 points and sped up our training by 10%. 

        + +

        It’s worth noting that the FixRes effect still persists, meaning that the model continues to perform better on validation when we increase the resolution. Moreover, further reducing the training crop-size actually hurts the accuracy. This intuitively makes sense because one can only reduce the resolution so much before critical details start disappearing from the picture. Finally, we should note that the above FixRes mitigation seems to benefit models with similar depth to ResNet50. Deeper variants with larger receptive fields seem to be slightly negatively affected (typically by 0.1-0.2 points). Hence we consider this part of the recipe optional. Below we visualize the performance of the best available checkpoints (with the full recipe) for models trained with 176 and 224 resolution:

        + +
        Charts: Best ResNet50 trained with 176 Resolution; Best ResNet50 trained with 224 Resolution
        + +

        Exponential Moving Average (EMA)

        + +

        EMA is a technique that allows one to push the accuracy of a model without increasing its complexity or inference time. It performs an exponential moving average on the model weights and this leads to increased accuracy and more stable models. The averaging happens every few iterations and its decay parameter was tuned via grid search. 

        + +

        Below you can see the optimal values for our recipe:

        + +
model_ema=True,
model_ema_steps=32,
model_ema_decay=0.99998,
        + +

        The use of EMA increases our accuracy by 0.254 points compared to the previous step. Note that TorchVision’s EMA implementation is built on top of PyTorch’s AveragedModel class, with the key difference being that it averages not only the model parameters but also its buffers. Moreover, we have adopted tricks from Pycls which allow us to parameterize the decay in a way that doesn’t depend on the number of epochs.

        + +
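        A rough sketch of the same idea using PyTorch’s AveragedModel directly (similar in spirit to, but not identical to, TorchVision’s helper, and ignoring buffers for brevity) could look like this:

```python
import torch
from torch.optim.swa_utils import AveragedModel

decay = 0.99998

def ema_avg(avg_param, param, num_averaged):
    # Exponential moving average of the parameters
    return decay * avg_param + (1.0 - decay) * param

model = torch.nn.Linear(10, 10)             # placeholder model
ema_model = AveragedModel(model, avg_fn=ema_avg)

# Inside the training loop, after every optimizer step (or every model_ema_steps steps):
ema_model.update_parameters(model)
```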

        Inference Resize tuning

        + +

        Unlike all other steps of the process which involved training models with different parameters, this optimization was done on top of the final model. During inference, the image is resized to a specific resolution and then a central 224x224 crop is taken from it. The original recipe used a resize size of 256, which caused a similar discrepancy as the one described in the FixRes paper [5]. By bringing this resize value closer to the target inference resolution, one can improve the accuracy. To select the value we ran a short grid search on the interval [224, 256] with a step of 8. To avoid overfitting, the value was selected using half of the validation set and confirmed using the other half.

        + +

        Below you can see the optimal value used on our recipe:

        + +
val_resize_size=232,
        + +

        The above optimization improved our accuracy by 0.224 points. It’s worth noting that the optimal value for ResNet50 also works best for ResNet101, ResNet152 and ResNeXt50, which hints that it generalizes across models:

        + +
        Charts: ResNet50 Inference Resize; ResNet101 Inference Resize; Best ResNet50 trained with 224 Resolution
        + +

        [UPDATE] Repeated Augmentation

        + +

        Repeated Augmentation [15], [16] is another technique which can improve the overall accuracy and has been used by other strong recipes such as those at [6], [7]. Tal Ben-Nun, a community contributor, has further improved upon our original recipe by proposing training the model with 4 repetitions. His contribution came after the release of this article.

        + +

        Below you can see the optimal value used on our recipe:

        + +
ra_sampler=True,
ra_reps=4,
        + +

        The above is the final optimization which improved our accuracy by 0.184 points. 

        + +

        Optimizations that were tested but not adopted

        + +

        During the early stages of our research, we experimented with additional techniques, configurations and optimizations. Since our target was to keep our recipe as simple as possible, we decided not to include anything that didn’t provide a significant improvement. Here are a few approaches that we took but didn’t make it to our final recipe:

        + +
        • Optimizers: Using more complex optimizers such as Adam, RMSProp or SGD with Nesterov momentum didn’t provide significantly better results than vanilla SGD with momentum.
        • LR Schedulers: We tried different LR Scheduler schemes such as StepLR and Exponential. Though the latter tends to work better with EMA, it often requires additional hyper-parameters such as defining the minimum LR to work well. Instead, we just use cosine annealing decaying the LR up to zero and choose the checkpoint with the highest accuracy.
        • Automatic Augmentations: We’ve tried different augmentation strategies such as AutoAugment and RandAugment. None of these outperformed the simpler parameter-free TrivialAugment.
        • Interpolation: Using bicubic or nearest interpolation didn’t provide significantly better results than bilinear.
        • Normalization layers: Using Sync Batch Norm didn’t yield significantly better results than using the regular Batch Norm.
        + +

        Acknowledgements

        + +

        We would like to thank Piotr Dollar, Mannat Singh and Hugo Touvron for providing their insights and feedback during the development of the recipe and for their previous research work on which our recipe is based. Their support was invaluable for achieving the above result. Moreover, we would like to thank Prabhat Roy, Kai Zhang, Yiwen Song, Joel Schlosser, Ilqar Ramazanli, Francisco Massa, Mannat Singh, Xiaoliang Dai, Samuel Gabriel, Allen Goodman and Tal Ben-Nun for their contributions to the Batteries Included project.

        + +

        References

        + +
        1. Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun. “Deep Residual Learning for Image Recognition”
        2. Tong He, Zhi Zhang, Hang Zhang, Zhongyue Zhang, Junyuan Xie, Mu Li. “Bag of Tricks for Image Classification with Convolutional Neural Networks”
        3. Piotr Dollár, Mannat Singh, Ross Girshick. “Fast and Accurate Model Scaling”
        4. Tete Xiao, Mannat Singh, Eric Mintun, Trevor Darrell, Piotr Dollár, Ross Girshick. “Early Convolutions Help Transformers See Better”
        5. Hugo Touvron, Andrea Vedaldi, Matthijs Douze, Hervé Jégou. “Fixing the train-test resolution discrepancy”
        6. Hugo Touvron, Matthieu Cord, Matthijs Douze, Francisco Massa, Alexandre Sablayrolles, Hervé Jégou. “Training data-efficient image transformers & distillation through attention”
        7. Ross Wightman, Hugo Touvron, Hervé Jégou. “ResNet strikes back: An improved training procedure in timm”
        8. Benjamin Recht, Rebecca Roelofs, Ludwig Schmidt, Vaishaal Shankar. “Do ImageNet Classifiers Generalize to ImageNet?”
        9. Samuel G. Müller, Frank Hutter. “TrivialAugment: Tuning-free Yet State-of-the-Art Data Augmentation”
        10. Zhun Zhong, Liang Zheng, Guoliang Kang, Shaozi Li, Yi Yang. “Random Erasing Data Augmentation”
        11. Terrance DeVries, Graham W. Taylor. “Improved Regularization of Convolutional Neural Networks with Cutout”
        12. Christian Szegedy, Vincent Vanhoucke, Sergey Ioffe, Jon Shlens, Zbigniew Wojna. “Rethinking the Inception Architecture for Computer Vision”
        13. Hongyi Zhang, Moustapha Cisse, Yann N. Dauphin, David Lopez-Paz. “mixup: Beyond Empirical Risk Minimization”
        14. Sangdoo Yun, Dongyoon Han, Seong Joon Oh, Sanghyuk Chun, Junsuk Choe, Youngjoon Yoo. “CutMix: Regularization Strategy to Train Strong Classifiers with Localizable Features”
        15. Elad Hoffer, Tal Ben-Nun, Itay Hubara, Niv Giladi, Torsten Hoefler, Daniel Soudry. “Augment your batch: better training with larger batches”
        16. Maxim Berman, Hervé Jégou, Andrea Vedaldi, Iasonas Kokkinos, Matthijs Douze. “Multigrain: a unified image embedding for classes and instances”
        + +
diff --git a/blog/huawei-joins-pytorch/index.html b/blog/huawei-joins-pytorch/index.html
new file mode 100644
index 000000000000..ef670f2ca36a
--- /dev/null
+++ b/blog/huawei-joins-pytorch/index.html
Huawei Joins the PyTorch Foundation as a Premier Member | PyTorch

        by Team PyTorch

        +

        Today, the PyTorch Foundation, a neutral home for the deep learning community to collaborate on the open source PyTorch framework and ecosystem, announced that Huawei has joined as a premier member.

        + +

        Huawei has been a long-standing supporter of and contributor to the PyTorch ecosystem and, through its progressive releases of diverse computing support, provides easier access to the PyTorch ecosystem for more hardware vendors. By joining as a premier member, Huawei will continue to optimize PyTorch to fully unleash Ascend computing capabilities.

        + +

        “We are delighted to join the PyTorch Foundation, and hope to further collaborate with other member companies and expand the community to a wider audience,” said Zhang Dixuan, President of Huawei Ascend Computing Business. “This move benefits Huawei, PyTorch, and the wider AI ecosystem alike. It also aligns with our long-held beliefs in openness, innovation, collaboration, and shared success, and we are confident that it will spur new innovations in the global AI community.”

        + +

        Huawei unveiled its All Intelligence strategy to accelerate intelligence across all industries. To meet the demand for AI computing, Huawei invests in system-level technologies, guided by a belief in open hardware and software that enables partners and fosters talent. This strategy aligns with the PyTorch Foundation’s mission to develop AI as part of a sustainable open source ecosystem and produce inclusive technological feats.

        + +

        PyTorch Foundation Executive Director Ibrahim Haddad said, “We are delighted to welcome Huawei to the PyTorch Foundation. Huawei is a leading body in researching computer vision, natural language processing, speech recognition, and other emerging areas, and has proven experience in the field of foundation models. We have no doubt that we will benefit from their support and guidance.”

        + +

        As a premier member, Huawei is granted one seat to the PyTorch Foundation Governing Board, and will help set policies, bylaws, and mission and vision statements that define the overarching scope of the PyTorch Foundation’s initiatives, technical vision, and direction.

        + +

        The Board welcomes Huawei representative Fred Li, Head of Computing Open Source Development Team at Huawei. Fred leads an active and creative team in R&D and operations projects under the principle of “upstream first”, which aims to make diverse computing power ubiquitous.

        + +

        To learn more about how you can be a part of the PyTorch Foundation, visit our website.

        + +

        About Huawei

        + +

        Founded in 1987, Huawei is a leading global provider of information and communications technology (ICT) infrastructure and smart devices. We have 207,000 employees and operate in over 170 countries and regions, serving more than three billion people around the world. We are committed to bringing digital to every person, home and organization for a fully connected, intelligent world.

        + +

        About PyTorch Foundation

        + +

        The PyTorch Foundation is a neutral home for the deep learning community to collaborate on the open source PyTorch framework and ecosystem. The PyTorch Foundation is supported by its members and leading contributors to the PyTorch open source project. The Foundation leverages resources provided by members and contributors to enable community discussions and collaboration.

        + +

        About The Linux Foundation

        + +

        The Linux Foundation is the world’s leading home for collaboration on open source software, hardware, standards, and data. Linux Foundation projects are critical to the world’s infrastructure including Linux, Kubernetes, Node.js, ONAP, PyTorch, RISC-V, SPDX, OpenChain, and more. The Linux Foundation focuses on leveraging best practices and addressing the needs of contributors, users, and solution providers to create sustainable models for open collaboration. For more information, please visit us at linuxfoundation.org. The Linux Foundation has registered trademarks and uses trademarks. For a list of trademarks of The Linux Foundation, please see its trademark usage page. Linux is a registered trademark of Linus Torvalds.

        + +
        + +

        Huawei Becomes a Premier Member of the PyTorch Foundation

        The PyTorch Foundation, a neutral home for the deep learning community to collaborate on the open source PyTorch framework and ecosystem, today announced that Huawei has joined as a premier member.

        Huawei has long been a supporter of and contributor to the PyTorch ecosystem. By advancing support for and improvements to diverse computing power, it helps more vendor backends connect to the PyTorch ecosystem with ease, and it is actively working on PyTorch optimizations to fully unleash the computing power of Ascend.

        “By joining the PyTorch Foundation, we can further collaborate with other member companies and accelerate the growth of the PyTorch community,” said Zhang Dixuan, President of Huawei Ascend Computing Business. “We believe this is mutually beneficial for Huawei and the PyTorch ecosystem, and it is consistent with our long-held open-source philosophy of open innovation and collaborative, shared success, bringing more excitement and innovation to the global AI community.”

        Huawei has announced its All Intelligence strategy to accelerate the intelligent transformation of every industry. Through continuous system-level innovation, it remains committed to open hardware, open-source software, enabling partners, and developing talent, in order to meet the diverse AI computing needs of all industries. This fits well with, and complements, the PyTorch Foundation’s mission of advancing AI by nurturing and sustaining an open source ecosystem and making these technological innovations accessible to everyone.

        “Huawei has conducted extensive research in computer vision, natural language processing, speech recognition, and other areas, and has accumulated mature research experience in foundation models. We believe the PyTorch Foundation will benefit greatly from their support of our members and ecosystem,” said PyTorch Foundation Executive Director Ibrahim Haddad.

        As a premier member, Huawei receives one seat on the PyTorch Foundation Governing Board. The Board sets policy through our bylaws, mission and vision statements, describing the overarching scope of the foundation’s initiatives, technical vision, and direction.

        We are pleased to welcome Fred Li (Li Yongle), Head of Huawei’s Computing Open Source Development Team, to our board. Fred is responsible for the open source business of Huawei’s computing product line and leads a highly innovative and energetic technology and operations team that follows the “upstream first” principle to make diverse computing power ubiquitous.

        To learn more about how you can be a part of the PyTorch Foundation, visit our website.

        About Huawei

        Founded in 1987, Huawei is a leading global provider of ICT (information and communications technology) infrastructure and smart devices. Our 207,000 employees operate in more than 170 countries and regions, serving more than three billion people around the world. We are committed to bringing digital to every person, home and organization for a fully connected, intelligent world.

        About the PyTorch Foundation

        The PyTorch Foundation is a neutral home for the deep learning community to collaborate on the open source PyTorch framework and ecosystem. The PyTorch Foundation is supported by its members and leading contributors to the PyTorch open source project. The Foundation leverages resources provided by members and contributors to enable community discussions and collaboration.

        About The Linux Foundation

        The Linux Foundation is the world’s leading home for collaboration on open source software, hardware, standards, and data. Linux Foundation projects are critical to the world’s infrastructure, including Linux, Kubernetes, Node.js, ONAP, PyTorch, RISC-V, SPDX, OpenChain, and more. The Linux Foundation focuses on leveraging best practices and addressing the needs of contributors, users, and solution providers to create sustainable models for open collaboration. For more information, please visit linuxfoundation.org. The Linux Foundation has registered trademarks and uses trademarks. For a list of trademarks of The Linux Foundation, please see its trademark usage page: www.linuxfoundation.org/trademark-usage. Linux is a registered trademark of Linus Torvalds.

        + +
diff --git a/blog/hugging-face-joins/index.html b/blog/hugging-face-joins/index.html
new file mode 100644
index 000000000000..5a3a676ac66d
--- /dev/null
+++ b/blog/hugging-face-joins/index.html
Hugging Face Joins the PyTorch Foundation as a Premier Member | PyTorch

        by Team PyTorch

        +

        Smiling hugging face

        + +

        The PyTorch Foundation, a neutral home for the deep learning community to collaborate on the open source PyTorch framework and ecosystem, is announcing today that Hugging Face has joined as a premier member.

        + +

        Hugging Face has been a long-time supporter of and contributor to the PyTorch ecosystem, providing powerful models and resources that accelerate research, development, and adoption of AI technologies, particularly in the field of natural language processing.

        + +

        “Our mission has always been to democratize AI and make it accessible to everyone. We’re truly aligned with PyTorch’s objective of reducing the barrier of entry to practitioners. By joining the PyTorch Foundation, we can further amplify that impact and support this very important framework of the ecosystem that is PyTorch,” said Lysandre Debut, Head of Open Source at Hugging Face. “We believe the two ecosystems have significant overlap, and collaborating with the foundation will allow us to bridge the gap to provide the best software, the best tools to the machine learning community at large.”

        + +

        Hugging Face’s Model Hub and open source libraries promote collaboration and knowledge sharing within the AI open source community, making Hugging Face a great match to the growing PyTorch Foundation. They continue to drive industry adoption and collaboration by creating user-friendly tools and resources and providing accessible and well-documented libraries.

        + +

        “Hugging Face’s commitment to open source development and their exceptional contributions to the PyTorch ecosystem have truly impressed us. With their help, we will drive innovation, foster collaboration, and empower the global AI community to create transformative solutions for the AI community,” said PyTorch Foundation Executive Director Ibrahim Haddad. “We welcome Hugging Face to the PyTorch Foundation and look forward to the achievements that lie ahead.”

        + +

        As a premier member, Hugging Face is granted one seat to the PyTorch Foundation Governing Board. The Board sets policy through our bylaws, mission and vision statements, describing the overarching scope of foundation initiatives, technical vision, and direction.

        + +

        Lysandre Debut

        + +

        We’re happy to welcome Lysandre Debut, Head of Open Source at Hugging Face to our board. Lysandre has been at Hugging Face since the company’s pivot to open-source, and was the first engineer to focus entirely on the open-source mission. Now leading the open-source part of the organization, Lysandre remains technically involved by being a core maintainer of the Transformers library.

        + +

        To learn more about how you can be a part of the PyTorch Foundation, visit our website.

        + +

        About Hugging Face

        + +

        Hugging Face is a community and company dedicated to lowering the barrier of entry to Machine Learning and Deep Learning. Strong advocates for open-source and open-science, their model Hub hosts more than 250,000 public models and 50,000 public datasets that are very simple to use. Transformers, Diffusers, PEFT, Accelerate, and Datasets are some of the open-source tools made available by Hugging Face.

        + +

        About PyTorch Foundation

        + +

        The PyTorch Foundation is a neutral home for the deep learning community to collaborate on the open source PyTorch framework and ecosystem. The PyTorch Foundation is supported by its members and leading contributors to the PyTorch open source project. The Foundation leverages resources provided by members and contributors to enable community discussions and collaboration.

        + +

        About The Linux Foundation

        + +

        The Linux Foundation is the world’s leading home for collaboration on open source software, hardware, standards, and data. Linux Foundation projects are critical to the world’s infrastructure including Linux, Kubernetes, Node.js, ONAP, PyTorch, RISC-V, SPDX, OpenChain, and more. The Linux Foundation focuses on leveraging best practices and addressing the needs of contributors, users, and solution providers to create sustainable models for open collaboration. For more information, please visit us at linuxfoundation.org. The Linux Foundation has registered trademarks and uses trademarks. For a list of trademarks of The Linux Foundation, please see its trademark usage page: www.linuxfoundation.org/trademark-usage. Linux is a registered trademark of Linus Torvalds.

        + +
diff --git a/blog/ibm-joins-pytorch/index.html b/blog/ibm-joins-pytorch/index.html
new file mode 100644
index 000000000000..0f1163aa8c0c
--- /dev/null
+++ b/blog/ibm-joins-pytorch/index.html
IBM Joins the PyTorch Foundation as a Premier Member | PyTorch

        by Team PyTorch

        +

        The PyTorch Foundation, part of The Linux Foundation, is pleased to announce that IBM has joined as a premier member.

        + +

        IBM Logo

        + +

        The foundation serves as a neutral space for the deep learning community to collaborate on the open source PyTorch framework and ecosystem. With its extensive industry expertise and leadership in open source and AI, IBM is committed to actively contributing to the PyTorch community.

        + +

        IBM offers a comprehensive portfolio of enterprise AI solutions and recently released watsonx, its next-generation data and AI platform. IBM’s watsonx platform leverages PyTorch to offer an enterprise-grade software stack for end-to-end training and fine-tuning of AI foundation models.

        + +

        “By joining the PyTorch Foundation, we aim to contribute our expertise and resources to further advance PyTorch’s capabilities and make AI more accessible in hybrid cloud environments with flexible hardware options,” said Priya Nagpurkar, Vice President, Hybrid Cloud Platform and Developer Productivity, IBM Research. “We intend for our collaboration with PyTorch to bring the power of foundation models and generative AI to enterprises using the watsonx platform to drive business transformation.”

        + +

        IBM and PyTorch have already collaborated on two projects. The first enables foundation models with billions of parameters to train efficiently on standard cloud networking infrastructure, such as Ethernet networking. Together, IBM and PyTorch have also worked on ways to make checkpointing for AI training considerably more cost-effective, by fixing the distributed checkpointing within PyTorch to support certain types of object storage.

        + +

        “We’re happy to welcome IBM as a premier member. IBM’s expertise and dedication to advancing the field of artificial intelligence align perfectly with the mission of the PyTorch community,” said PyTorch Foundation Executive Director Ibrahim Haddad. “Their commitment to open collaboration and innovation will strengthen our collective efforts to empower developers and researchers worldwide.”

        + +

        As a premier member, IBM is granted one seat to the PyTorch Foundation Governing Board. The Board sets policy through our bylaws, mission and vision statements, describing the overarching scope of foundation initiatives, technical vision, and direction.

        + +

        Raghu Ganti Headshot

        + +

        We’re happy to welcome Raghu Ganti, Principal Research Scientist at IBM Research, to our board. Raghu co-leads IBM Research’s foundation model training and validation platform, built on Red Hat OpenShift. His team primarily contributes to the PyTorch training components, with the mission of democratizing training and validation of foundation models.

        + +

        To learn more about how you can be a part of the PyTorch Foundation, visit our website.

        + +
        +
        +
        +
        + + +
        +
        +
        +
        +

        +
        +
        +
        + +
        + +
        + +
        +
        +
        +
        + + +
        +
        +
        + + +
diff --git a/blog/improve-rag-performance/index.html b/blog/improve-rag-performance/index.html
new file mode 100644
index 000000000000..062d6fe9197f
--- /dev/null
+++ b/blog/improve-rag-performance/index.html
@@ -0,0 +1,1035 @@
Improve RAG performance with torch.compile on AWS Graviton Processors | PyTorch
        +
        +
        +
        + +
        +
        + + +
        + + + + + + + + +
        + +
        +
        + + + +
        +
        +
        + +
        +

        + by + + Sunita Nadampalli(AWS), Ankith Gunapal(Meta), Hamid Shojanazeri(Meta) + +

        +

Large Language Models (LLMs) are trained on vast volumes of data and use billions of parameters to support tasks like answering questions, translating languages, and completing sentences. There are a few challenges when working with LLMs, such as domain knowledge gaps, factuality issues, and hallucinations, which affect their reliability, especially in fields that require high levels of accuracy, such as healthcare, law, or engineering. Retrieval Augmented Generation (RAG) provides a solution to mitigate some of these issues by augmenting LLMs with a specific domain or an organization’s internal knowledge base, without the need to retrain the model.

        + +

The RAG knowledge source is typically a business-specific database, which is usually deployed on general-purpose CPU infrastructure. So, deploying RAG on that same general-purpose CPU infrastructure alongside related business services is both efficient and cost-effective. With this motivation, we evaluated RAG deployment on AWS Graviton-based Amazon EC2 instances, which have delivered up to a 40% price-performance advantage over comparable instances for the majority of workloads, including databases, in-memory caches, big data analytics, media codecs, gaming servers, and machine learning inference.

        + +

In the past we published a few blog posts on how PyTorch was optimized for AWS Graviton processors to accelerate ML inference performance for both eager mode (blog) and torch.compile mode (blog). In this blog we cover how to deploy a typical RAG workload using PyTorch and torch.compile, how we improved its performance by up to 1.7x for the embedding model and 1.3x for the RAG query on an AWS Graviton3-based m7g.xlarge instance compared to the default PyTorch eager mode, and finally a few recommendations that you can apply to your own RAG use cases.

        + +

        How to Optimize RAG?

        + +

        Without RAG, the LLM takes the user input and creates a response based on information it was trained on (what it already knows). With RAG, an information retrieval component is introduced that utilizes the user input to first pull information from a new data source. The user query and the relevant information are both given to the LLM. The LLM uses the new knowledge and its training data to create better responses. The following diagram shows the conceptual flow of using RAG with LLMs.

        + +


        + +

        Image 1: Conceptual flow of using RAG with LLMs

        + +

        Source: https://aws.amazon.com/what-is/retrieval-augmented-generation/

        + +

        Embedding model

        + +

At the core of RAG is an embedding model that takes the text data and converts it into a vector representation. These vectors are then stored in a vector db. When a user makes a query, the query is first converted to a vector, and RAG does a similarity search on the vector db. Hence, the first step in optimizing RAG performance is optimizing the embedding model’s inference performance. We used the AWS Graviton3-based m7g.xlarge instance and the HuggingFace sentence-transformer embedding model for the optimization work. Here is a sample script for profiling the HuggingFace sentence-transformer embedding model inference with PyTorch eager mode.

        + +
        import torch
        +from torch.profiler import profile, ProfilerActivity, record_function
        +from transformers import AutoModel, AutoTokenizer
        +
        +model_name = "sentence-transformers/all-mpnet-base-v2"
        +input_text = ["This is an example sentence", "Each sentence is converted"]
        +
        +model = AutoModel.from_pretrained(model_name)
        +tokenizer = AutoTokenizer.from_pretrained(model_name)
        +
        +encoded_input = tokenizer(
        +    input_text, padding=True, truncation=True, return_tensors="pt"
        +)
        +
        +warmup, actual = 100, 100
        +model.eval()
        +
        +with torch.no_grad():
        +    # warmup
        +    for i in range(warmup):
        +        embeddings = model(**encoded_input)
        +
        +    with profile(activities=[ProfilerActivity.CPU]) as prof:
        +        with record_function("model_inference"):
        +            for i in range(actual):
        +                embeddings = model(**encoded_input)
        +        print(prof.key_averages().table(sort_by="self_cpu_time_total"))
        +
        + +

        Eager mode

        + +

        Since PyTorch eager mode was already optimized on AWS Graviton processors with the following runtime environment settings, we included them in the baseline and measured the following performance. Please refer to Optimized PyTorch 2.0 Inference with AWS Graviton processors for more details on how we optimized the PyTorch eager mode on AWS Graviton processors.

        + +
        # Enable the fast math GEMM kernels, to accelerate fp32 inference with bfloat16 gemm
        +export DNNL_DEFAULT_FPMATH_MODE=BF16
        +
        +# Enable Linux Transparent Huge Page (THP) allocations,
        +# to reduce the tensor memory allocation latency
        +export THP_MEM_ALLOC_ENABLE=1
        +
        +# Set LRU Cache capacity to cache the primitives and avoid redundant
        +# memory allocations
        +export LRU_CACHE_CAPACITY=1024
        +
        + +
        ---------------------------  ------------  ------------  ------------  ------------  ------------  ------------  
        +                       Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg    # of Calls  
        +---------------------------  ------------  ------------  ------------  ------------  ------------  ------------  
        +                aten::addmm        61.01%        2.638s        62.49%        2.702s     370.197us          7300  
        +            model_inference        12.01%     519.161ms       100.00%        4.324s        4.324s             1  
        +                  aten::bmm         6.25%     270.084ms        11.96%     517.089ms     215.454us          2400  
        +               aten::select         3.98%     172.165ms         5.34%     230.863ms       1.331us        173500  
        +                aten::copy_         2.11%      91.133ms         2.11%      91.133ms       6.200us         14700   
        +---------------------------  ------------  ------------  ------------  ------------  ------------  ------------  
        +Self CPU time total: 4.324s
        +
        + +

        Table 1: Profiler output for HuggingFace sentence-transformer embedding model inference on AWS Graviton3-based m7g.xlarge instance with PyTorch Eager mode

        + +

        Next, we added torch.compile, weights pre-packing, and torch.inference_mode and observed around 1.7x performance improvement. The following section talks about each of these optimizations and the resulting speedup.

        + +

        torch.compile

        + +

In contrast to eager mode, torch.compile pre-compiles the entire model into a single graph in a manner that’s optimized for running on the given hardware. Please refer to Accelerated PyTorch Inference with torch.compile on AWS Graviton processors for more details on torch.compile features and how we optimized them on AWS Graviton processors. Invoke torch.compile as shown in the following snippet to trigger PyTorch dynamo compilation for the model. This resulted in around a 1.04x performance improvement over the baseline.

        + +
        model = torch.compile(model)
        +
        +----------------------------  ------------  ------------  ------------  ------------  ------------  ------------  
        +                        Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg    # of Calls  
        +----------------------------  ------------  ------------  ------------  ------------  ------------  ------------  
        +                 aten::addmm        64.46%        2.675s        66.66%        2.766s     378.905us          7300  
        +       Torch-Compiled Region        19.76%     820.085ms        99.04%        4.109s      41.094ms           100  
        +                   aten::bmm         6.66%     276.216ms        12.52%     519.527ms     216.470us          2400  
        +                aten::select         3.98%     164.991ms         5.41%     224.488ms       1.299us        172800  
        +            aten::as_strided         1.66%      69.039ms         1.66%      69.039ms       0.383us        180100  
        +----------------------------  ------------  ------------  ------------  ------------  ------------  ------------  
        +Self CPU time total: 4.149s
        +
        + +

        Table 2: Profiler output for HuggingFace sentence-transformer embedding model inference on AWS Graviton3-based m7g.xlarge instance with torch.compile mode

        + +

        Weights pre-packing

        + +

        torch.compile opens up opportunities like pre-packing the model weights into a format that is more suitable for the given hardware during the model compilation, thus improving the performance. Set the following config to trigger weights pre-packing. This resulted in around 1.69x improvement from the baseline.

        + +
        import torch._inductor.config as config
        +config.cpp.weight_prepack=True
        +config.freezing=True
        +
        + +
        -----------------------------  ------------  ------------  ------------  ------------  ------------  ------------  
        +                         Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg    # of Calls  
        +-----------------------------  ------------  ------------  ------------  ------------  ------------  ------------  
        +    mkldnn::_linear_pointwise        39.10%     994.821ms        41.50%        1.056s     144.628us          7300  
        +        Torch-Compiled Region        35.12%     893.675ms        98.42%        2.504s      25.043ms           100  
        +                    aten::bmm        10.96%     278.859ms        21.66%     551.073ms     229.614us          2400  
        +                 aten::select         7.34%     186.838ms         9.98%     253.840ms       1.469us        172800  
        +             aten::as_strided         2.63%      67.002ms         2.63%      67.002ms       0.388us        172800   
        +-----------------------------  ------------  ------------  ------------  ------------  ------------  ------------  
        +Self CPU time total: 2.544s
        +
        + +

        Table 3: Profiler output for HuggingFace sentence-transformer embedding model inference on AWS Graviton3-based m7g.xlarge instance with torch.compile and weights pre-packing

        + +

        torch.inference_mode

        + +

        Additionally, use torch.inference_mode() to get savings from turning off version control for tensors and view tracking of tensors. Please refer to the PyTorch documentation for more details.

        + +
        with torch.inference_mode():
        +# instead of
        +with torch.no_grad():
        +
        + +
        -----------------------------  ------------  ------------  ------------  ------------  ------------  ------------  
        +                         Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg    # of Calls  
        +-----------------------------  ------------  ------------  ------------  ------------  ------------  ------------  
        +    mkldnn::_linear_pointwise        38.92%     987.276ms        41.17%        1.044s     143.056us          7300  
        +        Torch-Compiled Region        34.92%     885.895ms        98.45%        2.498s      24.975ms           100  
        +                    aten::bmm        11.25%     285.292ms        22.22%     563.594ms     234.831us          2400  
        +                 aten::select         7.74%     196.223ms        10.22%     259.251ms       1.500us        172800  
        +             aten::as_strided         2.48%      63.027ms         2.48%      63.027ms       0.365us        172800  
        +-----------------------------  ------------  ------------  ------------  ------------  ------------  ------------  
        +Self CPU time total: 2.537s
        +
        + +

        Table 4: Profiler output for HuggingFace sentence-transformer embedding model inference on AWS Graviton3-based m7g.xlarge instance with torch.compile, weights pre-packing, and inference_mode

        + +

        The following table shows the incremental performance improvements achieved for the standalone embedding model inference.

Optimization level               Latency measured (in sec)    Improvement over the baseline
PyTorch eager mode (Baseline)    0.04324                      NA
torch.compile                    0.04149                      1.04x
weights pre-packing              0.02544                      1.69x
torch.inference_mode             0.02537                      1.70x
        + +

The following script is an updated example of the embedding model inference with the previously discussed optimizations included (torch.compile with weights pre-packing, and torch.inference_mode).

        + +
        +
+import torch
+from torch.profiler import profile, record_function, ProfilerActivity
+from transformers import AutoTokenizer, AutoModel
+import torch._inductor.config as config
+
+# Enable weights pre-packing and graph freezing in the inductor backend
+config.cpp.weight_prepack = True
+config.freezing = True
+
+model_name = "sentence-transformers/all-mpnet-base-v2"
+input_text = ["This is an example sentence", "Each sentence is converted"]
+
+model = AutoModel.from_pretrained(model_name)
+tokenizer = AutoTokenizer.from_pretrained(model_name)
+
+encoded_input = tokenizer(
+    input_text, padding=True, truncation=True, return_tensors="pt"
+)
+
+warmup, actual = 100, 100
+model.eval()
+model = torch.compile(model)
+
+with torch.inference_mode():  # instead of torch.no_grad()
+    # warmup
+    for i in range(warmup):
+        embeddings = model(**encoded_input)
+
+    with profile(activities=[ProfilerActivity.CPU]) as prof:
+        with record_function("model_inference"):
+            for i in range(actual):
+                embeddings = model(**encoded_input)
+    print(prof.key_averages().table(sort_by="self_cpu_time_total"))
        +
        +
        + +

        End-to-End RAG scenario on CPU

        + +

        After optimizing the embedding model inference, we started with a PyTorch eager mode based RAG setup, mainly to validate the functionality on the CPU backend. We built the RAG solution with HuggingFaceEmbeddings from langchain_community.embeddings, as shown in the following code snippet.

        + +
        from langchain_community.embeddings import HuggingFaceEmbeddings
        +from langchain_community.vectorstores import FAISS
        +from langchain.text_splitter import RecursiveCharacterTextSplitter
        +from langchain_community.document_loaders.recursive_url_loader import RecursiveUrlLoader
        +from langchain.prompts import PromptTemplate
        +from langchain_core.prompts import format_document
        +from bs4 import BeautifulSoup as Soup
        +import torch
        +
        +url =  "https://pytorch.org/blog/pytorch2-5/"
        +chunk_size = 1000
        +chunk_overlap = 0
        +embedding_model = "sentence-transformers/all-mpnet-base-v2"
        +N = 5
        +
        +question = "What's new in PyTorch 2.5?"
        +
        +from transformers import AutoTokenizer, AutoModel
        +from typing import Any, List
        +
+loader = RecursiveUrlLoader(
+    url=url, max_depth=3, extractor=lambda x: Soup(x, "html.parser").text
+)
        +docs = loader.load()
        +
        +# Split the document into chunks with a specified chunk size
        +text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
        +all_splits = text_splitter.split_documents(docs)
        +
        +# Store the document into a vector store with a specific embedding model
        +model = HuggingFaceEmbeddings(model_name=embedding_model)
        +
        +warmup , actual = 100, 100
        +
        +with torch.inference_mode():
        +    vectorstore = FAISS.from_documents(all_splits, model)
        +
        +    for i in range(warmup):
        +        searchDocs = vectorstore.similarity_search(question, k=N)
        +
        +    import time
        +
        +    start = time.time()
        +    for i in range(actual):
        +        searchDocs = vectorstore.similarity_search(question, k=N)
        +    end = time.time()
        +    print(f"Time for 1 inference is {(end-start)/actual} seconds")
        +
        +    doc_prompt = PromptTemplate.from_template("{page_content}")
        +    context = ""
        +    for i, doc in enumerate(searchDocs):
        +        context += f"\n{format_document(doc, doc_prompt)}\n"
        +
        + +

Next, our goal was to optimize the end-to-end RAG use case with the torch.compile and weights pre-packing optimizations that gave a 1.7x improvement for the standalone embedding model inference. However, these optimizations didn’t work out of the box for the RAG scenario.

        + +

        What are the challenges and solutions to achieve similar gains in an end-to-end RAG scenario?

        + +

        Challenge 1: model handle

        + +

There was no way to get a handle to the model instantiated with HuggingFaceEmbeddings, and the wrapper class doesn’t expose compile APIs. So, there was no way for our application to invoke torch.compile and trigger the PyTorch dynamo compilation process.

        + +

        Solution

        + +

We implemented a custom embedding class so that we could get a handle to the model. The class instantiates the embedding model from sentence-transformers and keeps the handle for immediate compilation or compilation at a later stage. With this, we were able to trigger torch.compile and hence the dynamo compilation.

        + +
        class CustomEmbedding(HuggingFaceEmbeddings):
        +    
        +    def __init__(self, **kwargs: Any):
        +        """Initialize the sentence_transformer."""
        +        super().__init__(**kwargs)
        +
        +        # Load model from HuggingFace Hub
        +        self.client = AutoModel.from_pretrained(self.model_name)
        +    class Config:
        +        arbitrary_types_allowed = True
        +
        +
        +    
        +    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        +        """Compute doc embeddings using a HuggingFace transformer model.
        +        Args:
        +            texts: The list of texts to embed.
        +        Returns:
        +            List of embeddings, one for each text.
        +        """
        +
        +        texts = list(map(lambda x: x.replace("\n", " "), texts))
        +
        +        # Tokenize sentences
        +        tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        +        encoded_input = tokenizer(texts, padding=True, truncation=True, return_tensors='pt')
        +        
        +        embeddings = self.client(
        +           **encoded_input, output_hidden_states=True
        +        )
        +        embeddings = embeddings.pooler_output.detach().numpy()
        +
        +        return embeddings.tolist()
        +
        +# instead of model = HuggingFaceEmbeddings(model_name=embedding_model)
        +model = CustomEmbedding(model_name=embedding_model)
        +
        +# torch.compile the model
        +model.client = torch.compile(model.client)
        +
        + +

        Challenge 2: triggering the optimization

        + +

For a typical inference scenario where the graph is frozen and gradient calculations are disabled, Torch inductor (the compiler backend we used for CPUs) invokes hardware-specific optimizations such as graph rewrites into more performant operators, operator fusion, and weights pre-packing. Though Torch dynamo was able to see the model and trigger generic compilation, it failed to trigger these additional Fx passes in Torch inductor.

        + +

There were two main reasons for Torch inductor not triggering the optimization passes: (1) the application didn’t set no_grad() or inference_mode(), so Torch inductor could not detect that the graph was frozen; and (2) we hit a limitation of the torch.compile framework: if no_grad is set only at the beginning of the compiled region, torch.compile cannot detect it when invoking the inductor Fx passes, because execution has not reached the no_grad region by then. Please refer to this GitHub issue for more details.

        + +

        Solution

        + +

We worked around this limitation by moving the no_grad() context from within the model class into the application code. With this, model compilation happened as expected and gave around a 1.3x performance improvement when we profiled the stable inference pass for the eager and compiled versions.
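As a minimal sketch of this workaround (reusing the names from the earlier snippets; the exact call sites in a real application may differ), the context manager is entered at the application level, around vector store creation and query, rather than inside embed_documents:

# Sketch: enter inference_mode in the application, before the compiled model runs,
# so the inductor freezing/weight pre-packing passes see a frozen, gradient-free graph.
model = CustomEmbedding(model_name=embedding_model)
model.client = torch.compile(model.client)

with torch.inference_mode():
    vectorstore = FAISS.from_documents(all_splits, model)
    searchDocs = vectorstore.similarity_search(question, k=N)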

        + +

        Challenge 3: extra compilation

        + +

With the previous fixes, the query lookup inference performance improved, but not the total execution time of the benchmarking script. We root-caused it to redundant compilation of the model during RAG inference. Further deep diving revealed that it was caused by a batch size mismatch between the word embedding and the RAG query stages. For example, in our benchmarking script, when the database was vectorized and stored in the vector db, we used a batch size of 16, hence the model was compiled with shapes of 16xNxK. The RAG query lookup, however, is usually a single request of shape 1xNxK. So, there was a batch size mismatch (dimension “0” of these tensors) that triggered recompilation for the query lookup stage. We confirmed it with the following Torch logging: TORCH_LOGS="recompiles"

        + +
        TORCH_LOGS="recompiles" python rag_compile.py 
        +V1103 02:48:08.805986 34281 site-packages/torch/_dynamo/guards.py:2813] [0/1] [__recompiles] Recompiling function forward in site-packages/transformers/models/mpnet/modeling_mpnet.py:502
        +V1103 02:48:08.805986 34281 site-packages/torch/_dynamo/guards.py:2813] [0/1] [__recompiles]     triggered by the following guard failure(s):
        +V1103 02:48:08.805986 34281 site-packages/torch/_dynamo/guards.py:2813] [0/1] [__recompiles]     - 0/0: tensor 'L['input_ids']' size mismatch at index 0. expected 16, actual 1
        +
        + +

        Solution

        + +

Torch dynamo provides decorators to mark a dimension of a given tensor as dynamic and to specify an expected value for it, so that re-compilation is not triggered. For example, marking dimension “0” of input_ids and attention_mask as dynamic, and specifying that a value of “1” is allowed in that dimension (as shown in the following code snippet), should have avoided the redundant compilations.

        + +
        torch._dynamo.decorators.mark_unbacked(encoded_input['input_ids'], 0)
        +torch._dynamo.mark_dynamic(encoded_input['input_ids'], 1)
+torch._dynamo.decorators.mark_unbacked(encoded_input['attention_mask'], 0)
        +torch._dynamo.mark_dynamic(encoded_input['attention_mask'], 1)
        +
        + +

However, the Torch dynamo decorators and marking didn’t work in this particular case; worse, using the decorator created graph breaks. So we added some warmup iterations to hide the compilation latency, and profiled the query lookup performance in the steady state. The good news is that, in practice, this re-compilation is triggered only for the first query, so it might not affect the production scenario if the database size is fixed. Moreover, PyTorch AOT Inductor (a new feature in PyTorch) addresses the re-compilation and warm-up challenges of torch.compile. In a follow-up blog we will cover how AOT Inductor can be used to address these challenges in a production environment.
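A compact sketch of that steady-state measurement, mirroring the benchmarking script shown earlier (warmup, actual, vectorstore, question and N are the same names as above): the first 1xNxK query absorbs the one-time recompilation during warmup, and only the subsequent queries are timed.

import time

with torch.inference_mode():
    # warmup: the first similarity_search triggers the one-time 1xNxK recompile
    for _ in range(warmup):
        vectorstore.similarity_search(question, k=N)

    start = time.time()
    for _ in range(actual):   # steady state: no further compilation
        searchDocs = vectorstore.similarity_search(question, k=N)
    print(f"Steady-state query latency: {(time.time() - start) / actual} seconds")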

        + +

With these solutions we were able to apply torch.compile, weights pre-packing and the AWS Graviton specific optimizations to an end-to-end RAG scenario and improve its performance by 1.3x over the baseline eager mode.

        + +

        Deployment

        + +

        A detailed guide on how to deploy torch compiled RAG on AWS Graviton-based Amazon EC2 instances and how to deploy it in conjunction with Llama using TorchServe can be found on the PyTorch website.

        + +

        Conclusion

        + +

In this blog, we covered how we optimized embedding model inference performance on AWS Graviton3-based EC2 instances. We also shared the challenges we faced, the solutions we implemented to bring those optimizations to a RAG use case, and the resulting speedups. We hope that you will give it a try! If you need any support with ML software on Graviton, please open an issue on the AWS Graviton Technical Guide GitHub.

        + +

        We would like to express our gratitude to Eli Uriegas for the support in making this blog post happen.

        + +

        Authors

        + +

        Sunita Nadampalli is a Principal Engineer and AI/ML expert at AWS. She leads AWS Graviton software performance optimizations for AI/ML and HPC workloads. She is passionate about open source software development and delivering high-performance and sustainable software solutions for SoCs based on the Arm ISA.

        + +

Ankith Gunapal is an AI Partner Engineer at Meta (PyTorch). He leads customer support, evangelism, and release engineering for TorchServe. He is passionate about solving production problems in model inference and model serving. He also enjoys distilling technically complex material into a user-friendly format.

        + +

        Hamid Shojanazeri leads the AI Frameworks Partner Engineering team at Meta. He is passionate about building scalable AI solutions and specializes in working with PyTorch to tackle the challenges of large-scale distributed training, inference, model serving, and optimization.

        + +
        +
        +
        +
        + + +
        +
        +
        +
        +

        +
        +
        +
        + +
        + +
        + +
        +
        +
        +
        + + +
        +
        +
        + + +
diff --git a/blog/index.html b/blog/index.html
new file mode 100644
index 000000000000..f448c42d70c5
--- /dev/null
+++ b/blog/index.html
@@ -0,0 +1,995 @@
Blog | PyTorch
        +
        +
        +
        +
        + + +
        + + + + + + + + +
        + +
        +
        + + +
        + + + + + + +
        +
        + +

        Featured Post

        +

        + Recap of the PyTorch Korea User Group Meetup: A Technical Conference with a PyTorch Core Maintainer +

        + + + + Read More + + + +
        +
        + +
        +
        +
        +
        + + + + + + + + +
        +
        +

        May 02, 2025

        +

        + PyTorch Day France Featured Sessions: A Defining Moment for Open Source AI +

        +

        PyTorch Day France offers a front-row seat to the future of open source AI. Taking place 7 May at Station F in Paris and co-located with GOSIM AI Paris, this one-day event will bring together developers, researchers, and industry leaders for a day of technical sessions, real-world insights, and community exchange. + +

        + +
        + + Read More + +
        + + + + +
        +
        +

        May 01, 2025

        +

        + Announcing the PyTorch Docathon 2025 +

        +

        + +

        + +
        + + Read More + +
        + + + + +
        +
        +

        April 30, 2025

        +

        + FlexAttention Part II: FlexAttention for Inference +

        +

        Overview + +

        + +
        + + Read More + +
        + + + + +
        +
        +

        April 30, 2025

        +

        + 6x faster Async Checkpointing in PyTorch, using Cached Plans, no GIL contention +

        +

        Meta: Less Wright, Meet Vadakkanchery, Saurabh Mishra, Ela Krepska, Hamid Shojanazeri, Pradeep Fernando +Crusoe: Ethan Petersen, Martin Cala, Chip Smith + +

        + +
        + + Read More + +
        + + + + +
        +
        +

        April 29, 2025

        +

        + PyTorch Foundation Expands to an Umbrella Foundation to Accelerate AI Innovation +

        +

        Today, I am thrilled to announce a significant milestone for the PyTorch Foundation: we are expanding our scope to become an umbrella foundation, allowing us to host additional projects. This expansion positions the PyTorch Foundation to foster a broader ecosystem of high-value, trusted, and innovative AI projects that cater to all stages of the AI lifecycle—from training and inference to industry-specific applications. + +

        + +
        + + Read More + +
        + + + + +
        +
        +

        April 28, 2025

        +

        + Accelerating Large Scale Training and Convergence with PyTorch Float8 Rowwise on Crusoe 2K H200s +

        +

        Meta: Less Wright, Hamid Shojanazeri, Vasiliy Kuznetsov, Daniel Vega-Myhre, Gokul Nadathur, Will Constable, Tianyu Liu, Tristan Rice, Driss Guessous, Josh Fromm, Luca Wehrstedt, Jiecao Yu +Crusoe: Ethan Petersen, Martin Cala, Chip Smith + +

        + +
        + + Read More + +
        + + + + +
        +
        +

        April 25, 2025

        +

        + Accelerate PyTorch 2.7 on Intel® GPUs +

        +

        PyTorch 2.7 continues to deliver significant functionality and performance enhancements on Intel® GPU architectures to streamline AI workflows. Application developers and researchers seeking to fine-tune, inference and develop PyTorch models on Intel GPUs will now have a consistent user experience across various operating systems, including Windows, Linux and Windows Subsystem for Linux (WSL2). This is made possible through improved installation, eager mode script debugging, a performance pro...

        + +
        + + Read More + +
        + +
        + +
        + +
        + +
        +
        +
        + +
        +
        +
        +
        + +

        Install PyTorch

        + +

        Select your preferences and run the install command. Stable represents the most currently tested and supported version of PyTorch. This should + be suitable for many users. Preview is available if you want the latest, not fully tested and supported, builds that are generated nightly. + Please ensure that you have met the prerequisites below (e.g., numpy), depending on your package manager. You can also + install previous versions of PyTorch. Note that LibTorch is only available for C++. +

        + +

        NOTE: Latest PyTorch requires Python 3.9 or later.

        + +
        +
        +
        +
        PyTorch Build
        +
        +
        +
        Your OS
        +
        +
        +
        Package
        +
        +
        +
        Language
        +
        +
        +
        Compute Platform
        +
        +
        +
        Run this Command:
        +
        +
        + +
        +
        +
        +
        PyTorch Build
        +
        +
        +
        Stable (1.13.0)
        +
        +
        +
        Preview (Nightly)
        +
        +
        +
        +
        +
        Your OS
        +
        +
        +
        Linux
        +
        +
        +
        Mac
        +
        +
        +
        Windows
        +
        +
        +
        +
        +
        Package
        +
        +
        +
        Pip
        +
        +
        +
        LibTorch
        +
        +
        +
        Source
        +
        +
        +
        +
        +
        Language
        +
        +
        +
        Python
        +
        +
        +
        C++ / Java
        +
        +
        +
        +
        +
        Compute Platform
        +
        +
        +
        CUDA 11.8
        +
        +
        +
        CUDA 12.1
        +
        +
        +
        CUDA 12.4
        +
        +
        +
        ROCm 5.2
        +
        +
        +
        CPU
        +
        +
        +
        +
        +
        Run this Command:
        +
        +
        +
        pip install torch torchvision
        +
        +
        +
        +
        +
        + + + + Previous versions of PyTorch + +
        + +
        +

        Quick Start With
        Cloud Partners

        + +

        Get up and running with PyTorch quickly through popular cloud platforms and machine learning services.

        + +
        + + +
        +
        +
        + Google Cloud Platform +
        + + + + + +
        +
        + +
        +
        +
        +

        Microsoft Azure

        +
        + + +
        +
        + +
        +
        +
        + Lightning Studios +
        + +
        +
        +
        + +
        +
        +
        +
        + + + + +
        +
        +
        +
        +

        +
        +
        +
        + +
        + +
        + +
        +
        +
        +
        + + +
        +
        +
        + + +
diff --git a/blog/inside-the-matrix/index.html b/blog/inside-the-matrix/index.html
new file mode 100644
index 000000000000..87ecf6947944
--- /dev/null
+++ b/blog/inside-the-matrix/index.html
@@ -0,0 +1,1075 @@
Inside the Matrix: Visualizing Matrix Multiplication, Attention and Beyond | PyTorch
        +
        +
        +
        + +
        +
        + + +
        + + + + + + + + +
        + +
        +
        + + + +
        +
        +
        + +
        +

        + by + + Basil Hosmer + +

        +

        Use 3D to visualize matrix multiplication expressions, attention heads with real weights, and more.

        + +

        Matrix multiplications (matmuls) are the building blocks of today’s ML models. This note presents mm, a visualization tool for matmuls and compositions of matmuls.

        + +

        Matrix multiplication is inherently a three-dimensional operation. Because mm uses all three spatial dimensions, it can convey meaning more clearly and intuitively than the usual squares-on-paper idioms, especially (though not only) for visual/spatial thinkers.

        + +

        We also have room to compose matmuls in geometrically consistent ways - so we can visualize big, compound structures like attention heads and MLP layers using the same rules as simple expressions. And more advanced features, like animating different matmul algorithms, partitioning for parallelism, and loading external data to explore the behavior of actual models, all build naturally on this foundation.

        + +

        mm is fully interactive, runs in the browser and keeps its complete state in the URL, so links are shareable sessions (the screenshots and videos in this note all have links that open the corresponding visualization in the tool). This reference guide describes all of the available functionality.

        + +

        We’ll first introduce the visualization approach, build intuition by visualizing some simple matmuls and expressions, then dive into some more extended examples:

        + +
          +
1. Pitch - why is this way of visualizing better?
2. Warmup - animations - watching the canonical matmul decompositions in action
3. Warmup - expressions - a quick tour of some fundamental expression building blocks
4. Inside an attention head - an in-depth look at the structure, values and computation behavior of a couple of attention heads from GPT2 via NanoGPT
5. Parallelizing attention - visualizing attention head parallelization with examples from the recent Blockwise Parallel Transformer paper
6. Sizes in an attention layer - what do the MHA and FFA halves of an attention layer look like together, when we visualize a whole layer as a single structure? How does the picture change during autoregressive decoding?
7. LoRA - a visual explanation of this elaboration of the attention head architecture
8. Wrapup - next steps and call for feedback
        + +

        1 Pitch

        + +

        mm’s visualization approach is based on the premise that matrix multiplication is fundamentally a three-dimensional operation.

        + +

        In other words this:

        + +

        matrix multiplication is fundamentally a three-dimensional operation

        + +

        is a sheet of paper trying to be this (open in mm):

        + +

        wrap the matmul around a cube

        + +

        When we wrap the matmul around a cube this way, the correct relationships between argument shapes, result shape and shared dimensions all fall into place.

        + +

Now the computation makes geometric sense: each location i, j in the result matrix anchors a vector running along the depth dimension k in the cube’s interior, where the horizontal plane extending from row i in L and the vertical plane extending from column j in R intersect. Along this vector, pairs of (i, k) and (k, j) elements from the left and right arguments meet and are multiplied; the resulting products are summed along k, and the sum is deposited in location i, j of the result.

        + +
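For readers who prefer code to geometry, here is a minimal NumPy transcription of the same picture; the shapes below are arbitrary illustrative choices, not anything taken from mm itself.

import numpy as np

I, K, J = 4, 6, 5                       # L is I x K, R is K x J, result is I x J
L, R = np.random.randn(I, K), np.random.randn(K, J)

result = np.zeros((I, J))
for i in range(I):                      # horizontal plane extending from row i of L
    for j in range(J):                  # vertical plane extending from column j of R
        for k in range(K):              # depth dimension where the two planes intersect
            result[i, j] += L[i, k] * R[k, j]

assert np.allclose(result, L @ R)       # matches the library matmul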

        (Jumping ahead momentarily, here’s an animation.)

        + +

        This is the intuitive meaning of matrix multiplication:

        + +
          +
1. project two orthogonal matrices into the interior of a cube
2. multiply the pair of values at each intersection, forming a grid of products
3. sum along the third orthogonal dimension to produce a result matrix.
        + +

        For orientation, the tool displays an arrow in the cube’s interior that points towards the result matrix, with a blue vane coming from the left argument and a red vane coming from the right argument. The tool also displays white guidelines to indicate the row axis of each matrix, though they’re faint in this screenshot.

        + +

        The layout constraints are straightforward:

        + +
          +
• left argument and result must be adjoined along their shared height (i) dimension
• right argument and result must be adjoined along their shared width (j) dimension
• left and right arguments must be adjoined along their shared (left width/right height) dimension, which becomes the matmul’s depth (k) dimension
        + +

        This geometry gives us a solid foundation for visualizing all the standard matmul decompositions, and an intuitive basis for exploring nontrivially complex compositions of matmuls, as we’ll see below.

        + +

        2 Warmup - animations

        + +

        Before diving into some more complex examples, we’ll run through a few intuition builders to get a feel for how things look and feel in this style of visualization.

        + +

        2a Dot product

        + +

        First, the canonical algorithm - computing each result element by taking the dot product of the corresponding left row and right column. What we see in the animation is the sweep of multiplied value vectors through the cube’s interior, each delivering a summed result at the corresponding position.

        + +

        Here, L has blocks of rows filled with 1 (blue) or -1 (red); R has column blocks filled similarly. k is 24 here, so the result matrix (L @ R) has blue values of 24 and red values of -24 (open in mm - long click or control-click to inspect values):

        + +

        + +

        + +
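The same setup is easy to check numerically. The block sizes below are illustrative stand-ins for the ones in the screenshot, but the +/-24 result values follow directly from k = 24.

import numpy as np

k = 24
L = np.concatenate([np.ones((8, k)), -np.ones((8, k))])           # row blocks of +1 / -1
R = np.concatenate([np.ones((k, 8)), -np.ones((k, 8))], axis=1)   # column blocks of +1 / -1

print(np.unique(L @ R))   # [-24.  24.]: each dot product sums 24 terms of the same sign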

        2b Matrix-vector products

        + +

        A matmul decomposed into matrix-vector products looks like a vertical plane (a product of the left argument with each column of the right argument) painting columns onto the result as it sweeps horizontally through the cube’s interior (open in mm):

        + +

        + +

        + +
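In code, this decomposition is just computing the result one column at a time (a small NumPy sketch with arbitrary sizes, not mm itself):

import numpy as np

L, R = np.random.randn(4, 6), np.random.randn(6, 5)

# Each step of the sweep paints one result column: column j is L @ R[:, j].
cols = [L @ R[:, j] for j in range(R.shape[1])]
assert np.allclose(np.stack(cols, axis=1), L @ R)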

        Observing the intermediate values of a decomposition can be quite interesting, even in simple examples.

        + +

For instance, note the prominent vertical patterns in the intermediate matrix-vector products when we use randomly initialized arguments - reflecting the fact that each intermediate is a column-scaled replica of the left argument (open in mm):

        + +

        + +

        + +

        2c Vector-matrix products

        + +

        A matmul decomposed into vector-matrix products looks like a horizontal plane painting rows onto the result as it descends through the cube’s interior (open in mm):

        + +

        + +

        + +
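The code form is the row-wise mirror of the previous sketch: each step paints one result row (again with arbitrary illustrative sizes).

import numpy as np

L, R = np.random.randn(4, 6), np.random.randn(6, 5)

# Row i of the result is the vector-matrix product L[i, :] @ R,
# i.e. a row-scaled combination of the rows of R.
rows = [L[i, :] @ R for i in range(L.shape[0])]
assert np.allclose(np.stack(rows, axis=0), L @ R)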

        Switching to randomly initialized arguments, we see patterns analogous to those we saw with matrix-vector products - only this time the patterns are horizontal, corresponding to the fact that each intermediate vector-matrix product is a row-scaled replica of the right argument.

        + +

        When thinking about how matmuls express the rank and structure of their arguments, it’s useful to envision both of these patterns happening simultaneously in the computation (open in mm):

        + +

        + +

        + +

        Here’s one more intuition builder using vector-matrix products, showing how the identity matrix functions exactly like a mirror set at a 45deg angle to both its counterargument and the result (open in mm):

        + +

        + +

        + +

        2d Summed outer products

        + +

        The third planar decomposition is along the k axis, computing the matmul result by a pointwise summation of vector outer products. Here we see the plane of outer products sweeping the cube “from back to front”, accumulating into the result (open in mm):

        + +

        + +

        + +

        Using randomly initialized matrices with this decomposition, we can see not just values but rank accumulate in the result, as each rank-1 outer product is added to it.
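A small sketch makes the rank accumulation concrete: summing rank-1 outer products along k, the rank of the partial result grows until it saturates (sizes are arbitrary).

import numpy as np

L, R = np.random.randn(4, 6), np.random.randn(6, 5)

acc = np.zeros((4, 5))
for k in range(L.shape[1]):
    acc += np.outer(L[:, k], R[k, :])        # one rank-1 outer product per k slice
    print(k, np.linalg.matrix_rank(acc))     # rank typically grows 1, 2, 3, 4, then saturates

assert np.allclose(acc, L @ R)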

        + +

Among other things this builds intuition for why “low-rank factorization” - i.e. approximating a matrix by constructing a matmul whose arguments are small in the depth dimension - works best when the matrix being approximated is low rank. More on LoRA in a later section (open in mm):

        + +

        + +

        + +
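To make the claim about approximation quality concrete, here is a hedged sketch that uses truncated SVD as the small-depth factorization; the sizes and the rank-4 target are arbitrary choices for illustration.

import numpy as np

def rank_r_error(M, r):
    # Best depth-r factorization M ~= A @ B via truncated SVD; returns relative error.
    U, s, Vt = np.linalg.svd(M, full_matrices=False)
    approx = (U[:, :r] * s[:r]) @ Vt[:r, :]
    return np.linalg.norm(M - approx) / np.linalg.norm(M)

n, r = 64, 4
low_rank = np.random.randn(n, r) @ np.random.randn(r, n)    # a genuinely rank-4 matrix
full_rank = np.random.randn(n, n)                           # a generic full-rank matrix

print(rank_r_error(low_rank, r))    # ~0: a depth-4 matmul captures it exactly
print(rank_r_error(full_rank, r))   # large: most of the matrix is missed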

        3 Warmup - expressions

        + +

        How can we extend this visualization approach to compositions of matmuls? Our examples so far have all visualized a single matmul L @ R of some matrices L and R - what about when L and/or R are themselves matmuls, and so on transitively?

        + +

        It turns out we can extend the approach nicely to compound expressions. The key rules are simple: the subexpression (child) matmul is another cube, subject to the same layout constraints as the parent, and the result face of the child is simultaneously the corresponding argument face of the parent, like a covalently shared electron.

        + +

        Within these constraints, we’re free to arrange the faces of a child matmul however we like. Here we use the tool’s default scheme, which generates alternating convex and concave cubes - this layout works well in practice to maximize use of space and minimize occlusion. (Layouts are completely customizable, however - see the reference for details.)

        + +

        In this section we’ll visualize some of the key building blocks we find in ML models, to gain fluency in the visual idiom and to see what intuitions even simple examples can give us.

        + +

        3a Left-associative expressions

        + +

        We’ll look at two expressions of the form (A @ B) @ C, each with its own distinctive shape and character. (Note: mm adheres to the convention that matrix multiplication is left-associative and writes this simply as A @ B @ C.)

        + +

        First we’ll give A @ B @ C the characteristic FFN shape, in which the “hidden dimension” is wider than the “input” or “output” dimensions. (Concretely in the context of this example, this means that the width of B is greater than the widths of A or C.)

        + +

        As in the single matmul examples, the floating arrows point towards the result matrix, blue vane coming from the left argument and red vane from right argument (open in mm):

        + +

        As in the single matmul examples, the floating arrows point towards the result matrix, blue vane coming from the left argument and red vane from right argument

        + +

        Next we’ll visualize A @ B @ C with the width of B narrower than that of A or C, giving it a bottleneck or “autoencoder” shape (open in mm):

        + +

        visualize A @ B @ C with the width of B narrower than that of A or C

        + +
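Shape-wise, the two variants differ only in the width of B; a tiny sketch with arbitrary sizes:

import numpy as np

A = np.random.randn(8, 16)
B_ffn, C_ffn = np.random.randn(16, 64), np.random.randn(64, 16)   # wide hidden dimension (FFN shape)
B_ae, C_ae = np.random.randn(16, 4), np.random.randn(4, 16)       # narrow hidden dimension (bottleneck)

print((A @ B_ffn @ C_ffn).shape, (A @ B_ae @ C_ae).shape)   # both (8, 16)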

        This pattern of alternating convex and concave blocks extends to chains of arbitrary length: for example this multilayer bottleneck (open in mm):

        + +

        pattern of alternating convex and concave blocks extends to chains of arbitrary length

        + +

        3b Right associative expressions

        + +

        Next we’ll visualize a right-associative expression A @ (B @ C).

        + +

        In the same way left-associative expressions extend horizontally - sprouting from the left argument of the root expression, so to speak - right-associative chains extend vertically, sprouting from the root’s right argument.

        + +

        One sometimes sees an MLP formulated right-associatively, i.e. with columnar input on the right and weight layers running right to left. Using the matrices from the 2-layer FFN example pictured above - suitably transposed - here’s what that looks like, with C now playing the role of the input, B the first layer and A the second layer (open in mm):

        + +

        an MLP formulated right-associatively

        + +

        Aside: in addition to the color of the arrow vanes (blue for left, red for right), a second visual cue for distinguishing left and right arguments is their orientation: the rows of the left argument are coplanar with those of the result - they stack along the same axis (i). Both cues tell us for example that B is the left argument to (B @ C) above.

        + +

        3c Binary expressions

        + +

For a visualization tool to be useful beyond simple didactic examples, visualizations need to remain legible as expressions get more complicated. A key structural component in real-world use cases is the binary expression - a matmul with subexpressions on both its left and right sides.

        + +

        Here we’ll visualize the simplest such expression shape, (A @ B) @ (C @ D) (open in mm):

        + +

        binary expressions - matmuls with subexpressions on both the left and right

        + +

        3d Quick aside: partitioning and parallelism

        + +

        A full presentation of this topic is out of scope for this note, though we’ll see it in action later in the context of attention heads. But as a warmup, two quick examples should give a sense of how this style of visualization makes reasoning about parallelizing compound expressions very intuitive, via the simple geometry of partitioning.

        + +

        In the first example we’ll apply the canonical “data parallel” partitioning to the left-associative multilayer bottleneck example above. We partition along i, segmenting the initial left argument (“batch”) and all intermediate results (“activations”), but none of the subsequent arguments (“weights”) - the geometry making it obvious which participants in the expression are segmented and which remain whole (open in mm):

        + +

        the canonical "data parallel" partitioning to the left-associative multilayer bottleneck example

        + +
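In code, this partitioning is just a split along the batch axis; a sketch with a two-layer chain and arbitrary sizes (the multilayer bottleneck in the screenshot has more layers, but the pattern is the same):

import numpy as np

batch = np.random.randn(8, 16)                              # "batch": the initial left argument
W1, W2 = np.random.randn(16, 64), np.random.randn(64, 16)   # "weights": never segmented

shards = np.split(batch, 2, axis=0)                         # partition along i
out = np.concatenate([b @ W1 @ W2 for b in shards])         # activations stay segmented per shard
assert np.allclose(out, batch @ W1 @ W2)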

        The second example would (for me, anyway) be much harder to build intuition about without clear geometry to support it: it shows how a binary expression can be parallelized by partitioning the left subexpression along its j axis, the right subexpression along its i axis, and the parent expression along its k axis (open in mm):

        + +

        a binary expression can be parallelized by partitioning the left subexpression along its j axis, the right subexpression along its i axis, and the parent expression along its k axis

        + +
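The same partitioning can be checked algebraically in a few lines. This sketch only verifies the arithmetic of the split with arbitrary sizes; it is not an actual parallel execution.

import numpy as np

A, B = np.random.randn(4, 3), np.random.randn(3, 6)   # left child  (A @ B): 4 x 6
C, D = np.random.randn(6, 3), np.random.randn(3, 5)   # right child (C @ D): 6 x 5

P = 2  # partitions of the parent's k axis (size 6)
left_blocks = np.split(A @ B, P, axis=1)    # left child partitioned along its j axis
right_blocks = np.split(C @ D, P, axis=0)   # right child partitioned along its i axis

# The parent matmul becomes a sum of P partial products, one per k partition.
out = sum(lb @ rb for lb, rb in zip(left_blocks, right_blocks))
assert np.allclose(out, (A @ B) @ (C @ D))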

        4 Inside an Attention Head

        + +

        Let’s look at a GPT2 attention head - specifically layer 5, head 4 of the “gpt2” (small) configuration (layers=12, heads=12, embed=768) from NanoGPT, using OpenAI weights via HuggingFace. Input activations are taken from a forward pass on an OpenWebText training sample of 256 tokens.

        + +

        There’s nothing particularly unusual about this particular head; I chose it mainly because it computes a fairly common attention pattern and lives in the middle of the model, where activations have become structured and show some interesting texture. (Aside: in a subsequent note I’ll present an attention head explorer that lets you visualize all layers and heads of this model, along with some travel notes.)

        + +

        Open in mm (may take a few seconds to fetch model weights)

        + +

        There's nothing particularly unusual about this particular head

        + +

        4a Structure

        + +

        The entire attention head is visualized as a single compound expression, starting with input and ending with projected output. (Note: to keep things self-contained we do per-head output projection as described in Megatron-LM.)

        + +

        The computation contains six matmuls:

        + +
        Q = input @ wQ        // 1
        +K_t = wK_t @ input_t  // 2
        +V = input @ wV        // 3
        +attn = sdpa(Q @ K_t)  // 4
        +head_out = attn @ V   // 5
        +out = head_out @ wO   // 6
        +
        + +
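For readers who want something runnable, here is a hedged PyTorch sketch of the same six matmuls for a single head. The random weights, the explicit causal mask, and the 1/sqrt(head_dim) scaling stand in for sdpa and the real per-head projections; the sizes follow the GPT2-small setup described above (256 tokens, embed 768), with an assumed head dimension of 64.

import torch

seq, embed, head_dim = 256, 768, 64
inp = torch.randn(seq, embed)
wQ, wK, wV = (torch.randn(embed, head_dim) for _ in range(3))
wO = torch.randn(head_dim, embed)

Q = inp @ wQ                              # 1
K_t = wK.T @ inp.T                        # 2  (head_dim x seq)
V = inp @ wV                              # 3
scores = (Q @ K_t) / head_dim ** 0.5
causal = torch.tril(torch.ones(seq, seq, dtype=torch.bool))
attn = torch.softmax(scores.masked_fill(~causal, float("-inf")), dim=-1)   # 4 (causal sdpa stand-in)
head_out = attn @ V                       # 5
out = head_out @ wO                       # 6  back to the embedding dimension
print(out.shape)                          # torch.Size([256, 768])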

        A thumbnail description of what we’re looking at:

        + +
          +
• the blades of the windmill are matmuls 1, 2, 3 and 6: the former group are the in-projections from input to Q, K and V; the latter is the out-projection from attn @ V back to the embedding dimension.
• at the hub is the double matmul that first calculates attention scores (convex cube in back), then uses them to produce output tokens from the values vector (concave cube in front). Causality means that the attention scores form a lower triangle.
        + +

        But I’d encourage exploring this example in the tool itself, rather than relying on the screenshot or the video below to convey just how much signal can be absorbed from it - both about its structure and the actual values flowing through the computation.

        + +

        4b Computation and Values

        + +

        Here’s an animation of the attention head computation. Specifically, we’re watching

        + +
        sdpa(input @ wQ @ K_t) @ V @ wO
        +
        + +

(i.e., matmuls 1, 4, 5 and 6 above, with K_t and V precomputed) being computed as a fused chain of vector-matrix products: each item in the sequence goes all the way from input through attention to output in one step. More on this animation choice in the later section on parallelization, but first let’s look at what the values being computed tell us.

        + +

        Open in mm

        + +

        + +

        + +

        There’s a lot of interesting stuff going on here.

        + +
          +
        • Before we even get to the attention calculation, it’s quite striking how low-rank Q and K_t are. Zooming in on the Q @ K_t vector-matrix product animation, the situation is even more vivid: a significant number of channels (embedding positions) in both Q and K look more or less constant across the sequence, implying that the useful attention signal is potentially driven by a only smallish subset of the embedding. Understanding and exploiting this phenomenon is one of the threads we’re pulling on as part of the SysML ATOM transformer efficiency project.
        • +
        • Perhaps most familiar is the strong-but-not-perfect diagonal that emerges in the attention matrix. This is a common pattern, showing up in many of the attention heads of this model (and those of many transformers). It produces localized attention: the value tokens in the small neighborhood immediately preceding an output token’s position largely determine that output token’s content pattern.
        • +
        • However, the size of this neighborhood and the influence of individual tokens within it vary nontrivially - this can be seen both in the off-diagonal frost in the attention grid, and in the fluctuating patterns of the attn[i] @ V vector-matrix product plane as it descends the attention matrix on its way through the sequence.
        • +
        • But note that the local neighborhood isn’t the only thing that’s attracting attention: the leftmost column of the attention grid, corresponding to the first token of the sequence, is entirely filled with nonzero (but fluctuating) values, meaning every output token will be influenced to some degree by the first value token.
        • +
        • Moreover there’s an inexact but discernible oscillation in attention score dominance between the current token neighborhood and the initial token. The period of the oscillation varies, but broadly speaking starts short and then lengthens as one travels down the sequence (evocatively correlated with the quantity of candidate attention tokens for each row, given causality).
        • +
• To get a feel for how (attn @ V) is formed, it’s important not to focus on attention in isolation - V is an equal player. Each output item is a weighted average of the entire V vector: at the limit when attention is a perfect diagonal, attn @ V is simply an exact copy of V. Here we see something more textured: visible banding where particular tokens have scored high over a contiguous subsequence of attention rows, superimposed on a matrix visibly similar to V but with some vertical smearing due to the fat diagonal. (Aside: per the mm reference guide, long-clicking or control-clicking will reveal the actual numeric values of visualized elements.)
        • +
        • Bear in mind that since we’re in a middle layer (5), the input to this attention head is an intermediate representation, not the original tokenized text. So the patterns seen in the input are themselves thought-provoking - in particular, the strong vertical threads are particular embedding positions whose values are uniformly high magnitude across long stretches of the sequence - sometimes almost the entire thing.
        • +
        • Interestingly, though, the first vector in the input sequence is distinctive, not only breaking the pattern of these high-magnitude columns but carrying atypical values at almost every position (aside: not visualized here, but this pattern is repeated over multiple sample inputs).
        • +
        + +

Note: apropos of the last two bullet points, it’s worth reiterating that we’re visualizing computation over a single sample input. In practice I’ve found that each head has a characteristic pattern it will express consistently (though not identically) over a decent collection of samples (and the upcoming attention head browser will provide a collection of samples to play with), but when looking at any visualization that includes activations, it’s important to bear in mind that a full distribution of inputs may influence the ideas and intuitions it provokes in subtle ways.

        + +

        Finally, one more pitch to explore the animation directly!

        + +

        4c Heads are different in interesting ways

        + +

        Before we move on, here’s one more demonstration of the usefulness of simply poking around a model to see how it works in detail.

        + +

        This is another attention head from GPT2. It behaves quite differently from layer 5, head 4 above - as one might expect, given that it’s in a very different part of the model. This head is in the very first layer: layer 0, head 2 (open in mm, may take a few seconds to load model weights):

        + +

        This is another attention head from GPT2

        + +

        Things to note:

        + +
          +
        • This head spreads attention very evenly. This has the effect of delivering a relatively unweighted average of V (or rather, the appropriate causal prefix of V) to each row in attn @ V, as can be seen in this animation: as we move down the attention score triangle, the attn[i] @ V vector-matrix product is small fluctuations away from being simply a downscaled, progressively revealed copy of V.
        • +
        • attn @ V has striking vertical uniformity - in large columnar regions of the embedding, the same value patterns persist over the entire sequence. One can think of these as properties shared by every token.
        • +
        • Aside: on the one hand one might expect some uniformity in attn @ V given the effect of very evenly spread attention. But each row has been constructed from only a causal subsequence of V rather than the whole thing - why is that not causing more variation, like a progressive morphing as one moves down the sequence? By visual inspection V isn’t uniform along its length, so the answer must lie in some more subtle property of its distribution of values.
        • +
• Finally, this head’s output is even more vertically uniform after out-projection - the strong impression being that the bulk of the information being delivered by this attention head consists of properties which are shared by every token in the sequence. The composition of its output projection weights reinforces this intuition.
        • +
        + +

        Overall, it’s hard to resist the idea that the extremely regular, highly structured information this attention head produces might be obtained by computational means that are a bit… less lavish. Of course this isn’t an unexplored area, but the specificity and richness of signal of the visualized computation has been useful in generating new ideas, and reasoning about existing ones.
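To make the “downscaled, progressively revealed copy of V” observation concrete, here’s a tiny sketch (random illustrative values only): if a head spread its causal attention perfectly evenly, attn @ V would reduce to a running mean of V’s rows, which a cumulative sum computes without any attention matrix at all.

```python
import torch

T, d = 256, 64
V = torch.randn(T, d)

# Perfectly even causal attention: row i weights positions 0..i by 1/(i+1).
attn = torch.tril(torch.ones(T, T)) / torch.arange(1, T + 1)[:, None]

# attn @ V is then just the running mean of V's rows: a downscaled,
# progressively revealed copy of V.
running_mean = torch.cumsum(V, dim=0) / torch.arange(1, T + 1)[:, None]
assert torch.allclose(attn @ V, running_mean, atol=1e-5)
```

The real head is of course only approximately this even - the sketch just shows how little machinery the limiting case requires.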

        + +

        4d Revisiting the pitch: invariants for free

        + +

        Stepping back, it’s worth reiterating that the reason we can visualize nontrivially compound operations like attention heads and have them remain intuitive is that important algebraic properties - like how argument shapes are constrained, or which parallelization axes intersect which operations - don’t require additional thinking: they arise directly from the geometry of the visualized object, rather than being additional rules to keep in mind.

        + +

        For example, in these attention head visualizations it’s immediately obvious that

        + +
          +
        • Q and attn @ V are the same length, K and V are the same length, and the lengths of these pairs are independent of each other
        • +
        • Q and K are the same width, V and attn @ V are the same width, and the widths of these pairs are independent of each other.
        • +
        + +

        These properties are true by construction, as a simple consequence of which parts of the compound structure the constituents inhabit and how they are oriented.
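Here’s a minimal sketch of the same invariants in PyTorch, with deliberately mismatched lengths and widths to show their independence (all names and sizes here are arbitrary):

```python
import torch

T_q, T_kv = 12, 20            # Q / attn@V length vs. K / V length: independent
d_qk, d_v = 64, 48            # Q / K width vs. V / attn@V width: independent

Q = torch.randn(T_q, d_qk)
K_t = torch.randn(d_qk, T_kv)
V = torch.randn(T_kv, d_v)

attn = (Q @ K_t).softmax(dim=-1)    # (T_q, T_kv)
out = attn @ V                      # (T_q, d_v): Q's length, V's width

assert out.shape == (Q.shape[0], V.shape[1])
```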

        + +

        This “properties for free” benefit can be especially useful when exploring variations on a canonical structure - an obvious example being the one-row-high attention matrix in autoregressive token-at-a-time decoding (open in mm):

        + +

        the one-row-high attention matrix in autoregressive token-at-a-time decoding

        + +

        5 Parallelizing attention

        + +

In the animation of layer 5, head 4 above, we visualize 4 of the 6 matmuls in the attention head

sdpa(input @ wQ @ K_t) @ V @ wO

        as a fused chain of vector-matrix products, confirming the geometric intuition that the entire left-associative chain from input to output is laminar along the shared i axis, and can be parallelized.

        + +

        5a Example: partitioning along i

        + +

        To parallelize the computation in practice, we would partition the input into blocks along the i axis. We can visualize this partition in the tool, by specifying that a given axis be partitioned into a particular number of blocks - in these examples we’ll use 8, but there’s nothing special about that number.
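As a rough sketch of what that partition means computationally (names reused from the earlier animation; causal masking elided for brevity), each i-block of the input can run the whole chain independently and the block outputs simply concatenate:

```python
import torch

def head_forward_partitioned_i(inp, wQ, K_t, V, wO, blocks=8):
    # Partition only the i (sequence) axis of the input; every block still needs
    # all of wQ, K_t, V and wO. (With causal masking, each block would mask using
    # its global row offsets and the concatenated result is unchanged.)
    outs = []
    for inp_blk in inp.chunk(blocks, dim=0):
        q = inp_blk @ wQ                                    # in-projection of this i-block
        attn = ((q @ K_t) / wQ.shape[1] ** 0.5).softmax(dim=-1)
        outs.append(attn @ V @ wO)                          # attention + out-projection
    return torch.cat(outs, dim=0)
```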

        + +

        Among other things, this visualization makes clear that wQ (for in-projection), K_t and V (for attention) and wO (for out-projection) are needed in their entirety by each parallel computation, since they’re adjacent to the partitioned matrices along those matrices’ unpartitioned dimensions (open in mm):

        + +

        wQ (for in-projection), K_t and V (for attention) and wO (for out-projection) are needed in their entirety by each parallel computation

        + +

        5b Example: double partitioning

        + +

As an example of partitioning along multiple axes, we can visualize some recent work which innovates in this space (Blockwise Parallel Transformer, building on work done in e.g. Flash Attention and its antecedents).

        + +

        First, BPT partitions along i as described above - and actually extends this horizontal partitioning of the sequence into chunks all the way through the second (FFN) half of the attention layer as well. (We’ll visualize this in a later section.)

        + +

        To fully attack the context length problem, a second partitioning is then added to MHA - that of the attention calculation itself (i.e., a partition along the j axis of Q @ K_t). The two partitions together divide attention into a grid of blocks (open in mm):

        + +

        The two partitions together divide attention into a grid of blocks

        + +

        This visualization makes clear

        + +
          +
        • the effectiveness of this double partitioning as an attack on the context length problem, since we’ve now visibly partitioned every occurrence of sequence length in the attention calculation
        • +
        • the “reach” of this second partitioning: it’s clear from the geometry that the in-projection computations of K and V can be partitioned along with the core double matmul
        • +
        + +

        Note one subtlety: the visual implication here is that we can also parallelize the subsequent matmul attn @ V along k and sum the partial results split-k style, thus parallelizing the entire double matmul. But the row-wise softmax in sdpa() adds the requirement that each row have all its segments normalized before the corresponding row of attn @ V can be computed, adding an extra row-wise step between the attention calculation and the final matmul.
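Here’s a rough single-row sketch of that extra step, in the spirit of Flash Attention-style blockwise softmax (a hypothetical helper, not the BPT authors’ code): each j-segment contributes an unnormalized partial output plus its own running max and exp-sum, and a final row-wise pass rescales and combines them.

```python
import torch

def attention_row_split_j(q, K, V, num_splits=8):
    # q: (d,), K: (T, d), V: (T, d_v); causal masking omitted for brevity.
    chunk = (K.shape[0] + num_splits - 1) // num_splits
    partial_out, partial_max, partial_sum = [], [], []
    for Kc, Vc in zip(K.split(chunk), V.split(chunk)):
        s = (Kc @ q) / K.shape[1] ** 0.5          # scores for this j-segment
        m = s.max()
        p = (s - m).exp()
        partial_out.append(p @ Vc)                # unnormalized partial attn @ V
        partial_max.append(m)
        partial_sum.append(p.sum())
    # The extra row-wise step: renormalize every segment against the global max.
    m = torch.stack(partial_max).max()
    scale = (torch.stack(partial_max) - m).exp()
    denom = (scale * torch.stack(partial_sum)).sum()
    return sum(w * o for w, o in zip(scale, partial_out)) / denom
```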

        + +

        6 Sizes in an Attention Layer

        + +

        The first (MHA) half of an attention layer is famously computationally demanding because of its quadratic complexity, but the second (FFN) half is demanding in its own right due to the width of its hidden dimension, typically 4 times that of the model’s embedding dimension. Visualizing the biomass of a full attention layer can be useful in building intuition about how the two halves of the layer compare to each other.

        + +

        6a Visualizing the full layer

        + +

        Below is a full attention layer with the first half (MHA) in the background and the second (FFN) in the foreground. As usual, arrows point in the direction of computation.

        + +

        Notes:

        + +
          +
        • This visualization doesn’t depict individual attention heads, but instead shows the unsliced Q/K/V weights and projections surrounding a central double matmul. Of course this isn’t a faithful visualization of the full MHA operation - but the goal here is to give a clearer sense of the relative matrix sizes in the two halves of the layer, rather than the relative amounts of computation each half performs. (Also, randomized values are used rather than real weights.)
        • +
        • The dimensions used here are downsized to keep the browser (relatively) happy, but the proportions are preserved (from NanoGPT’s small config): model embedding dimension = 192 (from 768), FFN embedding dimension = 768 (from 3072), sequence length = 256 (from 1024), although sequence length is not fundamental to the model. (Visually, changes in sequence length would appear as changes in the width of the input blades, and consequently in the size of the attention hub and the height of the downstream vertical planes.)
        • +
        + +

        Open in mm:

        + +

        a full attention layer with the first half (MHA) in the background and the second (FFN) in the foreground

        + +

        6b Visualizing the BPT partitioned layer

        + +

        Revisiting Blockwise Parallel Transformer briefly, here we visualize BPT’s parallelization scheme in the context of an entire attention layer (with individual heads elided per above). In particular, note how the partitioning along i (of sequence blocks) extends through both MHA and FFN halves (open in mm):

        + +

        visualize BPT's parallelization scheme in the context of an entire attention layer

        + +

        6c Partitioning the FFN

        + +

        The visualization suggests an additional partitioning, orthogonal to the ones described above - in the FFN half of the attention layer, splitting the double matmul (attn_out @ FFN_1) @ FFN_2, first along j for attn_out @ FFN_1, then along k in the subsequent matmul with FFN_2. This partition slices both layers of FFN weights, reducing the capacity requirements of each participant in the computation at the cost of a final summation of the partial results.
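A minimal sketch of that FFN partition (hypothetical names; the activation is omitted since we’re only talking about the double matmul): each participant holds one column slice of FFN_1 and the matching row slice of FFN_2, and the partial results sum to the unpartitioned answer.

```python
import torch

def ffn_double_matmul_partitioned(attn_out, FFN_1, FFN_2, parts=8):
    # attn_out: (T, d_model), FFN_1: (d_model, d_ffn), FFN_2: (d_ffn, d_model).
    partials = []
    for cols, rows in zip(FFN_1.chunk(parts, dim=1), FFN_2.chunk(parts, dim=0)):
        partials.append((attn_out @ cols) @ rows)   # j-slice, then matching k-slice
    return sum(partials)                            # final summation of partial results

# Sanity check against the unpartitioned double matmul:
x = torch.randn(256, 192)
W1, W2 = torch.randn(192, 768), torch.randn(768, 192)
assert torch.allclose(ffn_double_matmul_partitioned(x, W1, W2), x @ W1 @ W2, rtol=1e-4, atol=1e-4)
```

An elementwise activation between the two matmuls is compatible with the same split, since it acts independently on each hidden (j) column.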

        + +

        Here’s what this partition looks like applied to an otherwise unpartitioned attention layer (open in mm):

        + +

        what this partition looks like applied to an otherwise unpartitioned attention layer

        + +

        And here it is applied to a layer partitioned a la BPT (open in mm):

        + +

        applied to a layer partitioned a la BPT

        + +

        6d Visualizing token-at-a-time decoding

        + +

        During autoregressive token-at-a-time decoding, the query vector consists of a single token. It’s instructive to have a mental picture of what an attention layer looks like in that situation - a single embedding row working its way through an enormous tiled plane of weights.

        + +

Aside from emphasizing the sheer immensity of weights compared to activations, this view is also evocative of the notion that K_t and V function like dynamically generated layers in a 6-layer MLP, although the mux/demux computations of MHA itself (papered over here, per above) make the correspondence inexact (open in mm):

        + +

        the mux/demux computations of MHA itself

        + +

        7 LoRA

        + +

        The recent LoRA paper (LoRA: Low-Rank Adaptation of Large Language Models) describes an efficient finetuning technique based on the idea that weight deltas introduced during finetuning are low-rank. Per the paper, this “allows us to train some dense layers in a neural network indirectly by optimizing rank decomposition matrices of the dense layers’ change during adaptation […], while keeping the pre-trained weights frozen.”

        + +

        7a The basic idea

        + +

        In a nutshell, the key move is to train the factors of a weight matrix rather than the matrix itself: replace an I x J weights tensor with a matmul of an I x K tensor and a K x J tensor, holding K to some small number.

        + +

        If K is small enough the size win can be huge, but the tradeoff is that lowering it lowers the rank of what the product can express. As a quick illustration of both the size savings and the structuring effect on the result, here’s a matmul of random 128 x 4 left and 4 x 128 right arguments - a.k.a. a rank-4 factorization of a 128 x 128 matrix. Notice the vertical and horizontal patterning in L @ R (open in mm):
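Here’s the same illustration in a few lines of PyTorch (random values, as above): the factors hold 16x fewer parameters than the full matrix, and the product’s rank is capped at K.

```python
import torch

I = J = 128
K = 4                                   # the low rank

L, R = torch.randn(I, K), torch.randn(K, J)
W = L @ R                               # a rank-4 128 x 128 matrix

print(L.numel() + R.numel())            # 1024 parameters in the factors...
print(W.numel())                        # ...versus 16384 in the full matrix
print(torch.linalg.matrix_rank(W))      # 4: the product can't express more
```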

        + +

        a matmul of random 128 x 4 left and 4 x 128 right arguments

        + +

        7b Applying LoRA to an attention head

        + +

        The way LoRA applies this factoring move to the fine tuning process is to

        + +
          +
        • create a low-rank factorization for each weight tensor to be fine-tuned and train the factors, keeping the original weights frozen
        • +
        • after fine tuning, multiply each pair of low-rank factors to get a matrix in the shape of the original weights tensor, and add it to the original pretrained weights tensor
        • +
        + +

        The following visualization shows an attention head with the weight tensors wQ, wK_t, wV, wO replaced by low rank factorizations wQ_A @ wQ_B, etc. Visually, the factor matrices show up as low fences along the edges of the windmill blades (open in mm - spacebar stops the spin):
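In code, the move looks roughly like this (a sketch, not the LoRA reference implementation; names like wQ_A/wQ_B mirror the visualization above, and the paper’s scaling factor is omitted): during finetuning the frozen weight and the trainable factors are applied side by side, and afterwards the factors are folded back into a dense matrix of the original shape.

```python
import torch

d_model, d_head, r = 768, 64, 4

wQ = torch.randn(d_model, d_head)        # frozen pretrained weight
wQ_A = torch.randn(d_model, r) * 0.01    # trainable factor
wQ_B = torch.zeros(r, d_head)            # zero-init factor, so the initial delta is 0

def q_proj(x):
    # Finetuning-time forward: frozen path plus low-rank update path.
    return x @ wQ + (x @ wQ_A) @ wQ_B

# After finetuning: multiply the factors and add them to the frozen weight.
wQ_merged = wQ + wQ_A @ wQ_B
```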

        + +

        + +

        + +

        8 Wrapup

        + +

        8a Call for feedback

        + +

        I’ve found this way of visualizing matmul expressions extremely helpful for building intuition and reasoning about not just matrix multiplication itself, but also many aspects of ML models and their computation, from efficiency to interpretability.

        + +

If you try it out and have suggestions or comments, I definitely want to hear them, either in the comments here or in the repo.

        + +

        8b Next steps

        + +
          +
        • There’s a GPT2 attention head explorer built on top of the tool which I’m currently using to inventory and classify the attention head traits found in that model. (This was the tool I used to find and explore the attention heads in this note.) Once complete I plan to post a note with the inventory.
        • +
        • As mentioned up top, embedding these visualizations in Python notebooks is dead simple. But session URLs can get… unwieldy, so it will be useful to have Python-side utilities for constructing them from configuration objects, similar to the simple JavaScript helpers used in the reference guide.
        • +
• If you’ve got a use case you think might benefit from visualizations like this but it’s not obvious how to use the tool to do it, get in touch! I’m not necessarily looking to expand its core visualization capabilities that much further (right tool for the job, etc.), but e.g. the API for driving it programmatically is pretty basic; there’s plenty that can be done there.
        • +
        + +

diff --git a/blog/int4-decoding/index.html b/blog/int4-decoding/index.html new file mode 100644 index 000000000000..8423a67792ad --- /dev/null +++ b/blog/int4-decoding/index.html @@ -0,0 +1,4028 @@

INT4 Decoding GQA CUDA Optimizations for LLM Inference | PyTorch

        + by + + Sarunya Pumma, Jongsoo Park, Jianyu Huang, Amy Yang, Jaewon Lee, Daniel Haziza, Grigory Sizov, Jeremy Reizenstein, Jeff Johnson, Ying Zhang + +

        +

        An efficient decoding Grouped-Query Attention with low-precision KV cache

        + +

        Introduction

        + +

        Generative AI has taken the world by storm with its ability to generate content like humans. Many of these generative AI tools are powered by large language models (LLMs), like Meta Llama models and OpenAI’s ChatGPT. One of the main challenges of LLMs is supporting large “context lengths” (also known as “sequence lengths”). The context length refers to the number of tokens that the model uses to understand the input context and generate responses. Longer context lengths generally translate into higher precision and quality in the responses. However, long context lengths are compute and memory intensive. This is mainly due to the following reasons:

        + +
          +
        • The computational complexity of attention layers increases proportionally with the context length (the growth rate depends on the attention algorithm). As a result, when using long context lengths, the attention layers can become a bottleneck, particularly during the prefill phase where attentions are compute bound.
        • +
        • The KV cache size grows linearly with the context length, thus, putting higher pressure on the memory requirement and consequently slowing down the already memory-bound attention decoding. Moreover, since the memory capacity is limited, the batch size reduces when the KV cache gets bigger, which generally results in a drop in throughput.
        • +
        + +

The computational complexity growth is difficult to solve compared to the other problem mentioned above. One way to address the KV cache size growth problem is to use low precision KV cache. From our experiments, group-wise INT4 quantization provides comparable results in terms of accuracy compared to BF16 KV cache during the decode phase in Meta Llama 2 inference. However, we did not observe any latency improvement, despite reading 4x less data in attention decoding layers. This means that the INT4 attention is 4x less efficient at utilizing precious HBM bandwidth than BF16 attention.

        + +

        In this note, we discuss the CUDA optimizations that we applied to INT4 GQA (grouped-query attention – the attention layer that we use in the LLM inference phase) to improve its performance by up to 1.8x on the NVIDIA A100 GPU and 1.9x on the NVIDIA H100 GPU.

        + +
          +
        • The optimized CUDA INT4 GQA outperformed INT4 Flash-Decoding GQA (the best performing INT4 GQA that we used in the experiment mentioned above) by 1.4x-1.7x on A100 and 1.09x-1.3x on H100.
        • +
        • The optimized CUDA INT4 GQA performs better than BF16 Flash-Decoding GQA by 1.5x-1.7x on A100 and 1.4x-1.7x on H100.
        • +
        + +

        Background

        + +

        GQA for LLM Inference

        + +

        Grouped-Query Attention (GQA) is a variant of multi-head attention (MHA) where each KV cache head is shared across a group of query heads. Our LLM inference adopts GQA as an attention layer in both the prefill and decode phases in order to reduce the capacity requirement for the KV cache. We use multiple GPUs in inference where the KV cache and query heads are distributed across GPUs. Each GPU runs an attention layer with a single KV head and a group of Q heads. Therefore, when viewed from a single GPU perspective, the GQA component can also be described as MQA (Multi-Query Attention).

        + +

        The simplified workflow of decoding GQA is illustrated in Figure 1. GQA takes three main inputs: input query (denoted Q), K cache (denoted K), and V cache (denoted V). Our current GQA inference uses BF16 for Q, K, and V.

        + +
          +
        • Q is a 4D BF16 tensor of shape (B, 1, HQ, D)
        • +
        • K is a 4D BF16 tensor of shape (B, Tmax, HKV, D)
        • +
        • V is a 4D BF16 tensor of shape (B, Tmax, HKV, D)
        • +
        + +

        where

        + +
          +
        • B is the batch size (the number of input prompts)
        • +
        • HQ is the number of query heads
        • +
        • HKV is the number of KV heads (HQ must be divisible by HKV)
        • +
        • Tmax is the maximum context length
        • +
        • D is the head dimension (fixed to 128)
        • +
        + +

GQA is simply bmm(softmax(bmm(Q, KT) / sqrt(D)), V). This yields a single output tensor (denoted as O), a 4D BF16 tensor with the same shape as Q. Note that the matrix multiplications are performed in BF16; accumulation and softmax, however, are carried out in FP32. We call this “BF16 GQA” as the KV cache is BF16.
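As a reference point, here is a minimal PyTorch sketch of that computation with the shapes above (random inputs, and plain FP32 math rather than the BF16-matmul/FP32-accumulation mix of the production kernel):

```python
import torch

B, HQ, HKV, D, Tmax = 2, 8, 1, 128, 4096
G = HQ // HKV                                        # query heads per KV head

Q = torch.randn(B, 1, HQ, D, dtype=torch.bfloat16)
K = torch.randn(B, Tmax, HKV, D, dtype=torch.bfloat16)
V = torch.randn(B, Tmax, HKV, D, dtype=torch.bfloat16)

# Expand each KV head across its group of query heads, then decode as ordinary MHA.
k = K.repeat_interleave(G, dim=2).permute(0, 2, 3, 1).float()   # (B, HQ, D, Tmax)
v = V.repeat_interleave(G, dim=2).permute(0, 2, 1, 3).float()   # (B, HQ, Tmax, D)
q = Q.permute(0, 2, 1, 3).float()                               # (B, HQ, 1, D)

P = torch.softmax(q @ k / D ** 0.5, dim=-1)                     # softmax in FP32
O = (P @ v).to(torch.bfloat16).permute(0, 2, 1, 3)              # (B, 1, HQ, D): same shape as Q
```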

        + +

        Figure 1: The simplified workflow of BF16 GQA for LLM inference

        + +

        Figure 1 The simplified workflow of BF16 GQA for LLM inference

        + +

        INT4 GQA

        + +

        To further reduce the size of the KV cache, we explore the possibility of using INT4 for KV cache instead of BF16. We estimate the potential performance improvement by calculating the computational intensity (CI) of INT4 GQA and comparing it to that of BF16 GQA, as CI represents FLOPS per byte. We compute the CI for QKT and PV (as shown in Equation 1) as they take KV cache as an operand. Note that we disregard the Q load as it is negligible compared to the KV cache. We also ignore any intermediate data loads/stores that are not on global memory. Thus, the CI only takes into account the computation FLOPS and KV cache loads.

        + +

        Equation 1

        + +

        Equation (1)

        + +

        Assuming that HQ = 8 and HKV = 1, CI for BF16 KV cache is 8 while CI for INT4 KV cache is 32. The CIs indicate that both BF16 and INT4 GQAs are memory bound (the peak CIs for BF16 tensor cores for A100 and H100 are 312 TF / 2 TB/s = 141 and 990 TF / 3.35 TB/s = 269; note that these TF numbers are without sparsity). Moreover, with INT4 KV cache, we should expect up to 4x performance improvement compared to BF16 GQA.
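The arithmetic behind those CI numbers is simple enough to write down, counting only the QKT and PV FLOPs and only the KV cache bytes, per the assumptions above:

```python
HQ, HKV, D = 8, 1, 128

flops_per_cached_token = 2 * HQ * D + 2 * HQ * D     # QK^T plus PV
kv_bytes_bf16 = 2 * HKV * D * 2                      # K and V rows, 2 bytes per element
kv_bytes_int4 = 2 * HKV * D * 0.5                    # 0.5 bytes per element (scales ignored)

print(flops_per_cached_token / kv_bytes_bf16)        # 8.0  -> CI of BF16 GQA
print(flops_per_cached_token / kv_bytes_int4)        # 32.0 -> CI of INT4 GQA
```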

        + +

        To enable INT4 KV cache support in GQA, we can dequantize the KV cache from INT4 to BF16 before passing it to the BF16 GQA operator. However, since KV cache is typically large, copying it from/to global memory can be costly. Moreover, decoding GQA is a memory bound operation (the memory unit is utilized much more heavily than the compute unit). Figure 2 shows the NCU profile of the FMHA CUTLASS BF16 GQA kernel in xFormers, which is one of the state of the art implementations of GQA. From the figure, it is obvious that memory is a bottleneck.

        + +

        Figure 2: The NCU profile of the FMHA CUTLASS BF16 kernel in xFormers

        + +

        Figure 2 The NCU profile of the FMHA CUTLASS BF16 kernel in xFormers

        + +

        A more efficient alternative is to fuse INT4 dequantization with the GQA operation (shown in Figure 3). In other words, having GQA read INT4 KV cache directly and perform the INT4 to BF16 conversion within the kernel. This change can potentially reduce the amount of global memory reads required for the KV cache, which could lead to a decrease in latency. We call this “INT4 GQA.”

        + +

        Figure 3: The workflow of fused INT4 GQA

        + +

        Figure 3 The workflow of fused INT4 GQA

        + +

Table 1 lists the state of the art implementations of GQA along with their features.

        + +

        Table 1 State of the art GQA implementations

| Implementation | Denoted as | BF16 GQA | Fused INT4 GQA |
|---|---|---|---|
| Flash-Decoding (Triton implementation) | FD | Yes | Yes |
| Flash Attention (v2.3.3) | FA | Yes | No |
| CUDA baseline | CU | Yes | Yes |
        + +

        All implementations, except for CU, support both split-K and non split-K. CU only has the split-K implementation. Only FA has a heuristic in the backend to determine whether to run the split-K or non split-K kernel. For other implementations, users must explicitly choose which version to run. In this note, we focus on long context lengths (in our experiments, we use a context length of 8192) and therefore opt for the split-K version wherever possible.

        + +

        As the baseline, we measured the performance of the state of the art GQA implementations on NVIDIA A100 and H100 GPUs. The latency (time in microseconds) and achieved bandwidth (GB/s) are reported in Table 2. Note that we ran a range of split-Ks (from 2 to 128 splits) and reported the best performance for each implementation. For all experiments, we use a context length of 8192. For INT4 GQA, we used row-wise quantization (i.e., num quantized groups = 1).

        + +

        Table 2 Baseline GQA performance

        + +

        On A100

Time (us)

| Batch size | FD (BF16) | FA (BF16) | CU (BF16) | FD (INT4) | FA (INT4) | CU (INT4) |
|---|---|---|---|---|---|---|
| 32 | 139 | 133 | 183 | 137 | - | 143 |
| 64 | 245 | 229 | 335 | 234 | - | 257 |
| 128 | 433 | 555 | 596 | 432 | - | 455 |
| 256 | 826 | 977 | 1127 | 815 | - | 866 |
| 512 | 1607 | 1670 | 2194 | 1581 | - | 1659 |

Effective Bandwidth (GB/s)

| Batch size | FD (BF16) | FA (BF16) | CU (BF16) | FD (INT4) | FA (INT4) | CU (INT4) |
|---|---|---|---|---|---|---|
| 32 | 965 | 1012 | 736 | 262 | - | 250 |
| 64 | 1097 | 1175 | 802 | 305 | - | 278 |
| 128 | 1240 | 968 | 901 | 331 | - | 314 |
| 256 | 1301 | 1100 | 954 | 351 | - | 331 |
| 512 | 1338 | 1287 | 980 | 362 | - | 345 |
        + +

        On H100

Time (us)

| Batch size | FD (BF16) | FA (BF16) | CU (BF16) | FD (INT4) | FA (INT4) | CU (INT4) |
|---|---|---|---|---|---|---|
| 32 | 91 | 90 | 114 | 70 | - | 96 |
| 64 | 148 | 146 | 200 | 113 | - | 162 |
| 128 | 271 | 298 | 361 | 205 | - | 294 |
| 256 | 515 | 499 | 658 | 389 | - | 558 |
| 512 | 1000 | 1011 | 1260 | 756 | - | 1066 |

Effective Bandwidth (GB/s)

| Batch size | FD (BF16) | FA (BF16) | CU (BF16) | FD (INT4) | FA (INT4) | CU (INT4) |
|---|---|---|---|---|---|---|
| 32 | 1481 | 1496 | 1178 | 511 | - | 371 |
| 64 | 1815 | 1840 | 1345 | 631 | - | 443 |
| 128 | 1982 | 1802 | 1487 | 699 | - | 487 |
| 256 | 2087 | 2156 | 1634 | 736 | - | 513 |
| 512 | 2150 | 2127 | 1706 | 757 | - | 537 |
        + +

        First, let’s discuss the BF16 GQA performance: CU ranks last in terms of performance among all implementations. FD and FA have comparable performance. When the batch size is less than or equal to 64, FA utilizes the split-K kernel and performs slightly better than FD. However, when the batch size is greater than 64, FD performs better.

        + +

        The same trend holds true for INT4 GQAs. However, we did not measure the performance of FA as it does not support INT4 KV cache. FD outperforms CU for all cases.

        + +

        When comparing the latencies of FD between BF16 and INT4 GQAs, we find that they are almost identical. This suggests that INT4 GQA is highly inefficient, which can be further confirmed by the significantly lower achievable bandwidth for INT4 GQA compared to BF16 GQA. The same trend is also true when looking at the performance of CU.

        + +

        CUDA with Tensor Cores INT4 GQA Implementation

        + +

In this section, we briefly describe our baseline implementation, CUDA with tensor cores INT4 GQA (CU). Each thread block processes only one KV head and a group of query heads from one input prompt. Therefore, each thread block performs mm(softmax(mm(Q, KT) / sqrt(D)), V); note that an mm, not a bmm, is performed. Moreover, since this is a split-K implementation, tokens in the KV cache are split among different thread blocks. Note that each thread block contains 4 warps (each warp contains 32 threads for NVIDIA A100 and H100 GPUs). Work in each thread block is split among the warps. Within each warp, we use the WMMA API to compute matrix multiplication on tensor cores. Figure 4 demonstrates the work partitioning in CU.

        + +

        Figure 4: CU work partitioning

        + +

        Figure 4 CU work partitioning

        + +

        Optimizing CUDA with Tensor Cores Kernel of INT4 GQA

        + +

        In this note, we discuss the optimizations that we have applied to the CUDA with tensor cores implementation of INT4 GQA (CU). The ideal goal is to improve the INT4 GQA performance by 4 times based on the CI analysis in the previous section. Note that the query size is negligible compared to the KV cache size when the context length is long.

        + +

In our analysis, we used NVIDIA Nsight Compute (NCU) as the main profiler. Our general bottleneck elimination approach is to minimize the stall cycles. We applied 10 optimizations to INT4 GQA, three of which are specific to NVIDIA A100/H100 GPUs. These optimizations are well known CUDA optimization techniques which can be generalized to many applications.

        + +

It is worth noting that the reason we chose to optimize the CUDA implementation rather than the Flash-Decoding implementation (FD, which is Triton based) is that with CUDA we have better control over how the low-level instructions are generated. Many of the optimization techniques that we apply, such as operating on tensor core fragments directly (Optimizations 7-9), cannot be done through Triton since it does not expose low-level details to developers. However, these optimizations can be integrated into the compiler-based solution to make them available to broader operators, which is indeed a part of our future plan.

        + +

        Optimization 1: Unroll K Loads

        + +

        Problem Analysis:

        + +

The NCU profile shows that during K loading, there are only 2 global loads followed by memory stalls at dequantize_permuted_int4. The memory stalls are long scoreboard stalls, which indicate waits for global memory access. This suggests that the kernel does not issue sufficient memory loads to hide the global load latency: the kernel issues data loads and then immediately waits to consume the data, causing the global load latency to be exposed. The stalls are shown in Figure 5.

        + +

        Figure 5: K loading before unrolling

        + +

        Figure 5 K loading before unrolling (the numbers that the arrows point to are stall cycles caused by global memory wait)

        + +

        Solution:

        + +

        In the baseline implementation, we use uint32_t to load 8 INT4 K values in a single load and we perform 2 uint32_t loads in each iteration, which is 16 INT4 K values. To allow for a better global load latency hiding, we issue 8 uint32_t loads instead of two before consuming the K values in dequantize_permuted_int4. This allows the compiler to unroll the loads as well as reorder the instructions to hide the global load latency better. Figure 6 shows the NCU profile of K loading after unrolling. Comparing Figure 5 and Figure 6, we effectively reduce the stall cycles by unrolling the K loads.

        + +

        Figure 6: K loading after unrolling

        + +

        Figure 6 K loading after unrolling (the numbers that the arrows point to are stall cycles caused by global memory wait)

        + +

        Results:

        + +

        Table 3 Performance of Optimization 1 for INT4 GQA (row-wise quantization)

        + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
        Batch size + Time (us) + Bandwidth (GB/s) + Speed up +
        FD + CU + FD + CU + vs FD + vs CU baseline +
        Baseline + Opt 1 + Baseline + Opt 1 +
        32 + 137 + 143 + 134 + 262 + 250 + 267 + 1.02 + 1.07 +
        64 + 234 + 257 + 237 + 305 + 278 + 302 + 0.99 + 1.09 +
        128 + 432 + 455 + 422 + 331 + 314 + 339 + 1.02 + 1.08 +
        256 + 815 + 866 + 806 + 351 + 331 + 355 + 1.01 + 1.07 +
        512 + 1581 + 1659 + 1550 + 362 + 345 + 369 + 1.02 + 1.07 +
        + +

        Optimization 2: Improve P Type Casting (FP32->BF16)

        + +

        Problem Analysis:

        + +

Since the product of softmax(bmm(Q, KT) / sqrt(D)) is FP32 (denoted as P in Figure 3), the kernel has to convert P from FP32 to BF16 before feeding it to the next bmm computation. The kernel performs the FP32 to BF16 conversion of P by copying the FP32 data from one location in shared memory to another location in shared memory. This causes stalls during the shared memory access (shown in Figure 7), which might be caused by (1) the shared memory indirection; and (2) shared memory bank conflicts, since each thread accesses a 16-bit element (because of this, two threads can access the same memory bank simultaneously).

        + +

        Figure 7: P type casting before Optimization 2

        + +

        Figure 7 P type casting before Optimization 2 (the number that the arrow points to is stall cycles caused by shared memory wait)

        + +

        Solution:

        + +

        We use all threads in the thread block to do in-place type conversion. Each thread operates on two consecutive elements in order to avoid the shared memory bank conflict when storing BF16. All threads work on the same head (h) at the same time to guarantee correctness of the conversion. The in-place conversion steps are as follows:

        + +
          +
        1. Each thread loads 2 FP32 token elements from the same head from the shared memory into registers
        2. +
        3. Call __syncthreads() to make sure that every thread finishes reading the data
        4. +
        5. Each thread converts its data to 2 BF16 token elements and then stores the results to the same shared memory
        6. +
        + +

        Some optimizations that we apply to the implementation:

        + +
          +
        • Use vector types (especially nv_bfloat2)
        • +
        • Unroll data loading/storing, i.e., performing multiple loads before calling __syncthreads() and performing multiple stores after __syncthreads()
        • +
        + +

        After this optimization, long stalls are not observed during P type casting as shown in Figure 8.

        + +

        Figure 8: P type casting after Optimization 2

        + +

        Figure 8 P type casting after Optimization 2 (the numbers that the arrow points to are stall cycles caused by shared memory wait)

        + +

        Culprits:

        + +

Since we unroll data loading/storing by using registers as intermediate storage, the number of registers per thread increases, resulting in reduced occupancy.

        + +

        Results:

        + +

        Table 4 Performance of Optimization 2 for INT4 GQA (row-wise quantization)

        + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
        Batch size + Time (us) + Bandwidth (GB/s) + Speed up +
        FD + CU + FD + CU + vs FD + vs CU baseline +
        Baseline + Opt 2 + Baseline + Opt 2 +
        32 + 137 + 143 + 126 + 262 + 250 + 285 + 1.09 + 1.14 +
        64 + 234 + 257 + 221 + 305 + 278 + 324 + 1.06 + 1.16 +
        128 + 432 + 455 + 395 + 331 + 314 + 362 + 1.09 + 1.15 +
        256 + 815 + 866 + 749 + 351 + 331 + 382 + 1.09 + 1.16 +
        512 + 1581 + 1659 + 1435 + 362 + 345 + 399 + 1.10 + 1.16 +
        + +

        Optimization 3: Remove Local Memory Usage for max QKT computation

        + +

        Problem Analysis:

        + +

        During the softmax computation, the kernel has to compute max QKT for each head. It uses a temporary “thread-local” storage for storing per-thread max QKT results (one float value for each head). Depending on the compiler, the thread-local storage can be allocated on registers (on chip) or the local memory (off chip == global memory). Unfortunately, in the baseline, the thread-local storage resides in the local memory which is much slower than the registers (shown in Figure 9). We suspect that this is because the compiler cannot determine the indices of thread-local storage at compile time (since the number of heads (H) in the kernel is a runtime variable). Accessing local memory as if accessing registers can hurt the performance of the kernel.

        + +

        Figure 9: Local memory access during max QKT computation

        + +

        Figure 9 Local memory access during max QKT computation

        + +

        Solution:

        + +

        We realize that we do not need H (number of heads) floats as temporary storage per thread since each thread can compute max QKT for only one head instead of all the heads. Thus, we only need one float per thread, which can be easily stored in a register. To accumulate the max results among warps, we use shared memory. This optimization eliminates the local memory usage during max QKT computation.

        + +

        Results:

        + +

        Table 5 Performance of Optimization 3 for INT4 GQA (row-wise quantization)

        + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
        Batch size + Time (us) + Bandwidth (GB/s) + Speed up +
        FD + CU + FD + CU + vs FD + vs CU baseline +
        Baseline + Opt 3 + Baseline + Opt 3 +
        32 + 137 + 143 + 119 + 262 + 250 + 300 + 1.14 + 1.20 +
        64 + 234 + 257 + 206 + 305 + 278 + 348 + 1.14 + 1.25 +
        128 + 432 + 455 + 368 + 331 + 314 + 389 + 1.17 + 1.24 +
        256 + 815 + 866 + 696 + 351 + 331 + 411 + 1.17 + 1.24 +
        512 + 1581 + 1659 + 1338 + 362 + 345 + 428 + 1.18 + 1.24 +
        + +

        Optimization 4: Remove local memory usage for row sum

        + +

        Problem Analysis:

        + +

        Similar to Optimization 3, the local memory usage problem is also observed during the row sum computation in the softmax computation. Since local memory is off chip, accessing it as if accessing registers can hurt the performance of the kernel.

        + +

        Solution:

        + +

        We apply the same solution as the max QKT computation for the row sum computation. That is to have each thread compute a row sum of only one head, which requires only one float per thread. This eliminates the need for local memory.

        + +

        Results:

        + +

        Table 6 Performance of Optimization 4 for INT4 GQA (row-wise quantization)

        + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
        Batch size + Time (us) + Bandwidth (GB/s) + Speed up +
        FD + CU + FD + CU + vs FD + vs CU baseline +
        Baseline + Opt 4 + Baseline + Opt 4 +
        32 + 137 + 143 + 118 + 262 + 250 + 302 + 1.15 + 1.21 +
        64 + 234 + 257 + 204 + 305 + 278 + 351 + 1.15 + 1.26 +
        128 + 432 + 455 + 364 + 331 + 314 + 393 + 1.19 + 1.25 +
        256 + 815 + 866 + 688 + 351 + 331 + 416 + 1.18 + 1.26 +
        512 + 1581 + 1659 + 1328 + 362 + 345 + 431 + 1.19 + 1.25 +
        + +

        Optimization 5: Add prefetch for V load

        + +

        Problem Analysis:

        + +

The same issue as K loading is observed when loading V: the kernel issues data loads and then immediately waits to consume the data, causing the global load latency to be exposed. However, when using the unrolling technique mentioned above, the compiler allocates the temporary buffer on local memory instead of registers, causing a large slowdown.

        + +

        Solution:

        + +

We adopt the data prefetching technique for V loading. We load the next iteration’s V values immediately after the current iteration’s values are consumed. This allows the data loading to be overlapped with the PK computation, resulting in better kernel performance.

        + +

        Results:

        + +

        Table 7 Performance of Optimization 5 for INT4 GQA (row-wise quantization)

        + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
        Batch size + Time (us) + Bandwidth (GB/s) + Speed up +
        FD + CU + FD + CU + vs FD + vs CU baseline +
        Baseline + Opt 5 + Baseline + Opt 5 +
        32 + 137 + 143 + 109 + 262 + 250 + 327 + 1.25 + 1.31 +
        64 + 234 + 257 + 194 + 305 + 278 + 370 + 1.21 + 1.33 +
        128 + 432 + 455 + 345 + 331 + 314 + 414 + 1.25 + 1.32 +
        256 + 815 + 866 + 649 + 351 + 331 + 441 + 1.26 + 1.33 +
        512 + 1581 + 1659 + 1244 + 362 + 345 + 460 + 1.27 + 1.33 +
        + +

        Optimization 6: Add Group-Wise INT4 (Groups = 4) with Vector Load

        + +

        Problem Analysis:

        + +

Prior to this optimization, CU only supported row-wise INT4 quantization. That is, every column in each row shares the same scales. The scales of each row are stored in the first 4 bytes of each row as shown in Figure 10. In the kernel, each thread loads only one row at a time. Since each row contains 68 bytes (4 bytes for scales and 64 bytes for data), we cannot guarantee that every row is aligned to the size of any vector type. Thus, vector loads cannot be used for loading the KV cache.

        + +

        Figure 10: The layout of each row of INT4 KV cache with row-wise quantization

        + +

        Figure 10 The layout of each row of INT4 KV cache with row-wise quantization

        + +

        Solution:

        + +

        We have implemented support for group-wise INT4 quantization with num groups = 4. In this case, columns in each row in the KV cache tensor are divided into 4 equal groups. Columns within the same group share the same scales for quantization/dequantization. The data layout for INT4 KV cache is shown in Figure 11. The scales for all groups are serialized and stored at the beginning of each row. The INT4 data is also serialized and laid out next to the scales.

        + +

        Because the number of bytes in each row now becomes 80 bytes, we can use a vector type, i.e., uint2 in our case, to load data. (We do not use uint4 since each thread loads only 16 INT4s at a time due to the tensor core fragment size.) Vector load is generally better than scalar load since it does not cause extra byte loads.
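To make the layout arithmetic concrete, here is a rough Python sketch of group-wise INT4 quantization for a single 128-element K (or V) row, assuming each group stores a 2-byte scale and a 2-byte minimum (4 bytes per group, hence 16 + 64 = 80 bytes per row). This is only an illustration of the format described above, not the kernel’s actual packing code.

```python
import torch

D, GROUPS = 128, 4
GROUP_SIZE = D // GROUPS                 # 32 values per group

def quantize_row_groupwise(row):
    groups = row.float().view(GROUPS, GROUP_SIZE)
    lo = groups.min(dim=1).values
    scale = (groups.max(dim=1).values - lo).clamp(min=1e-8) / 15.0   # INT4 range 0..15
    q = ((groups - lo[:, None]) / scale[:, None]).round().clamp(0, 15).to(torch.uint8)
    packed = q[:, 0::2] | (q[:, 1::2] << 4)          # two INT4 values per byte -> 64 bytes
    return scale, lo, packed                         # 4 x (2B scale + 2B min) + 64B data = 80B

def dequantize_row_groupwise(scale, lo, packed):
    q = torch.stack([packed & 0xF, packed >> 4], dim=-1).reshape(GROUPS, GROUP_SIZE).float()
    return (q * scale[:, None] + lo[:, None]).reshape(D).to(torch.bfloat16)
```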

        + +

Figure 11: The layout of each row of INT4 KV cache with group-wise quantization (num groups = 4)

        + +

Figure 11 The layout of each row of INT4 KV cache with group-wise quantization (num groups = 4)

        + +

        Results:

        + +

        Table 8 Performance of Optimization 6 for INT4 GQA (row-wise quantization)

        + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
        Batch size + Time (us) + Bandwidth (GB/s) + Speed up +
        FD + CU + FD + CU + vs FD + vs CU baseline +
        Baseline + Opt 6 + Baseline + Opt 6 +
        32 + 137 + 143 + 111 + 262 + 250 + 322 + 1.23 + 1.29 +
        64 + 234 + 257 + 192 + 305 + 278 + 372 + 1.22 + 1.34 +
        128 + 432 + 455 + 346 + 331 + 314 + 414 + 1.25 + 1.32 +
        256 + 815 + 866 + 642 + 351 + 331 + 446 + 1.27 + 1.35 +
        512 + 1581 + 1659 + 1244 + 362 + 345 + 460 + 1.27 + 1.33 +
        + +

        Table 9 Performance of Optimization 6 for INT4 GQA (group-wise quantization with num groups = 4)

        + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
        Batch size + Time (us) + Bandwidth (GB/s) + Speed up +
        FD + CUDA_WMMA + FD + CUDA_WMMA + vs FD +
        Opt 6 + Opt 6 +
        32 + 129 + 116 + 325 + 364 + 1.31 +
        64 + 219 + 195 + 385 + 431 + 1.36 +
        128 + 392 + 347 + 429 + 484 + 1.39 +
        256 + 719 + 638 + 468 + 527 + 1.41 +
        512 + 1375 + 1225 + 489 + 550 + 1.43 +
        + +

        Optimization 7: Compute max QKT From WMMA Fragment Directly (A100/H100 specific)

        + +

        Problem Analysis:

        + +

        We observe large stalls due to shared memory accessing during the max QKT computation (showing as large short scoreboard stalls) as shown in Figure 12.

        + +

        Figure 12: Stalls due to shared memory access during max QKT computation

        + +

        Figure 12 Stalls due to shared memory access during max QKT computation (the number that the arrow points to is stall cycles caused by shared memory wait)

        + +

        Solution:

        + +

We bypass shared memory when computing max QKT by computing it from the WMMA fragment (i.e., the tensor core fragment) directly. The layout of the WMMA fragment is specific to the GPU architecture, so we enable this optimization only for NVIDIA A100/H100 GPUs. Other GPUs will still use shared memory for the max QKT computation. By bypassing shared memory, we effectively eliminate the stalls caused by shared memory access. The tensor core layout of the C fragment, which is used for storing the QKT results, is shown in Figure 13.

        + +

        Figure 13: C fragment (QKT storage) tensor core layout on A100/H100

        + +

        Figure 13 C fragment (QKT storage) tensor core layout on A100/H100

        + +

        Table 10 Performance of Optimization 7 for INT4 GQA (row-wise quantization)

        + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
        Batch size + Time (us) + Bandwidth (GB/s) + Speed up +
        FD + CU + FD + CU + vs FD + vs CU baseline +
        Baseline + Opt 7 + Baseline + Opt 7 +
        32 + 137 + 143 + 107 + 262 + 250 + 333 + 1.27 + 1.33 +
        64 + 234 + 257 + 183 + 305 + 278 + 391 + 1.28 + 1.40 +
        128 + 432 + 455 + 333 + 331 + 314 + 430 + 1.30 + 1.37 +
        256 + 815 + 866 + 620 + 351 + 331 + 461 + 1.31 + 1.40 +
        512 + 1581 + 1659 + 1206 + 362 + 345 + 475 + 1.31 + 1.38 +
        + +

        Table 11 Performance of Optimization 7 for INT4 GQA (group-wise quantization with num groups = 4)

        + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
        Batch size + Time (us) + Bandwidth (GB/s) + Speed up +
        FD + CUDA_WMMA + FD + CUDA_WMMA + vs FD + vs CUDA_WMMA Opt 6 +
        Opt 6 + Opt 7 + Opt 6 + Opt 7 +
        32 + 129 + 116 + 111 + 325 + 364 + 380 + 1.17 + 1.04 +
        64 + 219 + 195 + 187 + 385 + 431 + 449 + 1.17 + 1.04 +
        128 + 392 + 347 + 333 + 429 + 484 + 506 + 1.18 + 1.04 +
        256 + 719 + 638 + 615 + 468 + 527 + 547 + 1.17 + 1.04 +
        512 + 1375 + 1225 + 1184 + 489 + 550 + 569 + 1.16 + 1.03 +
        + +

        Optimization 8: Write FP32->BF16 Results to P Fragment Directly (A100/H100 specific)

        + +

        Problem Analysis:

        + +

        During the FP32-BF16 conversion for the P fragment, the kernel loads the FP32 data from shared memory, does the conversion and then stores the BF16 data back to shared memory. Moreover, the conversion requires many thread block synchronizations (__syncthreads()).

        + +

        Solution:

        + +

        Due to the data partitioning design of the kernel, each warp performs only one pass through the P fragment. Thus, we do not have to write the conversion results back to the shared memory for future usage. To avoid writing the BF16 data to the shared memory and thread block synchronizations, we have each warp load the FP32 data of the P WMMA fragment from the shared memory, do the conversion and then write the BF16 data directly to the P fragment.

        + +

Note that this optimization is applied to only the NVIDIA A100 and H100 GPUs because the WMMA fragment layout is architecture dependent. For non-A100/H100 GPUs, the kernel will fall back to the original path.

        + +

        The P fragment tensor core layout is shown in Figure 14. Note that this layout is specific to the NVIDIA A100/H100 GPU.

        + +

        Figure 14: P fragment tensor core layout on A100/H100

        + +

        Figure 14 P fragment tensor core layout on A100/H100

        + +

        Table 12 Performance of Optimization 8 for INT4 GQA (row-wise quantization)

        + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
        Batch size + Time (us) + Bandwidth (GB/s) + Speed up +
        FD + CU + FD + CU + vs FD + vs CU baseline +
        Baseline + Opt 8 + Baseline + Opt 8 +
        32 + 137 + 143 + 101 + 262 + 250 + 353 + 1.35 + 1.41 +
        64 + 234 + 257 + 174 + 305 + 278 + 410 + 1.34 + 1.47 +
        128 + 432 + 455 + 317 + 331 + 314 + 451 + 1.36 + 1.43 +
        256 + 815 + 866 + 590 + 351 + 331 + 485 + 1.38 + 1.47 +
        512 + 1581 + 1659 + 1143 + 362 + 345 + 501 + 1.38 + 1.45 +
        + +

        Table 13 Performance of Optimization 8 for INT4 GQA (group-wise quantization with num groups = 4)

        + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
        Batch size + Time (us) + Bandwidth (GB/s) + Speed up +
        FD + CUDA_WMMA + FD + CUDA_WMMA + vs FD + vs CUDA_WMMA Opt 6 +
        Opt 6 + Opt 8 + Opt 6 + Opt 8 +
        32 + 129 + 116 + 106 + 325 + 364 + 396 + 1.22 + 1.09 +
        64 + 219 + 195 + 180 + 385 + 431 + 467 + 1.21 + 1.08 +
        128 + 392 + 347 + 319 + 429 + 484 + 528 + 1.23 + 1.09 +
        256 + 719 + 638 + 596 + 468 + 527 + 565 + 1.21 + 1.07 +
        512 + 1375 + 1225 + 1138 + 489 + 550 + 591 + 1.21 + 1.08 +
        + +

        Optimization 9: Swizzle P Shared Memory Layouts (A100/H100 specific)

        + +

        Problem Analysis:

        + +

        We observe large shared memory bank conflicts during P loading. The amount of bank conflict depends on the memory access stride. For instance, for split-Ks = 32 and max seq length = 8192, we observed that only 4 out of 32 banks are being accessed in parallel (memory access stride = 256). From Figure 14, when all threads access element 0, threads that have the same threadIdx.x % 4 access the same bank.

        + +

        Figure 15: P fragment in shared memory before swizzling

        + +

        Figure 15 P fragment in shared memory before swizzling

        + +

        Solution:

        + +

We shuffle the layout of P loads/stores in shared memory in a way that avoids bank conflicts. In other words, we store the QKT results (C fragment) and load them (P fragment) using the swizzled layout. Moreover, instead of using the original memory access stride, which is dependent on the number of tokens per thread block, we use the fragment’s column size as the stride, which is constant. Thus, the loads and stores of the P fragment are always contiguous.

        + +

        The new layouts for the C and P fragments are shown in Figure 16. With the new layout, it is guaranteed that 16 banks are being accessed in parallel as shown in Figure 17.

        + +

        Figure 16: The swizzled layouts of C and P fragments

        + +

        Figure 16 The swizzled layouts of C and P fragments

        + +

        Figure 17: P fragment in shared memory after swizzling

        + +

        Figure 17 P fragment in shared memory after swizzling

        + +

        Table 14 Performance of Optimization 9 for INT4 GQA (row-wise quantization)

        + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
        Batch size + Time (us) + Bandwidth (GB/s) + Speed up +
        FD + CU + FD + CU + vs FD + vs CU baseline +
        Baseline + Opt 9 + Baseline + Opt 9 +
        32 + 137 + 143 + 98 + 262 + 250 + 365 + 1.39 + 1.46 +
        64 + 234 + 257 + 167 + 305 + 278 + 429 + 1.41 + 1.54 +
        128 + 432 + 455 + 299 + 331 + 314 + 479 + 1.45 + 1.52 +
        256 + 815 + 866 + 549 + 351 + 331 + 521 + 1.48 + 1.58 +
        512 + 1581 + 1659 + 1060 + 362 + 345 + 540 + 1.49 + 1.56 +
        + +

        Table 15 Performance of Optimization 9 for INT4 GQA (group-wise quantization with num groups = 4)

        + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
        Batch size + Time (us) + Bandwidth (GB/s) + Speed up +
        FD + CUDA_WMMA + FD + CUDA_WMMA + vs FD + vs CUDA_WMMA Opt 6 +
        Opt 6 + Opt 9 + Opt 6 + Opt 9 +
        32 + 129 + 116 + 105 + 325 + 364 + 400 + 1.23 + 1.10 +
        64 + 219 + 195 + 174 + 385 + 431 + 484 + 1.26 + 1.12 +
        128 + 392 + 347 + 302 + 429 + 484 + 558 + 1.30 + 1.15 +
        256 + 719 + 638 + 560 + 468 + 527 + 601 + 1.28 + 1.14 +
        512 + 1375 + 1225 + 1065 + 489 + 550 + 632 + 1.29 + 1.15 +
        + +

        Optimization 10: Pad Shared Memory for INT4 Dequantization

        + +

        Problem Analysis:

        + +

        Once the kernel reads the INT4 K or V cache from global memory, it performs dequantization and stores the results (BF16) in the shared memory. Then, the BF16 data is loaded to the WMMA fragment from shared memory (via the WMMA interface). We observed a large number of bank conflicts for both K and V accesses. For instance, for K stores, only 4 out of 32 banks are being accessed in parallel. For K loads, 16 banks are being accessed in parallel. The same also occurs for V stores and loads. See the figures in the solution section.

        + +

        Solution:

        + +

        We pad the shared memory to reduce bank conflicts. Specifically, we pad each row by two elements. That is, the row stride of K becomes F_K + 2 and the row stride of V becomes F_N + 2 (F_K and F_N are the fixed widths of the K and V WMMA fragments, respectively). With this optimization, we are able to reduce the bank conflicts by 1.8x, as shown in Figure 18.
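
        To see why a small pad helps, consider a toy model of bank assignment (the sizes below are assumptions for illustration; padding each row by two BF16 elements adds one 4-byte word to the stride):

        # Toy model: number of distinct banks touched when 32 threads each access the
        # same column of consecutive rows. Illustrative only.
        BANKS = 32

        def banks_hit(row_stride_words, rows=32, col=0):
            return len({(r * row_stride_words + col) % BANKS for r in range(rows)})

        F_K_WORDS = 32                        # assumed un-padded row stride in 4-byte words
        print(banks_hit(F_K_WORDS))           # stride divisible by 32 -> 1 bank (conflicts)
        print(banks_hit(F_K_WORDS + 1))       # padded (odd) stride -> all 32 banks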

        + +

        Figure 18: Bank conflicts before and after Optimization 10


        + +

        After Optimization 10, for K stores, 32 banks are being accessed in parallel (shown in Figure 19), while for K loads, 29 banks are accessed in parallel (shown in Figure 20).

        + +

        Figure 19: K fragment store shared memory layout without and with padding


        + +

        Figure 20: K fragment load shared memory layout without and with padding


        + +

        Table 16 Performance of Optimization 10 for INT4 GQA (row-wise quantization)

        Batch size | FD time (us) | CU baseline time (us) | CU Opt 10 time (us) | FD BW (GB/s) | CU baseline BW (GB/s) | CU Opt 10 BW (GB/s) | Speedup vs FD | Speedup vs CU baseline
        32  | 137  | 143  | 94  | 262 | 250 | 380 | 1.45 | 1.52
        64  | 234  | 257  | 151 | 305 | 278 | 475 | 1.55 | 1.71
        128 | 432  | 455  | 266 | 331 | 314 | 538 | 1.63 | 1.71
        256 | 815  | 866  | 489 | 351 | 331 | 586 | 1.67 | 1.77
        512 | 1581 | 1659 | 930 | 362 | 345 | 616 | 1.70 | 1.79

        Table 17 Performance of Optimization 10 for INT4 GQA (group-wise quantization with num groups = 4)

        Batch size | FD time (us) | CUDA_WMMA Opt 6 time (us) | CUDA_WMMA Opt 10 time (us) | FD BW (GB/s) | CUDA_WMMA Opt 6 BW (GB/s) | CUDA_WMMA Opt 10 BW (GB/s) | Speedup vs FD | Speedup vs CUDA_WMMA Opt 6
        32  | 129  | 116  | 99  | 325 | 364 | 425 | 1.31 | 1.17
        64  | 219  | 195  | 161 | 385 | 431 | 523 | 1.36 | 1.21
        128 | 392  | 347  | 282 | 429 | 484 | 598 | 1.39 | 1.23
        256 | 719  | 638  | 509 | 468 | 527 | 662 | 1.41 | 1.25
        512 | 1375 | 1225 | 965 | 489 | 550 | 698 | 1.43 | 1.27

        Performance Evaluation

        + +

        Microbenchmark results

        + +

        We also evaluated BF16 GQA performance using our optimized kernel (as shown in Table 19). CU still performs generally worse than FD and FA for BF16. This is expected since our optimizations are INT4 focused.

        + +

        While INT4 GQA is still not as efficient as BF16 GQA (see the achieved bandwidths), it is worth noting that, when comparing FD BF16 GQA against CU INT4 GQA, the INT4 latency is lower than the BF16 latency.

        + +

        Table 19 Performance of BF16 GQA and INT4 GQA after CU optimizations

        + +

        On A100

        Time (us):
        Batch size | BF16 FD | BF16 FA | BF16 CU before | BF16 CU after | INT4 FD | INT4 FA | INT4 CU before | INT4 CU after
        32  | 139  | 133  | 183  | 163  | 137  | - | 143  | 94
        64  | 245  | 229  | 335  | 276  | 234  | - | 257  | 151
        128 | 433  | 555  | 596  | 517  | 432  | - | 455  | 266
        256 | 826  | 977  | 1127 | 999  | 815  | - | 866  | 489
        512 | 1607 | 1670 | 2194 | 1879 | 1581 | - | 1659 | 930

        Effective Bandwidth (GB/s):
        Batch size | BF16 FD | BF16 FA | BF16 CU before | BF16 CU after | INT4 FD | INT4 FA | INT4 CU before | INT4 CU after
        32  | 965  | 1012 | 736 | 824  | 262 | - | 250 | 380
        64  | 1097 | 1175 | 802 | 972  | 305 | - | 278 | 475
        128 | 1240 | 968  | 901 | 1039 | 331 | - | 314 | 538
        256 | 1301 | 1100 | 954 | 1075 | 351 | - | 331 | 586
        512 | 1338 | 1287 | 980 | 1144 | 362 | - | 345 | 616

        On H100

        Time (us):
        Batch size | BF16 FD | BF16 FA | BF16 CU before | BF16 CU after | INT4 FD | INT4 FA | INT4 CU before | INT4 CU after
        32  | 91   | 90   | 114  | 100  | 70  | - | 96   | 64
        64  | 148  | 146  | 200  | 183  | 113 | - | 162  | 101
        128 | 271  | 298  | 361  | 308  | 205 | - | 294  | 170
        256 | 515  | 499  | 658  | 556  | 389 | - | 558  | 306
        512 | 1000 | 1011 | 1260 | 1066 | 756 | - | 1066 | 575

        Effective Bandwidth (GB/s):
        Batch size | BF16 FD | BF16 FA | BF16 CU before | BF16 CU after | INT4 FD | INT4 FA | INT4 CU before | INT4 CU after
        32  | 1481 | 1496 | 1178 | 1341 | 511 | - | 371 | 560
        64  | 1815 | 1840 | 1345 | 1470 | 631 | - | 443 | 710
        128 | 1982 | 1802 | 1487 | 1743 | 699 | - | 487 | 844
        256 | 2087 | 2156 | 1634 | 1934 | 736 | - | 513 | 935
        512 | 2150 | 2127 | 1706 | 2015 | 757 | - | 537 | 996

        E2E results

        + +

        We evaluated our optimized INT4 GQA kernel in Llama 2 70B on 8 H100 GPUs. We ran the model end to end but report only the decode latency. We used FP8 FFN (feed-forward network) to emphasize attention performance in the decoding phase. We varied the batch size from 1 to 256 and the context length from 2,048 (2K) to 16,384 (16K) tokens. The E2E performance results are shown in the figure below.

        + +

        Figure 21: Meta Llama 2 decode latency (ms) comparison (BF16 GQA runs out of memory in large batch size configurations)


        + +

        Code

        + +

        If you are interested, please check out our code here. If you have any questions, please feel free to open an issue on GitHub, and we will be happy to help. Your contributions are welcome!

        diff --git a/blog/int8-quantization/index.html b/blog/int8-quantization/index.html new file mode 100644 index 000000000000..2d9b8af94348 --- /dev/null +++ b/blog/int8-quantization/index.html @@ -0,0 +1,717 @@

        August 07, 2023

        INT8 Quantization for x86 CPU in PyTorch

        by Intel

        Overview

        + +

        INT8 quantization is a powerful technique for speeding up deep learning inference on x86 CPU platforms. By reducing the precision of the model’s weights and activations from 32-bit floating-point (FP32) to 8-bit integer (INT8), INT8 quantization can significantly improve the inference speed and reduce memory requirements without sacrificing accuracy.
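
        Conceptually, each FP32 tensor is mapped to INT8 with a scale and a zero point. A minimal sketch of per-tensor asymmetric quantization follows (simplified; real backends derive these parameters from calibration observers):

        import torch

        def quantize_per_tensor(x: torch.Tensor):
            # Derive scale/zero-point from the observed value range.
            qmin, qmax = 0, 255
            scale = (x.max() - x.min()) / (qmax - qmin)
            zero_point = qmin - torch.round(x.min() / scale)
            q = torch.clamp(torch.round(x / scale) + zero_point, qmin, qmax).to(torch.uint8)
            return q, scale, zero_point

        x = torch.randn(4, 4)
        q, scale, zp = quantize_per_tensor(x)
        x_hat = (q.float() - zp) * scale          # dequantize
        print((x - x_hat).abs().max())            # small quantization error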

        + +

        In this blog, we will discuss the recent progress on INT8 quantization for x86 CPU in PyTorch, focusing on the new x86 quantization backend. We will also briefly look at the new quantization path with PyTorch 2.0 Export (PT2E) and TorchInductor.

        + +

        X86 Quantization Backend

        + +

        The current recommended way of quantization in PyTorch is FX. Before PyTorch 2.0, the default quantization backend (a.k.a. QEngine) on x86 CPUs was FBGEMM, which leveraged the FBGEMM performance library to achieve the performance speedup. In the PyTorch 2.0 release, a new quantization backend called X86 was introduced to replace FBGEMM. The x86 quantization backend offers improved INT8 inference performance when compared to the original FBGEMM backend by leveraging the strengths of both FBGEMM and the Intel® oneAPI Deep Neural Network Library (oneDNN) kernel libraries.

        + +

        Performance Benefit from X86 Backend

        + +

        To measure the performance benefits of the new X86 backend, we ran INT8 inference on 69 popular deep learning models (shown in Figures 1-3 below) using 4th Gen Intel® Xeon® Scalable processors. The results showed a 2.97X geomean performance speedup compared to FP32 inference performance, while the speedup was 1.43X with the FBGEMM backend. The charts below show the per-model performance speedup comparing the x86 backend and the FBGEMM backend.

        + +

        Figure 1: Models with less than 2x performance boost with x86 backend1


        + +

        Figure 2: Models with 2x-4x performance boost with x86 backend1


        + +

        Figure 3: Models with larger than 4x performance boost with x86 backend1


        + +

        Usage of x86 Backend

        + +

        By default in 2.0, users on x86 platforms use the x86 quantization backend, and their PyTorch programs remain unchanged when using the default backend. Alternatively, users can specify x86 as the quantization backend explicitly.
        Below is an example code snippet of PyTorch static post-training quantization with the x86 quantization backend.

        + +
        import torch
        from torch.ao.quantization import get_default_qconfig_mapping
        from torch.quantization.quantize_fx import prepare_fx, convert_fx

        qconfig_mapping = get_default_qconfig_mapping()
        # Or explicitly specify the qengine
        # qengine = 'x86'
        # torch.backends.quantized.engine = qengine
        # qconfig_mapping = get_default_qconfig_mapping(qengine)

        model_fp32 = MyModel().eval()
        x = torch.randn((1, 3, 224, 224), dtype=torch.float)
        x = x.to(memory_format=torch.channels_last)

        # Insert observers according to qconfig and backend config
        prepared_model = prepare_fx(model_fp32, qconfig_mapping, example_inputs=x)

        # Calibration code not shown

        # Convert to quantized model
        quantized_model = convert_fx(prepared_model)
        + +

        Technical Details of x86 Backend

        + +

        We devised heuristic dispatching rules according to the performance numbers from the models we benchmarked to decide whether to invoke oneDNN or FBGEMM performance library to execute the convolution or matrix multiplication operations. The rules are a combination of operation kinds, shapes, CPU architecture information, etc. Detailed logic is available here. For more design and technical discussion, please refer to the Request for Comments.
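
        The actual rules live in the linked implementation; purely as an illustration of what such a heuristic can look like (the conditions and thresholds below are hypothetical, not the backend's real logic):

        # Hypothetical illustration only -- not the x86 backend's real dispatching logic.
        def pick_int8_library(op_kind: str, m: int, n: int, k: int, has_vnni: bool) -> str:
            if not has_vnni:                 # older ISAs without VNNI: keep FBGEMM
                return "fbgemm"
            if op_kind == "conv":            # convolutions: prefer oneDNN
                return "onednn"
            if op_kind == "linear" and m * n * k > 1 << 20:
                return "onednn"              # large matmuls: oneDNN
            return "fbgemm"                  # small matmuls: FBGEMM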

        + +

        Next Steps With a New Quantization Path PyTorch 2.0 Export

        + +

        Although still far from finalized, a new quantization path, PyTorch 2.0 Export (PT2E), is in early design and PoC stage. The new approach is slated to replace the FX quantization path in the future. It is built upon the capabilities of TorchDynamo Export, a feature introduced in the PyTorch 2.0 release for FX graph capturing. This graph is then quantized and lowered to different backends. TorchInductor, the new DL compiler of PyTorch, has shown promising results in terms of FP32 inference speedup on x86 CPU. We are working actively to enable it as one of the quantization backends of PT2E. We believe the new path will lead to further improvements in INT8 inference performance due to more flexibility of fusion at different levels.

        + +

        Conclusion

        + +

        The x86 backend introduced in PyTorch 2.0 release has demonstrated a remarkable improvement in INT8 inference speed on x86 CPU platforms. It offers a 1.43X speedup compared to the original FBGEMM backend while maintaining backward compatibility. This enhancement can benefit end users with minimal or no modifications to their programs. Furthermore, a new quantization path, PT2E, is currently in development and is expected to provide even more possibilities in the future.

        + +

        Acknowledgement

        + +

        Special thanks to Nikita Shulga, Vasiliy Kuznetsov, Supriya Rao, and Jongsoo Park. Together, we made one more step forward on the path of improving the PyTorch CPU ecosystem.

        + +

        Configuration

        + +

        1 AWS EC2 r7iz.metal-16xl instance (Intel(R) Xeon(R) Gold 6455B, 32-core/64-thread, Turbo Boost On, Hyper-Threading On, Memory: 8x64GB, Storage: 192GB); OS: Ubuntu 22.04.1 LTS; Kernel: 5.15.0-1028-aws; Batch Size: 1; Core per Instance: 4; PyTorch 2.0 RC3; TorchVision 0.15.0+cpu, test by Intel on 3/77/2023. May not reflect all publicly available security updates.

        + +
        diff --git a/blog/intel-gpu-support-pytorch-2-5/index.html b/blog/intel-gpu-support-pytorch-2-5/index.html new file mode 100644 index 000000000000..2688626109bb --- /dev/null +++ b/blog/intel-gpu-support-pytorch-2-5/index.html @@ -0,0 +1,938 @@

        Intel GPU Support Now Available in PyTorch 2.5

        by PyTorch Team at Intel

        Support for Intel GPUs is now available in PyTorch® 2.5, providing improved functionality and performance for Intel GPUs, which include Intel® Arc™ discrete graphics, Intel® Core™ Ultra processors with built-in Intel® Arc™ graphics, and Intel® Data Center GPU Max Series. This integration brings Intel GPUs and the SYCL* software stack into the official PyTorch stack, ensuring a consistent user experience and enabling more extensive AI application scenarios, particularly in the AI PC domain.

        + +

        Developers and customers building for and using Intel GPUs will have a better user experience by directly obtaining continuous software support from native PyTorch, unified software distribution, and consistent product release time.

        + +

        Furthermore, Intel GPU support provides more choices to users. Now PyTorch provides a consistent GPU programming paradigm on both front ends and back ends. Developers can now run and deploy workloads on Intel GPUs with minimal coding efforts.

        + +

        Overview of Intel GPU support

        + +

        Intel GPU support in PyTorch provides eager mode and graph mode support in the PyTorch built-in front end. Eager mode now has an implementation of commonly used Aten operators with the SYCL programming language. Graph mode (torch.compile) now has an enabled Intel GPU back end to implement the optimization for Intel GPUs and to integrate Triton. 

        + +

        Essential components of Intel GPU support were added to PyTorch, including runtime, Aten operators, oneDNN, TorchInductor, Triton and Intel GPU tool chains integration. Meanwhile, quantization and distributed are being actively developed in preparation for the PyTorch 2.6 release.

        + +

        Features

        + +

        In addition to providing key features for Intel® Client GPUs and Intel® Data Center GPU Max Series for inference and training, PyTorch keeps the same user experience as the other hardware that PyTorch supports. If you migrate code from CUDA*, you can run the existing application code on an Intel GPU with minimal code changes for the device name (from cuda to xpu). For example:

        + +

        # CUDA code
        tensor = torch.tensor([1.0, 2.0]).to("cuda")

        # Code for Intel GPU
        tensor = torch.tensor([1.0, 2.0]).to("xpu")
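
        For scripts that should run on whichever accelerator is present, the device can also be picked at runtime; a minimal sketch, assuming a PyTorch 2.5 build with XPU support (where torch.xpu.is_available() is exposed):

        import torch

        # Prefer an Intel GPU, then CUDA, then CPU.
        if torch.xpu.is_available():
            device = "xpu"
        elif torch.cuda.is_available():
            device = "cuda"
        else:
            device = "cpu"

        tensor = torch.tensor([1.0, 2.0]).to(device)
        print(tensor.device)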

        + +

        PyTorch 2.5 features with an Intel GPU include: 

        + +
        • Inference and training workflows.
        • Enhanced torch.compile and eager mode functionality (more ops), together with performance improvements, with all three Dynamo benchmark suites (Hugging Face*, TIMM*, and TorchBench*) fully running in both eager and compile modes.
        • Data types such as FP32, BF16, FP16, and automatic mixed precision (AMP).
        • Runs on Intel® Client GPUs and Intel® Data Center GPU Max Series.
        • Supports Linux (Ubuntu, SUSE Linux, and Red Hat Linux) and Windows 10/11.

        Get Started

        + +

        Get a tour of the environment setup, pip wheel installation, and examples on Intel® Client GPUs and Intel® Data Center GPU Max Series from the Getting Started Guide. Support for Intel GPUs can be experienced through the PyTorch pip wheels available as nightly and preview binary releases.

        + +
        • Try Intel® Client GPUs through the Intel® Arc™ Graphics family (Codename DG2), Intel® Core™ Ultra processor family with Intel® Graphics (Codename Meteor Lake), and Intel® Core™ Ultra mobile processor family with Intel® Graphics (Codename Lunar Lake).
        • Try Intel® Data Center GPU Max Series through Intel® Tiber™ AI Cloud. To learn how to create a free Standard account, see Get Started. Then do the following:
          1. Sign in to the cloud console.
          2. From the Training section, open the PyTorch on Intel® GPUs notebook and click “Launch Jupyter Notebook.”
          3. Ensure that the PyTorch 2.5 kernel is selected for the notebook.

        Performance

        + +

        The performance of Intel GPU on PyTorch was continuously optimized to achieve solid results on the three Dynamo benchmark suites (Hugging Face, TIMM, and TorchBench) for both eager and compile modes.

        + +

        The latest performance data, measured with the PyTorch Dynamo benchmarking suite on a single Intel® Data Center GPU Max Series 1100 card, show a significant FP16/BF16 speedup ratio over FP32 in eager mode (Figure 2) and a torch.compile speedup ratio over eager mode (Figure 3). Both inference and training reached similar significant improvements.

        + +

        Figure 2: FP16/BF16 Performance Gains Over FP32 Eager


        + +

        Figure 3: Torch.compile Performance Gains Over Eager Mode


        + +

        Summary

        + +

        Intel GPU on PyTorch 2.5 brings Intel® Client GPUs (Intel® Core™ Ultra processors with built-in Intel® Arc™ graphics and Intel® Arc™ Graphics for dGPU parts) and Intel® Data Center GPU Max Series into the PyTorch ecosystem for AI workload acceleration. In particular, Client GPUs are added to the GPU-supported list for AI PC use scenarios on Windows and Linux environments.

        + +

        We warmly welcome the community to evaluate and provide feedback on these enhancements to  Intel GPU support on PyTorch. 

        + +

        Resources

        + + + +

        Acknowledgments

        + +

        We want to thank the PyTorch open source community for their technical discussions and insights: Andrey Talman, Alban Desmaison, Nikita Shulga, Eli Uriegas, Jason Ansel, and Bin Bao.

        + +

        We also thank collaborators from PyTorch for their professional support and guidance.

        + +

        Performance Configuration

        + +

        The configurations in the table were collected with svr-info. Tested by Intel on September 12, 2024.

        + +

        Table 1

        Component | Details
        Name | Intel® Max Series GPU 1100 in Intel® Tiber™ Developer Cloud
        Time | Thu Sep 12 08:21:27 UTC 2024
        System | Supermicro SYS-521GE-TNRT
        Baseboard | Supermicro X13DEG-OA
        Chassis | Supermicro Other
        CPU Model | Intel(R) Xeon(R) Platinum 8468V
        Microarchitecture | SPR_XCC
        Sockets | 2
        Cores per Socket | 48
        Hyperthreading | Enabled
        CPUs | 192
        Intel Turbo Boost | Enabled
        Base Frequency | 2.4GHz
        All-core Maximum Frequency | 2.4GHz
        Maximum Frequency | 2.9GHz
        NUMA Nodes | 2
        Prefetchers | L2 HW: Enabled, L2 Adj.: Enabled, DCU HW: Enabled, DCU IP: Enabled, AMP: Disabled, Homeless: Disabled, LLC: Disabled
        PPINs | 5e3f862ef7ba9d50, 6c85812edfcc84b1
        Accelerators | DLB 2, DSA 2, IAA 2, QAT (on CPU) 2, QAT (on chipset) 0
        Installed Memory | 1024GB (16x64GB DDR5 4800 MT/s [4800 MT/s])
        Hugepagesize | 2048 kB
        Transparent Huge Pages | madvise
        Automatic NUMA Balancing | Enabled
        NIC | 2 x Ethernet Controller X710 for 10GBASE-T, 4 x MT2892 Family [ConnectX-6 Dx]
        Disk | 1 x 894.3G Micron_7450_MTFDKBG960TFR
        BIOS | 1.4a
        Microcode | 0x2b0004b1
        OS | Ubuntu 22.04.2 LTS
        Kernel | 5.15.0-73-generic
        TDP | 330W
        Power & Perf Policy | Normal (6)
        Frequency Governor | performance
        Frequency Driver | acpi-cpufreq
        Max C-State | 9

        Table 2

        Component | Details
        Single Card | Intel® Max Series GPU 1100 series on 4th Gen Intel® Xeon® processors of Intel Tiber Developer Cloud
        Workload & version | Timm ac34701, TorchBench 03cde49, Torchvision d23a6e1, Torchaudio b3f6f51, Transformers 243e186
        Software Stack | intel-for-pytorch-gpu-dev 0.5.3, intel-pti-dev 0.9.0, Intel xpu backend for Triton cc981fe
        Framework | Pytorch 4a3dabd67f8ce63f2fc45f278421cca3cc532cfe
        GPU driver | agama-ci-devel-803.61
        GFX FW Version | PVC2_1.23374

        Notices & Disclaimers

        + +

        Performance varies by use, configuration and other factors. Learn more on the Performance Index site. Performance results are based on testing as of dates shown in configurations and may not reflect all publicly available updates.  See backup for configuration details.  No product or component can be absolutely secure. Your costs and results may vary. Intel technologies may require enabled hardware, software or service activation.

        + +

        Intel Corporation. Intel, the Intel logo, and other Intel marks are trademarks of Intel Corporation or its subsidiaries. Other names and brands may be claimed as the property of others.

        + +

        AI disclaimer:
        AI features may require software purchase, subscription or enablement by a software or platform provider, or may have specific configuration or compatibility requirements. Details at www.intel.com/AIPC. Results may vary.

        + +
        diff --git a/blog/intel-gpus-pytorch-2-4/index.html b/blog/intel-gpus-pytorch-2-4/index.html new file mode 100644 index 000000000000..5aa8181979d7 --- /dev/null +++ b/blog/intel-gpus-pytorch-2-4/index.html @@ -0,0 +1,719 @@

        Accelerate Your AI: PyTorch 2.4 Now Supports Intel GPUs for Faster Workloads

        by the PyTorch Team at Intel

        We have exciting news! PyTorch 2.4 now supports Intel® Data Center GPU Max Series and the SYCL software stack, making it easier to speed up your AI workflows for both training and inference. This update allows you to have a consistent programming experience with minimal coding effort and extends PyTorch’s device and runtime capabilities, including device, stream, event, generator, allocator, and guard, to seamlessly support streaming devices. This enhancement simplifies deploying PyTorch on ubiquitous hardware, making it easier for you to integrate different hardware back ends.

        + +

        Intel GPU support upstreamed into PyTorch provides support for both eager and graph modes, fully running Dynamo Hugging Face benchmarks. Eager mode now includes common Aten operators implemented with SYCL. The most performance-critical graphs and operators are highly optimized by using oneAPI Deep Neural Network Library (oneDNN) and oneAPI Math Kernel Library (oneMKL). Graph mode (torch.compile) now has an enabled Intel GPU back end to implement the optimization for Intel GPUs and to integrate Triton. Furthermore, data types such as FP32, BF16, FP16, and automatic mixed precision (AMP) are supported. The PyTorch Profiler, based on Kineto and oneMKL, is being developed for the upcoming PyTorch 2.5 release.

        + +

        Take a look at the current and planned front-end and back-end improvements for Intel GPU upstreamed into PyTorch.

        + +

        the current and planned front-end and back-end improvements for Intel GPU upstreamed into PyTorch

        + +

        PyTorch 2.4 on Linux supports Intel Data Center GPU Max Series for training and inference while maintaining the same user experience as other hardware. If you’re migrating code from CUDA, you can run your existing application on an Intel GPU with minimal changes—just update the device name from cuda to xpu. For example:

        + +
        # CUDA code
        tensor = torch.tensor([1.0, 2.0]).to("cuda")

        # Code for Intel GPU
        tensor = torch.tensor([1.0, 2.0]).to("xpu")
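
        Because graph mode is also enabled for Intel GPUs, the same device change carries over to compiled models; a minimal sketch (assuming an XPU-enabled PyTorch 2.4 build and available Intel GPU hardware):

        import torch

        model = torch.nn.Linear(16, 4).to("xpu")   # move the model to the Intel GPU
        compiled_model = torch.compile(model)      # graph mode via the Intel GPU back end
        x = torch.randn(8, 16, device="xpu")
        print(compiled_model(x).shape)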
        + +

        Get Started

        + +

        Try PyTorch 2.4 on the Intel Data Center GPU Max Series through the Intel® Tiber™ Developer Cloud. Get a tour of the environment setup, source build, and examples. To learn how to create a free Standard account, see Get Started, then do the following:

        + +
        1. Sign in to the cloud console.
        2. From the Training section, open the PyTorch 2.4 on Intel GPUs notebook.
        3. Ensure that the PyTorch 2.4 kernel is selected for the notebook.

        Summary

        + +

        PyTorch 2.4 introduces initial support for Intel Data Center GPU Max Series to accelerate your AI workloads. With Intel GPU, you’ll get continuous software support, unified distribution, and synchronized release schedules for a smoother development experience. We’re enhancing this functionality to reach Beta quality in PyTorch 2.5. Planned features in 2.5 include:

        + +
        • More Aten operators and full Dynamo Torchbench and TIMM support in eager mode.
        • Full Dynamo Torchbench and TIMM benchmark support in torch.compile.
        • Intel GPU support in torch.profile.
        • PyPI wheels distribution.
        • Windows and Intel Client GPU Series support.

        We welcome the community to evaluate these new contributions to Intel GPU support on PyTorch. 

        + +

        Resources

        + + + +

        Acknowledgments

        + +

        We want to thank the PyTorch open source community for their technical discussions and insights: Nikita Shulga, Jason Ansel, Andrey Talman, Alban Desmaison, and Bin Bao.

        + +

        We also thank collaborators from PyTorch for their professional support and guidance.

        + +

        1 To enable GPU support and improve performance, we suggest installing the Intel® Extension for PyTorch

        + +
        diff --git a/blog/intel-joins-pytorch/index.html b/blog/intel-joins-pytorch/index.html new file mode 100644 index 000000000000..ea973120d9e7 --- /dev/null +++ b/blog/intel-joins-pytorch/index.html @@ -0,0 +1,679 @@

        Intel Joins the PyTorch Foundation as a Premier Member

        by Team PyTorch

        Intel logo

        + +

        The PyTorch Foundation, a neutral home for the deep learning community to collaborate on the open source PyTorch framework and ecosystem, is announcing today that Intel has joined as a premier member.

        + +

        “The PyTorch Foundation is thrilled to welcome Intel as a premier member, marking a significant milestone in our mission to empower the global AI community. Intel’s extensive expertise and commitment to advancing cutting-edge technologies align perfectly with our vision of fostering open-source innovation,” said PyTorch Foundation Executive Director Ibrahim Haddad. “Together, we will accelerate the development and democratization of PyTorch, and use the collaboration to shape a vibrant future of AI for all.”

        + +

        Intel has developed and released several PyTorch-based tools and libraries to enable developers to accelerate their AI workflows, and is actively working on optimizing PyTorch to leverage Intel hardware capabilities.

        + +

        “At Intel, we believe in the power of collaboration and open-source innovation to propel the ecosystem towards an AI Everywhere future. Joining the Governing Board of the PyTorch Foundation is a testament to Intel’s commitment to advancing and democratizing AI,” said Wei Li, Vice President and General Manager of Artificial Intelligence and Analytics (AIA) at Intel. “By harnessing the collective expertise and resources within the deep learning community, we aim to accelerate the development of PyTorch and continue to drive breakthroughs in AI research and applications.”

        + +

        Intel fosters industry collaboration, co-engineering, and open source contributions to accelerate software innovation and develop new technologies that bring benefits to the open source community. By working together with other member companies and under the guidance of the PyTorch Foundation, Intel remains committed to actively contributing to and advocating for the community.

        + +

        As a premier member, Intel is granted one seat to the PyTorch Foundation Governing Board. The Board sets policy through our bylaws, mission and vision statements, describing the overarching scope of foundation initiatives, technical vision, and direction.

        + +

        Wei Li

        + +

        We’re happy to welcome Wei Li, Vice President and General Manager of Artificial Intelligence and Analytics (AIA) at Intel, to our board. Dr. Wei Li is Vice President and General Manager of Artificial Intelligence and Analytics (AIA) at Intel, where he leads a world-wide team of engineering “magicians” who make AI Everywhere a reality by supercharging machine performance and developer productivity. Wei and his team have been instrumental in Intel’s recent multi-billion-dollar AI revenue growth by delivering 10-100X software acceleration, across deep learning, statistical machine learning and big data analytics, to complement Intel’s AI-optimized hardware portfolio.

        + +

        To learn more about how you can be a part of the PyTorch Foundation, visit our website.

        + +

        Read more about Intel’s commitment to the PyTorch Community here.

        + +

        About Intel

        + +

        Intel (Nasdaq: INTC) is an industry leader, creating world-changing technology that enables global progress and enriches lives. Inspired by Moore’s Law, we continuously work to advance the design and manufacturing of semiconductors to help address our customers’ greatest challenges. By embedding intelligence in the cloud, network, edge and every kind of computing device, we unleash the potential of data to transform business and society for the better. To learn more about Intel’s innovations, go to newsroom.intel.com and intel.com.

        + +

        © Intel Corporation. Intel, the Intel logo and other Intel marks are trademarks of Intel Corporation or its subsidiaries. Other names and brands may be claimed as the property of others.

        + +

        About PyTorch Foundation

        + +

        The PyTorch Foundation is a neutral home for the deep learning community to collaborate on the open source PyTorch framework and ecosystem. The PyTorch Foundation is supported by its members and leading contributors to the PyTorch open source project. The Foundation leverages resources provided by members and contributors to enable community discussions and collaboration.

        + +

        About The Linux Foundation

        + +

        The Linux Foundation is the world’s leading home for collaboration on open source software, hardware, standards, and data. Linux Foundation projects are critical to the world’s infrastructure including Linux, Kubernetes, Node.js, ONAP, PyTorch, RISC-V, SPDX, OpenChain, and more. The Linux Foundation focuses on leveraging best practices and addressing the needs of contributors, users, and solution providers to create sustainable models for open collaboration. For more information, please visit us at linuxfoundation.org. The Linux Foundation has registered trademarks and uses trademarks. For a list of trademarks of The Linux Foundation, please see its trademark usage page. Linux is a registered trademark of Linus Torvalds.

        + +
        diff --git a/blog/interactive-chat-gen-model/index.html b/blog/interactive-chat-gen-model/index.html new file mode 100644 index 000000000000..f8ca15bd3d31 --- /dev/null +++ b/blog/interactive-chat-gen-model/index.html @@ -0,0 +1,805 @@

        How to Build an Interactive Chat-Generation Model using DialoGPT and PyTorch

        by Intel

        The focus on interactive chat-generation (or conversational response-generation) models has greatly increased in the past several months. Conversational response-generation models such as ChatGPT and Google Bard have taken the AI world by storm. The purpose of interactive chat generation is to answer various questions posed by humans, and these AI based models use natural language processing (NLP) to generate conversations almost indistinguishable from those generated by humans.

        + +

        This article showcases a code sample on how to create interactive chats based on a pre-trained DialoGPT model from Hugging Face with the addition of the Intel® Extension for PyTorch to perform dynamic quantization on the model.

        + +

        Get Started

        + +

        Why DialoGPT?

        + +

        DialoGPT (Dialogue Generative Pre-trained Transformer) is a large-scale, pre-trained dialogue-response-generation model trained on 147M conversation-like exchanges pulled out from Reddit comment chains and discussion threads. DialoGPT was proposed by Microsoft in 2019. The main goal was to create open-domain chatbots capable of producing natural responses to a variety of conversational topics. The conversational response-generation systems that leverage DialoGPT generate more applicable, resourceful, diverse, and context-specific replies.

        + +

        DialoGPT Architecture

        + +

        DialoGPT architecture is based on the GPT-2 model. It is formulated as an autoregressive language model and uses a multi-layer transformer as the model architecture. GPT-2 was proposed by OpenAI. GPT-2 models are trained on general text data whereas DialoGPT is trained on Reddit discussion threads.

        + +

        Let’s look at the GPT-2 architecture. There are two types of blocks in general transformer architecture:

        + +
        • Encoder - contains a self-attention layer and a feed-forward neural network
        • Decoder - similar to the encoder, but the self-attention layer is masked

        The self-attention layer allows a position to peek at tokens to the right of the current word (the successive words in the text), whereas the masked self-attention layer prevents that from happening.
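
        A small sketch of what such a causal (masked) attention pattern looks like in practice (simplified to a single head of random scores):

        import torch

        seq_len = 5
        scores = torch.randn(seq_len, seq_len)                 # raw attention scores
        causal = torch.tril(torch.ones(seq_len, seq_len, dtype=torch.bool))
        scores = scores.masked_fill(~causal, float("-inf"))    # hide future positions
        weights = torch.softmax(scores, dim=-1)                # each row attends only to the past
        print(weights)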

        + +

        self-attention layer vs masked self-attention layer

        + +

        GPT-2 is built using transformer decoder blocks. This means that the following layers are used in the architecture:

        + +
        1. Embedding Layer - responsible for converting input text into embeddings (each word is converted to a fixed-length vector representation)
        2. Transformer Decoder - includes multiple decoder blocks with masked self-attention and feed-forward neural network layers
        3. Output Layer - responsible for converting embeddings obtained from the decoder into words

        GPT-2 architecture (and DialoGPT architecture) is shown below.

        + +

        GPT-2 architecture

        + +

        As the model is based on the transformer architecture, it can suffer from repetition and from copying the inputs. To avoid repetition, we can use Top-K sampling and Top-p sampling.

        + +
        • Top-K sampling - filters the K most likely next words and redistributes the probability mass among only those K next words.
        • Top-p sampling - rather than selecting only the most likely K words, selects the smallest possible set of words whose cumulative probability exceeds the probability p.

        The probability mass is then redistributed among the words in the set. As a result, the size of the set of words can be dynamically increased and decreased based on the probability distribution of the next word.

        + +

        Quantization using Intel® Extension for PyTorch

        + +

        What is Quantization?

        + +

        Quantization is a systematic reduction of the precision of all or several layers within the model. This means a higher-precision type, such as the single-precision floating-point (FP32) mostly used in deep learning, is converted into a lower-precision type such as FP16 (16 bits) or INT8 (8 bits).
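
        The memory saving follows directly from the element sizes, for example:

        import torch

        fp32 = torch.randn(1024, 1024)
        int8 = torch.randint(-128, 127, (1024, 1024), dtype=torch.int8)
        print(fp32.element_size(), "bytes/element ->", fp32.numel() * fp32.element_size() / 1e6, "MB")
        print(int8.element_size(), "byte/element  ->", int8.numel() * int8.element_size() / 1e6, "MB")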

        + +

        This helps in achieving:

        + +
        • lower memory bandwidth
        • lower storage
        • higher performance with minimum-to-zero accuracy loss

        Quantization is especially important with large models such as those based on the Transformer architecture like BERT or GPT.

        + +

        There are two types of quantization:

        + +
        • Static - Static quantization quantizes the weights and activations of the model. This quantization is used when both memory bandwidth and compute savings are important.
        • Dynamic - In dynamic quantization, the weights are quantized ahead of time, but the activations are dynamically quantized during inference.

        Intel Extension for PyTorch: The Intel Extension extends PyTorch with up-to-date features and optimizations for an extra performance boost on Intel® hardware. Learn how to install it standalone or get it as part of the Intel® AI Analytics Toolkit.

        + +

        The extension can be loaded as a Python* module or linked as a C++ library. Python users can enable it dynamically by importing intel_extension_for_pytorch.

        + +
        • This CPU tutorial gives detailed information about Intel Extension for PyTorch for Intel CPUs. Source code is available at the master branch.
        • This GPU tutorial gives detailed information about Intel Extension for PyTorch for Intel GPUs. Source code is available at the xpu-master branch.

        How to perform dynamic quantization using Intel Extension for PyTorch?

        + +

        Here are the steps to quantize an existing FP32 model to an INT8 model using dynamic quantization (a combined sketch follows the list):

        + +
        1. Prepare the quantization configuration - we can use the default dynamic quantization configuration, ipex.quantization.default_dynamic_qconfig.
        2. Prepare the FP32 model by using the ipex.quantization.prepare method (provide the input parameters such as the FP32 model to quantize, the prepared configuration, example inputs, and whether the quantization should be done in place).
        3. Convert the model from FP32 to INT8 - use the ipex.quantization.convert method for conversion. The input model will be the model prepared in step 2.
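
        Putting the three steps together, a minimal sketch might look like the following (the model name comes from the code sample discussed below; the dummy inputs and exact keyword arguments are illustrative and may vary across IPEX versions):

        import torch
        import intel_extension_for_pytorch as ipex
        from transformers import AutoModelForCausalLM

        model = AutoModelForCausalLM.from_pretrained("microsoft/DialoGPT-large").eval()
        example_inputs = torch.randint(0, 100, (1, 16))              # dummy token ids

        qconfig = ipex.quantization.default_dynamic_qconfig          # step 1
        prepared_model = ipex.quantization.prepare(                   # step 2
            model, qconfig, example_inputs=example_inputs, inplace=False
        )
        quantized_model = ipex.quantization.convert(prepared_model)   # step 3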

        We also encourage you to check out the Intel® Neural Compressor tool that automates popular model-compression technologies such as quantization, pruning, and knowledge distillation across multiple deep learning frameworks.

        + +

        Code Sample

        + +

        The following steps are implemented in the code sample:

        + +
        1. Load model and tokenizer: The Transformers library (check out Intel® Extension for Transformers) and the Auto Classes available in the Hugging Face Main Classes are used in this step. These allow us to automatically find the relevant model by the given name and to easily change the model without major changes in the code on the developer’s side, as shown below:

           tokenizer = AutoTokenizer.from_pretrained(model)
           model = AutoModelForCausalLM.from_pretrained(model)

           The model parameter is specified as an input for the tokenizer, and model initialization is just the path to the pre-trained DialoGPT model. In this sample, we are using ‘microsoft/DialoGPT-large.’ If you have limited resources, you can use ‘microsoft/DialoGPT-medium’ or ‘microsoft/DialoGPT-small’ models and receive comparable results.

        2. Perform dynamic quantization of the model:

           a. Create the configuration using the default dynamic quantization configuration from the Intel Extension for PyTorch library.
           b. Prepare the model.
           c. Convert the model from FP32 to INT8.

           The steps are explained in detail in the section above.

        3. Response generation: The first step in response generation is to encode the input sentence as shown in the code below:

           new_input_ids = tokenizer.encode(input(">> You:") + tokenizer.eos_token, return_tensors='pt')

           In this sample, we want our model to save history, so we are adding input sentences in the form of tokens to the chat history:

           bot_input_ids = torch.cat([chat_history_ids, new_input_ids], dim=-1) if chat_round > 0 else new_input_ids

           The text generation can be done by the model.generate function, where we can specify all the important parameters like the saved chat history, the length of the response in tokens, and the usage of both Top-K and Top-p sampling.

           chat_history_ids = model.generate(bot_input_ids, do_sample=True, max_length=2000, top_k=50, top_p=0.95, pad_token_id=tokenizer.eos_token_id)

           The last step is to decode and print the response.

        4. Preparation for interactive conversation: After response generation, the last step is to add interaction. This can be done by using a simple for loop. Based on the initialized tokenizer, model, and empty chat history, responses are generated for a number of rounds:

           for chat_round in range(n):
               chat_history_ids = generate_response(
                   tokenizer,
                   model,
                   chat_round,
                   chat_history_ids
               )

           An example of interactive chat generation will look like the one shown in the picture below.

        An example of interactive chat generation

        + +

        What’s Next?

        + +

        Get started with interactive chat-generation models using Intel Extension for PyTorch and DialoGPT. Download and try the Intel AI Analytics Toolkit and Intel Extension for PyTorch for yourself to build various end-to-end AI applications.

        + +

        We encourage you to also check out and incorporate Intel’s other AI/ML Framework optimizations and end-to-end portfolio of tools into your AI workflow and learn about the unified, open, standards-based oneAPI programming model that forms the foundation of Intel’s AI Software Portfolio to help you prepare, build, deploy, and scale your AI solutions.

        + +

        For more details about the new 4th Gen Intel® Xeon® Scalable processors, visit Intel’s AI Solution Platform portal where you can learn how Intel is empowering developers to run end-to-end AI pipelines on these powerful CPUs.

        + +

        Useful resources

        + + + +

        Explore more AI code samples

        + + + +

        See all code samples

        + +
        diff --git a/blog/introducing-accelerated-pytorch-training-on-mac/index.html b/blog/introducing-accelerated-pytorch-training-on-mac/index.html new file mode 100644 index 000000000000..50b2957e03f6 --- /dev/null +++ b/blog/introducing-accelerated-pytorch-training-on-mac/index.html @@ -0,0 +1,675 @@

        Introducing Accelerated PyTorch Training on Mac

        by PyTorch

        In collaboration with the Metal engineering team at Apple, we are excited to announce support for GPU-accelerated PyTorch training on Mac. Until now, PyTorch training on Mac only leveraged the CPU, but with the upcoming PyTorch v1.12 release, developers and researchers can take advantage of Apple silicon GPUs for significantly faster model training. This unlocks the ability to perform machine learning workflows like prototyping and fine-tuning locally, right on Mac.

        + +

        + +

        + +

        Metal Acceleration

        + +

        Accelerated GPU training is enabled using Apple’s Metal Performance Shaders (MPS) as a backend for PyTorch. The MPS backend extends the PyTorch framework, providing scripts and capabilities to set up and run operations on Mac. MPS optimizes compute performance with kernels that are fine-tuned for the unique characteristics of each Metal GPU family. The new device maps machine learning computational graphs and primitives on the MPS Graph framework and tuned kernels provided by MPS.
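
        Using the new back end follows the usual PyTorch device model; a minimal sketch (requires an MPS-enabled build on an Apple silicon Mac):

        import torch

        # torch.backends.mps reports whether the build and machine support MPS.
        device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")

        model = torch.nn.Linear(8, 2).to(device)
        x = torch.randn(4, 8, device=device)
        print(model(x).device)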

        + +

        Training Benefits on Apple Silicon

        + +

        Every Apple silicon Mac has a unified memory architecture, providing the GPU with direct access to the full memory store. This makes Mac a great platform for machine learning, enabling users to train larger networks or batch sizes locally. This reduces costs associated with cloud-based development or the need for additional local GPUs. The Unified Memory architecture also reduces data retrieval latency, improving end-to-end performance.

        + +

        In the graphs below, you can see the performance speedup from accelerated GPU training and evaluation compared to the CPU baseline:

        + +

        + +

        + +

        Accelerated GPU training and evaluation speedups over CPU-only (times faster)

        + +

        Getting Started

        + +

        To get started, just install the latest Preview (Nightly) build on your Apple silicon Mac running macOS 12.3 or later with a native version (arm64) of Python.

        + +

        You can also learn more about Metal and MPS on Apple’s Metal page.

        + +

        * Testing conducted by Apple in April 2022 using production Mac Studio systems with Apple M1 Ultra, 20-core CPU, 64-core GPU 128GB of RAM, and 2TB SSD. Tested with macOS Monterey 12.3, prerelease PyTorch 1.12, ResNet50 (batch size=128), HuggingFace BERT (batch size=64), and VGG16 (batch size=64). Performance tests are conducted using specific computer systems and reflect the approximate performance of Mac Studio.

        + +
        diff --git a/blog/introducing-depyf/index.html b/blog/introducing-depyf/index.html new file mode 100644 index 000000000000..3b8370882fb7 --- /dev/null +++ b/blog/introducing-depyf/index.html @@ -0,0 +1,829 @@

        Introducing depyf: mastering torch.compile with ease

        by Kaichao You

        depyf logo

        + +

        We are thrilled to introduce depyf, a new project to the PyTorch ecosystem designed to help users understand, learn, and adapt to torch.compile!

        + +

        Motivation

        + +

        torch.compile is a cornerstone of PyTorch 2.x, offering a straightforward path to accelerate machine learning workflows with just a single line of code for both training and inference. The mere inclusion of @torch.compile can dramatically enhance the performance of your code. However, identifying the optimal insertion point for torch.compile is not easy, not to mention the complexity of adjusting various knobs for maximum efficiency.

        + +
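
        As a quick illustration (a minimal sketch, not taken from the original post), torch.compile can be applied either as a decorator or as a wrapper around an existing module; the small function and module below are placeholders:

        import torch

        # As a decorator on a function ...
        @torch.compile
        def scaled_tanh(x):
            return torch.tanh(x) * 0.5

        # ... or wrapped around an existing nn.Module.
        model = torch.nn.Linear(16, 16)
        compiled_model = torch.compile(model)

        y = scaled_tanh(torch.randn(8))
        z = compiled_model(torch.randn(4, 16))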

        The intricacies of the torch.compile stack, encompassing Dynamo, AOTAutograd, Inductor, and more, present a steep learning curve. These components, essential for deep learning performance optimization, can be daunting without a solid foundation in the subject.

        + +

        Note: For an introductory example of how torch.compile works, please refer to this walk-through explanation.

        + +

        A common tool: TORCH_COMPILE_DEBUG

        + +

        To demystify torch.compile, the common approach involves leveraging the TORCH_COMPILE_DEBUG environment variable. While it provides more information, deciphering the output remains a formidable task.

        + +

        For example, when we have the following code:

        + +
        # test.py
        +import torch
        +from torch import _dynamo as torchdynamo
        +from typing import List
        +
        +@torch.compile
        +def toy_example(a, b):
        +   x = a / (torch.abs(a) + 1)
        +   if b.sum() < 0:
        +       b = b * -1
        +   return x * b
        +
        +def main():
        +   for _ in range(100):
        +       toy_example(torch.randn(10), torch.randn(10))
        +
        +if __name__ == "__main__":
        +   main()
        +
        + +

        When we run it with TORCH_COMPILE_DEBUG=1 python test.py, we get a directory named torch_compile_debug/run_2024_02_05_23_02_45_552124-pid_9520, under which there are these files:

        + +
        .
        +├── torchdynamo
        +│   └── debug.log
        +└── torchinductor
        +   ├── aot_model___0_debug.log
        +   ├── aot_model___10_debug.log
        +   ├── aot_model___11_debug.log
        +   ├── model__4_inference_10.1
        +   │   ├── fx_graph_readable.py
        +   │   ├── fx_graph_runnable.py
        +   │   ├── fx_graph_transformed.py
        +   │   ├── ir_post_fusion.txt
        +   │   ├── ir_pre_fusion.txt
        +   │   └── output_code.py
        +   ├── model__5_inference_11.2
        +   │   ├── fx_graph_readable.py
        +   │   ├── fx_graph_runnable.py
        +   │   ├── fx_graph_transformed.py
        +   │   ├── ir_post_fusion.txt
        +   │   ├── ir_pre_fusion.txt
        +   │   └── output_code.py
        +   └── model___9.0
        +       ├── fx_graph_readable.py
        +       ├── fx_graph_runnable.py
        +       ├── fx_graph_transformed.py
        +       ├── ir_post_fusion.txt
        +       ├── ir_pre_fusion.txt
        +       └── output_code.py
        +
        + +

        The generated files and logs often raise more questions than they answer, leaving developers puzzled over the meaning and relationships within the data. Common puzzles for TORCH_COMPILE_DEBUG include:

        + +
          +
        • What does model__4_inference_10.1 mean?
        • I have one function but three model__xxx.py files in the directory; what is their correspondence?
        • What is all that LOAD_GLOBAL stuff in debug.log?
        + +

        A better tool: depyf comes to the rescue

        + +

        Let’s see how depyf can help developers resolve the above challenges. To use depyf, simply execute pip install depyf or follow the project page https://github.com/thuml/depyf to install the latest version, and then wrap the main code in a with depyf.prepare_debug block.

        + +
        # test.py
        +import torch
        +from torch import _dynamo as torchdynamo
        +from typing import List
        +
        +@torch.compile
        +def toy_example(a, b):
        +   x = a / (torch.abs(a) + 1)
        +   if b.sum() < 0:
        +       b = b * -1
        +   return x * b
        +
        +def main():
        +   for _ in range(100):
        +       toy_example(torch.randn(10), torch.randn(10))
        +
        +if __name__ == "__main__":
        +   import depyf
        +   with depyf.prepare_debug("depyf_debug_dir"):
        +       main()
        +
        + +

        After executing python test.py, depyf will produce a directory named depyf_debug_dir (the argument of the prepare_debug function). Under the directory, you will find these files:

        + +
        .
        +├── __compiled_fn_0 AFTER POST GRAD 0.py
        +├── __compiled_fn_0 Captured Graph 0.py
        +├── __compiled_fn_0 Forward graph 0.py
        +├── __compiled_fn_0 kernel 0.py
        +├── __compiled_fn_3 AFTER POST GRAD 0.py
        +├── __compiled_fn_3 Captured Graph 0.py
        +├── __compiled_fn_3 Forward graph 0.py
        +├── __compiled_fn_3 kernel 0.py
        +├── __compiled_fn_4 AFTER POST GRAD 0.py
        +├── __compiled_fn_4 Captured Graph 0.py
        +├── __compiled_fn_4 Forward graph 0.py
        +├── __compiled_fn_4 kernel 0.py
        +├── __transformed_code_0_for_torch_dynamo_resume_in_toy_example_at_8.py
        +├── __transformed_code_0_for_toy_example.py
        +├── __transformed_code_1_for_torch_dynamo_resume_in_toy_example_at_8.py
        +└── full_code_for_toy_example_0.py
        +
        + +

        And there are two obvious benefits:

        + +
          +
        1. The long and difficult-to-understand torchdynamo/debug.log is gone. Its content is cleaned up and shown as human-readable source code in full_code_for_xxx.py and __transformed_code_{n}_for_xxx.py. It is worth noting that the most tedious and difficult job of depyf is to decompile the bytecode inside torchdynamo/debug.log into Python source code, freeing developers from the intimidating internals of Python.
        2. The correspondence between function names and computation graphs is preserved. For example, in __transformed_code_0_for_toy_example.py, we can see a function named __compiled_fn_0, and we immediately know its corresponding computation graphs are in __compiled_fn_0_xxx.py, because they share the same __compiled_fn_0 prefix.
        + +

        Starting with full_code_for_xxx.py, and following the functions involved, users will have a clear view of what torch.compile does to their code.

        + +

        One more thing: step-through debuggability

        + +

        Stepping through code line by line with a debugger is a great way to understand how code works. However, under TORCH_COMPILE_DEBUG, those files are only for users’ information and cannot be executed with the data users care about.

        + +

        Note: By “debug”, we mean the process of inspecting and improving a program, rather than correcting buggy code.

        + +

        A standout feature of depyf is its capability to facilitate step-through debugging for torch.compile: all of the files it generates are linked with runtime code objects inside the Python interpreter, and we can set breakpoints in these files. The usage is simple: just add one context manager, with depyf.debug(), and it should do the trick:

        + +
        # test.py
        +import torch
        +from torch import _dynamo as torchdynamo
        +from typing import List
        +
        +@torch.compile
        +def toy_example(a, b):
        +   x = a / (torch.abs(a) + 1)
        +   if b.sum() < 0:
        +       b = b * -1
        +   return x * b
        +
        +def main():
        +   for _ in range(100):
        +       toy_example(torch.randn(10), torch.randn(10))
        +
        +if __name__ == "__main__":
        +   import depyf
        +   with depyf.prepare_debug("depyf_debug_dir"):
        +       main()
        +   with depyf.debug():
        +       main()
        +
        + +

        Just one caveat: the workflow of debugging torch.compile deviates from the standard debugging workflow. With torch.compile, much of the code is dynamically generated. Therefore, we need to:

        + +
          +
        1. Launch the program.
        2. When the program exits the with depyf.prepare_debug("depyf_debug_dir") block, the code will be available in depyf_debug_dir.
        3. When the program enters the with depyf.debug() block, it will automatically set a breakpoint internally, so that the program is paused.
        4. Navigate to depyf_debug_dir to set breakpoints.
        5. Continue to run the code, and debuggers will hit these breakpoints!
        + +

        depyf screenshot

        + +

        Here is a screenshot of what it looks like. All code and tensor variables are live: we can inspect any variable and step through the code, just as in our daily debugging workflow. The only difference is that we are debugging torch.compile generated code rather than human-written code.

        + +

        Conclusion

        + +

        torch.compile serves as an invaluable tool for accelerating PyTorch code effortlessly. However, for those looking to delve deeper into torch.compile, whether to leverage its full potential or to integrate custom operations, the learning curve can be steep. depyf is designed to lower this barrier, offering a user-friendly experience to understand, learn, and adapt to torch.compile.

        + +

        Do explore depyf and experience its benefits firsthand! The project is open-source and readily available at https://github.com/thuml/depyf. Installation is straightforward via pip install depyf. We hope depyf can enhance everyone’s development workflow with torch.compile.

        + +
        diff --git a/blog/introducing-hidet/index.html b/blog/introducing-hidet/index.html
        new file mode 100644
        index 000000000000..2810acbe54cb
        --- /dev/null
        +++ b/blog/introducing-hidet/index.html
        @@ -0,0 +1,762 @@
        Introducing Hidet: A Deep Learning Compiler for Efficient Model Serving | PyTorch

        + by + + Team Hidet + +

        +

        Hidet is a powerful deep learning compiler that simplifies the process of implementing high-performing deep learning operators on modern accelerators (e.g., NVIDIA GPUs). With the new torch.compile(...) feature in PyTorch 2.0, integrating a novel compiler into PyTorch is easier than ever. Hidet can now be used as a torch.compile(...) backend to accelerate PyTorch models, making it an attractive option for PyTorch users who want to improve the inference performance of their models, especially those who also need to implement extremely optimized custom operators.

        + +

        Using Hidet to Compile A PyTorch Model

        + +

        To use Hidet in PyTorch, you need to first install the hidet package via pip:

        + +
        pip install hidet
        +
        + +

        Hidet is integrated with PyTorch as a torch.compile(...) backend following the Custom Backends tutorial. You can specify hidet as the backend when you compile a model. (Note: requires PyTorch version 2.0+):

        + +
        torch.compile(..., backend='hidet')
        +
        + +

        Hidet converts the given PyTorch model in the torch.fx.Graph format into its internal graph representation, and conducts a series of optimizations. Hidet provides a few options to configure the optimizations. For example, we can use hidet.torch.dynamo_config.use_tensor_core(True) to allow Hidet to generate CUDA kernels that leverage the Tensor Cores on NVIDIA GPUs, and use hidet.torch.dynamo_config.search_space(2) to allow Hidet to search for the best operator schedule specific for your hardware and input sizes. More configurations can be found in Hidet’s documentation.

        + +

        Here’s a complete example of how to use Hidet to compile and optimize a pre-trained ResNet50 model from torchvision:

        + +
        import hidet
        +import torch
        +
        +# Prepare a dummy half-precision input and load a pre-trained ResNet50 model
        +x = torch.randn(1, 3, 224, 224, device='cuda').half()
        +model = torch.hub.load(
        +    'pytorch/vision:v0.6.0', 'resnet50', pretrained=True
        +).cuda().half().eval()
        +
        +# Configure hidet to use tensor core and enable tuning
        +hidet.torch.dynamo_config.use_tensor_core(True)
        +hidet.torch.dynamo_config.search_space(2) 
        +
        +# Compile the model using Hidet
        +model_opt = torch.compile(model, backend='hidet')
        +
        +# Check correctness
        +torch.testing.assert_close(actual=model_opt(x), expected=model(x), rtol=1e-2, atol=1e-2)
        +
        +# Benchmark
        +from hidet.utils import benchmark_func
        +print('eager: {:2f}'.format(benchmark_func(lambda: model(x))))
        +print('hidet: {:2f}'.format(benchmark_func(lambda: model_opt(x))))
        +
        + +

        We encourage you to try out the above script on your own NVIDIA GPU(s)! If you run this script on an AWS g5.2xlarge instance, you would get the result shown in the following figure. Hidet achieves the speedup because it can automatically fuse multiple operators, tune operator schedules, and use CUDA Graphs to reduce framework-level overhead. More results can be found in the ASPLOS’23 publication of Hidet and our performance tracking.

        + +

        Eager vs Hidet latency

        + +

        Using Hidet Script to Write Custom Operators

        + +

        Hidet Script is one approach to implementing tensor operators in Python. The following example shows how to implement a naive matrix multiplication using Hidet Script and integrate it as a PyTorch operator.

        + +
        import torch
        +import hidet
        +
        +
        +def matmul(m_size, n_size, k_size):
        +    from hidet.lang import f32, attr
        +    from hidet.lang.cuda import threadIdx, blockIdx, blockDim
        +
        +    with hidet.script_module() as script_module:
        +        @hidet.script
        +        def matmul(
        +            a: f32[m_size, k_size],
        +            b: f32[k_size, n_size],
        +            c: f32[m_size, n_size]
        +        ):
        +            attr.cuda_grid_dim = ((m_size + 31) // 32, (n_size + 31) // 32)
        +            attr.cuda_block_dim = (32, 32)
        +            i = threadIdx.x + blockIdx.x * blockDim.x
        +            j = threadIdx.y + blockIdx.y * blockDim.y
        +            if i < m_size and j < n_size:
        +                c[i, j] = 0.0
        +                for k in range(k_size):
        +                    c[i, j] += a[i, k] * b[k, j]
        +
        +    ir_module = script_module.ir_module()
        +    func = hidet.driver.build_ir_module(ir_module)
        +    return func
        +
        +
        +class NaiveMatmul(torch.autograd.Function):
        +    @staticmethod
        +    def forward(ctx, a, b):
        +        m, k = a.shape
        +        k, n = b.shape
        +        c = torch.empty([m, n], dtype=a.dtype, device=a.device)
        +        func = matmul(m, n, k)
        +        func(a, b, c)
        +        return c
        +
        +
        +a = torch.randn([3, 4], device='cuda')
        +b = torch.randn([4, 5], device='cuda')
        +c = NaiveMatmul.apply(a, b)
        +cc = torch.matmul(a, b)
        +torch.testing.assert_close(c, cc)
        +
        + +
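
        One practical tweak, sketched below as a suggestion rather than part of the original example: since matmul(m_size, n_size, k_size) builds and compiles a kernel each time it is called, memoizing it by shape avoids recompiling on every forward call with the same sizes.

        from functools import lru_cache

        # Memoize the build step so repeated calls with the same (m, n, k) reuse the
        # already-compiled kernel instead of rebuilding it.
        @lru_cache(maxsize=None)
        def cached_matmul(m_size: int, n_size: int, k_size: int):
            return matmul(m_size, n_size, k_size)  # `matmul` is the builder defined above

        Inside NaiveMatmul.forward, calling cached_matmul(m, n, k) instead of matmul(m, n, k) would then reuse the compiled kernel.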

        More optimizations can be applied, see the example in our documentation to learn more.

        + +

        Hidet Script vs. Triton: Triton greatly simplifies CUDA programming by introducing a tile-based programming model where the parallel execution unit is thread blocks instead of threads. However, this simplification also prevents tensor program developers from manipulating fine-grained computation and memory resources (e.g., warps, shared memory) in their preferred ways. It would be challenging to implement an optimization that requires fine-grained control of these resources using Triton if it has not been implemented by the Triton compiler itself. Hidet Script, on the other hand, simplifies tensor programming while still enabling users to implement their own optimizations with extensive flexibility. It’s worth noting that the more granular control of Hidet Script also brings added complexity compared to Triton.

        + +

        More about Hidet

        + +

        Hidet originates from a research project led by the EcoSystem lab at the University of Toronto (UofT) and AWS. The authors propose a new way, named the task-mapping programming paradigm, to construct tensor programs. It aims to simplify tensor programming without sacrificing any optimization opportunity. Now, Hidet is an open-source project, jointly supported by CentML and the EcoSystem lab, that aims to provide an efficient solution for end-to-end inference on modern accelerators (e.g., NVIDIA GPUs).

        + +

        Additional Resources

        + + + +

        Acknowledgement

        + +

        We would like to thank Jerry Park, Mark Saroufim, Jason Liang and Helen Suk for their valuable help in preparing the blog post and feedback on the text. We would also like to thank Nikita Shulga, Jason Ansel, and Dmytro Dzhulgakov for reviewing and improving our PR https://github.com/pytorch/pytorch/pull/93873 on the 3rd-party dynamo backend registration.

        + +
        diff --git a/blog/introducing-nvfuser-a-deep-learning-compiler-for-pytorch/index.html b/blog/introducing-nvfuser-a-deep-learning-compiler-for-pytorch/index.html
        new file mode 100644
        index 000000000000..a8c0cdeb8a1d
        --- /dev/null
        +++ b/blog/introducing-nvfuser-a-deep-learning-compiler-for-pytorch/index.html
        @@ -0,0 +1,747 @@
        Introducing nvFuser, a deep learning compiler for PyTorch | PyTorch

        + by + + Christian Sarofeen, Piotr Bialecki, Jie Jiang, Kevin Stephano, Masaki Kozuki, Neal Vaidya, Stas Bekman + +

        +

        nvFuser is a Deep Learning Compiler for NVIDIA GPUs that automatically just-in-time compiles fast and flexible kernels to reliably accelerate users’ networks. It provides significant speedups for deep learning networks running on Volta and later CUDA accelerators by generating fast custom “fusion” kernels at runtime. nvFuser is specifically designed to meet the unique requirements of the PyTorch community, and it supports diverse network architectures and programs with dynamic inputs of varying shapes and strides.

        In this blog post we’ll describe nvFuser and how it’s used today, show the significant performance improvements it can obtain on models from HuggingFace and TIMM, and look ahead to nvFuser in PyTorch 1.13 and beyond. If you would like to know more about how and why fusion improves the speed of training for Deep Learning networks, please see our previous talks on nvFuser from GTC 2022 and GTC 2021.

        nvFuser relies on a graph representation of PyTorch operations to optimize and accelerate. Since PyTorch has an eager execution model, the PyTorch operations users are running are not directly accessible as a whole program that can be optimized by a system like nvFuser. Therefore, users must utilize systems built on top of nvFuser which are capable of capturing user programs and translating them into a form that is optimizable by nvFuser. These higher-level systems then pass the captured operations to nvFuser, so that nvFuser can optimize the execution of the user’s script for NVIDIA GPUs. There are three systems that capture, translate, and pass user programs to nvFuser for optimization:

        + +
          +
        • TorchScript jit.script
          • This system directly parses sections of an annotated Python script and translates them into its own representation of what the user is doing. It then applies its own version of auto differentiation to the graph, and passes sections of the subsequent forward and backward graphs to nvFuser for optimization (see the sketch after this list).
        • FuncTorch
          • This system doesn’t directly look at the user’s Python script; instead it inserts a mechanism that captures PyTorch operations as they’re being run. We refer to this type of capture system as “trace program acquisition”, since we’re tracing what has been performed. FuncTorch doesn’t perform its own auto differentiation; it simply traces PyTorch’s autograd directly to get backward graphs.
        • TorchDynamo
          • TorchDynamo is another program acquisition mechanism built on top of FuncTorch. TorchDynamo parses the Python bytecode produced from the user script in order to select portions to trace with FuncTorch. The benefit of TorchDynamo is that it’s able to apply decorators to a user’s script, effectively isolating what should be sent to FuncTorch, making it easier for FuncTorch to successfully trace complex Python scripts.
        + +
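
        As an illustration of the first path (a minimal sketch, not code from the original post), a pointwise function can be captured with torch.jit.script and run on a CUDA GPU; TorchScript’s GPU fuser (nvFuser, depending on the PyTorch version and its configuration) then generates fused kernels for the scripted region after a few warm-up iterations:

        import torch

        def bias_gelu(x: torch.Tensor, bias: torch.Tensor) -> torch.Tensor:
            y = x + bias
            return y * 0.5 * (1.0 + torch.erf(y * 0.70710678))

        # TorchScript parses the function into a graph that the fuser can optimize.
        scripted = torch.jit.script(bias_gelu)

        x = torch.randn(1024, 1024, device="cuda", requires_grad=True)
        bias = torch.randn(1024, device="cuda", requires_grad=True)

        # The first iterations trigger JIT compilation; later iterations reuse the fused kernels.
        for _ in range(3):
            out = scripted(x, bias)
            out.sum().backward()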

        These systems are available for users to interact with directly while nvFuser automatically and seamlessly optimizes performance-critical regions of the user’s code. These systems automatically send parsed user programs to nvFuser so that nvFuser can:

        + +
          +
        1. Analyze the operations being run on GPUs
        2. Plan parallelization and optimization strategies for those operations
        3. Apply those strategies in generated GPU code
        4. Runtime-compile the generated optimized GPU functions
        5. Execute those CUDA kernels on subsequent iterations
        + +

        It is important to note that nvFuser does not yet support all PyTorch operations, and there are still some scenarios, discussed herein, that are actively being improved in nvFuser. However, nvFuser does support many performance-critical DL operations today, and the number of supported operations will grow in subsequent PyTorch releases. nvFuser is capable of generating highly specialized and optimized GPU functions for the operations it does support. This means nvFuser is able to power new PyTorch systems like TorchDynamo and FuncTorch to combine the flexibility PyTorch is known for with unbeatable performance.

        + +

        nvFuser Performance

        + +

        Before getting into how to use nvFuser, in this section we’ll show the improvements in training speed nvFuser provides for a variety of models from the HuggingFace Transformers and PyTorch Image Models (TIMM) repositories, and we will discuss current gaps in nvFuser performance that are under development today. All performance numbers in this section were taken using an NVIDIA A100 40GB GPU, and used either FuncTorch alone or FuncTorch with TorchDynamo.

        + +

        HuggingFace Transformer Benchmarks

        + +

        nvFuser can dramatically accelerate training of HuggingFace Transformers when combined with another important optimization (more on that in a moment). Performance improvements can be seen in Figure 1 to range between 1.12x and 1.50x across a subset of popular HuggingFace Transformer networks.

        + +

        + +

        + +

        +Figure 1: Performance gains of 8 training scenarios from HuggingFace’s Transformer repository. First performance boost in the dark green is due to replacing the optimizer with an NVIDIA Apex fused AdamW optimizer. The light green is due to adding nvFuser. Models were run with batch size and sequence lengths of [64, 128], [8, 512], [2, 1024], [64, 128], [8, 512], [8, src_seql=512, tgt_seql=128], [8, src_seql=1024, tgt_seql=128], and [8, 512] respectively. All networks were run with Automatic Mixed Precision (AMP) enabled with dtype=float16. +

        + +

        While these speedups are significant, it’s important to understand that nvFuser doesn’t (yet) automate everything about running networks quickly. For HuggingFace Transformers, for example, it was important to use the AdamW fused optimizer from NVIDIA’s Apex repository, as the optimizer otherwise consumed a large portion of runtime. Using the fused AdamW optimizer to make the network faster exposes the next major performance bottleneck: memory-bound operations. These operations are optimized by nvFuser, providing another large performance boost. With the fused optimizer and nvFuser enabled, the training speed of these networks improved between 1.12x and 1.5x.

        HuggingFace Transformer models were run with the torch.amp module. (“amp” stands for Automated Mixed Precision, see the “What Every User Should Know about Mixed Precision in PyTorch” blog post for details.) An option to use nvFuser was added to HuggingFace’s Trainer. If you have TorchDynamo installed, you can activate it to enable nvFuser in HuggingFace by passing torchdynamo='nvfuser' to the Trainer class.

        nvFuser has great support for normalization kernels and related fusions frequently found in Natural Language Processing (NLP) models, and it is recommended that users try nvFuser in their NLP workloads.

        + +
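
        For reference, here is a hedged sketch of that option; it assumes a transformers release in which the torchdynamo setting mentioned above is exposed through TrainingArguments, and it requires a CUDA GPU for fp16. The surrounding Trainer setup is whatever your training script already uses:

        from transformers import TrainingArguments

        # Assumes a transformers version that exposes the `torchdynamo` option and a CUDA GPU
        # (fp16 mixed precision requires one). The remaining Trainer setup is unchanged.
        training_args = TrainingArguments(
            output_dir="outputs",
            fp16=True,                 # run with AMP, as in the benchmarks above
            torchdynamo="nvfuser",     # capture with TorchDynamo and fuse with nvFuser
        )
        # trainer = Trainer(model=model, args=training_args, train_dataset=train_dataset, ...)
        # trainer.train()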

        PyTorch Image Models (TIMM) Benchmarks

        +

        nvFuser can also significantly reduce the training time of TIMM networks, up to over 1.3x vs. eager PyTorch, and up to 1.44x vs. eager PyTorch when combined with the torch.amp module. Figure 1 shows nvFuser’s speedup without torch.amp, and when torch.amp is used with the NHWC (“channels last”) and NCHW (“channels first”) formats. nvFuser is integrated in TIMM through FuncTorch tracing directly (without TorchDynamo) and can be used by adding the --aot-autograd command line argument when running the TIMM benchmark or training script.

        + +

        + +

        + +

        +Figure 1: The Y-axis is the performance gain nvFuser provides over not using nvFuser. A value of 1.0 means no change in perf, 2.0 would mean nvFuser is twice as fast, 0.5 would mean nvFuser takes twice the time to run. Square markers are with float16 Automatic Mixed Precision (AMP) and channels first contiguous inputs, circle markers are float32 inputs, and triangles are with float16 AMP and channels last contiguous inputs. Missing data points are due to an error being encountered when tracing. +

        + +

        When running with float32 precision nvFuser provides a 1.12x geometric mean (“geomean”) speedup on TIMM networks, and when running with torch.amp and “channels first” it provides a 1.14x geomean speedup. However, nvFuser currently doesn’t speed up torch.amp and “channels last” training (a 0.9x geomean regression), so we recommend not using it in those cases. We are actively working on improving “channels last” performance now, and soon we will have two additional optimization strategies (grid persistent optimizations for channels-last normalizations and fast transposes) which we expect will provide speedups comparable to “channels first” in PyTorch version 1.13 and later. Many of nvFuser’s optimizations can also help in inference cases. However, in PyTorch when running inference on small batch sizes, the performance is typically limited by CPU overhead, which nvFuser can’t completely remove or fix. Therefore, typically the most important optimization for inference is to enable CUDA Graphs when possible. Once CUDA Graphs is enabled, it can also be beneficial to enable fusion through nvFuser. Performance of inference is shown in Figure 2 and Figure 3. Inference is only run with float16 AMP as it is uncommon to run inference workloads in full float32 precision.

        + +

        + +

        + +

        + +

        + +

        +Figure 2: Performance gains of enabling CUDA Graphs, and CUDA Graphs with nvFuser compared to the performance of native PyTorch without CUDA Graphs and nvFuser across TIMM models with float16 AMP, channels first inputs, and a batch size of 1 and 8 respectively. There is a geomean speedup of 2.74x with CUDA Graphs and 2.71x with CUDA Graphs + nvFuser respectively. nvFuser provides a maximum regression of 0.68x and a maximum performance gain of 2.74x (relative to CUDA Graphs without nvFuser). Performance gain is measured relative to the average time per iteration PyTorch takes without CUDA Graphs and without nvFuser. Models are sorted by how much additional performance nvFuser is providing. +

        + +

        + +

        + +

        + +

        + +

        +Figure 3: Performance gains of enabling CUDA Graphs, and CUDA Graphs with nvFuser compared to the performance of native PyTorch without CUDA Graphs and nvFuser across TIMM models with AMP, channels last inputs, and a batch size of 1 and 8 respectively. There is a geomean speedup of 2.29x with CUDA Graphs and 2.95x with CUDA Graphs + nvFuser respectively. nvFuser provides a maximum regression of 0.86x and a maximum performance gain of 3.82x (relative to CUDA Graphs without nvFuser). Performance gain is measured relative to the average time per iteration PyTorch takes without CUDA Graphs and without nvFuser. Models are sorted by how much additional performance nvFuser is providing. +

        + +

        So far nvFuser performance has not been tuned for inference workloads, so its performance benefit is not consistent across all cases. However, there are still many models that benefit significantly from nvFuser during inference, and we encourage users to try nvFuser in inference workloads to see if they would benefit today. Performance of nvFuser in inference workloads will improve in the future, and if you’re interested in nvFuser in inference workloads please reach out to us on the PyTorch forums.

        + +

        Getting Started - Accelerate Your Scripts with nvFuser

        + +

        We’ve created a tutorial demonstrating how to take advantage of nvFuser to accelerate part of a standard transformer block, and how nvFuser can be used to define fast and novel operations. There are still some rough edges in nvFuser that we’re working hard on improving, as we’ve outlined in this blog post. However, we’ve also demonstrated some great improvements for training speed on multiple networks in HuggingFace and TIMM, and we expect there are opportunities in your networks where nvFuser can help today, and many more opportunities where it will help in the future.

        If you would like to learn more about nvFuser, we recommend watching our presentations from NVIDIA’s GTC conference: GTC 2022 and GTC 2021.

        + +
        diff --git a/blog/introducing-pytorch-fully-sharded-data-parallel-api/index.html b/blog/introducing-pytorch-fully-sharded-data-parallel-api/index.html
        new file mode 100644
        index 000000000000..89a38f358e95
        --- /dev/null
        +++ b/blog/introducing-pytorch-fully-sharded-data-parallel-api/index.html
        @@ -0,0 +1,809 @@
        Introducing PyTorch Fully Sharded Data Parallel (FSDP) API | PyTorch

        + by + + Yanli Zhao, Rohan Varma, Chien-Chin Huang, Shen Li, Min Xu, Alban Desmaison + +

        +

        Recent studies have shown that large model training will be beneficial for improving model quality. During the last 3 years, model size grew 10,000 times, from BERT with 110M parameters to Megatron-2 with one trillion parameters. However, training large AI models is not easy: aside from the need for large amounts of computing resources, software engineering complexity is also challenging. PyTorch has been working on building tools and infrastructure to make it easier.

        + +

        PyTorch Distributed data parallelism is a staple of scalable deep learning because of its robustness and simplicity. It however requires the model to fit on one GPU. Recent approaches like DeepSpeed ZeRO and FairScale’s Fully Sharded Data Parallel allow us to break this barrier by sharding a model’s parameters, gradients and optimizer states across data parallel workers while still maintaining the simplicity of data parallelism.

        + +

        With PyTorch 1.11 we’re adding native support for Fully Sharded Data Parallel (FSDP), currently available as a prototype feature. Its implementation heavily borrows from FairScale’s version while bringing more streamlined APIs and additional performance improvements.

        + +

        Scaling tests of PyTorch FSDP on AWS show it can scale up to train dense models with 1T parameters. Realized performance in our experiments reached 84 TFLOPS per A100 GPU for GPT 1T model and 159 TFLOPS per A100 GPU for GPT 175B model on AWS cluster. Native FSDP implementation also dramatically improved model initialization time compared to FairScale’s original when CPU offloading was enabled.

        + +

        In future PyTorch versions, we’re going to enable users to seamlessly switch between DDP, ZeRO-1, ZeRO-2 and FSDP flavors of data parallelism, so that users can train different scales of models with simple configurations in the unified API.

        + +

        How FSDP Works

        + +

        FSDP is a type of data-parallel training, but unlike traditional data-parallel, which maintains a per-GPU copy of a model’s parameters, gradients and optimizer states, it shards all of these states across data-parallel workers and can optionally offload the sharded model parameters to CPUs.

        + +

        The figure below shows how FSDP works for 2 data-parallel processes:

        + +

        + +

        + +

        +Figure 1. FSDP workflow +

        + +

        Usually, model layers are wrapped with FSDP in a nested way, so that only layers in a single FSDP instance need to gather the full parameters to a single device during forward or backward computations. The gathered full parameters will be freed immediately after computation, and the freed memory can be used for the next layer’s computation. In this way, peak GPU memory could be saved and thus training can be scaled to use a larger model size or larger batch size. To further maximize memory efficiency, FSDP can offload the parameters, gradients and optimizer states to CPUs when the instance is not active in the computation.

        + +
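
        To make this flow concrete, below is a comment-only pseudocode sketch of one training step for a single FSDP-wrapped unit; it paraphrases the description above and is not the actual implementation:

        # Each rank permanently stores only its shard of the unit's parameters,
        # gradients and optimizer state.
        #
        # Forward:
        #   all-gather the parameter shards  -> materialize the full parameters for this unit
        #   run the unit's forward pass
        #   free the full parameters         -> only the local shard stays in GPU memory
        #
        # Backward:
        #   all-gather the parameter shards  -> materialize the full parameters again
        #   run the unit's backward pass
        #   reduce-scatter the gradients     -> each rank keeps only its gradient shard
        #   free the full parameters
        #
        # Optimizer step:
        #   update the local parameter shard (and its optimizer state shard) only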

        Using FSDP in PyTorch

        + +

        There are two ways to wrap a model with PyTorch FSDP. Auto wrapping is a drop-in replacement for DDP; manual wrapping needs only minimal changes to the model definition code, with the ability to explore complex sharding strategies.

        + +

        Auto Wrapping

        + +

        Model layers should be wrapped in FSDP in a nested way to save peak memory and enable communication and computation overlapping. The simplest way to do it is auto wrapping, which can serve as a drop-in replacement for DDP without changing the rest of the code.

        + +

        The fsdp_auto_wrap_policy argument allows specifying a callable function to recursively wrap layers with FSDP. The default_auto_wrap_policy function provided by PyTorch FSDP recursively wraps layers whose number of parameters is larger than 100M. You can supply your own wrapping policy as needed. An example of writing a customized wrapping policy is shown in the FSDP API doc.

        + +
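
        As an illustrative sketch (an assumption, to be checked against the FSDP API doc for your release), the threshold can be changed by partially applying the default policy, assuming it accepts a min_num_params keyword:

        import functools
        from torch.distributed.fsdp.wrap import default_auto_wrap_policy

        # Assumption: default_auto_wrap_policy takes a `min_num_params` keyword, as in the
        # FSDP API doc referenced above. This lowers the wrapping threshold to 20M parameters.
        my_auto_wrap_policy = functools.partial(default_auto_wrap_policy, min_num_params=int(2e7))

        # Passed in place of default_auto_wrap_policy:
        #   FullyShardedDataParallel(model(), fsdp_auto_wrap_policy=my_auto_wrap_policy, ...)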

        In addition, cpu_offload could be configured optionally to offload wrapped parameters to CPUs when these parameters are not used in computation. This can further improve memory efficiency at the cost of data transfer overhead between host and device.

        + +

        The example below shows how FSDP is wrapped using auto wrapping.

        + +
        from torch.distributed.fsdp import (
        +   FullyShardedDataParallel,
        +   CPUOffload,
        +)
        +from torch.distributed.fsdp.wrap import (
        +   default_auto_wrap_policy,
        +)
        +import torch.nn as nn
        + 
        +class model(nn.Module):
        +   def __init__(self):
        +       super().__init__()
        +       self.layer1 = nn.Linear(8, 4)
        +       self.layer2 = nn.Linear(4, 16)
        +       self.layer3 = nn.Linear(16, 4)
        + 
        +# Wrap the model with FSDP (instead of DistributedDataParallel):
        +fsdp_model = FullyShardedDataParallel(
        +   model(),
        +   fsdp_auto_wrap_policy=default_auto_wrap_policy,
        +   cpu_offload=CPUOffload(offload_params=True),
        +)
        +
        + +

        Manual Wrapping

        + +

        Manual wrapping can be useful to explore complex sharding strategies by applying wrap selectively to some parts of the model. Overall settings can be passed to the enable_wrap() context manager.

        + +
        from torch.distributed.fsdp import (
        +   FullyShardedDataParallel,
        +   CPUOffload,
        +)
        +from torch.distributed.fsdp.wrap import (
        +   enable_wrap,
        +   wrap,
        +)
        +import torch.nn as nn
        + 
        + 
        +class model(nn.Module):
        +   def __init__(self):
        +       super().__init__()
        +       self.layer1 = wrap(nn.Linear(8, 4))
        +       self.layer2 = nn.Linear(4, 16)
        +       self.layer3 = wrap(nn.Linear(16, 4))
        + 
        +wrapper_kwargs = dict(cpu_offload=CPUOffload(offload_params=True))
        +with enable_wrap(wrapper_cls=FullyShardedDataParallel, **wrapper_kwargs):
        +   fsdp_model = wrap(model())
        +
        + +

        After wrapping the model with FSDP using one of the two above approaches, the model can be trained in a similar way as local training, like this:

        + +
        optim = torch.optim.Adam(fsdp_model.parameters(), lr=0.0001)
        +for sample, label in next_batch():
        +  out = fsdp_model(sample)
        +  loss = criterion(out, label)
        +  loss.backward()
        +  optim.step()
        +
        + +
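
        One practical note, shown as a minimal sketch under the assumption of a single-node launch via torchrun: FSDP builds on torch.distributed, so a process group must be initialized before the wrapped model and the loop above are created.

        import os
        import torch
        import torch.distributed as dist

        # Launched with, e.g.:  torchrun --nproc_per_node=8 train.py
        dist.init_process_group(backend="nccl")
        local_rank = int(os.environ["LOCAL_RANK"])
        torch.cuda.set_device(local_rank)
        # ... build the FSDP-wrapped model and run the training loop shown above ...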

        Benchmark Results

        + +

        We ran extensive scaling tests for 175B and 1T GPT models on AWS clusters using PyTorch FSDP. Each cluster node is an instance with 8 NVIDIA A100-SXM4-40GB GPUs, and nodes are interconnected via AWS Elastic Fabric Adapter (EFA) with 400 Gbps network bandwidth.

        + +

        GPT models are implemented using minGPT. A randomly generated input dataset is used for benchmarking purposes. All experiments ran with 50K vocabulary size, fp16 precision and SGD optimizer.

        + + + + + + + + + + + + + + + + + + + + + + + + + + + +
        Model      Number of layers   Hidden size   Attention heads   Model size (billions of parameters)
        GPT 175B   96                 12288         96                175
        GPT 1T     128                25600         160               1008
        + +

        In addition to using FSDP with parameters CPU offloading in the experiments, the activation checkpointing feature in PyTorch is also applied in the tests.

        + +

        The maximum per-GPU throughput of 159 teraFLOP/s (51% of the NVIDIA A100 peak theoretical performance of 312 teraFLOP/s per GPU) is achieved with batch size 20 and sequence length 512 on 128 GPUs for the GPT 175B model; further increasing the number of GPUs leads to per-GPU throughput degradation because of growing communication between the nodes.

        + +

        For the GPT 1T model, the maximum per-GPU throughput of 84 teraFLOP/s (27% of the peak teraFLOP/s) is achieved with batch size 4 and sequence length 2048 on 128 GPUs. However, further increase of the number of GPUs doesn’t affect the per-GPU throughput too much because we observed that the largest bottleneck in the 1T model training is not from communication but from the slow CUDA cache allocator when peak GPU memory is reaching the limit. The use of A100 80G GPUs with larger memory capacity will mostly resolve this issue and also help scale the batch size to achieve much larger throughput.

        + +

        + +

        + +

        + +

        + +

        Future Work

        + +

        In the next beta release, we are planning to add efficient distributed model/states checkpointing APIs, meta device support for large model materialization, and mixed-precision support inside FSDP computation and communication. We’re also going to make it easier to switch between DDP, ZeRO1, ZeRO2 and FSDP flavors of data parallelism in the new API. To further improve FSDP performance, memory fragmentation reduction and communication efficiency improvements are also planned.

        + +

        A Bit of History of 2 Versions of FSDP

        + +

        FairScale FSDP was released in early 2021 as part of the FairScale library. And then we started the effort to upstream FairScale FSDP to PyTorch in PT 1.11, making it production-ready. We have selectively upstreamed and refactored key features from FairScale FSDP, redesigned user interfaces and made performance improvements.

        + +

        In the near future, FairScale FSDP will stay in the FairScale repository for research projects, while generic and widely adopted features will be upstreamed to PyTorch incrementally and hardened accordingly.

        + +

        Meanwhile, PyTorch FSDP will focus more on production readiness and long-term support. This includes better integration with ecosystems and improvements on performance, usability, reliability, debuggability and composability.

        + +

        Acknowledgments

        + +

        We would like to thank the authors of FairScale FSDP: Myle Ott, Sam Shleifer, Min Xu, Priya Goyal, Quentin Duval, Vittorio Caggiano, Tingting Markstrum, Anjali Sridhar. Thanks to the Microsoft DeepSpeed ZeRO team for developing and popularizing sharded data parallel techniques. Thanks to Pavel Belevich, Jessica Choi, Sisil Mehta for running experiments using PyTorch FSDP on different clusters. Thanks to Geeta Chauhan, Mahesh Yadav, Pritam Damania, Dmytro Dzhulgakov for supporting this effort and insightful discussions.

        + +
        diff --git a/blog/introducing-pytorch-profiler-the-new-and-improved-performance-tool/index.html b/blog/introducing-pytorch-profiler-the-new-and-improved-performance-tool/index.html
        new file mode 100644
        index 000000000000..6d5eab4c5706
        --- /dev/null
        +++ b/blog/introducing-pytorch-profiler-the-new-and-improved-performance-tool/index.html
        @@ -0,0 +1,704 @@
        Introducing PyTorch Profiler - the new and improved performance tool | PyTorch

        + by + + Maxim Lukiyanov - Principal PM at Microsoft, Guoliang Hua - Principal Engineering Manager at Microsoft, Geeta Chauhan - Partner Engineering Lead at Facebook, Gisle Dankel - Tech Lead at Facebook + +

        +

        Along with the PyTorch 1.8.1 release, we are excited to announce PyTorch Profiler – the new and improved performance debugging profiler for PyTorch. Developed as part of a collaboration between Microsoft and Facebook, the PyTorch Profiler is an open-source tool that enables accurate and efficient performance analysis and troubleshooting for large-scale deep learning models.

        + +

        Analyzing and improving large-scale deep learning model performance is an ongoing challenge that grows in importance as model sizes increase. For a long time, PyTorch users had a hard time solving this challenge due to the lack of available tools. There were standard performance debugging tools that provided GPU hardware-level information but missed PyTorch-specific context of operations. In order to recover the missing information, users needed to combine multiple tools or manually add minimum correlation information to make sense of the data. There was also the autograd profiler (torch.autograd.profiler), which can capture information about PyTorch operations but does not capture detailed GPU hardware-level information and cannot provide support for visualization.

        + +

        The new PyTorch Profiler (torch.profiler) is a tool that brings both types of information together and builds an experience that realizes the full potential of that information. This new profiler collects both GPU hardware and PyTorch-related information, correlates them, performs automatic detection of bottlenecks in the model, and generates recommendations on how to resolve these bottlenecks. All of this information from the profiler is visualized for the user in TensorBoard. The new Profiler API is natively supported in PyTorch and delivers the simplest experience available to date, where users can profile their models without installing any additional packages and see results immediately in TensorBoard with the new PyTorch Profiler plugin. Below is a screenshot of PyTorch Profiler - automatic bottleneck detection.

        + +
        + +
        + +

        Getting started

        + +

        PyTorch Profiler is the next version of the PyTorch autograd profiler. It has a new module namespace torch.profiler but maintains compatibility with autograd profiler APIs. The Profiler uses a new GPU profiling engine, built using NVIDIA CUPTI APIs, and is able to capture GPU kernel events with high fidelity. To profile your model training loop, wrap the code in the profiler context manager as shown below.

        + +
         with torch.profiler.profile(
        +    schedule=torch.profiler.schedule(
        +        wait=2,      # idle for the first 2 steps of each cycle
        +        warmup=2,    # start the profiler but discard these warm-up steps
        +        active=6,    # record these 6 steps
        +        repeat=1),   # profile one such cycle
        +    on_trace_ready=torch.profiler.tensorboard_trace_handler('./logs'),  # save traces for TensorBoard (any log directory)
        +    with_stack=True
        +) as profiler:
        +    for step, data in enumerate(trainloader, 0):
        +        print("step:{}".format(step))
        +        inputs, labels = data[0].to(device=device), data[1].to(device=device)
        +
        +        outputs = model(inputs)
        +        loss = criterion(outputs, labels)
        +
        +        optimizer.zero_grad()
        +        loss.backward()
        +        optimizer.step()
        +        profiler.step()
        +
        +

        The schedule parameter allows you to limit the number of training steps included in the profile to reduce the amount of data collected and simplify visual analysis by focusing on what’s important. The tensorboard_trace_handler automatically saves profiling results to disk for analysis in TensorBoard.

        + +
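
        In addition to the TensorBoard traces, the profiler object collected above can also print an aggregated text summary to the console; this small snippet is an illustrative addition, not part of the original walkthrough:

        # After the profiling run above, `profiler` holds the collected events;
        # key_averages() aggregates them and table() renders a sorted text summary.
        print(profiler.key_averages().table(sort_by="self_cuda_time_total", row_limit=10))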

        To view results of the profiling session in TensorBoard, install PyTorch Profiler TensorBoard Plugin package.

        + +
        pip install torch_tb_profiler
        +
        +

        Visual Studio Code Integration

        +

        Microsoft Visual Studio Code is one of the most popular code editors for Python developers and data scientists. The Python extension for VS Code recently added the integration of TensorBoard into the code editor, including support for the PyTorch Profiler. Once you have VS Code and the Python extension installed, you can quickly open the TensorBoard Profiler plugin by launching the Command Palette using the keyboard shortcut CTRL + SHIFT + P (CMD + SHIFT + P on a Mac) and typing the “Launch TensorBoard” command.

        + +
        + +
        + +

        This integration comes with a built-in lifecycle management feature. VS Code will install the TensorBoard package and the PyTorch Profiler plugin package (coming in mid-April) automatically if you don’t have them on your system. VS Code will also launch the TensorBoard process for you and automatically look for any TensorBoard log files within your current directory. When you’re done, just close the tab and VS Code will automatically close the process. No more Terminal windows running on your system to provide a backend for the TensorBoard UI! Below is the PyTorch Profiler Trace View running in TensorBoard.

        + +
        + +
        + +

        Learn more about TensorBoard support in VS Code in this blog.

        + +

        Feedback

        + +

        Review the PyTorch Profiler documentation, give the Profiler a try, and let us know about your experience. Provide your feedback on the PyTorch Discussion Forum or file issues on PyTorch GitHub.

        + + +
        diff --git a/blog/introducing-the-playtorch-app/index.html b/blog/introducing-the-playtorch-app/index.html
        new file mode 100644
        index 000000000000..6caf68b3e8d3
        --- /dev/null
        +++ b/blog/introducing-the-playtorch-app/index.html
        @@ -0,0 +1,745 @@
        Introducing the PlayTorch app: Rapidly Create Mobile AI Experiences | PyTorch

        + by + + PlayTorch Team + +

        +

        + +

        + +

        In December, we announced PyTorch Live, a toolkit for building AI-powered mobile prototypes in minutes. The initial release included a command-line interface to set up a development environment and an SDK for building AI-powered experiences in React Native. Today, we’re excited to share that PyTorch Live will now be known as PlayTorch. This new release provides an improved and simplified developer experience. PlayTorch development is independent from the PyTorch project and the PlayTorch code repository is moving into the Meta Research GitHub organization.

        + +

        A New Workflow: The PlayTorch App

        + +

        The PlayTorch team is excited to announce that we have partnered with Expo to change the way AI-powered mobile experiences are built. Our new release simplifies the process of building mobile AI experiences by eliminating the need for a complicated development environment. You will now be able to build cross-platform, AI-powered prototypes from the very browser you are using to read this blog.

        + +

        In order to make this happen, we are releasing the PlayTorch app which is able to run AI-powered experiences built in the Expo Snack web based code editor.

        + +

        + +

        + +

        The PlayTorch app can be downloaded from the Apple App Store and Google Play Store. With the app installed, you can head over to playtorch.dev/snack and write the code for your AI-powered PlayTorch Snack. When you want to try what you’ve built, you can use the PlayTorch app’s QR code scanner to scan the QR code on the Snack page and load the code to your device.

        + +

        NOTE: PlayTorch Snacks will not work in the Expo Go app.

        + +

        More to Explore in the PlayTorch App

        + +

        AI Demos

        + +

        The PlayTorch app comes with several examples of how you can build AI-powered experiences with a variety of different machine learning models, from object detection to natural language processing. See what can be built with the PlayTorch SDK and be inspired to make something of your own as you play with the examples.

        + +

        + +

        + +

        Sharing Your Creations

        + +

        Any PlayTorch Snack that you run in the PlayTorch app can be shared with others in an instant. When they open the link on their device, the PlayTorch app will instantly load what you’ve built from the cloud so they can experience it first hand.

        + +

        + +

        + +

        When you have something you want to share, let us know on Discord or Twitter or embed the PlayTorch Snack on your own webpage.

        + +

        SDK Overhaul

        + +

        We learned a lot from the community after our initial launch in December and have been hard at work over the past several months to make the PlayTorch SDK (formerly known as PyTorch Live) simple, performant, and robust. In our initial version, the SDK relied on config files to define how a model ingested and output data.

        + +

        Today, we are happy to announce that the next version of our SDK can handle data processing in JavaScript for your prototypes with the new PlayTorch API, which leverages the JavaScript Interface (JSI) to directly call C++ code. Not only have we completely redone the way you can interact with models, but we have also greatly expanded the variety of supported model architectures.

        + +

        A New Data Processing API for Prototyping

        + +

        With this JSI API, we now allow users direct access to tensors (data format for machine learning). Instead of only having access to predefined transformations, you can now manipulate tensors however you would like for your prototypes.

        + +

        + +

        + +

No more switching back and forth between code and config. You will now be able to write everything in JavaScript and have access to all of the type annotations and autocomplete features available to you in that language.

        + +

        Check out our tutorials to see the new Data Processing API in action, take a deeper dive in the API docs, or inspect the code yourself on GitHub.

        + +

        Expanded Use Cases

        + +

        With the new version of the SDK, we have added support for several cutting edge models.

        + +

        + +

        + +

        Image-to-image transformations are now supported thanks to our robust JSI API, so you can see what your world would look like if it were an anime.

        + +

        + +

        + +

Translate French to English with an AI-powered translator using the Seq2Seq model.

        + +

        + +

        + +

        Use DeepLab V3 to segment images!

        + +

        Start Playing

        + +

        If you want to start creating AI experiences yourself, head over to playtorch.dev and try out our tutorials. Each tutorial will guide you through building a simple AI powered experience that you can instantly run on your phone and share with others.

        + +

        How to Get Support

        + +

        Join us on Discord, collaborate with us on GitHub, or follow us on Twitter. Got questions or feedback? We’d love to hear from you!

diff --git a/blog/introducing-torchmultimodal/index.html b/blog/introducing-torchmultimodal/index.html
new file mode 100644
index 000000000000..fa7aed65f1fe
--- /dev/null
+++ b/blog/introducing-torchmultimodal/index.html

Introducing TorchMultimodal - a library for accelerating exploration in Multimodal AI | PyTorch

by Kartikay Khandelwal, Ankita De

        We are announcing TorchMultimodal Beta, a PyTorch domain library for training SoTA multi-task multimodal models at scale. The library provides composable building blocks (modules, transforms, loss functions) to accelerate model development, SoTA model architectures (FLAVA, MDETR, Omnivore) from published research, training and evaluation scripts, as well as notebooks for exploring these models. The library is under active development, and we’d love to hear your feedback! You can find more details on how to get started here.

        + +

        Why TorchMultimodal?

        + +

Interest is rising around AI models that understand multiple input types (text, images, videos and audio signals), and optionally use this understanding to generate different forms of outputs (sentences, pictures, videos). Recent work from FAIR such as FLAVA, Omnivore and data2vec has shown that multimodal models for understanding are competitive with unimodal counterparts, and in some cases are establishing the new state of the art. Generative models such as Make-a-video and Make-a-scene are redefining what modern AI systems can do.

        + +

As interest in multimodal AI has grown, researchers are looking for tools and libraries to quickly experiment with ideas and build on top of the latest research in the field. While the PyTorch ecosystem has a rich repository of libraries and frameworks, it’s not always obvious how components from these libraries interoperate with each other, or how they can be stitched together to build SoTA multimodal models.

        + +

        TorchMultimodal solves this problem by providing:

        + +
          +
        • +

          Composable and easy-to-use building blocks which researchers can use to accelerate model development and experimentation in their own workflows. These are designed to be modular, and can be easily extended to handle new modalities.

          +
        • +
        • +

          End-to-end examples for training and evaluating the latest models from research. These should serve as starting points for ongoing/future research, as well as examples for using advanced features such as integrating with FSDP and activation checkpointing for scaling up model and batch sizes.

          +
        • +
        + +

        Introducing TorchMultimodal

        + +

        TorchMultimodal is a PyTorch domain library for training multi-task multimodal models at scale. In the repository, we provide:

        + +
          +
        • +

          Building Blocks. A collection of modular and composable building blocks like models, fusion layers, loss functions, datasets and utilities. Some examples include:

          + +
            +
          • +

            Contrastive Loss with Temperature. Commonly used function for training models like CLIP and FLAVA. We also include variants such as ImageTextContrastiveLoss used in models like ALBEF.

            +
          • +
          • +

Codebook layers, which compress high-dimensional data by nearest-neighbor lookup in an embedding space and are a vital component of VQVAEs (provided as a model in the repository).

            +
          • +
          • +

Shifted-window Attention: window-based multi-head self-attention, a vital component of encoders like Swin 3D Transformers.

            +
          • +
          • +

            Components for CLIP. A popular model published by OpenAI which has proven to be extremely effective at learning text and image representations.

            +
          • +
          • +

            Multimodal GPT. An abstraction that extends OpenAI’s GPT architecture for multimodal generation when combined with the generation utility.

            +
          • +
          • +

            MultiHeadAttention. A critical component for attention-based models with support for fast auto-regressive decoding.

            +
          • +
          +
        • +
        • +

Examples. A collection of examples that show how to combine these building blocks with components and common infrastructure (Lightning, TorchMetrics) from across the PyTorch ecosystem to replicate state-of-the-art models published in the literature. We currently provide five examples, which include:

          + +
            +
          • +

            FLAVA [paper]. Official code for the paper accepted at CVPR, including a tutorial on finetuning FLAVA.

            +
          • +
          • +

            MDETR [paper]. Collaboration with authors from NYU to provide an example which alleviates interoperability pain points in the PyTorch ecosystem, including a notebook on using MDETR for phrase grounding and visual question answering.

            +
          • +
          • +

            Omnivore [paper]. First example in TorchMultimodal of a model which deals with Video and 3D data, including a notebook for exploring the model.

            +
          • +
          • +

            MUGEN [paper]. Foundational work for auto-regressive generation and retrieval, including demos for text-video generation and retrieval with a large-scale synthetic dataset enriched from OpenAI coinrun.

            +
          • +
          • +

ALBEF [paper]. Code for the model, including a notebook for using this model for Visual Question Answering.

            +
          • +
          +
        • +
        + +

        The following code snippet showcases an example usage of several TorchMultimodal components related to CLIP:

        + +
        
+# NOTE: the import paths below are assumptions based on the TorchMultimodal
+# repository layout at the time of writing; check the repo if they have moved
+import torch
+from torch.utils.data import DataLoader
+from torchvision.datasets import CocoCaptions
+from torchmultimodal.models.clip.model import clip_vit_l14
+from torchmultimodal.modules.losses.contrastive_loss_with_temperature import ContrastiveLossWithTemperature
+from torchmultimodal.transforms.clip_transform import CLIPTransform
+
+# instantiate clip transform
+clip_transform = CLIPTransform()
+
+# pass the transform to your dataset. Here we use coco captions
+dataset = CocoCaptions(root=..., annFile=..., transforms=clip_transform)
+dataloader = DataLoader(dataset, batch_size=16)
+
+# instantiate model. Here we use clip with vit-L as the image encoder
+model = clip_vit_l14()
+
+# define loss and other things needed for training
+clip_loss = ContrastiveLossWithTemperature()
+optim = torch.optim.AdamW(model.parameters(), lr=1e-5)
+epochs = 1
+
+# write your train loop
+for _ in range(epochs):
+	for batch_idx, batch in enumerate(dataloader):
+		image, text = batch
+		image_embeddings, text_embeddings = model(image, text)
+		loss = clip_loss(image_embeddings, text_embeddings)
+		optim.zero_grad()
+		loss.backward()
+		optim.step()
        +
        + +

        Apart from the code, we are also releasing a tutorial for fine-tuning multimodal foundation models, and a blog post (with code pointers) on how to scale up such models using techniques from PyTorch Distributed (FSDP and activation checkpointing). We hope such examples and tutorials will serve to demystify a number of advanced features available in the PyTorch ecosystem.
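To make the scaling recipe concrete, here is a minimal sketch of the FSDP piece of it. This is not the released tutorial or blog post: it assumes a distributed process group has already been initialized (for example by launching with torchrun), that CUDA is available, and that the model is any TorchMultimodal nn.Module you have built (the flava_model name below is only a placeholder).

import torch
from torch.distributed.fsdp import FullyShardedDataParallel as FSDP

def shard_model(model: torch.nn.Module) -> FSDP:
    # Each rank keeps only a shard of the parameters, gradients and optimizer
    # state; combined with activation checkpointing (torch.utils.checkpoint),
    # this is what allows larger models and batch sizes per GPU.
    return FSDP(model, device_id=torch.cuda.current_device())

# sharded_model = shard_model(flava_model)  # `flava_model` is any model you built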

        + +

        What’s Next?

        + +

        While this is an exciting launch, there’s a lot more to come. The library is under development and we are working on adding some of the exciting developments in the space of diffusion models, and examples to showcase common trends from research. As you explore and use the library, we’d love to hear any feedback you might have! You can find more details on how to get started here.

        + +

        Team

        + +

        The primary contributors and developers of TorchMultimodal include Ankita De, Evan Smothers, Kartikay Khandelwal, Lan Gong, Laurence Rouesnel, Nahiyan Malik, Rafi Ayub and Yosua Michael Maranatha.

diff --git a/blog/introducing-torchrec/index.html b/blog/introducing-torchrec/index.html
new file mode 100644
index 000000000000..dfe82d13eafb
--- /dev/null
+++ b/blog/introducing-torchrec/index.html

Introducing TorchRec, a library for modern production recommendation systems | PyTorch

by Meta AI - Donny Greenberg, Colin Taylor, Dmytro Ivchenko, Xing Liu, Anirudh Sudarshan

        We are excited to announce TorchRec, a PyTorch domain library for Recommendation Systems. This new library provides common sparsity and parallelism primitives, enabling researchers to build state-of-the-art personalization models and deploy them in production.

        + +

        + +

        + +

        How did we get here?

        +

        Recommendation Systems (RecSys) comprise a large footprint of production-deployed AI today, but you might not know it from looking at Github. Unlike areas like Vision and NLP, much of the ongoing innovation and development in RecSys is behind closed company doors. For academic researchers studying these techniques or companies building personalized user experiences, the field is far from democratized. Further, RecSys as an area is largely defined by learning models over sparse and/or sequential events, which has large overlaps with other areas of AI. Many of the techniques are transferable, particularly for scaling and distributed execution. A large portion of the global investment in AI is in developing these RecSys techniques, so cordoning them off blocks this investment from flowing into the broader AI field.

        + +

By mid-2020, the PyTorch team had received a lot of feedback that there wasn’t a large-scale production-quality recommender systems package in the open-source PyTorch ecosystem. While we were trying to find a good answer, a group of engineers at Meta wanted to contribute Meta’s production RecSys stack as a PyTorch domain library, with a strong commitment to growing an ecosystem around it. This seemed like a good idea that benefits researchers and companies across the RecSys domain. So, starting from Meta’s stack, we began modularizing and designing a fully-scalable codebase that is adaptable for diverse recommendation use-cases. Our goal was to extract the key building blocks from across Meta’s software stack to simultaneously enable creative exploration and scale. After nearly two years of benchmarks, migrations, and testing across Meta, we’re excited to finally embark on this journey together with the RecSys community. We want this package to open a dialogue and collaboration across the RecSys industry, starting with Meta as the first sizable contributor.

        + +

        Introducing TorchRec

        +

        TorchRec includes a scalable low-level modeling foundation alongside rich batteries-included modules. We initially target “two-tower” ([1], [2]) architectures that have separate submodules to learn representations of candidate items and the query or context. Input signals can be a mix of floating point “dense” features or high-cardinality categorical “sparse” features that require large embedding tables to be trained. Efficient training of such architectures involves combining data parallelism that replicates the “dense” part of computation and model parallelism that partitions large embedding tables across many nodes.

        + +

        In particular, the library includes:

        +
          +
• Modeling primitives, such as embedding bags and jagged tensors, that enable easy authoring of large, performant multi-device/multi-node models using hybrid data-parallelism and model-parallelism (see the short sketch after this list).
        • +
• Optimized RecSys kernels powered by FBGEMM, including support for sparse and quantized operations.
        • +
        • A sharder which can partition embedding tables with a variety of different strategies including data-parallel, table-wise, row-wise, table-wise-row-wise, and column-wise sharding.
        • +
        • A planner which can automatically generate optimized sharding plans for models.
        • +
• Pipelining to overlap dataloading, device transfer (copy to GPU), inter-device communications (input_dist), and computation (forward, backward) for increased performance.
        • +
        • GPU inference support.
        • +
        • Common modules for RecSys, such as models and public datasets (Criteo & Movielens).
        • +
        + +
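As a concrete illustration of the jagged-tensor primitive mentioned in the list above, here is a minimal sketch (the module path is taken from the TorchRec repository and may change): two sparse features over a batch of three examples, where each example can carry a different number of ids per feature.

import torch
from torchrec.sparse.jagged_tensor import KeyedJaggedTensor

# lengths lists, per key and per example, how many ids belong to it:
# "product" -> [2, 0, 1] and "user" -> [1, 1, 2], so 7 values in total.
kjt = KeyedJaggedTensor(
    keys=["product", "user"],
    values=torch.tensor([1, 2, 3, 4, 5, 6, 7]),
    lengths=torch.tensor([2, 0, 1, 1, 1, 2]),
)
print(kjt["product"].values())  # tensor([1, 2, 3])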

        To showcase the flexibility of this tooling, let’s look at the following code snippet, pulled from our DLRM Event Prediction example:

        +
# NOTE: the import paths below are assumptions based on the TorchRec repo layout; adjust as needed
+import torch
+from torchrec import EmbeddingBagCollection, EmbeddingBagConfig
+from torchrec.datasets.criteo import DEFAULT_CAT_NAMES, DEFAULT_INT_NAMES
+from torchrec.distributed.model_parallel import DistributedModelParallel
+from torchrec.models.dlrm import DLRM
+
+device = torch.device("cuda")  # in the full example this is the local rank's GPU
+
+# Specify the sparse embedding layers
+eb_configs = [
+   EmbeddingBagConfig(
+       name=f"t_{feature_name}",
+       embedding_dim=64,
+       num_embeddings=100_000,
+       feature_names=[feature_name],
+   )
+   for feature_idx, feature_name in enumerate(DEFAULT_CAT_NAMES)
+]
+
+# Import and instantiate the model with the embedding configuration
+# The "meta" device indicates lazy instantiation, with no memory allocated
+train_model = DLRM(
+   embedding_bag_collection=EmbeddingBagCollection(
+       tables=eb_configs, device=torch.device("meta")
+   ),
+   dense_in_features=len(DEFAULT_INT_NAMES),
+   dense_arch_layer_sizes=[512, 256, 64],
+   over_arch_layer_sizes=[512, 512, 256, 1],
+   dense_device=device,
+)
+
+# Distribute the model over many devices, just as one would with DDP.
+model = DistributedModelParallel(
+   module=train_model,
+   device=device,
+)
+
+# `args` (CLI arguments) and `epochs` are defined elsewhere in the example
+optimizer = torch.optim.SGD(model.parameters(), lr=args.learning_rate)
+# Optimize the model in a standard loop just as you would any other model!
+# Or, you can use the pipeliner to synchronize communication and compute
+for epoch in range(epochs):
+   # Train
+   ...  # training step elided in the original snippet
        +
        + +

        Scaling Performance

        +

TorchRec has state-of-the-art infrastructure for scaled Recommendations AI, powering some of the largest models at Meta. It was used to train a 1.25 trillion parameter model, pushed to production in January, and a 3 trillion parameter model which will be in production soon. This should be a good indication that PyTorch is fully capable of handling the largest-scale RecSys problems in industry. We’ve heard from many in the community that sharded embeddings are a pain point. TorchRec cleanly addresses that. Unfortunately, it is challenging to provide large-scale benchmarks with public datasets, as most open-source benchmarks are too small to show performance at scale.

        + +

        Looking ahead

        +

        Open-source and open-technology have universal benefits. Meta is seeding the PyTorch community with a state-of-the-art RecSys package, with the hope that many join in on building it forward, enabling new research and helping many companies. The team behind TorchRec plan to continue this program indefinitely, building up TorchRec to meet the needs of the RecSys community, to welcome new contributors, and to continue to power personalization at Meta. We’re excited to begin this journey and look forward to contributions, ideas, and feedback!

        + +

        References

        +

        [1] Sampling-Bias-Corrected Neural Modeling for Large Corpus Item Recommendations

        + +

        [2] DLRM: An advanced, open source deep learning recommendation model

diff --git a/blog/introducing-torchvision-new-multi-weight-support-api/index.html b/blog/introducing-torchvision-new-multi-weight-support-api/index.html
new file mode 100644
index 000000000000..f13e60af061e
--- /dev/null
+++ b/blog/introducing-torchvision-new-multi-weight-support-api/index.html

Introducing TorchVision’s New Multi-Weight Support API | PyTorch

by Vasilis Vryniotis

TorchVision has a new backwards compatible API for building models with multi-weight support. The new API allows loading different pre-trained weights on the same model variant, keeps track of vital meta-data such as the classification labels, and includes the preprocessing transforms necessary for using the models. In this blog post, we plan to review the prototype API, showcase its features and highlight key differences with the existing one.

        + +
        + +
        + +

We are hoping to get your thoughts about the API prior to finalizing it. To collect your feedback, we have created a GitHub issue where you can post your thoughts, questions and comments.

        + +

        Limitations of the current API

        + +

        TorchVision currently provides pre-trained models which could be a starting point for transfer learning or used as-is in Computer Vision applications. The typical way to instantiate a pre-trained model and make a prediction is:

        + +
        import torch
        +
        +from PIL import Image
        +from torchvision import models as M
        +from torchvision.transforms import transforms as T
        +
        +
        +img = Image.open("test/assets/encode_jpeg/grace_hopper_517x606.jpg")
        +
        +# Step 1: Initialize model
        +model = M.resnet50(pretrained=True)
        +model.eval()
        +
        +# Step 2: Define and initialize the inference transforms
        +preprocess = T.Compose([
        +    T.Resize([256, ]),
        +    T.CenterCrop(224),
        +    T.PILToTensor(),
        +    T.ConvertImageDtype(torch.float),
        +    T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
        +])
        +
        +# Step 3: Apply inference preprocessing transforms
        +batch = preprocess(img).unsqueeze(0)
        +prediction = model(batch).squeeze(0).softmax(0)
        +
        +# Step 4: Use the model and print the predicted category
        +class_id = prediction.argmax().item()
        +score = prediction[class_id].item()
        +with open("imagenet_classes.txt", "r") as f:
        +    categories = [s.strip() for s in f.readlines()]
        +    category_name = categories[class_id]
        +print(f"{category_name}: {100 * score}%")
        +
        +
        + +

        There are a few limitations with the above approach:

        + +
          +
        1. Inability to support multiple pre-trained weights: Since the pretrained variable is boolean, we can only offer one set of weights. This poses a severe limitation when we significantly improve the accuracy of existing models and we want to make those improvements available to the community. It also stops us from offering pre-trained weights of the same model variant on different datasets.
        2. +
3. Missing inference/preprocessing transforms: The user is forced to define the necessary transforms prior to using the model. The inference transforms are usually linked to the training process and dataset used to estimate the weights. Any minor discrepancies in these transforms (such as interpolation value, resize/crop sizes etc) can lead to major reductions in accuracy or unusable models.
        4. +
        5. Lack of meta-data: Critical pieces of information in relation to the weights are unavailable to the users. For example, one needs to look into external sources and the documentation to find things like the category labels, the training recipe, the accuracy metrics etc.
        6. +
        + +

        The new API addresses the above limitations and reduces the amount of boilerplate code needed for standard tasks.

        + +

        Overview of the prototype API

        + +

        Let’s see how we can achieve exactly the same results as above using the new API:

        + +
        from PIL import Image
        +from torchvision.prototype import models as PM
        +
        +
        +img = Image.open("test/assets/encode_jpeg/grace_hopper_517x606.jpg")
        +
        +# Step 1: Initialize model
        +weights = PM.ResNet50_Weights.IMAGENET1K_V1
        +model = PM.resnet50(weights=weights)
        +model.eval()
        +
        +# Step 2: Initialize the inference transforms
        +preprocess = weights.transforms()
        +
        +# Step 3: Apply inference preprocessing transforms
        +batch = preprocess(img).unsqueeze(0)
        +prediction = model(batch).squeeze(0).softmax(0)
        +
        +# Step 4: Use the model and print the predicted category
        +class_id = prediction.argmax().item()
        +score = prediction[class_id].item()
        +category_name = weights.meta["categories"][class_id]
        +print(f"{category_name}: {100 * score}*%*")
        +
        + +

        As we can see the new API eliminates the aforementioned limitations. Let’s explore the new features in detail.

        + +

        Multi-weight support

        + +

        At the heart of the new API, we have the ability to define multiple different weights for the same model variant. Each model building method (eg resnet50) has an associated Enum class (eg ResNet50_Weights) which has as many entries as the number of pre-trained weights available. Additionally, each Enum class has a DEFAULT alias which points to the best available weights for the specific model. This allows the users who want to always use the best available weights to do so without modifying their code.

        + +

        Here is an example of initializing models with different weights:

        + +
        from torchvision.prototype.models import resnet50, ResNet50_Weights
        +
        +# Legacy weights with accuracy 76.130%
        +model = resnet50(weights=ResNet50_Weights.IMAGENET1K_V1)
        +
        +# New weights with accuracy 80.858%
        +model = resnet50(weights=ResNet50_Weights.IMAGENET1K_V2)
        +
        +# Best available weights (currently alias for IMAGENET1K_V2)
        +model = resnet50(weights=ResNet50_Weights.DEFAULT)
        +
        +# No weights - random initialization
        +model = resnet50(weights=None)
        +
        + +

        Associated meta-data & preprocessing transforms

        + +

        The weights of each model are associated with meta-data. The type of information we store depends on the task of the model (Classification, Detection, Segmentation etc). Typical information includes a link to the training recipe, the interpolation mode, information such as the categories and validation metrics. These values are programmatically accessible via the meta attribute:

        + +
        from torchvision.prototype.models import ResNet50_Weights
        +
        +# Accessing a single record
        +size = ResNet50_Weights.IMAGENET1K_V2.meta["size"]
        +
        +# Iterating the items of the meta-data dictionary
        +for k, v in ResNet50_Weights.IMAGENET1K_V2.meta.items():
        +    print(k, v)
        +
        + +

Additionally, each weights entry is associated with the necessary preprocessing transforms. All current preprocessing transforms are JIT-scriptable and can be accessed via the transforms attribute. Prior to using them with the data, the transforms need to be initialized/constructed. This lazy initialization scheme is done to ensure the solution is memory efficient. The input of the transforms can be either a PIL.Image or a Tensor read using torchvision.io.

        + +
        from torchvision.prototype.models import ResNet50_Weights
        +
        +# Initializing preprocessing at standard 224x224 resolution
        +preprocess = ResNet50_Weights.IMAGENET1K_V2.transforms()
        +
        +# Initializing preprocessing at 400x400 resolution
        +preprocess = ResNet50_Weights.IMAGENET1K_V2.transforms(crop_size=400, resize_size=400)
        +
        +# Once initialized the callable can accept the image data:
        +# img_preprocessed = preprocess(img)
        +
        + +

        Associating the weights with their meta-data and preprocessing will boost transparency, improve reproducibility and make it easier to document how a set of weights was produced.

        + +

        Get weights by name

        + +

The ability to directly link the weights with their properties (meta-data, preprocessing callables etc) is the reason why our implementation uses Enums instead of Strings. Nevertheless, for cases where only the name of the weights is available, we offer a method capable of linking weight names to their Enums:

        + +
from torchvision.prototype.models import ResNet50_Weights, get_weight
        +
        +# Weights can be retrieved by name:
        +assert get_weight("ResNet50_Weights.IMAGENET1K_V1") == ResNet50_Weights.IMAGENET1K_V1
        +assert get_weight("ResNet50_Weights.IMAGENET1K_V2") == ResNet50_Weights.IMAGENET1K_V2
        +
        +# Including using the DEFAULT alias:
        +assert get_weight("ResNet50_Weights.DEFAULT") == ResNet50_Weights.IMAGENET1K_V2
        +
        + +

        Deprecations

        + +

In the new API, the boolean pretrained and pretrained_backbone parameters, which were previously used to load weights to the full model or to its backbone, are deprecated. The current implementation is fully backwards compatible as it seamlessly maps the old parameters to the new ones. Passing the old parameters to the new builders emits the following deprecation warnings:

        + +
        >>> model = torchvision.prototype.models.resnet50(pretrained=True)
        + UserWarning: The parameter 'pretrained' is deprecated, please use 'weights' instead.
        +UserWarning:
        +Arguments other than a weight enum or `None` for 'weights' are deprecated.
        +The current behavior is equivalent to passing `weights=ResNet50_Weights.IMAGENET1K_V1`.
        +You can also use `weights=ResNet50_Weights.DEFAULT` to get the most up-to-date weights.
        +
        + +

Additionally, the builder methods require using keyword parameters. The use of positional parameters is deprecated, and using them emits the following warning:

        + +
        >>> model = torchvision.prototype.models.resnet50(None)
        +UserWarning:
        +Using 'weights' as positional parameter(s) is deprecated.
        +Please use keyword parameter(s) instead.
        +
        + +

        Testing the new API

        + +

Migrating to the new API is very straightforward. The following method calls between the two APIs are all equivalent:

        + +
        # Using pretrained weights:
        +torchvision.prototype.models.resnet50(weights=ResNet50_Weights.IMAGENET1K_V1)
        +torchvision.models.resnet50(pretrained=True)
        +torchvision.models.resnet50(True)
        +
        +# Using no weights:
        +torchvision.prototype.models.resnet50(weights=None)
        +torchvision.models.resnet50(pretrained=False)
        +torchvision.models.resnet50(False)
        +
        + +

        Note that the prototype features are available only on the nightly versions of TorchVision, so to use it you need to install it as follows:

        + +
        conda install torchvision -c pytorch-nightly
        +
        + +

For alternative ways to install the nightly, have a look at the PyTorch download page. You can also install TorchVision from source from the latest main; for more information, have a look at our repo.

        + +

        Accessing state-of-the-art model weights with the new API

        + +

If you are still unconvinced about giving the new API a try, here is one more reason to do so. We’ve recently refreshed our training recipe and achieved SOTA accuracy for many of our models. The improved weights can easily be accessed via the new API. Here is a quick overview of the model improvements:

        + +
        + +
Model | Old Acc@1 | New Acc@1
EfficientNet B1 | 78.642 | 79.838
MobileNetV3 Large | 74.042 | 75.274
Quantized ResNet50 | 75.92 | 80.282
Quantized ResNeXt101 32x8d | 78.986 | 82.574
RegNet X 400mf | 72.834 | 74.864
RegNet X 800mf | 75.212 | 77.522
RegNet X 1 6gf | 77.04 | 79.668
RegNet X 3 2gf | 78.364 | 81.198
RegNet X 8gf | 79.344 | 81.682
RegNet X 16gf | 80.058 | 82.72
RegNet X 32gf | 80.622 | 83.018
RegNet Y 400mf | 74.046 | 75.806
RegNet Y 800mf | 76.42 | 78.838
RegNet Y 1 6gf | 77.95 | 80.882
RegNet Y 3 2gf | 78.948 | 81.984
RegNet Y 8gf | 80.032 | 82.828
RegNet Y 16gf | 80.424 | 82.89
RegNet Y 32gf | 80.878 | 83.366
ResNet50 | 76.13 | 80.858
ResNet101 | 77.374 | 81.886
ResNet152 | 78.312 | 82.284
ResNeXt50 32x4d | 77.618 | 81.198
ResNeXt101 32x8d | 79.312 | 82.834
Wide ResNet50 2 | 78.468 | 81.602
Wide ResNet101 2 | 78.848 | 82.51
        + +

        Please spare a few minutes to provide your feedback on the new API, as this is crucial for graduating it from prototype and including it in the next release. You can do this on the dedicated Github Issue. We are looking forward to reading your comments!

        + +
diff --git a/blog/introduction-to-quantization-on-pytorch/index.html b/blog/introduction-to-quantization-on-pytorch/index.html
new file mode 100644
index 000000000000..12728b3c7d82
--- /dev/null
+++ b/blog/introduction-to-quantization-on-pytorch/index.html

Introduction to Quantization on PyTorch | PyTorch

March 26, 2020

by Raghuraman Krishnamoorthi, James Reed, Min Ni, Chris Gottbrath, and Seth Weidman

It’s important to make efficient use of both server-side and on-device compute resources when developing machine learning applications. To support more efficient deployment on servers and edge devices, PyTorch added support for model quantization using the familiar eager mode Python API.

        + +

Quantization leverages 8-bit integer (int8) instructions to reduce the model size and run inference faster (reduced latency), and can be the difference between a model achieving quality of service goals or even fitting into the resources available on a mobile device. Even when resources aren’t quite so constrained it may enable you to deploy a larger and more accurate model. Quantization is available in PyTorch starting in version 1.3 and with the release of PyTorch 1.4 we published quantized models for ResNet, ResNext, MobileNetV2, GoogleNet, InceptionV3 and ShuffleNetV2 in the PyTorch torchvision 0.5 library.

        + +

        This blog post provides an overview of the quantization support on PyTorch and its incorporation with the TorchVision domain library.

        + +

        What is Quantization?

        + +

        Quantization refers to techniques for doing both computations and memory accesses with lower precision data, usually int8 compared to floating point implementations. This enables performance gains in several important areas:

        +
          +
        • 4x reduction in model size;
        • +
        • 2-4x reduction in memory bandwidth;
        • +
        • 2-4x faster inference due to savings in memory bandwidth and faster compute with int8 arithmetic (the exact speed up varies depending on the hardware, the runtime, and the model).
        • +
        + +

Quantization does not, however, come without additional cost. Fundamentally quantization means introducing approximations, and the resulting networks have slightly less accuracy. These techniques attempt to minimize the gap between the full floating point accuracy and the quantized accuracy.

        + +

We designed quantization to fit into the PyTorch framework. This means that:

        +
          +
1. PyTorch has data types corresponding to quantized tensors, which share many of the features of tensors (see the short example after this list).
        2. +
        3. One can write kernels with quantized tensors, much like kernels for floating point tensors to customize their implementation. PyTorch supports quantized modules for common operations as part of the torch.nn.quantized and torch.nn.quantized.dynamic name-space.
        4. +
        5. Quantization is compatible with the rest of PyTorch: quantized models are traceable and scriptable. The quantization method is virtually identical for both server and mobile backends. One can easily mix quantized and floating point operations in a model.
        6. +
        7. Mapping of floating point tensors to quantized tensors is customizable with user defined observer/fake-quantization blocks. PyTorch provides default implementations that should work for most use cases.
        8. +
        + +
        + +
        + +
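To make point 1 above concrete, here is a short, self-contained example of a quantized tensor: a float tensor is mapped to 8-bit storage with a scale and zero point, and can be converted back with dequantize(). The scale and zero point values below are arbitrary choices for illustration.

import torch

x = torch.randn(4)
xq = torch.quantize_per_tensor(x, scale=0.1, zero_point=8, dtype=torch.quint8)

print(xq)               # a quantized tensor with dtype=torch.quint8
print(xq.int_repr())    # the underlying uint8 values
print(xq.dequantize())  # an approximate reconstruction of x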

        We developed three techniques for quantizing neural networks in PyTorch as part of quantization tooling in the torch.quantization name-space.

        + +

        The Three Modes of Quantization Supported in PyTorch starting version 1.3

        + +
          +
        1. +

          Dynamic Quantization

          +

          The easiest method of quantization PyTorch supports is called dynamic quantization. This involves not just converting the weights to int8 - as happens in all quantization variants - but also converting the activations to int8 on the fly, just before doing the computation (hence “dynamic”). The computations will thus be performed using efficient int8 matrix multiplication and convolution implementations, resulting in faster compute. However, the activations are read and written to memory in floating point format.

          +
            +
• PyTorch API: we have a simple API for dynamic quantization in PyTorch. torch.quantization.quantize_dynamic takes in a model, as well as a couple of other arguments, and produces a quantized model! Our end-to-end tutorial illustrates this for a BERT model; while the tutorial is long and contains sections on loading pre-trained models and other concepts unrelated to quantization, the part that quantizes the BERT model is simply:
          • +
          + +
          import torch.quantization
          +quantized_model = torch.quantization.quantize_dynamic(model, {torch.nn.Linear}, dtype=torch.qint8)
          +
          +
            +
• See the documentation for the function here and an end-to-end example in our tutorials here and here.
          • +
          +
        2. +
        3. +

          Post-Training Static Quantization

          + +

          One can further improve the performance (latency) by converting networks to use both integer arithmetic and int8 memory accesses. Static quantization performs the additional step of first feeding batches of data through the network and computing the resulting distributions of the different activations (specifically, this is done by inserting “observer” modules at different points that record these distributions). This information is used to determine how specifically the different activations should be quantized at inference time (a simple technique would be to simply divide the entire range of activations into 256 levels, but we support more sophisticated methods as well). Importantly, this additional step allows us to pass quantized values between operations instead of converting these values to floats - and then back to ints - between every operation, resulting in a significant speed-up.

          + +

          With this release, we’re supporting several features that allow users to optimize their static quantization:

          +
            +
          1. Observers: you can customize observer modules which specify how statistics are collected prior to quantization to try out more advanced methods to quantize your data.
          2. +
          3. Operator fusion: you can fuse multiple operations into a single operation, saving on memory access while also improving the operation’s numerical accuracy.
          4. +
          5. Per-channel quantization: we can independently quantize weights for each output channel in a convolution/linear layer, which can lead to higher accuracy with almost the same speed.
          6. +
          + +
            +
          • +

            PyTorch API:

            +
              +
            • To fuse modules, we have torch.quantization.fuse_modules
            • +
            • Observers are inserted using torch.quantization.prepare
            • +
            • Finally, quantization itself is done using torch.quantization.convert
            • +
            +
          • +
          + +

          We have a tutorial with an end-to-end example of quantization (this same tutorial also covers our third quantization method, quantization-aware training), but because of our simple API, the three lines that perform post-training static quantization on the pre-trained model myModel are:

          +
# set quantization config for server (x86) deployment
+myModel.qconfig = torch.quantization.get_default_qconfig('fbgemm')
          +
          +# insert observers
          +torch.quantization.prepare(myModel, inplace=True)
          +# Calibrate the model and collect statistics
          +
          +# convert to quantized version
          +torch.quantization.convert(myModel, inplace=True)
          +
          +
        4. +
        5. +

          Quantization Aware Training

          +

Quantization-aware training (QAT) is the third method, and the one that typically results in the highest accuracy of the three. With QAT, all weights and activations are “fake quantized” during both the forward and backward passes of training: that is, float values are rounded to mimic int8 values, but all computations are still done with floating point numbers. Thus, all the weight adjustments during training are made while “aware” of the fact that the model will ultimately be quantized; after quantizing, therefore, this method usually yields higher accuracy than the other two methods.

          +
            +
          • +

            PyTorch API:

            +
              +
            • torch.quantization.prepare_qat inserts fake quantization modules to model quantization.
            • +
            • Mimicking the static quantization API, torch.quantization.convert actually quantizes the model once training is complete.
            • +
            +
          • +
          + +

          For example, in the end-to-end example, we load in a pre-trained model as qat_model, then we simply perform quantization-aware training using:

          + +
          # specify quantization config for QAT
          +qat_model.qconfig=torch.quantization.get_default_qat_qconfig('fbgemm')
          +
          +# prepare QAT
          +torch.quantization.prepare_qat(qat_model, inplace=True)
          +
+# convert to quantized version, removing dropout, to check for accuracy on each epoch
+quantized_model = torch.quantization.convert(qat_model.eval(), inplace=False)
          +
          +
        6. +
        + +

        Device and Operator Support

        +

        Quantization support is restricted to a subset of available operators, depending on the method being used, for a list of supported operators, please see the documentation at https://pytorch.org/docs/stable/quantization.html.

        + +

The set of available operators and the quantization numerics also depend on the backend being used to run quantized models. Currently quantized operators are supported only for CPU inference in the following backends: x86 and ARM. Both the quantization configuration (how tensors should be quantized) and the quantized kernels (arithmetic with quantized tensors) are backend dependent. One can specify the backend by doing:

        + +
import torch
+backend = 'fbgemm'
        +# 'fbgemm' for server, 'qnnpack' for mobile
        +my_model.qconfig = torch.quantization.get_default_qconfig(backend)
        +# prepare and convert model
        +# Set the backend on which the quantized kernels need to be run
        +torch.backends.quantized.engine=backend
        +
        + +

        However, quantization aware training occurs in full floating point and can run on either GPU or CPU. Quantization aware training is typically only used in CNN models when post training static or dynamic quantization doesn’t yield sufficient accuracy. This can occur with models that are highly optimized to achieve small size (such as Mobilenet).

        + +

        Integration in torchvision

        +

        We’ve also enabled quantization for some of the most popular models in torchvision: Googlenet, Inception, Resnet, ResNeXt, Mobilenet and Shufflenet. We have upstreamed these changes to torchvision in three forms:

        +
          +
1. Pre-trained quantized weights so that you can use them right away (see the short sketch after this list).
        2. +
        3. Quantization ready model definitions so that you can do post-training quantization or quantization aware training.
        4. +
5. A script for doing quantization aware training, which is available for any of these models though, as you will learn below, we only found it necessary for achieving accuracy with Mobilenet.
        6. +
        7. We also have a tutorial showing how you can do transfer learning with quantization using one of the torchvision models.
        8. +
        + +
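As a quick illustration of the first point in this list, here is a minimal sketch of loading one of the pre-trained quantized torchvision models and running int8 inference on CPU. The pretrained/quantize flags reflect the torchvision 0.5-era API described in this post; newer releases use the weights argument instead.

import torch
from torchvision.models import quantization as qmodels

# Download a model with pre-trained quantized weights and run int8 inference.
model = qmodels.resnet50(pretrained=True, quantize=True)
model.eval()

with torch.no_grad():
    out = model(torch.randn(1, 3, 224, 224))
print(out.shape)  # torch.Size([1, 1000])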

        Choosing an approach

        +

        The choice of which scheme to use depends on multiple factors:

        +
          +
        1. Model/Target requirements: Some models might be sensitive to quantization, requiring quantization aware training.
        2. +
        3. Operator/Backend support: Some backends require fully quantized operators.
        4. +
        + +

Currently, operator coverage is limited and may restrict the choices; the table below provides a guideline.

Model Type | Preferred scheme | Why
LSTM/RNN | Dynamic Quantization | Throughput dominated by compute/memory bandwidth for weights
BERT/Transformer | Dynamic Quantization | Throughput dominated by compute/memory bandwidth for weights
CNN | Static Quantization | Throughput limited by memory bandwidth for activations
CNN | Quantization Aware Training | In the case where accuracy can't be achieved with static quantization
        + +

        Performance Results

        +

        Quantization provides a 4x reduction in the model size and a speedup of 2x to 3x compared to floating point implementations depending on the hardware platform and the model being benchmarked. Some sample results are:

        + +
Model | Float Latency (ms) | Quantized Latency (ms) | Inference Performance Gain | Device | Notes
BERT | 581 | 313 | 1.8x | Xeon-D2191 (1.6GHz) | Batch size = 1, Maximum sequence length = 128, Single thread, x86-64, Dynamic quantization
Resnet-50 | 214 | 103 | 2x | Xeon-D2191 (1.6GHz) | Single thread, x86-64, Static quantization
Mobilenet-v2 | 97 | 17 | 5.7x | Samsung S9 | Static quantization, Floating point numbers are based on Caffe2 run-time and are not optimized
        + +
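To get a rough sense of these gains on your own hardware, here is a minimal sketch that compares a toy float model with its dynamically quantized counterpart. It is not the benchmark used for the table above, and the exact numbers will vary by machine and model.

import os
import time
import torch

# Build a toy float model and a dynamically quantized copy of it.
model_fp32 = torch.nn.Sequential(
    torch.nn.Linear(1024, 1024),
    torch.nn.ReLU(),
    torch.nn.Linear(1024, 1024),
)
model_int8 = torch.quantization.quantize_dynamic(
    model_fp32, {torch.nn.Linear}, dtype=torch.qint8
)

def size_mb(model, path="tmp_model.pt"):
    # Serialized size of the state dict, in megabytes.
    torch.save(model.state_dict(), path)
    mb = os.path.getsize(path) / 1e6
    os.remove(path)
    return mb

def latency_ms(model, x, iters=50):
    # Average wall-clock time of a CPU forward pass, in milliseconds.
    with torch.no_grad():
        start = time.time()
        for _ in range(iters):
            model(x)
    return (time.time() - start) / iters * 1e3

x = torch.randn(8, 1024)
print(f"fp32: {size_mb(model_fp32):.2f} MB, {latency_ms(model_fp32, x):.2f} ms")
print(f"int8: {size_mb(model_int8):.2f} MB, {latency_ms(model_int8, x):.2f} ms")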

        Accuracy results

        +

        We also compared the accuracy of static quantized models with the floating point models on Imagenet. For dynamic quantization, we compared the F1 score of BERT on the GLUE benchmark for MRPC.

        + +

        Computer Vision Model accuracy

Model | Top-1 Accuracy (Float) | Top-1 Accuracy (Quantized) | Quantization scheme
Googlenet | 69.8 | 69.7 | Static post training quantization
Inception-v3 | 77.5 | 77.1 | Static post training quantization
ResNet-18 | 69.8 | 69.4 | Static post training quantization
Resnet-50 | 76.1 | 75.9 | Static post training quantization
ResNext-101 32x8d | 79.3 | 79 | Static post training quantization
Mobilenet-v2 | 71.9 | 71.6 | Quantization Aware Training
Shufflenet-v2 | 69.4 | 68.4 | Static post training quantization
        + +

        Speech and NLP Model accuracy

        + +
Model | F1 (GLUE MRPC) Float | F1 (GLUE MRPC) Quantized | Quantization scheme
BERT | 0.902 | 0.895 | Dynamic quantization
        + +

        Conclusion

        +

To get started on quantizing your models in PyTorch, start with the tutorials on the PyTorch website. If you are working with sequence data, start with dynamic quantization for LSTM or BERT. If you are working with image data, then we recommend starting with the transfer learning with quantization tutorial. Then you can explore static post training quantization. If you find that the accuracy drop with post training quantization is too high, then try quantization aware training.

        + +

If you run into issues, you can get community help by posting at discuss.pytorch.org; use the quantization category for quantization related issues.

        + +

        This post is authored by Raghuraman Krishnamoorthi, James Reed, Min Ni, Chris Gottbrath and Seth Weidman. Special thanks to Jianyu Huang, Lingyi Liu and Haixin Liu for producing quantization metrics included in this post.

        + +

        Further reading:

        +
          +
        1. PyTorch quantization presentation at Neurips: (https://research.fb.com/wp-content/uploads/2019/12/2.-Quantization.pptx)
        2. +
3. Quantized Tensors (https://github.com/pytorch/pytorch/wiki/Introducing-Quantized-Tensor)
        4. +
5. Quantization RFC on Github (https://github.com/pytorch/pytorch/issues/18318)
        6. +
        + +
diff --git a/blog/join-pytorch/index.html b/blog/join-pytorch/index.html
new file mode 100644
index 000000000000..38c92bd4eddd
--- /dev/null
+++ b/blog/join-pytorch/index.html

Join the PyTorch Foundation: Membership Now Open | PyTorch

by Team PyTorch

        In September 2022, we welcomed PyTorch to the Linux Foundation from Meta, which formed the PyTorch Foundation with founding members AMD, Amazon Web Services (AWS), Google, Meta, Microsoft, and NVIDIA.

        + +

Since then, we’ve seen significant growth, including a 39% increase in commits across all repositories, a 27% increase in unique contributors, and a 12% increase in community contributions – all in the last 90 days! We’re grateful to our founding members for their support to move the foundation forward.

        + +

        Today, we’re announcing that membership is now open to join the PyTorch Foundation.

        + +

        As a member of the PyTorch Foundation, you’ll have access to resources that allow you to be stewards of stable, secure, and long-lasting codebases. You can collaborate on training and certification programs, local and regional events, open source developer tooling, academic research, and guides to help new users and contributors have a productive experience.

        + +

The PyTorch Foundation’s goal is to help end users navigate the PyTorch ecosystem, recruit talent, and successfully adopt PyTorch and support open source AI technologies.

        + +

        Why join as a member

        + +

        Being a part of the PyTorch Foundation grants opportunities to help build the future of end-to-end machine learning frameworks alongside your industry peers.

        + +

        Membership benefits include:

        + +
          +
        • Gain technical traction and insight for your organization’s products by immersing your teams with other industry leaders.
        • +
        • Influence technical priorities, approaches, and code.
        • +
        • Support the PyTorch project community by helping fund programs and services that the project and its community rely on.
        • +
        • Engage with the PyTorch project ecosystem, network with fellow members, and contribute to building and maintaining an engaging and strong PyTorch ecosystem.
        • +
        • Provide thought leadership and participate in unique, wide-reaching networking and marketing programs expanding industry awareness as PyTorch amplifies member progress.
        • +
        • Retain, attract, and increase engineering skills and employees and build your innovation partner network, supply chain, and customer pipeline.
        • +
        • As an active member of the PyTorch community, you can deepen your engagement and leadership in local and industry developer networks and conferences.
        • +
        + +

        How to join

        + +

        Commercial organizations are invited to apply for General membership, while non-profits and academic institutions are encouraged to apply for Associate membership.

        + +

        Premier Members

        + +

Organizations are welcome to submit an application to be considered as a Premier member. Premier membership is the highest tier. Premier members will appoint one voting representative to any subcommittees or activities of the PTF Governing Board, receive prominent placement in displays of membership (including the website, landscape and marketing materials), get exclusive live webinars with PyTorch online programs, and receive everything included within a General membership. The annual fee is $150,000 plus an LF Silver Membership.

        + +

        General Members

        + +

        General members will participate in all marketing, community and thought leadership opportunities, as well as discounts on event sponsorships and training courses. General members also have the opportunity to be considered for a PTF board position. The annual fee is dependent on the size of your organization. More details can be found here.

        + +

        Associate Members

        + +

        Associate members are free to join and will receive support and participation opportunities with the PyTorch Foundation team. More information can be found here.

        + +

        Hear from our founding members

        + +

        AMD

        + +

        “AMD strongly believes in and supports an open software ecosystem. We are very proud to be a founding member of the PyTorch Foundation, helping to develop an open and collaborative community for AI and ML. AI and ML have the opportunity to impact everything we do, and the work done through the PyTorch Foundation is critical in developing an open framework that is vendor neutral and helps democratize AI for all.”

        + +

        AWS

        + +

        “AWS is a firm believer in the PyTorch Foundation mission to develop AI and deep learning tools through open collaboration. Our customers use PyTorch every day to build, train, and deploy machine learning models on AWS. Through our involvement, AWS is supporting innovation and helping to make open source tooling more accessible to our customers and the broader community.”

        + +

        Google

        + +

        “The AI revolution is upon us and it’s being built on PyTorch. With new applications like ChatGPT and Stable Diffusion built on PyTorch, the wave of generative AI continues to be felt across every facet of society. We at Google are excited to be a founding member of the PyTorch Foundation and we’re excited for the opportunity to work closely with other leaders in AI to help grow this amazing and innovative community.”

        + +

        Meta

        + +

        “Meta has a long history of putting open science at the core of our work in AI and PyTorch is no exception. PyTorch was built from the ground up with an open source, community-first philosophy. We transitioned PyTorch to the PyTorch Foundation because we believe this approach enables the fastest progress in building and deploying new systems that will address real-world needs and answer fundamental questions about the nature of intelligence. With the PyTorch Foundation, the entire AI community is positioned to push the field forward in countless exciting new ways.”

        + +

        Microsoft

        + +

        “Microsoft believes strongly in PyTorch and it’s been an honor to be a founding member of the PyTorch Foundation. Internally, we use PyTorch extensively, and an outgrowth of that is the Azure Container for PyTorch, which provides deep optimization for PyTorch development, including ONNX Runtime, DeepSpeed, and Nebula to greatly reduce training cost and accelerate training times on Azure Machine Learning. As part of our ongoing commitment to open source machine learning platforms, we look forward to partnering with industry leaders to continue contributing to the advancement of PyTorch.”

        + +

        NVIDIA

        + +

        “As a leading Python-based AI framework, PyTorch has been fundamental to the development of LLMs and GenAI. NVIDIA’s goal is to deepen our collaboration with the open-source AI community as part of the PyTorch Foundation, and help build the next wave of advanced, energy efficient, and cost-effective applications with accelerated computing.”

        + +

        Join today

        + +

        We are excited to see the PyTorch Foundation continue to grow alongside the community through neutral governance and support. We hope you’ll join us as a member!

        + +
        + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/language-identification/index.html b/blog/language-identification/index.html new file mode 100644 index 000000000000..6607d9b79247 --- /dev/null +++ b/blog/language-identification/index.html @@ -0,0 +1,835 @@ + + + + + + + + + + + + + Language Identification: Building an End-to-End AI Solution using PyTorch | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +

        + by + + Intel + +

        +

        Language Identification is the process of identifying the primary language from multiple audio input samples. In natural language processing (NLP), language identification is an important problem and a challenging issue. There are many language-related tasks such as entering text on your phone, finding news articles you enjoy, or discovering answers to questions that you may have. All these tasks are powered by NLP models. To decide which model to invoke at a particular point in time, we must perform language identification.

        + +

        This article presents an in-depth solution and code sample for language identification using Intel® Extension for PyTorch, which is a version of the popular PyTorch AI framework optimized for use on Intel® processors, and Intel® Neural Compressor, which is a tool to accelerate AI inference without sacrificing accuracy.

        + +

        The code sample demonstrates how to train a model to perform language identification using the Hugging Face SpeechBrain* toolkit and optimize it using the Intel® AI Analytics Toolkit (AI Kit). The user can modify the code sample and identify up to 93 languages using the Common Voice dataset.

        + +

        Proposed Methodology for Language Identification

        + +

        In the proposed solution, the user will use an Intel AI Analytics Toolkit container environment to train a model and perform inference leveraging Intel-optimized libraries for PyTorch. There is also an option to quantize the trained model with Intel Neural Compressor to speed up inference.

        + +

        Dataset

        + +

        The Common Voice dataset is used; this code sample specifically uses Common Voice Corpus 11.0 for Japanese and Swedish. This dataset is used to train an Emphasized Channel Attention, Propagation and Aggregation Time Delay Neural Network (ECAPA-TDNN), which is implemented using the Hugging Face SpeechBrain library. Time Delay Neural Networks (TDNNs), aka one-dimensional Convolutional Neural Networks (1D CNNs), are multilayer artificial neural network architectures that classify patterns with shift-invariance and model context at each layer of the network. ECAPA-TDNN is a new TDNN-based speaker-embedding extractor for speaker verification; it is built upon the original x-vector architecture and puts more emphasis on channel attention, propagation, and aggregation.

        + +

        Implementation

        + +

        After downloading the Common Voice dataset, the data is preprocessed by converting the MP3 files into WAV format to avoid information loss, and is then separated into training, validation, and testing sets.

        + +

        A pretrained VoxLingua107 model is retrained with the Common Voice dataset using the Hugging Face SpeechBrain library to focus on the languages of interest. VoxLingua107 is a speech dataset used for training spoken language recognition models that work well with real-world and varying speech data. This dataset contains data for 107 languages. By default, Japanese and Swedish are used, and more languages can be included. This model is then used for inference on the testing dataset or a user-specified dataset. Also, there is an option to utilize SpeechBrain’s Voice Activity Detection (VAD) where only the speech segments from the audio files are extracted and combined before samples are randomly selected as input into the model. This link provides all the necessary tools to perform VAD. To improve performance, the user may quantize the trained model to integer-8 (INT8) using Intel Neural Compressor to decrease latency.
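
        For orientation, the following is a minimal sketch of loading and querying a pretrained VoxLingua107 ECAPA-TDNN language-ID model through SpeechBrain; it is not the code sample itself, and the checkpoint id, save directory, and audio path are illustrative assumptions.

        from speechbrain.pretrained import EncoderClassifier

        # Load a pretrained VoxLingua107 ECAPA-TDNN language-ID model.
        # The checkpoint id and savedir are illustrative, not taken from the code sample.
        language_id = EncoderClassifier.from_hparams(
            source="speechbrain/lang-id-voxlingua107-ecapa",
            savedir="tmp_lang_id",
        )

        # classify_batch returns log-likelihoods, the best score, the predicted index,
        # and the decoded language label for each utterance.
        signal = language_id.load_audio("sample.wav")  # placeholder path
        prediction = language_id.classify_batch(signal)
        print(prediction[3])  # e.g. ['ja: Japanese']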

        + +

        Training

        + +

        Copies of the training scripts are added to the current working directory: create_wds_shards.py creates the WebDataset shards, train.py performs the actual training procedure, and train_ecapa.yaml configures the training options. The WebDataset shard-creation script and the YAML file are patched to work with the two languages chosen for this code sample.

        + +

        In the data preprocessing phase, the prepareAllCommonVoice.py script is executed to randomly select a specified number of samples and convert them from MP3 to WAV format. Of these samples, 80% are used for training, 10% for validation, and 10% for testing. At least 2,000 input samples are recommended, and this is the default value.

        + +

        In the next step, WebDataset shards are created from the training and validation datasets. This stores the audio files as tar files which allows writing purely sequential I/O pipelines for large-scale deep learning in order to achieve high I/O rates from local storage—about 3x-10x faster compared to random access.
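
        As an illustration of the idea (not the actual create_wds_shards.py), writing WAV samples into sequential tar shards with the webdataset package looks roughly like the sketch below; the paths, keys, and label metadata are hypothetical.

        import json
        import webdataset as wds

        # Hypothetical list of (audio path, metadata) pairs produced by preprocessing.
        samples = [("clips/clip_000001.wav", {"lang": "ja"}),
                   ("clips/clip_000002.wav", {"lang": "sv"})]

        # ShardWriter rolls over to a new tar file once maxcount samples are written,
        # so training can later stream the shards with purely sequential I/O.
        with wds.ShardWriter("shards/train-%06d.tar", maxcount=1000) as sink:
            for i, (wav_path, meta) in enumerate(samples):
                with open(wav_path, "rb") as f:
                    sink.write({
                        "__key__": f"sample{i:06d}",
                        "wav": f.read(),                    # raw audio bytes
                        "json": json.dumps(meta).encode(),  # language label metadata
                    })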

        + +

        The user then modifies the YAML file. This includes setting the largest shard number for the WebDataset shards, the number of output neurons to match the number of languages of interest, the number of epochs to train over the entire dataset, and the batch size. The batch size should be decreased if the CPU or GPU runs out of memory while running the training script.

        + +

        In this code sample, the training script is executed on the CPU, so “cpu” is passed as an input parameter when running the script. The configurations defined in train_ecapa.yaml are also passed as parameters.

        + +

        The command to run the script to train the model is:

        + +
        python train.py train_ecapa.yaml --device "cpu"
        +
        + +

        In the future, the training script train.py will be designed to work for Intel® GPUs such as the Intel® Data Center GPU Flex Series, Intel® Data Center GPU Max Series, and Intel® Arc™ A-Series with updates from Intel Extension for PyTorch.

        + +

        Run the training script to train the model. The 4th Generation Intel® Xeon® Scalable Processor is recommended for this transfer learning application because of the performance improvements from its Intel® Advanced Matrix Extensions (Intel® AMX) instruction set.

        + +

        After training, checkpoint files are available. These files are used to load the model for inference.

        + +

        Inference

        + +

        Inference Pipeline

        + +

        The crucial step before running inference is to patch the SpeechBrain library’s pretrained interfaces.py file so that PyTorch TorchScript* can be run to improve the runtime. TorchScript requires the output of the model to be only tensors.

        + +

        Users can choose to run inference using the testing set from Common Voice or their own custom data in WAV format. The following are the options the inference scripts (inference_custom.py and inference_commonVoice.py) can be run with:

        + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
        Input Option + Description +
        -p + Specify the data path. +
        -d + Specify the duration of wave sample. The default value is 3. +
        -s + Specify size of sample waves, default is 100. +
        --vad + (`inference_custom.py` only) Enable VAD model to detect active speech. The VAD option will identify speech segments in the audio file and construct a new .wav file containing only the speech segments. This improves the quality of speech data used as input into the language identification model. +
        --ipex + Run inference with optimizations from Intel Extension for PyTorch. This option will apply optimizations to the pretrained model. Using this option should result in performance improvements related to latency. +
        --ground_truth_compare + (`inference_custom.py` only) Enable comparison of prediction labels to ground truth values. +
        --verbose + Print additional debug information, like latency. +
        + +

        The path to the data must be specified. By default, 100 three-second audio samples are randomly selected from the original audio file and used as input to the language identification model.

        + +

        A small Convolutional Recurrent Deep Neural Network (CRDNN) pretrained on the LibriParty dataset is used to process audio samples and output the segments where speech activity is detected. This can be used in inference with the --vad option.
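
        A rough sketch of how such a VAD model is typically used through SpeechBrain is shown below; the checkpoint id, save directory, and audio path are assumptions for illustration rather than the code sample’s exact values.

        from speechbrain.pretrained import VAD

        # Load a CRDNN VAD model pretrained on LibriParty (checkpoint id is illustrative).
        vad = VAD.from_hparams(
            source="speechbrain/vad-crdnn-libriparty",
            savedir="tmp_vad",
        )

        # get_speech_segments returns start/end times (in seconds) of detected speech,
        # which can then be used to cut and concatenate a speech-only audio file.
        boundaries = vad.get_speech_segments("sample.wav")
        vad.save_boundaries(boundaries)  # print the detected speech segments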

        + +

        As shown in the figure below, the CRDNN model outputs the timestamps where speech is detected, and these are used to construct a new, shorter audio file containing only speech. Sampling from this new audio file gives a better prediction of the primary language spoken.

        + +

        Audio wave file visualization

        + +

        Run the inference script yourself. An example command for running inference:

        + +
        python inference_custom.py -p data_custom -d 3 -s 50 --vad
        +
        + +

        This will run inference on data you provide located inside the data_custom folder. This command performs inference on 50 randomly selected 3-second audio samples with voice activity detection.

        + +

        If you want to run the code sample for other languages, download Common Voice Corpus 11.0 datasets for other languages.

        + +

        Optimizations with Intel Extension for PyTorch and Intel Neural Compressor

        + +

        PyTorch

        + +

        The Intel extension expands PyTorch with up-to-date features and optimizations for an extra performance boost on Intel hardware. Check out how to install Intel Extension for PyTorch. The extension can be loaded as a Python module or linked as a C++ library. Python users can enable it dynamically by importing intel_extension_for_pytorch.

        + +
          +
        • The CPU tutorial gives detailed information about Intel Extension for PyTorch for Intel CPUs. Source code is available at the master branch.
        • +
        • The GPU tutorial gives detailed information about Intel Extension for PyTorch for Intel GPUs. Source code is available at the xpu-master branch.
        • +
        + +

        To optimize the model for inference using Intel Extension for PyTorch, the --ipex option can be passed in. The model is then optimized using the plug-in, and TorchScript speeds up inference by running PyTorch in graph mode. The command to run with this optimization is:

        + +
        python inference_custom.py -p data_custom -d 3 -s 50 --vad --ipex --verbose
        +
        + +

        Note: The --verbose option is required to view the latency measurements.
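
        For readers curious about what the --ipex path does in code, here is a minimal, hypothetical sketch of applying Intel Extension for PyTorch optimizations together with TorchScript; the stand-in model and input are placeholders, not the code sample’s actual objects.

        import torch
        import intel_extension_for_pytorch as ipex

        # Stand-in model and input; in the code sample these would be the trained
        # language identification model and a batch of audio samples.
        model = torch.nn.Sequential(torch.nn.Linear(256, 93)).eval()
        example_input = torch.randn(8, 256)

        # Apply Intel Extension for PyTorch optimizations to the eval-mode model.
        model = ipex.optimize(model)

        # Trace and freeze the model so inference runs in TorchScript graph mode.
        with torch.no_grad():
            traced = torch.jit.trace(model, example_input)
            traced = torch.jit.freeze(traced)
            output = traced(example_input)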

        + +

        Auto-mixed precision such as bfloat16 (BF16) support will be added in a future release of the code sample.

        + +

        Intel Neural Compressor

        + +

        This is an open-source Python library that runs on CPUs or GPUs, which:

        + +
          +
        • Performs model quantization to reduce the model size and increase the speed of deep learning inference for deployment.
        • +
        • Automates popular methods such as quantization, compression, pruning, and knowledge distillation across multiple deep-learning frameworks.
        • +
        • Is part of the AI Kit
        • +
        + +

        The model can be quantized from float32 (FP32) precision to integer-8 (INT8) by running the quantize_model.py script while passing in the path to the model and a validation dataset. The following code can be used to load this INT8 model for inference:

        + +
        from neural_compressor.utils.pytorch import load
        +model_int8 = load("./lang_id_commonvoice_model_INT8", self.language_id)
        +signal = self.language_id.load_audio(data_path)
        +prediction = self.model_int8(signal)
        +
        + +

        Note that the original model is required when loading the quantized model. The command to quantize the trained model from FP32 to INT8 by using quantize_model.py is:

        + +
        python quantize_model.py -p ./lang_id_commonvoice_model -datapath $COMMON_VOICE_PATH/commonVoiceData/commonVoice/dev
        +
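
        Under the hood, a post-training quantization flow with Intel Neural Compressor looks roughly like the sketch below, assuming the INC 2.x Python API; the stand-in model, calibration data, and output path are illustrative and not the exact contents of quantize_model.py.

        import torch
        from neural_compressor import PostTrainingQuantConfig, quantization

        # Stand-ins for the trained FP32 model and a calibration/validation dataloader.
        model = torch.nn.Sequential(torch.nn.Linear(256, 93)).eval()
        calib_data = torch.utils.data.TensorDataset(torch.randn(64, 256),
                                                    torch.zeros(64, dtype=torch.long))
        calib_loader = torch.utils.data.DataLoader(calib_data, batch_size=8)

        # Post-training static quantization from FP32 to INT8.
        config = PostTrainingQuantConfig(approach="static")
        q_model = quantization.fit(model=model, conf=config, calib_dataloader=calib_loader)

        # Save the INT8 model; it can later be restored with
        # neural_compressor.utils.pytorch.load, as shown in the earlier snippet.
        q_model.save("./lang_id_model_INT8")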
        + +

        What’s Next?

        + +

        Try out the above code sample by upgrading the hardware to a 4th Generation Intel Xeon Scalable Processor with Intel AMX and identify up to 93 different languages from Common Voice datasets.

        + +

        We encourage you to learn more about and incorporate Intel’s other AI/ML Framework optimizations and end-to-end portfolio of tools into your AI workflow. Also, visit the AI & ML page covering Intel’s AI software development resources for preparing, building, deploying, and scaling your AI solutions.

        + +

        For more details about the new 4th Gen Intel Xeon Scalable processors, visit Intel’s AI Solution Platform portal where you can learn how Intel is empowering developers to run end-to-end AI pipelines on these powerful CPUs.

        + +

        Useful resources

        + + + +

        Explore more AI code samples

        + + + +

        See all code samples

        + +
        + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/large-scale-training-hugging-face/index.html b/blog/large-scale-training-hugging-face/index.html new file mode 100644 index 000000000000..889b0c1317fb --- /dev/null +++ b/blog/large-scale-training-hugging-face/index.html @@ -0,0 +1,943 @@ + + + + + + + + + + + + + Large Scale Training of Hugging Face Transformers on TPUs With PyTorch/XLA FSDP | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +

        + by + + Alex Wertheim, Milad Mohammadi, Jack Cao, Alex Spiridonov, Joe Spisak, Lysandre Debut, Sylvain Gugger, Sourab Mangrulkar + +

        +

        AI is transforming many industries through advanced capabilities such as understanding and generating language, answering questions, and delivering accurate recommendations. These capabilities are fueled by ever-increasing size and complexity of AI models, which require vast amounts of computing power to train.

        + +

        To meet the growing demands of AI training at scale, last year we introduced Fully Sharded Data Parallel (FSDP) in PyTorch/XLA. FSDP is a model parallelism architecture that unlocks the ability to easily and efficiently scale AI models into hundreds of billions of parameters. With PyTorch/XLA FSDP, during distributed training, each device can store a specific model shard, and all-gather the full model weights when it is time to perform the forward pass. Nested FSDP further optimizes performance by only using a given layer’s full parameters during its forward pass.
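
        As a rough illustration of the core mechanism (separate from the Trainer integration described below), wrapping a module with PyTorch/XLA FSDP looks approximately like this sketch; the toy model and random input stand in for a real transformer and batch.

        import torch
        import torch_xla.core.xla_model as xm
        from torch_xla.distributed.fsdp import XlaFullyShardedDataParallel as FSDP

        device = xm.xla_device()

        # Toy model standing in for a real transformer such as GPT-2.
        model = torch.nn.Sequential(torch.nn.Linear(1024, 4096),
                                    torch.nn.GELU(),
                                    torch.nn.Linear(4096, 1024)).to(device)

        # After wrapping, each device stores only its shard of the parameters; full
        # weights are all-gathered just in time for the forward pass.
        model = FSDP(model)
        optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)

        inputs = torch.randn(8, 1024, device=device)
        loss = model(inputs).pow(2).mean()
        loss.backward()
        optimizer.step()  # FSDP already handles gradient sharding/reduction
        xm.mark_step()    # materialize the lazily built XLA graph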

        + +

        We are excited to announce that PyTorch/XLA FSDP has landed in Hugging Face Transformers. Now, Hugging Face users can train PyTorch models with up to 20 times more parameters using the same amount of computing power as before.

        + +

        We built PyTorch/XLA FSDP support directly into the Hugging Face Trainer class, so that any model using Trainer can leverage FSDP. And with the addition of automatic wrapping to PyTorch/XLA FSDP, nested FSDP wrapping is both flexible and simple to apply. These new features make it easy to train a wide range of Hugging Face models at large scales. In this guide, we demonstrate training GPT-2 models with up to 128B parameters on Google Cloud TPUs. PyTorch/XLA FSDP training on TPUs is highly efficient, achieving up to 45.1% model FLOPS utilization (MFU) for GPT-2:

        + +

        Figure 1: Model FLOPS utilization for Hugging Face GPT-2 on Google Cloud TPU v4


        + +

        Configuring PyTorch/XLA FSDP in the Hugging Face Trainer

        + +

        First, follow your preferred method to create your TPU(s) and install PyTorch and PyTorch/XLA. You need versions >= 2.0 for PyTorch and PyTorch/XLA.

        + +
            pip3 install https://storage.googleapis.com/tpu-pytorch/wheels/tpuvm/torch-2.0-cp38-cp38-linux_x86_64.whl --user
        +
        +    pip3 install https://storage.googleapis.com/tpu-pytorch/wheels/tpuvm/torch_xla-2.0-cp38-cp38-linux_x86_64.whl
        +
        + +

        Next, clone and install the Hugging Face Transformers repo. Install all necessary dependencies (e.g., datasets, evaluate, scikit-learn, accelerate).

        + +
            cd $HOME
        +    git clone https://github.com/huggingface/transformers.git
        +    cd transformers
        +    git checkout v4.31-release
        +    pip3 install -e .
        +    pip3 install datasets evaluate scikit-learn
        +    pip3 install accelerate==0.21.0
        +
        + +

        In $HOME/transformers, create any model-specific configuration files you might need. Here is an example of a configuration file for a GPT-2 model with 2B parameters, which we later refer to as gpt2_config.json:

        + +
        {
        +    "activation_function": "gelu_new", 
        +    "architectures": [
        +        "GPT2LMHeadModel"
        +    ],
        +    "attn_pdrop": 0.1,
        +    "bos_token_id": 50256, "embd_pdrop": 0.1, "eos_token_id": 50256, "initializer_range": 0.02, "layer_norm_epsilon": 1e-05, "model_type": "gpt2",
        +    "n_embd": 3072,
        +    "n_head": 24,
        +    "n_layer": 18,
        +    "n_positions": 1024,
        +    "resid_pdrop": 0.1,
        +    "summary_activation": null,
        +    "summary_first_dropout": 0.1,
        +    "summary_proj_to_labels": true,
        +    "summary_type": "cls_index",
        +    "summary_use_proj": true,
        +    "task_specific_params": {
        +        "text-generation": {
        +            "do_sample": true,
        +            "max_length": 50
        +        }
        +    },
        +    "vocab_size": 50257
        +}
        +
        + +

        With PyTorch/XLA FSDP, it is possible to train model sizes much bigger than this on large accelerator slices. We have trained GPT-2 models as large as 128B parameters with these techniques; for expert tips on how to replicate this scale, see the appendix.

        + +

        In $HOME/transformers, create your FSDP configuration file, a JSON file containing all of the configurable aspects of your XLA FSDP wrapping stored as a dictionary. Following the official Hugging Face Transformers XLA FSDP documentation, the following arguments are available to set:

        +
          +
        • xla (bool, *optional*, defaults to False): This is a boolean which determines whether or not you use XLA FSDP. Make sure to set this to true.
        • +
        • xla_fsdp_settings (dict, *optional*): This is a dictionary which stores all of the XLA FSDP wrapping parameters you want to set; note that you do not have to specify settings for parameters where you are using the default value. For a complete list of settings, see here.
        • +
        + +

        For compute_dtype and buffer_dtype, enter these as strings which contain the corresponding torch data type, e.g. bfloat16.

        + +
          +
        • fsdp_min_num_params (int, *optional*, defaults to 0): An integer which sets the minimum number of parameters for size-based auto wrapping. Every module with at least as many parameters as fsdp_min_num_params will be XLA FSDP wrapped.
        • +
        • fsdp_transformer_layer_cls_to_wrap (List[str], *optional*): A list of (case-sensitive) transformer layer class names to wrap. Note that this is mutually exclusive with fsdp_min_num_params. Example: ["GPT2Block", "GPT2MLP"].
        • +
        • xla_fsdp_grad_ckpt (bool, *optional*, defaults to False): This is a boolean which determines whether to use gradient checkpointing over each nested XLA FSDP wrapped layer. This setting can only be used when the xla flag is set to true, and an auto wrapping policy is specified through fsdp_min_num_params or fsdp_transformer_layer_cls_to_wrap.
        • +
        + +

        Note: For transformer-based models, use fsdp_transformer_layer_cls_to_wrap instead of fsdp_min_num_params when performing automatic nested FSDP wrapping. Layers which share weights should not belong to separate FSDP wrapped units, and the input and output embedding layers in transformer-based models share weights.

        + +

        For this GPT-2 example, here is what the corresponding fsdp_config.json file looks like:

        + +
            {
        +        "fsdp_transformer_layer_cls_to_wrap": [
        +            "GPT2Block"
        +        ],
        +        "xla": true,
        +        "xla_fsdp_settings": {
        +            "compute_dtype": "bfloat16",
        +            "shard_param_on_dim_0": true,
        +            "pin_layout_in_collective_ops": true
        +        },
        +       "xla_fsdp_grad_ckpt": true
        +    }
        +
        + + + + + + + +
        Now, it’s time to train your model! First, ensure that you have your PyTorch/XLA runtime set up appropriately by setting
        + +
            export PJRT_DEVICE=TPU
        +
        + +

        When running training, the key flags to pass are:

        + +

        a) --fsdp "full_shard"
        b) --fsdp_config fsdp_config.json

        + +

        where you should replace fsdp_config.json with whatever you named your FSDP configuration file. Here is a sample command to train our example 2B GPT-2 model, where training is started by xla_spawn.py, a launcher script for distributed TPU training.

        + +
            python3 -u examples/pytorch/xla_spawn.py --num_cores 4 examples/pytorch/language-modeling/run_clm.py \
        +    --num_train_epochs 1 \
        +    --dataset_name wikitext \
        +    --dataset_config_name wikitext-2-raw-v1 \
        +    --per_device_train_batch_size 32 \
        +    --per_device_eval_batch_size 32 \
        +    --do_train \
        +    --do_eval \
        +    --output_dir /tmp/test-clm \
        +    --overwrite_output_dir \
        +    --config_name gpt2_config.json \
        +    --cache_dir /tmp \
        +    --tokenizer_name gpt2 \
        +    --block_size 1024 \
        +    --optim adafactor \
        +    --adafactor true \
        +    --save_strategy no \
        +    --logging_strategy no \
        +    --fsdp "full_shard" \
        +    --fsdp_config fsdp_config.json
        +
        + +

        Measuring Model FLOPS Utilization (MFU) for GPT-2

        + +

        Model FLOPS are the floating point operations required to perform a single forward and backward pass. Model FLOPS are hardware- and implementation- independent, and only depend on the underlying model. In each step, the number of FLOPS is computed via the following formulas:

        + +
        tokens_per_batch = global_batch_size * seq_len
        +
        +FLOPS_per_step = 6 * tokens_per_batch * num_params
        +
        + +

        where seq_len is the sequence length and num_params is the number of parameters in the model. We note that this estimation assumes that the input dimensionality is much larger than the input sequence length (d_model >> seq_len). If this assumption is violated, the self-attention FLOPS become significant enough that this expression will underestimate the true MFU.

        + +

        Based on the step time and the hardware details (numbers of chips and the peak FLOPS per chip), we can compute Model FLOPS Utilization (MFU), which measures how effectively our implementation is using the underlying hardware. Achieving 100% MFU means that the hardware is being used perfectly by that model. We calculate MFU using the following formula:

        + +
        model_FLOPS_utilization = FLOPS_per_step / step_time(s) / chip_count / FLOPS_per_chip
        +
        + +

        When training a GPT-2 model with 2B parameters with the XLA FSDP configuration file above on a Cloud TPU v4-8, we measure a step time of 4.191s. Using the above formula, we calculate 35.7% MFU on a v4-8. For further details on calculating MFU, refer to the PaLM paper.
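
        As a concrete check of these formulas, here is a short worked example for the v4-8, 2B-parameter case; the chip count for a v4-8 slice and the per-chip peak bf16 throughput are our assumptions, not values stated in this post.

        # Worked MFU example for the 2B-parameter GPT-2 run on a TPU v4-8.
        flops_per_step = 1.65e15      # PFLOPS / step, from the table below
        step_time_s = 4.191           # measured step time in seconds
        chip_count = 4                # assumption: a v4-8 slice has 4 chips
        peak_flops_per_chip = 275e12  # assumption: TPU v4 peak bf16 FLOPS per chip

        mfu = flops_per_step / step_time_s / chip_count / peak_flops_per_chip
        print(f"MFU = {mfu:.1%}")     # ~35.8%, in line with the ~35.7% reported above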

        + +

        The table below presents MFU for GPT-2 models with sizes between 2B and 128B, with a sequence length of 1024.

        + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
        TPU NumCores + v4-8 + v4-64 + v4-128 + v4-128 + v4-256 + v4-512 +
        # of Tokens / Batch + 131,072 + 524,288 + 524,288 + 524,288 + 1,048,576 + 1,048,576 +
        # of Parameters + 2B + 16B + 20B + 32B + 64B + 128B +
        Step Time (ms) + 4,191 + 14,592 + 7,824 + 12,970 + 25,653 + 30,460 +
        PFLOPS / Step + 1.65 + 50 + 62 + 101 + 404 + 809 +
        MFU + 35.7% + 38.8% + 45.1% + 44.4% + 44.7% + 37.7% +
        + +

        Table 1: GPT-2 model FLOPS utilization calculation details

        + +

        Among these configurations, MFU peaks at 45.1% for the 20B parameter model on v4-128. This result compares favorably to, for example, 41.5% MFU for a 22B Megatron-like model.

        + +

        There are two actionable insights from these experiments:

        + +

        First, simply increasing the number of chips without increasing the batch size generally means lower FLOPS utilization, because more time is spent on sharing the model shards. FSDP uses all-reduce communication collectives which are not asynchronous, which means that chip-to-chip communication cannot be overlapped with computation. As the number of chips increases, the number of model shards that must be communicated increases, and so we should expect the portion of the step time spent on communication to increase with the number of chips.

        + +

        Second, increasing the batch size generally means better FLOPS utilization. As the number of chips increases, the memory footprint of the model decreases, which often frees up high bandwidth memory (HBM) to scale up the global batch size. With a larger global batch size, the number of tokens processed in each step increases, and thus, so does the FLOPS per step. As long as the step time does not increase proportionally, we expect a larger global batch size to improve MFU.

        + +

        Therefore, to maximize the MFU, we recommend training with the largest global batch size possible that can fit in the HBM of the TPU slice, using FSDP to reduce memory required for the model parameters.

        + +

        Training Very Large Models (tested to 128B parameters)

        + +

        When using PyTorch/XLA, tensors must be initialized on the CPU before being moved to the XLA device. This means one may encounter host-side out-of-memory errors if the model is sufficiently large, even though the model can fit in the device HBM after sharding. To avoid this, we must defer each submodule’s initialization until it is FSDP wrapped, which ensures that submodules are sharded as soon as their values are populated, avoiding host-side limitations.

        + +

        Below, we explain how to modify a local copy of the Hugging Face transformers repository to train a GPT-2 model with up to 128B parameters using this technique.

        + +

        First, using the commands below, install torchdistX, which is a library containing experimental PyTorch Distributed features. This is the engine behind deferred initialization, and allows you to create tensors that don’t require immediate storage and can be materialized later. You also need to install a specific PyTorch/XLA 2.0 version that takes advantage of this package; note that you must uninstall PyTorch and PyTorch/XLA first, if you installed them earlier.

        + +
        pip3 install torch==2.0 --index-url https://download.pytorch.org/whl/test/cpu --user
        +pip3 install torch_xla[torchdistx] -f https://storage.googleapis.com/tpu-pytorch/wheels/tpuvm/experimental/torch_xla-2.0-cp38-cp38-linux_x86_64.whl
        +
        + +

        Next, apply the following changes to your local copy of Hugging Face Transformers:

        + +

        In src/transformers/trainer.py, add the following function in _wrap_model on the line immediately prior to PyTorch/XLA FSDP wrapping:

        + +
        from torchdistx import deferred_init
        +
        +def _init_with_torchdistX(module):
        +    def check_fn(k):
        +        return not isinstance(k, FSDP)
        +    deferred_init.materialize_module(module, check_fn=check_fn)
        +
        + +

        The function materialize_module will initialize the model tensors if check_fn returns True. In this case, check_fn checks whether the module has been FSDP wrapped.

        + +

        Within _wrap_model, modify your FSDP wrapping to accept the additional argument param_init_fn=_init_with_torchdistX:

        + +
        self.model = model = FSDP(
        +        model,
        +        auto_wrap_policy=auto_wrap_policy,
        +        auto_wrapper_callable=auto_wrapper_callable,
        +        param_init_fn=_init_with_torchdistX,
        +        **fsdp_kwargs,
        +    )
        +
        + +

        In examples/pytorch/language-modeling/run_clm.py, add the following import statement at the beginning of the file:

        + +
        from torchdistx import deferred_init
        +
        + +

        Edit the model initialization so that the model is wrapped with deferred_init.deferred_init by replacing the line

        + +
        model = AutoModelForCausalLM.from_config(config)
        +
        + +

        with

        + +
        model = deferred_init.deferred_init(AutoModelForCausalLM.from_config, config)
        +
        + +

        Note that this assumes you are supplying your own model configuration file. Otherwise, you should modify your model initialization statement accordingly.

        + +

        You should also comment out these two lines which immediately follow the line above:

        + +
        n_params = sum({p.data_ptr(): p.numel() for p in model.parameters()}.values())
        +logger.info(f"Training new model from scratch - Total size={n_params/2**20:.2f}M params")
        +
        + +

        They will cause an error if left unmodified, since the model tensors do not actually have storage when these lines are executed.

        + +

        With these changes, you can now run GPT-2 models with as many as 128B parameters, provided the accelerator size is suitably large.

        + +

        Next Steps & Acknowledgements

        + +

        To learn more, the docs can be found here. We’d love to hear from you if you run into any issues with FSDP in PyTorch/XLA, or just want to tell us about how you are using it.

        + +

        We are ecstatic about what’s ahead for PyTorch/XLA and invite the community to join us. PyTorch/XLA is developed fully in open source. So, please file issues, submit pull requests, and send RFCs to GitHub so that we can openly collaborate.

        + +

        We’d like to thank Ronghang Hu and Ross Girshick at Meta AI and Lysandre Debut, Sourab Mangrulkar, Sylvain Gugger and Arthur Zucker for all the support and collaboration. We’d also like to thank Jiewen Tan, Liyang Lu, Will Cromar, Vaibhav Singh, and Chandra Devarakonda for their assistance in preparing this post.

        + +

        Cheers!

        + +

        The PyTorch/XLA Team at Google

        + +
        + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/lightning-ai-joins-pytorch/index.html b/blog/lightning-ai-joins-pytorch/index.html new file mode 100644 index 000000000000..e7848dac6981 --- /dev/null +++ b/blog/lightning-ai-joins-pytorch/index.html @@ -0,0 +1,671 @@ + + + + + + + + + + + + + Lightning AI Joins the PyTorch Foundation as a Premier Member | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +

        + by + + Team PyTorch + +

        +

        The PyTorch Foundation, a neutral home for the deep learning community to collaborate on the open source PyTorch framework and ecosystem, is announcing today that Lightning AI has joined as a premier member.

        + +

        Lightning AI is the company behind PyTorch Lightning, the platform and open-source framework for companies to build and deploy AI products leveraging the latest generative AI models.

        + +

        “This is a very important milestone for Lightning AI and the PyTorch Lightning community,” remarks Luca Antiga, Chief Technology Officer of Lightning AI. “By joining the PyTorch Foundation, we are strengthening our commitment to boost the adoption of PyTorch across industries. We look forward to partnering with the Foundation to push the vision of PyTorch forward.”

        + +

        PyTorch Lightning is one of the leading projects in the PyTorch ecosystem, allowing developers to build, train, fine-tune and deploy AI models at scale. PyTorch Lightning is helping drive the rapid adoption of PyTorch by both the research community and the enterprise.

        + +

        “Lightning AI has been a great steward of the AI community, and notably a key contributor to PyTorch over the years,” said PyTorch Foundation Executive Director Ibrahim Haddad. “Their goal of making AI research scalable directly aligns with our mission at the foundation.”

        + +

        As a premier member, Lightning AI is granted one seat to the PyTorch Foundation Governing Board. The Board sets policy through our bylaws, mission and vision statements, describing the overarching scope of foundation initiatives, technical vision, and direction.

        + +

        We’re happy to welcome Luca Antiga, Chief Technology Officer at Lightning AI, to our board. Luca joined the Lightning AI team in April 2021 when the Tensorwerk team joined Grid AI. Prior to joining Lightning AI, Luca co-founded Orobix, an applied AI company, and Tensorwerk. He was an early core contributor to PyTorch and co-authored Deep Learning with PyTorch (Manning).

        + +

        To learn more about how you can be a part of the PyTorch Foundation, visit our website.

        + +

        About Lightning AI

        + +

        Lightning AI is the creator of PyTorch Lightning, the deep learning platform and open-source framework of choice for developers and companies seeking to build and deploy AI products.

        + +

        About PyTorch Foundation

        + +

        The PyTorch Foundation is a neutral home for the deep learning community to collaborate on the open source PyTorch framework and ecosystem. The PyTorch Foundation is supported by its members and leading contributors to the PyTorch open source project. The Foundation leverages resources provided by members and contributors to enable community discussions and collaboration.

        + +

        About The Linux Foundation

        + +

        The Linux Foundation is the world’s leading home for collaboration on open source software, hardware, standards, and data. Linux Foundation projects are critical to the world’s infrastructure including Linux, Kubernetes, Node.js, ONAP, PyTorch, RISC-V, SPDX, OpenChain, and more. The Linux Foundation focuses on leveraging best practices and addressing the needs of contributors, users, and solution providers to create sustainable models for open collaboration. For more information, please visit us at linuxfoundation.org. The Linux Foundation has registered trademarks and uses trademarks. For a list of trademarks of The Linux Foundation, please see its trademark usage page. Linux is a registered trademark of Linus Torvalds.

        + +
        + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/llama-into-torchtune/index.html b/blog/llama-into-torchtune/index.html new file mode 100644 index 000000000000..bc239575e481 --- /dev/null +++ b/blog/llama-into-torchtune/index.html @@ -0,0 +1,1418 @@ + + + + + + + + + + + + + Distilling Llama3.1 8B into 1B in torchtune | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +

        November 18, 2024

        +

        + Distilling Llama3.1 8B into 1B in torchtune +

        +
        +
        + +
        +
        +
        + +
        +

        + by + + Linda Wang, Evan Smothers, Kartikay Khandelwal + +

        +

        In this blog, we present a case study on distilling a Llama 3.1 8B model into Llama 3.2 1B using torchtune’s knowledge distillation recipe. We demonstrate how knowledge distillation (KD) can be used in post-training to improve instruction-following task performance and showcase how users can leverage the recipe.

        + +

        What is Knowledge Distillation?

        + +

        Knowledge Distillation is a widely used compression technique that transfers knowledge from a larger (teacher) model to a smaller (student) model. Larger models have more parameters and capacity for knowledge; however, this larger capacity is also more computationally expensive to deploy. Knowledge distillation can be used to compress the knowledge of a larger model into a smaller model. The idea is that the performance of smaller models can be improved by learning from the larger model’s outputs.

        + +

        How does Knowledge Distillation work?

        + +

        Knowledge is transferred from the teacher to student model by training on a transfer set where the student is trained to imitate the token-level probability distributions of the teacher. The assumption is that the teacher model distribution is similar to the transfer dataset. The diagram below is a simplified representation of how KD works.

        + +

        Figure 1: Simplified representation of knowledge transfer from teacher to student model


        + +

        As knowledge distillation for LLMs is an active area of research, there are papers, such as MiniLLM, DistiLLM, AKL, and Generalized KD, investigating different loss approaches. In this case study, we focus on the standard cross-entropy (CE) loss with the forward Kullback-Leibler (KL) divergence loss as the baseline. Forward KL divergence aims to minimize the difference by forcing the student’s distribution to align with all of the teacher’s distributions.
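
        To make the baseline concrete, a per-token forward KL term can be written as in the sketch below; this illustrates the formula and is not torchtune’s exact ForwardKLLoss implementation.

        import torch
        import torch.nn.functional as F

        def forward_kl(student_logits, teacher_logits, labels, ignore_index=-100):
            # Forward KL(teacher || student), averaged over non-padding token positions.
            teacher_prob = F.softmax(teacher_logits, dim=-1)
            teacher_logprob = F.log_softmax(teacher_logits, dim=-1)
            student_logprob = F.log_softmax(student_logits, dim=-1)
            # Sum over the vocabulary dimension for each token position.
            kl_per_token = (teacher_prob * (teacher_logprob - student_logprob)).sum(dim=-1)
            mask = (labels != ignore_index).float()
            return (kl_per_token * mask).sum() / mask.sum().clamp(min=1)

        # Example with toy shapes: [batch, seq_len, vocab] logits, [batch, seq_len] labels.
        student_logits = torch.randn(2, 8, 128)
        teacher_logits = torch.randn(2, 8, 128)
        labels = torch.randint(0, 128, (2, 8))
        print(forward_kl(student_logits, teacher_logits, labels))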

        + +

        Why is Knowledge Distillation useful?

        + +

        The idea of knowledge distillation is that a smaller model can achieve better performance using a teacher model’s outputs as an additional signal than it could training from scratch or with supervised fine-tuning. For instance, Llama 3.2 lightweight 1B and 3B text models incorporated logits from Llama 3.1 8B and 70B to recover performance after pruning. In addition, for fine-tuning on instruction-following tasks, research in LLM distillation demonstrates that knowledge distillation methods can outperform supervised fine-tuning (SFT) alone.

        + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
        Model + Method + DollyEval + Self-Inst + S-NI +
        GPT-4 Eval + GPT-4 Eval + Rouge-L +
        Llama 7B + SFT + 73.0 + 69.2 + 32.4 +
        KD + 73.7 + 70.5 + 33.7 +
        MiniLLM + 76.4 + 73.1 + 35.5 +
        Llama 1.1B + SFT + 22.1 + - + 27.8 +
        KD + 22.2 + - + 28.1 +
        AKL + 24.4 + - + 31.4 +
        OpenLlama 3B + SFT + 47.3 + 41.7 + 29.3 +
        KD + 44.9 + 42.1 + 27.9 +
        SeqKD + 48.1 + 46.0 + 29.1 +
        DistiLLM + 59.9 + 53.3 + 37.6 +
        + +

        Table 1: Comparison of knowledge distillation approaches to supervised fine-tuning

        + +

        Below is a simplified example of how knowledge distillation differs from supervised fine-tuning.

        + + + + + + + + + + +
        Supervised fine-tuning + Knowledge distillation +
        +
        +   
        +model = llama3_2_1b()
        +ce_loss = CrossEntropyLoss()
        +kd_loss = ForwardKLLoss()
        +
        +tokens, labels = batch["tokens"], batch["labels"]
        +logits = model(tokens, ...)
        +
        +loss = ce_loss(logits, labels)
        +loss.backward()
        +
        +   
        +   
        +
        +
        +   
        +model = llama3_2_1b()
        +teacher_model = llama3_1_8b()
        +ce_loss = CrossEntropyLoss()
        +kd_loss = ForwardKLLoss()
        +
        +tokens, labels = batch["tokens"], batch["labels"]
        +logits = model(tokens, ...)
        +teacher_logits = teacher_model(tokens, ...)
        +loss = ce_loss(logits, labels) + kd_loss(logits, teacher_logits, labels)
        +loss.backward()
        +   
        +   
        +
        + +

        KD recipe in torchtune

        + +

        With torchtune, we can easily apply knowledge distillation to Llama3, as well as other LLM model families, using torchtune’s KD recipe. The objective for this recipe is to fine-tune Llama3.2-1B on the Alpaca instruction-following dataset by distilling from Llama3.1-8B. This recipe focuses on post-training and assumes the teacher and student models have already been pre-trained.

        + +

        First, we have to download the model weights. To be consistent with other torchtune fine-tuning configs, we will use the instruction tuned models of Llama3.1-8B as teacher and Llama3.2-1B as student.

        + +
        tune download meta-llama/Meta-Llama-3.1-8B-Instruct --output-dir /tmp/Meta-Llama-3.1-8B-Instruct --ignore-patterns "original/consolidated.00.pth" --hf_token <HF_TOKEN>
        +
        +tune download meta-llama/Llama-3.2-1B-Instruct --output-dir /tmp/Llama-3.2-1B-Instruct --ignore-patterns "original/consolidated.00.pth" --hf_token <HF_TOKEN>
        +
        + +

        In order for the teacher model distribution to be similar to the Alpaca dataset, we will fine-tune the teacher model using LoRA. Based on our experiments, shown in the next section, we’ve found that KD performs better when the teacher model is already fine-tuned on the target dataset.

        + +
        tune run lora_finetune_single_device --config llama3_1/8B_lora_single_device
        +
        + +

        Finally, we can run the following command to distill the fine-tuned 8B model into the 1B model on a single GPU. For this case study, we used a single A100 80GB GPU. We also have a distributed recipe for running on multiple devices.

        + +
        tune run knowledge_distillation_single_device --config llama3_2/knowledge_distillation_single_device
        +
        + +

        Ablation studies

        + +

        In this section, we demonstrate how changing configurations and hyperparameters can affect performance. By default, our configuration uses the LoRA fine-tuned 8B teacher model, downloaded 1B student model, learning rate of 3e-4 and KD loss ratio of 0.5. For this case study, we fine-tuned on the alpaca_cleaned_dataset and evaluated the models on truthfulqa_mc2, hellaswag and commonsense_qa tasks through the EleutherAI LM evaluation harness. Let’s take a look at the effects of:

        + +
          +
        1. Using a fine-tuned teacher model
        2. +
        3. Using a fine-tuned student model
        4. +
        5. Hyperparameter tuning of KD loss ratio and learning rate
        6. +
        + +

        Using a fine-tuned teacher model

        + +

        The default settings in the config uses the fine-tuned teacher model. Now, let’s take a look at the effects of not fine-tuning the teacher model first.

        + +

        Taking a look at the losses, using the baseline 8B as the teacher results in a higher loss than using the fine-tuned teacher model. The KD loss also remains relatively constant, suggesting that the teacher model should have the same distributions as the transfer dataset.

        + +

        Figure 2: (left to right) KD loss from forward KL divergence, class loss from cross entropy, total loss: even combination of KD and class loss.


        + +

        In our benchmarks, we can see that supervised fine-tuning of the 1B model achieves better accuracy than the baseline 1B model. By using the fine-tuned 8B teacher model, we see comparable results for truthfulqa and improvement for hellaswag and commonsense. When using the baseline 8B as a teacher, we see improvement across all metrics, but lower than the other configurations.

        + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
        Model + TruthfulQA + hellaswag + commonsense +
        mc2 + acc + acc_norm + acc +
        Baseline Llama 3.1 8B + 0.5401 + 0.5911 + 0.7915 + 0.7707 +
        Fine-tuned Llama 3.1 8B using LoRA + 0.5475 + 0.6031 + 0.7951 + 0.7789 +
        Baseline Llama 3.2 1B + 0.4384 + 0.4517 + 0.6064 + 0.5536 +
        Fine-tuned Llama 3.2 1B using LoRA + 0.4492 + 0.4595 + 0.6132 + 0.5528 +
        KD using baseline 8B as teacher + 0.444 + 0.4576 + 0.6123 + 0.5561 +
        KD using fine-tuned 8B as teacher + 0.4481 + 0.4603 + 0.6157 + 0.5569 +
        + +

        Table 2: Comparison between using baseline and fine-tuned 8B as teacher model

        + +

        Using a fine-tuned student model

        + +

        For these experiments, we look at the effects of KD when the student model is already fine-tuned. We analyze the effects using different combinations of baseline and fine-tuned 8B and 1B models.

        + +

        Based on the loss graphs, using a fine-tuned teacher model results in a lower loss irrespective of whether the student model is fine-tuned or not. It’s also interesting to note that the class loss starts to increase when using a fine-tuned student model.

        + +

        Figure 3: Comparing losses of different teacher and student model initializations


        + +

        Using the fine-tuned student model boosts accuracy even further for truthfulqa, but the accuracy drops for hellaswag and commonsense. Using a fine-tuned teacher model and baseline student model achieved the best results on hellaswag and commonsense dataset. Based on these findings, the best configuration will change depending on which evaluation dataset and metric you are optimizing for.

        + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
        Model + TruthfulQA + hellaswag + commonsense +
        mc2 + acc + acc_norm + acc +
        Baseline Llama 3.1 8B + 0.5401 + 0.5911 + 0.7915 + 0.7707 +
        Fine-tuned Llama 3.1 8B using LoRA + 0.5475 + 0.6031 + 0.7951 + 0.7789 +
        Baseline Llama 3.2 1B + 0.4384 + 0.4517 + 0.6064 + 0.5536 +
        Fine-tuned Llama 3.2 1B using LoRA + 0.4492 + 0.4595 + 0.6132 + 0.5528 +
        KD using baseline 8B and baseline 1B + 0.444 + 0.4576 + 0.6123 + 0.5561 +
        KD using baseline 8B and fine-tuned 1B + 0.4508 + 0.448 + 0.6004 + 0.5274 +
        KD using fine-tuned 8B and baseline 1B + 0.4481 + 0.4603 + 0.6157 + 0.5569 +
        KD using fine-tuned 8B and fine-tuned 1B + 0.4713 + 0.4512 + 0.599 + 0.5233 +
        + +

        Table 3: Comparison using baseline and fine-tuned teacher and student models

        + +

        Hyperparameter tuning: learning rate

        + +

        By default, the recipe has a learning rate of 3e-4. For these experiments, we changed the learning rate from as high as 1e-3 to as low as 1e-5.

        + +

        Based on the loss graphs, all learning rates result in similar losses except for 1e-5, which has a higher KD and class loss.

        + +

        Figure 4: Comparing losses of different learning rates


        + +

        Based on our benchmarks, the optimal learning rate changes depending on which metric and tasks you are optimizing for.

        + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
        Model + learning rate + TruthfulQA + hellaswag + commonsense +
        mc2 + acc + acc_norm + acc +
        Baseline Llama 3.1 8B + - + 0.5401 + 0.5911 + 0.7915 + 0.7707 +
        Fine-tuned Llama 3.1 8B using LoRA + - + 0.5475 + 0.6031 + 0.7951 + 0.7789 +
        Baseline Llama 3.2 1B + - + 0.4384 + 0.4517 + 0.6064 + 0.5536 +
        Fine-tuned Llama 3.2 1B using LoRA + - + 0.4492 + 0.4595 + 0.6132 + 0.5528 +
        KD using fine-tuned 8B and baseline 1B + 3e-4 + 0.4481 + 0.4603 + 0.6157 + 0.5569 +
        KD using fine-tuned 8B and baseline 1B + 1e-3 + 0.4453 + 0.4535 + 0.6071 + 0.5258 +
        KD using fine-tuned 8B and baseline 1B + 1e-4 + 0.4489 + 0.4606 + 0.6156 + 0.5586 +
        KD using fine-tuned 8B and baseline 1B + 1e-5 + 0.4547 + 0.4548 + 0.6114 + 0.5487 +
        + +

        Table 4: Effects of tuning learning rate

        + +

        Hyperparameter tuning: KD ratio

        + +

        By default, the KD ratio is set to 0.5, which gives even weighting to both the class and KD loss. In these experiments, we look at the effects of different KD ratios, where 0 only uses the class loss and 1 only uses the KD loss.
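
        In code, this weighting corresponds to a blend like the line below, reusing the names from the knowledge distillation snippet shown earlier; this is a sketch of the convention described above, not necessarily torchtune’s exact implementation.

        # kd_ratio = 0.0 -> pure class (cross-entropy) loss; kd_ratio = 1.0 -> pure KD loss.
        kd_ratio = 0.5
        loss = (1 - kd_ratio) * ce_loss(logits, labels) + kd_ratio * kd_loss(logits, teacher_logits, labels)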


        Overall, the benchmark results show that for these tasks and metrics, higher KD ratios perform slightly better.

| Model | kd_ratio (lr=3e-4) | TruthfulQA (mc2) | hellaswag (acc) | hellaswag (acc_norm) | commonsense (acc) |
|---|---|---|---|---|---|
| Baseline Llama 3.1 8B | - | 0.5401 | 0.5911 | 0.7915 | 0.7707 |
| Fine-tuned Llama 3.1 8B using LoRA | - | 0.5475 | 0.6031 | 0.7951 | 0.7789 |
| Baseline Llama 3.2 1B | - | 0.4384 | 0.4517 | 0.6064 | 0.5536 |
| Fine-tuned Llama 3.2 1B using LoRA | - | 0.4492 | 0.4595 | 0.6132 | 0.5528 |
| KD using fine-tuned 8B and baseline 1B | 0.25 | 0.4485 | 0.4595 | 0.6155 | 0.5602 |
| KD using fine-tuned 8B and baseline 1B | 0.5 | 0.4481 | 0.4603 | 0.6157 | 0.5569 |
| KD using fine-tuned 8B and baseline 1B | 0.75 | 0.4543 | 0.463 | 0.6189 | 0.5643 |
| KD using fine-tuned 8B and baseline 1B | 1.0 | 0.4537 | 0.4641 | 0.6177 | 0.5717 |

        Table 5: Effects of tuning KD ratio


        Looking Ahead


        In this blog, we presented a study on how to distill LLMs through torchtune using the forward KL divergence loss on Llama 3.1 8B and Llama 3.2 1B logits. There are many directions for future exploration to further improve performance and offer more flexibility in distillation methods.

• Expand KD loss offerings. The KD recipe uses the forward KL divergence loss. However, aligning the student distribution to the whole teacher distribution may not be effective, as mentioned above. There are multiple papers, such as MiniLLM, DistiLLM, and Generalized KD, that introduce new KD losses and policies to address this limitation and have been shown to outperform the standard use of cross entropy with forward KL divergence loss. For instance, MiniLLM uses reverse KL divergence to prevent the student from over-estimating low-probability regions of the teacher, and DistiLLM introduces a skewed KL loss and an adaptive training policy.
• Enable cross-tokenizer distillation. The current recipe requires the teacher and student model to use the same tokenizer, which limits the ability to distill across different LLM families. There has been research on cross-tokenizer approaches (e.g. Universal Logit Distillation) that we could explore.
• Expand distillation to multimodal LLMs and encoder models. A natural extension of the KD recipe is to expand to multimodal LLMs. Similar to deploying more efficient LLMs, there is also a need to deploy smaller and more efficient multimodal LLMs. In addition, there has been work demonstrating LLMs as encoder models (e.g. LLM2Vec). Distillation from LLMs-as-encoders to smaller encoder models may also be a promising direction to explore.
diff --git a/blog/mapillary-research/index.html b/blog/mapillary-research/index.html new file mode 100644 index 000000000000..3e917576ec61 --- /dev/null +++ b/blog/mapillary-research/index.html

Mapillary Research: Seamless Scene Segmentation and In-Place Activated BatchNorm | PyTorch

by Lorenzo Porzi, Mapillary


        With roads in developed countries like the US changing up to 15% annually, Mapillary addresses a growing demand for keeping maps updated by combining images from any camera into a 3D visualization of the world. Mapillary’s independent and collaborative approach enables anyone to collect, share, and use street-level images for improving maps, developing cities, and advancing the automotive industry.


        Today, people and organizations all over the world have contributed more than 600 million images toward Mapillary’s mission of helping people understand the world’s places through images and making this data available, with clients and partners including the World Bank, HERE, and Toyota Research Institute.


Mapillary's computer vision technology brings intelligence to maps in an unprecedented way, increasing our overall understanding of the world. Mapillary runs state-of-the-art semantic image analysis and image-based 3D modeling at scale on all of its images. In this post we discuss two recent works from Mapillary Research and their implementations in PyTorch - Seamless Scene Segmentation [1] and In-Place Activated BatchNorm [2] - generating panoptic segmentation results and saving up to 50% of GPU memory during training, respectively.


        Seamless Scene Segmentation


        Github project page: https://github.com/mapillary/seamseg/


        The objective of Seamless Scene Segmentation is to predict a “panoptic” segmentation [3] from an image, that is a complete labeling where each pixel is assigned with a class id and, where possible, an instance id. Like many modern CNNs dealing with instance detection and segmentation, we adopt the Mask R-CNN framework [4], using ResNet50 + FPN [5] as a backbone. This architecture works in two stages: first, the “Proposal Head” selects a set of candidate bounding boxes on the image (i.e. the proposals) that could contain an object; then, the “Mask Head” focuses on each proposal, predicting its class and segmentation mask. The output of this process is a “sparse” instance segmentation, covering only the parts of the image that contain countable objects (e.g. cars and pedestrians).


        To complete our panoptic approach coined Seamless Scene Segmentation, we add a third stage to Mask R-CNN. Stemming from the same backbone, the “Semantic Head” predicts a dense semantic segmentation over the whole image, also accounting for the uncountable or amorphous classes (e.g. road and sky). The outputs of the Mask and Semantic heads are finally fused using a simple non-maximum suppression algorithm to generate the final panoptic prediction. All details about the actual network architecture, used losses and underlying math can be found at the project website for our CVPR 2019 paper [1].


        While several versions of Mask R-CNN are publicly available, including an official implementation written in Caffe2, at Mapillary we decided to build Seamless Scene Segmentation from scratch using PyTorch, in order to have full control and understanding of the whole pipeline. While doing so we encountered a couple of main stumbling blocks, and had to come up with some creative workarounds we are going to describe next.


        Dealing with variable-sized tensors


Something that sets panoptic segmentation networks apart from traditional CNNs is the prevalence of variable-sized data. In fact, many of the quantities we are dealing with cannot be easily represented with fixed-sized tensors: each image contains a different number of objects, the Proposal head can produce a different number of proposals for each image, and the images themselves can have different sizes. While this is not a problem per se – one could just process images one at a time – we would still like to exploit batch-level parallelism as much as possible. Furthermore, when performing distributed training with multiple GPUs, DistributedDataParallel expects its inputs to be batched, uniformly-sized tensors.


        Our solution to these issues is to wrap each batch of variable-sized tensors in a PackedSequence. PackedSequence is little more than a glorified list class for tensors, tagging its contents as “related”, ensuring that they all share the same type, and providing useful methods like moving all the tensors to a particular device, etc. When performing light-weight operations that wouldn’t be much faster with batch-level parallelism, we simply iterate over the contents of the PackedSequence in a for loop. When performance is crucial, e.g. in the body of the network, we simply concatenate the contents of the PackedSequence, adding zero padding as required (like in RNNs with variable-length inputs), and keeping track of the original dimensions of each tensor.
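As a rough illustration of this idea (not the actual seamseg PackedSequence implementation, and with made-up names), a minimal wrapper might look like this, assuming all wrapped tensors share a dtype and number of dimensions:

import torch

class PackedTensors:
    """A thin container that tags a list of variable-sized tensors as related."""

    def __init__(self, tensors):
        assert all(t.dtype == tensors[0].dtype for t in tensors), "contents must share a dtype"
        self.tensors = list(tensors)

    def to(self, device):
        # Move every contained tensor to the same device
        return PackedTensors([t.to(device) for t in self.tensors])

    def __iter__(self):
        # Light-weight operations can simply iterate over the contents
        return iter(self.tensors)

    def pad_and_cat(self, pad_value=0):
        # Concatenate into one batched tensor, zero-padding to the largest size,
        # while remembering each tensor's original shape
        max_shape = [max(t.shape[d] for t in self.tensors) for d in range(self.tensors[0].dim())]
        out = self.tensors[0].new_full([len(self.tensors)] + max_shape, pad_value)
        sizes = []
        for i, t in enumerate(self.tensors):
            out[(i,) + tuple(slice(0, s) for s in t.shape)] = t
            sizes.append(t.shape)
        return out, sizes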


        PackedSequences also help us deal with the second problem highlighted above. We slightly modify DistributedDataParallel to recognize PackedSequence inputs, splitting them in equally sized chunks and distributing their contents across the GPUs.


        Asymmetric computational graphs with Distributed Data Parallel


        Another, perhaps more subtle, peculiarity of our network is that it can generate asymmetric computational graphs across GPUs. In fact, some of the modules that compose the network are “optional”, in the sense that they are not always computed for all images. As an example, when the Proposal head doesn’t output any proposal, the Mask head is not traversed at all. If we are training on multiple GPUs with DistributedDataParallel, this results in one of the replicas not computing gradients for the Mask head parameters.


        Prior to PyTorch 1.1, this resulted in a crash, so we had to develop a workaround. Our simple but effective solution was to compute a “fake forward pass” when no actual forward is required, i.e. something like this:

def fake_forward():
    fake_input = get_correctly_shaped_fake_input()
    fake_output = mask_head(fake_input)
    fake_loss = fake_output.sum() * 0
    return fake_loss

Here, we generate a batch of bogus data, pass it through the Mask head, and return a loss that always back-propagates zeros to all parameters.


        Starting from PyTorch 1.1 this workaround is no longer required: by setting find_unused_parameters=True in the constructor, DistributedDataParallel is told to identify parameters whose gradients have not been computed by all replicas and correctly handle them. This leads to some substantial simplifications in our code base!
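For reference, enabling this behavior is a one-line change when constructing DDP. This fragment assumes the process group is already initialized and the model has been moved to the local GPU; local_rank is a placeholder for your process's device index.

import torch

model = torch.nn.parallel.DistributedDataParallel(
    model,
    device_ids=[local_rank],      # placeholder: the GPU index assigned to this process
    find_unused_parameters=True,  # lets DDP handle replicas that skip some modules
)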


        In-place Activated BatchNorm


        Github project page: https://github.com/mapillary/inplace_abn/


Most researchers would probably agree that there are always constraints in terms of available GPU resources, regardless of whether their research lab has access to only a few or many thousands of GPUs. At a time when Mapillary still worked with relatively few, mostly 12GB Titan X-style prosumer GPUs, we were searching for a solution that would virtually increase the usable memory during training, so that we would be able to obtain and push state-of-the-art results on dense labeling tasks like semantic segmentation. In-place activated BatchNorm enables us to use up to 50% more memory (at little computational overhead) and is therefore deeply integrated into all our current projects (including Seamless Scene Segmentation described above).


        When processing a BN-Activation-Convolution sequence in the forward pass, most deep learning frameworks (including PyTorch) need to store two big buffers, i.e. the input x of BN and the input z of Conv. This is necessary because the standard implementations of the backward passes of BN and Conv depend on their inputs to calculate the gradients. Using InPlace-ABN to replace the BN-Activation sequence, we can safely discard x, thus saving up to 50% GPU memory at training time. To achieve this, we rewrite the backward pass of BN in terms of its output y, which is in turn reconstructed from z by inverting the activation function.


        The only limitation of InPlace-ABN is that it requires using an invertible activation function, such as leaky relu or elu. Except for this, it can be used as a direct, drop-in replacement for BN+activation modules in any network. Our native CUDA implementation offers minimal computational overhead compared to PyTorch’s standard BN, and is available for anyone to use from here: https://github.com/mapillary/inplace_abn/.
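As a small usage sketch, assuming the inplace_abn package from the repository above is installed (the constructor arguments shown here follow the project's documentation and may change between versions):

import torch.nn as nn
from inplace_abn import InPlaceABN  # assumes `pip install inplace-abn`

# A conventional block keeps both the BN input and the activation output alive
conventional = nn.Sequential(
    nn.BatchNorm2d(64),
    nn.LeakyReLU(0.01),
    nn.Conv2d(64, 128, kernel_size=3, padding=1),
)

# InPlace-ABN fuses BN with an invertible activation, so the BN input buffer can be freed
memory_saving = nn.Sequential(
    InPlaceABN(64, activation="leaky_relu", activation_param=0.01),
    nn.Conv2d(64, 128, kernel_size=3, padding=1),
)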


        Synchronized BN with asymmetric graphs and unbalanced batches


        When training networks with synchronized SGD over multiple GPUs and/or multiple nodes, it’s common practice to compute BatchNorm statistics separately on each device. However, in our experience working with semantic and panoptic segmentation networks, we found that accumulating mean and variance across all workers can bring a substantial boost in accuracy. This is particularly true when dealing with small batches, like in Seamless Scene Segmentation where we train with a single, super-high resolution image per GPU.


        InPlace-ABN supports synchronized operation over multiple GPUs and multiple nodes, and, since version 1.1, this can also be achieved in the standard PyTorch library using SyncBatchNorm. Compared to SyncBatchNorm, however, we support some additional functionality which is particularly important for Seamless Scene Segmentation: unbalanced batches and asymmetric graphs.


        As mentioned before, Mask R-CNN-like networks naturally give rise to variable-sized tensors. Thus, in InPlace-ABN we calculate synchronized statistics using a variant of the parallel algorithm described here, which properly takes into account the fact that each GPU can hold a different number of samples. PyTorch’s SyncBatchNorm is currently being revised to support this, and the improved functionality will be available in a future release.


        Asymmetric graphs (in the sense mentioned above) are another complicating factor one has to deal with when creating a synchronized BatchNorm implementation. Luckily, PyTorch’s distributed group functionality allows us to restrict distributed communication to a subset of workers, easily excluding those that are currently inactive. The only missing piece is that, in order to create a distributed group, each process needs to know the ids of all processes that will participate in the group, and even processes that are not part of the group need to call the new_group() function. In InPlace-ABN we handle it with a function like this:

import torch
import torch.distributed as distributed

def active_group(active):
    """Initialize a distributed group where each process can independently decide whether to participate or not"""
    world_size = distributed.get_world_size()
    rank = distributed.get_rank()

    # Gather active status from all workers
    active = torch.tensor(rank if active else -1, dtype=torch.long, device=torch.cuda.current_device())
    active_workers = torch.empty(world_size, dtype=torch.long, device=torch.cuda.current_device())
    distributed.all_gather(list(active_workers.unbind(0)), active)

    # Create group
    active_workers = [int(i) for i in active_workers.tolist() if i != -1]
    group = distributed.new_group(active_workers)
    return group

        First each process, including inactive ones, communicates its status to all others through an all_gather call, then it creates the distributed group with the shared information. In the actual implementation we also include a caching mechanism for groups, since new_group() is usually too expensive to call at each batch.
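A minimal sketch of the kind of caching mentioned above (ours, not the library's actual code) keys the created groups on the gathered set of active ranks, so new_group() is only paid once per distinct member set:

import torch
import torch.distributed as distributed

_group_cache = {}

def cached_active_group(active):
    # Same gathering step as active_group() above
    world_size = distributed.get_world_size()
    rank = distributed.get_rank()
    status = torch.tensor(rank if active else -1, dtype=torch.long, device=torch.cuda.current_device())
    statuses = torch.empty(world_size, dtype=torch.long, device=torch.cuda.current_device())
    distributed.all_gather(list(statuses.unbind(0)), status)

    members = tuple(int(i) for i in statuses.tolist() if i != -1)
    if members not in _group_cache:
        # new_group() is a collective and too expensive to call at each batch
        _group_cache[members] = distributed.new_group(list(members))
    return _group_cache[members]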


        References


        [1] Seamless Scene Segmentation; Lorenzo Porzi, Samuel Rota Bulò, Aleksander Colovic, Peter Kontschieder; Computer Vision and Pattern Recognition (CVPR), 2019


        [2] In-place Activated BatchNorm for Memory-Optimized Training of DNNs; Samuel Rota Bulò, Lorenzo Porzi, Peter Kontschieder; Computer Vision and Pattern Recognition (CVPR), 2018


        [3] Panoptic Segmentation; Alexander Kirillov, Kaiming He, Ross Girshick, Carsten Rother, Piotr Dollar; Computer Vision and Pattern Recognition (CVPR), 2019


        [4] Mask R-CNN; Kaiming He, Georgia Gkioxari, Piotr Dollar, Ross Girshick; International Conference on Computer Vision (ICCV), 2017


        [5] Feature Pyramid Networks for Object Detection; Tsung-Yi Lin, Piotr Dollar, Ross Girshick, Kaiming He, Bharath Hariharan, Serge Belongie; Computer Vision and Pattern Recognition (CVPR), 2017

diff --git a/blog/maximizing-training-throughput/index.html b/blog/maximizing-training-throughput/index.html new file mode 100644 index 000000000000..85bb770c027d --- /dev/null +++ b/blog/maximizing-training-throughput/index.html

Maximizing Training Throughput Using PyTorch FSDP and Torch.compile | PyTorch

by Team PyTorch at IBM and Team PyTorch at Meta


        Recently, we demonstrated how FSDP and selective activation checkpointing can be used to achieve 57% MFU (Model Flops Utilization) for training a 7B model on A100 GPUs. We also demonstrated how it can train a high quality model, which we open sourced as Granite 7B base model on Hugging Face Hub under the Apache v2.0 license.


We continued our quest to improve the utilization of GPUs by leveraging torch.compile. Using torch.compile and the selective activation checkpointing from our previous work, we achieve an MFU of 68% for the 7B model on A100 GPUs! torch.compile improves training MFU between 10% and 23% for various model sizes.


        This blog is organized into three parts: (1) Challenges addressed in order to train using torch.compile, (2) Numerical parity of compile with no-compile, and (3) MFU report.


We open sourced all the code and updated it in the fms-fsdp repository. We are also working with Team PyTorch at Meta to contribute these to the newly released torchtitan repository for pre-training.


        Challenges of using torch.compile


torch.compile is a graph compilation technique that improves GPU utilization. For details on how torch.compile works, we refer the readers to the recent PyTorch paper and associated tutorials. A key challenge in getting torch.compile to perform well is to minimize (or eliminate) graph breaks. We initially started with the Llama implementation provided by Meta, but compiling it caused too many graph breaks, resulting in reduced training throughput.


        Several portions of the model architecture had to be fixed, with the most important one being the positional embedding layer (RoPE). The typical RoPE implementation uses complex numbers, which was not supported in torch.compile at the time of testing. We implemented RoPE using einops while maintaining parity with the original model architecture implementation. We had to properly cache the frequencies so that we did not run into graph breaks within the RoPE implementation.
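For illustration only, the gist of a real-valued rotary embedding that avoids complex tensors is sketched below; the fms implementation uses einops and differs in detail, and the helper names here are ours. Precomputing and caching the cos/sin tables at the call site is what avoids recomputation and graph breaks.

import torch

def rope_cache(seq_len, head_dim, base=10000.0, device="cpu"):
    # Precompute (and cache) the rotation angles once per sequence length
    inv_freq = 1.0 / (base ** (torch.arange(0, head_dim, 2, device=device).float() / head_dim))
    t = torch.arange(seq_len, device=device).float()
    freqs = torch.outer(t, inv_freq)  # (seq_len, head_dim / 2)
    return freqs.cos(), freqs.sin()

def apply_rope(x, cos, sin):
    # x: (..., seq_len, head_dim) with an even head_dim; rotate consecutive channel pairs
    x_even, x_odd = x[..., 0::2], x[..., 1::2]
    rotated = torch.stack(
        (x_even * cos - x_odd * sin, x_even * sin + x_odd * cos), dim=-1
    )
    return rotated.flatten(-2)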


        Compiling an FSDP model does result in graph breaks, which the PyTorch team at Meta is working to remove. However, these graph breaks as of PyTorch 2.3 are at FSDP unit boundaries and do not affect throughput significantly.


When using custom kernels, we need to wrap each kernel by exposing its API to torch.compile. This involves indicating what parameters are modified in-place, how they are modified, and what shapes and strides their return values will have based on the inputs. In our case, SDPA Flash attention is already integrated appropriately and we were able to get that kernel to work with torch.compile with no graph breaks.


        We also noticed that when increasing the amount of data from 2T to 6T tokens, the data loader became a bottleneck. A key reason for this is the fact that previously, we implemented document shuffling in our dataloader naively, by having each worker maintain a list of shuffled document pointers.


With the larger dataset, these pointer lists were growing to hundreds of thousands of entries per worker. Maintaining pointer lists at this scale became expensive enough that CPU contention throttled our training throughput. We re-implemented document shuffling without any pointer lists using a Linear Congruential Generator (LCG). An LCG is a pseudorandom number generator algorithm that implements a random walk over a population, providing sampling without replacement.


        We leveraged the same idea to produce implicit bijective mappings from ordered to shuffled document indices. This enables us to shrink those annoying lists of hundreds of thousands of pointers down to a single integer state for the LCG. This eliminated 80% of the bottleneck and provided a significant boost to our performance. We will devote a separate blog to go into all the details of our performant pre-training data loader.
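To illustrate the idea (class name and parameters are ours, not the ones in the fms-fsdp data loader), a full-period LCG over a power-of-two range visits every document index exactly once while keeping only a single integer of state:

class LCGShuffle:
    """Sample document indices without replacement using O(1) state."""

    def __init__(self, n_docs, seed=0):
        self.n = n_docs
        # Smallest power of two >= n_docs; with c odd and a % 4 == 1 the LCG has full period
        self.m = 1 << max(1, (n_docs - 1).bit_length())
        self.a, self.c = 5, 1
        self.state = seed % self.m

    def next_index(self):
        # States >= n_docs are skipped; each valid index appears exactly once per period
        while True:
            self.state = (self.a * self.state + self.c) % self.m
            if self.state < self.n:
                return self.state

Walking the generator for one full period yields a permutation of 0..n_docs-1, so checkpointing and resuming the shuffle only requires saving the single integer state.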


        Numerical Parity of torch.compile and torch.no-compile


        We had previously observed parity issues when training with compile and no-compile options, with one of these being related to the use of SDPA. After a few days of intense debugging sessions between the PyTorch teams at Meta and IBM, we were able to achieve parity between PyTorch compile and no-compile modes. To document and verify this parity, we take a mini-Llama model architecture of 1.4B size and train it to 100B tokens in four variations – no-compile, compile with no activation checkpointing, compile with selective activation checkpointing, and compile with full activation checkpointing.


        We plot the loss curves and gradient norm for these options below:


Figure 1: Loss curve and gradient norm for various compile options

        Further, we run the lm-evaluation-harness and compare the various model scores on different benchmarks and observe no major differences between compile and no-compile, which is shown below.


Figure 2: lm-evaluation-harness comparison of various benchmarks between compile and no-compile

We observe from all these results that compile, in all its variants, is equal to the no-compile option, thus demonstrating parity between compile and no-compile.


        MFU report


        Finally, like our previous blog, we compute the MFU for four different model sizes on two clusters. One cluster is 128 A100 GPUs with 400 Gbps inter-node connectivity, and the other is 464 H100 GPUs with 3.2 Tbps inter-node connectivity. We use the selective activation checkpointing that we covered in the prior blog in addition to compile. We capture the results in the table below.

| Model size | Batch size | MFU no-compile | MFU compile | Percentage gain (%) |
|---|---|---|---|---|
| 7B | 2 | 0.57 | 0.68 | 20 |
| 13B | 2 | 0.51 | 0.60 | 17 |
| 34B | 2 | 0.47 | 0.54 | 15 |
| 70B | 2 | 0.50 | 0.55 | 10 |

        Table 1: MFU results with compile and no compile for Llama2 model architectures on 128 A100 80GB GPUs with 400Gbps internode interconnect

| Model size | Batch size | MFU no-compile | MFU compile | Percentage gain (%) |
|---|---|---|---|---|
| 7B | 2 | 0.37 | 0.45 | 21 |
| 13B | 2 | 0.35 | 0.43 | 23 |
| 34B | 2 | 0.32 | 0.38 | 19 |
| 70B | 2 | 0.32 | 0.38 | 19 |

        Table 2: MFU results with compile and no compile for Llama2 model architectures on 464 H100 80GB GPUs with 3.2Tbps internode interconnect


        We also had an internal production run on 448 GPUs using a Llama2 7B architecture. Using compile and selective activation checkpointing, with a global batch size of 3.7M, we trained for 4T tokens in 13 days 10 hours!


        During training, the data center cooling had to kick in with extra air conditioning and our training team was alerted to this, since we were using the GPUs quite effectively ☺


One key observation from Tables 1 and 2 is that the MFU numbers do not scale linearly with model size. There are two possible explanations that we are actively investigating: one is the scalability of FSDP as model size increases and whether tensor parallelism needs to be enabled to use the GPU more effectively, and the other is the batch size, which can be increased further to get better MFU. We plan to explore FSDP v2 and selective operator checkpointing along with the tensor parallel feature to study the scaling laws of FSDP with model size.


        Future Work


We plan to start testing FSDP v2, which will be released as part of PyTorch 2.4. FSDP2 provides per-parameter sharding and a selective operator checkpointing feature that can potentially provide even better memory-compute tradeoffs.


        We have also been engaged with the PyTorch team at Meta to evaluate the new asynchronous checkpointing feature that can further improve the GPU utilization by reducing the time to write checkpoints.


        We are exploring extending various Triton kernels currently used in inference to perform backward operations to gain speedups beyond inference only.


        Finally, as recent work on use of fp8 is emerging, we plan to explore how we can even further accelerate model training using the new data type that promises a 2x acceleration.


        Acknowledgements


        There are several teams that have been involved in reaching this proof point and we would like to thank the teams across Meta and IBM. Specifically, we extend our gratitude to the Meta PyTorch distributed and compiler teams and IBM Research.


        Multiple people were extensively involved in the effort of achieving torch.compile numerical parity with our models, and we wish to acknowledge the key folks involved in this effort; Animesh Jain and Less Wright at Meta, and Linsong Chu, Davis Wertheimer, Brian Vaughan, Antoni i Viros Martin, Mudhakar Srivatsa, and Raghu Ganti at IBM Research.


        Special thanks to Stas Bekman, who provided extensive feedback and helped improve this blog. Their insights have been invaluable in highlighting key aspects of optimizing the training and exploring further enhancements.

        + +
diff --git a/blog/maximizing-training/index.html b/blog/maximizing-training/index.html new file mode 100644 index 000000000000..852b54d575c1 --- /dev/null +++ b/blog/maximizing-training/index.html

Maximizing training throughput using PyTorch FSDP | PyTorch

by Team PyTorch at IBM and Team PyTorch at Meta


        In this blog, we demonstrate the scalability of FSDP with a pre-training exemplar, a 7B model trained for 2T tokens, and share various techniques we used to achieve a rapid training speed of 3,700 tokens/sec/GPU, or 40B tokens/day on 128 A100 GPUs. This translates to a model FLOPS utilization (MFU) and hardware FLOPS utilization (HFU) of 57%. Additionally, we have observed near linear scaling of FSDP to 512 GPUs, implying that training a 7B model on 512 GPUs to 2T tokens using this method would take just under two weeks.


IBM researchers trained a Meta Llama 2 7B architecture to 2T tokens, which we will refer to as LlamaT(est). This model demonstrates comparable model quality to Llama 2 on various academic benchmarks. All of the training code, along with our methodology for achieving this throughput, can be found in this blog. We also share the configuration knobs that work well for the Llama 2 models – 7B, 13B, 34B, and 70B – for A100s and H100s.


In this process, we also propose a new selective activation checkpointing mechanism that applies to FSDP and gives us a 10% boost beyond out-of-the-box FSDP. We have open sourced the training code base and an associated scalable data loader as the methodology to achieve this throughput.


        One key benefit of a PyTorch native pathway for training is the ability to seamlessly train on multiple hardware backends. For example, the recent end-to-end stack for training that was released by AllenAI through OLMo also leverages PyTorch FSDP for training on AMD and NVIDIA GPUs. There are three main components that we leverage from FSDP to achieve our throughput:

1. SDPA Flash attention, which enables fused attention kernels and efficient attention computation
2. Overlap in computation and communication, which allows for better utilization of the GPU
3. Selective activation checkpointing, which enables us to trade off between GPU memory and compute

        IBM has been working closely with Team PyTorch at Meta on PyTorch FSDP for nearly two years: introducing the rate limiter for achieving better throughput on Ethernet interconnects, distributed checkpointing to improve the checkpoint times by an order of magnitude, and implementing the early version of checkpointing for the hybrid sharding mode of FSDP. Late last year, we used FSDP to train a model end-to-end.


        Training Details


The 7B model is trained on 128 A100 GPUs with 400Gbps network connectivity and GPU direct RDMA. We use SDPA FlashAttention v2 for attention computation, and for this model we turned off activation checkpointing, which limits the batch size but provides the highest throughput – the batch size is 1 million tokens per batch for 128 GPUs, and throughput improves by about 10% when compared to activation checkpointing. With these parameters, we have an almost full overlap in computation and communication. We use the AdamW optimizer in 32-bit with beta1 of 0.9 and beta2 of 0.95, weight decay of 0.1, and a learning rate ending at 3e-5 with a warmup to a max learning rate of 3e-4 and a cosine schedule to reduce to 3e-5 over 2T tokens. The training was performed using mixed precision bf16 on an internal dataset. The training stack uses IBM's Foundation Model Stack for the model architecture and PyTorch nightlies post-2.2 release for FSDP and SDPA. We tried a few different nightlies during the time period of Nov 2023 through Feb 2024 and observed an improvement in the throughput.
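For readers who want to reproduce the optimizer settings described above, here is a minimal sketch matching those values; the warmup and total step counts are placeholders, the model is a stand-in, and the actual schedule lives in the open-sourced training code.

import math
import torch

model = torch.nn.Linear(1024, 1024)  # stand-in for the actual Llama architecture

optimizer = torch.optim.AdamW(
    model.parameters(), lr=3e-4, betas=(0.9, 0.95), weight_decay=0.1
)

def cosine_with_warmup(step, warmup_steps=1000, total_steps=500_000, min_ratio=0.1):
    # min_ratio = 3e-5 / 3e-4: decay from the max learning rate to one tenth of it
    if step < warmup_steps:
        return step / max(1, warmup_steps)
    progress = (step - warmup_steps) / max(1, total_steps - warmup_steps)
    return min_ratio + 0.5 * (1 - min_ratio) * (1 + math.cos(math.pi * min(1.0, progress)))

scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, cosine_with_warmup)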


        Selective activation checkpointing


We jointly implemented a simple and effective mechanism of selective activation checkpointing (AC). In FSDP, the common practice is to checkpoint each transformer block. A simple extension is to checkpoint every n blocks, reducing the amount of recomputation while increasing the memory needed. This is quite effective for the 13B model size, increasing the throughput by 10%. For the 7B model size, we did not need activation checkpointing at all. Future versions of FSDP will provide selective activation checkpointing at an operator level, enabling an optimal compute-memory tradeoff. The code for the above is implemented here.
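As a rough sketch of "checkpoint every n-th block" (the linked implementation is the source of truth; the block attribute name below is assumed, and the import path reflects recent PyTorch versions):

from torch.distributed.algorithms._checkpoint.checkpoint_wrapper import (
    apply_activation_checkpointing,
    checkpoint_wrapper,
)

def apply_selective_ac(model, every_n=2):
    # Wrap only every n-th transformer block with activation checkpointing,
    # trading extra memory for less recomputation.
    blocks = list(model.layers)  # assumed attribute holding the transformer blocks
    selected = {id(block) for i, block in enumerate(blocks) if i % every_n == 0}
    apply_activation_checkpointing(
        model,
        checkpoint_wrapper_fn=checkpoint_wrapper,
        check_fn=lambda submodule: id(submodule) in selected,
    )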


        Throughput and MFU, HFU computation


        While we only trained the 7B model to 2T tokens, we performed numerous experiments on the other model sizes to provide the best configuration options. This is summarized in the table below for two types of infrastructure — an A100 cluster with 128 GPUs and 400Gbps inter-node interconnect, and an H100 cluster with 96 GPUs and 800Gbps inter-node interconnect.

| Model size | Batch size | Activation checkpoint | Throughput tokens/sec/GPU (A100 80GB, 400Gbps interconnect) | MFU % (A100 80GB) | HFU % (A100 80GB) | Throughput tokens/sec/GPU (H100 80GB, 800Gbps interconnect) | MFU % (H100 80GB) | HFU % (H100 80GB) |
|---|---|---|---|---|---|---|---|---|
| 7B | 2 | No | 3700 | 0.57 | 0.57 | 7500 | 0.37 | 0.37 |
| 13B | 2 | Selective | 1800 | 0.51 | 0.59 | 3800 | 0.35 | 0.40 |
| 34B | 2 | Yes | 700 | 0.47 | 0.64 | 1550 | 0.32 | 0.44 |
| 70B | 2 | Yes | 370 | 0.50 | 0.67 | 800 | 0.34 | 0.45 |

        Table 1: Model and Hardware FLOPS utilization of various model sizes on A100 and H100 GPUs


        HFU numbers are computed using the PyTorch FLOP counter and the theoretical bf16 performance of A100 and H100 GPUs, whereas MFU numbers are computed using the methodology outlined in NanoGPT and the PaLM paper. We also note that the batch sizes we use for the larger models are intentionally kept at 2 per GPU to mimic choices made in training models of 4k sequence length and achieve this up to 512 GPUs without exceeding the 4M tokens popular batch size. Beyond that, we would need tensor parallelism or sequence parallelism.
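As a back-of-the-envelope illustration of the MFU estimate (not the exact formulas used for the table), the common approximation counts roughly 6 FLOPs per parameter per token for the forward and backward passes, while the PaLM-style methodology additionally accounts for attention FLOPs:

def approx_mfu(n_params, tokens_per_sec_per_gpu, peak_flops=312e12):
    # ~6 FLOPs per parameter per token (forward + backward), ignoring attention terms
    return 6 * n_params * tokens_per_sec_per_gpu / peak_flops

# Example: the 7B row above at 3,700 tokens/sec/GPU on A100 (312 TFLOPS bf16 peak)
print(round(approx_mfu(7e9, 3700), 2))  # ~0.5; adding attention FLOPs raises it toward 0.57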


We note in the table above that for A100s, activation recomputation causes the MFU to reduce, while HFU increases! With the introduction of better activation checkpointing schemes, we expect MFU to increase and catch up with HFU. However, we observe that for H100s, both MFU and HFU are relatively low. We analyzed the PyTorch profile traces on H100 and observed a 10% gap due to network “peeking” out. In addition, we hypothesize that the HBM bandwidth of H100s is the cause of the reduced HFU/MFU on H100s and of not being able to obtain the 3x improvement (H100s are theoretically 3x faster than A100s - 312 vs 989 TFLOPS - but have less than 2x the HBM bandwidth of A100s - 2.0 vs 3.35 TBps). We plan to try out other configuration options like tensor parallel to improve the knobs for the 70B model on H100s.


        Model details


        The loss curve for training is shown in the below figure.


Figure 1: LlamaT training loss curve

        The 2T checkpoint is converted to Hugging Face format by a script that is provided in the repository and we then use lm-evaluation-harness to compute key academic benchmarks and compare that by running it on Llama2-7B. These results are captured in the below table.

| Evaluation metric | Llama2-7B (baseline) | LlamaT-7B |
|---|---|---|
| MMLU (zero shot) | 0.41 | 0.43 |
| MMLU (5-shot weighted avg) | 0.47 | 0.50 |
| Arc challenge | 0.46 | 0.44 |
| Arc easy | 0.74 | 0.71 |
| Boolq | 0.78 | 0.76 |
| Copa | 0.87 | 0.83 |
| Hellaswag | 0.76 | 0.74 |
| Openbookqa | 0.44 | 0.42 |
| Piqa | 0.79 | 0.79 |
| Sciq | 0.91 | 0.91 |
| Winogrande | 0.69 | 0.67 |
| Truthfulqa | 0.39 | 0.39 |
| GSM8k (8-shot) | 0.13 | 0.11 |

Table 2: LM eval harness scores


We observe that the model performs competitively with Llama2-7B.


        Training chronicles


        Training was stable with no crashes, though we did observe a few hiccups:


        0-200B tokens: We observed a slowdown in the iteration time (time taken to execute one training step). We stopped the job to ensure that the data loader was not causing any slowdowns and the checkpointing was performant and accurate. We did not find any issues. By this time, HSDP checkpointing code was available in PyTorch, and we took this opportunity to make the switch to PyTorch checkpointing code.


200B tokens-1.9T: We did not do any manual intervention in the job in late December. When we came back in early January, disk space had been exceeded and checkpoints were failing to be written, although the training job continued. The last known checkpoint was 1.5T.


1.5T-1.7T: We evaluated the 1.5T checkpoint with lm-evaluation-harness and discovered that the model had been trained with an extra special token between two documents, due to the Hugging Face tokenizer introducing a separator token and our dataloader also appending its own document separator. We modified the dataloader to eliminate the extra special token, and continued training with the modified dataloader from the 1.7T token mark onwards.


1.7T-2T: The loss initially spiked due to the change in the special tokens, but recovered within a few billion tokens. The training finished without any other manual intervention!


        Key takeaways and even more speed


        We demonstrated how one can use FSDP to train a model to 2T tokens with an excellent performance of 3700 tokens/sec/GPU and that generates a good quality model. As part of this exercise, we open sourced all our code for training and the knobs to achieve this throughput. These knobs can be leveraged by not only large-scale runs, but also smaller scale tuning runs. You can find the code here.


        FSDP APIs implement the ZeRO algorithms in a PyTorch native manner and allow for tuning and training of large models. In the past, we have seen FSDP proof points (Stanford Alpaca, Hugging Face, Llama 2 recipes) on tuning a variety of LLMs (such as Meta Llama 2 7B to 70B Llama) using simple training loops and achieving good throughputs and training times.


        Finally, we note that there are several levers for speeding up training:

1. Node optimizations that can speed up specific operations (e.g., attention computation using Flash Attention V2)
2. Graph optimizations (e.g., fusing kernels, torch.compile)
3. Overlap in compute and communications
4. Activation recomputation

        We have leveraged 1, 3, and a variation of 4 in this blog and are working closely with Team PyTorch at Meta to get torch.compile (2) as well as a more advanced version of 4 with per-operator selective activation recomputation. We plan to share a simple formatting code and example data to ingest into our data loader to enable others to use the code base for training of models.


        Acknowledgements


        There are several teams that have been involved in reaching this proof point and we would like to thank the teams across Meta and IBM. Specifically, we extend our gratitude to the PyTorch distributed team, Facebook Research and Applied AI teams that built the FSDP APIs and made enhancements based on our feedback. We also wish to thank the data team at IBM Research that curated the data corpus used in this exercise and the infrastructure team at IBM Research (especially, Claudia Misale, Shweta Salaria, and Seetharami Seelam) that optimized NCCL and network configurations. By building and leveraging all of these components, we have successfully demonstrated the LlamaT proof point.


        The selective activation checkpointing was conceptualized at IBM by Linsong Chu, Davis Wertheimer, Mudhakar Srivatsa, and Raghu Ganti and implemented by Less Wright at Meta.


        Special thanks to Stas Bekman and Minjia Zhang, who provided extensive feedback and helped improve the blog. Their insights have been invaluable in highlighting key aspects of optimizing the training and exploring further enhancements.


        Appendix


        Communication computation overlap


Another key aspect of training in a multi-node setting is the ability to overlap communication and computation. In FSDP, there are multiple opportunities for overlapping – during the FSDP unit gathering phase of the forward pass as well as during the backward pass computation. Overlapping the gather of one unit with the computation of the previous unit during the forward pass, and overlapping the backward computation of one unit with the gathering of the next unit and the gradient scattering, help improve GPU utilization by nearly 2x. We illustrate this on the 400Gbps network interconnect with A100 80GB GPUs. In the case of HSDP, there is no inter-node traffic during the pre-fetch stage of the forward pass, and the overlap applies only to the backward gradient computation phase. Of course, HSDP is feasible only when the model can be sharded within a single node, limiting the size of models to around 30B parameters.


        The below figure shows three steps in FSDP with the communication between nodes at the bottom and the compute stream at the top of the second half of the image. For the 7B model with no activation recomputation, we observe the overlap to be complete. In practice, the overlap percentage possible is 90% since the first block during forward pass and the last block during backward pass are not able to overlap.


        three steps in FSDP with the communication between nodes at the bottom and the compute stream at the top of the second half


        A zoomed in view of the above three-step process is shown below for a single step. We can clearly see the granularity of the computation and communication and how they overlap in an interleaved manner.


        zoomed in view of the above three-step process

diff --git a/blog/microsoft-becomes-maintainer-of-the-windows-version-of-pytorch/index.html b/blog/microsoft-becomes-maintainer-of-the-windows-version-of-pytorch/index.html new file mode 100644 index 000000000000..c114b93457db --- /dev/null +++ b/blog/microsoft-becomes-maintainer-of-the-windows-version-of-pytorch/index.html

Microsoft becomes maintainer of the Windows version of PyTorch | PyTorch

by Maxim Lukiyanov - Principal PM at Microsoft, Emad Barsoum - Group EM at Microsoft, Guoliang Hua - Principal EM at Microsoft, Nikita Shulga - Tech Lead at Facebook, Geeta Chauhan - PE Lead at Facebook, Chris Gottbrath - Technical PM at Facebook, Jiachen Pu - Engineer at Facebook


        Along with the PyTorch 1.6 release, we are excited to announce that Microsoft has expanded its participation in the PyTorch community and will be responsible for the development and maintenance of the PyTorch build for Windows.


        According to the latest Stack Overflow developer survey, Windows remains the primary operating system for the developer community (46% Windows vs 28% MacOS). Jiachen Pu initially made a heroic effort to add support for PyTorch on Windows, but due to limited resources, Windows support for PyTorch has lagged behind other platforms. Lack of test coverage resulted in unexpected issues popping up every now and then. Some of the core tutorials, meant for new users to learn and adopt PyTorch, would fail to run. The installation experience was also not as smooth, with the lack of official PyPI support for PyTorch on Windows. Lastly, some of the PyTorch functionality was simply not available on the Windows platform, such as the TorchAudio domain library and distributed training support. To help alleviate this pain, Microsoft is happy to bring its Windows expertise to the table and bring PyTorch on Windows to its best possible self.


        In the PyTorch 1.6 release, we have improved the core quality of the Windows build by bringing test coverage up to par with Linux for core PyTorch and its domain libraries and by automating tutorial testing. Thanks to the broader PyTorch community, which contributed TorchAudio support to Windows, we were able to add test coverage to all three domain libraries: TorchVision, TorchText and TorchAudio. In subsequent releases of PyTorch, we will continue improving the Windows experience based on community feedback and requests. So far, the feedback we received from the community points to distributed training support and a better installation experience using pip as the next areas of improvement.


        In addition to the native Windows experience, Microsoft released a preview adding GPU compute support to Windows Subsystem for Linux (WSL) 2 distros, with a focus on enabling AI and ML developer workflows. WSL is designed for developers that want to run any Linux based tools directly on Windows. This preview enables valuable scenarios for a variety of frameworks and Python packages that utilize NVIDIA CUDA for acceleration and only support Linux. This means WSL customers using the preview can run native Linux based PyTorch applications on Windows unmodified without the need for a traditional virtual machine or a dual boot setup.


        Getting started with PyTorch on Windows


        It’s easy to get started with PyTorch on Windows. To install PyTorch using Anaconda with the latest GPU support, run the command below. To install different supported configurations of PyTorch, refer to the installation instructions on pytorch.org.


        conda install pytorch torchvision cudatoolkit=10.2 -c pytorch


        Once you install PyTorch, learn more by visiting the PyTorch Tutorials and documentation.


        Getting started with PyTorch on Windows Subsystem for Linux


        The preview of NVIDIA CUDA support in WSL is now available to Windows Insiders running Build 20150 or higher. In WSL, the command to install PyTorch using Anaconda is the same as the above command for native Windows. If you prefer pip, use the command below.


        pip install torch torchvision


        You can use the same tutorials and documentation inside your WSL environment as on native Windows. This functionality is still in preview so if you run into issues with WSL please share feedback via the WSL GitHub repo or with NVIDIA CUDA support share via NVIDIA’s Community Forum for CUDA on WSL.


        Feedback


        If you find gaps in the PyTorch experience on Windows, please let us know on the PyTorch discussion forum or file an issue on GitHub using the #module: windows label.

diff --git a/blog/ml-model-server-resource-saving/index.html b/blog/ml-model-server-resource-saving/index.html new file mode 100644 index 000000000000..33dd72866b00 --- /dev/null +++ b/blog/ml-model-server-resource-saving/index.html

ML Model Server Resource Saving - Transition From High-Cost GPUs to Intel CPUs and oneAPI powered Software with performance | PyTorch

by Sangjune Park (Naver GplaceAI MLOps), Jooyoung Lee (Naver GplaceAI MLE), Junho Min (Naver GplaceAI MLE)


        Reviewers: Yunsang Ju(Naver GplaceAI Leader), Min Jean Cho(Intel), Jing Xu(Intel), Mark Saroufim(Meta)


        Intro


Here, we will be sharing our experience in moving AI workloads from our GPU servers to our Intel CPU servers without any performance or quality degradation, saving annual costs of approximately 340 thousand U.S. dollars (refer to the Conclusion) in the process.


We aim to provide value to our consumers by serving various AI models that enhance the Online to Offline (O2O) experience. With the ongoing growth in demand for new models and the limited nature of high-cost GPU resources, we needed to transition relatively lightweight AI models from GPU servers to Intel CPU servers to reduce resource consumption. In the same setting, however, the CPU servers initially had issues where performance (RPS, inference time, etc.) was reduced by tens of times. We applied various engineering techniques and lightweighted the model to solve this problem, and we were able to successfully transition to the Intel CPU servers with the same or better performance than the GPU servers with just a three-fold scale out.


        For a more detailed introduction about our team, please refer to the Introduction to NAVER Place AI Development Team.


I'll mention it again later, but throughout this work we received a lot of help from Grokking PyTorch Intel CPU Performance From First Principles, written by Intel and PyTorch.


        Problem Definition


        1: Service Architecture


Simplified service architecture (Image Source: NAVER GplaceAI)

To facilitate understanding, a brief introduction to our service architecture will be provided. CPU-intensive tasks, such as preprocessing input into tensor format (then forwarded to the model) and post-processing inference results into human-readable output (e.g. natural language and image formats), are performed on the App Server (FastAPI). The Model Server (TorchServe) exclusively handles inference operations. For stable operation of the service, the following actions need to be performed with sufficient throughput and low latency.


        The specific processing sequence is as follows:

• The client submits a request to the app server via the Traefik gateway.
• The app server pre-processes the input (e.g., resizing and transforming it into a Torch tensor) and then calls the model server.
• The model server performs inference and returns the features to the app server.
• The app server converts the features into a human-readable format through post-processing and returns it to the client.

2: Throughput and Latency Measurement


Comparison of Image Scoring Models

With all other conditions remaining the same, we deployed on three times as many CPU server pods, yet, notably, the RPS (requests per second) and response time still deteriorated by more than tenfold. While it was not surprising that CPU inference performance is inferior to GPUs, the situation was clearly challenging. Given the goal of maintaining performance within limited resources, an approximate 10 to 20 times performance improvement was necessary, barring any additional scaling.


        3: Challenges From a Throughput Perspective

Type     Name                                                                          # reqs      # fails |    Avg     Min     Max    Med |   req/s  failures/s
--------|----------------------------------------------------------------------------|-------|-------------|-------|-------|-------|-------|--------|-----------
POST     /predictions/image-scoring                                                        37     0(0.00%) |   9031    4043   28985   8200 |    1.00        0.00
--------|----------------------------------------------------------------------------|-------|-------------|-------|-------|-------|-------|--------|-----------
         Aggregated                                                                        37     0(0.00%) |   9031    4043   28985   8200 |    1.00        0.00

One of the first steps TorchServe users might take to improve throughput is to increase the number of workers in TorchServe. This approach is effective on GPU servers because workloads are processed in parallel, apart from the linear increase in memory usage as workers scale. However, we were experiencing worse performance when increasing the number of workers, and identifying the cause of this performance degradation on CPU servers required further investigation.

        + +

        4: Challenges From a Latency Perspective

        + +

Our primary concern was latency. Throughput improvement is normally achievable when a system scales out faithfully, except perhaps in very rare worst-case scenarios. In the case of the Image Scoring model, however, even a single inference took more than 1 second, and as the request volume increased, latency rose to as much as 4 seconds. The timeout criteria required by the client could not be met even with a single inference.

        + +

        Proposed Solutions

        + +

Improvements were needed from both an ML and an engineering perspective. It was essential to fundamentally reduce inference time on the CPU and to identify the causes of performance degradation when applying configurations that generally enhance performance, in order to find the optimal configuration values. To accomplish this, we collaborated with MLE professionals to concurrently pursue two tasks: model lightweighting without compromising performance, and identifying the optimal configuration for peak performance. Using these approaches, we were able to effectively transition workload handling to our CPU servers.

        + +

        1: Resolving Low RPS from an Engineering Perspective

        + +

First, the reason for the performance degradation even after increasing the number of workers was a front-end bound caused by logical threads during GEMM operations. Generally, the expected benefit of increasing the number of workers is greater parallelism; if performance decreases instead, one can infer that a corresponding trade-off is at play.

        + +

        CPU + GPU

        + +

        Image Source: Nvidia

        + +

        As many are aware, the reason model inference performance on CPUs is inferior to GPUs lies in the difference in hardware design, particularly in terms of multi-threading capabilities. Diving deeper, model inference is fundamentally a repetition of GEMM (General Matrix Multiply) operations, and these GEMM operations are executed independently in “fused-multiply-add” (FMA) or “dot-product” (DP) execution units. If the GEMM operation becomes a bottleneck on the CPU, increasing parallelism might actually result in decreased performance. While researching the problem we found relevant information within the PyTorch documentation.

        + +

        While two logical threads run GEMM at the same time, they will be sharing the same core resources causing front-end bound

        + +

This information highlighted that logical threads could cause a bottleneck in CPU GEMM operations, which helped us intuitively understand why performance decreased as the number of workers increased: the default number of torch threads corresponds to the number of physical cores of the CPU.

        + +
        root@test-pod:/# lscpu
        +  …
        +Thread(s) per core: 2
        +Core(s) per socket: 12
        +  …
        +root@test-pod:/# python
        +>>> import torch
        +>>> print(torch.get_num_threads())
        +24
        +
        + +

When worker_num increases, the total thread count grows to the number of physical cores multiplied by the worker number, and consequently logical threads are utilized. To improve performance, the total number of threads across workers was adjusted to align with the physical core count. Below, it can be observed that RPS increased approximately threefold, from 2.1 to 6.3, when worker_num was set to 4 and the total thread count was aligned with the number of physical cores.

        + +
        Type     Name                                                                          # reqs      # fails |    Avg     Min     Max    Med |   req/s  failures/s
        +--------|----------------------------------------------------------------------------|-------|-------------|-------|-------|-------|-------|--------|-----------
        +POST     /predictions/image-scoring                                                       265     0(0.00%) |   3154    1885    4008   3200 |    6.30        0.00
        +--------|----------------------------------------------------------------------------|-------|-------------|-------|-------|-------|-------|--------|-----------
        +         Aggregated                                                                       265     0(0.00%) |   3154    1885    4008   3200 |    6.30        0.00
        +
        + +

Cautionary Note 1: Our team uses Kubernetes to maintain our deployments, so we had to adjust thread counts according to the CPU resource limit of the pod rather than the physical core count of the node that can be checked with the lscpu command. (Setting the torch threads of each worker based on the node’s cores, i.e., 24/4 = 6 instead of 8/4 = 2, resulted in performance degradation.)

        + +

Cautionary Note 2: Since the torch thread setting for each worker can only be configured as an integer, it’s advisable to set the CPU limit to a value divisible by worker_num in order to fully utilize the allocated CPU.

        + +

        example

        + +

ex) core=8, worker_num=3: int(8/3) = 2 threads per worker, so 2*3/8 = 75% of the CPU limit is utilized

        + +

        example

        + +

ex) core=8, worker_num=4: int(8/4) = 2 threads per worker, so 2*4/8 = 100% of the CPU limit is utilized
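To make this arithmetic concrete, here is a minimal sketch of how the per-worker thread count can be derived from the pod CPU limit and applied inside a worker. The constant names and the idea of calling it from a custom handler are illustrative assumptions, not our production code; torch.set_num_threads is the standard PyTorch call for capping the intra-op thread pool.

import torch

# Illustrative values: CPU limit of the pod (not the node's lscpu core count)
# and the number of TorchServe workers serving the model.
POD_CPU_LIMIT = 8
WORKER_NUM = 4

# Threads per worker must be an integer, so pick a CPU limit divisible by worker_num.
threads_per_worker = POD_CPU_LIMIT // WORKER_NUM              # 8 // 4 = 2
utilization = threads_per_worker * WORKER_NUM / POD_CPU_LIMIT
print(f"{threads_per_worker} threads/worker, {utilization:.0%} of the CPU limit used")

# Inside each worker (e.g. in a custom handler's initialize()), cap the intra-op
# thread pool so the total across workers matches the allocated cores.
torch.set_num_threads(threads_per_worker)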

        + +

        We also analyzed the model containers to see why we got a mere threefold improvement in performance despite a four times increase in the number of workers. Various resources were monitored, and among them, the core utilization rate was identified as the underlying cause.

        + +

        threads

        + +

Even when the total thread count was adjusted to match the CPU limit (8 cores on a 2nd Generation Intel(R) Xeon(R) Silver 4214), there were instances where computations were still scheduled onto logical cores. Since the node has 24 physical cores, the cores numbered 25 to 48 are classified as logical cores. Confining thread execution solely to physical cores seemed to offer the potential for further performance gains. A reference to this solution could be found in the source document mentioned in the PyTorch-geometric article that warned about CPU GEMM bottlenecks.

        + + + +

As per the instructions in the document, Intel provides Intel® Extension for PyTorch, with which we can simply pin cores to specific sockets. Applying it is also very simple: add the following settings to the TorchServe config.properties file (we used intel_extension_for_pytorch==1.13.0).

        + +
ipex_enable=true
+cpu_launcher_enable=true
        +
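As a quick sanity check that core pinning is actually in effect, the CPU affinity of a worker process can be inspected from Python. This is a minimal illustration, not part of the TorchServe configuration itself, and os.sched_getaffinity is Linux-only.

import os
import torch

# Cores this process is allowed to run on; with socket pinning enabled this set
# should contain only core IDs belonging to a single socket.
print(sorted(os.sched_getaffinity(0)))

# Intra-op thread pool size seen by PyTorch inside the worker.
print(torch.get_num_threads())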
        + +

        two-socket configuration

        + +

        Image Source: PyTorch

        + +

Beyond the removal of logical threads through socket pinning, there is the additional effect of eliminating cross-socket cache access overhead. Since the CPU comprises more than one socket, when threads scheduled on socket 1 are rescheduled onto socket 2, they end up accessing socket 1’s cache via the Intel Ultra Path Interconnect (UPI). Such UPI access to a remote cache is more than twice as slow as local cache access, creating additional bottlenecks. With threads pinned at the socket level by the oneAPI-powered Intel® Extension for PyTorch, we observed RPS handling increase by up to four times compared to when the bottleneck existed.

        + +
        Type     Name                                                                          # reqs      # fails |    Avg     Min     Max    Med |   req/s  failures/s
        +--------|----------------------------------------------------------------------------|-------|-------------|-------|-------|-------|-------|--------|-----------
        +POST     /predictions/image-scoring                                                       131     0(0.00%) |   3456    1412    6813   3100 |    7.90        0.00
        +--------|----------------------------------------------------------------------------|-------|-------------|-------|-------|-------|-------|--------|-----------
        +         Aggregated                                                                       131     0(0.00%) |   3456    1412    6813   3100 |    7.90        0.00
        +
        + +

Cautionary Note 1: Intel® Extension for PyTorch is specialized in neural network (referred to as “nn” hereafter) inference optimization, so the performance improvement from additional techniques outside nn might be minimal. Indeed, for the image scoring system highlighted as an example, where SVR (support vector regression) is applied after inference, the performance enhancement was confined to a 4-fold increase. However, for a purely nn inference model such as the food recognition model, a performance boost of 7-fold (2.5rps -> 17.5rps) was observed.

        + +
        Type     Name                                                                          # reqs      # fails |    Avg     Min     Max    Med |   req/s  failures/s
        +--------|----------------------------------------------------------------------------|-------|-------------|-------|-------|-------|-------|--------|-----------
        +POST     /predictions/food-classification                                                 446     0(0.00%) |   1113     249    1804   1200 |   17.50        0.00
        +--------|----------------------------------------------------------------------------|-------|-------------|-------|-------|-------|-------|--------|-----------
        +         Aggregated                                                                       446     0(0.00%) |   1113     249    1804   1200 |   17.50        0.00
        +
        + +

Cautionary Note 2: Applying Intel® Extension for PyTorch requires TorchServe version 0.6.1 or higher. Since our team was using version 0.6.0, there was an issue where socket pinning was not functioning correctly. We have since updated the guide document to specify the required version.

        + +

        Within WorkerLifeCycle.java, multi-worker pinning is not supported in 0.6.0 and below (ninstance is hardcoded to 1)

        + +
        // 0.6.0 version
        +
        +public ArrayList<String> launcherArgsToList() {
        +   ArrayList<String> arrlist = new ArrayList<String>();
        +   arrlist.add("-m");
        +   arrlist.add("intel_extension_for_pytorch.cpu.launch");
+   arrlist.add("--ninstance");
        +   arrlist.add("1");
        +   if (launcherArgs != null && launcherArgs.length() > 1) {
        +     String[] argarray = launcherArgs.split(" ");
        +     for (int i = 0; i < argarray.length; i++) {
        +       arrlist.add(argarray[i]);
        +     }
        +   }
        +   return arrlist;
        + }
        +// master version
        +
        +if (this.numWorker > 1) {
+   argl.add("--ninstances");
+   argl.add(String.valueOf(this.numWorker));
+   argl.add("--instance_idx");
        +   argl.add(String.valueOf(this.currNumRunningWorkers));
        + }
        +
        + +

        2: Addressing Slow Latency Through Model Lightweighting

        + +

We also streamlined our model using Knowledge Distillation (commonly abbreviated as KD) to further reduce latency. As is widely known, KD is a technique where knowledge from a larger network (the teacher network) is transferred to a smaller, lightweight network (the student network), which is less resource intensive and can be more readily deployed. For more detailed information, please refer to the paper where this concept was initially introduced, Distilling the Knowledge in a Neural Network.

        + +

        neural networks

        + +

There is a variety of KD techniques available, and because we were primarily focused on minimizing accuracy loss, we adopted the approach from the paper Knowledge Distillation from A Stronger Teacher, published in 2022. The concept is straightforward: unlike conventional distillation, which uses only the model’s output probability values, the chosen approach has the student network learn the correlations between classes in the teacher network. In actual application, we observed an effective reduction in the model’s weight while maintaining high accuracy. The following are the outcomes of our experiments with this knowledge distillation technique on several candidate student models, where the selection was made based on the level of accuracy maintained.

        + +

        table of services

        + +
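For illustration, here is a minimal sketch of a knowledge-distillation training step in the classic soft-target style. The teacher/student arguments, temperature, and loss weighting are assumptions for the example; the correlation-based loss from the DIST paper we actually adopted would replace the KL term below.

import torch
import torch.nn.functional as F

def distillation_step(student, teacher, images, labels, optimizer, T=4.0, alpha=0.5):
    """One KD training step: hard-label loss plus softened teacher/student KL."""
    teacher.eval()
    with torch.no_grad():
        teacher_logits = teacher(images)

    student_logits = student(images)

    # Standard cross-entropy against the ground-truth labels.
    ce_loss = F.cross_entropy(student_logits, labels)

    # Soft-target loss: the student matches the teacher's softened class distribution.
    kd_loss = F.kl_div(
        F.log_softmax(student_logits / T, dim=1),
        F.softmax(teacher_logits / T, dim=1),
        reduction="batchmean",
    ) * (T * T)

    loss = alpha * ce_loss + (1 - alpha) * kd_loss
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    return loss.item()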

For the image scoring model, additional measures were taken to reduce the input size. Because the prior pipeline already relied on the CPU-based ML technique SVR (Support Vector Regression) in a 2-stage setup (CNN + SVR), simply streamlining it into a 1-stage model did not yield significant speed advantages in CPU inference. For the streamlining to pay off, the input size of the student model during inference needed to be reduced further. Consequently, experiments were conducted with the input size reduced from 384x384 to 224x224.

        + +

To further simplify the pipeline, the 2-stage (CNN + SVR) approach was first unified into a 1-stage model based on a larger ConvNeXt, and KD with a lightweight EfficientNet student was then applied to resolve the resulting accuracy trade-off. During the experiments, we encountered a problem where changing Img_resize to 224 led to a performance drop from 0.4007 to 0.4296 in terms of MAE. With the reduced input size, several of the augmentations applied to the original training images (such as Affine, RandomRotate90, Blur, OneOf [GridDistortion, OpticalDistortion, ElasticTransform], and VerticalFlip) turned out to be counterproductive, so the transformations were simplified accordingly. By adopting these measures, the student was trained effectively, and the MAE improved by 25% compared to the previous model (0.518 to 0.3876).
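As an illustration of what simplifying the transformations means in practice, the sketch below contrasts a heavier albumentations pipeline of the kind listed above with a lighter one at the reduced 224x224 input. The probabilities and the exact final set of transforms are placeholder assumptions, not the recipe we shipped.

import albumentations as A

# Heavier pipeline in the spirit of the original 384x384 training recipe (illustrative values).
heavy_transform = A.Compose([
    A.Resize(384, 384),
    A.Affine(rotate=(-15, 15), p=0.5),
    A.RandomRotate90(p=0.5),
    A.Blur(p=0.3),
    A.OneOf([A.GridDistortion(), A.OpticalDistortion(), A.ElasticTransform()], p=0.3),
    A.VerticalFlip(p=0.5),
])

# Lighter pipeline for the 224x224 student, with the counterproductive distortions removed.
light_transform = A.Compose([
    A.Resize(224, 224),
    A.HorizontalFlip(p=0.5),
])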

        + +

        Validation

        + +

        1: Final Performance Measurement

        + +

        The following shows the final performance improvements using CPU servers, on the three models mentioned throughout this article.

        + +
# Food photo classifier (pod 3): 2.5rps -> 84 rps
+
+Type     Name                                                                          # reqs      # fails |    Avg     Min     Max    Med |   req/s  failures/s
+--------|----------------------------------------------------------------------------|-------|-------------|-------|-------|-------|-------|--------|-----------
+POST     /predictions/food-classification                                                2341     0(0.00%) |    208     130     508    200 |   84.50        0.00
+--------|----------------------------------------------------------------------------|-------|-------------|-------|-------|-------|-------|--------|-----------
+         Aggregated                                                                      2341     0(0.00%) |    208     130     508    200 |   84.50        0.00
+
+# Image scoring (pod 3): 2.1rps -> 62rps
+Type     Name                                                                          # reqs      # fails |    Avg     Min     Max    Med |   req/s  failures/s
+--------|----------------------------------------------------------------------------|-------|-------------|-------|-------|-------|-------|--------|-----------
+POST     /predictions/image-scoring                                                      1298     0(0.00%) |    323      99     607    370 |   61.90        0.00
+--------|----------------------------------------------------------------------------|-------|-------------|-------|-------|-------|-------|--------|-----------
+         Aggregated                                                                      1298     0(0.00%) |    323      99     607    370 |   61.90        0.00
+
        +# receipt classifier(pod 3) : 20rps -> 111.8rps
        +Type     Name                                                                          # reqs      # fails |    Avg     Min     Max    Med |   req/s  failures/s
        +--------|----------------------------------------------------------------------------|-------|-------------|-------|-------|-------|-------|--------|-----------
        +POST     /predictions/receipt-classification                                             4024     0(0.00%) |    266     133    2211    200 |   111.8        0.00
        +--------|----------------------------------------------------------------------------|-------|-------------|-------|-------|-------|-------|--------|-----------
        +         Aggregated                                                                      4020     0(0.00%) |    266     133    2211    200 |   111.8        0.00
        +
        + +

        2:  Traffic Mirroring

        + +

As briefly introduced at the beginning of the article, our team’s service architecture employs the tool Traefik as a gateway in front of the app server. For final validation, the mirroring feature of this Traefik gateway was used to mirror production traffic to staging for a month of validation before switching production over, and the setup is now operational.

        + +

        Details regarding mirroring are beyond the scope of this topic and hence omitted. For those interested, kindly refer to the document at https://doc.traefik.io/traefik/routing/services/#mirroring-service.

        + +

        In Conclusion

        + +

This concludes the discussion about transitioning from GPU model servers to CPU servers while maintaining service quality. Through this effort, our team was able to free up 15 GPUs each in South Korea and Japan, resulting in annual cost savings of approximately 340 thousand U.S. dollars. Although we purchase and operate GPUs directly within NAVER, we calculated a rough cost reduction based on AWS EC2 instances that stably support T4 GPUs.

        + +

        instance sizes

        + +

Calculation: $1.306 (1-year reserved instance effective hourly cost) * 24 (hours) * 365 (days) * 15 (number of GPUs) * 2 (KR + JP) ≈ $343,000 per year

        + +

        These secured GPUs will be harnessed to further advance and enhance our team’s AI services, delivering exceptional service experiences. We sincerely appreciate your encouragement and anticipation.:)

        + +

diff --git a/blog/ml-models-torchvision-v0.9/index.html b/blog/ml-models-torchvision-v0.9/index.html
new file mode 100644
index 000000000000..a1e9de343f93
--- /dev/null
+++ b/blog/ml-models-torchvision-v0.9/index.html
@@ -0,0 +1,688 @@

An overview of the ML models introduced in TorchVision v0.9 | PyTorch

        + by + + Team PyTorch + +

        +

        TorchVision v0.9 has been released and it is packed with numerous new Machine Learning models and features, speed improvements and bug fixes. In this blog post, we provide a quick overview of the newly introduced ML models and discuss their key features and characteristics.

        + +

        Classification

        +
          +
        • +

MobileNetV3 Large & Small: These two classification models are optimized for Mobile use-cases and are used as backbones on other Computer Vision tasks. The implementation of the new MobileNetV3 architecture supports the Large & Small variants and the depth multiplier parameter as described in the original paper. We offer pre-trained weights on ImageNet for both Large and Small networks with depth multiplier 1.0 and resolution 224x224. Our previous training recipes have been updated and can be used to easily train the models from scratch (shoutout to Ross Wightman for inspiring some of our training configuration). The Large variant offers a competitive accuracy compared to ResNet50 while being over 6x faster on CPU, meaning that it is a good candidate for applications where speed is important. For applications where speed is critical, one can sacrifice further accuracy for speed and use the Small variant which is 15x faster than ResNet50.

          +
        • +
        • +

Quantized MobileNetV3 Large: The quantized version of MobileNetV3 Large reduces the number of parameters by 45% and it is roughly 2.5x faster than the non-quantized version while remaining competitive in terms of accuracy. It was fitted on ImageNet using Quantization Aware Training by iterating on the non-quantized version and it can be trained from scratch using the existing reference scripts.

          +
        • +
        + +

        Usage:

        +
        model = torchvision.models.mobilenet_v3_large(pretrained=True)
        +# model = torchvision.models.mobilenet_v3_small(pretrained=True)
        +# model = torchvision.models.quantization.mobilenet_v3_large(pretrained=True)
        +model.eval()
        +predictions = model(img)
        +
        +

        Object Detection

        +
          +
        • Faster R-CNN MobileNetV3-Large FPN: Combining the MobileNetV3 Large backbone with a Faster R-CNN detector and a Feature Pyramid Network leads to a highly accurate and fast object detector. The pre-trained weights are fitted on COCO 2017 using the provided reference scripts and the model is 5x faster on CPU than the equivalent ResNet50 detector while remaining competitive in terms of accuracy.
        • +
        • Faster R-CNN MobileNetV3-Large 320 FPN: This is an iteration of the previous model that uses reduced resolution (min_size=320 pixel) and sacrifices accuracy for speed. It is 25x faster on CPU than the equivalent ResNet50 detector and thus it is good for real mobile use-cases.
        • +
        + +

        Usage:

        +
        model = torchvision.models.detection.fasterrcnn_mobilenet_v3_large_fpn(pretrained=True)
        +# model = torchvision.models.detection.fasterrcnn_mobilenet_v3_large_320_fpn(pretrained=True)
        +model.eval()
        +predictions = model(img)
        +
        +

        Semantic Segmentation

        +
          +
        • DeepLabV3 with Dilated MobileNetV3 Large Backbone: A dilated version of the MobileNetV3 Large backbone combined with DeepLabV3 helps us build a highly accurate and fast semantic segmentation model. The pre-trained weights are fitted on COCO 2017 using our standard training recipes. The final model has the same accuracy as the FCN ResNet50 but it is 8.5x faster on CPU and thus making it an excellent replacement for the majority of applications.
        • +
• Lite R-ASPP with Dilated MobileNetV3 Large Backbone: We introduce the implementation of a new segmentation head called Lite R-ASPP and combine it with the dilated MobileNetV3 Large backbone to build a very fast segmentation model. The new model sacrifices some accuracy to achieve a 15x speed improvement compared to the previously most lightweight segmentation model, which was the FCN ResNet50.
        • +
        + +

        Usage:

        +
        model = torchvision.models.segmentation.deeplabv3_mobilenet_v3_large(pretrained=True)
        +# model = torchvision.models.segmentation.lraspp_mobilenet_v3_large(pretrained=True)
        +model.eval()
        +predictions = model(img)
        +
        +

        In the near future we plan to publish an article that covers the details of how the above models were trained and discuss their tradeoffs and design choices. Until then we encourage you to try out the new models and provide your feedback.

        + +
diff --git a/blog/mlops-workflow/index.html b/blog/mlops-workflow/index.html
new file mode 100644
index 000000000000..a26a1c581ec8
--- /dev/null
+++ b/blog/mlops-workflow/index.html
@@ -0,0 +1,718 @@

MLOps Workflow Simplified for PyTorch with Arm and GitHub Collaboration | PyTorch

        + by + + Eric Sondhi, Arm + +

        +

        PyTorch is one of the most widely used and most powerful deep learning frameworks for training and deploying complex neural networks. It has never been easier to train and deploy AI applications, and low-cost, high-performance, energy-efficient hardware, tools, and technology for creating optimized workflows are more accessible than ever. But data science, machine learning, and devops can be deep topics unto themselves, and it can be overwhelming for developers with one specialty to see how they all come together in the real world, or even to know where to get started.

        + +

        To that end, we at Arm have collaborated with our friends at GitHub to decompose the basic elements of real world MLOps pipelines that use PyTorch models and create a simplified workflow and MLOps tutorial that anyone with a GitHub and a Docker Hub account can leverage.

        + +

        MLOps Overview

        + +

        The software development lifecycle for machine learning applications typically starts from training data, which is used to train sophisticated neural networks (NNs) that are optimized, integrated into software images, and then deployed onto compute clusters and even fleets of devices in the field. These devices are typically continuously collecting data and are managed by cloud services, which actively monitor performance of the ML algorithm(s) and feedback data for retraining in the next iteration of the lifecycle – enabling continuous improvement of the algorithms, as well as supporting deployment of new AI features.

        + +

        process flow chart

        + +

        Example of a typical ML software development lifecycle.

        + +

        Scott Arbeit from GitHub recently published an excellent blog that highlights the importance of MLOps in machine learning and describes automation via simplified GitHub actions for several key tasks including:

        + +
          +
        • Data preprocessing: cleaning and preparation of data for training.
        • +
        • Model training and validation: automatic execution of training scripts when new data is pushed or when changes are made to the model code.
        • +
        • Deployment: automatic packaging and deployment of models to production environments upon successful training and validation.
        • +
        • Monitoring and alerts: workflows to monitor model performance and send alerts if certain thresholds are breached.
        • +
        + +

        The article also describes a conceptual efficient MLOps pipeline that takes advantage of new, low-cost Arm Runners natively integrated into GitHub Actions to train and validate PyTorch models. It also uses containerization for consistent deployment across different environments.

        + +

        Our team at Arm put GitHub’s ideas and conceptual workflow into practice and created a tutorial to help you get started today.

        + +

        Optimizing Your PyTorch MLOps Workflow

        + +

        A new Arm Learning Path unpacks each of the key phases described in Scott’s blog, and demonstrates each key task in detail, providing prescriptive instructions and code examples to leverage several aspects of the PyTorch framework to implement each phase.

        + +

        process flow chart

        + +

        Key ML tasks to setup and automate with GitHub Actions.

        + +

        With this learning path you will be able to take advantage of the following strategies with a real-world object detection use case to make your own streamlined MLOps workflow:

        + +
          +
        • Containerization: Package your PyTorch model and its dependencies into a Docker container to help ensure consistent performance across different environments.
        • +
• Efficient Data Loading: Optimize data loading pipelines to help minimize I/O bottlenecks and maximize GPU utilization (a small DataLoader sketch follows this list).
        • +
        • Model Optimization: Explore techniques like model quantization, pruning, and knowledge distillation to help reduce model size and improve inference speed.
        • +
        • Leverage PyTorch’s Ecosystem: Utilize libraries like TorchVision to help streamline common deep learning tasks.
        • +
        • Monitor and Profile: Monitor resource utilization and identify potential bottlenecks to further optimize your workflow.
        • +
        + +
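As a small, generic illustration of the data-loading point above, the standard torch.utils.data.DataLoader knobs look like the following. The dataset and parameter values are placeholders, not taken from the learning path.

import torch
from torch.utils.data import DataLoader, TensorDataset

# Placeholder dataset standing in for a real one from the learning path.
dataset = TensorDataset(torch.randn(1024, 3, 224, 224), torch.randint(0, 10, (1024,)))

loader = DataLoader(
    dataset,
    batch_size=64,
    shuffle=True,
    num_workers=4,            # parallel worker processes to hide I/O latency
    pin_memory=True,          # page-locked host memory for faster host-to-device copies
    prefetch_factor=2,        # batches pre-loaded per worker
    persistent_workers=True,  # keep workers alive between epochs
)

for images, labels in loader:
    pass  # training or inference step goes here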

        An End-to-End MLOps Workflow

        + +

        The best part of this learning path is not just that it takes you through each task in detail, but it brings it all together into a unified automated workflow.

        + +

With GitHub Actions, you can build an end-to-end custom MLOps workflow that combines and automates the workflows for each ML task. To demonstrate this, the repository contains a boilerplate .yml workflow file that automates the individual steps.

        + +

        You can run an MLOps workflow using GitHub Actions natively for managing all the steps in your ML application’s lifecycle.

        + +

        process flow chart

        + +

        A successful run of this MLOps workflow in GitHub Actions.

        + +

        Try It Yourself!

        + +

        Our Arm team has battle-tested this tutorial in the field and delivered the tutorial as a workshop at GitHub Universe 2024 earlier this year. Now it’s time for you to take it for a spin and get hands-on with PyTorch and MLOps.

        + +

        Try the Arm Learning Path Here!

        + +

        By the end of this tutorial, you can:

        + +
          +
        • Set up a new GitHub Arm-runner to natively build an arm64 image to take advantage of the lowest-cost, most power efficient compute available.
        • +
        • Train and test a PyTorch ML model with the German Traffic Sign Recognition Benchmark (GTSRB) dataset.
        • +
        • Compare the performance of two trained PyTorch ML models; one model compiled with OpenBLAS (Open Basic Linear Algebra Subprograms Library) and oneDNN (Deep Neural Network Library), and the other model compiled with Arm Compute Library (ACL).
        • +
        • Containerize a ML model and push the container to DockerHub.
        • +
• Automate each task into a single MLOps pipeline using GitHub Actions.
        • +
        + +

        Combining the power of PyTorch with the simplicity of GitHub Actions and the efficiency of native Arm Runners significantly helps you accelerate your deep learning development and deployment processes. Following the best practices outlined in this blog post helps you achieve optimal performance and cost-effectiveness for your PyTorch projects.

        + +

        We’d love to see what you create based on this example. If you have created your own Arm Learning Path, you are invited to share it here.

        + +
diff --git a/blog/mobile-demo-apps-overview/index.html b/blog/mobile-demo-apps-overview/index.html
new file mode 100644
index 000000000000..a4436a3fe8b8
--- /dev/null
+++ b/blog/mobile-demo-apps-overview/index.html
@@ -0,0 +1,775 @@

An Overview of the PyTorch Mobile Demo Apps | PyTorch

        + by + + Jeff Tang and Mark Saroufim + +

        +

        PyTorch Mobile provides a runtime environment to execute state-of-the-art machine learning models on mobile devices. Latency is reduced, privacy preserved, and models can run on mobile devices anytime, anywhere.

        + +

        In this blog post, we provide a quick overview of 10 currently available PyTorch Mobile powered demo apps running various state-of-the-art PyTorch 1.9 machine learning models spanning images, video, audio and text.

        + +

        It’s never been easier to deploy a state-of-the-art ML model to a phone. You don’t need any domain knowledge in Machine Learning and we hope one of the below examples resonates enough with you to be the starting point for your next project.

        + +
        + +
        + +

        Computer Vision

        +

        Image Classification

        +

        This app demonstrates how to use PyTorch C++ libraries on iOS and Android to classify a static image with the MobileNetv2/3 model.

        + +

        iOS #1 iOS #2 Android #1 Android #2

        + +

        iOS Android

        + +
        + +
        + +

        Live Image Classification

        +

This app demonstrates how to run quantized MobileNetV2 and ResNet18 models to classify images in real time with an iOS or Android device camera.

        + +

        iOS Android

        + +
        + + +
        + +

        Image Segmentation

        +

        This app demonstrates how to use the PyTorch DeepLabV3 model to segment images. The updated app for PyTorch 1.9 also demonstrates how to create the model using the Mobile Interpreter and load the model with the LiteModuleLoader API.
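For context, the export side of that workflow roughly looks like the sketch below; the model choice and file name are illustrative, and the on-device loading with LiteModuleLoader then happens in the Android/iOS app code.

import torch
import torchvision
from torch.utils.mobile_optimizer import optimize_for_mobile

# Script a DeepLabV3 model and save it in the lite-interpreter format used by PyTorch Mobile.
model = torchvision.models.segmentation.deeplabv3_resnet50(pretrained=True).eval()
scripted = torch.jit.script(model)
optimized = optimize_for_mobile(scripted)
optimized._save_for_lite_interpreter("deeplabv3_scripted.ptl")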

        + +

        iOS Android

        + +

        iOS Android

        + +
        + +
        + +

        Vision Transformer for Handwritten Digit Recognition

        +

        This app demonstrates how to use Facebook’s latest optimized Vision Transformer DeiT model to do image classification and handwritten digit recognition.

        + +

        iOS Android

        + +

        Android

        + +
        + +
        + +

        Object Detection

        +

        This app demonstrates how to convert the popular YOLOv5 model and use it on an iOS app that detects objects from pictures in your photos, taken with camera, or with live camera.

        + +

        iOS Android

        + +

        iOS Android

        + +
        + +
        + +

        D2Go

        +

        This app demonstrates how to create and use a much lighter and faster Facebook D2Go model to detect objects from pictures in your photos, taken with camera, or with live camera.

        + +

        iOS Android

        + +

        iOS Android

        + +
        + +
        + +

        Video

        +

        Video Classification

        +

        This app demonstrates how to use a pre-trained PyTorchVideo model to perform video classification on tested videos, videos from the Photos library, or even real-time videos.

        + +

        iOS Android

        + +

        iOS Android Deep Dive

        + +
        + +
        + +

        Natural Language Processing

        +

        Text Classification

        +

        This app demonstrates how to use a pre-trained Reddit model to perform text classification.

        + +

        iOS Android

        + +
        + +
        + +

        Machine Translation

        +

This app demonstrates how to convert a sequence-to-sequence neural machine translation model trained with the code in the PyTorch NMT tutorial for French-to-English translation.

        + +

        iOS Android

        + +

        iOS Android

        + +
        + +
        + +

        Question Answering

        +

This app demonstrates how to use the DistilBERT Hugging Face transformer model to answer questions about PyTorch Mobile itself.

        + +

        iOS Android

        + +

        iOS Android

        + +
        + +
        + +

        Audio

        +

        Speech Recognition

        +

        This app demonstrates how to convert Facebook AI’s torchaudio-powered wav2vec 2.0, one of the leading models in speech recognition to TorchScript before deploying it.

        + +

        iOS Android

        + +
        + +
        + +

        We really hope one of these demo apps stood out for you. For the full list, make sure to visit the iOS and Android demo app repos. You should also definitely check out the video An Overview of the PyTorch Mobile Demo Apps which provides both an overview of the PyTorch mobile demo apps and a deep dive into the PyTorch Video app for iOS and Android.

        + +
diff --git a/blog/model-serving-in-pyorch/index.html b/blog/model-serving-in-pyorch/index.html
new file mode 100644
index 000000000000..1326adba574f
--- /dev/null
+++ b/blog/model-serving-in-pyorch/index.html
@@ -0,0 +1,712 @@

Model Serving in PyTorch | PyTorch

        May 08, 2019

        +

        + Model Serving in PyTorch +

        +
        +
        + +
        +
        +
        + +
        +

        + by + + Jeff Smith + +

        +

PyTorch has seen a lot of adoption in research, but people can get confused about how well PyTorch models can be taken into production. This blog post is meant to clear up any confusion people might have about the road to production in PyTorch. When people talk about taking a model “to production,” they usually mean performing inference, sometimes called model evaluation, prediction, or serving. At the level of a function call, in PyTorch, inference looks something like this:

        + +
          +
        • In Python +
            +
          • module(input)
          • +
          +
        • +
        • In traced modules +
            +
          • module(input)
          • +
          +
        • +
        • In C++ +
            +
          • at::Tensor output = module->forward(inputs).toTensor();
          • +
          +
        • +
        + +

        Since we at Facebook perform inference operations using PyTorch hundreds of trillions of times per day, we’ve done a lot to make sure that inference runs as efficiently as possible.

        + +

        Serving Strategies

        + +

        That zoomed-in view of how you use models in inference isn’t usually the whole story, though. In a real world machine learning system, you often need to do more than just run a single inference operation in the REPL or Jupyter notebook. Instead, you usually need to integrate your model into a larger application in some way. Depending on what you need to do, you can usually take one of the following approaches.

        + +

        Direct embedding

        + +

In application settings like mobile, we often just directly call the model as part of a larger program. This isn’t just for apps; usually this is how robotics and dedicated devices work as well. At a code level, the call to the model is exactly the same as what is shown in the section about inference above. A key concern is often that a Python interpreter is not present in such environments, which is why PyTorch allows you to call your models from C++ and ship a model without the need for a Python runtime.

        + +

        Model microservices

        + +

If you’re using your model in a server side context and you’re managing multiple models, you might choose to treat each individual model (or each individual model version) as a separate service, usually using some sort of packaging mechanism like a Docker container. Then that service is often made network accessible via some sort of service, either using JSON over HTTP or an RPC technology like gRPC. The key characteristic of this approach is that you’re defining a service with a single endpoint that just calls your model. Then you do all of your model management (promotion, rollback, etc.) via whatever system you already use to manage your services (e.g. Kubernetes, ECS).

        + +

        Model servers

        + +

        An additional possible solution is to use a model server. This is an application built to manage and serve models. It allows you to upload multiple models and get distinct prediction endpoints for each of them. Typically such systems include a number of other features to help solve more of the whole problem of managing and serving models. This can include things like metrics, visualization, data pre-processing, and more. Even something as simple as having a system for automatically versioning models can make building important features like model rollbacks much easier.

        + +

        Evolving Patterns

        + +

        The above is a somewhat arbitrary breakdown of different approaches based on a snapshot in time. Design patterns are still evolving. Recently, model server designs have started to adopt more of the technologies of general service infrastructure such as Docker containers and kubernetes, so many model servers have started to share properties of the model microservice design discussed above. For a deeper dive into the general concepts of model server designs, you can check out my book on machine learning systems.

        + +

        Serving PyTorch Models

        + +

        So, if you’re a PyTorch user, what should you use if you want to take your models to production?

        + +

        If you’re on mobile or working on an embedded system like a robot, direct embedding in your application is often the right choice. +For mobile specifically, your use case might be served by the ONNX export functionality. +Note that ONNX, by its very nature, has limitations and doesn’t support all of the functionality provided by the larger PyTorch project. +You can check out this tutorial on deploying PyTorch models to mobile using ONNX to see if this path might suit your use case. +That said, we’ve heard that there’s a lot more that PyTorch users want to do on mobile, so look for more mobile-specific functionality in PyTorch in the future. +For other embedded systems, like robots, running inference on a PyTorch model from the C++ API could be the right solution.
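For reference, a minimal ONNX export of a PyTorch model looks roughly like this; the model choice, input shape, and file name are placeholders for the sake of the example.

import torch
import torchvision

# Export a model to ONNX with a dummy input that fixes the expected input shape.
model = torchvision.models.resnet18(pretrained=True).eval()
dummy_input = torch.randn(1, 3, 224, 224)
torch.onnx.export(model, dummy_input, "resnet18.onnx",
                  input_names=["input"], output_names=["output"])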

        + +

        If you can’t use the cloud or prefer to manage all services using the same technology, you can follow this example to build a simple model microservice using the Flask web framework.
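The linked example covers the details; as a bare-bones illustration of the idea (the endpoint name, model, and preprocessing are placeholders, not the tutorial's exact code), a Flask model microservice can be as small as:

import io

import torch
import torchvision
from flask import Flask, jsonify, request
from PIL import Image
from torchvision import transforms

app = Flask(__name__)
model = torchvision.models.resnet18(pretrained=True).eval()
preprocess = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
])

@app.route("/predict", methods=["POST"])
def predict():
    # Expect an image file in the request, run it through the model, return the top class.
    image = Image.open(io.BytesIO(request.files["file"].read())).convert("RGB")
    with torch.no_grad():
        logits = model(preprocess(image).unsqueeze(0))
    return jsonify({"class_id": int(logits.argmax(dim=1))})

if __name__ == "__main__":
    app.run()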

        + +

        If you want to manage multiple models within a non-cloud service solution, there are teams developing PyTorch support in model servers like MLFlow, Kubeflow, and RedisAI. We’re excited to see innovation from multiple teams building OSS model servers, and we’ll continue to highlight innovation in the PyTorch ecosystem in the future.

        + +

If you can use the cloud for your application, there are several great choices for working with models in the cloud. For AWS Sagemaker, you can find a guide to all of the resources from AWS for working with PyTorch, including docs on how to use the Sagemaker Python SDK. You can also see some talks we’ve given on using PyTorch on Sagemaker. Finally, if you happen to be using PyTorch via FastAI, then they’ve written a really simple guide to getting up and running on Sagemaker.

        + +

        The story is similar across other major clouds. On Google Cloud, you can follow these instructions to get access to a Deep Learning VM with PyTorch pre-installed. On Microsoft Azure, you have a number of ways to get started from Azure Machine Learning Service to Azure Notebooks showing how to use PyTorch.

        + +

        Your Models

        + +

        Whichever approach you take to bringing your PyTorch models to production, we want to support you and enable your success. Do you love one of the options above? Are you having difficulty with that one crucial feature you can’t find support for? We’d love to discuss more on the deployment category on the PyTorch Discuss forums. We’d love to help, and where you’re seeing success, amplify your story.

        + +
diff --git a/blog/new-executive-director/index.html b/blog/new-executive-director/index.html
new file mode 100644
index 000000000000..9ba2fc054e72
--- /dev/null
+++ b/blog/new-executive-director/index.html
@@ -0,0 +1,661 @@

PyTorch Foundation Welcomes New Executive Director | PyTorch

        + by + + Team PyTorch + +

        +

Matt White

The PyTorch Foundation is excited to welcome Matt White, our new executive director. The PyTorch Foundation formed in 2022 with the goal to drive adoption of AI tooling by fostering and sustaining an ecosystem of open source, vendor-neutral projects with PyTorch. Over the past 2 years, we’ve seen excellent growth across the project – with both contributor and member growth.

        + +

        “I am honored to be a part of the PyTorch Foundation, working with such a passionate and skilled community,” said Matt White. “I am looking forward to working with our contributors and members to advance the PyTorch ecosystem through research, cutting edge technologies and open source best practices.”

        + +

        Matt is a career technologist, researcher and innovator and has over 25 years of experience in AI, data, autonomous systems and simulations. He is the Co-founder and Chair of the Open Metaverse Foundation, a part of the Linux Foundation. Previously, Matt was the Director of the Generative AI Commons at the Linux Foundation, leading the advancement of open science and open-source artificial intelligence projects. He is also the GM of AI at the Linux Foundation.

        + +

        Learn more about the PyTorch Foundation:

        + + + +
diff --git a/blog/new-features-for-ai/index.html b/blog/new-features-for-ai/index.html
new file mode 100644
index 000000000000..fb3037290de1
--- /dev/null
+++ b/blog/new-features-for-ai/index.html
@@ -0,0 +1,1238 @@

PyTorch 2.1 Contains New Performance Features for AI Developers | PyTorch

        + by + + Intel + +

        +

        We are excited to see the release of PyTorch 2.1. In this blog, we discuss the five features for which Intel made significant contributions to PyTorch 2.1:

        + +
          +
        1. TorchInductor-CPU optimizations including Bfloat16 inference path for torch.compile
        2. +
        3. CPU dynamic shape inference path for torch.compile
        4. +
        5. C++ wrapper (prototype)
        6. +
        7. Flash-attention-based scaled dot product algorithm for CPU
        8. +
9. PyTorch 2 export post-training quantization with an x86 back end through TorchInductor
        10. +
        + +

        At Intel, we are delighted to be part of the PyTorch community and appreciate the collaboration with and feedback from our colleagues at Meta* as we co-developed these features.

        + +

        Let’s get started.

        + +

        TorchInductor-CPU Optimizations

        + +

        This feature optimizes bfloat16 inference performance for TorchInductor. The 3rd and 4th generation Intel® Xeon® Scalable processors have built-in hardware accelerators for speeding up dot-product computation with the bfloat16 data type. Figure 1 shows a code snippet of how to specify the BF16 inference path.

        + +
        user_model = ...
        +
        +user_model.eval()
        +with torch.no_grad(), torch.autocast("cpu"):
        +	compiled_model = torch.compile(user_model)
        +	y = compiled_model(x)
        +
        + +

Figure 1. Code snippet showing the use of BF16 inference with TorchInductor

        + +

        We measured the performance on three TorchInductor benchmark suites—TorchBench, Hugging Face, and TIMM—and the results are as follows in Table 1. Here we see that performance in graph mode (TorchInductor) outperforms eager mode by factors ranging from 1.25x to 2.35x.

        + +

        Table 1. Bfloat16 performance geometric mean speedup in graph mode, compared with eager mode

Bfloat16 Geometric Mean Speedup (Single-Socket Multithreads)

    Compiler | torchbench | huggingface | timm_models
    inductor | 1.81x      | 1.25x       | 2.35x

Bfloat16 Geometric Mean Speedup (Single-Core Single Thread)

    Compiler | torchbench | huggingface | timm_models
    inductor | 1.74x      | 1.28x       | 1.29x
        + +

        Developers can fully deploy their models on 4th generation Intel Xeon processors to take advantage of the Intel® Advanced Matrix Extensions (Intel® AMX) feature to get peak performance for torch.compile. Intel AMX has two primary components: tiles and tiled matrix multiplication (TMUL). The tiles store large amounts of data in eight two-dimensional registers, each one kilobyte in size. TMUL is an accelerator engine attached to the tiles that contain instructions to compute larger matrices in a single operation.

        + +

        CPU Dynamic Shapes Inference Path for torch.compile

        + +

        Dynamic shapes is one of the key features in PyTorch 2.0. PyTorch 2.0 assumes everything is static by default. If we recompile because a size changed, we will instead attempt to recompile that size as being dynamic (sizes that have changed are likely to change in the future). Dynamic shapes support is required for popular models like large language models (LLM). Dynamic shapes that provide support for a broad scope of models can help users get more benefit from torch.compile. For dynamic shapes, we provide the post-op fusion for conv/gemm operators and vectorization code-gen for non-conv/gemm operators.

        + +

        Dynamic shapes is supported by both the inductor Triton back end for CUDA* and the C++ back end for CPU. The scope covers improvements for both functionality (as measured by model passing rate) and performance (as measured by inference latency/throughput). Figure 2 shows a code snippet for the use of dynamic shape inference with TorchInductor.

        + +
user_model = ...
+# x_size1 / x_size2: example inputs with different shapes (placeholders)
+
+# Training example
+compiled_model = torch.compile(user_model)
+y = compiled_model(x_size1)
+# A recompile is triggered here because the input size changed
+y = compiled_model(x_size2)
+
+
+# Inference example
+user_model.eval()
+compiled_model = torch.compile(user_model)
+with torch.no_grad():
+    y = compiled_model(x_size1)
+    # A recompile is triggered here because the input size changed
+    y = compiled_model(x_size2)
+
        + +

        Figure 2. Code snippet showing the use of dynamic shape inference with TorchInductor

        + +

        We again measured the performance on the three TorchInductor benchmark suites—TorchBench, Hugging Face, and TIMM—and the results are in Table 2. Here we see that performance in graph mode outperforms eager mode by factors ranging from 1.15x to 1.79x.

        + +

        Table 2. Dynamic shape geometric mean speedup compared with Eager mode

Dynamic Shape Geometric Mean Speedup (Single-Socket Multithreads)
Compiler    torchbench    huggingface    timm_models
inductor    1.35x         1.15x          1.79x

Dynamic Shape Geometric Mean Speedup (Single-Core Single-Thread)
Compiler    torchbench    huggingface    timm_models
inductor    1.48x         1.15x          1.48x

        C++ Wrapper (Prototype)

        + +

This feature generates C++ code, instead of Python* code, to invoke the generated kernels and external kernels in TorchInductor, reducing Python overhead. It is also an intermediate step toward supporting deployment in environments without Python.

        + +

        To enable this feature, use the following configuration:

        + +
        import torch
        +import torch._inductor.config as config
        +config.cpp_wrapper = True
        +
        + +

For light workloads, where the overhead of the Python wrapper is more dominant, the C++ wrapper demonstrates a higher performance boost. We grouped the models in TorchBench, Hugging Face, and TIMM by the average inference time of one iteration and categorized them into small, medium, and large categories. Table 3 shows the geometric mean speedups achieved by the C++ wrapper in comparison to the default Python wrapper.

        + +

Table 3. C++ wrapper geometric mean speedup compared with the default Python wrapper

FP32 Static Shape Mode Geometric Mean Speedup (Single-Socket Multithreads)
Compiler    Small (t <= 0.04s)    Medium (0.04s < t <= 1.5s)    Large (t > 1.5s)
inductor    1.06x                 1.01x                         1.00x

FP32 Static Shape Mode Geometric Mean Speedup (Single-Core Single-Thread)
Compiler    Small (t <= 0.04s)    Medium (0.04s < t <= 1.5s)    Large (t > 1.5s)
inductor    1.13x                 1.02x                         1.01x

FP32 Dynamic Shape Mode Geometric Mean Speedup (Single-Socket Multithreads)
Compiler    Small (t <= 0.04s)    Medium (0.04s < t <= 1.5s)    Large (t > 1.5s)
inductor    1.05x                 1.01x                         1.00x

FP32 Dynamic Shape Mode Geometric Mean Speedup (Single-Core Single-Thread)
Compiler    Small (t <= 0.04s)    Medium (0.04s < t <= 1.5s)    Large (t > 1.5s)
inductor    1.14x                 1.02x                         1.01x

BF16 Static Shape Mode Geometric Mean Speedup (Single-Socket Multithreads)
Compiler    Small (t <= 0.04s)    Medium (0.04s < t <= 1.5s)    Large (t > 1.5s)
inductor    1.09x                 1.03x                         1.04x

BF16 Static Shape Mode Geometric Mean Speedup (Single-Core Single-Thread)
Compiler    Small (t <= 0.04s)    Medium (0.04s < t <= 1.5s)    Large (t > 1.5s)
inductor    1.17x                 1.04x                         1.03x

        Flash-Attention-Based Scaled Dot Product Algorithm for CPU

        + +

Scaled dot product attention (SDPA) is one of the flagship features of PyTorch 2.0 that helps speed up transformer models. While it is accelerated with optimized CUDA kernels, CPU kernels had not yet been optimized. This flash-attention implementation targets both training and inference, with both FP32 and bfloat16 data types supported. No front-end change is required for users to leverage this SDPA optimization: when SDPA is called, a specific implementation, including this new one, is chosen automatically.
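As a minimal sketch of that point (the shapes below are illustrative and not from the post), calling the stock torch.nn.functional.scaled_dot_product_attention API is all that is needed; the dispatcher selects an available implementation for the device, which on CPU can be this flash-attention-based kernel.

import torch
import torch.nn.functional as F

# (batch, heads, sequence length, head dim) -- illustrative sizes
query = torch.randn(2, 8, 128, 64)
key = torch.randn(2, 8, 128, 64)
value = torch.randn(2, 8, 128, 64)

with torch.no_grad():
    # FP32 path; wrapping this in torch.autocast("cpu") exercises the bfloat16 path instead.
    out = F.scaled_dot_product_attention(query, key, value)
print(out.shape)  # torch.Size([2, 8, 128, 64])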

        + +

We measured the SDPA-related models in Hugging Face, and they proved effective when compared with the unfused SDPA. The geometric mean speedups for the SDPA optimization are shown in Table 4.

        + +

        Table 4. SDPA optimization performance geometric mean speedup

SDPA Geometric Mean Speedup (Single-Socket Multithreads)
Compiler    Geometric Speedup FP32    Geometric Speedup BF16
inductor    1.15x, 20/20              1.07x, 20/20

SDPA Geometric Mean Speedup (Single-Core Single-Thread)
Compiler    Geometric Speedup FP32    Geometric Speedup BF16
inductor    1.02x, 20/20              1.04x, 20/20

        PyTorch 2 Export Post-Training Quantization with x86 Back End through Inductor

        + +

        PyTorch provides a new quantization flow in the PyTorch 2.0 export. This feature uses TorchInductor with an x86 CPU device as the back end for post-training static quantization with this new quantization flow. An example code snippet is shown in Figure 3.

        + +
import copy
+import torch
+import torch._dynamo as torchdynamo
+from torch.ao.quantization.quantize_pt2e import convert_pt2e, prepare_pt2e
+import torch.ao.quantization.quantizer.x86_inductor_quantizer as xiq
+
+model = ...           # user-defined model (placeholder)
+example_inputs = ...  # example inputs used for tracing and calibration (placeholder)
+
+model.eval()
+with torch.no_grad():
+    # Step 1: Trace the model into an FX graph of flattened ATen operators
+    exported_graph_module, guards = torchdynamo.export(
+        model,
+        *copy.deepcopy(example_inputs),
+        aten_graph=True,
+    )
+
+    # Step 2: Insert observers or fake quantize modules
+    quantizer = xiq.X86InductorQuantizer()
+    operator_config = xiq.get_default_x86_inductor_quantization_config()
+    quantizer.set_global(operator_config)
+    prepared_graph_module = prepare_pt2e(exported_graph_module, quantizer)
+
+    # Run calibration here.
+
+    # Step 3: Quantize the model
+    convert_graph_module = convert_pt2e(prepared_graph_module)
+
+    # Step 4: Lower the quantized model into the back end
+    compile_model = torch.compile(convert_graph_module)
        +
        + +

        Figure 3. Code snippet showing the use of Inductor as back end for PyTorch 2 export post-training quantization

        + +

All convolutional neural network (CNN) models from the TorchBench test suite have been measured and proved effective when compared with the Inductor FP32 inference path. Performance metrics are shown in Table 5.

Table 5. Quantization performance geometric mean speedup and related accuracy loss

Compiler    Geometric Speedup    Geometric Related Accuracy Loss
inductor    3.25x, 12/12         0.44%, 12/12

        Next Steps

        + +

        Get the Software

        + +

        Try out PyTorch 2.1 and realize the performance benefits for yourself from these features contributed by Intel.

        + +

        We encourage you to check out Intel’s other AI Tools and framework optimizations and learn about the open, standards-based oneAPI multiarchitecture, multivendor programming model that forms the foundation of Intel’s AI software portfolio.

        + +

        For more details about the 4th generation Intel Xeon Scalable processor, visit the AI platform where you can learn how Intel is empowering developers to run high-performance, efficient end-to-end AI pipelines.

        + +

        PyTorch Resources

        + + + +

        Product and Performance Information

        + +

        1 Amazon EC2* m7i.16xlarge: 1-node, Intel Xeon Platinum 8488C processor with 256 GB memory (1 x 256 GB DDR5 4800 MT/s), microcode 0x2b000461, hyperthreading on, turbo on, Ubuntu* 22.04.3 LTS, kernel 6.2.0-1011-aws, GCC* 11.3.0, Amazon Elastic Block Store 200 GB, BIOS Amazon EC2 1.0 10/16/2017; Software: PyTorch 2.1.0_rc4, Intel® oneAPI Deep Neural Network Library (oneDNN) version 3.1.1, TorchBench, TorchVision, TorchText, TorchAudio, TorchData, TorchDynamo Benchmarks, tested by Intel on 9/12/2023.

        + +

2 Amazon EC2 c6i.16xlarge: 1-node, Intel Xeon Platinum 8375C processor with 128 GB memory (1 x 128 GB DDR4 3200 MT/s), microcode 0xd0003a5, hyperthreading on, turbo on, Ubuntu 22.04.2 LTS, kernel 6.2.0-1011-aws, GCC 11.3.0, Amazon Elastic Block Store 200 GB, BIOS Amazon EC2 1.0 10/16/2017; Software: PyTorch 2.1.0_rc4, oneDNN version 3.1.1, TorchBench, TorchVision, TorchText, TorchAudio, TorchData, TorchDynamo Benchmarks, TorchBench cpu userbenchmark, tested by Intel on 9/12/2023.

        + +
        +
        +
        +
        + + +
        +
        +
        +
        +

        + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/new-in-docs/index.html b/blog/new-in-docs/index.html new file mode 100644 index 000000000000..eec1c3da2f95 --- /dev/null +++ b/blog/new-in-docs/index.html @@ -0,0 +1,666 @@ + + + + + + + + + + + + + What's New in PyTorch Documentation | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +

        February 01, 2024

        +

        + What's New in PyTorch Documentation +

        +
        +
        + +
        +
        +
        + +
        +

        + by + + Team PyTorch + +

        +

        Greetings to the PyTorch community! Here is a quick update on PyTorch docs.

        + +

        In November 2023, we successfully conducted a PyTorch Docathon, a community event where PyTorch community members gathered together to improve PyTorch documentation and tutorials. This event saw a global participation of contributors who dedicated their time and effort to enhance our docs. We extend our sincere gratitude to everyone involved.

        + +

        A key accomplishment of the Docathon was the comprehensive work carried out on docstrings. Our community contributors meticulously reviewed and improved the docstrings based on the provided tasks.

        + +

        In addition to that, we’ve added three new tutorials that showcase real-world applications of PyTorch. We are particularly proud that two of these tutorials were contributed by PyTorch ecosystem partners.

        + +

Here are the new tutorials for you to explore:

        + +
          +
• Whole Slide Image Classification Using PyTorch and TIAToolbox – This tutorial demonstrates how to classify Whole Slide Images (WSIs), which are images of human tissue samples used by pathologists and researchers to study diseases like cancer at the microscopic level, using PyTorch deep learning models together with TIAToolbox.
        • +
        • Semi-Supervised Learning using USB built upon PyTorch – This tutorial introduces USB, a flexible and modular semi-supervised learning framework based on PyTorch, demonstrating its ease of use in training a FreeMatch/SoftMatch model on CIFAR-10 using pre-trained ViT and its adaptability to various algorithms and imbalanced datasets.
        • +
        • Deploying a PyTorch Stable Diffusion model as a Vertex AI Endpoint – This tutorial provides a step-by-step guide on how to streamline the deployment of a PyTorch Stable Diffusion model (v1.5) using Vertex AI, a fully-managed machine learning platform, by creating a custom TorchServe handler, uploading model artifacts to Google Cloud Storage, creating a Vertex AI model with the model artifacts and a prebuilt PyTorch container image, and finally deploying the model onto an endpoint.
        • +
        + +

        We’re planning more community events this year, so stay tuned!

        + +

And finally, we just published the new PyTorch 2.2 documentation and tutorials. Check them out!

        + +

        Best regards,
        +The PyTorch Team

        + +
        +
        +
        +
        + + +
        +
        +
        +
        +

        + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/new-library-updates-in-pytorch-1.13/index.html b/blog/new-library-updates-in-pytorch-1.13/index.html new file mode 100644 index 000000000000..4632b07dc91c --- /dev/null +++ b/blog/new-library-updates-in-pytorch-1.13/index.html @@ -0,0 +1,1093 @@ + + + + + + + + + + + + + New Library Updates in PyTorch 1.13 | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +

        October 28, 2022

        +

        + New Library Updates in PyTorch 1.13 +

        +
        +
        + +
        +
        +
        + +
        +

        + by + + Team PyTorch + +

        +

        Summary

        + +

        We are bringing a number of improvements to the current PyTorch libraries, alongside the PyTorch 1.13 release. These updates demonstrate our focus on developing common and extensible APIs across all domains to make it easier for our community to build ecosystem projects on PyTorch.

        + +

        Along with 1.13, we are releasing updates to the PyTorch Libraries, please find them below.

        + +

        TorchAudio

        + +

        (Beta) Hybrid Demucs Model and Pipeline

        + +

        Hybrid Demucs is a music source separation model that uses both spectrogram and time domain features. It has demonstrated state-of-the-art performance in the Sony® Music DeMixing Challenge. (citation: https://arxiv.org/abs/2111.03600)

        + +

The TorchAudio v0.13 release includes the following features:

        + +
          +
        • MUSDB_HQ Dataset, which is used in Hybrid Demucs training (docs)
        • +
        • Hybrid Demucs model architecture (docs)
        • +
        • Three factory functions suitable for different sample rate ranges
        • +
        • Pre-trained pipelines (docs)
        • +
        • SDR Results of pre-trained pipelines on MUSDB_HQ test set
        • +
        • Tutorial that steps through music source separation using the pretrained pipeline (docs)
        • +
Pipeline                     All     Drums    Bass     Other    Vocals
HDEMUCS_HIGH_MUSDB*          6.42    7.76     6.51     4.47     6.93
HDEMUCS_HIGH_MUSDB_PLUS**    9.37    11.38    10.53    7.24     8.32
        + +

        * Trained on the training data of MUSDB-HQ dataset.
        ** Trained on both training and test sets of MUSDB-HQ and 150 extra songs from an internal database that were specifically produced for Meta.

        + +
import torchaudio
+from torchaudio.pipelines import HDEMUCS_HIGH_MUSDB_PLUS
+
+bundle = HDEMUCS_HIGH_MUSDB_PLUS
+model = bundle.get_model()
+sources_list = model.sources
+
+mixture, samplerate = torchaudio.load("song.wav")
+sources = model(mixture)
+audios = dict(zip(sources_list, sources))
        +
        + +

        Special thanks to Alexandre Defossez for the guidance.

        + +

        (Beta) Datasets and Metadata Mode for SUPERB Benchmark

        + +

        TorchAudio adds support for various audio-related datasets used in downstream tasks for benchmarking self-supervised learning models. With the addition of several new datasets, there is now support for the downstream tasks in version 1 of the SUPERB benchmark, which can be found in the s3prl repository.

        + +

        For these datasets, we also add metadata support through a get_metadata function, enabling faster dataset iteration or preprocessing without the need to load waveforms. The function returns the same features as __getitem__, except it returns the relative waveform path rather than the loaded waveform.

        + +

        Datasets with metadata functionality

        + + + +

        (Beta) Custom Language Model support in CTC Beam Search Decoding

        + +

TorchAudio released a CTC beam search decoder in release 0.12, with KenLM language model support. This release adds functionality for creating custom Python language models that are compatible with the decoder, using the torchaudio.models.decoder.CTCDecoderLM wrapper.

        + +

        For more information on using a custom language model, please refer to the documentation and tutorial.

        + +

        (Beta) StreamWriter

        + +

        torchaudio.io.StreamWriter is a class for encoding media including audio and video. This can handle a wide variety of codecs, chunk-by-chunk encoding and GPU encoding.

        + +
        writer = StreamWriter("example.mp4")
        +writer.add_audio_stream(
        +    sample_rate=16_000,
        +    num_channels=2,
        +)
        +writer.add_video_stream(
        +    frame_rate=30,
        +    height=96,
        +    width=128,
        +    format="rgb24",
        +)
        +with writer.open():
        +    writer.write_audio_chunk(0, audio)
        +    writer.write_video_chunk(1, video)
        +
        + +

        For more information, refer to the documentation and the following tutorials

        + + +

        TorchData

        + +

        For a complete list of changes and new features, please visit our repository’s 0.5.0 release note.

        + +

        (Prototype) DataLoader2

        + +

DataLoader2 was introduced in the last release to execute the DataPipe graph, with support for dynamic sharding for multi-process/distributed data loading, multiple backend ReadingServices, and in-place modification of the DataPipe graph (e.g. shuffle control).

        + +

In this release, we further consolidated the API for DataLoader2, and detailed documentation is now available here. We continue to welcome early adopters and feedback, as well as potential contributors. If you are interested in trying it out, we encourage you to install the nightly version of TorchData.

        + +

        (Beta) Data Loading from Cloud Service Providers

        + +

        We extended our support to load data from additional cloud storage providers via DataPipes, now covering AWS, Google Cloud Storage, and Azure. A tutorial is also available. We are open to feedback and feature requests.
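As a rough sketch of what this can look like (the bucket path is hypothetical, and this assumes the fsspec-based DataPipes together with an installed filesystem driver such as s3fs):

from torchdata.datapipes.iter import IterableWrapper

# List objects under a (hypothetical) S3 prefix, then open them for reading.
dp = IterableWrapper(["s3://my-bucket/train/"]).list_files_by_fsspec()
dp = dp.open_files_by_fsspec(mode="rb")

for path, stream in dp:
    print(path, stream.read(16))
    break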

        + +

        We also performed a simple benchmark, comparing the performance of data loading from AWS S3 and attached volume on an AWS EC2 instance.

        + +

        torch::deploy (Beta)

        + +

        torch::deploy is now in Beta! torch::deploy is a C++ library for Linux based operating systems that allows you to run multiple Python interpreters in a single process. You can run your existing eager PyTorch models without any changes for production inference use cases. Highlights include:

        + +
          +
        • Existing models work out of the box–no need to modify your python code to support tracing.
        • +
        • Full support for your existing Python environment including C extensions.
        • +
        • No need to cross process boundaries to load balance in multi-GPU serving environments.
        • +
        • Model weight can be shared between multiple Python interpreters.
        • +
        • A vastly improved installation and setup process.
        • +
        + +
        torch::deploy::InterpreterManager manager(4);
        +
        +// access one of the 4 interpreters
        +auto I = manager.acquireOne();
        +
        +// run infer from your_model.py
        +I.global("your_model", "infer")({at::randn({10, 240, 320})});
        +
        + +

        Learn more here.

        + +

        (Beta) CUDA/ROCm/CPU Backends

        + +

        torch::deploy now links against standard PyTorch Python distributions so all accelerators that PyTorch core supports such as CUDA and AMD/HIP work out of the box.

        + + + +

        (Prototype) aarch64/arm64 support

        + +

        torch::deploy now has basic support for aarch64 Linux systems.

        + + + +

        TorchEval

        + +

        (Prototype) Introducing Native Metrics Support for PyTorch

        + +

        TorchEval is a library built for users who want highly performant implementations of common metrics to evaluate machine learning models. It also provides an easy to use interface for building custom metrics with the same toolkit. Building your metrics with TorchEval makes running distributed training loops with torch.distributed a breeze.
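As a small sketch of the metric API (using the MulticlassAccuracy metric; the tensors below are illustrative):

import torch
from torcheval.metrics import MulticlassAccuracy

metric = MulticlassAccuracy()

# Accumulate predictions batch by batch, then compute the final value once.
logits = torch.tensor([[0.9, 0.1], [0.2, 0.8], [0.6, 0.4]])
targets = torch.tensor([0, 1, 1])
metric.update(logits, targets)

print(metric.compute())  # tensor(0.6667): 2 of 3 predictions are correct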

        + +

        Learn more with our docs, see our examples, or check out our GitHub repo.

        + +

        TorchMultimodal Release (Beta)

        + +

Please watch for upcoming blogs in early November that will introduce TorchMultimodal, a PyTorch domain library for training SoTA multi-task multimodal models at scale, in more detail; in the meantime, play around with the library and models through our tutorial.

        + +

        TorchRec

        + +

        (Prototype) Simplified Optimizer Fusion APIs

        + +

We’ve provided a simplified and more intuitive API for setting fused optimizer settings via apply_optimizer_in_backward. This new approach enables specifying optimizer settings on a per-parameter basis, and sharded modules will configure FBGEMM’s TableBatchedEmbedding modules accordingly. Additionally, this now lets TorchRec’s planner account for optimizer memory usage. This should alleviate reports of sharding jobs running out of memory when using Adam with a plan generated by the planner.

        + +

        (Prototype) Simplified Sharding APIs

        + +

        We’re introducing the shard API, which now allows you to shard only the embedding modules within a model, and provides an alternative to the current main entry point - DistributedModelParallel. This lets you have a finer grained control over the rest of the model, which can be useful for customized parallelization logic, and inference use cases (which may not require any parallelization on the dense layers). We’re also introducing construct_module_sharding_plan, providing a simpler interface to the TorchRec sharder.

        + +

        (Beta) Quantized Comms

        + +

Applying quantization or mixed precision to tensors in a collective call during model parallel training greatly improves training efficiency, with little to no effect on model quality. TorchRec now integrates with the quantized comms library provided by FBGEMM GPU and provides an interface to construct encoders and decoders (codecs) that surround the all_to_all and reduce_scatter collective calls in the output_dist of a sharded module. We also allow you to construct your own codecs to apply to your sharded module. The codecs provided by FBGEMM allow FP16, BF16, FP8, and INT8 compressions, and you may use different quantizations for the forward pass and backward pass.

        + +

        TorchSnapshot (Beta)

        + +

        Along with PyTorch 1.13, we are releasing the beta version of TorchSnapshot, which is a performant, memory-efficient checkpointing library for PyTorch applications, designed with large, complex distributed workloads in mind. Highlights include:

        + +
          +
        • Performance: TorchSnapshot provides a fast checkpointing implementation employing various optimizations, including zero-copy serialization for most tensor types, overlapped device-to-host copy and storage I/O, parallelized storage I/O
        • +
        • Memory Use: TorchSnapshot’s memory usage adapts to the host’s available resources, greatly reducing the chance of out-of-memory issues when saving and loading checkpoints
        • +
        • Usability: Simple APIs that are consistent between distributed and non-distributed workloads
        • +
        + +

        Learn more with our tutorial.
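For orientation, here is a minimal sketch of the save/restore flow, assuming the Snapshot.take and restore entry points from the TorchSnapshot documentation; the path and module are placeholders.

import torch
import torchsnapshot

model = torch.nn.Linear(8, 2)
app_state = {"model": model}

# Save a checkpoint of the application state.
snapshot = torchsnapshot.Snapshot.take(path="/tmp/my-snapshot", app_state=app_state)

# Later (or in another process): restore the state in place.
snapshot.restore(app_state)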

        + +

        TorchVision

        + +

We are happy to introduce torchvision v0.14 (release note). This version introduces a new model registration API to help users retrieve and list models and weights. It also includes new image and video classification models such as MViT, S3D, Swin Transformer V2, and MaxViT. Last but not least, we also have new primitives and augmentations such as the PolynomialLR scheduler and SimpleCopyPaste.

        + +

        (Beta) Model Registration API

        + +

        Following up on the multi-weight support API that was released on the previous version, we have added a new model registration API to help users retrieve models and weights. There are now 4 new methods under the torchvision.models module: get_model, get_model_weights, get_weight, and list_models. Here are examples of how we can use them:

        + +
        import torchvision
        +from torchvision.models import get_model, get_model_weights, list_models
        +
        +
        +max_params = 5000000
        +
        +tiny_models = []
        +for model_name in list_models(module=torchvision.models):
        +    weights_enum = get_model_weights(model_name)
        +    if len([w for w in weights_enum if w.meta["num_params"] <= max_params]) > 0:
        +        tiny_models.append(model_name)
        +
        +print(tiny_models)
        +# ['mnasnet0_5', 'mnasnet0_75', 'mnasnet1_0', 'mobilenet_v2', ...]
        +
        +model = get_model(tiny_models[0], weights="DEFAULT")
        +print(sum(x.numel() for x in model.state_dict().values()))
        +# 2239188
        +
        + +

        (Beta) New Video Classification Models

        + +

We added two new video classification models, MViT and S3D. MViT is a state-of-the-art video classification transformer model which has 80.757% accuracy on the Kinetics400 dataset, while S3D is a relatively small model with good accuracy for its size. These models can be used as follows:

        + +
        import torch
        +from torchvision.models.video import *
        +
        +video = torch.rand(3, 32, 800, 600)
        +model = mvit_v2_s(weights="DEFAULT")
        +# model = s3d(weights="DEFAULT")
        +model.eval()
+prediction = model(video)
        +
        + +

        Here is the table showing the accuracy of the new video classification models tested in the Kinetics400 dataset.

Model        Acc@1     Acc@5
mvit_v1_b    81.474    95.776
mvit_v2_s    83.196    96.36
s3d          83.582    96.64
        + +

        We would like to thank Haoqi Fan, Yanghao Li, Christoph Feichtenhofer and Wan-Yen Lo for their work on PyTorchVideo and their support during the development of the MViT model. We would like to thank Sophia Zhi for her contribution implementing the S3D model in torchvision.

        + +

        (Stable) New Architecture and Model Variants

        + +

        For Classification Models, we’ve added the Swin Transformer V2 architecture along with pre-trained weights for its tiny/small/base variants. In addition, we have added support for the MaxViT transformer. Here is an example on how to use the models:

        + +
        import torch
        +from torchvision.models import *
        +
        +image = torch.rand(1, 3, 224, 224)
        +model = swin_v2_t(weights="DEFAULT").eval()
        +# model = maxvit_t(weights="DEFAULT").eval()
        +prediction = model(image)
        +
        + +

        Here is the table showing the accuracy of the models tested on ImageNet1K dataset.

Model        Acc@1     Acc@1 change over V1    Acc@5     Acc@5 change over V1
swin_v2_t    82.072    +0.598                  96.132    +0.356
swin_v2_s    83.712    +0.516                  96.816    +0.456
swin_v2_b    84.112    +0.530                  96.864    +0.224
maxvit_t     83.700    -                       96.722    -
        + +

        We would like to thank Ren Pang and Teodor Poncu for contributing the 2 models to torchvision.

        + +

        (Stable) New Primitives & Augmentations

        + +

        In this release we’ve added the SimpleCopyPaste augmentation in our reference scripts and we up-streamed the PolynomialLR scheduler to PyTorch Core. We would like to thank Lezwon Castelino and Federico Pozzi for their contributions. We are continuing our efforts to modernize TorchVision by adding more SoTA primitives, Augmentations and architectures with the help of our community. If you are interested in contributing, have a look at the following issue.
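As a small illustration of the upstreamed scheduler (the model and optimizer below are placeholders):

import torch

model = torch.nn.Linear(10, 2)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)

# Decay the learning rate polynomially over 100 steps.
scheduler = torch.optim.lr_scheduler.PolynomialLR(optimizer, total_iters=100, power=2.0)

for _ in range(100):
    optimizer.step()
    scheduler.step()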

        + +

        Torch-TensorRT

        + +

        (Prototype) TensorRT with FX2TRT frontend

        + +

        Torch-TensorRT is the PyTorch integration for TensorRT, providing high performance inference on NVIDIA GPUs. Torch-TRT allows for optimizing models directly in PyTorch for deployment providing up to 6x performance improvement.

        + +

        Torch-TRT is an AoT compiler which ingests an nn.Module or TorchScript module, optimizes compatible subgraphs in TensorRT & leaves the rest to run in PyTorch. This gives users the performance of TensorRT, but the usability and familiarity of Torch.

        + +

        Torch-TensorRT is part of the PyTorch ecosystem, and was released as v1.0 in November ‘21. There are currently two distinct front-ends: Torchscript & FX. Each provides the same value proposition and underlying operation with the primary difference being the input & output formats (TS vs FX / Python).

        + +

The Torchscript front-end was included in v1.0 and should be considered stable. The FX front-end was first released in v1.2 and should be considered beta.

        + +

        Relevant Links:

        + + + +

        (Stable) Introducing Torch-TensorRT

        + +

Torch-TensorRT is an integration for PyTorch that leverages the inference optimizations of TensorRT on NVIDIA GPUs. It takes advantage of TensorRT optimizations, such as FP16 and INT8 reduced precision, graph optimization, and operation fusion, while offering a fallback to native PyTorch when TensorRT does not support the model subgraphs. Currently, two frontend paths exist in the library to convert a PyTorch model to a TensorRT engine: one through Torch Script (TS) and the other through the FX frontend. That is, a model is traced by either TS or FX into an IR graph and then converted to TensorRT from it.

        + +

        Learn more with our tutorial.

        + +

        TorchX

        + +

        TorchX 0.3 updates include a new list API, experiment tracking, elastic training and improved scheduler support. There’s also a new Multi-Objective NAS tutorial using TorchX + Ax.

        + +

        (Prototype) List

        + +

        The newly added list command and API allows you to list recently launched jobs and their statuses for a given scheduler directly from within TorchX.

        + +
          +
        • This removes the need for using secondary tools to list the jobs.
        • +
        • Full programmatic access to recent jobs for integration with custom tools.
        • +
        + +
        $ torchx list -s kubernetes
        +APP HANDLE                                                       APP STATUS
        +-----------------------------------------------            -----------------
        +kubernetes://torchx/default:train-f2nx4459p5crr   SUCCEEDED
        +
        + +

        Learn more with our documentation.

        + +

        (Prototype) Tracker

        + +

        TorchX Tracker is a new prototype library that provides a flexible and customizable experiment and artifact tracking interface. This allows you to track inputs and outputs for jobs across multiple steps to make it easier to use TorchX with pipelines and other external systems.

        + +
        from torchx import tracker
        +
        +app_run = tracker.app_run_from_env()
        +app_run.add_metadata(lr=lr, gamma=gamma) # hyper parameters
        +app_run.add_artifact("model", "storage://path/mnist_cnn.pt") # logs / checkpoints
        +app_run.add_source(parent_run_id, "model") # lineage
        +
        + +

        Example:

        + + + +

        (Prototype) Elastic Training and Autoscaling

        + +

        Elasticity on Ray and Kubernetes – automatic scale up of distributed training jobs when using a supported scheduler. Learn more with our documentation.

        + +

        (Prototype) Scheduler Improvements: IBM® Spectrum LSF

        + +

        Added prototype support for the IBM Spectrum LSF scheduler.

        + +

        (Beta) AWS Batch Scheduler

        + +

        The AWS Batch scheduler integration is now in beta.

        + + + +

        (Prototype) AnyPrecision Optimizer

        + +

        Drop in replacement for AdamW optimizer that reduces GPU memory, enables two main features:

        + +
          +
• Ability to successfully train the entire model pipeline in full BFloat16. Kahan summation ensures precision. This can improve training throughput, especially on huge models, through reduced memory and increased computation speed.
        • +
        • Ability to change the variance state to BFloat16. This can reduce overall memory required for model training with additional speed improvements.
        • +
        + +

        Find more information here.

        + +
        +
        +
        +
        + + +
        +
        +
        +
        +

        + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/new-library-updates-in-pytorch-2.0/index.html b/blog/new-library-updates-in-pytorch-2.0/index.html new file mode 100644 index 000000000000..adba26639d84 --- /dev/null +++ b/blog/new-library-updates-in-pytorch-2.0/index.html @@ -0,0 +1,845 @@ + + + + + + + + + + + + + New Library Updates in PyTorch 2.0 | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +

        March 15, 2023

        +

        + New Library Updates in PyTorch 2.0 +

        +
        +
        + +
        +
        +
        + +
        +

        + by + + Team PyTorch + +

        +

        Summary

        + +

        We are bringing a number of improvements to the current PyTorch libraries, alongside the PyTorch 2.0 release. These updates demonstrate our focus on developing common and extensible APIs across all domains to make it easier for our community to build ecosystem projects on PyTorch.

        + +

        Along with 2.0, we are also releasing a series of beta updates to the PyTorch domain libraries, including those that are in-tree, and separate libraries including TorchAudio, TorchVision, and TorchText. An update for TorchX is also being released as it moves to community supported mode. Please find the list of the latest stable versions and updates below.

        + +

        Latest Stable Library Versions (Full List)

TorchArrow 0.1.0    TorchRec 0.4.0      TorchVision 0.15
TorchAudio 2.0      TorchServe 0.7.1    TorchX 0.4.0
TorchData 0.6.0     TorchText 0.15.0    PyTorch on XLA Devices 1.14
        + +

        *To see prior versions or (unstable) nightlies, click on versions in the top left menu above ‘Search Docs’.

        + +

        TorchAudio

        + +

        [Beta] Data augmentation operators

        + +

        The release adds several data augmentation operators under torchaudio.functional and torchaudio.transforms:

        +
          +
        • torchaudio.functional.add_noise
        • +
        • torchaudio.functional.convolve
        • +
        • torchaudio.functional.deemphasis
        • +
        • torchaudio.functional.fftconvolve
        • +
        • torchaudio.functional.preemphasis
        • +
        • torchaudio.functional.speed
        • +
        • torchaudio.transforms.AddNoise
        • +
        • torchaudio.transforms.Convolve
        • +
        • torchaudio.transforms.Deemphasis
        • +
        • torchaudio.transforms.FFTConvolve
        • +
        • torchaudio.transforms.Preemphasis
        • +
        • torchaudio.transforms.Speed
        • +
        • torchaudio.transforms.SpeedPerturbation
        • +
        + +

        The operators can be used to synthetically diversify training data to improve the generalizability of downstream models.
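For example, here is a minimal sketch using one of the new operators (the tensors are synthetic; in practice the waveform would come from torchaudio.load):

import torch
import torchaudio.functional as F

waveform = torch.randn(1, 16_000)   # 1 second of (synthetic) audio at 16 kHz
noise = torch.randn(1, 16_000)
snr = torch.tensor([10.0])          # target signal-to-noise ratio in dB

# Mix noise into the waveform at the requested SNR.
noisy = F.add_noise(waveform, noise, snr)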

        + +

        For usage details, please refer to the functional and transform documentation and Audio Data Augmentation tutorial.

        + +

        [Beta] WavLM and XLS-R models

        + +

        The release adds two self-supervised learning models for speech and audio.

        + +
          +
        • WavLM that is robust to noise and reverberation.
        • +
        • XLS-R that is trained on cross-lingual datasets.
        • +
        + +

        Besides the model architectures, torchaudio also supports corresponding pre-trained pipelines:

        + +
          +
        • torchaudio.pipelines.WAVLM_BASE
        • +
        • torchaudio.pipelines.WAVLM_BASE_PLUS
        • +
        • torchaudio.pipelines.WAVLM_LARGE
        • +
        • torchaudio.pipelines.WAV2VEC_XLSR_300M
        • +
        • torchaudio.pipelines.WAV2VEC_XLSR_1B
        • +
        • torchaudio.pipelines.WAV2VEC_XLSR_2B
        • +
        + +

        For usage details, please refer to the factory function and pre-trained pipelines documentation.

        + +

        TorchRL

        + +

The initial release of torchrl includes several features that span the entire RL domain. TorchRL can already be used in online, offline, multi-agent, multi-task and distributed RL settings, among others. See below:

        + +

        [Beta] Environment wrappers and transforms

        + +

        torchrl.envs includes several wrappers around common environment libraries. This allows users to swap one library with another without effort. These wrappers build an interface between these simulators and torchrl:

        + +
          +
        • dm_control:
        • +
        • Gym
        • +
        • Brax
        • +
        • EnvPool
        • +
        • Jumanji
        • +
        • Habitat
        • +
        + +

        It also comes with many commonly used transforms and vectorized environment utilities that allow for a fast execution across simulation libraries. Please refer to the documentation for more detail.

        + +

        [Beta] Datacollectors

        + +

        Data collection in RL is made easy via the usage of single process or multiprocessed/distributed data collectors that execute the policy in the environment over a desired duration and deliver samples according to the user’s needs. These can be found in torchrl.collectors and are documented here.

        + +

        [Beta] Objective modules

        + +

        Several objective functions are included in torchrl.objectives, among which:

        + +
          +
        • A generic PPOLoss class and derived ClipPPOLoss and KLPPOLoss
        • +
        • SACLoss and DiscreteSACLoss
        • +
        • DDPGLoss
        • +
        • DQNLoss
        • +
        • REDQLoss
        • +
        • A2CLoss
        • +
        • TD3Loss
        • +
        • ReinforceLoss
        • +
        • Dreamer
        • +
        + +

        Vectorized value function operators also appear in the library. Check the documentation here.

        + +

        [Beta] Models and exploration strategies

        + +

        We provide multiple models, modules and exploration strategies. Get a detailed description in the doc.

        + +

        [Beta] Composable replay buffer

        + +

A composable replay buffer class is provided that can be used to store data in multiple contexts, including single- and multi-agent, on- and off-policy, and more. Components include:

        + +
          +
        • Storages (list, physical or memory-based contiguous storages)
        • +
        • Samplers (Prioritized, sampler without repetition)
        • +
        • Writers
        • +
        • Possibility to add transforms
        • +
        + +

        Replay buffers and other data utilities are documented here.

        + +

        [Beta] Logging tools and trainer

        + +

        We support multiple logging tools including tensorboard, wandb and mlflow.

        + +

        We provide a generic Trainer class that allows for easy code recycling and checkpointing.

        + +

        These features are documented here.

        + +

        TensorDict

        + +

        TensorDict is a new data carrier for PyTorch.

        + +

        [Beta] TensorDict: specialized dictionary for PyTorch

        + +

        TensorDict allows you to execute many common operations across batches of tensors carried by a single container. TensorDict supports many shape and device or storage operations, and can readily be used in distributed settings. Check the documentation to know more.
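A short sketch of the idea (shapes and key names are illustrative):

import torch
from tensordict import TensorDict

# One container carries a whole batch of named tensors.
data = TensorDict(
    {"observation": torch.randn(32, 4), "reward": torch.zeros(32, 1)},
    batch_size=[32],
)

sub = data[:8]           # indexing is applied to every entry at once
data = data.to("cpu")    # device/shape operations act on the whole container
print(sub["observation"].shape)  # torch.Size([8, 4])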

        + +

        [Beta] @tensorclass: a dataclass for PyTorch

        + +

        Like TensorDict, tensorclass provides the opportunity to write dataclasses with built-in torch features such as shape or device operations.

        + +

        [Beta] tensordict.nn: specialized modules for TensorDict

        + +

        The tensordict.nn module provides specialized nn.Module subclasses that make it easy to build arbitrarily complex graphs that can be executed with TensorDict inputs. It is compatible with the latest PyTorch features such as functorch, torch.fx and torch.compile.
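For instance, here is a minimal sketch of wrapping a plain nn.Module so that it reads from and writes to a TensorDict (module and key names are illustrative):

import torch
from tensordict import TensorDict
from tensordict.nn import TensorDictModule

policy = TensorDictModule(
    torch.nn.Linear(4, 2),
    in_keys=["observation"],   # read this entry from the TensorDict
    out_keys=["action"],       # write the module output back under this key
)

data = TensorDict({"observation": torch.randn(32, 4)}, batch_size=[32])
policy(data)                      # runs the module and adds the "action" entry
print(data["action"].shape)       # torch.Size([32, 2])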

        + +

        TorchRec

        + +

        [Beta] KeyedJaggedTensor All-to-All Redesign and Input Dist Fusion

        + +

        We observed performance regression due to a bottleneck in sparse data distribution for models that have multiple, large KJTs to redistribute.

        + +

To combat this, we altered the comms pattern to transport the minimum data required in the initial collective to support the collective calls for the actual KJT tensor data. Sending this data (the ‘splits’) in the initial collective means more data is transmitted over the comms stream overall, but the CPU is blocked for significantly shorter amounts of time, leading to better overall QPS.

        + +

        Furthermore, we altered the TorchRec train pipeline to group the initial collective calls for the splits together before launching the more expensive KJT tensor collective calls. This fusion minimizes the CPU blocked time as launching each subsequent input distribution is no longer dependent on the previous input distribution.

        + +

        With this feature, variable batch sizes are now natively supported across ranks. These features are documented here.

        + +

        TorchVision

        + +

        [Beta] Extending TorchVision’s Transforms to Object Detection, Segmentation & Video tasks

        + +

        TorchVision is extending its Transforms API! Here is what’s new:

        + +
          +
        • You can use them not only for Image Classification but also for Object Detection, Instance & Semantic Segmentation and Video Classification.
        • +
        • You can use new functional transforms for transforming Videos, Bounding Boxes and Segmentation Masks.
        • +
        + +

        Learn more about these new transforms from our docs, and submit any feedback in our dedicated issue.

        + +

        TorchText

        + +

        [Beta] Adding scriptable T5 and Flan-T5 to the TorchText library with incremental decoding support!

        + +

        TorchText has added the T5 model architecture with pre-trained weights for both the original T5 paper and Flan-T5. The model is fully torchscriptable and features an optimized multiheaded attention implementation. We include several examples of how to utilize the model including summarization, classification, and translation.

        + +

        For more details, please refer to our docs.

        + +

        TorchX

        + +

        TorchX is moving to community supported mode. More details will be coming in at a later time.

        + +
        +
        +
        +
        + + +
        +
        +
        +
        +

        + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/new-library-updates/index.html b/blog/new-library-updates/index.html new file mode 100644 index 000000000000..c07483e10cca --- /dev/null +++ b/blog/new-library-updates/index.html @@ -0,0 +1,843 @@ + + + + + + + + + + + + + New Library Updates in PyTorch 2.1 | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +

        October 04, 2023

        +

        + New Library Updates in PyTorch 2.1 +

        +
        +
        + +
        +
        +
        + +
        +

        + by + + Team PyTorch + +

        +

        Summary

        + +

        We are bringing a number of improvements to the current PyTorch libraries, alongside the PyTorch 2.1 release. These updates demonstrate our focus on developing common and extensible APIs across all domains to make it easier for our community to build ecosystem projects on PyTorch. 

        + +

        Along with 2.1, we are also releasing a series of beta updates to the PyTorch domain libraries including TorchAudio and TorchVision. Please find the list of the latest stable versions and updates below.

Latest Stable Library Versions (Full List)*

TorchArrow 0.1.0    TorchRec 0.5.0      TorchVision 0.16
TorchAudio 2.1      TorchServe 0.8.2    TorchX 0.5.0
TorchData 0.7.0     TorchText 0.16.0    PyTorch on XLA Devices 1.14
        + +

        *To see prior versions or (unstable) nightlies, click on versions in the top left menu above ‘Search Docs’.

        + +

        TorchAudio

        + +

        TorchAudio v2.1 introduces the following new features and backward-incompatible changes:

        + +

        [Beta] A new API to apply filter, effects and codec

        + +

        `torchaudio.io.AudioEffector` can apply filters, effects and encodings to waveforms in online/offline fashion. You can use it as a form of augmentation.

        + +

        Please refer to https://pytorch.org/audio/2.1/tutorials/effector_tutorial.html for the usage and examples.

        + +

        [Beta] Tools for Forced alignment

        + +

        New functions and a pre-trained model for forced alignment were added. `torchaudio.functional.forced_align` computes alignment from an emission and `torchaudio.pipelines.MMS_FA` provides access to the model trained for multilingual forced alignment in MMS: Scaling Speech Technology to 1000+ languages project.

        + +

        Please refer to https://pytorch.org/audio/2.1/tutorials/ctc_forced_alignment_api_tutorial.html for the usage of `forced_align` function, and https://pytorch.org/audio/2.1/tutorials/forced_alignment_for_multilingual_data_tutorial.html for how one can use `MMS_FA` to align transcript in multiple languages.

        + +

        [Beta] TorchAudio-Squim : Models for reference-free speech assessment

        + +

Model architectures and pre-trained models from the paper TorchAudio-Squim: Reference-less Speech Quality and Intelligibility measures in TorchAudio were added.

        + +

        You can use the pre-trained models `torchaudio.pipelines.SQUIM_SUBJECTIVE` and `torchaudio.pipelines.SQUIM_OBJECTIVE`. They can estimate the various speech quality and intelligibility metrics (e.g. STOI, wideband PESQ, Si-SDR, and MOS). This is helpful when evaluating the quality of speech generation models, such as Text-to-Speech (TTS).

        + +

        Please refer to https://pytorch.org/audio/2.1/tutorials/squim_tutorial.html for the details.

        + +

        [Beta] CUDA-based CTC decoder

        + +

        `torchaudio.models.decoder.CUCTCDecoder` performs CTC beam search in CUDA devices. The beam search is fast. It eliminates the need to move data from CUDA device to CPU when performing automatic speech recognition. With PyTorch’s CUDA support, it is now possible to perform the entire speech recognition pipeline in CUDA.

        + +

        Please refer to https://pytorch.org/audio/2.1/tutorials/asr_inference_with_cuda_ctc_decoder_tutorial.html for the detail.

        + +

        [Prototype] Utilities for AI music generation

        + +

        We are working to add utilities that are relevant to music AI. Since the last release, the following APIs were added to the prototype.

        + +

        Please refer to respective documentation for the usage.

        + + +

        New recipes for training models

        + +

        Recipes for Audio-visual ASR, multi-channel DNN beamforming and TCPGen context-biasing were added.

        + +

        Please refer to the recipes

        + + +

        Update to FFmpeg support

        + +

The version of supported FFmpeg libraries was updated. TorchAudio v2.1 works with FFmpeg 6, 5 and 4.4. Support for 4.3, 4.2 and 4.1 has been dropped.

        + +

        Please refer to https://pytorch.org/audio/2.1/installation.html#optional-dependencies for the detail of the new FFmpeg integration mechanism.

        + +

        Update to libsox integration

        + +

TorchAudio now depends on libsox being installed separately from torchaudio. The Sox I/O backend no longer supports file-like objects. (These are supported by the FFmpeg backend and soundfile.)

        + +

        Please refer to https://pytorch.org/audio/2.1/installation.html#optional-dependencies for the details.

        + +

        TorchRL

        + +

        Our RLHF components make it easy to build an RLHF training loop with limited RL knowledge. TensorDict enables an easy interaction between datasets (eg, HF datasets) and RL models. The new algorithms we provide deliver a wide range of solutions for offline RL training, which is more data efficient.

        + +

        Through RoboHive and IsaacGym, TorchRL now provides a built-in interface with hardware (robots), tying training at scale with policy deployment on device. Thanks to SMAC, VMAS, and PettingZoo and related MARL-oriented losses, TorchRL is now fully capable of training complex policies in multi-agent settings.

        + +

        New algorithms

        +
          +
        • [BETA] We integrate some RLHF components and examples: we provide building blocks for data formatting in RL frameworks, reward model design, specific transforms that enable efficient learning (eg. KL correction) and training scripts
        • +
• [Stable] New algorithms include Decision Transformers, CQL, and multi-agent losses such as MAPPO and QMixer.
• New features: [Stable] New transforms such as Visual Cortex 1 (VC1), a foundational model for RL.
        • +
• We widened the panel of libraries covered by TorchRL:  +
            +
          • [Beta] IsaacGym, a powerful GPU-based simulator that allows interaction and rendering of thousands of vectorized environments by NVIDIA.
          • +
          • [Stable] PettingZoo, a multi-agent library by the Farama Foundation.
          • +
          • [Stable] SMAC-v2, the new Starcraft Multi-agent simulator
          • +
          • [Stable] RoboHive, a collection of environments/tasks simulated with the MuJoCo physics engine.
          • +
          +
        • +
        + +

        Performance improvements

        + +

        We provide faster data collection through refactoring and integration of SB3 and Gym asynchronous environments execution. We also made our value functions faster to execute.

        + +

        TorchRec

        + +

        [Prototype] Zero Collision / Managed Collision Embedding Bags

        + +

A common constraint in Recommender Systems is that the sparse id input range is larger than the number of embeddings the model can learn for a given parameter size. To resolve this issue, the conventional solution is to hash sparse ids into the same size range as the embedding table. This will ultimately lead to hash collisions, with multiple sparse ids sharing the same embedding space. We have developed a performant alternative algorithm that attempts to address this problem by tracking the N most common sparse ids and ensuring that they have a unique embedding representation. The module is defined here and an example can be found here.

        + +

        [Prototype] UVM Caching - Prefetch Training Pipeline

        + +

For tables where on-device memory is insufficient to hold the entire embedding table, it is common to leverage a caching architecture where part of the embedding table is cached on device and the full embedding table is on host memory (typically DDR SDRAM). However, in practice, cache misses are common and hurt performance due to the relatively high latency of going to host memory. Building on TorchRec’s existing data pipelining, we developed a new Prefetch Training Pipeline to avoid these cache misses by prefetching the relevant embeddings for the upcoming batch from host memory, effectively eliminating cache misses in the forward path.

        + +

        TorchVision 

        +

        Transforms and augmentations

        + +

        Major speedups

        + +

        The new transforms in torchvision.transforms.v2 are now 10%-40% faster than before! This is mostly achieved thanks to 2X-4X improvements made to v2.Resize(), which now supports native uint8 tensors for Bilinear and Bicubic modes. Output results are also now closer to PIL’s! Check out our performance recommendations to learn more.
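Here is a minimal example of the uint8 path (the image is random and the sizes are arbitrary):

import torch
from torchvision.transforms import v2

img = torch.randint(0, 256, (3, 480, 640), dtype=torch.uint8)  # a fake CHW uint8 image
resize = v2.Resize(size=(224, 224), antialias=True)            # bilinear interpolation by default
out = resize(img)
print(out.shape, out.dtype)  # torch.Size([3, 224, 224]) torch.uint8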

        + +

        Additionally, torchvision now ships with libjpeg-turbo instead of libjpeg, which should significantly speed-up the jpeg decoding utilities (read_image, decode_jpeg), and avoid compatibility issues with PIL.

        + +

        CutMix and MixUp

        + +

        Long-awaited support for the CutMix and MixUp augmentations is now here! Check our tutorial to learn how to use them.
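As a quick sketch of how they are applied to a whole batch (the batch size, image size, and number of classes below are arbitrary):

import torch
from torchvision.transforms import v2

NUM_CLASSES = 10
cutmix_or_mixup = v2.RandomChoice([v2.CutMix(num_classes=NUM_CLASSES),
                                   v2.MixUp(num_classes=NUM_CLASSES)])

images = torch.rand(8, 3, 224, 224)               # a batch of float images
labels = torch.randint(0, NUM_CLASSES, (8,))
images, labels = cutmix_or_mixup(images, labels)  # labels become soft, shape (8, NUM_CLASSES)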

        + +

        Towards stable V2 transforms

        + +

        In the previous release 0.15 we BETA-released a new set of transforms in torchvision.transforms.v2 with native support for tasks like segmentation, detection, or videos. We have now stabilized the design decisions of these transforms and made further improvements in terms of speedups, usability, new transforms support, etc.

        + +

        We’re keeping the torchvision.transforms.v2 and torchvision.tv_tensors namespaces as BETA until 0.17 out of precaution, but we do not expect disruptive API changes in the future.

        + +

        Whether you’re new to Torchvision transforms, or you’re already experienced with them, we encourage you to start with Getting started with transforms v2 in order to learn more about what can be done with the new v2 transforms.

        + +

        Browse our main docs for general information and performance tips. The available transforms and functionals are listed in the API reference. Additional information and tutorials can also be found in our example gallery, e.g. Transforms v2: End-to-end object detection/segmentation example or How to write your own v2 transforms.

        + +

        [BETA] MPS support

        + +

        The nms and roi-align kernels (roi_align, roi_pool, ps_roi_align, ps_roi_pool) now support MPS. Thanks to Li-Huai (Allan) Lin for this contribution!
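A small sketch of what this enables on Apple silicon (falling back to CPU elsewhere; the boxes and scores are made up):

import torch
from torchvision.ops import nms

device = "mps" if torch.backends.mps.is_available() else "cpu"
boxes = torch.tensor([[0., 0., 10., 10.],
                      [1., 1., 11., 11.],
                      [50., 50., 60., 60.]], device=device)
scores = torch.tensor([0.9, 0.8, 0.7], device=device)
keep = nms(boxes, scores, iou_threshold=0.5)  # indices of the boxes kept after suppression
print(keep)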

        + +

        TorchX

        + +

        Schedulers

        • [Prototype] Kubernetes MCAD Scheduler: Integration for easily scheduling jobs on Multi-Cluster-Application-Dispatcher (MCAD)
        • AWS Batch
          • Add privileged option to enable running containers on EFA enabled instances with elevated networking permissions
        + +

        TorchX Tracker

        • [Prototype] MLFlow backend for TorchX Tracker: in addition to the fsspec-based tracker, TorchX can use an MLFlow instance to track metadata/experiments
        + +

        Components

        • dist.spmd component to support Single-Process-Multiple-Data style applications
        + +

        Workspace

        • Add ability to access image and workspace path from Dockerfile while building docker workspace
        + +

        The release includes a number of other bugfixes.

        + +

        To learn more about TorchX, visit https://pytorch.org/torchx/latest/

        + +

        TorchText and TorchData

        + +

        As of September 2023 we have paused active development of TorchText and TorchData as we re-evaluate how we want to serve the needs of the community in this space.

diff --git a/blog/one-year-pytorch/index.html b/blog/one-year-pytorch/index.html
new file mode 100644
index 000000000000..e5071fd0dfbd
--- /dev/null
+++ b/blog/one-year-pytorch/index.html
@@ -0,0 +1,667 @@

One Year of PyTorch Foundation | PyTorch

        September 12, 2023

        One Year of PyTorch Foundation

        by Team PyTorch

        It’s been one year since we announced the formation of the PyTorch Foundation! 🎉

        + +

        In its inaugural year, the PyTorch Foundation made a significant impact by launching PyTorch 2.0, growing contributors and adding new member companies. We’re grateful to our founding members for their support to move the foundation forward.

        + +

        A few milestones in the past year include:

        + +

        💻 Over 600,000 repositories on GitHub
        ✅ 60% of AI implementations choosing PyTorch
        📈 More than 20% year over year growth in new repositories
        🤝 Over 12,000 commits since last year

        + +

        And a look at what the foundation has been up to this past year:

        + +

        PyTorch project timeline

        + +

        We look forward to growing our community for the years to come through supporting our contributors, democratizing the AI field, and creating new innovations.

        + +

        We invite you to join us at this year’s PyTorch Conference on October 16-17 in San Francisco. Conference registration is filling up quickly, so take advantage of your chance to be part of this exciting event.

        + +

        Join us to stay informed about the latest announcements and have the opportunity to connect with both the founding members and new additions to the PyTorch community.

        + +

        With thanks and gratitude,
        The PyTorch Foundation Team

diff --git a/blog/openmined-and-pytorch-launch-fellowship-funding-for-privacy-preserving-ml/index.html b/blog/openmined-and-pytorch-launch-fellowship-funding-for-privacy-preserving-ml/index.html
new file mode 100644
index 000000000000..8dfde8497443
--- /dev/null
+++ b/blog/openmined-and-pytorch-launch-fellowship-funding-for-privacy-preserving-ml/index.html
@@ -0,0 +1,690 @@

OpenMined and PyTorch partner to launch fellowship funding for privacy-preserving ML community | PyTorch

        by Andrew Trask (OpenMined/U.Oxford), Shubho Sengupta, Laurens van der Maaten, Joe Spisak

        Many applications of machine learning (ML) pose a range of security and privacy challenges. In particular, users may not be willing or allowed to share their data, which prevents them from taking full advantage of ML platforms like PyTorch. To take the field of privacy-preserving ML (PPML) forward, OpenMined and PyTorch are announcing plans to jointly develop a combined platform to accelerate PPML research as well as new funding for fellowships.

        + +

        There are many techniques attempting to solve the problem of privacy in ML, each at various levels of maturity. These include (1) homomorphic encryption, (2) secure multi-party computation, (3) trusted execution environments, (4) on-device computation, (5) federated learning with secure aggregation, and (6) differential privacy. Additionally, a number of open source projects implementing these techniques were created with the goal of enabling research at the intersection of privacy, security, and ML. Among them, PySyft and CrypTen have taken an “ML-first” approach by presenting an API that is familiar to the ML community, while masking the complexities of privacy and security protocols. We are excited to announce that these two projects are now collaborating closely to build a mature PPML ecosystem around PyTorch.

        + +

        Additionally, to bolster this ecosystem and take the field of privacy preserving ML forward, we are also calling for contributions and supporting research efforts on this combined platform by providing funding to support the OpenMined community and the researchers that contribute, build proofs of concepts and desire to be on the cutting edge of how privacy-preserving technology is applied. We will provide funding through the RAAIS Foundation, a non-profit organization with a mission to advance education and research in artificial intelligence for the common good. We encourage interested parties to apply to one or more of the fellowships listed below.

        + +

        Tools Powering the Future of Privacy-Preserving ML

        + +

        The next generation of privacy-preserving open source tools enable ML researchers to easily experiment with ML models using secure computing techniques without needing to be cryptography experts. By integrating with PyTorch, PySyft and CrypTen offer familiar environments for ML developers to research and apply these techniques as part of their work.

        + +

        PySyft is a Python library for secure and private ML developed by the OpenMined community. It is a flexible, easy-to-use library that makes secure computation techniques like multi-party computation (MPC) and privacy-preserving techniques like differential privacy accessible to the ML community. It prioritizes ease of use and focuses on integrating these techniques into end-user use cases like federated learning with mobile phones and other edge devices, encrypted ML as a service, and privacy-preserving data science.

        + +

        CrypTen is a framework built on PyTorch that enables private and secure ML for the PyTorch community. It is the first step along the journey towards a privacy-preserving mode in PyTorch that will make secure computing techniques accessible beyond cryptography researchers. It currently implements secure multiparty computation with the goal of offering other secure computing backends in the near future. Other benefits to ML researchers include:

        + +
        • It is ML first and presents secure computing techniques via a CrypTensor object that looks and feels exactly like a PyTorch Tensor. This allows the user to use automatic differentiation and neural network modules akin to those in PyTorch (see the short sketch after this list).
        • The framework focuses on scalability and performance and is built with real-world challenges in mind.
        + +
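As a rough sketch of what that looks like in practice (a toy example, not taken from the announcement), encrypted tensors are created and combined much like ordinary PyTorch tensors:

import torch
import crypten

crypten.init()

x = crypten.cryptensor(torch.tensor([1.0, 2.0, 3.0]))  # secret-shared CrypTensor
w = crypten.cryptensor(torch.tensor([0.5, 0.5, 0.5]))
y = (x * w).sum()            # arithmetic runs under secure multi-party computation
print(y.get_plain_text())    # decrypts to tensor(3.)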

        The focus areas for CrypTen and PySyft are naturally aligned and complement each other. The former focuses on building support for various secure and privacy preserving techniques on PyTorch through an encrypted tensor abstraction, while the latter focuses on end user use cases like deployment on edge devices and a user friendly data science platform.

        + +

        Working together will enable PySyft to use CrypTen as a backend for encrypted tensors. This can lead to an increase in performance for PySyft and the adoption of CrypTen as a runtime by PySyft’s userbase. In addition to this, PyTorch is also adding cryptography friendly features such as support for cryptographically secure random number generation. Over the long run, this allows each library to focus exclusively on its core competencies while enjoying the benefits of the synergistic relationship.

        + +

        New Funding for OpenMined Contributors

        + +

        We are especially excited to announce that the PyTorch team has invested $250,000 to support OpenMined in furthering the development and proliferation of privacy-preserving ML. This gift will be facilitated via the RAAIS Foundation and will be available immediately to support paid fellowship grants for the OpenMined community.

        + +

        How to get involved

        + +

        Thanks to the support from the PyTorch team, OpenMined is able to offer three different opportunities for you to participate in the project’s development. Each of these fellowships furthers our shared mission to lower the barrier-to-entry for privacy-preserving ML and to create a more privacy-preserving world.

        + +

        Core PySyft CrypTen Integration Fellowships

        + +

        During these fellowships, we will integrate CrypTen as a supported backend for encrypted computation in PySyft. This will allow for the high-performance, secure multi-party computation capabilities of CrypTen to be used alongside other important tools in PySyft such as differential privacy and federated learning. For more information on the roadmap and how to apply for a paid fellowship, check out the project’s call for contributors.

        + +

        Federated Learning on Mobile, Web, and IoT Devices

        + +

        During these fellowships, we will be extending PyTorch with the ability to perform federated learning across mobile, web, and IoT devices. To this end, a PyTorch front-end will be able to coordinate across federated learning backends that run in Javascript, Kotlin, Swift, and Python. Furthermore, we will also extend PySyft with the ability to coordinate these backends using peer-to-peer connections, providing low latency and the ability to run secure aggregation as a part of the protocol. For more information on the roadmap and how to apply for a paid fellowship, check out the project’s call for contributors.

        + +

        Development Challenges

        + +

        Over the coming months, we will issue regular open competitions for increasing the performance and security of the PySyft and PyGrid codebases. For performance-related challenges, contestants will compete (for a cash prize) to make a specific PySyft demo (such as federated learning) as fast as possible. For security-related challenges, contestants will compete to hack into a PyGrid server. The first to demonstrate their ability will win the cash bounty! For more information on the challenges and to sign up to receive emails when each challenge is opened, sign up here.

        + +

        To apply, select one of the above projects and identify a role that matches your strengths!

        + +

        Cheers,

        + +

        Andrew, Laurens, Joe, and Shubho

diff --git a/blog/optimize-llms/index.html b/blog/optimize-llms/index.html
new file mode 100644
index 000000000000..254165119dc7
--- /dev/null
+++ b/blog/optimize-llms/index.html
@@ -0,0 +1,790 @@

Optimize LLMs for Efficiency & Sustainability | PyTorch

        February 19, 2025

        Optimize LLMs for Efficiency & Sustainability

        by Zach Lasiuk, Arm

        The rapid growth of large language model (LLM) applications is linked to rapid growth in energy demand. According to the International Energy Agency (IEA), data center electricity consumption is projected to roughly double by 2026 primarily driven by AI. This is due to the energy-intensive training requirements for massive LLMs – however, the increase in AI Inferencing workloads also plays a role. For example, compared with traditional search queries, a single AI inference can consume about 10x more energy.

        + +

        As developers, we directly affect how energy-intensive our AI solution is. There are technical decisions we can take to help make our AI solution more environmentally sustainable. Minimizing compute to deliver LLM solutions is not the only requirement for creating sustainable AI use. For example, systemic changes, such as policy interventions may be needed, but utilizing energy efficient solutions is an important factor and is an impactful intervention we can adopt right away.

        + +

        With that said, minimizing your LLM inference cloud compute requirements also leads to reducing your cloud bill and makes your app more energy efficient, creating a win-win situation. In this blog, we will take you through the steps to creating an LLM chatbot by optimizing and deploying a Llama 3.1 model on PyTorch, quantifying the computational efficiency benefits of specific architecture decisions.

        + +

        What will we evaluate?

        + +

        For this blog, our goal is to create an immersive fantasy storytelling app where users enter a fantasy world by chatting with a Generative AI. The first location is the land of Wicked, allowing people to role-play walking around the Emerald City and observe the sights and scenes in real-time. We’ll implement this via a chatbot and a custom system prompt.

        + +

        We will be evaluating LLM performance on CPUs. You can see the advantages of CPU vs GPU inference here. In general, leveraging CPUs in the cloud for LLM inference is a great choice for models around 10B parameters or less like the Llama series.

        + +

        We will also be using Arm-based CPUs, specifically the AWS Graviton series. Based on studies, the Arm-based Graviton3 server can provide 67.6 percent lower workload carbon intensity built in. While this study was based on a simulation, it is an excellent start to showing the possibilities for minimizing our app’s energy requirements.

        + +

        First, you’ll see how to run a simple LLM chatbot on PyTorch, then explore three techniques to optimize your application for computational efficiency:

        + +
        1. Model optimization: Utilizing 4-bit quantization and added KleidiAI kernels.
        2. Shortcut optimization: Implementing a vector database to handle common queries.
        3. Architecture optimization: Adopting a serverless architecture.
        + +

        Let’s get started.

        + +

        Run Llama-3.1 via PyTorch on AWS Graviton4

        + +

        To maximize energy efficiency, we will only use the minimum server resources needed to support this LLM chatbot. For this Llama-3.1 8-billion-parameter model, 16 cores, 64GB of RAM, and 50GB of disk space are required. We will use the r8g.4xlarge Graviton4 instance running Ubuntu 24.04, as it meets these specifications.

        + +

        Spin up this EC2 instance, connect to it, and start installing the requirements:

        + +
            sudo apt-get update
        +    sudo apt install gcc g++ build-essential python3-pip python3-venv google-perftools -y
        +
        + +

        Then install Torchchat, the library developed by the PyTorch team that enables running LLMs across devices:

        + +
            git clone https://github.com/pytorch/torchchat.git
        +    cd torchchat
        +    python3 -m venv .venv
        +    source .venv/bin/activate
        +    ./install/install_requirements.sh 
        +
        + +

        Next, install the Llama-3.1-8b model from Hugging Face through the CLI. You will first need to make a Hugging Face access token on your HF account. This will download the 16GB model to your instance, which may take a few minutes:

        + +
            pip install -U "huggingface_hub[cli]"
        +    huggingface-cli login
        +    	<enter your access token when prompted>
        +    python torchchat.py export llama3.1 --output-dso-path exportedModels/llama3.1.so --device cpu --max-seq-length 1024
        +
        + +

        Now you are ready to run the LLM model, adding a system prompt to be a guiding storyteller in the land of Wicked:

        + +
            LD_PRELOAD=/usr/lib/aarch64-linux-gnu/libtcmalloc.so.4 TORCHINDUCTOR_CPP_WRAPPER=1 TORCHINDUCTOR_FREEZING=1 OMP_NUM_THREADS=16 python torchchat.py generate llama3.1 --device cpu --chat
        +
        + +

        Type ‘y’ to enter a system prompt and enter the following prompt:

        + +

        You are the guiding storyteller for a fantasy adventure application. Immerse users in the enchanting world of Wicked, guiding them through interactive, real-time experiences in the Emerald City. Describe vivid sights, dynamic scenes, and engage users in storytelling that feels alive and responsive. Allow users to make choices that shape their journey while maintaining the magical tone of the Wicked universe.

        + +

        Then enter your user query:

        + +

        I walk through the Emerald City gates and look up

        + +

        The output will show on the screen, taking about 7 seconds to generate the first token with less than 1 token per second.

        + +

        terminal

        + +

        This example took 245 seconds, or 4 minutes, to generate its complete reply—not very fast. The first optimization we’ll look at will speed up the LLM generation, reducing its computational footprint.

        + +

        Optimization 1: KleidiAI and Quantization

        + +

        Several optimizations are possible from the basic implementation above. The simplest and quickest one to do is to quantize the model from FP16 to INT4. This approach trades off some accuracy while cutting the model size from 16GB to about 4GB, increasing the inference speed in the process.

        + +

        Another common optimization comes in leveraging TorchAO (Torch Architecture Optimization), the PyTorch library that works seamlessly with TorchChat to enhance model performance through various quantization and sparsity methods.

        + +

        Lastly, we’ll use Arm KleidiAI optimizations. These are micro-kernels written in assembly that lead to significant performance improvements for LLM inference on Arm CPUs. You can read more about how KleidiAI kernels work if interested.

        + +

        To implement these optimizations, spin up a fresh EC2 instance and follow the instructions on how to run a Large Language Model (LLM) chatbot with PyTorch. When ready, run the model and enter the same system prompt and user query as above. You’ll get results that significantly speed up the inference: Less than 1 second to first token, and about 25 tokens per second.

        + +

        This cuts the inference time from 245 seconds to about 10 seconds. This results in less power draw from your server, as it spends more time idle instead of running a power-hungry inference. All else being equal, this is a more carbon-friendly solution than the non-optimized app. The next two approaches go beyond model inference optimization, modifying the solution architecture to further reduce computational load.

        + +

        Optimization 2: FAISS to match database for common questions

        + +

        As stated in the introduction, model inferences are typically more computationally expensive than other search techniques. What if you could automatically respond to common user queries without performing an LLM inference? Using a query/response database is an option to bypass LLM inference and respond efficiently. For this interactive storytelling app, you can imagine common questions about specific characters, the world itself, and rules about what the chatbot is/is not capable of that can have pre-generated answers.

        + +

        However, a traditional exact-match database isn’t sufficient because users can phrase the same query in many ways. Questions about the chatbot’s capabilities could all invite the same answer but be phrased differently:

        + +
        • “What are you capable of?”
        • “Tell me what you can do.”
        • “How can I interact with you?”
        + +

        Implementing semantic search solves this issue by matching a user’s query to the most relevant pre-generated answer by understanding the user’s intent. The FAISS library is a great option to implement semantic search.
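Below is a minimal sketch of that shortcut path; the embedding model, similarity threshold, and canned answers are placeholders rather than part of the actual app:

import faiss
from sentence_transformers import SentenceTransformer  # assumed embedding model

canned = {
    "What are you capable of?": "I can narrate your walk through the Emerald City...",
    "Tell me about the Emerald City.": "Green spires rise above the bustling streets...",
}
model = SentenceTransformer("all-MiniLM-L6-v2")
keys = list(canned.keys())
key_vecs = model.encode(keys, normalize_embeddings=True).astype("float32")

index = faiss.IndexFlatIP(key_vecs.shape[1])  # inner product == cosine on normalized vectors
index.add(key_vecs)

def shortcut_answer(query, threshold=0.8):
    q = model.encode([query], normalize_embeddings=True).astype("float32")
    scores, ids = index.search(q, 1)
    if scores[0][0] >= threshold:
        return canned[keys[ids[0][0]]]  # cheap path: no LLM inference needed
    return None                         # fall through to the LLM

print(shortcut_answer("Tell me what you can do."))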

        + +

        The computational savings of this approach depends on three factors:

        + +
        1. Percentage of user queries that can be serviced by semantic search instead of LLM.
        2. Computational cost of running the LLM inference.
        3. Computational cost of running the semantic search.
        + +

        With the savings equation being:

        + +
            Computational_savings = (% of queries) * (LLM_cost – search_cost).
        +
        + +

        This type of architecture makes sense in a few situations. One is if your system has common queries with many repeat questions. Another is large-scale systems with hundreds of thousands of incoming queries, where small percentage savings add up to meaningful changes. The last is when your LLM inference is very computationally expensive compared to the search cost, particularly with larger-parameter models.
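To make the equation concrete, here is a back-of-the-envelope calculation with made-up numbers (they are placeholders, not measurements from this app):

# Illustrative figures only
pct_queries_shortcut = 0.30   # 30% of queries answered by semantic search
llm_cost = 120.0              # e.g., CPU core-seconds per LLM inference
search_cost = 0.5             # CPU core-seconds per semantic search

savings_per_query = pct_queries_shortcut * (llm_cost - search_cost)
print(savings_per_query)      # 35.85 core-seconds saved per incoming query, on average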

        + +

        The final optimization approach is transitioning from server to serverless.

        + +

        Optimization 3: Serverless approach

        + +

        Serverless architectures are popular for many reasons, one being that you only pay for active compute time, eliminating the cost of idle servers. Idle servers require a non-trivial amount of power to keep on, wasting energy while waiting.

        + +

        This cost efficiency translates into being an inherently more environmentally friendly architecture, as it reduces wasteful energy consumption. Further, multiple applications share underlying physical infrastructure, improving resource efficiency.

        + +

        To set up your own serverless chatbot, you need to first containerize the quantized Llama-3.1-8b with TorchChat, TorchAO, and Arm KleidiAI optimizations with a python script containing a Lambda entry function lambda_handler. One deployment option is to upload your container to AWS ECR and attach the container to your Lambda function. Then set up an API Gateway WebSocket or similar to interact with your Lambda through an API.
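A skeletal example of that entry point is shown below; the helper module and model path are hypothetical, and a production handler would also stream tokens back through the WebSocket connection:

# handler.py -- minimal sketch of a Lambda entry point for the containerized chatbot
import json

from my_chatbot import load_quantized_llama, generate_reply  # hypothetical helpers

# Load once per (warm) execution environment, outside the handler
MODEL = load_quantized_llama("/opt/models/llama-3.1-8b-int4")

def lambda_handler(event, context):
    body = json.loads(event.get("body", "{}"))
    reply = generate_reply(MODEL, body.get("message", ""))
    return {"statusCode": 200, "body": json.dumps({"reply": reply})}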

        + +

        There are two notable limitations to using a serverless architecture to host your LLM, the first being token generation speed. Recall that the server-based approach delivered about 25 tokens/second with KleidiAI optimizations. The serverless approach is an order of magnitude slower, which we measured at around 2.5 tokens/second. This limitation mainly results from Lambda functions deploying onto Graviton2 servers. When deployment moves to CPUs with more SIMD channels, like Graviton3 and Graviton4, the tokens/second should increase over time. Learn more about architecture optimizations introduced in Graviton3 via the Arm Neoverse-V1 CPU here.

        + +

        This slower speed restricts the viable use cases for serverless LLM architectures, but there are certain cases where it can be seen as an advantage. In our use case of interactive storytelling, slowly revealing information creates a sense of immersion, building anticipation and mimicking real-time narration. Other use cases include:

        + +
        • Guided meditation apps with slow, relaxing word delivery
        • A virtual friend engaging in thoughtful or therapeutic conversation
        • Poetry generation or interactive art, where slow delivery creates a contemplative aesthetic
        + +

        Users may have a better experience with slower token generation in the right applications. When prioritizing a more sustainable solution, restrictions end up becoming strengths. As an analogy, a common critique of modern movies today is that their overreliance on visual effects leads to fewer compelling storylines vs older movies. The cost restrictions of VFX meant older movies had to craft captivating dialog, leveraging skillful camera angles and character positioning to fully engage viewers. Similarly, focusing on sustainable AI architectures can lead to more engaging, immersive experiences when done thoughtfully.

        + +

        The second serverless limitation on LLM inferences is the cold-start time of about 50 seconds. If implemented poorly, a user waiting 50 seconds with no alternative will likely leave the app. You can turn this limitation into a feature in our Wicked-based experience with several design tricks:

        + +
        • Create a “prologue experience” where you guide users through hard-coded questions and answers, priming them for where they will land in Emerald City and collecting input to shape their upcoming experience.
        • Make the waiting period a countdown timer, revealing hard-coded text snippets of the story or world-building. A character, like the wizard, could communicate with the user with fragmented lines to build suspense and prime the user into the right mindset.
        • Create an audio intro with music from the movie or musical, along with rotating visuals to draw users into the atmosphere of the Wicked world.
        + +

        Thinking outside the box

        + +

        Implementing a sustainability-minded solution architecture includes and goes beyond optimizing your AI inferences. Understand how users will interact with your system, and right-size your implementation accordingly. Always optimizing for fast tokens per second or time to first token will hide opportunities for engaging features.

        + +

        With that said, you should be leveraging straightforward optimizations when possible. Using TorchAO and Arm KleidiAI micro-kernels are great ways to speed up your LLM chatbot. By combining creative solution architectures and optimizing where possible, you can build more sustainable LLM-based applications. Happy coding!

diff --git a/blog/optimized-pytorch-w-graviton/index.html b/blog/optimized-pytorch-w-graviton/index.html
new file mode 100644
index 000000000000..3b5b4bbb69d9
--- /dev/null
+++ b/blog/optimized-pytorch-w-graviton/index.html
@@ -0,0 +1,808 @@

Optimized PyTorch 2.0 Inference with AWS Graviton processors | PyTorch

        by Sunita Nadampalli from AWS & Ankith Gunapal from Meta

        New generations of CPUs offer significant performance improvement in machine learning (ML) inference due to specialized built-in instructions. Combined with their flexibility, high speed of development, and low operating cost, these general-purpose processors offer an alternative ML inference solution to other existing hardware solutions.

        + +

        AWS, Arm, Meta, and others helped optimize the performance of PyTorch 2.0 inference for Arm-based processors. As a result, we are delighted to announce that Arm-based AWS Graviton instance inference performance for PyTorch 2.0 is up to 3.5 times the speed for ResNet-50 compared to the previous PyTorch release, and up to 1.4 times the speed for BERT, making Graviton-based instances the fastest compute optimized instances on AWS for these models (see the following graph).

        + +

        Relative speed improvement achieved by upgrading PyTorch to 2.0

        + +

        Image 1: Relative speed improvement achieved by upgrading from PyTorch version 1.13 to 2.0 (higher is better). The performance is measured on c7g.4xlarge instances.

        + +

        As shown in the next graph, we measured up to 50% cost savings for PyTorch inference with Graviton3-based c7g instances across Torch Hub ResNet-50 and multiple Hugging Face models compared to comparable x86-based compute optimized Amazon EC2 instances. For that graph, we first measured the cost per million inference for the five instance types. Then, we normalized the cost per million inference results to a c5.4xlarge instance, which is the baseline measure of “1” on the Y-axis of the chart.

        + +

        Relative cost of PyTorch inference running on different AWS instances

        + +

        Image 2: Relative cost of PyTorch inference running on different AWS instances (lower is better).
        Source: AWS ML Blog on Graviton PyTorch2.0 inference performance.

        + +

        Similar to the preceding inference cost comparison graph, the following graph shows the model p90 latency for the same five instance types. We normalized the latency results to the c5.4xlarge instance, which is the baseline measure of “1” on the Y-axis of the chart. The c7g.4xlarge (AWS Graviton3) model inference latency is up to 50% better than the latencies measured on c5.4xlarge, c6i.4xlarge, and c6a.4xlarge.

        + +

        Relative latency (p90) of PyTorch inference running on different AWS instances

        + +

        Image 3: Relative latency (p90) of PyTorch inference running on different AWS instances (lower is better).
        Source: AWS ML Blog on Graviton PyTorch2.0 inference performance.

        + +

        Optimization details

        + +

        PyTorch supports Compute Library for the Arm® Architecture (ACL) GEMM kernels via the oneDNN backend (previously called “MKL-DNN”) for AArch64 platforms. The optimizations are primarily for PyTorch ATen CPU BLAS, ACL kernels for fp32 and bfloat16, and oneDNN primitive caching. There are no frontend API changes, so no changes are required at the application level to get these optimizations working on Graviton3-based instances.

        + +

        PyTorch level optimizations

        + +

        We extended the ATen CPU BLAS interface to accelerate more operators and tensor configurations via the oneDNN backend for the aarch64 platform. The following diagram highlights (in orange) the optimized components that improved PyTorch inference performance on the aarch64 platform.

        + +

        PyTorch software stack highlighting (in orange) the components optimized for inference performance improvement on AArch64 platform

        + +

        Image 4: PyTorch software stack highlighting (in orange) the components optimized for inference performance improvement on AArch64 platform

        + +

        ACL kernels and BFloat16 FPmath mode

        + +

        The ACL library provides Neon and SVE optimized GEMM kernels for both fp32 and bfloat16 formats: These kernels improve the SIMD hardware utilization and reduce the end to end inference latencies. The bfloat16 support in Graviton3 allows efficient deployment of models trained using bfloat16, fp32 and Automatic Mixed Precision (AMP). The standard fp32 models use bfloat16 kernels via oneDNN FPmath mode without model quantization. They provide up to two times faster performance compared to existing fp32 model inference without bfloat16 FPmath support. For more details on ACL GEMM kernel support, refer to Arm Compute Library github.
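As a quick sanity check (a small sketch, not from the original post), you can confirm that your wheel exposes the oneDNN backend these kernels route through, and set the fast-math mode from Python, which should have the same effect as the shell export shown later:

import os
os.environ["DNNL_DEFAULT_FPMATH_MODE"] = "BF16"  # set before any models are built

import torch
print(torch.backends.mkldnn.is_available())  # True when the oneDNN (MKL-DNN) backend is built in
print(torch.__config__.show())               # build configuration, including USE_MKLDNN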

        + +

        Primitive Caching

        + +

        The following call sequence diagram shows how ACL operators are integrated into oneDNN backend. As shown in the diagram, ACL objects are handled as oneDNN resources instead of the primitive objects. This is because the ACL objects are stateful and mutable. Since the ACL objects are handled as resource objects, they are not cacheable with the default primitive caching feature supported in oneDNN. We implemented primitive caching at ideep operator level for “convolution”, “matmul” and “inner product” operators to avoid redundant GEMM kernel initialization and tensor allocation overhead.

        + +

        Call sequence diagram showing how the Compute Library for the Arm® Architecture (ACL) GEMM kernels are integrated into oneDNN backend

        + +

        Image 5: Call sequence diagram showing how the Compute Library for the Arm® Architecture (ACL) GEMM kernels are integrated into oneDNN backend

        + +

        How to take advantage of the optimizations

        + +

        Install the PyTorch 2.0 wheel from the official repo and set environment variables to enable the additional optimizations.

        + +
        # Install Python
        +sudo apt-get update
        +sudo apt-get install -y python3 python3-pip
        +
        +# Upgrade pip3 to the latest version
        +python3 -m pip install --upgrade pip
        +
        +# Install PyTorch and extensions
        +python3 -m pip install torch
        +python3 -m pip install torchvision torchaudio torchtext
        +
        +# Turn on Graviton3 optimization
        +export DNNL_DEFAULT_FPMATH_MODE=BF16
        +export LRU_CACHE_CAPACITY=1024
        +
        + +

        Running an inference

        + +

        You can use PyTorch torchbench to measure the CPU inference performance improvements, or to compare different instance types.

        + +
        # Pre-requisite:
        +# pip install PyTorch2.0 wheels and set the above mentioned environment variables
        +
        +# Clone PyTorch benchmark repo
        +git clone https://github.com/pytorch/benchmark.git
        +
        +# Setup ResNet-50 benchmark
        +cd benchmark
        +python3 install.py resnet50
        +
        +# Install the dependent wheels
        +python3 -m pip install numba
        +
        +# Run ResNet-50 inference in jit mode. On successful completion of the inference runs,
        +# the script prints the inference latency and accuracy results
        +python3 run.py resnet50 -d cpu -m jit -t eval --use_cosine_similarity
        +
        + +

        Performance Analysis

        + +

        Now, we will analyze the inference performance of ResNet-50 on a Graviton3-based c7g instance using the PyTorch profiler. We run the code below with PyTorch 1.13 and PyTorch 2.0 and run the inference for a few iterations as a warmup before measuring the performance.

        + +
        # Turn on Graviton3 optimization
        +export DNNL_DEFAULT_FPMATH_MODE=BF16
        +export LRU_CACHE_CAPACITY=1024
        +
        + +
        import torch
        +from torchvision import models
        +sample_input = [torch.rand(1, 3, 224, 224)]
        +eager_model = models.resnet50(weights=models.ResNet50_Weights.DEFAULT)
        +model = torch.jit.script(eager_model, example_inputs=[sample_input, ])
        +
        +model = model.eval()
        +model = torch.jit.optimize_for_inference(model)
        +
        +with torch.no_grad():
        +    # warmup runs
        +    for i in range(10):
        +        model(*sample_input)
        +    prof = torch.profiler.profile(
        +      on_trace_ready=torch.profiler.tensorboard_trace_handler('./logs'), record_shapes=True, with_stack=True)
        +    # profile after warmup
        +    prof.start()
        +    model(*sample_input)
        +    prof.stop()
        +
        + +

        We use tensorboard to view results of the profiler and analyze model performance.

        + +

        Install PyTorch Profiler Tensorboard plugin as follows

        + +
        pip install torch_tb_profiler
        +
        + +

        Launch the tensorboard using

        + +
        tensorboard --logdir=./logs
        +
        + +

        Launch the following in the browser to view the profiler output. The profiler supports ‘Overview’, ‘Operator’, ‘Trace’ and ‘Module’ views to get insight into the inference execution.

        + +
        http://localhost:6006/#pytorch_profiler
        +
        + +

        The following diagram is the profiler ‘Trace’ view which shows the call stack along with the execution time of each function. In the profiler, we selected the forward() function to get the overall inference time. As shown in the diagram, the inference time for the ResNet-50 model on Graviton3-based c7g instance is around 3 times faster in PyTorch 2.0 compared to PyTorch 1.13.

        + +

        Profiler Trace view: Forward pass wall duration on PyTorch 1.13 and PyTorch 2.0

        + +

        Image 6: Profiler Trace view: Forward pass wall duration on PyTorch 1.13 and PyTorch 2.0

        + +

        The next diagram is the ‘Operator’ view which shows the list of PyTorch operators and their execution time. Similar to the preceding Trace view, the Operator view shows that the operator host duration for the ResNet-50 model on Graviton3-based c7g instance is around 3 times faster in PyTorch 2.0 compared to PyTorch 1.13.

        + +

        Profiler Operator view: Forward operator Host duration on PyTorch 1.13 and PyTorch 2.0

        + +

        Image 7: Profiler Operator view: Forward operator Host duration on PyTorch 1.13 and PyTorch 2.0

        + +

        Benchmarking Hugging Face models

        + +

        You can use the Amazon SageMaker Inference Recommender utility to automate performance benchmarking across different instances. With Inference Recommender, you can find the real-time inference endpoint that delivers the best performance at the lowest cost for a given ML model. We collected the preceding data using the Inference Recommender notebooks by deploying the models on production endpoints. For more details on Inference Recommender, refer to the amazon-sagemaker-examples GitHub repo. We benchmarked the following models for this post: ResNet50 image classification, DistilBERT sentiment analysis, RoBERTa fill mask, and RoBERTa sentiment analysis.

        + +

        Conclusion

        + +

        For PyTorch 2.0, the Graviton3-based C7g instance is the most cost-effective compute optimized Amazon EC2 instance for inference. These instances are available on SageMaker and Amazon EC2. The AWS Graviton Technical Guide provides the list of optimized libraries and best practices that will help you achieve cost benefit with Graviton instances across different workloads.

        + +

        If you find use cases where similar performance gains are not observed on Graviton, please open an issue on the aws-graviton-getting-started github to let us know about it. We will continue to add more performance improvements to make AWS Graviton-based instances the most cost-effective and efficient general purpose processor for inference using PyTorch.

        + +

        Acknowledgments

        + +

        We would like to thank Ali Saidi (Sr. Principal Engineer) and Csaba Csoma (Sr. Manager, Software Development) from AWS, Ashok Bhat (Sr. Product Manager), Nathan Sircombe (Sr. Engineering Manager) and Milos Puzovic (Principal Software Engineer) from Arm for their support during the Graviton PyTorch inference optimization work. We would also like to thank Geeta Chauhan (Engineering Leader, Applied AI) from Meta for her guidance on this blog.

        + +

        About the authors

        + +

        Sunita Nadampalli is a ML Engineer and Software Development Manager at AWS.

        + +

        Ankith Gunapal is an AI Partner Engineer at Meta(PyTorch).

diff --git a/blog/optimizing-cuda-rnn-with-torchscript/index.html b/blog/optimizing-cuda-rnn-with-torchscript/index.html
new file mode 100644
index 000000000000..c53ab248b9ae
--- /dev/null
+++ b/blog/optimizing-cuda-rnn-with-torchscript/index.html
@@ -0,0 +1,874 @@

Optimizing CUDA Recurrent Neural Networks with TorchScript | PyTorch

        by The PyTorch Team

        This week, we officially released PyTorch 1.1, a large feature update to PyTorch 1.0. One of the new features we’ve added is better support for fast, custom Recurrent Neural Networks (fastrnns) with TorchScript (the PyTorch JIT) (https://pytorch.org/docs/stable/jit.html).

        + +

        RNNs are popular models that have shown good performance on a variety of NLP tasks that come in different shapes and sizes. PyTorch implements a number of the most popular ones, the Elman RNN, GRU, and LSTM as well as multi-layered and bidirectional variants.

        + +

        However, many users want to implement their own custom RNNs, taking ideas from recent literature. Applying Layer Normalization to LSTMs is one such use case. Because the PyTorch CUDA LSTM implementation uses a fused kernel, it is difficult to insert normalizations or even modify the base LSTM implementation. Many users have turned to writing custom implementations using standard PyTorch operators, but such code suffers from high overhead: most PyTorch operations launch at least one kernel on the GPU and RNNs generally run many operations due to their recurrent nature. However, we can apply TorchScript to fuse operations and optimize our code automatically, launching fewer, more optimized kernels on the GPU.

        + +

        Our goal is for users to be able to write fast, custom RNNs in TorchScript without writing specialized CUDA kernels to achieve similar performance. In this post, we’ll provide a tutorial for how to write your own fast RNNs with TorchScript. To better understand the optimizations TorchScript applies, we’ll examine how those work on a standard LSTM implementation but most of the optimizations can be applied to general RNNs.

        + +

        Writing custom RNNs

        + +

        To get started, you can use this file as a template to write your own custom RNNs.

        + +

        We are constantly improving our infrastructure to make the performance better. If you want to gain the speed/optimizations that TorchScript currently provides (like operator fusion, batch matrix multiplications, etc.), here are some guidelines to follow. The next section explains the optimizations in depth.

        + +
        1. If the customized operations are all element-wise, that’s great because you can get the benefits of the PyTorch JIT’s operator fusion automatically!

        2. If you have more complex operations (e.g. reduce ops mixed with element-wise ops), consider grouping the reduce operations and element-wise ops separately in order to fuse the element-wise operations into a single fusion group.

        3. If you want to know what has been fused in your custom RNN, you can inspect the operation’s optimized graph by using graph_for. Using LSTMCell as an example:

          + +
           # get inputs and states for LSTMCell
           inputs = get_lstm_inputs()

           # instantiate a ScriptModule
           cell = LSTMCell(input_size, hidden_size)

           # print the optimized graph using graph_for
           out = cell(inputs)
           print(cell.graph_for(inputs))

          This will generate the optimized TorchScript graph (a.k.a. PyTorch JIT IR) for the specialized inputs that you provide:

          + +
           graph(%x : Float(*, *),
          +         %hx : Float(*, *),
          +         %cx : Float(*, *),
          +         %w_ih : Float(*, *),
          +         %w_hh : Float(*, *),
          +         %b_ih : Float(*),
          +         %b_hh : Float(*)):
          +     %hy : Float(*, *), %cy : Float(*, *) = prim::DifferentiableGraph_0(%cx, %b_hh, %b_ih, %hx, %w_hh, %x, %w_ih)
          +     %30 : (Float(*, *), Float(*, *)) = prim::TupleConstruct(%hy, %cy)
          +     return (%30)
          +     with prim::DifferentiableGraph_0 = graph(%13 : Float(*, *),
          +         %29 : Float(*),
          +         %33 : Float(*),
          +         %40 : Float(*, *),
          +         %43 : Float(*, *),
          +         %45 : Float(*, *),
          +         %48 : Float(*, *)):
          +     %49 : Float(*, *) = aten::t(%48)
          +     %47 : Float(*, *) = aten::mm(%45, %49)
          +     %44 : Float(*, *) = aten::t(%43)
          +     %42 : Float(*, *) = aten::mm(%40, %44)
          +     ...some broadcast sizes operations...
          +     %hy : Float(*, *), %287 : Float(*, *), %cy : Float(*, *), %outgate.1 : Float(*, *), %cellgate.1 : Float(*, *), %forgetgate.1 : Float(*, *), %ingate.1 : Float(*, *) = prim::FusionGroup_0(%13, %346, %345, %344, %343)
          +     ...some broadcast sizes operations...
          +     return (%hy, %cy, %49, %44, %196, %199, %340, %192, %325, %185, %ingate.1, %forgetgate.1, %cellgate.1, %outgate.1, %395, %396, %287)
          +     with prim::FusionGroup_0 = graph(%13 : Float(*, *),
          +         %71 : Tensor,
          +         %76 : Tensor,
          +         %81 : Tensor,
          +         %86 : Tensor):
          +     ...some chunks, constants, and add operations...
          +     %ingate.1 : Float(*, *) = aten::sigmoid(%38)
          +     %forgetgate.1 : Float(*, *) = aten::sigmoid(%34)
          +     %cellgate.1 : Float(*, *) = aten::tanh(%30)
          +     %outgate.1 : Float(*, *) = aten::sigmoid(%26)
          +     %14 : Float(*, *) = aten::mul(%forgetgate.1, %13)
          +     %11 : Float(*, *) = aten::mul(%ingate.1, %cellgate.1)
          +     %cy : Float(*, *) = aten::add(%14, %11, %69)
          +     %4 : Float(*, *) = aten::tanh(%cy)
          +     %hy : Float(*, *) = aten::mul(%outgate.1, %4)
          +     return (%hy, %4, %cy, %outgate.1, %cellgate.1, %forgetgate.1, %ingate.1)
          +
          +
        6. +
        + +

        From the above graph we can see that it has a prim::FusionGroup_0 subgraph that is fusing all element-wise operations in LSTMCell (transpose and matrix multiplication are not element-wise ops). Some graph nodes might be hard to understand at first, but we will explain some of them in the optimization section; we also omitted some long, verbose operators in this post that are there just for correctness.

        + +

        Variable-length sequences best practices

        + +

        TorchScript does not support PackedSequence. Generally, when one is handling variable-length sequences, it is best to pad them into a single tensor and send that tensor through a TorchScript LSTM. Here’s an example:

        + +
        sequences = [...] # List[Tensor], each Tensor is T' x C
        +padded = torch.utils.rnn.pad_sequence(sequences)
        +lengths = [seq.size(0) for seq in sequences]
        +padded  # T x N x C, where N is batch size and T is the max of all T'
        +
        +model = LSTM(...)
        +output, hiddens = model(padded)
        +output  # T x N x C
        +
        + +

        Of course, output may have some garbage data in the padded regions; use lengths to keep track of which part you don’t need.

        + +

        Optimizations

        + +

        We will now explain the optimizations performed by the PyTorch JIT to speed up custom RNNs. We will use a simple custom LSTM model in TorchScript to illustrate the optimizations, but many of these are general and apply to other RNNs.

        + +

        To illustrate the optimizations we did and how we get benefits from those optimizations, we will run a simple custom LSTM model written in TorchScript (you can refer the code in the custom_lstm.py or the below code snippets) and time our changes.

        + +

        We set up the environment in a machine equipped with 2 Intel Xeon chip and one Nvidia P100, with cuDNN v7.3, CUDA 9.2 installed. The basic set up for the LSTM model is as follows:

        + +
        input_size = 512
        +hidden_size = 512
        +mini_batch = 64
        +numLayers = 1
        +seq_length = 100 
        +
        + +

        The most important thing the PyTorch JIT does is compile the Python program into a PyTorch JIT IR, an intermediate representation used to model the program’s graph structure. This IR can then benefit from whole-program optimization and hardware acceleration, and overall it has the potential to provide large computation gains. In this example, we run the initial TorchScript model with only the compiler optimization passes that are provided by the JIT, including common subexpression elimination, constant pooling, constant propagation, dead code elimination and some peephole optimizations. We run the model training 100 times after warm-up and average the training time. The initial results are a model forward time of around 27ms and a backward time of around 64ms, which is a bit far from what the PyTorch cuDNN LSTM provides. Next, we will explain the major optimizations we made to improve training and inference performance, starting with LSTMCell and LSTMLayer, followed by some misc optimizations.

        + +

        LSTM Cell (forward)

        + +

Almost all the computations in an LSTM happen in the LSTMCell, so it's important to take a look at the computations it contains and how we can improve their speed. Below is a sample LSTMCell implementation in TorchScript:

        + +
import torch
import torch.jit as jit
from torch import Tensor
from torch.nn import Parameter
from typing import List, Tuple


class LSTMCell(jit.ScriptModule):
    def __init__(self, input_size, hidden_size):
        super(LSTMCell, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.weight_ih = Parameter(torch.randn(4 * hidden_size, input_size))
        self.weight_hh = Parameter(torch.randn(4 * hidden_size, hidden_size))
        self.bias_ih = Parameter(torch.randn(4 * hidden_size))
        self.bias_hh = Parameter(torch.randn(4 * hidden_size))

    @jit.script_method
    def forward(self, input, state):
        # type: (Tensor, Tuple[Tensor, Tensor]) -> Tuple[Tensor, Tuple[Tensor, Tensor]]
        hx, cx = state
        gates = (torch.mm(input, self.weight_ih.t()) + self.bias_ih +
                 torch.mm(hx, self.weight_hh.t()) + self.bias_hh)
        ingate, forgetgate, cellgate, outgate = gates.chunk(4, 1)

        ingate = torch.sigmoid(ingate)
        forgetgate = torch.sigmoid(forgetgate)
        cellgate = torch.tanh(cellgate)
        outgate = torch.sigmoid(outgate)

        cy = (forgetgate * cx) + (ingate * cellgate)
        hy = outgate * torch.tanh(cy)

        return hy, (hy, cy)
        + +
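As a quick usage sketch (ours, assuming the LSTMCell class above is in scope), you can run one step with the model sizes from the setup above and print the TorchScript IR that the JIT generated for the forward method:

import torch

input_size, hidden_size, batch = 512, 512, 64
cell = LSTMCell(input_size, hidden_size)
x = torch.randn(batch, input_size)
state = (torch.randn(batch, hidden_size), torch.randn(batch, hidden_size))
hy, (hy2, cy) = cell(x, state)   # one LSTM step
print(cell.graph)                # inspect the TorchScript IR of the forward method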

        This graph representation (IR) that TorchScript generated enables several optimizations and scalable computations. In addition to the typical compiler optimizations that we could do (CSE, constant propagation, etc. ) we can also run other IR transformations to make our code run faster.

        + +
          +
• Element-wise operator fusion. The PyTorch JIT automatically fuses element-wise ops: when adjacent operators are all element-wise, the JIT groups them into a single FusionGroup, which can then be launched as a single GPU/CPU kernel and performed in one pass. This avoids expensive memory reads and writes for each operation.
        • +
• Reordering chunks and pointwise ops to enable more fusion. An LSTM cell adds gates together (a pointwise operation), and then chunks the gates into four pieces: the ifco gates. Then, it performs pointwise operations on the ifco gates as above. This leads to two fusion groups in practice: one fusion group for the element-wise ops pre-chunk, and one group for the element-wise ops post-chunk. The interesting thing to note here is that pointwise operations commute with torch.chunk: instead of performing pointwise ops on some input tensors and chunking the output, we can chunk the input tensors and then perform the same pointwise ops on the output tensors. By moving the chunk to before the first fusion group, we can merge the first and second fusion groups into one big group.
        • +
        + +
        + +
        + +
          +
• Tensor creation on the CPU is expensive, but there is ongoing work to make it faster. At this point, an LSTMCell runs three CUDA kernels: two gemm kernels and one for the single pointwise group. One of the things we noticed was that there was a large gap between the finish of the second gemm and the start of the single pointwise group. This gap was a period of time when the GPU was idling and not doing anything. Looking into it more, we discovered that the problem was that torch.chunk constructs new tensors and that tensor construction was not as fast as it could be. Instead of constructing new Tensor objects, we taught the fusion compiler how to manipulate a data pointer and strides to do the torch.chunk before sending it into the fused kernel, shrinking the amount of idle time between the second gemm and the launch of the element-wise fusion group. This gives us around a 1.2x speedup on the LSTM forward pass.
        • +
        + +

By doing the above tricks, we are able to fuse almost all of the LSTMCell forward graph (except the two gemm kernels) into a single fusion group, which corresponds to the prim::FusionGroup_0 in the above IR graph. It is then launched as a single fused kernel for execution. With these optimizations the model performance improves significantly, with the average forward time reduced by around 17ms to 10ms (1.7x speedup) and the average backward time reduced by 37ms to 27ms (1.37x speedup).

        + +

        LSTM Layer (forward)

        + +
class LSTMLayer(jit.ScriptModule):
    def __init__(self, cell, *cell_args):
        super(LSTMLayer, self).__init__()
        self.cell = cell(*cell_args)

    @jit.script_method
    def forward(self, input, state):
        # type: (Tensor, Tuple[Tensor, Tensor]) -> Tuple[Tensor, Tuple[Tensor, Tensor]]
        inputs = input.unbind(0)
        outputs = torch.jit.annotate(List[Tensor], [])
        for i in range(len(inputs)):
            out, state = self.cell(inputs[i], state)
            outputs += [out]
        return torch.stack(outputs), state
        + +

We applied several tricks to the IR generated for the TorchScript LSTM to boost performance. Some example optimizations:

        + +
          +
• Loop Unrolling: We automatically unroll loops in the code (for big loops, we unroll a small subset of them), which then allows us to further optimize the loops' control flow. For example, the fuser can fuse together operations across iterations of the loop body, which results in a good performance improvement for control-flow-intensive models like LSTMs.
        • +
• Batch Matrix Multiplication: For RNNs where the input is pre-multiplied (i.e. the model has a lot of matrix multiplies with the same LHS or RHS), we can efficiently batch those operations together into a single matrix multiply while chunking the outputs to achieve equivalent semantics (see the sketch after this list).
        • +
        + +
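A minimal sketch of that batching idea (ours, not the actual JIT pass; it reuses the model sizes from the setup above and a plain reshape in place of the compiler transformation):

import torch

# Sizes from the setup above.
T, B, input_size, hidden_size = 100, 64, 512, 512
inputs = torch.randn(T, B, input_size)
weight_ih = torch.randn(4 * hidden_size, input_size)

# Naive: one matmul per timestep against the same weight.
per_step = torch.stack([inputs[t] @ weight_ih.t() for t in range(T)])

# Batched: a single big matmul over all timesteps, then reshaped back.
batched = (inputs.reshape(T * B, input_size) @ weight_ih.t()).reshape(T, B, 4 * hidden_size)

print(torch.allclose(per_step, batched, atol=1e-4))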

By applying these techniques, we reduced the forward pass time by an additional 1.6ms to 8.4ms (1.2x speedup) and the backward time by 7ms to around 20ms (1.35x speedup).

        + +

        LSTM Layer (backward)

        + +
          +
        • +

“Tree” Batch Matrix Multiplication: It is often the case that a single weight is reused multiple times in the LSTM backward graph, forming a tree where the leaves are matrix multiplies and the nodes are adds. These nodes can be combined together by concatenating the LHSs and RHSs in different dimensions and then computing the result as a single matrix multiplication (see the short sketch after this list). The formula of equivalence can be denoted as follows:

          + +

          $L1 * R1 + L2 * R2 = torch.cat((L1, L2), dim=1) * torch.cat((R1, R2), dim=0)$

          +
        • +
        • +

Autograd is a critical component of what makes PyTorch such an elegant ML framework. As such, we carried this through to the PyTorch JIT, but using a new automatic differentiation (AD) mechanism that works at the IR level. JIT automatic differentiation slices the forward graph into symbolically differentiable subgraphs and generates backward nodes for those subgraphs. Taking the above IR as an example, we group the graph nodes that have AD formulas into a single prim::DifferentiableGraph_0. For operations that do not yet have AD formulas, we fall back to Autograd during execution.

          +
        • +
        • +

Optimizing the backward path is hard, and the implicit broadcasting semantics make the optimization of automatic differentiation harder. PyTorch makes it convenient to write tensor operations without worrying about shapes by broadcasting the tensors for you. For performance, the painful point in backward is that every broadcastable op needs a summation for its derivative. Since we cannot currently fuse reduce operations, this causes FusionGroups to break into multiple small groups, leading to bad performance. To deal with this, refer to this great post written by Thomas Viehmann.

          +
        • +
        + +
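A minimal numerical check of the batching identity above (our sketch; the shapes are arbitrary illustrative choices):

import torch

# L1 @ R1 + L2 @ R2 == cat((L1, L2), dim=1) @ cat((R1, R2), dim=0)
L1, L2 = torch.randn(4, 3), torch.randn(4, 5)
R1, R2 = torch.randn(3, 6), torch.randn(5, 6)

separate = L1 @ R1 + L2 @ R2
combined = torch.cat((L1, L2), dim=1) @ torch.cat((R1, R2), dim=0)

print(torch.allclose(separate, combined, atol=1e-6))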

        Misc Optimizations

        + +
          +
• In addition to the steps laid out above, we also eliminated overhead between CUDA kernel launches and unnecessary tensor allocations. One example is tensor device lookups, which initially caused poor performance and many unnecessary allocations. Removing these reduced the time between kernel launches from milliseconds to nanoseconds.
        • +
• Lastly, there might be normalization applied in the custom LSTMCell, such as LayerNorm. Since LayerNorm and other normalization ops contain reduce operations, it is hard to fuse them in their entirety. Instead, we automatically decompose LayerNorm into a statistics computation (reduce operations) plus element-wise transformations, and then fuse those element-wise parts together. As of this post, there are some limitations in our automatic differentiation and graph fuser infrastructure that limit this support to inference mode only. We plan to add backward support in a future release.
        • +
        + +

        With the above optimizations on operation fusion, loop unrolling, batch matrix multiplication and some misc optimizations, we can see a clear performance increase on our custom TorchScript LSTM forward and backward from the following figure:

        + +
        + +
        + +

There are a number of additional optimizations that we did not cover in this post. In addition to the ones laid out here, we now see that our custom LSTM forward pass is on par with cuDNN. We are also working on optimizing the backward pass further and expect to see improvements in future releases. Besides the speed that TorchScript provides, we introduced a much more flexible API that enables you to hand-craft many more custom RNNs, which cuDNN could not provide.

diff --git a/blog/optimizing-libtorch/index.html b/blog/optimizing-libtorch/index.html
new file mode 100644
index 000000000000..cc14385d3151
--- /dev/null
+++ b/blog/optimizing-libtorch/index.html

Optimizing LibTorch-based inference engine memory usage and thread-pooling | PyTorch

        + by + + Himalay Mohanlal Joriwal, Pierre-Yves Aquilanti, Vivek Govindan, Hamid Shojanazeri, Ankith Gunapal, Tristan Rice + +

        +

        Outline

        + +

In this blog post we show how to optimize a LibTorch-based inference engine to maximize throughput by reducing memory usage and optimizing the thread-pooling strategy. We apply these optimizations to pattern recognition engines for audio data, for example music and speech recognition or acoustic fingerprinting. The optimizations discussed in this post reduce memory usage by 50% and end-to-end inference latency by 37.5%. They are also applicable to computer vision and natural language processing.

        + +

        Audio Recognition Inferencing

        + +

        Audio Recognition (AR) engines can be used to recognize and identify sound patterns. As an example, identifying the type and species of a bird from audio recordings, distinguishing music from the singer’s voice, or detecting an abnormal sound indicating a breach in a building. To identify sounds of interest, AR engines process audio through 4 stages:

        + +
          +
1. File Validation: The AR engine validates the input audio file.
2. Feature Extraction: Features are extracted from each segment within the audio file.
3. Inference: LibTorch performs inference using CPUs or accelerators; in our case, Intel processors on an Amazon Elastic Compute Cloud (EC2) instance.
4. Post-processing: A post-processing model decodes the results and calculates scores that are used to convert inference output into tags or transcripts.
        + +

        Of these 4 steps, inference is the most computationally intensive and can take up to 50% of the pipeline processing time depending on the model complexity. This means that any optimization at this stage has a significant impact on the overall pipeline. 

        + +

        Optimizing the Audio Recognition engine with concurrency…is not so simple

        + +

Our objective for this processing pipeline is to convert audio segments into tags or transcripts. The input data is an audio file composed of several short sound segments (S1 to S6 in Figure 1). The output data corresponds to tags or transcripts ordered by timestamps.

        + +


        + +

        Figure 1: Example audio file with segment boundaries

        + +

        Each segment can be processed independently and in an out-of-order fashion. This offers the opportunity to process segments concurrently and in parallel to optimize the overall inference throughput as well as maximize the usage of the resources.

        + +

Parallelization on an instance can be achieved through multi-threading (pthreads, std::thread, OpenMP) or multi-processing. The advantage of multi-threading over multi-processing is the ability to use shared memory, which lets developers minimize data duplication by sharing data, the AR models in our case, across threads (Figure 2). Furthermore, the reduced memory footprint allows us to run more pipelines in parallel by increasing the number of engine threads, in order to utilize all vCPUs on our Amazon EC2 instance (a c5.4xlarge in our case, which offers 16 vCPUs). In theory, we expect to see higher hardware utilization and higher throughput for our AR engine as a result.

        + +


        + +

        Figure 2: Multi-threaded AR Engine

        + +

But we found these assumptions to be wrong: increasing the number of application threads increased the end-to-end latency for each audio segment and decreased the engine throughput. For example, increasing the concurrency from 1 to 5 threads led to a 4x increase in latency, which had a proportional effect on throughput. In fact, metrics showed that within the pipeline, the latency of the inference stage alone was 3x higher than its single-thread baseline.

        + +

        Using a profiler, we found that the CPU Spin Time increased, potentially due to CPU oversubscription which impacts system and application performance. Given our control over the application’s multi-thread implementation, we chose to dive deeper into the stack and identify potential conflicts with LibTorch’s default settings.

        + +

        Diving deeper on LibTorch’s multi-threading and its impact on concurrency

        + +

LibTorch's parallel implementations for CPU inference are based on global thread pools. Examples are the inter-op and intra-op parallelism implementations, which can be chosen depending on the model's properties. In both cases, it is possible to set the number of threads in each thread pool to optimize latency and throughput.

        + +

To test whether LibTorch's default parallelism settings had a counter effect on our inference latency, we ran an experiment on a 16 vCPU machine with a 35-minute audio file, keeping the LibTorch inter-op threads constant at 1 (because our models didn't utilize the inter-op thread pool). We collected the data shown in Figures 3 and 4.

        + +


        + +

        Figure 3: CPU Utilization for different number of engine threads

        + +


        + +

        Figure 4: Processing times for different number of engine threads

        + +

Execution time in Figure 4 is the end-to-end processing time for all the segments of the given audio file. We test four LibTorch intra-op thread configurations (1, 4, 8, 16) and vary the number of engine threads from 1 to 16 for each. As Figure 3 shows, CPU utilization increases with the number of engine threads for all LibTorch intra-thread configurations. But as Figure 4 shows, higher CPU utilization does not translate into lower execution time. We found that in all but one case, execution time increased as the number of engine threads went up; the one exception was the case where the intra-op thread pool size was 1.

        + +

        Resolving the global thread pool issue

        + +

        Using too many threads with a global thread pool led to performance degradation and caused an over-subscription problem. Without disabling LibTorch global thread pools, it was difficult to match the performance of the multi-process engine.

        + +

        Disabling the LibTorch global thread pool is as simple as setting the intra-op/inter-op parallelism threads to 1, as shown here:

        + +
at::set_num_threads(1);          // Disables the intra-op thread pool.
at::set_num_interop_threads(1);  // Disables the inter-op thread pool.
        + +

        As shown in Figure 4, the lowest processing time was measured when the LibTorch global thread pool was disabled.

        + +

        This solution improved AR engine throughput in several cases. However, when evaluating long datasets (audio files longer than 2 hours in load test), we found that the memory footprint of the engine gradually started to increase.

        + +

        Optimizing memory usage

        + +

We ran a load test on the system with two-hour-long audio files and found that the observed memory increase was the result of memory fragmentation in multi-threaded LibTorch inference. We resolved this using jemalloc, a general-purpose malloc(3) implementation that emphasizes fragmentation avoidance and scalable concurrency support. With jemalloc, our peak memory usage decreased by an average of 34% and average memory usage decreased by 53%.

        + +


        + +

        Figure 5: Memory usage over time using the same input file with and without jemalloc

        + +

        Summary

        + +

To optimize the performance of multi-threaded LibTorch-based inference engines, we recommend verifying that there is no oversubscription problem in LibTorch. In our case, all threads in the multi-threaded engine were sharing the LibTorch global thread pool, which caused an oversubscription problem. This was remedied by disabling the global thread pools: we set the inter-op and intra-op thread counts to 1. To optimize the memory of a multi-threaded engine, we recommend using jemalloc as the memory allocator rather than the default malloc.

        + +
diff --git a/blog/optimizing-production-pytorch-performance-with-graph-transformations/index.html b/blog/optimizing-production-pytorch-performance-with-graph-transformations/index.html
new file mode 100644
index 000000000000..f63e3d7bb6ba
--- /dev/null
+++ b/blog/optimizing-production-pytorch-performance-with-graph-transformations/index.html

Optimizing Production PyTorch Models’ Performance with Graph Transformations | PyTorch

        + by + + Jade Nie, CK Luk, Xiaodong Wang, Jackie (Jiaqi) Xu + +

        +

        1. Introduction

        + +

        PyTorch supports two execution modes [1]: eager mode and graph mode. In eager mode, operators in a model are immediately executed as they are encountered. In contrast, in graph mode, operators are first synthesized into a graph, which will then be compiled and executed as a whole. Eager mode is easier to use, more suitable for ML researchers, and hence is the default mode of execution. On the other hand, graph mode typically delivers higher performance and hence is heavily used in production.

        + +

        Specifically, graph mode enables operator fusion [2], wherein one operator is merged with another to reduce/localize memory reads as well as total kernel launch overhead. Fusion can be horizontal—taking a single operation (e.g., BatchNorm) that is independently applied to many operands and merging those operands into an array; and vertical—merging a kernel with another kernel that consumes the output of the first kernel (e.g., Convolution followed by ReLU).

        + +

        Torch.FX [3, 4] (abbreviated as FX) is a publicly available toolkit as part of the PyTorch package that supports graph mode execution. In particular, it (1) captures the graph from a PyTorch program and (2) allows developers to write transformations on the captured graph. It is used inside Meta to optimize the training throughput of production models. By introducing a number of FX-based optimizations developed at Meta, we demonstrate the approach of using graph transformation to optimize PyTorch’s performance for production.

        + +

        2. Background

        + +

        Embedding tables are ubiquitous in recommendation systems. Section 3 will discuss three FX transformations that optimize accesses to embedding tables. In this section, we provide some background on FX (Section 2.1) and embedding tables (Section 2.2).

        + +

        2.1 FX

        + +

        Figure 1 is a simple example adopted from [3] which illustrates using FX to transform a PyTorch program. It contains three steps: (1) capturing the graph from a program, (2) modifying the graph (in this example, all uses of RELU are replaced by GELU), and (3) generating a new program from the modified graph.
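A minimal sketch of those three steps with the public torch.fx API (our illustration, not the exact code from Figure 1; the toy module M and its forward are made up for the example):

import torch
import torch.fx

class M(torch.nn.Module):
    def forward(self, x):
        return torch.relu(x) + 1.0

# (1) Capture the graph from the program.
traced = torch.fx.symbolic_trace(M())

# (2) Modify the graph: replace every call to torch.relu with GELU.
for node in traced.graph.nodes:
    if node.op == "call_function" and node.target is torch.relu:
        node.target = torch.nn.functional.gelu
traced.graph.lint()

# (3) Generate a new program from the modified graph.
traced.recompile()
print(traced.code)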

        + +

        + +

        + +

        Figure 1: A FX example which replaces all uses of RELU by GELU in a PyTorch module.

        + +

        The FX API [4] provides many more functionalities for inspecting and transforming PyTorch program graphs.

        + +

        2.2 Embedding Tables

        + +

        + +

        + +

        Figure 2: Illustration of an embedding table for a sparse feature with batch size = 1

        + +

        In a recommendation system, sparse features (e.g., User ID, Story ID) are represented by embedding tables. An embedding table E is an HxD matrix, where H is the hash size, D is the embedding dimension. Each row of E is a vector of floats. Feature hashing [5] is used to map a sparse feature to a list of indices to E, say [S1,S2, …, Sk], where 0<=Si<H. Its output value is computed as f(E[S1], E[S2], …, E[Sk]), where E[Si] is the vector at row Si, and f is called the pooling function, which is typically one of the following functions: sum, average, maximum. See Figure 2 for an illustration.

        + +

        To fully utilize the GPU, sparse features are usually processed in a batch. Each entity in a batch has its own list of indices. If a batch has B entities, a naive representation has B lists of indices. A more compact representation is to combine the B lists of indices into a single list of indices and add a list of the lengths of indices (one length for each entity in the batch). For example, if a batch has 3 entities whose lists of indices are as follows:

        + +
          +
        • Entity 1: indices = [10, 20]
        • +
        • Entity 2: indices = [5, 9, 77, 81]
        • +
        • Entity 3: indices = [15, 20, 45]
        • +
        + +

        Then the indices and lengths for the entire batch will be:

        + +
          +
        • Indices = [10, 20, 5, 9, 77, 81, 15, 20, 45]
        • +
        • Lengths = [2, 4, 3]
        • +
        + +

        And the output of the embedding table lookup for the whole batch is a BxD matrix.
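As an illustration of this pooled lookup with the compact indices/lengths representation, here is a hedged sketch using torch.nn.EmbeddingBag (which expects offsets rather than lengths); the hash size, embedding dimension and pooling function are illustrative choices, not values from the post:

import torch
import torch.nn as nn

H, D = 1000, 4                             # hash size and embedding dimension (illustrative)
table = nn.EmbeddingBag(H, D, mode="sum")  # pooling function f = sum

# The batch from the example above: flattened indices plus per-entity lengths.
indices = torch.tensor([10, 20, 5, 9, 77, 81, 15, 20, 45])
lengths = torch.tensor([2, 4, 3])

# EmbeddingBag takes offsets (start position of each entity) rather than lengths.
offsets = torch.cat([torch.zeros(1, dtype=torch.long), lengths.cumsum(0)[:-1]])

out = table(indices, offsets)   # BxD matrix, here 3x4
print(out.shape)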

        + +

        3. Three FX Transformations

        + +

        We have developed three FX transformations that accelerate accesses to embedding tables. Section 3.1 discusses a transformation that combines multiple small input tensors into a single big tensor; Section 3.2 a transformation that fuses multiple, parallel compute chains into a single compute chain; and Section 3.3 a transformation that overlaps communication with computation.

        + +

        3.1 Combining Input Sparse Features

        + +

Recall that an input sparse feature in a batch is represented by two lists: a list of indices and a list of B lengths, where B is the batch size. In PyTorch, these two lists are implemented as two tensors. When a PyTorch model is run on a GPU, embedding tables are commonly stored in the GPU memory (which is closer to the GPU and has much higher read/write bandwidth than the CPU memory). To use an input sparse feature, its two tensors need to be first copied from CPU to GPU. However, each host-to-device memory copy requires a kernel launch, which is relatively expensive compared to the actual data transfer time. If a model uses many input sparse features, this copying could become a performance bottleneck (e.g., 1000 input sparse features would require copying 2000 tensors from host to device).

        + +

        An optimization that reduces the number of host-to-device memcpy is to combine multiple input sparse features before sending them to the device. For instance, given the following three input features:

        + +
          +
        • Feature_A: indices = [106, 211, 7], lengths = [2, 1]
        • +
        • Feature_B: indices = [52, 498, 616, 870, 1013], lengths = [3, 2]
        • +
        • Feature_C: indices = [2011, 19, 351, 790], lengths = [1, 3]
        • +
        + +

        The combined form is:

        + +
          +
        • Features_A_B_C: indices = [106, 211, 7, 52, 498, 616, 870, 1013, 2011, 19, 351, 790], lengths = [2, 1, 3, 2, 1, 3]
        • +
        + +

        So, instead of copying 3x2=6 tensors from host to device, we only need to copy 2 tensors.
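A hedged sketch of the idea (it reuses the Feature_A/B/C example above; Permute_and_Split is an internal op, so torch.split stands in for the recovery step):

import torch

feature_indices = {
    "A": torch.tensor([106, 211, 7]),
    "B": torch.tensor([52, 498, 616, 870, 1013]),
    "C": torch.tensor([2011, 19, 351, 790]),
}
feature_lengths = {
    "A": torch.tensor([2, 1]),
    "B": torch.tensor([3, 2]),
    "C": torch.tensor([1, 3]),
}

# CPU side: combine everything into two tensors, so only two H2D copies are needed.
combined_indices = torch.cat(list(feature_indices.values()))
combined_lengths = torch.cat(list(feature_lengths.values()))
# combined_indices = combined_indices.to("cuda", non_blocking=True)  # when a GPU is present
# combined_lengths = combined_lengths.to("cuda", non_blocking=True)

# Device side: recover the per-feature tensors (the role of the inserted op in the model graph).
index_sizes = [t.numel() for t in feature_indices.values()]
length_sizes = [t.numel() for t in feature_lengths.values()]
per_feature_indices = torch.split(combined_indices, index_sizes)
per_feature_lengths = torch.split(combined_lengths, length_sizes)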

        + +

        Figure 3(b) describes an implementation of this optimization, which has two components:

        + +
          +
        • On the CPU side: The input pipeline is modified to combine all the indices of sparse features into a single tensor and similarly all the lengths into another tensor. Then the two tensors are copied to the GPU.
        • +
        • On the GPU side: Using FX, we insert a Permute_and_Split op into the model graph to recover the indices and lengths tensors of individual features from the combined tensors, and route them to the corresponding nodes downstream.
        • +
        + +

        + +

        + +

        (a). Without the optimization

        + +

        + +

        + +

        (b). With the optimization

        + +

        Figure 3: Combining input sparse features

        + +

        3.2 Horizontal fusion of computation chains started with accesses to embedding tables

        + +

        In a production model, it is fairly common to have 10s of embedding tables residing on each GPU. For performance reasons, lookups to these tables are grouped together so that their outputs are concatenated in a single big tensor (see the red part in Figure 4(a)). To apply computations to individual feature outputs, a Split op is used to divide the big tensors into N smaller tensors (where N is the number of features) and then the desired computations are applied to each tensor. This is shown in Figure 4(a), where the computation applied to each feature output O is Tanh(LayerNorm(O)). All the computation results are concatenated back to a big tensor, which is then passed to downstream ops (Op1 in Figure 4(a)).

        + +

        The main runtime cost here is the GPU kernel launch overhead. For instance, the number of GPU kernel launches in Figure 4(a) is 2*N + 3 (each oval in the figure is a GPU kernel). This could become a performance issue because execution times of LayerNorm and Tanh on the GPU are short compared to their kernel launch times. In addition, the Split op may create an extra copy of the embedding output tensor, consuming additional GPU memory.

        + +

        We use FX to implement an optimization called horizontal fusion which dramatically reduces the number of GPU kernel launches (in this example, the optimized number of GPU kernel launches is 5, see Figure 4(b)). Instead of doing an explicit Split, we use the Add_middle_dim op to reshape the 2D embedding tensor of shape (B, NxD) to a 3D tensor of shape (B, N, D). Then a single LayerNorm is applied to the last dimension of it. Then a single Tanh is applied to the result of the LayerNorm. At the end, we use the Remove_middle_dim op to reshape the Tanh’s result back to a 2D tensor. In addition, since Add_middle_dim and Remove_middle_dim only reshape the tensor without creating an extra copy, the amount of GPU memory consumption could be reduced as well.
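A hedged sketch of this reshape-based fusion (Add_middle_dim and Remove_middle_dim are internal ops, so plain view calls stand in for them; the sizes are illustrative):

import torch
import torch.nn as nn

B, N, D = 32, 10, 16              # batch size, number of features, embedding dimension
pooled = torch.randn(B, N * D)    # concatenated embedding outputs, shape (B, NxD)
ln = nn.LayerNorm(D)

# Unfused: explicit Split, then per-feature LayerNorm and Tanh.
unfused = torch.cat([torch.tanh(ln(t)) for t in torch.split(pooled, D, dim=1)], dim=1)

# Fused: reshape to (B, N, D), one LayerNorm and one Tanh, reshape back.
# The reshapes are views, so no extra copy of the embedding output is created.
fused = torch.tanh(ln(pooled.view(B, N, D))).view(B, N * D)

print(torch.allclose(unfused, fused, atol=1e-6))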

        + +

        + +

        + +

        (a). Without the optimization

        + +

        + +

        + +

        (b). With the optimization

        + +

        Figure 4: Horizontal fusion

        + +

        3.3 Overlapping Computation with Communication

        + +

        Training of a production recommendation model is typically done on a distributed GPU system. Since the capacity of the device memory per GPU is not big enough to hold all the embedding tables in the model, they need to be distributed among the GPUs.

        + +

        Within a training step, a GPU needs to read/write feature values from/to the embedding tables on the other GPUs. This is known as all-to-all communication [6] and can be a major performance bottleneck.

        + +

        We use FX to implement a transformation that can overlap computation with all-to-all communication. Figure 5(a) shows the example of a model graph which has embedding table accesses (EmbeddingAllToAll) and other ops. Without any optimization, they are sequentially executed on a GPU stream, as shown in Figure 5(b). Using FX, we break EmbeddingAllToAll into EmbeddingAllToAll_Request and EmbeddingAllToAll_Wait, and schedule independent ops in between them.
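A hedged sketch of the request/wait split using the public torch.distributed API (it assumes an already initialized NCCL process group, e.g. via torchrun; the function and argument names are ours, not the ops inserted by the FX transformation):

import torch
import torch.distributed as dist

def embedding_all_to_all_overlapped(local_shards, independent_fn):
    output = torch.empty_like(local_shards)

    # "Request": launch the all-to-all asynchronously and get a work handle.
    work = dist.all_to_all_single(output, local_shards, async_op=True)

    # Independent ops scheduled here run while the communication is in flight.
    other_result = independent_fn()

    # "Wait": block until the exchange has completed before using its output.
    work.wait()
    return output, other_result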

        + +

        + +

        + +

        (a) Model graph

        + +

        + +

        + +

        (b) Original execution order

        + +

        + +

        + +

        (c)Optimized execution order

        + +

        Figure 5: Overlapping Computation with Communication

        + +

        3.4 Summary

        + +

        Table 1 summarizes the optimizations discussed in this section and the corresponding performance bottlenecks addressed.

Optimization                                 | Performance Bottleneck Addressed
---------------------------------------------|----------------------------------
Combining Input Sparse Features              | Host-to-device memory copy
Horizontal fusion                            | GPU kernel launch overhead
Overlapping Computation with Communication   | Embedding all-to-all access time
        + +

        Table 1: Summary of the optimizations and the performance bottlenecks addressed

        + +

        We have also developed other FX transformations which are not discussed in this section due to space limitations.

        + +

        To discover which models would benefit from these transformations, we analyzed the performance data collected by MAIProf [7] from the models that run at Meta’s data centers. Altogether, these transformations provide up to 2-3x of speedups compared to eager mode on a set of production models.

        + +

        4. Concluding Remarks

        + +

        The graph mode in PyTorch is preferred over the eager mode for production use for performance reasons. FX is a powerful tool for capturing and optimizing the graph of a PyTorch program. We demonstrate three FX transformations that are used to optimize production recommendation models inside Meta. We hope that this blog can motivate other PyTorch model developers to use graph transformations to boost their models’ performance.

        + +

        References

        + +

        [1] End-to-end Machine Learning Framework

        + +

        [2] DNNFusion: Accelerating Deep Neural Networks Execution with Advanced Operator Fusion

        + +

        [3] Torch.FX: Practical Program Capture and Transformation for Deep Learning In Python, MLSys 2022.

        + +

        [4] Torch.fx—PyTorch 1.12 documentation

        + +

        [5] Feature Hashing for Large Scale Multitask Learning

        + +

        [6] NVIDIA Collective Communication Library Documentation

        + +

        [7] Performance Debugging of Production PyTorch Models at Meta

        + +
diff --git a/blog/out-of-the-box-acceleration/index.html b/blog/out-of-the-box-acceleration/index.html
new file mode 100644
index 000000000000..bc6a12ae2a12
--- /dev/null
+++ b/blog/out-of-the-box-acceleration/index.html

Out of the box acceleration and memory savings of 🤗 decoder models with PyTorch 2.0 | PyTorch

        + by + + Felix Marty, Younes Belkada, Hamid Shojanazeri, Driss Guessous + +

        +

        As part of PyTorch 2.0 release, an accelerated implementation of the attention mechanism as part of the “Better Transformer” project (and known in PyTorch as Accelerated Transformers) has been added natively into PyTorch as torch.nn.functional.scaled_dot_product_attention. This implementation leverages fused kernels from FlashAttention and Memory-efficient attention, and supports both training and inference.

        + +

        We also release a notebook showcasing an example of this integration here

        + +

After seeing 20-30% speedups at inference for diffusion models, we went ahead and implemented an integration with 🤗 Transformers models through the 🤗 Optimum library. Similar to the previous integration for encoder models, the integration replaces modules from Transformers with efficient implementations that use torch.nn.functional.scaled_dot_product_attention. The usage is as follows:

        + +
import torch
from optimum.bettertransformer import BetterTransformer
from transformers import AutoModelForCausalLM

with torch.device("cuda"):
    model = AutoModelForCausalLM.from_pretrained("gpt2-large", torch_dtype=torch.float16)

model = BetterTransformer.transform(model)

# do your inference or training here

# if training and want to save the model
model = BetterTransformer.reverse(model)
model.save_pretrained("fine_tuned_model")
model.push_to_hub("fine_tuned_model")
        + +

        Summarizing our findings below about torch.nn.functional.scaled_dot_product_attention:

        +
          +
        • It is most useful to fit larger models, sequence length, or batch size to train on a given hardware.
        • +
        • Memory footprint savings on GPU during training range from 20% to 110%+.
        • +
        • Speedups during training range from 10% to 70%.
        • +
        • Speedups during inference range from 5% to 20%.
        • +
        • Standalone, for small head dimensions, scaled_dot_product_attention speedups go up to 3x, memory savings go as high as 40x (depending on the sequence length).
        • +
        + +

        You may be surprised by the wide range of memory savings and speedups. In this blog post, we discuss our benchmarks, where this feature shines and upcoming improvements in future PyTorch releases.

        + +

        In the next release of transformers you will just need to install the proper version of optimum and run:

        +
        model = model.to_bettertransformer()
        +
        +

        To convert your model using the BetterTransformer API. You can already try this feature out by installing transformers from source.

        + +

        Benchmark and usage with 🤗 Transformers

        + +

torch.nn.functional.scaled_dot_product_attention is usable with any architecture that uses standard attention, and notably replaces the following boilerplate code:

        + +
import math
import torch

# native scaled_dot_product_attention is equivalent to the following:
def eager_sdpa(query, key, value, attn_mask=None, dropout_p=0.0, is_causal=False, scale=None):
    L, S = query.size(-2), key.size(-2)
    scale_factor = 1 / math.sqrt(query.size(-1)) if scale is None else scale
    attn_mask = torch.ones(L, S, dtype=torch.bool).tril(diagonal=0) if is_causal else attn_mask
    if attn_mask is not None and attn_mask.dtype == torch.bool:
        attn_mask = torch.zeros_like(attn_mask, dtype=query.dtype).masked_fill(attn_mask.logical_not(), float("-inf"))
    attn_bias = attn_mask if attn_mask is not None else 0.0
    attn_weight = torch.softmax(query @ key.transpose(-2, -1) * scale_factor + attn_bias, dim=-1)
    attn_weight = torch.dropout(attn_weight, dropout_p, train=True)
    return attn_weight @ value
        + +

        In the 🤗 Optimum integration with Transformers models, the following architectures are supported for now: gpt2, gpt-neo, gpt-neox, gptj, t5, bart, codegen, pegasus, opt, LLaMA, blenderbot, m2m100. You can expect this list to be extended in the near future!

        + +

        To validate the benefits from the native scaled dot-product attention, we ran inference and training benchmarks, whose results are presented below.

        + +

Inference benchmark on a single A10G GPU, AWS g5.4xlarge instance

        + +

        + +

Training benchmark on a single A10G GPU, AWS g5.4xlarge instance

        + +

        + +

Training benchmark on a single A100-SXM4-80GB, Nvidia DGX

        + +

        + +

        Out of this benchmark, the most interesting finding is that native SDPA allows for the usage of longer sequence lengths and batch sizes without running into out of memory issues. Moreover, up to 20% speedups can be seen during inference, and even larger during training.

        + +

        As seen on the training benchmarks, it appears that smaller head dimension brings higher speedups and memory savings, which we will discuss in the following section.

        + +

        The implementation supports multi-GPU settings as well, thanks to 🤗 Accelerate library by passing device_map=”auto” to the from_pretrained method. Here are some results for training on two A100-SXM4-80GB.

        + +

Training benchmark on two A100-SXM4-80GB, Nvidia DGX, using 🤗 Accelerate library for distributed training

        + +

        + +

Note that some kernels support only the sm_80 compute capability (the one of A100 GPUs), which limits usability on a wide range of hardware, notably when the head dimension is not a power of two. For example, as of PyTorch 2.0.0, during training opt-2.7b (head dim = 80) and gpt-neox-20b (head dim = 96) cannot dispatch to a kernel using flash attention unless run on an A100 GPU. Better kernels may be developed in the future: https://github.com/pytorch/pytorch/issues/98140#issuecomment-1518101895

        + +

        Flash Attention, Memory-efficient attention & math differences

        + +

        The native scaled_dot_product_attention relies on three possible backend implementations: flash attention, memory-efficient attention, and the so-called math implementation which provides a hardware-neutral fallback for all PyTorch platforms.

        + +

        When fused kernels are available for a given problem size, flash-attention or memory-efficient attention will be used, effectively allowing for a lower memory footprint, as in the memory-efficient attention case O(N) memory allocations are done on the GPU global memory instead of the classic O(N^2) for the traditional eager attention implementation. With flash attention, a reduced number of memory accesses (read and writes) is expected, hence both giving speedups and memory savings.

        + +

        The “math” implementation is simply an implementation using the PyTorch’s C++ API. Interesting to note in this implementation is that the query and key tensors are scaled individually for numerical stability, thus launching two aten::div operations instead of possibly only one in an eager implementation that does not contain this optimization for numerical stability.

        + +

        Head dimension influence on speedups, memory savings

        + +

Benchmarking torch.nn.functional.scaled_dot_product_attention, we notice a decrease in the speedup and memory gains as the head dimension increases. This is an issue for some architectures like EleutherAI/gpt-neo-2.7B, which has a relatively large head dimension of 128, or EleutherAI/gpt-j-6B (and derived models such as PygmalionAI/pygmalion-6b), which has a head dimension of 256 (these currently do not dispatch to fused kernels, as the head dimension is too large).

        + +

        This trend can be seen in the figures below, where torch.nn.scaled_dot_production is benchmarked standalone versus the above eager implementation. Moreover, we use the torch.backends.cuda.sdp_kernel context manager to force the usage of respectively math, flash attention, and memory-efficient attention implementation.
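For reference, a hedged sketch of forcing a specific backend with the PyTorch 2.0 torch.backends.cuda.sdp_kernel context manager (shapes and dtype are illustrative; a CUDA GPU with fused-kernel support is assumed):

import torch
import torch.nn.functional as F

q = torch.randn(8, 16, 1024, 64, dtype=torch.float16, device="cuda")  # (batch, heads, seq, head_dim)
k, v = torch.randn_like(q), torch.randn_like(q)

with torch.backends.cuda.sdp_kernel(enable_flash=True, enable_math=False, enable_mem_efficient=False):
    out_flash = F.scaled_dot_product_attention(q, k, v, is_causal=True)

with torch.backends.cuda.sdp_kernel(enable_flash=False, enable_math=False, enable_mem_efficient=True):
    out_mem_efficient = F.scaled_dot_product_attention(q, k, v, is_causal=True)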

        + +

Using memory-efficient attention SDP kernel (forward-only), A100

        + +

        + +

Using math (without dropout), A100

        + +

        + +

Using flash attention SDP kernel (without dropout), A100

        + +

        + +

Using memory-efficient attention SDP kernel (without dropout), A100

        + +

        + +

        We see that for the same problem size, be it for inference-only or training, the speedup decreases with higher head dimension, e.g. from 3.4x for headdim=8 to 1.01x for headdim=128 using flash attention kernel.

        + +

        The reduced memory saving is expected with larger head dimensions. Recall the standard attention computation:

        + +

$\mathrm{Attention}(Q, K, V) = \mathrm{softmax}\!\left(\frac{QK^{T}}{\sqrt{d_k}}\right)V$

        + +

        Due to the intermediate computations, the global memory footprint is 2 * N * N + N * d in this standard step by step computation. Memory-efficient attention proposes to iteratively update the softmax renormalization constant and moving its computation at the very end, allowing for only a constant output memory allocation N * d.
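The memory saving ratio quoted in the next paragraph follows directly from these two footprints:

$$\frac{2N^2 + Nd}{Nd} = \frac{2N}{d} + 1$$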

        + +

        Thus, the memory saving ratio is 2 * N / d + 1, which decreases with larger head dimension.

        + +

        In flash attention, the tradeoff is between the head dimension d and the shared memory size M of a GPU streaming multiprocessor, with a total number of memory accesses of O(N² * d²/M). Thus, the memory accesses scale quadratically in the head dimension, contrary to the standard attention that scales linearly. The reason is that in flash attention, for larger head dimension d, the key and value K, V need to be split into more blocks to fit into shared memory, and in turn each block needs to load the full query Q and output O.

        + +

        Thus, the highest speedups for flash attention are in a regime where the ratio d² / M is small enough.

        + +

        Current limitations as of PyTorch 2.0.0

        + +

        Absence of a scale argument

        + +

        As of PyTorch 2.0.0, torch.nn.functional.scaled_dot_product_attention has no scale argument and uses the default square root of the hidden size sqrt(d_k).

        + +

$\mathrm{Attention}(Q, K, V) = \mathrm{softmax}\!\left(\frac{QK^{T}}{\sqrt{d_k}}\right)V$

        + +

However, some architectures such as OPT or T5 do not use a scaling in the attention, which as of PyTorch 2.0.0 forces them to rescale artificially before the scaled_dot_product_attention call. This introduces an unnecessary overhead, as an additional multiplication is necessary, on top of unneeded divisions in the attention.

        + +

        A fix for this issue has been merged in PyTorch repository.

        + +

        Support of flash attention / memory-efficient attention with custom mask

        + +

        As of PyTorch 2.0.0, when passing a custom attention mask, flash attention and memory-efficient attention can not be used. In this case, scaled_dot_product_attention automatically dispatches to the C++ implementation.

        + +

However, as we have seen, some architectures require a custom attention mask, such as T5, which uses a positional bias. Moreover, for a batch size larger than one where some inputs may be padded, a custom attention mask also needs to be passed. For this latter case, an alternative would be to use NestedTensor, which SDPA supports.

        + +

        This limited support for custom masks thus limits the benefits from SDPA in these specific cases, although we can hope for an extended support in the future.

        + +

        Note that xformers, from which PyTorch’s SDPA partially takes inspiration, currently supports arbitrary attention masks: https://github.com/facebookresearch/xformers/blob/658ebab39545f180a6075385b3897921623d6c3b/xformers/ops/fmha/cutlass.py#L147-L156 . HazyResearch implementation of flash attention also supports an equivalent implementation of padding, as a cumulative sequence length array is used along with packed query/key/values - similar in essence to NestedTensor.

        + +

        In conclusion

        + +

Using torch.nn.functional.scaled_dot_product_attention is a free-lunch optimization: it makes your code more readable, uses less memory, and is in most common cases faster.

        + +

Although the implementation in PyTorch 2.0.0 still has minor limitations, inference and training already benefit massively from SDPA in most cases. We encourage you to use this native implementation to train or deploy your PyTorch models, and for 🤗 Transformers models it is a one-line transformation!

        + +

        In the future, we would like to adapt the API to enable users to use SDPA in encoder-based models as well.

        + +

        We thank Benjamin Lefaudeux, Daniel Haziza and Francisco Massa for their advice on the head dimension influence, as well as Michael Gschwind, Christian Puhrsch and Driss Guessous for their feedback on the blog post!

        + +

        Benchmark reproduction

        + +

        The benchmark presented in this post was done using torch==2.0.0, transformers==4.27.4, accelerate==0.18.0 and optimum==1.8.0.

        + +

        The benchmarks can be easily reproduced using the scripts for inference, training for 🤗 Transformers models, and standalone SDPA.

        + +
diff --git a/blog/overview-of-pytorch-autograd-engine/index.html b/blog/overview-of-pytorch-autograd-engine/index.html
new file mode 100644
index 000000000000..3387674bc9d7
--- /dev/null
+++ b/blog/overview-of-pytorch-autograd-engine/index.html

Overview of PyTorch Autograd Engine | PyTorch

        June 08, 2021

        +

        + Overview of PyTorch Autograd Engine +

        +
        +
        + +
        +
        +
        + +
        +

        + by + + Preferred Networks, Inc. + +

        +

        This blog post is based on PyTorch version 1.8, although it should apply for older versions too, since most of the mechanics have remained constant.

        + +

        To help understand the concepts explained here, it is recommended that you read the awesome blog post by @ezyang: PyTorch internals if you are not familiar with PyTorch architecture components such as ATen or c10d.

        + +

        What is autograd?

        + +

        Background

        + +

        PyTorch computes the gradient of a function with respect to the inputs by using automatic differentiation. Automatic differentiation is a technique that, given a computational graph, calculates the gradients of the inputs. Automatic differentiation can be performed in two different ways; forward and reverse mode. Forward mode means that we calculate the gradients along with the result of the function, while reverse mode requires us to evaluate the function first, and then we calculate the gradients starting from the output. While both modes have their pros and cons, the reverse mode is the de-facto choice since the number of outputs is smaller than the number of inputs, which allows a much more efficient computation. Check [3] to learn more about this.

        + +

        Automatic differentiation relies on a classic calculus formula known as the chain-rule. The chain rule allows us to calculate very complex derivatives by splitting them and recombining them later.

        + +

Formally speaking, given a composite function $f(g(x))$, we can calculate its derivative as $\frac{\partial}{\partial x} f(g(x)) = \frac{\partial f}{\partial g} \cdot \frac{\partial g}{\partial x}$. This result is what makes automatic differentiation work. By combining the derivatives of the simpler functions that compose a larger one, such as a neural network, it is possible to compute the exact value of the gradient at a given point rather than relying on a numerical approximation, which would require multiple perturbations of the input to obtain a value.

        + +

To get the intuition of how the reverse mode works, let's look at a simple function $w = f(x, y) = \log(x \cdot y)$. Figure 1 shows its computational graph, where the inputs x, y on the left flow through a series of operations to generate the output w.

        + +
        + +

        Figure 1: Computational graph of f(x, y) = log(x*y)

        +
        + +

        The automatic differentiation engine will normally execute this graph. It will also extend it to calculate the derivatives of w with respect to the inputs x, y, and the intermediate result v.

        + +

The example function can be decomposed into f and g, where $v = g(x, y) = x \cdot y$ and $w = f(v) = \log(v)$. Every time the engine executes an operation in the graph, the derivative of that operation is added to the graph to be executed later in the backward pass. Note that the engine knows the derivatives of the basic functions.

        + +

In the example above, when multiplying x and y to obtain v, the engine will extend the graph to calculate the partial derivatives of the multiplication by using the multiplication derivative definition that it already knows: $\frac{\partial v}{\partial x} = y$ and $\frac{\partial v}{\partial y} = x$. The resulting extended graph is shown in Figure 2, where the MultDerivative node also multiplies the resulting gradients by an incoming gradient to apply the chain rule; this will be seen explicitly in the following operations. Note that the backward graph (green nodes) will not be executed until all the forward steps are completed.

        + +
        + +

Figure 2: Computational graph extended after executing the multiplication

        +
        + +

Continuing, the engine now calculates the operation $w = \log(v)$ and extends the graph again with the log derivative, which it knows to be $\frac{\partial w}{\partial v} = \frac{1}{v}$. This is shown in Figure 3. This operation generates the result $\frac{\partial w}{\partial v}$ that, when propagated backward and multiplied by the multiplication derivative as the chain rule states, generates the derivatives $\frac{\partial w}{\partial x}$ and $\frac{\partial w}{\partial y}$.

        + +
        + +

        Figure 3: Computational graph extended after executing the logarithm

        +
        + +

The original computation graph is extended with a new dummy variable z that is equal to w. The derivative of z with respect to w is 1, as they are the same variable; this trick allows us to apply the chain rule to calculate the derivatives of the inputs. After the forward pass is complete, we start the backward pass by supplying the initial value of 1.0 for $\frac{\partial z}{\partial w}$. This is shown in Figure 4.

        + +
        + +

        Figure 4: Computational graph extended for reverse auto differentiation


Then, following the green graph, we execute the LogDerivative operation that the automatic differentiation engine introduced, and multiply its result by ∂z/∂w to obtain the gradient ∂z/∂v, as the chain rule states. Next, the multiplication derivative is executed in the same way, and the desired derivatives ∂z/∂x and ∂z/∂y are finally obtained.

        Formally, what we are doing here, and PyTorch autograd engine also does, is computing a Jacobian-vector product (Jvp) to calculate the gradients of the model parameters, since the model parameters and inputs are vectors.


        The Jacobian-vector product


When we calculate the gradient of a vector-valued function (a function whose inputs and outputs are vectors), we are essentially constructing a Jacobian matrix.

        Thanks to the chain rule, multiplying the Jacobian matrix of a function by a vector with the previously calculated gradients of a scalar function results in the gradients of the scalar output with respect to the vector-valued function inputs.


As an example, let’s look at some functions in Python notation to show how the chain rule applies.
def f(x1, x2):
    a = x1 * x2
    y1 = log(a)
    y2 = sin(x2)
    return (y1, y2)

def g(y1, y2):
    return y1 * y2

Now, if we derive this by hand using the chain rule and the definition of the derivatives, we obtain the following set of identities that we can directly plug into the Jacobian matrix of f:

∂y1/∂x1 = 1/x1    ∂y1/∂x2 = 1/x2
∂y2/∂x1 = 0       ∂y2/∂x2 = cos(x2)

Next, let’s consider the gradients for the scalar function g:

∂g/∂y1 = y2    ∂g/∂y2 = y1

        If we now calculate the transpose-Jacobian vector product obeying the chain rule, we obtain the following expression:

∂g/∂x1 = ∂g/∂y1 · ∂y1/∂x1 + ∂g/∂y2 · ∂y2/∂x1 = y2 · (1/x1) + y1 · 0
∂g/∂x2 = ∂g/∂y1 · ∂y1/∂x2 + ∂g/∂y2 · ∂y2/∂x2 = y2 · (1/x2) + y1 · cos(x2)

Evaluating the Jvp for x = [0.5, 0.75] yields the result [1.3633, 0.1912]. We can execute the same expression in PyTorch and calculate the gradient of the input:

>>> import torch
>>> x = torch.tensor([0.5, 0.75], requires_grad=True)
>>> y = torch.log(x[0] * x[1]) * torch.sin(x[1])
>>> y.backward(torch.tensor(1.0))
>>> x.grad
tensor([1.3633, 0.1912])

The result is the same as our hand-calculated Jacobian-vector product! However, PyTorch never constructed the Jacobian matrix, as it could grow prohibitively large; instead, it created a graph of operations that is traversed backward while applying the Jacobian-vector products defined in tools/autograd/derivatives.yaml.

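As a quick check, here is a minimal sketch using torch.autograd.functional.vjp; the helper fg is an assumption introduced only for the example and simply inlines g(f(x1, x2)). It computes the same gradients without ever materializing the Jacobian:

import torch
from torch.autograd.functional import vjp

def fg(x):
    # g(f(x1, x2)) = log(x1 * x2) * sin(x2)
    return torch.log(x[0] * x[1]) * torch.sin(x[1])

x = torch.tensor([0.5, 0.75])
out, grad = vjp(fg, x, v=torch.tensor(1.0))
print(grad)  # tensor([1.3633, 0.1912])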

        Going through the graph


Every time PyTorch executes an operation, the autograd engine constructs the graph to be traversed backward. Reverse mode automatic differentiation starts by adding a scalar variable z = w at the end of the graph so that ∂z/∂w = 1, as we saw in the introduction. This is the initial gradient value that is supplied to the Jvp engine calculation, as we saw in the section above.

In PyTorch, the initial gradient is explicitly set by the user when calling the backward method.

Then, the Jvp calculation starts, but it never constructs the full Jacobian matrix. Instead, when PyTorch records the computational graph, the derivatives of the executed forward operations are added (Backward Nodes). Figure 5 shows a backward graph generated by the execution of the functions f(x1, x2) and g(y1, y2) seen before.

        Figure 5: Computational Graph extended with the backward pass


        Once the forward pass is done, the results are used in the backward pass where the derivatives in the computational graph are executed. The basic derivatives are stored in the tools/autograd/derivatives.yaml file and they are not regular derivatives but the Jvp versions of them [3]. They take their primitive function inputs and outputs as parameters along with the gradient of the function outputs with respect to the final outputs. By repeatedly multiplying the resulting gradients by the next Jvp derivatives in the graph, the gradients up to the inputs will be generated following the chain rule.


        Figure 6: How the chain rule is applied in backward differentiation


Figure 6 represents the process by showing the chain rule. We started with a value of 1.0, as detailed before, which is the already calculated gradient highlighted in green, and we move to the next node in the graph. The backward function registered in derivatives.yaml will calculate the associated ∂w/∂v value highlighted in red and multiply it by ∂z/∂w. By the chain rule this results in ∂z/∂v, which will be the already calculated gradient (green) when we process the next backward node in the graph.

You may also have noticed that in Figure 5 there is a gradient generated from two different sources. When two different functions share an input, the gradients with respect to the output are aggregated for that input, and calculations using that gradient can’t proceed until all the paths have been aggregated together.

        Let’s see an example of how the derivatives are stored in PyTorch.


Suppose that we are currently processing the backward propagation of the log function, in the LogBackward node in Figure 2. The derivative of log in derivatives.yaml is specified as grad.div(self.conj()). grad is the already calculated gradient and self.conj() is the complex conjugate of the input vector. For complex numbers PyTorch calculates a special derivative called the conjugate Wirtinger derivative [6]. This derivative takes the complex number and its conjugate and, by operating some magic that is described in [6], yields the direction of steepest descent when plugged into optimizers.

This code translates to grad · 1/v, the corresponding green and red squares in Figure 3. Continuing, the autograd engine will execute the next operation, the backward of the multiplication. As before, the inputs are the original function’s inputs and the gradient calculated from the backward step. This step will keep repeating until we reach the gradient with respect to the inputs and the computation will be finished. The gradient of x2 is only completed once the multiplication and sin gradients are added together. As you can see, we computed the equivalent of the Jvp but without constructing the matrix.
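A minimal sketch of the same idea with torch.autograd.Function follows, using the real-valued version of the log derivative (grad · 1/x); this is only an illustration, not how the engine registers its backward nodes internally:

import torch

class Log(torch.autograd.Function):
    @staticmethod
    def forward(ctx, x):
        ctx.save_for_backward(x)
        return torch.log(x)

    @staticmethod
    def backward(ctx, grad_output):
        (x,) = ctx.saved_tensors
        # real-valued analogue of grad.div(self.conj()): dL/dx = dL/dy * 1/x
        return grad_output / x

x = torch.tensor([0.375], requires_grad=True)
y = Log.apply(x)
y.backward(torch.ones_like(y))
print(x.grad)  # tensor([2.6667]) == 1 / 0.375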

In the next post we will dive inside PyTorch code to see how this graph is constructed and where the relevant pieces are, should you want to experiment with it!

        References

1. https://pytorch.org/tutorials/beginner/blitz/autograd_tutorial.html
2. https://web.stanford.edu/class/cs224n/readings/gradient-notes.pdf
3. https://www.cs.toronto.edu/~rgrosse/courses/csc321_2018/slides/lec10.pdf
4. https://mustafaghali11.medium.com/how-pytorch-backward-function-works-55669b3b7c62
5. https://indico.cern.ch/event/708041/contributions/3308814/attachments/1813852/2963725/automatic_differentiation_and_deep_learning.pdf
6. https://pytorch.org/docs/stable/notes/autograd.html#complex-autograd-doc
   Recommended: shows why the backprop is formally expressed with the Jacobian
7. https://cs.ubc.ca/~fwood/CS340/lectures/AD1.pdf
diff --git a/blog/path-achieve-low-inference-latency/index.html b/blog/path-achieve-low-inference-latency/index.html
new file mode 100644
index 000000000000..7eee42c27ab1
--- /dev/null
+++ b/blog/path-achieve-low-inference-latency/index.html
@@ -0,0 +1,957 @@

The Path to Achieve Ultra-Low Inference Latency With LLaMA 65B on PyTorch/XLA | PyTorch

by Milad Mohammadi, Jiewen Tan, Liyang Lu, Siyuan Liu, Yeounoh Chung, Wonjoo Lee, Manfei Bai, Steven Krawczyk, Shauheen Zahirazami, Alex Wertheim, Meghan Cowan, Jack Cao, Joe Spisak

        Background & State of the Art


        In the natural language processing (NLP) space, language models are designed to generate a token (e.g. word) using a sequence of past input tokens. Large Language Models (LLMs) are the latest deep learning innovation in this space built to generate text in a human-like fashion. These models generally use transformers to improve their attention over a large sequence of input tokens.


LLaMA, open sourced by Meta AI, is a powerful foundation LLM trained on over 1T tokens. LLaMA is competitive with many best-in-class models such as GPT-3, Chinchilla, and PaLM. LLaMA 13B outperforms GPT-3 (175B), highlighting its ability to extract more compute from each model parameter.

        In this blog post, we use LLaMA as an example model to demonstrate the capabilities of PyTorch/XLA for LLM inference. We discuss how the computation techniques and optimizations discussed here improve inference latency by 6.4x on 65B parameter LLaMA models powered by Google Cloud TPU v4 (v4-16).


        Model Overview


We demonstrate the performance capabilities of PyTorch/XLA on LLaMA, the latest LLM from Meta. We showcase performance optimizations on a series of common LLaMA configurations. Note that a 175B parameter configuration is not available in the public domain; for the 175B parameter model mentioned below, we apply the OPT 175B model configuration to the LLaMA code base. Unless stated otherwise, in all configurations, we use max_seq_len=256 and dtype=bfloat16 for weights and activations.

        Table 1: Model Configurations Explored in this article

LLaMA Model Hyper Parameters

# Parameters | Dimensions | N Heads | N Layers | Max Seq Len
7B           | 4,096      | 32      | 32       | 256
33B          | 6,656      | 52      | 60       | 256
65B          | 8,192      | 64      | 80       | 256
175B         | 12,288     | 96      | 96       | 256

        Performance Challenges of LLMs


LLMs have a few properties that make them challenging for compiler optimizations. (a) LLMs use autoregressive decoding to generate the next token based on the previous ones; this means prompt tensors and caches have dynamic shapes. (b) LLMs must work with variable input prompt lengths without triggering recompilation due to input tensor shape changes; input tensors must be properly bucketized and padded to avoid recompilation. (c) LLMs often require more memory than a single TPU (or GPU) device can support. A model-sharding scheme is required to fit the model across a distributed compute architecture. For instance, a LLaMA model with 65B parameters can fit on a v4-16 Cloud TPU, which is comparable to 8 A100 GPUs. (d) Running LLMs in production can be expensive; one way to improve performance per total cost of ownership (Perf/TCO) is via quantization, which can potentially reduce hardware requirements.

        Inference Tech Stack in PyTorch/XLA


Our goal is to offer the AI community a high performance inference stack. PyTorch/XLA integrates with TorchDynamo, PjRt, OpenXLA, and various model parallelism schemes. TorchDynamo eliminates tracing overhead at runtime, PjRt enables efficient host-device communication, and PyTorch/XLA traceable collectives enable model and data parallelism on LLaMA via TorchDynamo. To try our results, please use our custom torch and torch-xla wheels to reproduce our LLaMA inference solution. PyTorch/XLA 2.1 will support the features discussed in this post by default.

        Parallel Computing


        FairScale Sharding


        LLaMA uses FairScale model sharding API (fairscale.nn.model_parallel.layers). We built an equivalent representation of this API using PyTorch/XLA communication collective (CC) ops such as all-reduce to communicate program state (e.g. activations) between accelerators. TorchDynamo does not fully support capturing CC ops currently (a.k.a. traceable collectives). Without this support, a TorchDynamo FX graph would be cut at every device communication, meaning at every model layer. Graph cuts lead to performance loss as the underlying XLA compiler loses full graph optimization opportunities. To resolve this, we offer PyTorch/XLA traceable collectives by integrating the dispatcher collectives into our existing CC APIs. The difference is we don’t need to insert c10d.wait() ops after collectives, given the lazy execution nature of PyTorch/XLA. With support for traceable collectives, PyTorch/XLA allows singular FX graph generation in TorchDynamo.


        Autoregressive Decoding on PyTorch/XLA


LLMs need autoregressive decoding to feed the previously generated token back as a prompt to predict the next token. Autoregressive decoding leads to unbounded dynamic shape problems, which in turn cause recompilation for every prompt. We optimized the LLaMA autoregressive decoder to operate with fixed shapes, updating the KV-cache, output sequences, and attention masks in place during every token generation. With a combination of padding, masking, and index ops, we avoided excessive graph recompilation, thereby achieving efficient autoregressive decoding.

        KV-Cache Optimization


        LLaMA implements autoregressive decoding with KV-cache. For every generated token, the KV-cache stores the attention key/value activations of each Transformer layer. Thus, upon decoding a new token, the key/values of prior tokens no longer need recomputation.


        In LLaMA, the KV-cache tensor slices are updated in-place; this leads to recompilation events every time a token is generated. To address this issue, we use index tensors and tensor.index_copy() ops to replace the in-place slice updates. Attention masks and output sequences also benefit from the same optimization.
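The idea, shown as a minimal sketch with made-up tensor shapes (not the actual LLaMA cache layout), is to replace the dynamic slice assignment with a fixed-shape index_copy_ along the sequence dimension:

import torch

# hypothetical cache layout: (batch, max_seq_len, n_heads, head_dim)
cache = torch.zeros(1, 256, 8, 64)
new_kv = torch.randn(1, 1, 8, 64)   # key/value activations for the token just generated
pos = 5                             # current decoding position

# dynamic-slice update (slice bounds change every step, causing recompilation):
#   cache[:, pos : pos + 1] = new_kv
# fixed-shape alternative:
index = torch.tensor([pos])
cache.index_copy_(1, index, new_kv)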


        Input Prompt Optimization


        Variable length input prompts are common in LLM applications. This property causes input tensor shape dynamism and in turn recompilation events. When processing a prompt to fill the KV-cache, we either (a) process the input prompt token-by-token, or (b) process the whole prompt in one iteration. The pros and cons of each method are:

1. Pre-compile 1 graph and process a prompt token-by-token
   - Practical: 1 graph is compiled during warm-up
   - Slow: O(L) to process an input prompt of length L - a disadvantage for long prompts
2. Pre-compile all graphs with input lengths ranging from 1 to max_seq_len (e.g. 2,048)
   - Impractical: pre-compile and cache max_seq_len graphs during warm-up time
   - Fast: 1 graph execution to process the full prompt

        We introduce prompt length bucketization, an optimization to strike a balance between the two alternatives. We define a set of ascending bucket sizes, (b0,b1,b2,…,bB-1), and then pre-compile program graphs with input sizes according to these bucket values, (G0,G1,G2,…,GB-1); B is the number of buckets. For a given input prompt, we round up the prompt length to the closest bucket value bn, pad the sequence, and use Gn to process the prompt in one iteration. The computation on the padding tokens is discarded. For prompts larger than the largest bucket size, we process them section-by-section.
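A minimal sketch of this bucketization logic, assuming the bucket values adopted below and a placeholder padding id:

import bisect

BUCKETS = [128, 256, 384, 512]
PAD_ID = 0  # placeholder padding id for illustration

def bucketize(prompt_tokens):
    """Round the prompt up to the nearest bucket, padding the tail;
    prompts longer than the largest bucket are split section-by-section."""
    length = len(prompt_tokens)
    i = bisect.bisect_left(BUCKETS, length)
    if i == len(BUCKETS):
        head, tail = prompt_tokens[:BUCKETS[-1]], prompt_tokens[BUCKETS[-1]:]
        return bucketize(head) + bucketize(tail)
    return [prompt_tokens + [PAD_ID] * (BUCKETS[i] - length)]

chunks = bucketize(list(range(300)))   # one padded chunk of length 384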


        The optimal bucket sizes should be determined by prompt length distribution in a target application. Here, we adopt bucket lengths: 128, 256, 384, 512. Any input prompt with up to 2,047 tokens requires up to 4 graph executions. For example, a 1,500 input prompt with generation length of 256 requires 260 graph executions - 4 to process the input, and 256 to generate the output.


        Quantization


        Quantization reduces the number of bits necessary to represent a value; it reduces the bandwidth to communicate data across multiple accelerator nodes (via collectives) and lowers the hardware requirements to serve a specific model size.


        Normally, with BF16 weights, a 175B parameter model would consume about 351GB of memory, and therefore require a v4-32 instance to accommodate the model. By quantizing the weights to INT8, we reduced the model size by roughly 50%, allowing it to run on a smaller v4-16 instance. Because LLaMA shards model activations, quantization offers negligible communication gain.
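A rough sketch of what weight-only INT8 quantization of a single weight matrix looks like, using per-row scales purely for illustration (not necessarily the exact scheme used in these experiments):

import torch

w_bf16 = torch.randn(4096, 4096, dtype=torch.bfloat16)        # a BF16 weight matrix
scale = w_bf16.float().abs().amax(dim=1, keepdim=True) / 127   # per-output-channel scale
w_int8 = torch.clamp(torch.round(w_bf16.float() / scale), -127, 127).to(torch.int8)

# the INT8 weights are dequantized (or consumed directly by int8-capable units) at matmul time
w_dq = (w_int8.float() * scale).to(torch.bfloat16)

print(w_bf16.numel() * w_bf16.element_size() // 2**20, "MB ->",
      w_int8.numel() * w_int8.element_size() // 2**20, "MB")   # ~50% smaller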


In our experiments, we quantized the linear layers. Since LLaMA model checkpoints are not publicly available and our goal is to evaluate performance, the quantized model is initialized with random weights. Recent literature, such as AWQ and Integer or Floating Point?, offers insights into the performance properties of LLaMA under various low-bit quantization schemes.

        Effect of Batch Size on Quantization Performance


        TPU v4 is programmed to run matmul on the Matrix Multiply Unit (MXU) when the model batch size (BS) > 1. For BS = 1, matmul runs on the Vector Processor Unit (VPU). Since MXU is more efficient than VPU, INT8 quantization gains performance at BS>1. See Performance Analysis section for details.


        Op Support


Occasionally, new models introduce new mathematical operations that require PyTorch/XLA to extend its supported op set for compilation. For LLaMA, we added support for multinomial.

        Methodology


        LLaMA works on PyTorch/XLA out of the box on LazyTensorCore. We use this configuration as a baseline for our follow up analysis. All experiments assume 256-long input prompts. In the absence of a publicly available model checkpoint, we used random tensor initialization for this inference stack optimization effort. A model checkpoint is not expected to change latency results discussed here.


        Model Sizing


        Assuming N is the number of parameters, dimensions is the hidden size, n_layers is the number of layers, n_heads is the number of attention heads, the equation below can be used to approximate the model size. See the Model Overview section for details.

N = (dimensions)^2 * n_layers * 12

        n_heads doesn’t affect N, but the following equation holds for the open sourced model configs.

dim = 128 * n_heads

        Cache Sizing


        Both model parameters and the cache layers in the Attention block contribute to memory consumption. Since the default LLaMA model uses BF16 weights, the memory consumption calculation in this section is based on BF16 weights.


        The size of the cache layer is calculated by cache_size = max_batch_size * max_seq_len * dimensions. max_batch_size = 1 and max_seq_len = 256 are used as an example configuration in the following calculations. There are 2 cache layers in each Attention block. So, the total LLaMA cache size (in Bytes) is total_cache_size = n_layers * 2 * cache_size * (2 bytes).
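Putting the two formulas together, a quick back-of-the-envelope check for the 65B configuration from Table 1 (BF16, 2 bytes per element; a sketch for illustration):

dimensions, n_layers = 8192, 80
max_batch_size, max_seq_len = 1, 256

n_params = dimensions ** 2 * n_layers * 12            # ~64.4e9 parameters (~65B)
param_bytes = n_params * 2                             # ~129 GB of BF16 weights

cache_size = max_batch_size * max_seq_len * dimensions
total_cache_bytes = n_layers * 2 * cache_size * 2      # ~671 MB of KV-cache

print(f"{n_params / 1e9:.1f}B params, {param_bytes / 1e9:.1f} GB weights, "
      f"{total_cache_bytes / 1e6:.0f} MB cache")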


        TPU v4 Hardware Sizing


        Each TPU v4 chip has 32GB of available High-Bandwidth Memory (HBM). Table 2 has the details on memory consumption and the number of required TPU chips to hold a LLaMA model.


        Table 2: LLaMA TPU v4 HBM requirements (i.e. TPU v4 chip requirements)

# Parameters | Parameter (MB) | Cache (MB) | Total (GB) | Min # of TPU v4 Chips
7B           | 14,000         | 134        | 14.128     | 1
33B          | 66,000         | 408        | 66.41      | 3
65B          | 130,000        | 671        | 130.67     | 5
175B         | 350,000        | 1,208      | 351.21     | 11

        Metrics


        Below are useful metrics to measure inference speed. Assuming T is the total time, B is the batch size, L is the decoded sequence length.


        Latency Definition


        Latency is the time it takes to get the decoded result at target length L, regardless of the batch size B. Latency represents how long the user should wait to get the response from the generation model.

Latency = T (s)

        Per-token latency


        One step of autoregressive decoding generates a token for each sample in the batch. Per-token latency is the average time for that one step.

Per-token latency = T / L (s/token)

        Throughput


Throughput measures how many tokens are generated per unit time. While it’s not a useful metric for evaluating online serving, it is useful for measuring the speed of batch processing.

Throughput = B * L / T (tokens/s)

        To minimize confusion and misinterpretation, it’s better to avoid metrics like T / (B * L), which mixes latency and throughput.


        Results


        Figure 1 shows latency / token results for LLaMA 7B to 175B models. In each case, the model is run on a range of TPU v4 configurations. For instance, LLaMA 7B shows 4.7ms/token and 3.8ms/token on v4-8 and v4-16 respectively. For more comparison, visit the HuggingFace LLM performance leaderboard.


In the absence of the features discussed in this blog post, LLaMA 65B running on v4-32 delivers 120ms/token instead of the 14.5ms/token obtained here, an 8.3x speedup. As discussed earlier, developers are encouraged to try our custom torch and torch-xla wheels, which make it possible to reproduce the LLaMA inference results shared here.

        Figure 1: LLaMA Inference Performance on TPU v4 hardware


        PyTorch/XLA:GPU performance is better than PyTorch:GPU eager and similar to PyTorch Inductor. PyTorch/XLA:TPU performance is superior to PyTorch/XLA:GPU. In the near future, XLA:GPU will deliver optimizations that bring parity with XLA:TPU. The single A100 configuration only fits LLaMA 7B, and the 8-A100 doesn’t fit LLaMA 175B.


        Figure 2: LLaMA Inference Performance on GPU A100 hardware


        As the batch size increases, we observe a sublinear increase in per-token latency highlighting the tradeoff between hardware utilization and latency.


        Figure 3: LLaMA Inference Performance across different batch sizes


        Our studies suggest the impact of maximum sequence input length (max_seq_len) on inference latency is relatively minimal. We attribute this to the sequential and iterative nature of token generation. The small difference in performance can be due to KV cache access latency changes as the storage size increases.


        Figure 4: LLaMA Inference Performance across different prompt lengths


LLMs are often memory bound applications; thus, by quantizing model parameters we enable loading and executing a larger tensor on MXUs per unit time (i.e. HBM ⇒ CMEM and CMEM ⇒ MXU data movement). Figure 5 shows that INT8 weight-only quantization offers a 1.6x-1.9x speedup, allowing a larger model to run on a given hardware configuration.

When BS=1, INT8 tensors are dispatched to the VPU, which is smaller than the MXU (see the TPU v4 paper); otherwise, the MXU is used. As a result, when BS=1, quantization memory bandwidth gains are offset by the lack of MXU utilization. When BS>1, however, memory gains deliver superior latency on the quantized model. For example, in the case of the 175B parameter LLaMA, v4-16 with quantization and v4-32 without quantization deliver similar performance. Note we do not provide FP8 comparisons because PyTorch is yet to offer this data type.

        Figure 5: LLaMA Inference Performance vs. weight-only quantization. The missing blue bars suggest the model size doesn’t fit in the specified TPU hardware.


        Figure 6 demonstrates the steady performance advantage of PyTorch/XLA as the input prompt length grows from 10 tokens to 1,500 tokens. This strong scaling capability suggests minimal PyTorch/XLA recompilation events enabling a wide range of real-world applications. In this experiment, the maximum length is 2,048 and maximum generation length is 256.


        Figure 6: LLaMA Inference Performance vs. Input Prompt Length


        Final Thoughts


        We are ecstatic about what’s ahead for PyTorch/XLA and invite the community to join us. PyTorch/XLA is developed fully in open source. So, please file issues, submit pull requests, and send RFCs to GitHub so that we can openly collaborate. You can also try out PyTorch/XLA for yourself on various XLA devices including TPUs and GPUs.


Cheers,
The PyTorch/XLA Team at Google
#PoweredByPyTorch

diff --git a/blog/peak-performance-minimized-memory/index.html b/blog/peak-performance-minimized-memory/index.html
new file mode 100644
index 000000000000..74c8b6a7a70c
--- /dev/null
+++ b/blog/peak-performance-minimized-memory/index.html
@@ -0,0 +1,774 @@

Peak Performance, Minimized Memory: Optimizing torchtune’s performance with torch.compile & Liger Kernel | PyTorch

by LinkedIn and Meta

LinkedIn: Shivam Sahni, Byron Hsu, Yanning Chen
Meta: Ankith Gunapal, Evan Smothers


        This blog explores the integration of a custom triton kernel, Liger Kernel with torch.compile to enhance the performance of fine-tuning large language models (LLMs) using torchtune. torchtune, a PyTorch-native library, offers modular building blocks and customizable finetuning recipes which include torch.compile support for various LLMs, while Liger Kernel provides optimized Triton kernels to improve training efficiency and reduce memory usage. The integration involves modifying the TransformerDecoder module in torchtune to bypass the linear layer computation, allowing the Liger Fused Linear Cross Entropy Loss to handle the forward projection weights. Experiments conducted on an NVIDIA A100 instance demonstrate that torch.compile outperforms PyTorch Eager in throughput and memory efficiency, with Liger Kernel further reducing peak memory allocation and enabling larger batch sizes. The results show a 47% reduction in peak memory at batch size 256 and a marginal increase in throughput with meta-llama/Llama-3.2-1B , confirming the effectiveness of the integration without affecting the loss curves.


        Introduction to torchtune


torchtune is a PyTorch-native library which has been designed for finetuning LLMs. torchtune provides composable and modular building blocks along with finetuning recipes that can be easily customized for your use case, as will be shown in this blog.

torchtune provides:

• PyTorch implementations of popular LLM model architectures from Llama, Gemma, Mistral, Phi, and Qwen model families
• Hackable training recipes for full finetuning, LoRA, QLoRA, DPO, PPO, QAT, knowledge distillation, and more
• Out-of-the-box memory efficiency, performance improvements, and scaling with the latest PyTorch APIs, including torch.compile
• YAML configs for easily configuring training, evaluation, quantization or inference recipes
• Built-in support for many popular dataset formats and prompt templates

        Introduction to Liger Kernel


        Liger Kernel is an open source library of optimized Triton kernels designed to enhance the efficiency and scalability of training Large Language Models (LLMs). It focuses on kernel-level optimizations such as operation fusing and input chunking, achieving significant improvements in training throughput and GPU memory usage compared to existing implementations like those from HuggingFace. By using a single line of code, Liger Kernel can improve training throughput by 20% and reduce memory usage by 60%.


        Fused Linear Cross Entropy


The bulk of Liger Kernel’s performance improvement comes from the Fused Linear Cross Entropy (FLCE) Loss, whose core idea is as follows:

        In LLMs, the vocabulary size has increased significantly, leading to a large logit tensor during cross-entropy (CE) loss computation. This logit tensor consumes excessive memory, causing a bottleneck in training. For example, when training with a batch size of 8 and sequence length of 4096, the 256k vocabulary size results in a 16.8 GB logit tensor. The FLCE kernel breaks down the computation into smaller chunks, reducing memory consumption.


Here’s how it works (a plain-PyTorch sketch follows the list):

1. Flattens the 3D hidden states into a 2D matrix by collapsing the batch size and sequence length dimensions.
2. Applies the linear projection head sequentially on the chunked hidden states.
3. Computes the partial loss and returns the chunked logits gradient using the Liger CE kernel.
4. Derives the chunked hidden states gradients and accumulates the projection head gradients.
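The following is a plain-PyTorch sketch of the chunking idea, written only for illustration: the real FLCE kernel fuses the projection and the loss in Triton, while here the gradients are accumulated manually so the full (N, vocab) logits tensor is never materialized.

import torch
import torch.nn.functional as F

def chunked_linear_ce(hidden, weight, labels, n_chunks=4):
    # hidden: (N, D) flattened hidden states, weight: (V, D) projection head, labels: (N,)
    N = hidden.shape[0]
    loss = hidden.new_zeros(())
    grad_hidden = torch.zeros_like(hidden)
    grad_weight = torch.zeros_like(weight)
    for h, y, gh in zip(hidden.chunk(n_chunks), labels.chunk(n_chunks),
                        grad_hidden.chunk(n_chunks)):
        logits = h @ weight.t()                      # only a (chunk, V) tensor exists at a time
        loss += F.cross_entropy(logits, y, reduction="sum")
        dlogits = torch.softmax(logits, dim=-1)
        dlogits[torch.arange(y.numel()), y] -= 1.0   # softmax - one_hot
        dlogits /= N                                 # gradient of the mean loss
        gh.copy_(dlogits @ weight)                   # chunked hidden-state gradients
        grad_weight += dlogits.t() @ h               # accumulate projection head gradients
    return loss / N, grad_hidden, grad_weight

# toy usage: 8 tokens, hidden size 16, vocabulary of 32
h, w = torch.randn(8, 16), torch.randn(32, 16)
y = torch.randint(0, 32, (8,))
mean_loss, dh, dw = chunked_linear_ce(h, w, y)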

        Torchtune’s recipes provide torch.compile support out of the box. It has been shown that utilizing torch.compile with FLCE makes FLCE 2x faster.


        Integrating Liger Kernel with torch.compile & torchtune


        We demonstrate integration of Liger Kernel with torch.compile & torchtune by running a full fine-tuning recipe with meta-llama/Llama-3.2-1B. To make this integration happen, we have defined a custom full finetuning recipe, the details of the changes are mentioned below.

        CUDA_VISIBLE_DEVICES=0,1,2,3 tune run --nproc_per_node 4 recipes/full_finetune_distributed.py --config llama3_2/1B_full optimizer=torch.optim.AdamW optimizer.fused=True optimizer_in_bwd=False gradient_accumulation_steps=1  dataset.packed=True compile=True enable_activation_checkpointing=True tokenizer.max_seq_len=512  batch_size=128

        One of the inputs to the LCE Kernel is the forward projection weights. torchtune is designed as a modular library with composable blocks. There is a TransformerDecoder block where at the end of the block, we pass the final hidden state through a linear layer to get the final output. Since the linear layer is combined with the CE loss in LCE Kernel, we write a custom forward function for TransformerDecoder where we skip the computation through the linear layer.


        In the full finetuning recipe, we override the model’s forward method with this custom method

import types
from liger_kernel.torchtune.modules.transformers import decoder_forward
self._model.forward = types.MethodType(decoder_forward, self._model)

        We then pass the model’s forward projection weights to calculate the loss with LCE Kernel

from liger_kernel.transformers.fused_linear_cross_entropy import (
    LigerFusedLinearCrossEntropyLoss,
)

# Use LCE loss instead of CE loss
self._loss_fn = LigerFusedLinearCrossEntropyLoss()

# call torch.compile on the loss function
if self._compile:
    training.compile_loss(self._loss_fn, verbose=self._is_rank_zero)

# pass the model's forward projection weights for loss computation
current_loss = (
    self._loss_fn(
        self._model.output.tied_module.weight,
        logits,
        labels,
    )
    * current_num_tokens
)

        The complete code and instructions can be found in the GitHub repo.


        Experiments & Benchmarking Results


        We conduct 3 types of experiments to demonstrate how Liger Kernel integration with torch.compile enhances the performance of torchtune. We set up our experiments on an instance running NVIDIA A100. We fine-tune a small LLM meta-llama/Llama-3.2-1B with differing batch sizes. We record the throughput in terms of tokens/second and measure the peak memory allocated during finetuning. Since it’s a small model, we only use 4 A100 GPUs for the benchmarking. The following are the experiments we conducted:

1. Increase batch_size in powers of 2 with PyTorch eager
2. Increase batch_size in powers of 2 with torch.compile
3. Increase batch_size in powers of 2 with torch.compile & Liger integration

We notice that with PyTorch Eager, throughput increases with increasing batch_size until we hit OOM at batch_size 256. With torch.compile, the throughput is higher than PyTorch Eager for each batch_size, and peak memory allocation is drastically reduced, with more than a 50% reduction in peak memory at batch_size 128. This allows torch.compile to support batch_size 256, and hence the overall throughput with torch.compile is 36% greater than with PyTorch Eager. Integrating Liger Kernel with torch.compile doesn’t drop the throughput at lower batch_size, but with increasing batch_size we notice that torchtune consumes less memory compared to torch.compile alone. At batch_size 256, we see a 47% reduction in peak memory allocation with the Liger kernel. This allows us to use batch_size 512 with torch.compile & Liger, with a marginal 1-2% increase in throughput compared to torch.compile without custom triton kernels.


        Figure 2: Plot of tokens/sec per rank vs batch_size


        Figure 3: Peak memory allocated vs batch_size


        To rule out any potential functional issues with our integration of Liger Kernel with torchtune, we plot the loss curve against training steps with & without Liger. We see that there is no visible difference in the loss curves.


        Figure 4: Plot of loss vs training steps for batch_size=128


        Next Steps


        Acknowledgments


        We thank Hamid Shojanazeri (Meta), Less Wright (Meta), Horace He (Meta) & Gregory Chanan (Meta) for their feedback and support in making this blog post happen.

diff --git a/blog/performance-boost-windows/index.html b/blog/performance-boost-windows/index.html
new file mode 100644
index 000000000000..656fe2525892
--- /dev/null
+++ b/blog/performance-boost-windows/index.html
@@ -0,0 +1,2086 @@

The Path to Achieve PyTorch Performance Boost on Windows CPU | PyTorch

by Intel Corporation

The challenge of PyTorch’s lower CPU performance on Windows compared to Linux has been a significant issue, and multiple factors contribute to this disparity. Through our investigation, we’ve pinpointed two primary causes of poor CPU performance on Windows: the inefficiency of the Windows default malloc memory allocator and the absence of SIMD vectorization optimizations on the Windows platform. In this article, we show how PyTorch CPU performance on Windows has improved from the previous releases and where it stands as of PyTorch 2.4.1.

        Memory Allocation Optimization in PyTorch 2.1.2 and later


        In versions prior to PyTorch 2.1.2, PyTorch relied on the operating system’s default malloc function for memory allocation. The default malloc memory allocation on the Windows platform was less efficient compared to the malloc implementation mechanism on the Linux platform, leading to increased memory allocation times and reduced performance. To address this, we have substituted the default Windows malloc with mimalloc, a more efficient memory allocator developed by Microsoft. This update, included with the release of PyTorch 2.1.2 and later, has significantly enhanced the CPU performance of PyTorch on Windows, as shown in Figure 1.1.



        PyTorch CPU Performance Improvement on Windows with Memory Allocation Optimization


        Figure 1.1: Relative throughput improvement achieved by upgrading from Windows PyTorch version 2.0.1 to 2.1.2 (higher is better).


        The graph illustrates that with the release of PyTorch 2.1.2, there has been a notable enhancement in CPU performance on the Windows platform. The degree of improvement varies across different models, which can be attributed to the diverse mix of operations they perform and their corresponding memory access patterns. While the BERT model shows a modest performance gain, models like ResNet50 and MobileNet-v3 Large benefit from more pronounced improvements.


        On a high-performance CPU, memory allocation becomes a performance bottleneck. This is also why addressing this issue has led to such significant performance improvements.


As shown in the graphs below, PyTorch CPU performance on Windows can be significantly improved. However, there is still a noticeable gap when compared to its performance on Linux. The absence of vectorization optimizations in the Windows variant of PyTorch CPU is a key factor in the remaining performance gap.


        Windows vs Linux Performance on PyTorch 2.0.1


        Figure 1.2: Relative performance of Windows vs Linux with PyTorch version 2.0.1 (higher is better).


        Windows vs Linux Performance on PyTorch 2.1.2


        Figure 1.3: Relative performance of Windows vs Linux with PyTorch version 2.1.2 (higher is better).


        Vectorization Optimization in PyTorch 2.4.1 and later


Prior to PyTorch 2.4.1, the Windows build of PyTorch lacked SIMD vectorization optimizations, a feature that the Linux build leveraged for improved performance. This discrepancy was due to integration issues on Windows with the SLEEF library (a SIMD Library for Evaluating Elementary Functions, providing a vectorized libm and DFT), which is essential for efficient trigonometric calculations. Through a collaborative effort with engineers from ARM and Qualcomm, these challenges were resolved, enabling the integration of SIMD into PyTorch for Windows. The PyTorch 2.4.1 update has thus significantly enhanced PyTorch’s CPU performance on Windows, as shown in Figure 2.1.


PyTorch CPU Performance Improvement on Windows with Vectorization Optimization


        Figure 2.1: Relative throughput improvement achieved by upgrading from PyTorch CPU version 2.1.2 to 2.4.1 (higher is better).


As shown in the graph below, PyTorch CPU performance on Windows has achieved parity with its performance on Linux.


        Windows vs Linux Performance on PyTorch 2.4.1


        Figure 2.2: Relative performance of Windows vs Linux with PyTorch version 2.4.1 (higher is better).


        CONCLUSION


        From PyTorch 2.0.1 to PyTorch 2.4.1, the CPU performance gap between Windows and Linux has been continuously narrowing. We compared the ratio of CPU performance on Windows to CPU performance on Linux across different versions, and the results are shown in the following graph.


        Windows vs Linux Performance on different version of PyTorch


        Figure 3: Performance Ratio for Windows to Linux with different version of PyTorch (higher is better).


        The graph shows that with PyTorch 2.4.1, CPU performance on Windows has nearly converged with that on Linux, and on some models, it has even surpassed Linux. For example, in the case of DistillBERT and RoBERTa models, the CPU performance ratio of Windows to Linux has achieved a remarkable 102%. However, certain models, including MobileNet-v3, still show a performance discrepancy. Intel engineers will continue to collaborate with Meta engineers, to reduce the performance gap of PyTorch CPU between Windows and Linux.


        HOW TO TAKE ADVANTAGE OF THE OPTIMIZATIONS


Install PyTorch 2.4.1 or later for CPU on Windows from the official repository, and you may automatically experience a performance boost from the memory allocation and vectorization optimizations.
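A quick way to confirm which build you are running (a simple sanity check, not an official verification procedure):

import torch

print(torch.__version__)        # expect '2.4.1' or later
print(torch.__config__.show())  # build details, including the CPU capability (e.g. AVX2) in use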


        ACKNOWLEDGMENTS


The results presented in this blog post were achieved through the collaborative effort of the Intel PyTorch team and Meta. We would like to express our sincere gratitude to Xu Han, Jiong Gong, Haozhe Zhu, Mingfei Ma, Chuanqi Wang, Guobing Chen and Eikan Wang. Their expertise and dedication have been instrumental in achieving the optimizations and performance improvements discussed here. Thanks to Jiachen Pu from the community for his participation in the issue discussion and for suggesting the use of mimalloc. We’d also like to express our gratitude to Microsoft for providing such an easily integrated and performant memory allocation library. Thanks to Pierre Blanchard and Nathan Sircombe from ARM and Alex Reinking from Adobe for their contributions in overcoming the compatibility issues with integrating SLEEF into PyTorch for Windows. Finally we want to thank Jing Xu, Weizhuo Zhang and Zhaoqiong Zheng for their contributions to this blog.

        Product and Performance Information


The configurations in the table are collected with svr-info. Tested by Intel on August 30, 2024.

Specification | Configuration1 | Configuration2
Name | ThinkBook 14 G5+ IRH | ThinkBook 14 G5+ IRH
Time | Fri Aug 30 02:43:02 PM UTC 2024 | Fri Aug 30 02:43:02 PM UTC 2024
System | LENOVO | LENOVO
Baseboard | LENOVO | LENOVO
Chassis | LENOVO | LENOVO
CPU Model | 13th Gen Intel(R) Core(TM) i7-13700H | 13th Gen Intel(R) Core(TM) i7-13700H
Microarchitecture | Unknown Intel | Unknown Intel
Sockets | 1 | 1
Cores per Socket | 14 | 14
Hyperthreading | Enabled | Enabled
CPUs | 20 | 20
Intel Turbo Boost | Enabled | Enabled
Base Frequency | 2.4GHz | 2.4GHz
All-core Maximum Frequency | 4.7GHz | 4.7GHz
Maximum Frequency | 4.8GHz | 4.8GHz
NUMA Nodes | 1 | 1
Prefetchers | L2 HW: Enabled, L2 Adj.: Enabled, DCU HW: Enabled, DCU IP: Enabled | L2 HW: Enabled, L2 Adj.: Enabled, DCU HW: Enabled, DCU IP: Enabled
PPINs | - | -
Accelerators | DLB, DSA, IAA, QAT | DLB, DSA, IAA, QAT
Installed Memory | 32GB (8x4GB LPDDR4 7400 MT/s [5200 MT/s]) | 32GB (8x4GB LPDDR4 7400 MT/s [5200 MT/s])
Hugepagesize | 2048kb | 2048kb
Transparent Huge Pages | madvise | madvise
Automatic NUMA Balancing | Disabled | Disabled
NIC | "1. Raptor Lake PCH CNVi WiFi 2. Intel Corporation" | "1. Raptor Lake PCH CNVi WiFi 2. Intel Corporation"
Disk | Micron MTFDKBA512TFH 500G | Micron MTFDKBA512TFH 500G
BIOS | LBCN22WW | LBCN22WW
Microcode | 0x411c | 0x411c
OS | Windows 11 Desktop | Ubuntu 23.10
Kernel | OS Build 19045.4412 | 6.5.0-27-generic
TDP | 200 watts | 200 watts
Power & Perf Policy | Normal Powersave (7) | Normal Powersave (7)
Frequency Governor | performance | performance
Frequency Driver | intel_pstate | intel_pstate
Max C-State | 9 | 9

        Notices and Disclaimers


        Performance varies by use, configuration and other factors. Learn more on the Performance Index site.


        Performance results are based on testing as of dates shown in configurations and may not reflect all publicly available updates. See backup for configuration details. No product or component can be absolutely secure. Your costs and results may vary. Intel technologies may require enabled hardware, software or service activation.


        Intel Corporation. Intel, the Intel logo, and other Intel marks are trademarks of Intel Corporation or its subsidiaries. Other names and brands may be claimed as the property of others.

diff --git a/blog/performance-debugging-of-production-pytorch-models-at-meta/index.html b/blog/performance-debugging-of-production-pytorch-models-at-meta/index.html
new file mode 100644
index 000000000000..15072908948f
--- /dev/null
+++ b/blog/performance-debugging-of-production-pytorch-models-at-meta/index.html
@@ -0,0 +1,787 @@

Performance Debugging of Production PyTorch Models at Meta | PyTorch

by CK Luk, Lei Tian

        1. Meta’s AI Performance Profiling (MAIProf)


Figure 1: A simplified illustration of Meta’s AI performance profiling (MAIProf) infrastructure.


        Figure 1 gives a simplified illustration of the AI performance profiling infrastructure at Meta. ML research and performance engineers submit through the User Portal a profiling request for a training job to the Profiling Service, which subsequently broadcasts the request to all the GPU hosts running the training job. When the Monitoring Daemon on a GPU host receives the profiling request, it will notify the Kineto GPU tracer (built on top of NVIDIA’s libcupti) inside the PyTorch program corresponding to the training job. As a result, Kineto traces will be collected and uploaded to the Object Store asynchronously (in more details: there is one Kineto trace collected for each individual GPU, each is treated and stored as a blob; an example will be given in Section 2). Meanwhile, MAIProf also collects a variety of aggregated performance metrics: the Monitoring Daemon on every GPU host continuously reads performance counters from NVIDIA’s DCGM/NVML and logs them to a Time Series DB.


        Once both trace and metrics collections are completed, the Profiling Service will automatically download traces from the Object Store for trace analysis and performance metrics from the Time Series DB for metric analysis. Finally, an overall profiling report with detailed and insightful analysis is delivered to the user.


        To serve production uses, we deliberately made the following design choices for MAIProf:

• No source-code change required in the PyTorch models: profiling is triggered by sampling the execution of an unmodified model for a user-specified amount of time.
• Provide a holistic view of performance: MAIProf performs system-wide analysis that covers both CPU and GPU. Under the hood, it invokes various CPU tools (e.g., Python tracer, Autograd Observer) and GPU tools (e.g., Kineto, DCGM) and correlates their results.
• Provide multiple tools that target a wide range of AI practitioners: At Meta, there are engineers with different backgrounds who may need to tune their AI workload performance. Some of them are AI experts while others are general software engineers. Therefore, MAIProf provides a variety of tools for different levels of performance debugging, from high-level automatic trace comprehension to low-level trace analysis.
• Support distributed GPU profiling: MAIProf can collect profiling data from multiple hosts, each with multiple GPUs. It then shows a combined view/analysis of the entire system.
• Highly scalable: MAIProf is built as a service on top of existing infrastructures in Meta data centers, such as a scalable storage system called Manifold. Its profiling capability can be easily scaled by adding more machines to the service pool as workloads increase.

        2. Case Study: Optimizing a Protection PyTorch Model


        To be concrete, we use a case study on a protection PyTorch model used in production. First, we discuss our steps for identifying the performance bottlenecks in the model with MAIProf. Then we describe the corresponding optimizations applied and their impacts.


        2.1 Performance Bottlenecks


        Step 1:


        Inspect the CPU and GPU utilization on the same timeline, as shown in Figure 2.


Figure 2: CPU usage over time (the top) vs. GPU usage over time (the bottom).


        The first performance anomaly we noticed in Figure 2 is the pattern: “GPU-idle, GPU-active, GPU-idle, GPU-active …” throughout the training. Overall, the GPU is idle for more than half of the training time (this is bad for performance because the GPU is a higher-performance device and so we want it to be utilized as much as possible).


        Step 2:


        Collect a Python function call trace on the CPU with MAIProf while the GPU is idle, which is shown in Figure 3.


Figure 3: A Python call trace.


        The Python trace shows that most of the CPU time is spent inside a Python function sharded_iterrows(). From the source code of the model, we learned that this function processes a big feature table in parallel. The number of worker threads used is controlled by a configurable parameter (num_worker_threads). Also, after investigating how the feature table is generated, we understood the performance anomaly: the training dataset is too large to fit in the CPU memory all at once; it needs to be broken into multiple sub-datasets, each has sufficient data for running 10 epochs. Consequently, a new sub-dataset needs to be read from the disk to memory every 10 epochs, during which the GPU is totally idle.


        Step 3:


        Collect GPU performance metrics, which is shown in Figure 4.


Figure 4: GPU performance metrics in MAIProf.


        We made the following observations from Figure 4:

        + +
          +
        • The streaming multiprocessor (SM) runs the model’s CUDA kernels. Its utilization [1] is 9.1%, indicating that the parallel compute units on the GPU are not well utilized.
        • +
        • Tensor Core utilization is 0, meaning that Tensor Core (the mixed-precision compute unit on GPU) [2] is not used at all.
        • +
        • Max GPU memory utilization is 47.13%, indicating that half of the GPU memory is left unused.
        • +
        + +

Step 4:

Collect a GPU trace (aka Kineto trace) of the training loop, as shown in Figure 5.

Figure 5: A GPU trace (aka Kineto trace) of the training loop.

Since commonly used PyTorch functions are already annotated, their names are automatically shown on the trace. With them, we can roughly divide the trace into the four phases of a training iteration: (1) data loading, (2) forward pass, (3) backward pass, and (4) gradient optimization. (Note: in Figure 5, the “optimizer” phase is from the previous batch, while the other three phases are from the current batch.)
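MAIProf collects such traces automatically, but a comparable GPU trace of an unmodified training loop can also be captured with torch.profiler, which uses Kineto under the hood. The sketch below is only an illustration: the tiny model, optimizer, and in-memory dataset are stand-ins for the production workload.

import torch
from torch.profiler import profile, schedule, ProfilerActivity, tensorboard_trace_handler

# Toy stand-ins for the real model and data loader
model = torch.nn.Linear(32, 10).cuda()
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
data = [(torch.randn(64, 32), torch.randint(0, 10, (64,))) for _ in range(8)]

prof = profile(
    activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],   # record CPU ops and CUDA kernels
    schedule=schedule(wait=1, warmup=1, active=3),              # skip one step, warm up one, record three
    on_trace_ready=tensorboard_trace_handler("./traces"),       # write a Kineto/Chrome trace per cycle
)
with prof:
    for inputs, labels in data:
        optimizer.zero_grad()
        loss = torch.nn.functional.cross_entropy(model(inputs.cuda()), labels.cuda())
        loss.backward()
        optimizer.step()
        prof.step()   # advance the profiling schedule

The resulting trace can be opened in TensorBoard or chrome://tracing and shows the same data-loading, forward, backward, and optimizer phases discussed above.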

2.2 Optimizations

We performed four simple optimizations that target the bottlenecks identified above, each requiring only a change in a config parameter or at most a few source lines. They are listed in Figure 6.

Optimization | Amount of changes | Bottlenecks addressed
Tune num_worker_threads by trying a few possible values within the number of CPU cores on each host. | 1 source line | GPU totally idle time
Double the batch sizes | 2 config parameters | GPU memory under-utilization
Use automatic mixed precision in PyTorch | 13 source lines | Zero Tensor Core utilization
Use multi-tensor optimizer in PyTorch | 1 source line | Many small GPU kernels in the optimizer

Figure 6: Four simple optimizations applied.
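To make the last two optimizations concrete, the sketch below shows how automatic mixed precision and a multi-tensor (foreach) optimizer are typically enabled in PyTorch. It is a minimal illustration, not the production protection model: the model, data, and hyperparameters are placeholders, and foreach=True selects the multi-tensor optimizer implementation available in recent PyTorch releases.

import torch

model = torch.nn.Linear(1024, 1024).cuda()
# foreach=True asks for the multi-tensor optimizer implementation (fewer, larger GPU kernels)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3, foreach=True)
scaler = torch.cuda.amp.GradScaler()  # keeps fp16 gradients representable

for _ in range(10):  # stand-in for the real training loop
    inputs = torch.randn(512, 1024, device="cuda")
    targets = torch.randn(512, 1024, device="cuda")
    optimizer.zero_grad(set_to_none=True)
    with torch.cuda.amp.autocast():   # run the forward pass in mixed precision (engages Tensor Cores)
        loss = torch.nn.functional.mse_loss(model(inputs), targets)
    scaler.scale(loss).backward()     # backward on the scaled loss
    scaler.step(optimizer)            # unscale gradients and apply the update
    scaler.update()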

3. Concluding Remarks

Performance tuning for PyTorch in production environments is increasingly important. A capable performance-debugging tool is key to this process. We demonstrate with a case study on a production model that MAIProf is a powerful infrastructure for identifying optimization opportunities.

At Meta, MAIProf has been used by hundreds of engineers, from performance novices to experts, to identify many more types of bottlenecks. These include slow data loading, small and/or slow GPU kernels, and distributed training issues such as load imbalance and excessive communication. MAIProf covers major classes of models, including recommendation, vision, and natural language processing. In summary, it is now an indispensable tool for tuning the performance of production PyTorch workloads.

References

[1] https://docs.nvidia.com/gameworks/content/developertools/desktop/analysis/report/cudaexperiments/kernellevel/achievedoccupancy.htm

[2] https://www.nvidia.com/en-us/data-center/tensor-cores/

diff --git a/blog/performant-distributed-checkpointing/index.html b/blog/performant-distributed-checkpointing/index.html
new file mode 100644
index 000000000000..a285314e775e
--- /dev/null
+++ b/blog/performant-distributed-checkpointing/index.html

Performant Distributed checkpointing in Production with IBM | PyTorch
by Meta: Iris Zhang, Less Wright, Rodrigo Kumpera, Chien-Chin Huang; IBM: Davis Wertheimer, Supriyo Chakraboty, Sophia Wen, Raghu Ganti, Mudhakar Srivatsa, Seethrami Seelam

(Chart: Params saved per minute)

        Last year, IBM Research began collaborating with us to onboard Fully Sharded Data Parallelism (FSDP) for their large foundation models. They became interested as FSDP is a PyTorch native offering for scaling their distributed training efforts on IBM Cloud.

        + +

        We are pleased to share that, in collaboration with IBM, we have achieved substantial checkpointing speedups for large models (72x vs the original PyTorch 1.13 save speed), proven model and optimizer checkpoint scaling to 30B parameters, and enabled cloud first training using FSDP + Distributed Checkpoint on S3 backends.

        + +

        What is a Distributed Checkpoint?

        + +

        Distributed checkpointing is the PyTorch native solution for saving and loading PyTorch models and optimizer states from multiple ranks, as well as supporting dynamically changing world sizes between reloads.

        + +

(Chart: Checkpoint time vs. model params)

        + +

        PyTorch Distributed Checkpoint (DCP) APIs were introduced in PyTorch 1.13, and are included as an official prototype feature in PyTorch 2.0.

        + +

Distributed checkpoint is different from torch.save() and torch.load() in a few significant ways:

1. DCP produces multiple files per checkpoint, with at least one file per rank.
2. DCP operates in place, meaning that the model should allocate its data first, and Distributed Checkpoint will then use that storage.

A major improvement from 1.13 to 2.0 is the addition of sharded_state_dict support for checkpointing FSDP models. This allows checkpointing of larger models, as well as load-time resharding. Load-time resharding enables saving with one cluster topology and loading into another. This feature was highly requested, as it allows a training job to be run on one cluster, saved, and then continued on a different cluster with a different world size.
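As an illustration of how sharded_state_dict and DCP fit together, the sketch below is based on the PyTorch 2.0-era torch.distributed.checkpoint API; the checkpoint directory and the FSDP-wrapped model are placeholders, not code from this collaboration.

import torch.distributed.checkpoint as dist_cp
from torch.distributed.fsdp import FullyShardedDataParallel as FSDP, StateDictType

CHECKPOINT_DIR = "checkpoint/"  # placeholder location

def save_checkpoint(model: FSDP) -> None:
    # Each rank materializes only its own shards of the model.
    with FSDP.state_dict_type(model, StateDictType.SHARDED_STATE_DICT):
        state_dict = {"model": model.state_dict()}
    # Each rank writes its shards; DCP coordinates the checkpoint metadata.
    dist_cp.save_state_dict(
        state_dict=state_dict,
        storage_writer=dist_cp.FileSystemWriter(CHECKPOINT_DIR),
    )

def load_checkpoint(model: FSDP) -> None:
    with FSDP.state_dict_type(model, StateDictType.SHARDED_STATE_DICT):
        # Allocate the (possibly resharded) state dict in place first,
        # then let DCP fill it from storage.
        state_dict = {"model": model.state_dict()}
        dist_cp.load_state_dict(
            state_dict=state_dict,
            storage_reader=dist_cp.FileSystemReader(CHECKPOINT_DIR),
        )
        model.load_state_dict(state_dict["model"])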

        + +

        Another major change is that we decouple the storage layer from the checkpoint planning layer and separate implementation from the interface for both layers. With this change, users can now specify how their state_dict should be chunked or transformed during the checkpoint planning phase. Additionally, the customizable storage layer can easily accommodate different backends.

        + +

        More information on the Distributed Checkpoint package can be found here.

        + +

        Performant Distributed checkpointing in Production with IBM

        + +

        IBM at Think 2023 announced its watsonx.ai platform for development and deployment of foundation models for the enterprise. Built on Hybrid Cloud, the platform enables use cases across multiple modalities such as NLP, timeseries, weather, chemistry, tabular data, and cybersecurity, with model sizes from 100s of millions to 10s of billions of parameters. Model architectures range from vision transformers, to multi-modal RoBERTa-style feature extractors, to large-scale generative language models similar to T5, GPT and Llama.

        + +

        As of today, IBM has now enabled checkpointing for T5-style architectures up to 11B parameters, and decoder architectures (GPT style) up to 30B.

        + +

IBM helped us identify that producing many small files per rank limits the scaling power of DCP from both memory and performance standpoints. Based on their suggestion, we enhanced our FileSystemWriter to produce a single checkpoint file per rank, reducing read/write overhead.

        + +

With this option as the new default, DCP now creates a single file per rank during checkpoint saving, which is then sliced when parameters are read at load time.

        + +

By combining sharded_state_dict support with the single-file-per-rank writer, distributed checkpoint was able to accelerate checkpoint saving time by over 72x vs. the original PyTorch 1.13 save speed, and to enable rapid checkpointing for model sizes over 15B parameters, which would previously simply time out.

        + +

        “Looking back, it’s really astounding the speedups we’ve seen, handling training for many of these models. We went from taking almost half an hour to write a single 11B checkpoint in PyTorch 1.13, to being able to handle a 30B parameter model, with optimizer and dataloader state - so that’s over eight times the raw data - in just over 3 minutes. That’s done wonders for both the stability and efficiency of our jobs, as we scale up training to hundreds of gpus.” – Davis Wertheimer, IBM Research

        + +

IBM’s adoption has also helped us validate and improve our solutions in a real-world, large-scale training environment. As an example, IBM discovered that DCP was working well for them on a single node with multiple GPUs, but errored out when used across multiple nodes.

        + +

Upon investigating the issue, we realized that we had been assuming writes to an NFS-like shared file system with strong read-after-write consistency. Object stores with file system APIs such as S3FS provide only eventual consistency, causing distributed checkpointing in such a setting to fail. Working together with IBM, we identified this issue and fixed it with a one-line code change, enabling object storage backends for DCP! Such storage approaches are typically an order of magnitude cheaper than shared file systems, thus enabling finer-grained checkpointing.

        + +

        Looking for Collaboration

        + +

        If you are interested in trying Distributed Checkpoint, feel free to reach out to us!

        + +

If you run into any issues when trying it, you can open an issue at our GitHub repo.

        + +

        Acknowledgements

        + +

        This project would not have been possible without the assistance from many collaborators. We would like to thank Yanli Zhao, Andrew Gu, Rohan Varma for their support of FSDP. Thanks to Pritam Damania, Junjie Zhao, and Wanchao Liang for their support of ShardedTensor.

diff --git a/blog/pipetransformer-automated-elastic-pipelining/index.html b/blog/pipetransformer-automated-elastic-pipelining/index.html
new file mode 100644
index 000000000000..f91a9b07a9bb
--- /dev/null
+++ b/blog/pipetransformer-automated-elastic-pipelining/index.html

PipeTransformer: Automated Elastic Pipelining for Distributed Training of Large-scale Models | PyTorch
by Chaoyang He, Shen Li, Mahdi Soltanolkotabi, and Salman Avestimehr

        In this blog post, we describe the first peer-reviewed research paper that explores accelerating the hybrid of PyTorch DDP (torch.nn.parallel.DistributedDataParallel) [1] and Pipeline (torch.distributed.pipeline) - PipeTransformer: Automated Elastic Pipelining for Distributed Training of Large-scale Models (Transformers such as BERT [2] and ViT [3]), published at ICML 2021.

        + +

        PipeTransformer leverages automated elastic pipelining for efficient distributed training of Transformer models. In PipeTransformer, we designed an adaptive on-the-fly freeze algorithm that can identify and freeze some layers gradually during training and an elastic pipelining system that can dynamically allocate resources to train the remaining active layers. More specifically, PipeTransformer automatically excludes frozen layers from the pipeline, packs active layers into fewer GPUs, and forks more replicas to increase data-parallel width. We evaluate PipeTransformer using Vision Transformer (ViT) on ImageNet and BERT on SQuAD and GLUE datasets. Our results show that compared to the state-of-the-art baseline, PipeTransformer attains up to 2.83-fold speedup without losing accuracy. We also provide various performance analyses for a more comprehensive understanding of our algorithmic and system-wise design.

        + +

        Next, we will introduce the background, motivation, our idea, design, and how we implement the algorithm and system with PyTorch Distributed APIs.

        + + + +

        Introduction

Figure 1: The Parameter Number of Transformer Models Increases Dramatically.

Large Transformer models [4][5] have powered accuracy breakthroughs in both natural language processing and computer vision. GPT-3 [4] hit a new record high accuracy for nearly all NLP tasks. Vision Transformer (ViT) [3] also achieved 89% top-1 accuracy in ImageNet, outperforming state-of-the-art convolutional networks ResNet-152 and EfficientNet. To tackle the growth in model sizes, researchers have proposed various distributed training techniques, including parameter servers [6][7][8], pipeline parallelism [9][10][11][12], intra-layer parallelism [13][14][15], and zero redundancy data-parallel [16].

        + +

        Existing distributed training solutions, however, only study scenarios where all model weights are required to be optimized throughout the training (i.e., computation and communication overhead remains relatively static over different iterations). Recent works on progressive training suggest that parameters in neural networks can be trained dynamically:

        + +
• Freeze Training: Singular Vector Canonical Correlation Analysis for Deep Learning Dynamics and Interpretability. NeurIPS 2017
• Efficient Training of BERT by Progressively Stacking. ICML 2019
• Accelerating Training of Transformer-Based Language Models with Progressive Layer Dropping. NeurIPS 2020
• On the Transformer Growth for Progressive BERT Training. NAACL 2021

Figure 2. Interpretable Freeze Training: DNNs converge bottom-up (results on CIFAR10 using ResNet). Each pane shows layer-by-layer similarity using SVCCA [17][18].

        + +

        For example, in freeze training [17][18], neural networks usually converge from the bottom-up (i.e., not all layers need to be trained all the way through training). Figure 2 shows an example of how weights gradually stabilize during training in this approach. This observation motivates us to utilize freeze training for distributed training of Transformer models to accelerate training by dynamically allocating resources to focus on a shrinking set of active layers. Such a layer freezing strategy is especially pertinent to pipeline parallelism, as excluding consecutive bottom layers from the pipeline can reduce computation, memory, and communication overhead.

        + +

Figure 3. The process of PipeTransformer’s automated and elastic pipelining to accelerate distributed training of Transformer models.

        + +

        We propose PipeTransformer, an elastic pipelining training acceleration framework that automatically reacts to frozen layers by dynamically transforming the scope of the pipelined model and the number of pipeline replicas. To the best of our knowledge, this is the first paper that studies layer freezing in the context of both pipeline and data-parallel training. Figure 3 demonstrates the benefits of such a combination. First, by excluding frozen layers from the pipeline, the same model can be packed into fewer GPUs, leading to both fewer cross-GPU communications and smaller pipeline bubbles. Second, after packing the model into fewer GPUs, the same cluster can accommodate more pipeline replicas, increasing the width of data parallelism. More importantly, the speedups acquired from these two benefits are multiplicative rather than additive, further accelerating the training.

        + +

        The design of PipeTransformer faces four major challenges. First, the freeze algorithm must make on-the-fly and adaptive freezing decisions; however, existing work [17][18] only provides a posterior analysis tool. Second, the efficiency of pipeline re-partitioning results is influenced by multiple factors, including partition granularity, cross-partition activation size, and the chunking (the number of micro-batches) in mini-batches, which require reasoning and searching in a large solution space. Third, to dynamically introduce additional pipeline replicas, PipeTransformer must overcome the static nature of collective communications and avoid potentially complex cross-process messaging protocols when onboarding new processes (one pipeline is handled by one process). Finally, caching can save time for repeated forward propagation of frozen layers, but it must be shared between existing pipelines and newly added ones, as the system cannot afford to create and warm up a dedicated cache for each replica.

        + +

Figure 4: An Animation to Show the Dynamics of PipeTransformer.

        + +

        As shown in the animation (Figure 4), PipeTransformer is designed with four core building blocks to address the aforementioned challenges. First, we design a tunable and adaptive algorithm to generate signals that guide the selection of layers to freeze over different iterations (Freeze Algorithm). Once triggered by these signals, our elastic pipelining module (AutoPipe), then packs the remaining active layers into fewer GPUs by taking both activation sizes and variances of workloads across heterogeneous partitions (frozen layers and active layers) into account. It then splits a mini-batch into an optimal number of micro-batches based on prior profiling results for different pipeline lengths. Our next module, AutoDP, spawns additional pipeline replicas to occupy freed-up GPUs and maintains hierarchical communication process groups to attain dynamic membership for collective communications. Our final module, AutoCache, efficiently shares activations across existing and new data-parallel processes and automatically replaces stale caches during transitions.

        + +

Overall, PipeTransformer combines the Freeze Algorithm, AutoPipe, AutoDP, and AutoCache modules to provide a significant training speedup. We evaluate PipeTransformer using Vision Transformer (ViT) on ImageNet and BERT on GLUE and SQuAD datasets. Our results show that PipeTransformer attains up to 2.83-fold speedup without losing accuracy. We also provide various performance analyses for a more comprehensive understanding of our algorithmic and system-wise design. Finally, we have also developed open-source flexible APIs for PipeTransformer, which offer a clean separation among the freeze algorithm, model definitions, and training accelerations, allowing for transferability to other algorithms that require similar freezing strategies.

        + +

        Overall Design

        + +

        Suppose we aim to train a massive model in a distributed training system where the hybrid of pipelined model parallelism and data parallelism is used to target scenarios where either the memory of a single GPU device cannot hold the model, or if loaded, the batch size is small enough to avoid running out of memory. More specifically, we define our settings as follows:

        + +

Training task and model definition. We train Transformer models (e.g., Vision Transformer, BERT) on large-scale image or text datasets. The Transformer model \mathcal{F} has L layers, in which the i-th layer is composed of a forward computation function f_i and a corresponding set of parameters.

        + +

        Training infrastructure. Assume the training infrastructure contains a GPU cluster that has N GPU servers (i.e. nodes). Each node has I GPUs. Our cluster is homogeneous, meaning that each GPU and server have the same hardware configuration. Each GPU’s memory capacity is M_\text{GPU}. Servers are connected by a high bandwidth network interface such as InfiniBand interconnect.

        + +

Pipeline parallelism. In each machine, we load a model \mathcal{F} into a pipeline \mathcal{P} which has K partitions (K also represents the pipeline length). The k-th partition p_k consists of consecutive layers. We assume each partition is handled by a single GPU device. 1 \leq K \leq I, meaning that we can build multiple pipelines for multiple model replicas in a single machine. We assume all GPU devices in a pipeline belong to the same machine. Our pipeline is a synchronous pipeline, which does not involve stale gradients, and the number of micro-batches is M. In the Linux OS, each pipeline is handled by a single process. We refer the reader to GPipe [10] for more details.

Data parallelism. DDP is a cross-machine distributed data-parallel process group within R parallel workers. Each worker is a pipeline replica (a single process). The r-th worker’s index (ID) is rank r. For any two pipelines in DDP, they can belong to either the same GPU server or different GPU servers, and they can exchange gradients with the AllReduce algorithm.

        + +

Under these settings, our goal is to accelerate training by leveraging freeze training, which does not require all layers to be trained throughout the duration of the training. Additionally, it may help save computation, communication, and memory cost, and potentially prevent overfitting by consecutively freezing layers. However, these benefits can only be achieved by overcoming the four challenges of designing an adaptive freezing algorithm, dynamic pipeline re-partitioning, efficient resource reallocation, and cross-process caching, as discussed in the introduction.

        + +

Figure 5. Overview of PipeTransformer Training System.

        + +

        PipeTransformer co-designs an on-the-fly freeze algorithm and an automated elastic pipelining training system that can dynamically transform the scope of the pipelined model and the number of pipeline replicas. The overall system architecture is illustrated in Figure 5. To support PipeTransformer’s elastic pipelining, we maintain a customized version of PyTorch Pipeline. For data parallelism, we use PyTorch DDP as a baseline. Other libraries are standard mechanisms of an operating system (e.g.,multi-processing) and thus avoid specialized software or hardware customization requirements. To ensure the generality of our framework, we have decoupled the training system into four core components: freeze algorithm, AutoPipe, AutoDP, and AutoCache. The freeze algorithm (grey) samples indicators from the training loop and makes layer-wise freezing decisions, which will be shared with AutoPipe (green). AutoPipe is an elastic pipeline module that speeds up training by excluding frozen layers from the pipeline and packing the active layers into fewer GPUs (pink), leading to both fewer cross-GPU communications and smaller pipeline bubbles. Subsequently, AutoPipe passes pipeline length information to AutoDP (purple), which then spawns more pipeline replicas to increase data-parallel width, if possible. The illustration also includes an example in which AutoDP introduces a new replica (purple). AutoCache (orange edges) is a cross-pipeline caching module, as illustrated by connections between pipelines. The source code architecture is aligned with Figure 5 for readability and generality.

        + +

        Implementation Using PyTorch APIs

        + +

As can be seen from Figure 5, PipeTransformer contains four components: Freeze Algorithm, AutoPipe, AutoDP, and AutoCache. Among them, AutoPipe and AutoDP rely on PyTorch Pipeline (torch.distributed.pipeline) and DDP (torch.nn.parallel.DistributedDataParallel) [1], respectively. In this blog, we only highlight the key implementation details of AutoPipe and AutoDP. For details of the Freeze Algorithm and AutoCache, please refer to our paper.

        + +

        AutoPipe: Elastic Pipelining

        + +

        AutoPipe can accelerate training by excluding frozen layers from the pipeline and packing the active layers into fewer GPUs. This section elaborates on the key components of AutoPipe that dynamically 1) partition pipelines, 2) minimize the number of pipeline devices, and 3) optimize mini-batch chunk size accordingly.

        + +

        Basic Usage of PyTorch Pipeline

        + +

Before diving into details of AutoPipe, let us warm up with the basic usage of PyTorch Pipeline (torch.distributed.pipeline.sync.Pipe, see this tutorial). More specifically, we present a simple example to understand the design of Pipeline in practice:

        + +
# Required imports for this example
import os
import torch
import torch.nn as nn
from torch.distributed import rpc
from torch.distributed.pipeline.sync import Pipe

# Pipe relies on the RPC framework internally; initialize it for a single local worker
os.environ.setdefault("MASTER_ADDR", "localhost")
os.environ.setdefault("MASTER_PORT", "29500")
rpc.init_rpc("worker", rank=0, world_size=1)

# Step 1: build a model including two linear layers
fc1 = nn.Linear(16, 8).cuda(0)
fc2 = nn.Linear(8, 4).cuda(1)

# Step 2: wrap the two layers with nn.Sequential
model = nn.Sequential(fc1, fc2)

# Step 3: build Pipe (torch.distributed.pipeline.sync.Pipe)
model = Pipe(model, chunks=8)

# do training/inference
input = torch.rand(16, 16).cuda(0)
output_rref = model(input)

In this basic example, we can see that before initializing Pipe, we need to partition the model nn.Sequential into multiple GPU devices and set an optimal chunk number (chunks). Balancing computation time across partitions is critical to pipeline training speed, as skewed workload distributions across stages can lead to stragglers, forcing devices with lighter workloads to wait. The chunk number may also have a non-trivial influence on the throughput of the pipeline.

        + +

        Balanced Pipeline Partitioning

        + +

In a dynamic training system such as PipeTransformer, maintaining optimally balanced partitions in terms of parameter numbers does not guarantee the fastest training speed because other factors also play a crucial role:

        + +

Figure 6. The partition boundary is in the middle of a skip connection.

        + +
1. Cross-partition communication overhead. Placing a partition boundary in the middle of a skip connection leads to additional communications since tensors in the skip connection must now be copied to a different GPU. For example, with BERT partitions in Figure 6, partition k must take intermediate outputs from both partition k-2 and partition k-1. In contrast, if the boundary is placed after the addition layer, the communication overhead between partition k-1 and k is visibly smaller. Our measurements show that having cross-device communication is more expensive than having slightly imbalanced partitions (see the Appendix in our paper). Therefore, we do not consider breaking skip connections (highlighted separately as an entire attention layer and MLP layer in green at line 7 in Algorithm 1).

2. Frozen layer memory footprint. During training, AutoPipe must recompute partition boundaries several times to balance two distinct types of layers: frozen layers and active layers. The frozen layer’s memory cost is a fraction of that of the active layer, given that the frozen layer does not need backward activation maps, optimizer states, and gradients. Instead of launching intrusive profilers to obtain thorough metrics on memory and computational cost, we define a tunable cost factor \lambda_{\text{frozen}} to estimate the memory footprint ratio of a frozen layer over the same active layer. Based on empirical measurements in our experimental hardware, we set it to \frac{1}{6}.


Based on the above two considerations, AutoPipe balances pipeline partitions based on parameter sizes. More specifically, AutoPipe uses a greedy algorithm to allocate all frozen and active layers so that partitioned sublayers are evenly distributed into K GPU devices. Pseudocode is described as the load_balance() function in Algorithm 1. The frozen layers are extracted from the original model and kept in a separate model instance \mathcal{F}_{\text{frozen}} in the first device of a pipeline.
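As a rough illustration (a simplified sketch, not the authors' load_balance() from Algorithm 1), consecutive layers can be bucketed into K partitions by cumulative parameter cost, discounting frozen layers by the \lambda_{\text{frozen}} factor discussed above; the per-layer parameter counts and frozen flags are assumed inputs.

from typing import List

def load_balance(layer_params: List[int], frozen: List[bool],
                 K: int, lambda_frozen: float = 1.0 / 6.0) -> List[List[int]]:
    """Greedily pack consecutive layers into K partitions of roughly equal discounted cost."""
    costs = [p * (lambda_frozen if f else 1.0) for p, f in zip(layer_params, frozen)]
    total = sum(costs) or 1.0
    partitions: List[List[int]] = [[] for _ in range(K)]
    cum = 0.0
    for idx, c in enumerate(costs):
        # the running (discounted) cost decides which of the K equal-cost buckets
        # this layer falls into; layers remain consecutive within each partition
        k = min(int(cum / total * K), K - 1)
        partitions[k].append(idx)
        cum += c
    return partitions

# Example: 12 equally sized layers, first 4 frozen, split across K=4 GPUs
print(load_balance([10_000_000] * 12, [True] * 4 + [False] * 8, K=4))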

        + +

        Note that the partition algorithm employed in this paper is not the only option; PipeTransformer is modularized to work with any alternatives.

        + +

        Pipeline Compression

        + +

        Pipeline compression helps to free up GPUs to accommodate more pipeline replicas and reduce the number of cross-device communications between partitions. To determine the timing of compression, we can estimate the memory cost of the largest partition after compression, and then compare it with that of the largest partition of a pipeline at timestep T=0. To avoid extensive memory profiling, the compression algorithm uses the parameter size as a proxy for the training memory footprint. Based on this simplification, the criterion of pipeline compression is as follows:

        + +

(Equation (1): the pipeline compression criterion.)

        + +

        Once the freeze notification is received, AutoPipe will always attempt to divide the pipeline length K by 2 (e.g., from 8 to 4, then 2). By using \frac{K}{2} as the input, the compression algorithm can verify if the result satisfies the criterion in Equation (1). Pseudocode is shown in lines 25-33 in Algorithm 1. Note that this compression makes the acceleration ratio exponentially increase during training, meaning that if a GPU server has a larger number of GPUs (e.g., more than 8), the acceleration ratio will be further amplified.
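To make the criterion concrete, here is a hedged sketch (not the authors' implementation) of the halving check; partition_param_sizes is a hypothetical helper that returns per-partition parameter counts for a candidate pipeline length.

from typing import Callable, List

def maybe_compress(K: int, initial_max_partition_params: int,
                   partition_param_sizes: Callable[[int], List[int]]) -> int:
    """On a freeze notification, try halving the pipeline length; accept the halving
    only if the largest resulting partition (by parameter size, used as a memory proxy)
    stays within the largest partition size observed at timestep T=0."""
    candidate = max(K // 2, 1)
    if candidate < K and max(partition_param_sizes(candidate)) <= initial_max_partition_params:
        return candidate
    return K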

        + +

Figure 7. Pipeline Bubble: F_{d,b}, B_{d,b}, and U_d denote forward, backward, and the optimizer update of micro-batch b on device d, respectively. The total bubble size in each iteration is (K-1) times the per-micro-batch forward and backward cost.

        + +

Additionally, such a technique can also speed up training by shrinking the size of pipeline bubbles. To explain bubble sizes in a pipeline, Figure 7 depicts how 4 micro-batches run through a 4-device pipeline (K = 4). In general, the total bubble size is (K-1) times the per-micro-batch forward and backward cost. Therefore, it is clear that shorter pipelines have smaller bubble sizes.

        + +

        Dynamic Number of Micro-Batches

        + +

Prior pipeline parallel systems use a fixed number of micro-batches per mini-batch (M). GPipe suggests M \geq 4 \times K, where K is the number of partitions (pipeline length). However, given that PipeTransformer dynamically configures K, we find it to be sub-optimal to maintain a static M during training. Moreover, when integrated with DDP, the value of M also has an impact on the efficiency of DDP gradient synchronization. Since DDP must wait for the last micro-batch to finish its backward computation on a parameter before launching its gradient synchronization, finer micro-batches lead to a smaller overlap between computation and communication. Hence, instead of using a static value, PipeTransformer searches for the optimal M on the fly in the hybrid DDP environment by enumerating M values ranging from K to 6K. For a specific training environment, the profiling needs only to be done once (see Algorithm 1, line 35). A simplified sketch of this search is shown below.
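The following sketch only illustrates the enumeration described above; measure_throughput is a hypothetical callback that would rebuild Pipe(model, chunks=m) and time a few iterations, and is not part of PipeTransformer's API.

from typing import Callable

def search_optimal_chunks(K: int, measure_throughput: Callable[[int], float]) -> int:
    """Enumerate candidate micro-batch counts M in [K, 6K] and keep the fastest."""
    best_m, best_tput = K, float("-inf")
    for m in range(K, 6 * K + 1):
        tput = measure_throughput(m)  # samples/second measured with chunks=m
        if tput > best_tput:
            best_m, best_tput = m, tput
    return best_m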

        + +

        For the complete source code, please refer to https://github.com/Distributed-AI/PipeTransformer/blob/master/pipe_transformer/pipe/auto_pipe.py.

        + +

        AutoDP: Spawning More Pipeline Replicas

        +

        As AutoPipe compresses the same pipeline into fewer GPUs, AutoDP can automatically spawn new pipeline replicas to increase data-parallel width.

        + +

        Despite the conceptual simplicity, subtle dependencies on communications and states require careful design. The challenges are threefold:

        + +
1. DDP Communication: Collective communications in PyTorch DDP require static membership, which prevents new pipelines from connecting with existing ones;

2. State Synchronization: newly activated processes must be consistent with existing pipelines in the training progress (e.g., epoch number and learning rate), weights and optimizer states, the boundary of frozen layers, and pipeline GPU range;

3. Dataset Redistribution: the dataset should be re-balanced to match a dynamic number of pipelines. This not only avoids stragglers but also ensures that gradients from all DDP processes are equally weighted.

Figure 8. AutoDP: handling dynamic data parallelism with messaging between double process groups (processes 0-7 belong to machine 0, while processes 8-15 belong to machine 1).

        + +

To tackle these challenges, we create double communication process groups for DDP. As in the example shown in Figure 8, the message process group (purple) is responsible for light-weight control messages and covers all processes, while the active training process group (yellow) only contains active processes and serves as a vehicle for heavy-weight tensor communications during training. The message group remains static, whereas the training group is dismantled and reconstructed to match active processes. In T0, only processes 0 and 8 are active. During the transition to T1, process 0 activates processes 1 and 9 (newly added pipeline replicas) and synchronizes the necessary information mentioned above using the message group. The four active processes then form a new training group, allowing static collective communications to adapt to dynamic memberships. To redistribute the dataset, we implement a variant of DistributedSampler that can seamlessly adjust data samples to match the number of active pipeline replicas.

        + +

        The above design also naturally helps to reduce DDP communication overhead. More specifically, when transitioning from T0 to T1, processes 0 and 1 destroy the existing DDP instances, and active processes construct a new DDP training group using a cached pipelined model (AutoPipe stores frozen model and cached model separately).

        + +

        We use the following APIs to implement the design above.

        + +
import torch.distributed as dist
from torch.distributed import Backend
from torch.nn.parallel import DistributedDataParallel as DDP
from datetime import timedelta

# initialize the process group (this must be called in the initialization of PyTorch DDP)
dist.init_process_group(init_method='tcp://' + str(self.config.master_addr) + ':' +
    str(self.config.master_port), backend=Backend.GLOO, rank=self.global_rank, world_size=self.world_size)
...

# create active process group (yellow color)
self.active_process_group = dist.new_group(ranks=self.active_ranks, backend=Backend.NCCL, timeout=timedelta(days=365))
...

# create message process group (purple color)
self.comm_broadcast_group = dist.new_group(ranks=[i for i in range(self.world_size)], backend=Backend.GLOO, timeout=timedelta(days=365))
...

# create a DDP-enabled model when the number of data-parallel workers changes. Note:
# 1. process_group is the group used for distributed data all-reduce.
#    If None, the default process group, created by torch.distributed.init_process_group, is used.
#    In our case, we set it to self.active_process_group.
# 2. device_ids should be set when the pipeline length = 1 (the model resides on a single CUDA device).

self.pipe_len = gpu_num_per_process
if gpu_num_per_process > 1:
    model = DDP(model, process_group=self.active_process_group, find_unused_parameters=True)
else:
    model = DDP(model, device_ids=[self.local_rank], process_group=self.active_process_group, find_unused_parameters=True)

# to broadcast messages among processes, we use dist.broadcast_object_list
def dist_broadcast(object_list, src, group):
    """Broadcasts a given object to all parties."""
    dist.broadcast_object_list(object_list, src, group=group)
    return object_list

        For the complete source code, please refer to https://github.com/Distributed-AI/PipeTransformer/blob/master/pipe_transformer/dp/auto_dp.py.

        + +

        Experiments

        + +

        This section first summarizes experiment setups and then evaluates PipeTransformer using computer vision and natural language processing tasks.

        + +

        Hardware. Experiments were conducted on 2 identical machines connected by InfiniBand CX353A (5GB/s), where each machine is equipped with 8 NVIDIA Quadro RTX 5000 (16GB GPU memory). GPU-to-GPU bandwidth within a machine (PCI 3.0, 16 lanes) is 15.754GB/s.

        + +

        Implementation. We used PyTorch Pipe as a building block. The BERT model definition, configuration, and related tokenizer are from HuggingFace 3.5.0. We implemented Vision Transformer using PyTorch by following its TensorFlow implementation. More details can be found in our source code.

        + +

        Models and Datasets. Experiments employ two representative Transformers in CV and NLP: Vision Transformer (ViT) and BERT. ViT was run on an image classification task, initialized with pre-trained weights on ImageNet21K and fine-tuned on ImageNet and CIFAR-100. BERT was run on two tasks, text classification on the SST-2 dataset from the General Language Understanding Evaluation (GLUE) benchmark, and question answering on the SQuAD v1.1 Dataset (Stanford Question Answering), which is a collection of 100k crowdsourced question/answer pairs.

        + +

Training Schemes. Given that large models normally would require thousands of GPU-days (e.g., GPT-3) if trained from scratch, fine-tuning downstream tasks using pre-trained models has become a trend in the CV and NLP communities. Moreover, PipeTransformer is a complex training system that involves multiple core components. Thus, for the first version of PipeTransformer system development and algorithmic research, it is not cost-efficient to develop and evaluate from scratch using large-scale pre-training. Therefore, the experiments presented in this section focus on pre-trained models. Note that since the model architectures in pre-training and fine-tuning are the same, PipeTransformer can serve both. We discuss pre-training results in the Appendix.

        + +

        Baseline. Experiments in this section compare PipeTransformer to the state-of-the-art framework, a hybrid scheme of PyTorch Pipeline (PyTorch’s implementation of GPipe) and PyTorch DDP. Since this is the first paper that studies accelerating distributed training by freezing layers, there are no perfectly aligned counterpart solutions yet.

        + +

        Hyper-parameters. Experiments use ViT-B/16 (12 transformer layers, 16 \times 16 input patch size) for ImageNet and CIFAR-100, BERT-large-uncased (24 layers) for SQuAD 1.1, and BERT-base-uncased (12 layers) for SST-2. With PipeTransformer, ViT and BERT training can set the per-pipeline batch size to around 400 and 64, respectively. Other hyperparameters (e.g., epoch, learning rate) for all experiments are presented in Appendix.

        + +

        Overall Training Acceleration

(Table: Overall training acceleration results.)

We summarize the overall experimental results in the table above. Note that the speedup we report is based on a conservative \alpha (\frac{1}{3}) value that can obtain comparable or even higher accuracy. A more aggressive \alpha (\frac{2}{5}, \frac{1}{2}) can obtain a higher speedup but may lead to a slight loss in accuracy. Note that the model size of BERT (24 layers) is larger than ViT-B/16 (12 layers), thus it takes more time for communication.

        + +

        Performance Analysis

        + +

        Speedup Breakdown

        + +

This section presents evaluation results and analyzes the performance of different components in AutoPipe. More experimental results can be found in the Appendix.

        + +

Figure 9. Speedup Breakdown (ViT on ImageNet).

        + +

        To understand the efficacy of all four components and their impacts on training speed, we experimented with different combinations and used their training sample throughput (samples/second) and speedup ratio as metrics. Results are illustrated in Figure 9. Key takeaways from these experimental results are:

        + +
1. the main speedup is the result of elastic pipelining, which is achieved through the joint use of AutoPipe and AutoDP;
2. AutoCache’s contribution is amplified by AutoDP;
3. freeze training alone, without system-wise adjustment, even degrades the training speed.

        Tuning \alpha in Freezing Algorithm

        + +

Figure 10. Tuning \alpha in the Freezing Algorithm.

        + +

We ran experiments to show how \alpha in the freeze algorithm influences training speed. The results clearly demonstrate that a larger \alpha (excessive freezing) leads to a greater speedup but suffers from a slight performance degradation. In the case shown in Figure 10, where \alpha=1/5, freeze training outperforms normal training and obtains a 2.04-fold speedup. We provide more results in the Appendix.

        + +

        Optimal Chunks in the elastic pipeline

        + +

Figure 11. Optimal chunk number in the elastic pipeline.

        + +

        We profiled the optimal number of micro-batches M for different pipeline lengths K. Results are summarized in Figure 11. As we can see, different K values lead to different optimal M, and the throughput gaps across different M values are large (as shown when K=8), which confirms the necessity of an anterior profiler in elastic pipelining.

        + +

        Understanding the Timing of Caching

        + +

Figure 12. The timing of caching.

        + +

To evaluate AutoCache, we compared the sample throughput of a training job that activates AutoCache from epoch 0 (blue) with a training job without AutoCache (red). Figure 12 shows that enabling caching too early can slow down training, as caching can be more expensive than the forward propagation on a small number of frozen layers. After more layers are frozen, caching activations clearly outperforms the corresponding forward propagation. As a result, AutoCache uses a profiler to determine the proper timing to enable caching. In our system, for ViT (12 layers), caching starts from 3 frozen layers, while for BERT (24 layers), caching starts from 5 frozen layers.

        + +

        For more detailed experimental analysis, please refer to our paper.

        + +

Summary

        +

        This blog introduces PipeTransformer, a holistic solution that combines elastic pipeline-parallel and data-parallel for distributed training using PyTorch Distributed APIs. More specifically, PipeTransformer incrementally freezes layers in the pipeline, packs remaining active layers into fewer GPUs, and forks more pipeline replicas to increase the data-parallel width. Evaluations on ViT and BERT models show that compared to the state-of-the-art baseline, PipeTransformer attains up to 2.83× speedups without accuracy loss.

        + +

References

        + +

        [1] Li, S., Zhao, Y., Varma, R., Salpekar, O., Noordhuis, P., Li,T., Paszke, A., Smith, J., Vaughan, B., Damania, P., et al. Pytorch Distributed: Experiences on Accelerating Dataparallel Training. Proceedings of the VLDB Endowment,13(12), 2020

        + +

        [2] Devlin, J., Chang, M. W., Lee, K., and Toutanova, K. BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding. In NAACL-HLT, 2019

        + +

        [3] Dosovitskiy, A., Beyer, L., Kolesnikov, A., Weissenborn, D., Zhai, X., Unterthiner, T., Dehghani, M., Minderer, M., Heigold, G., Gelly, S., et al. An image is Worth 16x16 words: Transformers for Image Recognition at Scale.

        + +

        [4] Brown, T. B., Mann, B., Ryder, N., Subbiah, M., Kaplan, J., Dhariwal, P., Neelakantan, A., Shyam, P., Sastry, G., Askell, A., et al. Language Models are Few-shot Learners.

        + +

        [5] Lepikhin, D., Lee, H., Xu, Y., Chen, D., Firat, O., Huang, Y., Krikun, M., Shazeer, N., and Chen, Z. Gshard: Scaling Giant Models with Conditional Computation and Automatic Sharding.

        + +

        [6] Li, M., Andersen, D. G., Park, J. W., Smola, A. J., Ahmed, A., Josifovski, V., Long, J., Shekita, E. J., and Su, B. Y. Scaling Distributed Machine Learning with the Parameter Server. In 11th {USENIX} Symposium on Operating Systems Design and Implementation ({OSDI} 14), pp. 583–598, 2014.

        + +

        [7] Jiang, Y., Zhu, Y., Lan, C., Yi, B., Cui, Y., and Guo, C. A Unified Architecture for Accelerating Distributed DNN Training in Heterogeneous GPU/CPU Clusters. In 14th USENIX Symposium on Operating Systems Design and Implementation (OSDI 20), pp. 463–479. USENIX Association, November 2020. ISBN 978-1-939133-19- 9.

        + +

        [8] Kim, S., Yu, G. I., Park, H., Cho, S., Jeong, E., Ha, H., Lee, S., Jeong, J. S., and Chun, B. G. Parallax: Sparsity-aware Data Parallel Training of Deep Neural Networks. In Proceedings of the Fourteenth EuroSys Conference 2019, pp. 1–15, 2019.

        + +

        [9] Kim, C., Lee, H., Jeong, M., Baek, W., Yoon, B., Kim, I., Lim, S., and Kim, S. TorchGPipe: On-the-fly Pipeline Parallelism for Training Giant Models.

        + +

        [10] Huang, Y., Cheng, Y., Bapna, A., Firat, O., Chen, M. X., Chen, D., Lee, H., Ngiam, J., Le, Q. V., Wu, Y., et al. Gpipe: Efficient Training of Giant Neural Networks using Pipeline Parallelism.

        + +

        [11] Park, J. H., Yun, G., Yi, C. M., Nguyen, N. T., Lee, S., Choi, J., Noh, S. H., and ri Choi, Y. Hetpipe: Enabling Large DNN Training on (whimpy) Heterogeneous GPU Clusters through Integration of Pipelined Model Parallelism and Data Parallelism. In 2020 USENIX Annual Technical Conference (USENIX ATC 20), pp. 307–321. USENIX Association, July 2020. ISBN 978-1-939133- 14-4.

        + +

        [12] Narayanan, D., Harlap, A., Phanishayee, A., Seshadri, V., Devanur, N. R., Ganger, G. R., Gibbons, P. B., and Zaharia, M. Pipedream: Generalized Pipeline Parallelism for DNN Training. In Proceedings of the 27th ACM Symposium on Operating Systems Principles, SOSP ’19, pp. 1–15, New York, NY, USA, 2019. Association for Computing Machinery. ISBN 9781450368735. doi: 10.1145/3341301.3359646.

        + +

        [13] Lepikhin, D., Lee, H., Xu, Y., Chen, D., Firat, O., Huang, Y., Krikun, M., Shazeer, N., and Chen, Z. Gshard: Scaling Giant Models with Conditional Computation and Automatic Sharding.

        + +

        [14] Shazeer, N., Cheng, Y., Parmar, N., Tran, D., Vaswani, A., Koanantakool, P., Hawkins, P., Lee, H., Hong, M., Young, C., Sepassi, R., and Hechtman, B. Mesh-Tensorflow: Deep Learning for Supercomputers. In Bengio, S., Wallach, H., Larochelle, H., Grauman, K., Cesa-Bianchi, N., and Garnett, R. (eds.), Advances in Neural Information Processing Systems, volume 31, pp. 10414–10423. Curran Associates, Inc., 2018.

        + +

        [15] Shoeybi, M., Patwary, M., Puri, R., LeGresley, P., Casper, J., and Catanzaro, B. Megatron-LM: Training Multi-billion Parameter Language Models using Model Parallelism.

        + +

        [16] Rajbhandari, S., Rasley, J., Ruwase, O., and He, Y. ZERO: Memory Optimization towards Training a Trillion Parameter Models.

        + +

        [17] Raghu, M., Gilmer, J., Yosinski, J., and Sohl Dickstein, J. Svcca: Singular Vector Canonical Correlation Analysis for Deep Learning Dynamics and Interpretability. In NIPS, 2017.

        + +

        [18] Morcos, A., Raghu, M., and Bengio, S. Insights on Representational Similarity in Neural Networks with Canonical Correlation. In Bengio, S., Wallach, H., Larochelle, H., Grauman, K., Cesa-Bianchi, N., and Garnett, R. (eds.), Advances in Neural Information Processing Systems 31, pp. 5732–5741. Curran Associates, Inc., 2018.

diff --git a/blog/prototype-features-now-available-apis-for-hardware-accelerated-mobile-and-arm64-builds/index.html b/blog/prototype-features-now-available-apis-for-hardware-accelerated-mobile-and-arm64-builds/index.html
new file mode 100644
index 000000000000..2c0b4fe55474
--- /dev/null
+++ b/blog/prototype-features-now-available-apis-for-hardware-accelerated-mobile-and-arm64-builds/index.html

Prototype Features Now Available - APIs for Hardware Accelerated Mobile and ARM64 Builds | PyTorch
by Team PyTorch

        Today, we are announcing four PyTorch prototype features. The first three of these will enable Mobile machine-learning developers to execute models on the full set of hardware (HW) engines making up a system-on-chip (SOC). This gives developers options to optimize their model execution for unique performance, power, and system-level concurrency.

        + +

These features include enabling execution on the following on-device HW engines:

• DSP and NPUs using the Android Neural Networks API (NNAPI), developed in collaboration with Google
• GPU execution on Android via Vulkan
• GPU execution on iOS via Metal

        This release also includes developer efficiency benefits with newly introduced support for ARM64 builds for Linux.

        + +

        Below, you’ll find brief descriptions of each feature with the links to get you started. These features are available through our nightly builds. Reach out to us on the PyTorch Forums for any comment or feedback. We would love to get your feedback on those and hear how you are using them!

        + +

        NNAPI Support with Google Android

        + +

        The Google Android and PyTorch teams collaborated to enable support for Android’s Neural Networks API (NNAPI) via PyTorch Mobile. Developers can now unlock high-performance execution on Android phones as their machine-learning models will be able to access additional hardware blocks on the phone’s system-on-chip. NNAPI allows Android apps to run computationally intensive neural networks on the most powerful and efficient parts of the chips that power mobile phones, including DSPs (Digital Signal Processors) and NPUs (specialized Neural Processing Units). The API was introduced in Android 8 (Oreo) and significantly expanded in Android 10 and 11 to support a richer set of AI models. With this integration, developers can now seamlessly access NNAPI directly from PyTorch Mobile. This initial release includes fully-functional support for a core set of features and operators, and Google and Facebook will be working to expand capabilities in the coming months.

Links

PyTorch Mobile GPU support

Inferencing on GPU can provide great performance for many model types, especially those utilizing high-precision floating-point math. Leveraging the GPU for ML model execution, such as the GPUs found in SoCs from Qualcomm, MediaTek, and Apple, allows for CPU offload, freeing up the mobile CPU for non-ML use cases. This initial prototype-level support for on-device GPUs is provided via the Metal API specification for iOS and the Vulkan API specification for Android. As this feature is in an early stage, performance is not optimized and model coverage is limited. We expect this to improve significantly over the course of 2021, and we would like to hear from you which models and devices you would like to see performance improvements on.

Links

ARM64 Builds for Linux

We will now provide prototype-level PyTorch builds for ARM64 devices on Linux, as we see more ARM usage in our community with platforms such as Raspberry Pis and Graviton(2) instances, spanning the edge and servers respectively. This feature is available through our nightly builds.

        + +

        We value your feedback on these features and look forward to collaborating with you to continuously improve them further!

        + +

        Thank you,

        + +

        Team PyTorch

diff --git a/blog/pt-day-china-2025-cfp/index.html b/blog/pt-day-china-2025-cfp/index.html
new file mode 100644
index 000000000000..389deb82124e
--- /dev/null
+++ b/blog/pt-day-china-2025-cfp/index.html

PyTorch Day China 2025 Call for Proposals Open | PyTorch

        + by + + Team PyTorch + +

        +

        We’re excited to announce the first-ever PyTorch Day China! This new event, hosted by the PyTorch Foundation, will take place on June 7 in Beijing, China, bringing together AI practitioners, researchers, and industry professionals to explore the latest advancements in open source AI and machine learning. Co-located with the BAAI Conference, PyTorch Day China is a chance to connect with the community, share knowledge, and help shape the future of deep learning.

PyTorch Day China 2025 Call for Proposals Open

Why Submit a Proposal?

        PyTorch Day China offers a platform for AI practitioners and researchers to showcase their work, exchange ideas, and connect with others in the community. If you’re working on innovative applications, tools, or research in the PyTorch ecosystem, we encourage you to share your expertise.

Topics for Submission:

• AI Applications and Use Cases
• Core PyTorch Framework
• DL Compilers and Kernel Authoring
• Edge AI and On-Device
• Ethical AI, Governance, and Regulation
• Generative AI and Large Language Models (LLMs) with PyTorch
• Open Source Collaboration, Education, and Community Building
• Optimization for Training and Inference
• PyTorch on Accelerator Hardware
• PyTorch Ecosystem and Tools
• PyTorch in Research and Academia
• Performance Measurement and Benchmarking
• Scaling Training and Inference

        The submission deadline is April 13. Submit and learn more here: https://www.lfasiallc.com/pytorch-day-china/call-for-proposals-cfp/

Why Attend?

        PyTorch Day China will feature technical talks, discussions, and poster sessions that highlight real-world applications and developments in AI and machine learning. Attendees will have the opportunity to learn from experts, contribute to the open source community, and engage with fellow PyTorch users. Registration information will be available in April.

Event Details

• Date: June 7, 2025
• Location: Zhongguancun Exhibition Center, Beijing, China
• Address: 索家坟, Hai Dian Qu, Bei Jing Shi, China, 100080
• Co-located with: BAAI Conference

        Travel Information


        The venue, Zhongguancun Exhibition Center, is approximately 39 km from Beijing International Airport. More details on travel and accommodation will be available on the BAAI Conference website and updated here as they become available.

Have Questions?

        For inquiries, please contact pytorchevents@linuxfoundation.org.


        Submit your proposal by April 13 and join the conversation shaping the future of PyTorch.

diff --git a/blog/pt-day-france-cfp/index.html b/blog/pt-day-france-cfp/index.html new file mode 100644 index 000000000000..82b961b9b3c1 --- /dev/null +++ b/blog/pt-day-france-cfp/index.html @@ -0,0 +1,691 @@

PyTorch Day France 2025: Call For Proposals Open | PyTorch

by Team PyTorch

        We’re pleased to announce PyTorch Day France 2025, a dedicated gathering of the PyTorch community held 7 May 2025 in Paris, France. Proudly hosted by the PyTorch Foundation and co-located with GOSIM AI Paris 2025, this event will bring together developers, researchers, and practitioners driving innovation in open source AI and machine learning.


        Whether you’re building cutting-edge models or contributing to the ecosystem, PyTorch Day France is your opportunity to connect, collaborate, and help shape the future of deep learning.

PT Day CFP

Why Attend?

        Set in the vibrant atmosphere of STATION F, the world’s largest startup campus, PyTorch Day France will offer a full day of:

• Insightful Technical Talks
• Interactive Discussions
• Engaging Poster Sessions

        The event is designed to foster open exchange across the PyTorch ecosystem, providing a space to learn from peers, share practical insights, and explore the latest research and applications in AI.

Submit a Proposal

        We are currently accepting proposals for talks. If you have a project, idea, or research story you’d like to share with the PyTorch community, we want to hear from you.


        📩 Email your talk title and abstract to pytorchevents@linuxfoundation.org for consideration.

Registration

        To register for PyTorch Day France, please visit the GOSIM AI Paris website, and use the code PYTORCHFRIEND to receive 25% off.


        👉 https://paris2025.gosim.org/


        We encourage early registration to secure your spot and ensure access to both PyTorch Day France and the broader GOSIM AI Paris programming.

Venue

STATION F
5 Parv. Alan Turing, 75013 Paris, France
A landmark of innovation and entrepreneurship in the heart of Paris.

Travel and Accommodations

        Participants are responsible for their own travel and lodging. For those arriving internationally, Paris Charles de Gaulle Airport is approximately 38.4 km from STATION F. Additional information about accommodations and transportation may be available on the GOSIM AI Paris website.

Questions?

        For any inquiries, please contact us at pytorchevents@linuxfoundation.org.


        We look forward to welcoming the PyTorch community to Paris this May for a day of collaboration, learning, and open source AI innovation.

diff --git a/blog/pt-day-france-featured-sessions/index.html b/blog/pt-day-france-featured-sessions/index.html new file mode 100644 index 000000000000..2e0b9a3a62e6 --- /dev/null +++ b/blog/pt-day-france-featured-sessions/index.html @@ -0,0 +1,685 @@

PyTorch Day France Featured Sessions: A Defining Moment for Open Source AI | PyTorch

by Team PyTorch

        PyTorch Day France offers a front-row seat to the future of open source AI. Taking place 7 May at Station F in Paris and co-located with GOSIM AI Paris, this one-day event will bring together developers, researchers, and industry leaders for a day of technical sessions, real-world insights, and community exchange.

🌍 A Major Milestone for the PyTorch Foundation

        This event marks the very first PyTorch Day, launching a new international series hosted annually in different regions to convene AI researchers, developers, engineers, and enthusiasts. PyTorch Days are designed to spotlight open source AI advancements, foster community collaboration, and provide a forum to learn about active, high-impact AI projects built using PyTorch.


        PyTorch Day France also represents a pivotal moment in the PyTorch Foundation’s journey. With its recent expansion into an umbrella foundation, PyTorch is now positioned to support a broader ecosystem of trusted, community-driven AI projects across the full AI lifecycle.


        At PyTorch Day France, you’ll hear directly from PyTorch Foundation Executive Director, Matt White, about this transition—and get a first look at some exciting announcements.

🎟️ Registration Details

        Register now with code PYTORCH for free access to the full day of PyTorch Day France sessions, plus GOSIM AI Paris.

🔗 Two events, one registration—double the sessions, double the innovation.
Register here

The day’s agenda includes deep technical dives and applied AI use cases from across the community.

View the full schedule.


        Whether you’re a contributor, practitioner, or simply curious about what’s ahead, PyTorch Day France is an opportunity to connect with the community and shape what’s next for our ecosystem.

diff --git a/blog/pt-executorch-ethos-u85/index.html b/blog/pt-executorch-ethos-u85/index.html new file mode 100644 index 000000000000..6808b1a0fb9a --- /dev/null +++ b/blog/pt-executorch-ethos-u85/index.html @@ -0,0 +1,728 @@

Getting started with PyTorch, ExecuTorch, and Ethos-U85 in three easy steps | PyTorch

by Robert Elliott, Fredrik Knutsson, and Mark Quartermain

        ExecuTorch support for Ethos-U85

In the rapidly evolving landscape of machine learning, PyTorch has emerged as a leading framework for model development, given its flexibility and comprehensive ecosystem. Arm has worked with Meta to introduce support for Arm platforms in ExecuTorch, which further simplifies this process, making it seamless to deploy PyTorch models on edge devices.

The Arm Ethos-U85 NPU is the highest-performing Ethos NPU, addressing the growing demand for running advanced AI inference workloads at the edge, including transformer-based networks like LLMs. Arm offers reference designs around the Ethos-U, including the Corstone-320 IoT reference design platform, to accelerate and simplify the chip development cycle. The reference design platform includes, among other components, a Fixed Virtual Platform (FVP) that simulates an entire system, enabling cutting-edge embedded software development and neural network deployment for the Ethos-U85.

Today, Arm is extending support for developers building IoT edge applications by supporting the ExecuTorch beta on Ethos-U85. Leveraging ExecuTorch, developers can now efficiently deploy their natively developed PyTorch models to enable intelligent and responsive IoT solutions built on Arm.

With this package now available, the thousands of developers looking to create Edge AI applications can start their model and application development months before the platforms arrive on the market.

Getting started with ExecuTorch on Ethos-U85

        A full development environment has been provided in the public ExecuTorch GitHub repository. This provides an integrated and tested development flow with all necessary components.


        The three simple steps are:

1. Set up ExecuTorch
2. Set up the Arm Build environment
3. Compile and Run models on the arm_executor_runner

        You can then build on this flow for compiling and running models, to capture runtime behavior from the Ethos-U85 driver, such as cycle count information.


        To make the process easier for end users, we have also added scripts to the ExecuTorch repository:

1. Set up ExecuTorch
2. setup.sh: Download the necessary software.
3. run.sh: Compile and run the model on the Corstone-320 FVP.

To build other models, you can use the ahead-of-time compiler script aot_arm_compiler.py, which compiles a PyTorch program (an nn.Module) into an ExecuTorch program (a .pte flatbuffer file). To write custom applications that use ExecuTorch, you can follow the application flow in the example executor_runner application.

We support approximately 40 core ATen operators and already support end-to-end deployment of models such as MobileNetV2. Ongoing efforts to support further operators will enable more PyTorch models every week.

As more functionality is added, it will be demonstrated through the tutorial materials for Ethos-U on pytorch.org.

How this deployment flow works in more detail

        Leveraging the extensibility of ExecuTorch and the expressiveness of Arm’s Tensor Operator Set Architecture (TOSA), we have enabled Ethos-U support in ExecuTorch. The Ethos-U compiler, Vela, has been enhanced with a TOSA front-end, making it possible to compile models for all products in the Ethos-U family. Combining these components into a cohesive workflow involves the following steps.

1. Convert a PyTorch model into a deployable ExecuTorch program (AOT flow)
2. Compile the ExecuTorch program into an executable, which can be deployed on Corstone-320 (runtime flow)

        The ExecuTorch Ahead of time (AOT) flow


        The process begins by converting a PyTorch model into a quantized TOSA representation using the PyTorch dynamo export flow. This allows us to generate an Ethos-U set of machine instructions, known as a command stream, utilizing the Vela compiler TOSA frontend. The command stream is bundled into an ExecuTorch program, represented by a flatbuffer file (.pte). This file contains everything the ExecuTorch runtime needs to perform inference using Ethos-U hardware.
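
As a very rough sketch of this AOT step from Python (quantization is omitted for brevity, and the Arm partitioner import below is an illustrative placeholder rather than a confirmed API; the authoritative flow is the aot_arm_compiler.py script and the Arm examples in the ExecuTorch repository):

import torch
from torch.export import export
from executorch.exir import to_edge

# Placeholder name: the real Arm/TOSA partitioner and its compile spec live under
# executorch.backends.arm in the ExecuTorch repo and may be named differently.
from executorch.backends.arm.arm_partitioner import ArmPartitioner

class TinyModel(torch.nn.Module):
    def forward(self, x):
        return torch.nn.functional.relu(x + 1.0)

example_inputs = (torch.randn(1, 8),)

# 1. Capture the PyTorch program with the dynamo export flow.
exported = export(TinyModel().eval(), example_inputs)

# 2. Lower to the ExecuTorch edge dialect and delegate supported operators to the
#    Ethos-U backend, where Vela compiles them into a command stream.
edge = to_edge(exported).to_backend(ArmPartitioner())

# 3. Serialize the final ExecuTorch program to a .pte flatbuffer file.
with open("tiny_model.pte", "wb") as f:
    f.write(edge.to_executorch().buffer)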


        flow diagram

The ExecuTorch Runtime flow

        The ExecuTorch runtime, written in C/C++, is designed to support multiple backends. We have extended it to include support for the Ethos-U device driver. Following this flow will produce a self-contained compiled executable. Deploying the executable on the Corstone-320 FVP is straightforward and requires only the appropriate flags when calling the FVP.


        flow diagram

Ethos-U85 and Corstone-320

        The Ethos-U family of NPUs offers high performance and energy-efficient solutions for edge AI. The Ethos-U55 (also supported by ExecuTorch) is widely deployed in many Cortex-M heterogeneous systems, while the Ethos-U65 extends the applicability of the Ethos-U family to Cortex-A-based systems and increases the performance.

Ethos-U85 further extends the Ethos-U product line, supporting current and future workloads on the edge using transformer-based networks. Ethos-U85 delivers a 4x performance uplift and 20% higher energy efficiency compared to its predecessor, with up to 85% utilization on popular networks. Notable features of Ethos-U85 include:

• Configurations from 128 to 2048 MACs/cycle, delivering up to 4 TOP/s at 1 GHz
• Compatibility with Cortex-A and Cortex-M based systems
• Native support for major neural networks through support for TOSA
• Full hardware acceleration of all major neural networks
• For a full list of features, see the Ethos-U85 Technical Overview

        A typical compute subsystem design with Ethos-U85


        What’s next


        We are adding new operator support every week, extending ExecuTorch core ATen operator coverage, and enabling a wider range of models to run on Ethos-U. Our ongoing efforts focus on improving performance to ensure models run as optimally as possible on Ethos-U.


        The ExecuTorch delegate framework supports fallback to running operators not supported by Ethos-U on the CPU using reference kernel implementations. We will work towards optimal performance on Cortex-M CPUs using CMSIS-NN, providing the best possible support for fallback operators and ensuring optimal performance for devices without Ethos-U capability.

The package above, together with the Corstone-320 FVP, is another step toward simplifying application development, so please go ahead, check out the code and build process, and send us feedback. Meanwhile, we will be busy making weekly releases to enable more features and models and to extract the maximum performance out of the hardware.

diff --git a/blog/pt-fedora-os-communities/index.html b/blog/pt-fedora-os-communities/index.html new file mode 100644 index 000000000000..7f0db2e8b3fb --- /dev/null +++ b/blog/pt-fedora-os-communities/index.html @@ -0,0 +1,677 @@

Powering AI with PyTorch, Fedora, and Open Source Communities | PyTorch

by Sudhir Dharanendraiah

        man speaking at a conference

At DevConf.IN 2025 in Pune, I had the opportunity to host a PyTorch Meetup on February 28th. The session, titled “Powering AI with PyTorch, Fedora, and Open Source Communities”, was aimed at introducing PyTorch to students and professionals and explaining why PyTorch + Fedora form an ideal AI development platform. The other key aspect I covered was collaboration between open source communities.

Introduction to PyTorch

The Power of Deep Learning made simple

With the explosion of GPTs, there is renewed interest in the field of AI and ML. The myth that developing AI/ML technologies and their applications is rocket science and far-fetched needs correction. Only open source has the power to demystify this myth and further evolve the technology to make it versatile and developer friendly. Since its inception, PyTorch has evolved and has been a driving force in making AI/ML development extremely simple. I covered PyTorch’s key components, its features, and why PyTorch is the best choice as a deep learning framework.


        man speaking at a conference

The code walkthrough was designed to showcase how easy and simple it is to utilise the power of GPUs, create a simple neural network, and train the model. The code walkthrough was very well received, and it was great to hear back from the attendees that they never knew how powerful PyTorch is for deep learning. The real-world examples showed how this powerful framework can be used beyond the common GPTs and has the power to influence a broad spectrum of applications.

Fedora + PyTorch: The Ideal AI/ML Development Platform

man speaking at a conference

man speaking at a conference

        One of the highlights of the event was the discussion on Fedora’s role as an AI platform. Fedora’s reliability, flexibility, and strong community support make it an ideal partner for PyTorch, allowing developers to focus on model-building without worrying about infrastructure. The students were intrigued by the idea of contributing to Fedora’s AI/ML ecosystem while building their own projects. Sumantro Mukherjee spoke about the AI policy in Fedora and how one can start contributing to the AI/ML using Fedora as a platform. He highlighted how Fedora is evolving to meet the needs of AI practitioners. The idea that an open-source operating system could provide the perfect foundation for AI research sparked an engaging conversation.

Innovation in Open Source When Communities Come Together

charts

        It is important that we learn from history and repeat the good things! When open source communities come together they can create seismic shifts in the industry. To drive this home, I took the audience on a journey through history, revisiting a pivotal moment when Apache and Linux came together, solving common problems and fundamentally reshaping enterprise computing. That moment was not just about technology; it was about collaboration. It was about two powerful communities recognizing that they were stronger together. Today, we stand at the cusp of another such moment - PyTorch and Linux, particularly Fedora, are coming together to shape the future of AI/ML. This is not just an opportunity but a responsibility for contributors, developers, and AI/ML enthusiasts to be part of this movement.

Looking Ahead

man speaking at a conference

One of the best parts of the event was the enthusiasm it generated among a diverse audience of students, AI enthusiasts, and industry professionals. Notably, Vincent Caldeira (CTO, APAC, Red Hat) and Chris Butler (Senior Principal Chief Architect, Red Hat) were present, reinforcing the growing interest in open-source AI/ML. Many students were eager to explore PyTorch and Fedora, contribute to open-source AI projects, and start their own AI experiments. Industry experts saw the potential for scalable, community-driven AI innovation. The session sparked curiosity and conversations that continued long after the event ended.

diff --git a/blog/pt-foundation-expands/index.html b/blog/pt-foundation-expands/index.html new file mode 100644 index 000000000000..3b94b847eaec --- /dev/null +++ b/blog/pt-foundation-expands/index.html @@ -0,0 +1,693 @@

PyTorch Foundation Expands to an Umbrella Foundation to Accelerate AI Innovation | PyTorch

by Matt White, Executive Director, PyTorch Foundation

        Today, I am thrilled to announce a significant milestone for the PyTorch Foundation: we are expanding our scope to become an umbrella foundation, allowing us to host additional projects. This expansion positions the PyTorch Foundation to foster a broader ecosystem of high-value, trusted, and innovative AI projects that cater to all stages of the AI lifecycle—from training and inference to industry-specific applications.

Why Expand?

        Since its inception at the Linux Foundation two and a half years ago, the PyTorch Foundation has rapidly grown, now encompassing over 30 member organizations and 120 vibrant ecosystem projects. PyTorch itself has become the framework of choice for AI researchers, practitioners, and industry leaders worldwide. Our flagship PyTorch Conference has seen attendance multiply sixfold over just two years, reflecting the community’s tremendous enthusiasm and engagement.


        With new initiatives such as PyTorch Day events, global community meetups, the PyTorch Ambassador Program, Open Source Program Office (OSPO) outreach, the Speaker’s Bureau, and our upcoming training and certification programs, we have significantly deepened our community’s expertise and collaboration capabilities. To sustain and accelerate this momentum, the logical next step was to expand the PyTorch Foundation into an umbrella organization.

What Does an Umbrella Foundation Mean?

        By transitioning into an umbrella foundation, PyTorch will now host a range of diverse, high-quality AI and ML projects beyond PyTorch Core. These include foundation-hosted projects in two categories:

• Platform Projects: Domain-agnostic solutions essential across various stages of the AI lifecycle, such as training, inference, model optimization, and deployment, as well as agentic systems.
• Vertical Projects: Domain-specific projects tailored to particular industries or applications, such as biomedical imaging, protein folding, and geospatial analysis.

        Projects under our umbrella gain immediate access to vendor-neutral governance, enhanced visibility, increased funding opportunities, and robust community engagement and support.

Foundation-Hosted vs. Ecosystem Projects

        As we expand, it’s important to clarify the distinction between foundation-hosted and ecosystem projects:

• Foundation-Hosted Projects are projects that fall under the umbrella; they are officially governed and administered under the PyTorch Foundation’s neutral and transparent governance model. Project maintainers continue to oversee their projects, and they transfer assets to the Linux Foundation for independent stewardship and adopt an open governance model, significantly reducing vendor bias and encouraging broader community contributions and adoption. These projects have greater stability and longevity and integrate with the larger PyTorch community.
• Ecosystem Projects remain independently managed but receive recognition and increased visibility by aligning themselves closely with the PyTorch Foundation community standards. These projects meet specific quality and maturity criteria but retain full independence in governance and asset management.

        How to Join the PyTorch Ecosystem or Become a Foundation-Hosted Project


        We have clearly defined pathways for projects looking to become part of the PyTorch community:

1. Ecosystem Project Status: Projects must meet defined criteria, such as active development, comprehensive documentation, CI/CD infrastructure, clear governance, and community engagement. Approved ecosystem projects benefit from increased exposure and official recognition on the PyTorch Landscape.
2. Candidate Project Status: Ecosystem projects aspiring to foundation-hosted status can become candidates by securing sponsorship from a PyTorch Foundation Technical Advisory Council (TAC) voting member. Candidates receive guidance on meeting all necessary governance, technical, and strategic criteria.
3. Foundation-Hosted Project Status: Candidate projects demonstrating high maturity, stability, multi-platform support, security best practices, and strategic value to the PyTorch community can be approved by the TAC. These projects gain extensive benefits, including neutral trademark hosting, foundation support, marketing and events resources, governance guidance, and strategic funding opportunities.

        Ensuring Long-Term Success and Innovation


        By expanding our scope to become an umbrella foundation, the PyTorch Foundation is uniquely positioned to enhance collaboration, innovation, and sustained growth across the entire AI community. Our mission is clear: create a vendor-neutral, open source environment where the best AI and ML tools can thrive, benefiting users, contributors, and industry stakeholders worldwide.


        “PyTorch is absolutely the foundation of the innovation happening in AI today and with projects like Llama, ChatGPT, and hundreds of thousands of open projects built on PyTorch, it has cemented itself as a critical ingredient to the world of AI. This move to create an umbrella foundation enables PyTorch to significantly expand its ecosystem both horizontally and vertically in this new era of agentic systems. I am very excited about this opportunity to take the PyTorch community to the next level!” - Joe Spisak, Product Director for PyTorch at Meta.


        “PyTorch sits at the very core of AI today. Meanwhile, the depth of the AI stack has grown dramatically—evolving from enabling accelerated compute to powering fully autonomous systems. Broadening the PyTorch Foundation is a key step in keeping the AI revolution open and accessible to all, across the stack and aligned with the principles PyTorch was built on.” - Luca Antiga, CTO at Lightning AI.


        We are incredibly optimistic about the opportunities ahead and excited to welcome new projects into our growing family. The PyTorch Foundation remains deeply committed to driving AI innovation forward, and together, we will continue to build the future of open source artificial intelligence.


        Stay tuned for more updates, announcements, and opportunities to participate!

diff --git a/blog/pt-korea-user-group-recap/index.html b/blog/pt-korea-user-group-recap/index.html new file mode 100644 index 000000000000..92c64658b660 --- /dev/null +++ b/blog/pt-korea-user-group-recap/index.html @@ -0,0 +1,703 @@

Recap of the PyTorch Korea User Group Meetup: A Technical Conference with a PyTorch Core Maintainer | PyTorch

by Jiho Kim, PyTorch Korea User Group

        At the end of March, the PyTorch Korea User Group hosted a special meetup that brought together prominent speakers for deep discussions on the PyTorch core and its broader ecosystem. With the event more than doubling in size compared to past gatherings, we were able to connect with even more developers and share insights. Huge thanks to goorm for sponsoring the fantastic venue! 😄

people at a conference

        This recap is for those who couldn’t attend in person, as well as for participants who want to revisit the energy and insights of the day. The event featured experts in core PyTorch, AI accelerators, inference optimization, and large language model development. Below is a quick overview of the key sessions that anchored the conference.

1️⃣ Jerry Lee | PyTorch Foundation

        Representing the PyTorch Foundation, part of the Linux Foundation, Jaeung provided an overview of how PyTorch is driving core open source technologies forward. He shared PyTorch’s growth story, the many global projects currently in motion, and the ecosystem’s impressive 20%+ annual growth. The session also covered how the foundation operates, how member organizations are involved, and upcoming plans that are particularly useful for practitioners.

people at a conference

2️⃣ Alban Desmaison | PyTorch Roadmap

        Alban shared the design philosophy behind PyTorch and Meta’s official contribution roadmap (link). He provided a deep technical dive into the differences between Eager and Compiled modes, especially breaking down the backend architecture of device Eager execution. Practical tools and improvements were also introduced—such as memory profilers, enhanced custom operator support, and pinned memory optimizations.

people at a conference

3️⃣ Hongseok Kim | PyTorch on Rebellions AI Accelerators: Status

        Rebellions is building runtime integration for their proprietary NPU architecture, fully aligned with the structural changes in PyTorch 2.0. This talk introduced the performance and scalability of their upcoming chip, their integration strategy with the PyTorch runtime, and challenges in supporting Eager Mode. Hongseok also previewed their roadmap toward releasing these features within the year.

people at a conference

4️⃣ Kyujin Cho | Backend.AI: A Unified Platform for All AI Accelerators

        Backend.AI abstracts and integrates various AI accelerators into a unified workflow. As the diversity of accelerator architectures grows, the need for portability and infrastructure unification becomes even more important. This session showcased features across development and operations—from NPU scheduling and resource allocation to monitoring. Backend.AI currently supports accelerators from NVIDIA, Intel, Tenstorrent, Rebellions, and more.

people at a conference

5️⃣ Taeho Kim | Optimizing & Deploying Models Across Multiple Chipsets Using NetsPresso

        This talk focused on the challenges of inference in real-world industrial applications of AI models. As new state-of-the-art models emerge rapidly, there’s a growing need for environments that can quickly validate device compatibility—ideally with one-click ease. NetsPresso is actively working on a static graph representation compatible with PyTorch, offering efficient support for model development, optimization, and testing.

people at a conference

6️⃣ Jungyeop Lee | The Journey to Reproduce Deepseek-R1

        Jungyeop took us through his journey of reproducing Deepseek, a large language model—an effort that involved 201 experiments. He shared real-world lessons from training with Korean data, tokenizer modifications, and fine-tuning strategies. His practical insights and next steps were especially valuable for those building or re-implementing large models from scratch.

people at a conference

7️⃣ Sol Kim | A journey from TCP architecture to production-level LLMs

Sol presented an integrated optimization approach to deploying large models using the TCP (Tensor Contraction Processor) architecture, which supports tensor contraction at the hardware level. The talk highlighted optimization techniques built on hardware abstraction layers (HALs) and bottom-up integration strategies with PyTorch—offering a hybrid hardware-software perspective.

people at a conference

💡 Panel Talk & Q&A 💡

        The event wrapped up with an engaging panel discussion. Attendees asked sharp questions, and the speakers offered insightful answers. It was a powerful moment that captured the community’s enthusiasm for PyTorch and their hunger for deeper technical understanding.

people at a conference

Final Thoughts

        Since our first offline meetup in October 2022, the PyTorch Korea User Group has held five major technical conferences. Each event deepens our appreciation for the scale and depth of the PyTorch ecosystem. With perspectives from users, contributors, and ecosystem builders, the stories we share are only growing—and we’re committed to continuing this journey together.


        See you at the next conference—with even more exciting talks to come! 🙌

diff --git a/blog/pt-multidevice-integration/index.html b/blog/pt-multidevice-integration/index.html new file mode 100644 index 000000000000..8f0a1ead8140 --- /dev/null +++ b/blog/pt-multidevice-integration/index.html @@ -0,0 +1,773 @@

Challenges and Efforts in PyTorch Multi-Device Integration: Compatibility, Portability, and Integration Efficiencies | PyTorch

by Zesheng Zong (Huawei), Jiawei Li (Huawei) | Co-authors: Jiong Gong (Intel), Bartosz Sochacki (Intel), Eikan Wang (Intel)

Introduction

        As the demand for diverse hardware accelerators grows, the need for a robust and adaptable deep learning framework becomes increasingly critical. While working through this integration, several challenges have surfaced in the PyTorch ecosystem, potentially affecting various hardware vendors. This blog aims to highlight these issues and propose solutions to enhance PyTorch’s adaptability, portability, and resilience across different hardware platforms.

Improve Users’ Code Portability via Accelerator Autoloading

        Currently, users face additional work when running their code on different accelerators. One such task is manually importing modules for out-of-tree devices. This requires users to not only understand the different usage patterns between accelerators but also make their code aware of these differences. If you have projects originally running on GPU/CPU and want to migrate to other accelerators, this can lead to significant work and potential frustration.


        Examples of extra import:

# Case 1: Use HPU
import torch
import torchvision.models as models
import habana_frameworks.torch # <-- extra import
model = models.resnet50().eval().to("hpu")
input = torch.rand(128, 3, 224, 224).to("hpu")
output = model(input)

# Case 2: Use torch_npu
import torch
import torch_npu # <-- extra import
print(torch.ones(1, 2, device='npu'))

        As a high-level machine learning framework, PyTorch’s ability to shield users from device differences is a competitive feature. Accelerator Autoloading allows users to continue using the familiar PyTorch device programming model without explicitly loading or importing device-specific extensions.

How does it work?

        Utilize Python’s plugin architecture to enable automatic loading of device extensions via entry points in the PyTorch package.

Python entry points provide a standardized way for Python packages to expose and discover components or plugins within an application. Via a definition in the accelerator package’s setup.py, PyTorch can automatically initialize accelerator modules when import torch is called, which gives users a consistent experience across different backend devices.
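
To make the mechanism concrete, the following is a minimal sketch of what entry-point discovery looks like using only the standard library (this is not PyTorch’s actual implementation, and it assumes Python 3.10+ for the group= keyword):

from importlib.metadata import entry_points

# Find every installed package that advertises a 'torch.backends' entry point
# (for example torch_npu = torch_npu:_autoload) and call its autoload hook.
for ep in entry_points(group="torch.backends"):
    autoload_hook = ep.load()   # imports the backend package and returns the hook
    autoload_hook()             # the backend registers its device module, ops, etc.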

From the device side, the package only needs to declare the following entry point in its setup.py (using torch_npu as an example):

# setup.py
entry_points={
    'torch.backends': ['torch_npu = torch_npu:_autoload', ],
}

        When import torch is invoked, the accelerator module will be loaded automatically. This provides users with a consistent programming experience across out-of-tree devices, eliminating the need to be aware of differences between CUDA, HPU, and NPU.

# Case 1: Use HPU
import torch
import torchvision.models as models
model = models.resnet50().eval().to("hpu")
input = torch.rand(128, 3, 224, 224).to("hpu")
output = model(input)

# Case 2: Use torch_npu
import torch
print(torch.ones(1, 2, device='npu'))

        Device Integration Optimization

What is PrivateUse1?

        In PyTorch, the dispatcher is a crucial component of the framework’s backend that manages how operations are routed to the appropriate device-specific implementation. Dispatch keys are an integral part of this system, serving as identifiers that represent various execution contexts—such as the device (CPU, CUDA, XPU), layout (dense, sparse), and autograd functionality. These keys ensure that operations are directed to the correct implementation.

PrivateUse1 is a customizable device dispatch key (similar to CUDA, CPU, XPU, etc.) reserved for out-of-tree devices. It provides developers with a way to extend PyTorch’s functionality without modifying the core framework, allowing for the integration of new devices, hardware accelerators, or other specialized computing environments.
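
For illustration, an out-of-tree backend built on PrivateUse1 typically renames the key and registers its device module during import, roughly as in the sketch below (my_backend and its C++ extension are hypothetical; only the torch.utils helpers shown are real PyTorch APIs):

import torch
import my_backend._C  # hypothetical extension that registers PrivateUse1 kernels in C++

# Give the PrivateUse1 dispatch key a user-facing device name.
torch.utils.rename_privateuse1_backend("my_device")

# Expose the Python-side device module as torch.my_device.
torch._register_device_module("my_device", my_backend)

# Generate Tensor/Module helpers such as x.my_device() and x.is_my_device.
torch.utils.generate_methods_for_privateuse1_backend()

# After this, the familiar device API works with the new name:
# x = torch.ones(2, 2, device="my_device")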

Why do we need PrivateUse1?

        Internally, dispatch keys are represented as bit masks, each bit represents whether a certain key is active. This bit mask representation is efficient for quick lookup and combination of keys, but it inherently limits the number of distinct keys (typically to 64 or fewer).


        The current implementation of BackendComponent dispatch keys in PyTorch has encountered a critical bottleneck, which restricts the addition of new backends and, as a result, limits the expansion of the PyTorch ecosystem.

bit diagram

        In response to this challenge, a series of optimizations have been applied to the PrivateUse1 mechanism to enhance its capacity.

• PrivateUse1 integration mechanism

  Initially reserved as fallback options, PrivateUse1, along with PrivateUse2 and PrivateUse3, were designed to be activated only when existing key resources became scarce.

  PrivateUse1 is now being developed to match the robustness and versatility of established keys like CUDA and CPU. Achieving this required a deep integration across critical PyTorch modules. This integration wasn’t just a simple switch—it involved significant updates to core components such as AMP (Automatic Mixed Precision), Autograd, Distributed Training, Checkpointing, DataLoader, Optimization, and Quantization, etc.

        flow diagram

The activation of PrivateUse1 was a massive collaborative effort, culminating in over 100 pull requests aimed at taking it from a placeholder to a fully operational dispatch key.

• PrivateUse1 UT/CI Quality Assurance

  While unit tests are essential for ensuring quality during the development of the PrivateUse1 mechanism, they are not sufficient on their own to prevent new pull requests from inadvertently affecting existing functionality or compatibility of out-of-tree devices.

  To mitigate this risk, the community has added the pytorch_openreg module to the test suite. This module leverages a CPU backend to simulate interactions with accelerators, creating a controlled environment for rigorous testing. Once implemented, this will enable automatic execution of device-generic test cases whenever relevant code is updated, allowing us to quickly detect and address any potential issues affecting the PrivateUse1 integration mechanism.

• Comprehensive Documentation

  By providing comprehensive and easy-to-understand documentation, we aim to lower the barrier to entry for developers and encourage wider adoption of the PrivateUse1 mechanism in the PyTorch ecosystem. This documentation includes:

  • Step-by-step guides for integrating new backends using PrivateUse1
  • Clear explanations of PrivateUse1’s functionality and benefits
  • Code examples and best practices for efficient implementation

        These enhancements aim to improve the robustness and reliability of the PrivateUse1 mechanism, facilitating better integration of new backends and expanding the capabilities of PyTorch.

Compatibility Between Upstream and Downstream

Device-Generic Unit Tests

Most unit tests in PyTorch focus on CPU and CUDA devices, which limits participation from users with other hardware. To address this, we have outlined a plan to modify PyTorch’s unit testing framework to better support non-CUDA devices. This plan includes removing existing device restrictions, implementing dynamic data type loading, and generalizing decorators to accommodate a broader range of devices. Additionally, we aim to enforce the use of universal device code and expand distributed testing to support non-NCCL backends.


        Through these improvements, we hope to significantly increase test coverage and pass rates for non-CUDA devices, integrating them into PyTorch’s continuous integration process. Initial changes have already been implemented, paving the way for new hardware support and creating a reference template for other devices.
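
As a sketch of what a device-generic test looks like with the internal framework referenced here (torch.testing._internal is an internal, unstable API, so names may change between releases):

import torch
from torch.testing._internal.common_utils import TestCase, run_tests
from torch.testing._internal.common_device_type import instantiate_device_type_tests, dtypes

class TestAddGeneric(TestCase):
    # 'device' and 'dtype' are injected by the framework, so the same test body
    # runs on every registered device type (cpu, cuda, or an out-of-tree backend).
    @dtypes(torch.float32, torch.float64)
    def test_add(self, device, dtype):
        a = torch.ones(4, device=device, dtype=dtype)
        b = torch.full((4,), 2.0, device=device, dtype=dtype)
        self.assertEqual(a + b, torch.full((4,), 3.0, device=device, dtype=dtype))

# Generates TestAddGenericCPU, TestAddGenericCUDA, ... for each available device type.
instantiate_device_type_tests(TestAddGeneric, globals())

if __name__ == "__main__":
    run_tests()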

Ensuring Robust Device Integration through Automated Testing

        To uphold the high standards of quality assurance in PyTorch, an independent build repository and daily continuous integration (CI) workflows have been established, focusing on smoke and integration testing.

The pytorch-integration-tests repository automates the testing of PyTorch’s device-specific functionalities, ensuring that they operate correctly and efficiently across a variety of hardware platforms (NPUs and other specialized devices). In this repository, we are working toward a fully automated system that continuously validates PyTorch’s compatibility with different hardware backends.

• Automated Integration Tests: Run automated tests across different devices using GitHub Actions. This automation ensures that every change in the codebase is thoroughly tested against multiple hardware platforms, catching potential issues early in the development process.
• Reusable Workflows: Workflows in this repository are modular and reusable, which streamlines the testing process. Developers can easily adapt these workflows to new devices or testing scenarios, making the system both flexible and scalable as PyTorch evolves.
• Awareness of Out-of-Tree Devices: The repository displays the existence and behavior of all out-of-tree devices, keeping the community informed. This approach minimizes the risk of accidentally breaking downstream functionalities and provides fast feedback on changes.

Efforts to enhance multi-device integration are pivotal for PyTorch’s adaptability in the evolving deep learning landscape. These initiatives not only benefit current users but also lower entry barriers for new hardware vendors and developers, fostering innovation in AI and machine learning. As PyTorch continues to evolve, its commitment to flexibility, robustness, and inclusivity positions it as a leading framework capable of meeting the diverse needs of the deep learning community.

diff --git a/blog/pytorch-0_4_0-migration-guide/index.html b/blog/pytorch-0_4_0-migration-guide/index.html new file mode 100644 index 000000000000..8037a04c348f --- /dev/null +++ b/blog/pytorch-0_4_0-migration-guide/index.html @@ -0,0 +1,1129 @@

PyTorch 0.4.0 Migration Guide | PyTorch

April 22, 2018

PyTorch 0.4.0 Migration Guide

by Team PyTorch

        Welcome to the migration guide for PyTorch 0.4.0. In this release we introduced many exciting new features and critical bug fixes, with the goal of providing users a better and cleaner interface. In this guide, we will cover the most important changes in migrating existing code from previous versions:

• Tensors and Variables have merged
• Support for 0-dimensional (scalar) Tensors
• Deprecation of the volatile flag
• dtypes, devices, and Numpy-style Tensor creation functions
• Writing device-agnostic code
• New edge-case constraints on names of submodules, parameters, and buffers in nn.Module

Merging Tensor and Variable classes

        torch.Tensor and torch.autograd.Variable are now the same class. More precisely, torch.Tensor is capable of tracking history and behaves like the old Variable; Variable wrapping continues to work as before but returns an object of type torch.Tensor. This means that you don’t need the Variable wrapper everywhere in your code anymore.

The type() of a Tensor has changed

Note also that the type() of a Tensor no longer reflects the data type. Use isinstance() or x.type() instead:

>>> x = torch.DoubleTensor([1, 1, 1])
>>> print(type(x))  # was torch.DoubleTensor
"<class 'torch.Tensor'>"
>>> print(x.type())  # OK: 'torch.DoubleTensor'
'torch.DoubleTensor'
>>> print(isinstance(x, torch.DoubleTensor))  # OK: True
True

        When does autograd start tracking history now?


requires_grad, the central flag for autograd, is now an attribute on Tensors. The same rules previously used for Variables apply to Tensors; autograd starts tracking history when any input Tensor of an operation has requires_grad=True. For example,

        + +
        >>> x = torch.ones(1)  # create a tensor with requires_grad=False (default)
        +>>> x.requires_grad
        +False
        +>>> y = torch.ones(1)  # another tensor with requires_grad=False
        +>>> z = x + y
        +>>> # both inputs have requires_grad=False. so does the output
        +>>> z.requires_grad
        +False
        +>>> # then autograd won't track this computation. let's verify!
        +>>> z.backward()
        +RuntimeError: element 0 of tensors does not require grad and does not have a grad_fn
        +>>>
        +>>> # now create a tensor with requires_grad=True
        +>>> w = torch.ones(1, requires_grad=True)
        +>>> w.requires_grad
        +True
        +>>> # add to the previous result that has require_grad=False
        +>>> total = w + z
        +>>> # the total sum now requires grad!
        +>>> total.requires_grad
        +True
        +>>> # autograd can compute the gradients as well
        +>>> total.backward()
        +>>> w.grad
        +tensor([ 1.])
        +>>> # and no computation is wasted to compute gradients for x, y and z, which don't require grad
        +>>> z.grad == x.grad == y.grad == None
        +True
        +
        + +

        Manipulating requires_grad flag

        + +

        Other than directly setting the attribute, you can change this flag in-place using my_tensor.requires_grad_(), or, as in the above example, at creation time by passing it in as an argument (default is False), e.g.,

        + +
        >>> existing_tensor.requires_grad_()
        +>>> existing_tensor.requires_grad
        +True
        +>>> my_tensor = torch.zeros(3, 4, requires_grad=True)
        +>>> my_tensor.requires_grad
        +True
        +
        + +

        What about .data?

        + +

.data was the primary way to get the underlying Tensor from a Variable. After this merge, calling y = x.data still has similar semantics. So y will be a Tensor that shares the same data with x, is unrelated to the computation history of x, and has requires_grad=False.

        + +

However, .data can be unsafe in some cases. Any changes on x.data wouldn’t be tracked by autograd, and the computed gradients would be incorrect if x is needed in a backward pass. A safer alternative is to use x.detach(), which also returns a Tensor that shares data with x and has requires_grad=False, but will have its in-place changes reported by autograd if x is needed in backward.

        + +

        Here is an example of the difference between .data and x.detach() (and why we recommend using detach in general).

        + +

        If you use Tensor.detach(), the gradient computation is guaranteed to be correct.

        + +
        >>> a = torch.tensor([1,2,3.], requires_grad = True)
        +>>> out = a.sigmoid()
        +>>> c = out.detach()
        +>>> c.zero_()
        +tensor([ 0.,  0.,  0.])
        +
        +>>> out  # modified by c.zero_() !!
        +tensor([ 0.,  0.,  0.])
        +
        +>>> out.sum().backward()  # Requires the original value of out, but that was overwritten by c.zero_()
+RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation
        +
        + +

        However, using Tensor.data can be unsafe and can easily result in incorrect gradients when a tensor is required for gradient computation but modified in-place.

        + +
        >>> a = torch.tensor([1,2,3.], requires_grad = True)
        +>>> out = a.sigmoid()
        +>>> c = out.data
        +>>> c.zero_()
        +tensor([ 0.,  0.,  0.])
        +
        +>>> out  # out  was modified by c.zero_()
        +tensor([ 0.,  0.,  0.])
        +
        +>>> out.sum().backward()
        +>>> a.grad  # The result is very, very wrong because `out` changed!
        +tensor([ 0.,  0.,  0.])
        +
        + +

        Support for 0-dimensional (scalar) Tensors

        + +

        Previously, indexing into a Tensor vector (1-dimensional tensor) gave a Python number but indexing into a Variable vector gave (inconsistently!) a vector of size (1,)! Similar behavior existed with reduction functions, e.g. tensor.sum() would return a Python number, but variable.sum() would return a vector of size (1,).

        + +

        Fortunately, this release introduces proper scalar (0-dimensional tensor) support in PyTorch! Scalars can be created using the new torch.tensor function (which will be explained in more detail later; for now just think of it as the PyTorch equivalent of numpy.array). Now you can do things like:

        + +
        >>> torch.tensor(3.1416)         # create a scalar directly
        +tensor(3.1416)
        +>>> torch.tensor(3.1416).size()  # scalar is 0-dimensional
        +torch.Size([])
        +>>> torch.tensor([3]).size()     # compare to a vector of size 1
        +torch.Size([1])
        +>>>
        +>>> vector = torch.arange(2, 6)  # this is a vector
        +>>> vector
        +tensor([ 2.,  3.,  4.,  5.])
        +>>> vector.size()
        +torch.Size([4])
        +>>> vector[3]                    # indexing into a vector gives a scalar
        +tensor(5.)
        +>>> vector[3].item()             # .item() gives the value as a Python number
        +5.0
        +>>> mysum = torch.tensor([2, 3]).sum()
        +>>> mysum
        +tensor(5)
        +>>> mysum.size()
        +torch.Size([])
        +
        + +

        Accumulating losses

        + +

Consider the widely used pattern total_loss += loss.data[0]. Before 0.4.0, loss was a Variable wrapping a tensor of size (1,), but in 0.4.0 loss is now a scalar and has 0 dimensions. Indexing into a scalar doesn’t make sense (it gives a warning now, but will be a hard error in 0.5.0). Use loss.item() to get the Python number from a scalar.

        + +

        Note that if you don’t convert to a Python number when accumulating losses, you may find increased memory usage in your program. This is because the right-hand-side of the above expression used to be a Python float, while it is now a zero-dim Tensor. The total loss is thus accumulating Tensors and their gradient history, which may keep around large autograd graphs for much longer than necessary.
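For instance, a minimal sketch of the recommended pattern (the data loader, model, and criterion here are placeholders):

total_loss = 0.0
for input, target in train_loader:
    loss = criterion(model(input), target)   # loss is a 0-dimensional Tensor in 0.4.0
    ...
    # old 0.3.1 style: total_loss += loss.data[0]  (now warns)
    total_loss += loss.item()                # converts to a plain Python float; no graph is kept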

        + +

        Deprecation of volatile flag

        + +

        The volatile flag is now deprecated and has no effect. Previously, any computation that involves a Variable with volatile=True wouldn’t be tracked by autograd. This has now been replaced by a set of more flexible context managers including torch.no_grad(), torch.set_grad_enabled(grad_mode), and others.

        + +
        >>> x = torch.zeros(1, requires_grad=True)
        +>>> with torch.no_grad():
        +...     y = x * 2
        +>>> y.requires_grad
        +False
        +>>>
        +>>> is_train = False
        +>>> with torch.set_grad_enabled(is_train):
        +...     y = x * 2
        +>>> y.requires_grad
        +False
        +>>> torch.set_grad_enabled(True)  # this can also be used as a function
        +>>> y = x * 2
        +>>> y.requires_grad
        +True
        +>>> torch.set_grad_enabled(False)
        +>>> y = x * 2
        +>>> y.requires_grad
        +False
        +
        + +

        dtypes, devices and NumPy-style creation functions

        + +

        In previous versions of PyTorch, we used to specify data type (e.g. float vs double), device type (cpu vs cuda) and layout (dense vs sparse) together as a “tensor type”. For example, torch.cuda.sparse.DoubleTensor was the Tensor type representing the double data type, living on CUDA devices, and with COO sparse tensor layout.

        + +

        In this release, we introduce torch.dtype, torch.device and torch.layout classes to allow better management of these properties via NumPy-style creation functions.

        + +

        torch.dtype

        + +

        Below is a complete list of available torch.dtypes (data types) and their corresponding tensor types.

Data type                   torch.dtype                       Tensor types
32-bit floating point       torch.float32 or torch.float      torch.*.FloatTensor
64-bit floating point       torch.float64 or torch.double     torch.*.DoubleTensor
16-bit floating point       torch.float16 or torch.half       torch.*.HalfTensor
8-bit integer (unsigned)    torch.uint8                       torch.*.ByteTensor
8-bit integer (signed)      torch.int8                        torch.*.CharTensor
16-bit integer (signed)     torch.int16 or torch.short        torch.*.ShortTensor
32-bit integer (signed)     torch.int32 or torch.int          torch.*.IntTensor
64-bit integer (signed)     torch.int64 or torch.long         torch.*.LongTensor
        + +

The dtype of a tensor can be accessed via its dtype attribute.

        + +
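For example:

>>> x = torch.ones(2, dtype=torch.double)
>>> x.dtype
torch.float64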

        torch.device

        + +

        A torch.device contains a device type ('cpu' or 'cuda') and optional device ordinal (id) for the device type. It can be initialized with torch.device('{device_type}') or torch.device('{device_type}:{device_ordinal}').

        + +

        If the device ordinal is not present, this represents the current device for the device type; e.g., torch.device('cuda') is equivalent to torch.device('cuda:X') where X is the result of torch.cuda.current_device().

        + +

        The device of a tensor can be accessed via its device attribute.
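For example (assuming at least one CUDA device is available):

>>> torch.device('cuda:0')
device(type='cuda', index=0)
>>> x = torch.randn(2, device=torch.device('cuda:0'))
>>> x.device
device(type='cuda', index=0)
>>> torch.randn(2).device    # CPU is the default
device(type='cpu')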

        + +

        torch.layout

        + +

        torch.layout represents the data layout of a Tensor. Currently torch.strided (dense tensors, the default) and torch.sparse_coo (sparse tensors with COO format) are supported.

        + +

The layout of a tensor can be accessed via its layout attribute.

        + +
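For example:

>>> torch.randn(2, 3).layout    # dense tensors use the default strided layout
torch.strided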

        Creating Tensors

        + +

        Methods that create a Tensor now also take in dtype, device, layout, and requires_grad options to specify the desired attributes on the returned Tensor. For example,

        + +
        >>> device = torch.device("cuda:1")
        +>>> x = torch.randn(3, 3, dtype=torch.float64, device=device)
        +tensor([[-0.6344,  0.8562, -1.2758],
        +        [ 0.8414,  1.7962,  1.0589],
        +        [-0.1369, -1.0462, -0.4373]], dtype=torch.float64, device='cuda:1')
        +>>> x.requires_grad  # default is False
        +False
        +>>> x = torch.zeros(3, requires_grad=True)
        +>>> x.requires_grad
        +True
        +
        + +
        torch.tensor(data, ...)
        + +

torch.tensor is one of the newly added tensor creation methods. It takes in array-like data of all kinds and copies the contained values into a new Tensor. As mentioned earlier, torch.tensor is the PyTorch equivalent of NumPy’s numpy.array constructor. Unlike the torch.*Tensor methods, you can also create zero-dimensional Tensors (aka scalars) this way (a single Python number is treated as a Size in the torch.*Tensor methods). Moreover, if a dtype argument isn’t given, it will infer the suitable dtype given the data. It is the recommended way to create a tensor from existing data like a Python list. For example,

        + +
        >>> cuda = torch.device("cuda")
        +>>> torch.tensor([[1], [2], [3]], dtype=torch.half, device=cuda)
        +tensor([[ 1],
        +        [ 2],
        +        [ 3]], device='cuda:0')
        +>>> torch.tensor(1)               # scalar
        +tensor(1)
+>>> torch.tensor([1, 2.3]).dtype  # type inference
+torch.float32
+>>> torch.tensor([1, 2]).dtype    # type inference
        +torch.int64
        +
        + +

        We’ve also added more tensor creation methods. Some of them have torch.*_like and/or tensor.new_* variants.

        + +
          +
        • +

          torch.*_like takes in an input Tensor instead of a shape. It returns a Tensor with same attributes as the input Tensor by default unless otherwise specified:

          + +
           >>> x = torch.randn(3, dtype=torch.float64)
          + >>> torch.zeros_like(x)
          + tensor([ 0.,  0.,  0.], dtype=torch.float64)
          + >>> torch.zeros_like(x, dtype=torch.int)
          + tensor([ 0,  0,  0], dtype=torch.int32)
          +
          +
        • +
        • +

          tensor.new_* can also create Tensors with same attributes as tensor, but it always takes in a shape argument:

          + +
           >>> x = torch.randn(3, dtype=torch.float64)
          + >>> x.new_ones(2)
          + tensor([ 1.,  1.], dtype=torch.float64)
          + >>> x.new_ones(4, dtype=torch.int)
          + tensor([ 1,  1,  1,  1], dtype=torch.int32)
          +
          +
        • +
        + +

        To specify the desired shape, you can either use a tuple (e.g., torch.zeros((2, 3))) or variable arguments (e.g., torch.zeros(2, 3)) in most cases.

Name                                           Returned Tensor                                          torch.*_like variant   tensor.new_* variant
torch.empty                                    uninitialized memory                                     ✔                      ✔
torch.zeros                                    all zeros                                                ✔                      ✔
torch.ones                                     all ones                                                 ✔                      ✔
torch.full                                     filled with a given value                                ✔                      ✔
torch.rand                                     i.i.d. continuous Uniform[0, 1)                          ✔
torch.randn                                    i.i.d. Normal(0, 1)                                      ✔
torch.randint                                  i.i.d. discrete Uniform in given range                   ✔
torch.randperm                                 random permutation of {0, 1, ..., n - 1}
torch.tensor                                   copied from existing data (list, NumPy ndarray, etc.)                           ✔
torch.from_numpy*                              from NumPy ndarray (sharing storage without copying)
torch.arange, torch.range, and torch.linspace  uniformly spaced values in a given range
torch.logspace                                 logarithmically spaced values in a given range
torch.eye                                      identity matrix
        + +

        *: torch.from_numpy only takes in a NumPy ndarray as its input argument.
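A quick sketch of the storage sharing noted above:

>>> import numpy as np
>>> a = np.ones(3)
>>> t = torch.from_numpy(a)    # t shares memory with a; no copy is made
>>> a[0] = 5
>>> t                          # the change is visible through the tensor
tensor([ 5.,  1.,  1.], dtype=torch.float64)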

        + +

        Writing device-agnostic code

        + +

        Previous versions of PyTorch made it difficult to write code that was device agnostic (i.e. that could run on both CUDA-enabled and CPU-only machines without modification).

        + +

        PyTorch 0.4.0 makes this easier in two ways:

        + +
          +
        • The device attribute of a Tensor gives the torch.device for all Tensors (get_device only works for CUDA tensors)
        • +
        • The to method of Tensors and Modules can be used to easily move objects to different devices (instead of having to call cpu() or cuda() based on the context)
        • +
        + +

        We recommend the following pattern:

        + +
        # at beginning of the script
        +device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        +
        +...
        +
        +# then whenever you get a new Tensor or Module
        +# this won't copy if they are already on the desired device
        +input = data.to(device)
        +model = MyModule(...).to(device)
        +
        + +

        New edge-case constraints on names of submodules, parameters, and buffers in nn.Module

        + +

A name that is an empty string or that contains "." is no longer permitted in module.add_module(name, value), module.add_parameter(name, value) or module.add_buffer(name, value), because such names can cause data to be lost in the state_dict. If you are loading a checkpoint for modules containing such names, please update the module definition and patch the state_dict before loading it.

        + +
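If an old checkpoint contains such a name, here is a minimal sketch of the suggested fix (the file name and the new submodule name "body" are hypothetical; adjust them to your own module):

# A submodule previously registered under the empty string "" produces keys like ".weight".
# After renaming it (here to "body") in the module definition, patch the old keys before loading:
old_state = torch.load('old_checkpoint.pth')
new_state = {('body' + k if k.startswith('.') else k): v for k, v in old_state.items()}
model.load_state_dict(new_state)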

        Code Samples (Putting it all together)

        + +

        To get a flavor of the overall recommended changes in 0.4.0, let’s look at a quick example for a common code pattern in both 0.3.1 and 0.4.0:

        + +
          +
        • 0.3.1 (old): +
          model = MyRNN()
          +if use_cuda:
          +    model = model.cuda()
          +
          +# train
          +total_loss = 0
          +for input, target in train_loader:
          +    input, target = Variable(input), Variable(target)
          +    hidden = Variable(torch.zeros(*h_shape))  # init hidden
          +    if use_cuda:
          +        input, target, hidden = input.cuda(), target.cuda(), hidden.cuda()
          +    ...  # get loss and optimize
          +    total_loss += loss.data[0]
          +
          +# evaluate
          +for input, target in test_loader:
          +    input = Variable(input, volatile=True)
          +    if use_cuda:
          +        ...
          +    ...
          +
          +
        • +
        • 0.4.0 (new): +
          # torch.device object used throughout this script
          +device = torch.device("cuda" if use_cuda else "cpu")
          +
          +model = MyRNN().to(device)
          +
          +# train
          +total_loss = 0
          +for input, target in train_loader:
          +    input, target = input.to(device), target.to(device)
          +    hidden = input.new_zeros(*h_shape)  # has the same device & dtype as `input`
          +    ...  # get loss and optimize
          +    total_loss += loss.item()           # get Python number from 1-element Tensor
          +
          +# evaluate
          +with torch.no_grad():                   # operations inside don't track history
          +    for input, target in test_loader:
          +        ...
          +
          +
        • +
        + +

        Thank you for reading! Please refer to our documentation and release notes for more details.

        + +

        Happy PyTorch-ing!

        + +
diff --git a/blog/pytorch-1-dot-3-adds-mobile-privacy-quantization-and-named-tensors/index.html b/blog/pytorch-1-dot-3-adds-mobile-privacy-quantization-and-named-tensors/index.html new file mode 100644 index 000000000000..e4ac6d2b6039 --- /dev/null +++ b/blog/pytorch-1-dot-3-adds-mobile-privacy-quantization-and-named-tensors/index.html @@ -0,0 +1,778 @@
PyTorch 1.3 adds mobile, privacy, quantization, and named tensors | PyTorch

        + by + + Team PyTorch + +

        +

        PyTorch continues to gain momentum because of its focus on meeting the needs of researchers, its streamlined workflow for production use, and most of all because of the enthusiastic support it has received from the AI community. PyTorch citations in papers on ArXiv grew 194 percent in the first half of 2019 alone, as noted by O’Reilly, and the number of contributors to the platform has grown more than 50 percent over the last year, to nearly 1,200. Facebook, Microsoft, Uber, and other organizations across industries are increasingly using it as the foundation for their most important machine learning (ML) research and production workloads.

        + +

        We are now advancing the platform further with the release of PyTorch 1.3, which includes experimental support for features such as seamless model deployment to mobile devices, model quantization for better performance at inference time, and front-end improvements, like the ability to name tensors and create clearer code with less need for inline comments. We’re also launching a number of additional tools and libraries to support model interpretability and bringing multimodal research to production.

        + +

        Additionally, we’ve collaborated with Google and Salesforce to add broad support for Cloud Tensor Processing Units, providing a significantly accelerated option for training large-scale deep neural networks. Alibaba Cloud also joins Amazon Web Services, Microsoft Azure, and Google Cloud as supported cloud platforms for PyTorch users. You can get started now at pytorch.org.

        + +

        PyTorch 1.3

        + +

        The 1.3 release of PyTorch brings significant new features, including experimental support for mobile device deployment, eager mode quantization at 8-bit integer, and the ability to name tensors. With each of these enhancements, we look forward to additional contributions and improvements from the PyTorch community.

        + +

        Named tensors (experimental)

        + +

        Cornell University’s Sasha Rush has argued that, despite its ubiquity in deep learning, the traditional implementation of tensors has significant shortcomings, such as exposing private dimensions, broadcasting based on absolute position, and keeping type information in documentation. He proposed named tensors as an alternative approach.

        + +

        Today, we name and access dimensions by comment:

        + +
        # Tensor[N, C, H, W]
        + images = torch.randn(32, 3, 56, 56)
        + images.sum(dim=1)
        + images.select(dim=1, index=0)
        +
        + +

        But naming explicitly leads to more readable and maintainable code:

        + +
NCHW = ['N', 'C', 'H', 'W']
+images = torch.randn(32, 3, 56, 56, names=NCHW)
+images.sum('C')
+images.select('C', index=0)
+
        + +

        Quantization (experimental)

        + +

It’s important to make efficient use of both server-side and on-device compute resources when developing ML applications. To support more efficient deployment on servers and edge devices, PyTorch 1.3 now supports 8-bit model quantization using the familiar eager mode Python API. Quantization refers to techniques used to perform computation and storage at reduced precision, such as 8-bit integer. This currently experimental feature includes support for post-training quantization, dynamic quantization, and quantization-aware training. It leverages the FBGEMM and QNNPACK state-of-the-art quantized kernel back ends, for x86 and ARM CPUs respectively, which are integrated with PyTorch and now share a common API.
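As a small sketch of the eager mode API for post-training dynamic quantization (the toy model below is only for illustration):

import torch

float_model = torch.nn.Sequential(
    torch.nn.Linear(128, 64),
    torch.nn.ReLU(),
    torch.nn.Linear(64, 10),
)

# Quantize the weights of the Linear layers to 8-bit integers.
quantized_model = torch.quantization.quantize_dynamic(
    float_model, {torch.nn.Linear}, dtype=torch.qint8
)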

        + +

        To learn more about the design and architecture, check out the API docs here, and get started with any of the supported techniques using the tutorials available here.

        + +

        PyTorch mobile (experimental)

        + +

        Running ML on edge devices is growing in importance as applications continue to demand lower latency. It is also a foundational element for privacy-preserving techniques such as federated learning. To enable more efficient on-device ML, PyTorch 1.3 now supports an end-to-end workflow from Python to deployment on iOS and Android.

        + +

        This is an early, experimental release, optimized for end-to-end development. Coming releases will focus on:

        + +
          +
        • Optimization for size: Build level optimization and selective compilation depending on the operators needed for user applications (i.e., you pay binary size for only the operators you need)
        • +
        • Performance: Further improvements to performance and coverage on mobile CPUs and GPUs
        • +
        • High level API: Extend mobile native APIs to cover common preprocessing and integration tasks needed for incorporating ML in mobile applications. e.g. Computer vision and NLP
        • +
        + +

        Learn more or get started on Android or iOS here.

        + +

        New tools for model interpretability and privacy

        + +

        Captum

        + +

As models become ever more complex, it is increasingly important to develop new methods for model interpretability. To help address this need, we’re launching Captum, a tool to help developers working in PyTorch understand why their model generates a specific output. Captum provides state-of-the-art tools to understand how specific neurons and layers affect the predictions made by the model. Captum’s algorithms include integrated gradients, conductance, SmoothGrad and VarGrad, and DeepLift.

        + +

        The example below shows how to apply model interpretability algorithms on a pretrained ResNet model and then visualize the attributions for each pixel by overlaying them on the image.

        + +
import numpy as np
+from captum.attr import IntegratedGradients, NoiseTunnel
+from captum.attr import visualization as viz
+
+# `input`, `transformed_img`, `pred_label_idx` and `default_cmap` come from the earlier
+# steps of the tutorial (loading and preprocessing the image, running the ResNet model).
+integrated_gradients = IntegratedGradients(model)
+noise_tunnel = NoiseTunnel(integrated_gradients)
+
+attributions_ig_nt, delta = noise_tunnel.attribute(input, n_samples=10, nt_type='smoothgrad_sq', target=pred_label_idx)
+_ = viz.visualize_image_attr_multiple(["original_image", "heat_map"],
+                                      ["all", "positive"],
+                                      np.transpose(attributions_ig_nt.squeeze().cpu().detach().numpy(), (1,2,0)),
+                                      np.transpose(transformed_img.squeeze().cpu().detach().numpy(), (1,2,0)),
+                                      cmap=default_cmap,
+                                      show_colorbar=True)
        +
        + +
        + +
        +
        + +
        + +

        Learn more about Captum at captum.ai.

        + +

        CrypTen

        + +

        Practical applications of ML via cloud-based or machine-learning-as-a-service (MLaaS) platforms pose a range of security and privacy challenges. In particular, users of these platforms may not want or be able to share unencrypted data, which prevents them from taking full advantage of ML tools. To address these challenges, the ML community is exploring a number of technical approaches, at various levels of maturity. These include homomorphic encryption, secure multiparty computation, trusted execution environments, on-device computation, and differential privacy.

        + +

        To provide a better understanding of how some of these technologies can be applied, we are releasing CrypTen, a new community-based research platform for taking the field of privacy-preserving ML forward. Learn more about CrypTen here. It is available on GitHub here.

        + +

        Tools for multimodal AI systems

        + +

        Digital content is often made up of several modalities, such as text, images, audio, and video. For example, a single public post might contain an image, body text, a title, a video, and a landing page. Even one particular component may have more than one modality, such as a video that contains both visual and audio signals, or a landing page that is composed of images, text, and HTML sources.

        + +

The ecosystem of tools and libraries that work with PyTorch offers enhanced ways to address the challenges of building multimodal ML systems. Here are some of the latest libraries launching today:

        + +

        Detectron2

        + +

        Object detection and segmentation are used for tasks ranging from autonomous vehicles to content understanding for platform integrity. To advance this work, Facebook AI Research (FAIR) is releasing Detectron2, an object detection library now implemented in PyTorch. Detectron2 provides support for the latest models and tasks, increased flexibility to aid computer vision research, and improvements in maintainability and scalability to support production use cases.

        + +

        Detectron2 is available here and you can learn more here.

        + +

        Speech extensions to fairseq

        + +

Language translation and audio processing are critical components in systems and applications such as search, translation, speech, and assistants. There has been tremendous progress in these fields recently thanks to the development of new architectures like transformers, as well as large-scale pretraining methods. We’ve extended fairseq, a framework for sequence-to-sequence applications such as language translation, to include support for end-to-end learning for speech and audio recognition tasks. These extensions to fairseq enable faster exploration and prototyping of new speech research ideas while offering a clear path to production.

        + +

        Get started with fairseq here.

        + +

        Cloud provider and hardware ecosystem support

        + +

        Cloud providers such as Amazon Web Services, Microsoft Azure, and Google Cloud provide extensive support for anyone looking to develop ML on PyTorch and deploy in production. We’re excited to share the general availability of Google Cloud TPU support and a newly launched integration with Alibaba Cloud. We’re also expanding hardware ecosystem support.

        + +
          +
        • Google Cloud TPU support now broadly available. To accelerate the largest-scale machine learning (ML) applications deployed today and enable rapid development of the ML applications of tomorrow, Google created custom silicon chips called Tensor Processing Units (TPUs). When assembled into multi-rack ML supercomputers called Cloud TPU Pods, these TPUs can complete ML workloads in minutes or hours that previously took days or weeks on other systems. Engineers from Facebook, Google, and Salesforce worked together to enable and pilot Cloud TPU support in PyTorch, including experimental support for Cloud TPU Pods. PyTorch support for Cloud TPUs is also available in Colab. Learn more about how to get started with PyTorch on Cloud TPUs here.
        • +
        • Alibaba adds support for PyTorch in Alibaba Cloud. The initial integration involves a one-click solution for PyTorch 1.x, Data Science Workshop notebook service, distributed training with Gloo/NCCL, as well as seamless integration with Alibaba IaaS such as OSS, ODPS, and NAS. Together with the toolchain provided by Alibaba, we look forward to significantly reducing the overhead necessary for adoption, as well as helping Alibaba Cloud’s global customer base leverage PyTorch to develop new AI applications.
        • +
        • ML hardware ecosystem expands. In addition to key GPU and CPU partners, the PyTorch ecosystem has also enabled support for dedicated ML accelerators. Updates from Intel and Habana showcase how PyTorch, connected to the Glow optimizing compiler, enables developers to utilize these market-specific solutions.
        • +
        + +

        Growth in the PyTorch community

        + +

As an open source, community-driven project, PyTorch benefits from a wide range of contributors bringing new capabilities to the ecosystem. Here are some recent examples:

        + +
          +
        • Mila SpeechBrain aims to provide an open source, all-in-one speech toolkit based on PyTorch. The goal is to develop a single, flexible, user-friendly toolkit that can be used to easily develop state-of-the-art systems for speech recognition (both end to end and HMM-DNN), speaker recognition, speech separation, multi-microphone signal processing (e.g., beamforming), self-supervised learning, and many others. Learn more
        • +
• SpaCy is a new wrapping library with consistent and easy-to-use interfaces to several models, in order to extract features to power NLP pipelines. Support is provided via spaCy’s standard training API. The library also calculates an alignment so the transformer features can be related back to actual words instead of just wordpieces. Learn more
        • +
• HuggingFace PyTorch-Transformers (formerly known as pytorch-pretrained-bert) is a library of state-of-the-art pretrained models for Natural Language Processing (NLP). The library currently contains PyTorch implementations, pretrained model weights, usage scripts, and conversion utilities for models such as BERT, GPT-2, RoBERTa, and DistilBERT. It has also grown quickly, with more than 13,000 GitHub stars and a broad set of users. Learn more
        • +
        • PyTorch Lightning is a Keras-like ML library for PyTorch. It leaves core training and validation logic to you and automates the rest. Reproducibility is a crucial requirement for many fields of research, including those based on ML techniques. As the number of research papers submitted to arXiv and conferences skyrockets into the tens of thousands, scaling reproducibility becomes difficult. Learn more.
        • +
        + +

        We recently held the first online Global PyTorch Summer Hackathon, where researchers and developers around the world were invited to build innovative new projects with PyTorch. Nearly 1,500 developers participated, submitting projects ranging from livestock disease detection to AI-powered financial assistants. The winning projects were:

        + +
          +
        • Torchmeta, which provides extensions for PyTorch to simplify the development of meta-learning algorithms in PyTorch. It features a unified interface inspired by TorchVision for both few-shot classification and regression problems, to allow easy benchmarking on multiple data sets to aid with reproducibility.
        • +
        • Open-Unmix, a system for end-to-end music demixing with PyTorch. Demixing separates the individual instruments or vocal track from any stereo recording.
        • +
        • Endless AI-Generated Tees, a store featuring AI-generated T-shirt designs that can be purchased and delivered worldwide. The system uses a state-of-the-art generative model (StyleGAN) that was built with PyTorch and then trained on modern art.
        • +
        + +

        Visit pytorch.org to learn more and get started with PyTorch 1.3 and the latest libraries and ecosystem projects. We look forward to the contributions, exciting research advancements, and real-world applications that the community builds with PyTorch.

        + +

        We’d like to thank the entire PyTorch team and the community for all their contributions to this work.

        + +
diff --git a/blog/pytorch-1-dot-4-released-and-domain-libraries-updated/index.html b/blog/pytorch-1-dot-4-released-and-domain-libraries-updated/index.html new file mode 100644 index 000000000000..31eb0c6173a4 --- /dev/null +++ b/blog/pytorch-1-dot-4-released-and-domain-libraries-updated/index.html @@ -0,0 +1,757 @@
PyTorch 1.4 released, domain libraries updated | PyTorch

        + by + + Team PyTorch + +

        +

        Today, we’re announcing the availability of PyTorch 1.4, along with updates to the PyTorch domain libraries. These releases build on top of the announcements from NeurIPS 2019, where we shared the availability of PyTorch Elastic, a new classification framework for image and video, and the addition of Preferred Networks to the PyTorch community. For those that attended the workshops at NeurIPS, the content can be found here.

        + +

        PyTorch 1.4

        + +

The 1.4 release of PyTorch adds new capabilities, including the ability to do fine-grained, build-level customization for PyTorch Mobile, and new experimental features including support for model parallel training and Java language bindings.

        + +

        PyTorch Mobile - Build level customization

        + +

Following the open sourcing of PyTorch Mobile in the 1.3 release, PyTorch 1.4 adds additional mobile support including the ability to customize build scripts at a fine-grain level. This allows mobile developers to optimize library size by only including the operators used by their models and, in the process, reduce their on-device footprint significantly. Initial results show that, for example, a customized MobileNetV2 is 40% to 50% smaller than the prebuilt PyTorch mobile library. You can learn more here about how to create your own custom builds and, as always, please engage with the community on the PyTorch forums to provide any feedback you have.

        + +

        Example code snippet for selectively compiling only the operators needed for MobileNetV2:

        + +
        # Dump list of operators used by MobileNetV2:
        +import torch, yaml
        +model = torch.jit.load('MobileNetV2.pt')
        +ops = torch.jit.export_opnames(model)
        +with open('MobileNetV2.yaml', 'w') as output:
        +    yaml.dump(ops, output)
        +
        + +
        # Build PyTorch Android library customized for MobileNetV2:
        +SELECTED_OP_LIST=MobileNetV2.yaml scripts/build_pytorch_android.sh arm64-v8a
        +
        +# Build PyTorch iOS library customized for MobileNetV2:
        +SELECTED_OP_LIST=MobileNetV2.yaml BUILD_PYTORCH_MOBILE=1 IOS_ARCH=arm64 scripts/build_ios.sh
        +
        + +

        Distributed model parallel training (Experimental)

        + +

        With the scale of models, such as RoBERTa, continuing to increase into the billions of parameters, model parallel training has become ever more important to help researchers push the limits. This release provides a distributed RPC framework to support distributed model parallel training. It allows for running functions remotely and referencing remote objects without copying the real data around, and provides autograd and optimizer APIs to transparently run backwards and update parameters across RPC boundaries.
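A minimal sketch of the RPC API (the worker names, ranks, and remotely executed functions below are illustrative; each participating process runs its own init_rpc call):

import torch
import torch.distributed.rpc as rpc

# On the process acting as "worker0" of a 2-process group:
rpc.init_rpc("worker0", rank=0, world_size=2)

# Run a function on "worker1" and wait for the result.
ret = rpc.rpc_sync("worker1", torch.add, args=(torch.ones(2), torch.ones(2)))

# Or keep a reference to a remote object without copying it back immediately.
rref = rpc.remote("worker1", torch.ones, args=(2, 2))
print(rref.to_here())

rpc.shutdown()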

        + +

        To learn more about the APIs and the design of this feature, see the links below:

        + + + +

        For the full tutorials, see the links below:

        + + + +

        As always, you can connect with community members and discuss more on the forums.

        + +

        Java bindings (Experimental)

        + +

        In addition to supporting Python and C++, this release adds experimental support for Java bindings. Based on the interface developed for Android in PyTorch Mobile, the new bindings allow you to invoke TorchScript models from any Java program. Note that the Java bindings are only available for Linux for this release, and for inference only. We expect support to expand in subsequent releases. See the code snippet below for how to use PyTorch within Java:

        + +
        Module mod = Module.load("demo-model.pt1");
        +Tensor data =
        +    Tensor.fromBlob(
        +        new int[] {1, 2, 3, 4, 5, 6}, // data
        +        new long[] {2, 3} // shape
        +        );
        +IValue result = mod.forward(IValue.from(data), IValue.from(3.0));
        +Tensor output = result.toTensor();
        +System.out.println("shape: " + Arrays.toString(output.shape()));
        +System.out.println("data: " + Arrays.toString(output.getDataAsFloatArray()));
        +
        + +

        Learn more about how to use PyTorch from Java here, and see the full Javadocs API documentation here.

        + +

        For the full 1.4 release notes, see here.

        + +

        Domain Libraries

        + +

        PyTorch domain libraries like torchvision, torchtext, and torchaudio complement PyTorch with common datasets, models, and transforms. We’re excited to share new releases for all three domain libraries alongside the PyTorch 1.4 core release.

        + +

        torchvision 0.5

        + +

        The improvements to torchvision 0.5 mainly focus on adding support for production deployment including quantization, TorchScript, and ONNX. Some of the highlights include:

        + +
          +
        • All models in torchvision are now torchscriptable making them easier to ship into non-Python production environments
        • +
• ResNets, MobileNet, ShuffleNet, GoogleNet and InceptionV3 now have quantized counterparts with pre-trained models, and also include scripts for quantization-aware training (see the short example after this list).
        • +
        • In partnership with the Microsoft team, we’ve added ONNX support for all models including Mask R-CNN.
        • +
        + +

        Learn more about torchvision 0.5 here.

        + +

        torchaudio 0.4

        + +

        Improvements in torchaudio 0.4 focus on enhancing the currently available transformations, datasets, and backend support. Highlights include:

        + +
          +
        • SoX is now optional, and a new extensible backend dispatch mechanism exposes SoundFile as an alternative to SoX.
        • +
        • The interface for datasets has been unified. This enables the addition of two large datasets: LibriSpeech and Common Voice.
        • +
• New filters such as biquad, data augmentation such as time and frequency masking, transforms such as MFCC, gain and dither, and new feature computation such as deltas, are now available (see the short example after this list).
        • +
        • Transformations now support batches and are jitable.
        • +
        • An interactive speech recognition demo with voice activity detection is available for experimentation.
        • +
        + +
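A small sketch of the new transforms mentioned above (the input file name is a placeholder):

import torchaudio

waveform, sample_rate = torchaudio.load('speech.wav')

# Compute MFCC features with the new transform.
mfcc = torchaudio.transforms.MFCC(sample_rate=sample_rate)(waveform)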

        Learn more about torchaudio 0.4 here.

        + +

        torchtext 0.5

        + +

        torchtext 0.5 focuses mainly on improvements to the dataset loader APIs, including compatibility with core PyTorch APIs, but also adds support for unsupervised text tokenization. Highlights include:

        + +
          +
• Added bindings for SentencePiece for unsupervised text tokenization.
        • +
        • Added a new unsupervised learning dataset - enwik9.
        • +
        • Made revisions to PennTreebank, WikiText103, WikiText2, IMDb to make them compatible with torch.utils.data. Those datasets are in an experimental folder and we welcome your feedback.
        • +
        + +

        Learn more about torchtext 0.5 here.

        + +

        We’d like to thank the entire PyTorch team and the community for all their contributions to this work.

        + +

        Cheers!

        + +

        Team PyTorch

        + +
diff --git a/blog/pytorch-1-dot-5-released-with-new-and-updated-apis/index.html b/blog/pytorch-1-dot-5-released-with-new-and-updated-apis/index.html new file mode 100644 index 000000000000..f2a628bcd3b2 --- /dev/null +++ b/blog/pytorch-1-dot-5-released-with-new-and-updated-apis/index.html @@ -0,0 +1,735 @@
PyTorch 1.5 released, new and updated APIs including C++ frontend API parity with Python | PyTorch

        + by + + Team PyTorch + +

        +

        Today, we’re announcing the availability of PyTorch 1.5, along with new and updated libraries. This release includes several major new API additions and improvements. PyTorch now includes a significant update to the C++ frontend, ‘channels last’ memory format for computer vision models, and a stable release of the distributed RPC framework used for model-parallel training. The release also has new APIs for autograd for hessians and jacobians, and an API that allows the creation of Custom C++ Classes that was inspired by pybind.

        + +

        You can find the detailed release notes here.

        + +

        C++ Frontend API (Stable)

        + +

        The C++ frontend API is now at parity with Python, and the features overall have been moved to ‘stable’ (previously tagged as experimental). Some of the major highlights include:

        + +
          +
        • Now with ~100% coverage and docs for C++ torch::nn module/functional, users can easily translate their model from Python API to C++ API, making the model authoring experience much smoother.
        • +
        • Optimizers in C++ had deviated from the Python equivalent: C++ optimizers can’t take parameter groups as input while the Python ones can. Additionally, step function implementations were not exactly the same. With the 1.5 release, C++ optimizers will always behave the same as the Python equivalent.
        • +
        • The lack of tensor multi-dim indexing API in C++ is a well-known issue and had resulted in many posts in PyTorch Github issue tracker and forum. The previous workaround was to use a combination of narrow / select / index_select / masked_select, which was clunky and error-prone compared to the Python API’s elegant tensor[:, 0, ..., mask] syntax. With the 1.5 release, users can use tensor.index({Slice(), 0, "...", mask}) to achieve the same purpose.
        • +
        + +

        ‘Channels last’ memory format for Computer Vision models (Experimental)

        + +

‘Channels last’ memory layout unlocks the ability to use performance efficient convolution algorithms and hardware (NVIDIA’s Tensor Cores, FBGEMM, QNNPACK). Additionally, it is designed to automatically propagate through the operators, which allows easy switching between memory layouts.

        + +
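A brief sketch of opting a model and its inputs into the new memory format (the shapes here are arbitrary):

import torch

x = torch.randn(8, 3, 224, 224)                  # NCHW, default contiguous layout
x = x.to(memory_format=torch.channels_last)      # same shape, NHWC-style striding

model = torch.nn.Conv2d(3, 16, kernel_size=3)
model = model.to(memory_format=torch.channels_last)
out = model(x)                                   # the memory format propagates to the output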

        Learn more here on how to write memory format aware operators.

        + +

        Custom C++ Classes (Experimental)

        + +

        This release adds a new API, torch::class_, for binding custom C++ classes into TorchScript and Python simultaneously. This API is almost identical in syntax to pybind11. It allows users to expose their C++ class and its methods to the TorchScript type system and runtime system such that they can instantiate and manipulate arbitrary C++ objects from TorchScript and Python. An example C++ binding:

        + +
        template <class T>
        +struct MyStackClass : torch::CustomClassHolder {
        +  std::vector<T> stack_;
        +  MyStackClass(std::vector<T> init) : stack_(std::move(init)) {}
        +
        +  void push(T x) {
        +    stack_.push_back(x);
        +  }
        +  T pop() {
        +    auto val = stack_.back();
        +    stack_.pop_back();
        +    return val;
        +  }
        +};
        +
        +static auto testStack =
        +  torch::class_<MyStackClass<std::string>>("myclasses", "MyStackClass")
        +      .def(torch::init<std::vector<std::string>>())
        +      .def("push", &MyStackClass<std::string>::push)
        +      .def("pop", &MyStackClass<std::string>::pop)
+      .def("size", [](const c10::intrusive_ptr<MyStackClass<std::string>>& self) {
        +        return self->stack_.size();
        +      });
        +
        + +

        Which exposes a class you can use in Python and TorchScript like so:

        + +
        @torch.jit.script
        +def do_stacks(s : torch.classes.myclasses.MyStackClass):
        +    s2 = torch.classes.myclasses.MyStackClass(["hi", "mom"])
        +    print(s2.pop()) # "mom"
        +    s2.push("foobar")
        +    return s2 # ["hi", "foobar"]
        +
        + +

        You can try it out in the tutorial here.

        + +

        Distributed RPC framework APIs (Now Stable)

        + +

        The Distributed RPC framework was launched as experimental in the 1.4 release and the proposal is to mark Distributed RPC framework as stable and no longer experimental. This work involves a lot of enhancements and bug fixes to make the distributed RPC framework more reliable and robust overall, as well as adding a couple of new features, including profiling support, using TorchScript functions in RPC, and several enhancements for ease of use. Below is an overview of the various APIs within the framework:

        + +

        RPC API

        +

        The RPC API allows users to specify functions to run and objects to be instantiated on remote nodes. These functions are transparently recorded so that gradients can backpropagate through remote nodes using Distributed Autograd.

        + +

        Distributed Autograd

        +

        Distributed Autograd connects the autograd graph across several nodes and allows gradients to flow through during the backwards pass. Gradients are accumulated into a context (as opposed to the .grad field as with Autograd) and users must specify their model’s forward pass under a with dist_autograd.context() manager in order to ensure that all RPC communication is recorded properly. Currently, only FAST mode is implemented (see here for the difference between FAST and SMART modes).

        + +

        Distributed Optimizer

        +

        The distributed optimizer creates RRefs to optimizers on each worker with parameters that require gradients, and then uses the RPC API to run the optimizer remotely. The user must collect all remote parameters and wrap them in an RRef, as this is required input to the distributed optimizer. The user must also specify the distributed autograd context_id so that the optimizer knows in which context to look for gradients.
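A condensed sketch tying these pieces together, adapted from the pattern shown in the PyTorch documentation (worker names are illustrative and RPC initialization is omitted):

import torch
import torch.distributed.autograd as dist_autograd
import torch.distributed.rpc as rpc
from torch import optim
from torch.distributed.optim import DistributedOptimizer

with dist_autograd.context() as context_id:
    # Forward pass: run work on a remote worker and keep RRefs to the results.
    rref1 = rpc.remote("worker1", torch.add, args=(torch.ones(2), 3))
    rref2 = rpc.remote("worker1", torch.add, args=(torch.ones(2), 1))
    loss = rref1.to_here() + rref2.to_here()

    # Backward pass through the distributed autograd graph, scoped to this context.
    dist_autograd.backward(context_id, [loss.sum()])

    # Distributed optimizer steps over the remote RRefs within the same context.
    dist_optim = DistributedOptimizer(optim.SGD, [rref1, rref2], lr=0.05)
    dist_optim.step(context_id)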

        + +

        Learn more about distributed RPC framework APIs here.

        + +

        New High level autograd API (Experimental)

        + +

        PyTorch 1.5 brings new functions including jacobian, hessian, jvp, vjp, hvp and vhp to the torch.autograd.functional submodule. This feature builds on the current API and allows the user to easily perform these functions.
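For instance, a minimal sketch of the new jacobian and hessian helpers:

import torch
from torch.autograd.functional import jacobian, hessian

def f(x):
    return (x ** 2).sum()

x = torch.randn(3)
print(jacobian(f, x))   # equals 2 * x for this function
print(hessian(f, x))    # equals 2 * the identity matrix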

        + +

        Detailed design discussion on GitHub can be found here.

        + +

        Python 2 no longer supported

        + +

Starting with PyTorch 1.5.0, we will no longer support Python 2, specifically version 2.7. Going forward, support for Python will be limited to Python 3, specifically Python 3.5, 3.6, 3.7 and 3.8 (first enabled in PyTorch 1.4.0).

        + +

        We’d like to thank the entire PyTorch team and the community for all their contributions to this work.

        + +

        Cheers!

        + +

        Team PyTorch

        + +
diff --git a/blog/pytorch-1.10-new-library-releases/index.html b/blog/pytorch-1.10-new-library-releases/index.html new file mode 100644 index 000000000000..55bee430860b --- /dev/null +++ b/blog/pytorch-1.10-new-library-releases/index.html @@ -0,0 +1,845 @@
New Library Releases in PyTorch 1.10, including TorchX, TorchAudio, TorchVision | PyTorch

        + by + + Team PyTorch + +

        +

        Today, we are announcing a number of new features and improvements to PyTorch libraries, alongside the PyTorch 1.10 release. Some highlights include:

        + +


        + +
          +
        • TorchX - a new SDK for quickly building and deploying ML applications from research & development to production.
        • +
        • TorchAudio - Added text-to-speech pipeline, self-supervised model support, multi-channel support and MVDR beamforming module, RNN transducer (RNNT) loss function, and batch and filterbank support to lfilter function. See the TorchAudio release notes here.
        • +
        • TorchVision - Added new RegNet and EfficientNet models, FX based feature extraction added to utilities, two new Automatic Augmentation techniques: Rand Augment and Trivial Augment, and updated training recipes. See the TorchVision release notes here.
        • +
        + +

        Introducing TorchX

        +

        TorchX is a new SDK for quickly building and deploying ML applications from research & development to production. It offers various builtin components that encode MLOps best practices and make advanced features like distributed training and hyperparameter optimization accessible to all.

        + +

        Users can get started with TorchX 0.1 with no added setup cost since it supports popular ML schedulers and pipeline orchestrators that are already widely adopted and deployed in production. No two production environments are the same. To comply with various use cases, TorchX’s core APIs allow tons of customization at well-defined extension points so that even the most unique applications can be serviced without customizing the whole vertical stack.

        + +

        Read the documentation for more details and try out this feature using this quickstart tutorial.

        + +

        TorchAudio 0.10

        + +

        [Beta] Text-to-speech pipeline

        +

        TorchAudio now adds the Tacotron2 model and pretrained weights. It is now possible to build a text-to-speech pipeline with existing vocoder implementations like WaveRNN and Griffin-Lim. Building a TTS pipeline requires matching data processing and pretrained weights, which are often non-trivial to users. So TorchAudio introduces a bundle API so that constructing pipelines for specific pretrained weights is easy. The following example illustrates this.

        + +
        >>> import torchaudio
        +>>>
        +>>> bundle = torchaudio.pipelines.TACOTRON2_WAVERNN_CHAR_LJSPEECH
        +>>>
        +>>> # Build text processor, Tacotron2 and vocoder (WaveRNN) model
        +>>> processor = bundle.get_text_processor()
        +>>> tacotron2 = bundle.get_tacotron2()
        +Downloading:
        +100%|███████████████████████████████| 107M/107M [00:01<00:00, 87.9MB/s]
        +>>> vocoder = bundle.get_vocoder()
        +Downloading:
        +100%|███████████████████████████████| 16.7M/16.7M [00:00<00:00, 78.1MB/s]
        +>>>
        +>>> text = "Hello World!"
        +>>>
        +>>> # Encode text
        +>>> input, lengths = processor(text)
        +>>>
        +>>> # Generate (mel-scale) spectrogram
        +>>> specgram, lengths, _ = tacotron2.infer(input, lengths)
        +>>>
        +>>> # Convert spectrogram to waveform
        +>>> waveforms, lengths = vocoder(specgram, lengths)
        +>>>
        +>>> # Save audio
        +>>> torchaudio.save('hello-world.wav', waveforms, vocoder.sample_rate)
        +
        +
        + +

        For the details of this API please refer to the documentation. You can also try this from the tutorial.

        + +

        (Beta) Self-Supervised Model Support

        +

TorchAudio has added the HuBERT model architecture along with pre-trained weights for wav2vec 2.0 and HuBERT. HuBERT and wav2vec 2.0 are novel approaches to audio representation learning, and they yield high accuracy when fine-tuned on downstream tasks. These models can serve as baselines in future research; therefore, TorchAudio provides a simple way to run them. Similar to the TTS pipeline, the pretrained weights and associated information, such as expected sample rates and output class labels (for fine-tuned weights), are put together as a bundle so that they can be used to build pipelines. The following example illustrates this.

>>> import torchaudio
>>>
>>> bundle = torchaudio.pipelines.HUBERT_ASR_LARGE
>>>
>>> # Build the model and load pretrained weight.
>>> model = bundle.get_model()
Downloading:
100%|███████████████████████████████| 1.18G/1.18G [00:17<00:00, 73.8MB/s]
>>> # Check the corresponding labels of the output.
>>> labels = bundle.get_labels()
>>> print(labels)
('<s>', '<pad>', '</s>', '<unk>', '|', 'E', 'T', 'A', 'O', 'N', 'I', 'H', 'S', 'R', 'D', 'L', 'U', 'M', 'W', 'C', 'F', 'G', 'Y', 'P', 'B', 'V', 'K', "'", 'X', 'J', 'Q', 'Z')
>>>
>>> # Infer the label probability distribution
>>> waveform, sample_rate = torchaudio.load('hello-world.wav')
>>>
>>> emissions, _ = model(waveform)
>>>
>>> # Pass emission to (hypothetical) decoder
>>> transcripts = ctc_decode(emissions, labels)
>>> print(transcripts[0])
HELLO WORLD

        Please refer to the documentation for more details and try out this feature using this tutorial.

        + +

        (Beta) Multi-channel support and MVDR beamforming

        +

Far-field speech recognition is a more challenging task compared to near-field recognition. Multi-channel methods such as beamforming help reduce noise and enhance the target speech.

        + +

TorchAudio now adds support for differentiable Minimum Variance Distortionless Response (MVDR) beamforming on multi-channel audio using Time-Frequency masks. Researchers can easily assemble it with any multi-channel ASR pipeline. The module offers three solutions (ref_channel, stv_evd, stv_power) and supports both single-channel and multi-channel masks (multi-channel masks are averaged inside the method). It also provides an online option that recursively updates the parameters for streaming audio. We also provide a tutorial on how to apply MVDR beamforming to multi-channel audio in the example directory.

>>> import torchaudio
>>> from torchaudio.transforms import MVDR, Spectrogram, InverseSpectrogram
>>>
>>> # Load the multi-channel noisy audio
>>> waveform_mix, sr = torchaudio.load('mix.wav')
>>> # Initialize the stft and istft modules
>>> stft = Spectrogram(n_fft=1024, hop_length=256, return_complex=True, power=None)
>>> istft = InverseSpectrogram(n_fft=1024, hop_length=256)
>>> # Get the noisy spectrogram
>>> specgram_mix = stft(waveform_mix)
>>> # Get the Time-Frequency mask via a (user-provided) machine learning model
>>> mask = model(waveform_mix)
>>> # Initialize the MVDR module
>>> mvdr = MVDR(ref_channel=0, solution="ref_channel", multi_mask=False)
>>> # Apply MVDR beamforming
>>> specgram_enhanced = mvdr(specgram_mix, mask)
>>> # Get the enhanced waveform via iSTFT
>>> waveform_enhanced = istft(specgram_enhanced, length=waveform_mix.shape[-1])

        Please refer to the documentation for more details and try out this feature using the MVDR tutorial.

        + +

        (Beta) RNN Transducer Loss

        +

The RNN transducer (RNNT) loss is part of the RNN transducer pipeline, which is a popular architecture for speech recognition tasks. Recently it has gained attention for use in streaming settings, and it has also achieved state-of-the-art WER on the LibriSpeech benchmark.

        + +

TorchAudio’s loss function supports float16 and float32 logits, has autograd and TorchScript support, and can be run on both CPU and GPU; the GPU path has a custom CUDA kernel implementation for improved performance. The implementation is consistent with the original loss function in Sequence Transduction with Recurrent Neural Networks, but relies on code from Alignment Restricted Streaming Recurrent Neural Network Transducer. Special thanks to Jay Mahadeokar and Ching-Feng Yeh for their code contributions and guidance.
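For illustration, here is a minimal sketch of calling the loss. The shapes, dtypes, and blank index below are arbitrary choices for the example, not a prescribed recipe.

import torch
from torchaudio.transforms import RNNTLoss

# Illustrative joiner output of shape (batch, time, target_len + 1, num_classes).
batch, time, target_len, num_classes = 2, 10, 5, 20
logits = torch.randn(batch, time, target_len + 1, num_classes, requires_grad=True)
targets = torch.randint(1, num_classes, (batch, target_len), dtype=torch.int32)
logit_lengths = torch.full((batch,), time, dtype=torch.int32)
target_lengths = torch.full((batch,), target_len, dtype=torch.int32)

# blank=0 here is an arbitrary choice for the example.
loss = RNNTLoss(blank=0)(logits, targets, logit_lengths, target_lengths)
loss.backward()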

        + +

        Please refer to the documentation for more details.

        + +

        (Beta) Batch support and filter bank support

        +

        torchaudio.functional.lfilter now supports batch processing and multiple filters.
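For illustration, a small sketch of batched filtering; the shapes and filter coefficients below are arbitrary assumptions.

import torch
import torchaudio.functional as F

# A batch of 4 two-channel waveforms, 16000 samples each (arbitrary shapes).
waveforms = torch.rand(4, 2, 16000)

# Simple two-tap averaging filter, purely for illustration.
b_coeffs = torch.tensor([0.5, 0.5])
a_coeffs = torch.tensor([1.0, 0.0])

filtered = F.lfilter(waveforms, a_coeffs, b_coeffs)
print(filtered.shape)  # torch.Size([4, 2, 16000])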

        + +

        (Prototype) Emformer Module

        +

        Automatic speech recognition (ASR) research and productization have increasingly focused on on-device applications. Towards supporting such efforts, TorchAudio now includes Emformer, a memory-efficient transformer architecture that has achieved state-of-the-art results on LibriSpeech in low-latency streaming scenarios, as a prototype feature.

        + +

        Please refer to the documentation for more details.

        + +

        GPU Build

        +

        GPU builds that support custom CUDA kernels in TorchAudio, like the one being used for RNN transducer loss, have been added. Following this change, TorchAudio’s binary distribution now includes CPU-only versions and CUDA-enabled versions. To use CUDA-enabled binaries, PyTorch also needs to be compatible with CUDA.

        + +

        TorchVision 0.11

        + +

        (Stable) New Models

        +

        RegNet and EfficientNet are two popular architectures that can be scaled to different computational budgets. In this release we include 22 pre-trained weights for their classification variants. The models were trained on ImageNet and the accuracies of the pre-trained models obtained on ImageNet val can be found below (see #4403, #4530 and #4293 for more details).

        + +

        The models can be used as follows:

import torch
from torchvision import models

x = torch.rand(1, 3, 224, 224)

regnet = models.regnet_y_400mf(pretrained=True)
regnet.eval()
predictions = regnet(x)

efficientnet = models.efficientnet_b0(pretrained=True)
efficientnet.eval()
predictions = efficientnet(x)

        See the full list of new models on the torchvision.models documentation page.

        + +

        We would like to thank Ross Wightman and Luke Melas-Kyriazi for contributing the weights of the EfficientNet variants.

        + +

        (Beta) FX-based Feature Extraction

        +

        A new Feature Extraction method has been added to our utilities. It uses torch.fx and enables us to retrieve the outputs of intermediate layers of a network which is useful for feature extraction and visualization.

        + +

        Here is an example of how to use the new utility:

import torch
from torchvision.models import resnet50
from torchvision.models.feature_extraction import create_feature_extractor


x = torch.rand(1, 3, 224, 224)

model = resnet50()

return_nodes = {
    "layer4.2.relu_2": "layer4"
}
model2 = create_feature_extractor(model, return_nodes=return_nodes)
intermediate_outputs = model2(x)

print(intermediate_outputs['layer4'].shape)

        We would like to thank Alexander Soare for developing this utility.

        + +

        (Stable) New Data Augmentations

        +

Two new Automatic Augmentation techniques were added: RandAugment and TrivialAugment. They apply a series of transformations to the original data to enhance it and boost model performance. The new techniques build on top of the previously added AutoAugment and focus on simplifying the approach, reducing the search space for the optimal policy, and improving the accuracy gain. These techniques enable users to reproduce recipes that achieve state-of-the-art performance on the offered models, and to apply them for transfer learning to achieve optimal accuracy on new datasets.

        + +

Both methods can be used as a drop-in replacement for the AutoAugment technique, as seen below:

from torchvision import transforms

t = transforms.RandAugment()
# t = transforms.TrivialAugmentWide()
transformed = t(image)

transform = transforms.Compose([
    transforms.Resize(256),
    transforms.RandAugment(),  # transforms.TrivialAugmentWide()
    transforms.ToTensor()])

        Read the automatic augmentation transforms for more details.

        + +

        We would like to thank Samuel G. Müller for contributing to Trivial Augment and for his help on refactoring the AA package.

        + +

        Updated Training Recipes

        +

        We have updated our training reference scripts to add support for Exponential Moving Average, Label Smoothing, Learning-Rate Warmup, Mixup, Cutmix and other SOTA primitives. The above enabled us to improve the classification Acc@1 of some pre-trained models by over 4 points. A major update of the existing pre-trained weights is expected in the next release.

        + +

        Thanks for reading. If you’re interested in these updates and want to join the PyTorch community, we encourage you to join the discussion forums and open GitHub issues. To get the latest news from PyTorch, follow us on Twitter, Medium, YouTube and LinkedIn.

        + +

Cheers!

Team PyTorch

diff --git a/blog/pytorch-1.10-released/index.html b/blog/pytorch-1.10-released/index.html
new file mode 100644
index 000000000000..3a6c8e025868
--- /dev/null
+++ b/blog/pytorch-1.10-released/index.html

PyTorch 1.10 Release, including CUDA Graphs APIs, Frontend and Compiler Improvements | PyTorch

by Team PyTorch

        +

        We are excited to announce the release of PyTorch 1.10. This release is composed of over 3,400 commits since 1.9, made by 426 contributors. We want to sincerely thank our community for continuously improving PyTorch.

        + +

        PyTorch 1.10 updates are focused on improving training and performance of PyTorch, and developer usability. The full release notes are available here. Highlights include:

1. CUDA Graphs APIs are integrated to reduce CPU overheads for CUDA workloads.
2. Several frontend APIs such as FX, torch.special, and nn.Module Parametrization have moved from beta to stable.
3. Support for automatic fusion in the JIT compiler expands to CPUs in addition to GPUs.
4. Android NNAPI support is now available in beta.

        Along with 1.10, we are also releasing major updates to the PyTorch libraries, which you can read about in this blog post.

        + +

        Frontend APIs

        + +

        (Stable) Python code transformations with FX

        + +

        FX provides a Pythonic platform for transforming and lowering PyTorch programs. It is a toolkit for pass writers to facilitate Python-to-Python transformation of functions and nn.Module instances. This toolkit aims to support a subset of Python language semantics—rather than the whole Python language—to facilitate ease of implementation of transforms. With 1.10, FX is moving to stable.

        + +

        You can learn more about FX in the official documentation and GitHub examples of program transformations implemented using torch.fx.

        + +

        (Stable) torch.special

        +

        A torch.special module, analogous to SciPy’s special module, is now available in stable. The module has 30 operations, including gamma, Bessel, and (Gauss) error functions.
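For instance, a couple of these operations can be called as follows (input values chosen arbitrarily):

import torch

x = torch.linspace(0.1, 2.0, steps=5)

# Gauss error function and log-gamma, mirroring scipy.special.erf / gammaln
print(torch.special.erf(x))
print(torch.special.gammaln(x))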

        + +

        Refer to this documentation for more details.

        + +

        (Stable) nn.Module Parametrization

        +

nn.Module parametrization, a feature that allows users to parametrize any parameter or buffer of an nn.Module without modifying the nn.Module itself, is available in stable. This release adds weight normalization (weight_norm), orthogonal parametrization (matrix constraints and part of pruning), and more flexibility when creating your own parametrization.
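As a small sketch of the mechanism (the Symmetric constraint below is just an illustrative example), registering a custom parametrization looks roughly like this:

import torch
import torch.nn as nn
from torch.nn.utils import parametrize

class Symmetric(nn.Module):
    # Constrain a square weight matrix to remain symmetric.
    def forward(self, W):
        return W.triu() + W.triu(1).transpose(-1, -2)

layer = nn.Linear(4, 4)
parametrize.register_parametrization(layer, "weight", Symmetric())

W = layer.weight  # recomputed from the underlying parameter on every access
print(torch.allclose(W, W.T))  # True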

        + +

        Refer to this tutorial and the general documentation for more details.

        + +

        (Beta) CUDA Graphs APIs Integration

        +

        PyTorch now integrates CUDA Graphs APIs to reduce CPU overheads for CUDA workloads.

        + +

CUDA Graphs greatly reduce the CPU overhead for CPU-bound CUDA workloads and thus improve performance by increasing GPU utilization. For distributed workloads, CUDA Graphs also reduce jitter, and since parallel workloads have to wait for the slowest worker, reducing jitter improves overall parallel efficiency.

        + +

The integration allows seamless interop between the parts of the network captured by CUDA graphs and the parts that cannot be captured due to graph limitations.
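As a minimal capture-and-replay sketch (assuming a CUDA device is available; the model, sizes, and warm-up pattern below are illustrative):

import torch

device = "cuda"
model = torch.nn.Linear(256, 256).to(device)
static_input = torch.randn(64, 256, device=device)

with torch.no_grad():
    # Warm up on a side stream before capture.
    s = torch.cuda.Stream()
    s.wait_stream(torch.cuda.current_stream())
    with torch.cuda.stream(s):
        for _ in range(3):
            model(static_input)
    torch.cuda.current_stream().wait_stream(s)

    # Capture one forward pass into a graph.
    g = torch.cuda.CUDAGraph()
    with torch.cuda.graph(g):
        static_output = model(static_input)

# Replay: refill the static input buffer, then launch the captured work at once.
static_input.copy_(torch.randn(64, 256, device=device))
g.replay()
print(static_output.sum().item())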

        + +

        Read the note for more details and examples, and refer to the general documentation for additional information.

        + +

(Beta) Conjugate View

        +

PyTorch’s conjugation for complex tensors (torch.conj()) is now a constant-time operation: it returns a view of the input tensor with a conjugate bit set, as can be seen by calling torch.is_conj(). This has already been leveraged in various other PyTorch operations, like matrix multiplication and dot product, to fuse conjugation with the operation, leading to significant performance gains and memory savings on both CPU and CUDA.
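A tiny illustration (values arbitrary):

import torch

x = torch.tensor([1 + 1j, 2 - 2j])
y = torch.conj(x)          # constant time: returns a view with the conjugate bit set
print(torch.is_conj(y))    # True
print(torch.matmul(x.unsqueeze(0), y.unsqueeze(1)))  # conjugation fused into matmul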

        + +

        Distributed Training

        + +

        Distributed Training Releases Now in Stable

        +

        In 1.10, there are a number of features that are moving from beta to stable in the distributed package:

• (Stable) Remote Module: This feature allows users to operate a module on a remote worker like using a local module, where the RPCs are transparent to the user. Refer to this documentation for more details.
• (Stable) DDP Communication Hook: This feature allows users to override how DDP synchronizes gradients across processes. Refer to this documentation for more details.
• (Stable) ZeroRedundancyOptimizer: This feature can be used in conjunction with DistributedDataParallel to reduce the size of per-process optimizer states. With this stable release, it now can handle uneven inputs to different data-parallel workers. Check out this tutorial. We also improved the parameter partition algorithm to better balance memory and computation overhead across processes. Refer to this documentation and this tutorial to learn more; a minimal usage sketch follows this list.
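Here is a minimal sketch of ZeroRedundancyOptimizer, assuming the process group has already been initialized (e.g., under torchrun); the model and hyperparameters are placeholders.

import torch
import torch.distributed as dist
from torch.distributed.optim import ZeroRedundancyOptimizer
from torch.nn.parallel import DistributedDataParallel as DDP

# Assumes dist.init_process_group(...) has already been called in this process.
model = DDP(torch.nn.Linear(1024, 1024).cuda())
optimizer = ZeroRedundancyOptimizer(
    model.parameters(),
    optimizer_class=torch.optim.Adam,  # per-process optimizer states are sharded across ranks
    lr=1e-3,
)

loss = model(torch.randn(8, 1024).cuda()).sum()
loss.backward()
optimizer.step()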

        Performance Optimization and Tooling

        + +

        [Beta] Profile-directed typing in TorchScript

        +

        TorchScript has a hard requirement for source code to have type annotations in order for compilation to be successful. For a long time, it was only possible to add missing or incorrect type annotations through trial and error (i.e., by fixing the type-checking errors generated by torch.jit.script one by one), which was inefficient and time consuming.

        + +

        Now, we have enabled profile directed typing for torch.jit.script by leveraging existing tools like MonkeyType, which makes the process much easier, faster, and more efficient. For more details, refer to the documentation.

        + +

        (Beta) CPU Fusion

        +

        In PyTorch 1.10, we’ve added an LLVM-based JIT compiler for CPUs that can fuse together sequences of torch library calls to improve performance. While we’ve had this capability for some time on GPUs, this release is the first time we’ve brought compilation to the CPU.
You can check out a few performance results for yourself in this Colab notebook.

        + +

        (Beta) PyTorch Profiler

        +

The objective of PyTorch Profiler is to target the execution steps that are the most costly in time and/or memory, and to visualize the workload distribution between GPUs and CPUs. PyTorch 1.10 includes the following key features (a minimal usage sketch follows the list):

• Enhanced Memory View: This helps you understand your memory usage better. This tool will help you avoid Out of Memory errors by showing active memory allocations at various points of your program run.
• Enhanced Automated Recommendations: This helps provide automated performance recommendations to help optimize your model. The tools recommend changes to batch size, TensorCore, memory reduction technologies, etc.
• Enhanced Kernel View: Additional columns show grid and block sizes as well as shared memory usage and registers per thread.
• Distributed Training: Gloo is now supported for distributed training jobs.
• Correlate Operators in the Forward & Backward Pass: This helps map the operators found in the forward pass to the backward pass, and vice versa, in a trace view.
• TensorCore: This tool shows the Tensor Core (TC) usage and provides recommendations for data scientists and framework developers.
• NVTX: Support for NVTX markers was ported from the legacy autograd profiler.
• Support for profiling on mobile devices: The PyTorch profiler now has better integration with TorchScript and mobile backends, enabling trace collection for mobile workloads.
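A minimal usage sketch (CPU-only here; the model and input shapes are arbitrary):

import torch
from torch.profiler import profile, ProfilerActivity

model = torch.nn.Sequential(torch.nn.Linear(128, 128), torch.nn.ReLU())
inputs = torch.randn(32, 128)

with profile(
    activities=[ProfilerActivity.CPU],
    record_shapes=True,
    profile_memory=True,
) as prof:
    model(inputs)

print(prof.key_averages().table(sort_by="cpu_time_total", row_limit=10))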

        Refer to this documentation for details. Check out this tutorial to learn how to get started with this feature.

        + +

        PyTorch Mobile

        + +

        (Beta) Android NNAPI Support in Beta

        +

        Last year we released prototype support for Android’s Neural Networks API (NNAPI). NNAPI allows Android apps to run computationally intensive neural networks on the most powerful and efficient parts of the chips that power mobile phones, including GPUs (Graphics Processing Units) and NPUs (specialized Neural Processing Units).

        + +

Since the prototype, we’ve added more op coverage, support for load-time flexible shapes, and the ability to run the model on the host for testing. Try out this feature using the tutorial.

        + +

        Additionally, Transfer Learning steps have been added to Object Detection examples. Check out this GitHub page to learn more. Please provide your feedback or ask questions on the forum. You can also check out this presentation to get an overview.

        + +

        Thanks for reading. If you’re interested in these updates and want to join the PyTorch community, we encourage you to join the discussion forums and open GitHub issues. To get the latest news from PyTorch, follow us on Twitter, Medium, YouTube, and LinkedIn.

        + +

Cheers!

Team PyTorch

diff --git a/blog/pytorch-1.11-new-library-releases/index.html b/blog/pytorch-1.11-new-library-releases/index.html
new file mode 100644
index 000000000000..4064255291e8
--- /dev/null
+++ b/blog/pytorch-1.11-new-library-releases/index.html

Introducing TorchRec, and other domain library updates in PyTorch 1.11 | PyTorch

by Team PyTorch

        +

        We are introducing the beta release of TorchRec and a number of improvements to the current PyTorch domain libraries, alongside the PyTorch 1.11 release. These updates demonstrate our focus on developing common and extensible APIs across all domains to make it easier for our community to build ecosystem projects on PyTorch. Highlights include:

• TorchRec, a PyTorch domain library for Recommendation Systems, is available in beta. View it on GitHub.
• TorchAudio - Added Emformer- and RNN-T-based models and recipes to support the full development lifecycle of a streaming ASR model. See the release notes here.
• TorchText - Added beta support for RoBERTa and XLM-R models, byte-level BPE tokenizer, and text datasets backed by TorchData. See the release notes here.
• TorchVision - Added 4 new model families and 14 new classification datasets such as CLEVR, GTSRB, FER2013. See the release notes here.

        TorchRec 0.1

        + +

        We announced TorchRec a few weeks ago and we are excited to release the beta version today. To recap, TorchRec is a PyTorch domain library for Recommendation Systems. This new library provides common sparsity and parallelism primitives, enabling researchers to build state-of-the-art personalization models and deploy them in production. TorchRec was used to train a 1.25 trillion parameter model, pushed to production in January 2022.

        + +

        In particular, the library includes:

• Modeling primitives, such as embedding bags and jagged tensors, that enable easy authoring of large, performant multi-device/multi-node models using hybrid data-parallelism and model-parallelism.
• Optimized RecSys kernels powered by FBGEMM, including support for sparse and quantized operations.
• A sharder which can partition embedding tables with a variety of different strategies including data-parallel, table-wise, row-wise, table-wise-row-wise, and column-wise sharding.
• A planner which can automatically generate optimized sharding plans for models.
• Pipelining to overlap dataloading device transfer (copy to GPU), inter-device communications (input_dist), and computation (forward, backward) for increased performance.
• GPU inference support.
• Common modules for RecSys, such as models and public datasets (Criteo & Movielens).

        Please check the TorchRec announcement post here, video tutorial, install instructions here, test drive the feature through this tutorial here, and refer to the reference document here.

        + +

        TorchAudio 0.11

        + +

        TorchAudio: Building Blocks for Audio and Speech Processing

        + +

        We published a paper, TorchAudio: Building Blocks for Audio and Speech Processing, describing the overview of the TorchAudio library. If you find TorchAudio useful for your research, please help us share with the community by citing our paper.

        + +

        (Beta) RNN-T & (Prototype) Emformer Models and Recipes

        + +

        + +

        + +

        Emformer is an efficient memory-transformer-based streaming acoustic model that has demonstrated state-of-the-art streaming automatic speech recognition (ASR) performance in low-latency, resource-constrained scenarios, such as on-device applications (citation: https://arxiv.org/abs/2010.10759).

        + +

        The TorchAudio v0.11 release includes the following beta features:

• Implementation of Emformer (docs)
• Recurrent neural network transducer (RNN-T) streaming ASR model that uses Emformer for its transcription network (docs)
• RNN-T beam search decoder with TorchScript support (docs)
• LibriSpeech Emformer RNN-T training recipe (GitHub) and corresponding pre-trained streaming ASR inference pipeline (docs)

There are also prototype features available from nightly builds or the main branch.

• Training recipes trained on MuST-C and TED-LIUM3 datasets. (GitHub)
• Pre-trained pipelines corresponding to the recipes. (docs)
• Tutorial that steps through performing online speech recognition with the RNN-T Emformer model. (docs)

        Collectively, these features cover the full development lifecycle of a streaming ASR model, from definition through training and inference, and enable users to easily develop their own Emformer- and RNN-T-based models.

        + +

        Special thanks to Yangyang Shi, Jay Mahadeokar, and Gil Keren for their code contributions and guidance.

        + +

        (Beta) HuBERT Pretrain Model

        + +

        The masked prediction training of HuBERT model requires the masked logits, unmasked logits, and feature norm as the outputs. The logits are for cross-entropy losses and the feature norm is for penalty loss. The release adds HuBERTPretrainModel and corresponding factory functions (hubert_pretrain_base, hubert_pretrain_large, and hubert_pretrain_xlarge) to enable training from scratch.

        + +

        (Prototype) CTC Beam Search Decoder

        + +

        In recent releases, TorchAudio has added support for ASR models fine-tuned on CTC loss. The addition of an inference time CTC beam search decoder enables running end-to-end ASR evaluation using TorchAudio utils.

        + +

        The CTC decoder in TorchAudio supports customizable beam search decoding with lexicon constraint. It also has optional KenLM language model support.

        + +

        For more details, please check out the API tutorial. This prototype feature is available through nightly builds.

        + +

        (Prototype) Streaming API

        + +

        TorchAudio started as simple audio I/O APIs that supplement PyTorch. With the recent addition of ASR models and training recipes, the project has received requests to support high-level application development.

        + +

        Streaming API makes it easy to develop and test the model in online inference. It utilizes ffmpeg under the hood, and enables reading media from online services and hardware devices, decoding media in an incremental manner, and applying filters and preprocessing.

        + +

Please check out the API tutorial and the documentation. There are also the streaming ASR tutorial and the device streaming ASR tutorial. This feature is available from nightly releases. Please refer to pytorch.org for how to install nightly builds.

        + +

        TorchText 0.12

        + +

        (Beta) RoBERTa and XLM-R Models

        + +

TorchText has added support for pre-trained RoBERTa and XLM-R models. This allows users to train end-to-end Transformer-encoder-based models on standard NLP tasks using TorchText.

        + +

        More specifically:

• The models are torchscriptable and hence can be employed for production use-cases.
• The model APIs let users easily attach custom task-specific heads with pre-trained encoders.
• The API also comes equipped with data pre-processing transforms to match the pre-trained weights and model configuration.

        We have added a tutorial to demonstrate SST-2 binary text classification task with pre-trained XLM-R base architecture.
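As a rough illustration of the bundle-based workflow (a sketch; see the documentation and tutorial for the authoritative usage):

import torchtext
from torchtext.functional import to_tensor

xlmr_base = torchtext.models.XLMR_BASE_ENCODER
model = xlmr_base.get_model()
transform = xlmr_base.transform()

input_batch = ["Hello world", "How are you!"]
model_input = to_tensor(transform(input_batch), padding_value=1)
output = model(model_input)
print(output.shape)  # (batch, sequence length, embedding dim)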

        + +

        For additional details on model APIs and usage examples, please refer to the documentation.

        + +

        (Beta) byte-level BPE tokenizer

        + +

        TorchText has added support for a Byte-Level BPE tokenizer, as used in GPT-2. This tokenizer is also used for tokenizing inputs to the pre-trained RoBERTa models described previously. In addition to the RoBERTa vocab, users can also load their own custom BPE vocab to use the tokenizer. Furthermore, the tokenizer is fully torchscriptable and hence can be employed for production use-cases. For additional details on model APIs and usage examples, please refer to the documentation.

        + +

        (Beta) Text datasets backed by TorchData

        + +

        TorchText has modernized its datasets by migrating from older-style Iterable Datasets to TorchData’s DataPipes. TorchData is a library that provides modular/composable primitives, allowing users to load and transform data in performant data pipelines.

        + +

These DataPipes work out-of-the-box with the PyTorch DataLoader and enable new functionality such as auto-sharding. Users can now easily perform data manipulation and pre-processing using user-defined functions and transformations in a functional programming style. Datasets backed by DataPipes also enable standard flow control like batching, collation, shuffling, and bucketizing.
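For example, a sketch using one of the migrated datasets (AG_NEWS yields (label, text) pairs and downloads its data on first use):

from torchtext.datasets import AG_NEWS

# The dataset is now a DataPipe, so functional-style chaining applies directly.
train_dp = AG_NEWS(split="train")
train_dp = train_dp.map(lambda sample: (sample[0], sample[1].lower()))

label, text = next(iter(train_dp))
print(label, text[:60])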

        + +

        Collectively, DataPipes provides a comprehensive experience for data preprocessing and tensorization needs in a pythonic and flexible way for model training. We have added a tutorial to demonstrate data-processing pipelining using the modernized dataset for binary text-classification.

        + +

        You can learn more about TorchData DataPipe APIs in its official documentation.

        + +

        TorchVision 0.12

        + +

        New Models

        + +

        Four new model families have been released in the latest version along with pre-trained weights for their variants.

        + +

        #1 Object Detection

        + +

        FCOS is a popular, fully convolutional, anchor-free model for object detection. In this release we include a community-contributed model implementation as well as pre-trained weights. The model was trained on COCO train2017 and can be used as follows:

import torch
from torchvision import models

x = [torch.rand(3, 224, 224)]
fcos = models.detection.fcos_resnet50_fpn(pretrained=True).eval()
predictions = fcos(x)

        The box AP of the pre-trained model on COCO val2017 is 39.2 (see #4961 for more details).

        + +

We would like to thank Hu Ye and Zhiqiang Wang for contributing to the model implementation and initial training. This was the first community-contributed model in a long while, and given its success, we decided to use the learnings from this process and create new model contribution guidelines.

        + +

        #2 Optical Flow support and RAFT model

        + +

        TorchVision now supports optical flow! Optical Flow models try to predict movement in a video: given two consecutive frames, the model predicts where each pixel of the first frame ends up in the second frame. Check out our new tutorial on Optical Flow!

        + +

        We implemented a torchscript-compatible RAFT model with pre-trained weights (both normal and “small” versions), and added support for training and evaluating optical flow models. Our training scripts support distributed training across processes and nodes, leading to much faster training time than the original implementation. We also added 5 new optical flow datasets: Flying Chairs, Flying Things, Sintel, Kitti, and HD1K.

        + +

        + +

        + +

#3 Image Classification

        + +

        Vision Transformer (ViT) and ConvNeXt are two popular architectures which can be used as image classifiers or as backbones for downstream vision tasks. In this release we include 8 pre-trained weights for their classification variants. The models were trained on ImageNet and can be used as follows:

import torch
from torchvision import models

x = torch.rand(1, 3, 224, 224)
vit = models.vit_b_16(pretrained=True).eval()
convnext = models.convnext_tiny(pretrained=True).eval()
predictions1 = vit(x)
predictions2 = convnext(x)

        The accuracies of the pre-trained models obtained on ImageNet val are seen below:

Model          | Acc@1  | Acc@5
vit_b_16       | 81.072 | 95.318
vit_b_32       | 75.912 | 92.466
vit_l_16       | 79.662 | 94.638
vit_l_32       | 76.972 | 93.07
convnext_tiny  | 82.52  | 96.146
convnext_small | 83.616 | 96.65
convnext_base  | 84.062 | 96.87
convnext_large | 84.414 | 96.976
        + +

        The above models have been trained using an adjusted version of our new training recipe and this allows us to offer models with accuracies significantly higher than the ones on the original papers.

        + +

#4 GPU Video Decoding

        + +

        In this release, we add support for GPU video decoding in the video reading API. To use hardware-accelerated decoding, we just need to pass a cuda device to the video reading API as shown below:

import torchvision

reader = torchvision.io.VideoReader(file_name, device="cuda:0")
for frame in reader:
    print(frame)

We also support seeking to any frame or a keyframe in the video before reading, as shown below:

reader.seek(seek_time)

        New Datasets

        + +

We have implemented 14 new classification datasets: CLEVR, GTSRB, FER2013, SUN397, Country211, Flowers102, FGVC-Aircraft, OxfordIIITPet, DTD, Food 101, Rendered SST2, Stanford Cars, PCAM, and EuroSAT.

        + +

        As part of our work on Optical Flow support (see above for more details), we also added 5 new optical flow datasets: Flying Chairs, Flying Things, Sintel, Kitti, and HD1K.

        + +

        Other Updates

• New documentation layout: Each function / class is now documented in a separate page, clearing up some space in the per-module pages, and easing the discovery of the proposed APIs. Compare e.g. our previous docs vs the new ones. Please let us know if you have any feedback!
• New model contribution guidelines have been published following the success of the FCOS model which was contributed by the community. These guidelines aim to be an overview of the model contribution process for anyone who would like to suggest, implement and train a new model.
• Upcoming Prototype API - We are currently working on a prototype API which adds Multi-weight support on all of our model builder methods. This will enable us to offer multiple pre-trained weights, associated with their meta-data and inference transforms. The API is still under review and thus was not included in the release but you can read more about it on our blogpost and provide your feedback on the dedicated Github issue.
• Changes in our deprecation policy - Up until now, torchvision would almost never remove deprecated APIs. In order to be more aligned and consistent with pytorch core, we are updating our deprecation policy. We are now following a 2-release deprecation cycle: deprecated APIs will raise a warning for 2 versions, and will be removed after that. To reflect these changes and to smooth the transition, we have decided to:
  • Remove all APIs that had been deprecated before or on v0.8, released 1.5 years ago.
  • Update the removal timeline of all other deprecated APIs to v0.14, to reflect the new 2-cycle policy starting now in v0.12.

        Captum 0.5

        + +

        Captum is a PyTorch library for model interpretability. For this release, we expanded Captum with influential instances and added support for both similarity based influences and novel algorithms, TracIn and its variants. TracIn variants offer faster approximation of influence scores based on random projections for fully connected layers.

        + +

More specifically, the new influence subsection of Captum includes:

• SimilarityInfluence computes similarity scores between test and training examples using default (cosine or euclidean) or custom user-defined metrics w.r.t. given input model layers.
• TracInCP approximates the influence score of each training example on a given test example based on the dot-product similarity between loss gradients w.r.t. model parameters for test and training examples. Note that if we use training examples as test examples then we compute self-influence. This method and its variants described below also return top-k proponents and opponents, which are the top-k largest positive and negative influential examples respectively.
• TracInCPFast is an approximation of TracInCP that avoids computing the gradients w.r.t. large parameter matrices. It approximates the influence score based on the dot products between last fully connected layer activations and loss gradients w.r.t. that layer for training and test examples.
• TracInCPFastRandProj uses a nearest neighbor approximation library such as annoy to compute the dot product between the training and test quantities. In order to reduce the dimensionality of layer activations and corresponding gradients, this method additionally allows projecting those vectors into a lower-dimensional space using random projection matrices.

        More about the implementation of influential instances can be found on our GitHub page and tutorials.

        + +

Thanks for reading! If you’re interested in these updates and want to join the PyTorch community, we encourage you to join the discussion forums and open GitHub issues. To get the latest news from PyTorch, follow us on Twitter, Medium, YouTube, and LinkedIn.

        + +

        Cheers!

        + +

        Team PyTorch

        + +
diff --git a/blog/pytorch-1.11-released/index.html b/blog/pytorch-1.11-released/index.html
new file mode 100644
index 000000000000..72df4944a654
--- /dev/null
+++ b/blog/pytorch-1.11-released/index.html

PyTorch 1.11, TorchData, and functorch are now available | PyTorch

by Team PyTorch

        +

        We are excited to announce the release of PyTorch 1.11 (release notes). This release is composed of over 3,300 commits since 1.10, made by 434 contributors. Along with 1.11, we are releasing beta versions of TorchData and functorch.

        + +

        Summary:

• TorchData is a new library of common modular data loading primitives for easily constructing flexible and performant data pipelines. View it on GitHub.
• functorch, a library that adds composable function transforms to PyTorch, is now available in beta. View it on GitHub.
• Distributed Data Parallel (DDP) static graph optimizations are available in stable.

        Introducing TorchData

        + +

We are delighted to present the beta release of TorchData. This is a library of common modular data loading primitives for easily constructing flexible and performant data pipelines. Based on community feedback, we found that the existing DataLoader bundled too many features together and could be difficult to extend. Moreover, different use cases often had to rewrite the same data loading utilities over and over again. The goal here is to enable composable data loading through Iterable-style and Map-style building blocks called “DataPipes” that work well out of the box with PyTorch’s DataLoader.

        + +

        A DataPipe takes in some access function over Python data structures, __iter__ for IterDataPipe and __getitem__ for MapDataPipe, and returns a new access function with a slight transformation applied. You can chain multiple DataPipes together to form a data pipeline that performs all the necessary data transformation.
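As a toy sketch, a few DataPipes can be chained and handed to DataLoader like this:

from torch.utils.data import DataLoader
from torchdata.datapipes.iter import IterableWrapper

# Wrap an in-memory iterable, then transform, filter, and batch it.
dp = IterableWrapper(range(10))
dp = dp.map(lambda x: x * 2)
dp = dp.filter(lambda x: x % 3 == 0)
dp = dp.batch(2)

for batch in DataLoader(dp, batch_size=None):
    print(batch)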

        + +

        We have implemented over 50 DataPipes that provide different core functionalities, such as opening files, parsing texts, transforming samples, caching, shuffling, and batching. For users who are interested in connecting to cloud providers (such as Google Drive or AWS S3), the fsspec and iopath DataPipes will allow you to do so. The documentation provides detailed explanations and usage examples of each IterDataPipe and MapDataPipe.

        + +

        In this release, some of the PyTorch domain libraries have migrated their datasets to use DataPipes. In TorchText, the popular datasets provided by the library are implemented using DataPipes and a section of its SST-2 binary text classification tutorial demonstrates how you can use DataPipes to preprocess data for your model. There also are other prototype implementations of datasets with DataPipes in TorchVision (available in nightly releases) and in TorchRec.

        + +

        The documentation for TorchData is now live. It contains a tutorial that covers how to use DataPipes, use them with DataLoader, and implement custom ones. FAQs and future plans related to DataLoader are described in our project’s README file.

        + +

        Introducing functorch

        + +

        We’re excited to announce the first beta release of functorch. Heavily inspired by Google JAX, functorch is a library that adds composable function transforms to PyTorch. It aims to provide composable vmap (vectorization) and autodiff transforms that work with PyTorch modules and PyTorch autograd with good eager-mode performance.

        + +

        Composable function transforms can help with a number of use cases that are tricky to do in PyTorch today:

• computing per-sample-gradients (or other per-sample quantities)
• running ensembles of models on a single machine
• efficiently batching together tasks in the inner-loop of MAML
• efficiently computing Jacobians and Hessians as well as batched ones

        Composing vmap (vectorization), vjp (reverse-mode AD), and jvp (forward-mode AD) transforms allows us to effortlessly express the above without designing a separate library for each.
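For example, per-sample gradients can be expressed as a composition of grad and vmap; the toy squared-error loss below is just an illustration:

import torch
from functorch import grad, vmap

def loss(weights, sample, target):
    return ((sample @ weights - target) ** 2).mean()

weights = torch.randn(3)
samples = torch.randn(8, 3)
targets = torch.randn(8)

# grad differentiates w.r.t. the first argument; vmap maps over the batch dim.
per_sample_grads = vmap(grad(loss), in_dims=(None, 0, 0))(weights, samples, targets)
print(per_sample_grads.shape)  # torch.Size([8, 3])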

        + +

        For more details, please see our documentation, tutorials, and installation instructions.

        + +

        Distributed Training

        + +

        (Stable) DDP static graph

        + +

DDP static graph assumes that your model employs the same set of used/unused parameters in every iteration, so that it can deterministically know states such as which hooks will fire, how many times they will fire, and the order in which gradients become ready after the first iteration. Static graph caches these states in the first iteration, so it can support features that DDP could not support in previous releases, e.g., multiple activation checkpoints on the same parameters regardless of whether there are unused parameters. The static graph feature also applies performance optimizations when there are unused parameters, e.g., it avoids traversing the graph to search for unused parameters every iteration and enables dynamic bucketing order. These optimizations in the DDP static graph brought a 10% QPS gain for some recommendation models.

        + +

        To enable static graph, just simply set static_graph=True in the DDP API like this:

ddp_model = DistributedDataParallel(model, static_graph=True)

        For more details, please see our documentation and tutorials.

        + +

Thanks for reading! If you’re interested in these updates and want to join the PyTorch community, we encourage you to join the discussion forums and open GitHub issues. To get the latest news from PyTorch, follow us on Twitter, Medium, YouTube, and LinkedIn.

        + +

        Cheers!

        + +

        Team PyTorch

        + +
diff --git a/blog/pytorch-1.12-new-library-releases/index.html b/blog/pytorch-1.12-new-library-releases/index.html
new file mode 100644
index 000000000000..ef705d97a90f
--- /dev/null
+++ b/blog/pytorch-1.12-new-library-releases/index.html
June 28, 2022

New library updates in PyTorch 1.12

by Team PyTorch

        +

        We are bringing a number of improvements to the current PyTorch libraries, alongside the PyTorch 1.12 release. These updates demonstrate our focus on developing common and extensible APIs across all domains to make it easier for our community to build ecosystem projects on PyTorch.

        + +

        Summary:

• TorchVision - Added multi-weight support API, new architectures, model variants, and pretrained weights. See the release notes here.
• TorchAudio - Introduced beta features including a streaming API, a CTC beam search decoder, and new beamforming modules and methods. See the release notes here.
• TorchText - Extended support for scriptable BERT tokenizer and added datasets for the GLUE benchmark. See the release notes here.
• TorchRec - Added EmbeddingModule benchmarks, examples for TwoTower Retrieval, inference and sequential embeddings, metrics, improved planner and demonstrated integration with production components. See the release notes here.
• TorchX - Launch PyTorch trainers developed on local workspaces onto five different types of schedulers. See the release notes here.
• FBGemm - Added and improved kernels for Recommendation Systems inference workloads, including table batched embedding bag, jagged tensor operations, and other special-case optimizations.

        TorchVision v0.13

        + +

        Multi-weight support API

        + +

        TorchVision v0.13 offers a new Multi-weight support API for loading different weights to the existing model builder methods:

from torchvision.models import *

# Old weights with accuracy 76.130%
resnet50(weights=ResNet50_Weights.IMAGENET1K_V1)

# New weights with accuracy 80.858%
resnet50(weights=ResNet50_Weights.IMAGENET1K_V2)

# Best available weights (currently alias for IMAGENET1K_V2)
# Note that these weights may change across versions
resnet50(weights=ResNet50_Weights.DEFAULT)

# Strings are also supported
resnet50(weights="IMAGENET1K_V2")

# No weights - random initialization
resnet50(weights=None)

        The new API bundles along with the weights important details such as the preprocessing transforms and meta-data such as labels. Here is how to make the most out of it:

from torchvision.io import read_image
from torchvision.models import resnet50, ResNet50_Weights

img = read_image("test/assets/encode_jpeg/grace_hopper_517x606.jpg")

# Step 1: Initialize model with the best available weights
weights = ResNet50_Weights.DEFAULT
model = resnet50(weights=weights)
model.eval()

# Step 2: Initialize the inference transforms
preprocess = weights.transforms()

# Step 3: Apply inference preprocessing transforms
batch = preprocess(img).unsqueeze(0)

# Step 4: Use the model and print the predicted category
prediction = model(batch).squeeze(0).softmax(0)
class_id = prediction.argmax().item()
score = prediction[class_id].item()
category_name = weights.meta["categories"][class_id]
print(f"{category_name}: {100 * score:.1f}%")

        You can read more about the new API in the docs. To provide your feedback, please use this dedicated Github issue.

        + +

        New architectures and model variants

        + +

        Classification

        + +

The Swin Transformer and EfficientNetV2 are two popular classification models which are often used for downstream vision tasks. This release includes 6 pre-trained weights for their classification variants. Here is how to use the new models:

import torch
from torchvision.models import *

image = torch.rand(1, 3, 224, 224)
model = swin_t(weights="DEFAULT").eval()
prediction = model(image)

image = torch.rand(1, 3, 384, 384)
model = efficientnet_v2_s(weights="DEFAULT").eval()
prediction = model(image)

        In addition to the above, we also provide new variants for existing architectures such as ShuffleNetV2, ResNeXt and MNASNet. The accuracies of all the new pre-trained models obtained on ImageNet-1K are seen below:

Model                          | Acc@1  | Acc@5
swin_t                         | 81.474 | 95.776
swin_s                         | 83.196 | 96.36
swin_b                         | 83.582 | 96.64
efficientnet_v2_s              | 84.228 | 96.878
efficientnet_v2_m              | 85.112 | 97.156
efficientnet_v2_l              | 85.808 | 97.788
resnext101_64x4d               | 83.246 | 96.454
resnext101_64x4d (quantized)   | 82.898 | 96.326
shufflenet_v2_x1_5             | 72.996 | 91.086
shufflenet_v2_x1_5 (quantized) | 72.052 | 90.700
shufflenet_v2_x2_0             | 76.230 | 93.006
shufflenet_v2_x2_0 (quantized) | 75.354 | 92.488
mnasnet0_75                    | 71.180 | 90.496
mnasnet1_3                     | 76.506 | 93.522
        + +

        We would like to thank Hu Ye for contributing to TorchVision the Swin Transformer implementation.

        + +

        (BETA) Object Detection and Instance Segmentation

        + +

        We have introduced 3 new model variants for RetinaNet, FasterRCNN and MaskRCNN that include several post-paper architectural optimizations and improved training recipes. All models can be used similarly:

import torch
from torchvision.models.detection import *

images = [torch.rand(3, 800, 600)]
model = retinanet_resnet50_fpn_v2(weights="DEFAULT")
# model = fasterrcnn_resnet50_fpn_v2(weights="DEFAULT")
# model = maskrcnn_resnet50_fpn_v2(weights="DEFAULT")
model.eval()
prediction = model(images)

        Below we present the metrics of the new variants on COCO val2017. In parenthesis we denote the improvement over the old variants:

Model                      | Box mAP     | Mask mAP
retinanet_resnet50_fpn_v2  | 41.5 (+5.1) | -
fasterrcnn_resnet50_fpn_v2 | 46.7 (+9.7) | -
maskrcnn_resnet50_fpn_v2   | 47.4 (+9.5) | 41.8 (+7.2)
        + +

        We would like to thank Ross Girshick, Piotr Dollar, Vaibhav Aggarwal, Francisco Massa and Hu Ye for their past research and contributions to this work.

        + +

        New pre-trained weights

        + +

        SWAG weights

        + +

The ViT and RegNet model variants offer new pre-trained SWAG (Supervised Weakly from hashtAGs) weights. One of the biggest of these models achieves a whopping 88.6% accuracy on ImageNet-1K. We currently offer two versions of the weights: 1) fine-tuned end-to-end weights on ImageNet-1K (highest accuracy) and 2) frozen trunk weights with a linear classifier fit on ImageNet-1K (great for transfer learning). Below we see the detailed accuracies of each model variant:

Model Weights                                    | Acc@1  | Acc@5
RegNet_Y_16GF_Weights.IMAGENET1K_SWAG_E2E_V1     | 86.012 | 98.054
RegNet_Y_16GF_Weights.IMAGENET1K_SWAG_LINEAR_V1  | 83.976 | 97.244
RegNet_Y_32GF_Weights.IMAGENET1K_SWAG_E2E_V1     | 86.838 | 98.362
RegNet_Y_32GF_Weights.IMAGENET1K_SWAG_LINEAR_V1  | 84.622 | 97.48
RegNet_Y_128GF_Weights.IMAGENET1K_SWAG_E2E_V1    | 88.228 | 98.682
RegNet_Y_128GF_Weights.IMAGENET1K_SWAG_LINEAR_V1 | 86.068 | 97.844
ViT_B_16_Weights.IMAGENET1K_SWAG_E2E_V1          | 85.304 | 97.65
ViT_B_16_Weights.IMAGENET1K_SWAG_LINEAR_V1       | 81.886 | 96.18
ViT_L_16_Weights.IMAGENET1K_SWAG_E2E_V1          | 88.064 | 98.512
ViT_L_16_Weights.IMAGENET1K_SWAG_LINEAR_V1       | 85.146 | 97.422
ViT_H_14_Weights.IMAGENET1K_SWAG_E2E_V1          | 88.552 | 98.694
ViT_H_14_Weights.IMAGENET1K_SWAG_LINEAR_V1       | 85.708 | 97.73
        + +

        The SWAG weights are released under the Attribution-NonCommercial 4.0 International license. We would like to thank Laura Gustafson, Mannat Singh and Aaron Adcock for their work and support in making the weights available to TorchVision.

Model Refresh

The release of the Multi-weight support API enabled us to refresh the most popular models and offer more accurate weights. On average, we improved each model by ~3 points. The new recipe was learned on top of ResNet50 and its details were covered in a previous blog post.

Model                          Old weights   New weights
efficientnet_b1                78.642        79.838
mobilenet_v2                   71.878        72.154
mobilenet_v3_large             74.042        75.274
regnet_y_400mf                 74.046        75.804
regnet_y_800mf                 76.42         78.828
regnet_y_1_6gf                 77.95         80.876
regnet_y_3_2gf                 78.948        81.982
regnet_y_8gf                   80.032        82.828
regnet_y_16gf                  80.424        82.886
regnet_y_32gf                  80.878        83.368
regnet_x_400mf                 72.834        74.864
regnet_x_800mf                 75.212        77.522
regnet_x_1_6gf                 77.04         79.668
regnet_x_3_2gf                 78.364        81.196
regnet_x_8gf                   79.344        81.682
regnet_x_16gf                  80.058        82.716
regnet_x_32gf                  80.622        83.014
resnet50                       76.13         80.858
resnet50 (quantized)           75.92         80.282
resnet101                      77.374        81.886
resnet152                      78.312        82.284
resnext50_32x4d                77.618        81.198
resnext101_32x8d               79.312        82.834
resnext101_32x8d (quantized)   78.986        82.574
wide_resnet50_2                78.468        81.602
wide_resnet101_2               78.848        82.51

        We would like to thank Piotr Dollar, Mannat Singh and Hugo Touvron for their past research and contributions to this work.

New Augmentations, Layers and Losses

This release brings a bunch of new primitives which can be used to produce SOTA models. Some highlights include the addition of the AugMix data-augmentation method, the DropBlock layer, the cIoU/dIoU losses and many more. We would like to thank Aditya Oke, Abhijit Deo, Yassine Alouini and Hu Ye for contributing to the project and for helping us keep TorchVision relevant and fresh.

Documentation

We completely revamped our model documentation to make it easier to browse, and added various key information such as supported image sizes or the image pre-processing steps of pre-trained weights. We now have a main model page with various summary tables of available weights, and each model has a dedicated page. Each model builder is also documented in its own page, with more details about the available weights, including accuracy, minimal image size, a link to training recipes, and other valuable info. For comparison, our previous model docs are here. To provide feedback on the new documentation, please use the dedicated GitHub issue.

TorchAudio v0.12

(BETA) Streaming API

        StreamReader is TorchAudio’s new I/O API. It is backed by FFmpeg†, and allows users to:

• Decode audio and video formats, including MP4 and AAC
• Handle input forms, such as local files, network protocols, microphones, webcams, screen captures and file-like objects
• Iterate over and decode chunk-by-chunk, while changing the sample rate or frame rate
• Apply audio and video filters, such as low-pass filter and image scaling
• Decode video with Nvidia’s hardware-based decoder (NVDEC)
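As a rough sketch of how this looks in code (the file name below is a placeholder, and the snippet assumes the torchaudio.io.StreamReader entry point described in the documentation):

from torchaudio.io import StreamReader  # requires FFmpeg libraries at runtime

# Open a local file; network URLs, devices and file-like objects also work.
streamer = StreamReader("example.mp4")  # hypothetical input file

# Request decoded audio in chunks of 4800 frames, resampled to 8 kHz.
streamer.add_basic_audio_stream(frames_per_chunk=4800, sample_rate=8000)

# Iterate over the input chunk by chunk.
for (chunk,) in streamer.stream():
    print(chunk.shape)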

For usage details, please check out the documentation and tutorials.

        † To use StreamReader, FFmpeg libraries are required. Please install FFmpeg. The coverage of codecs depends on how these libraries are configured. TorchAudio official binaries are compiled to work with FFmpeg 4 libraries; FFmpeg 5 can be used if TorchAudio is built from source.

(BETA) CTC Beam Search Decoder

        TorchAudio integrates the wav2letter CTC beam search decoder from Flashlight (GitHub). The addition of this inference time decoder enables running end-to-end CTC ASR evaluation using TorchAudio utils.

Customizable lexicon and lexicon-free decoders are supported, and both can be used with or without a KenLM n-gram language model. TorchAudio additionally supports downloading token, lexicon, and pretrained KenLM files for the LibriSpeech dataset.

For usage details, please check out the documentation and ASR inference tutorial.
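For illustration, a hedged sketch of the decoder API; the emission tensor below is a random stand-in for an acoustic model’s output:

import torch
from torchaudio.models.decoder import ctc_decoder, download_pretrained_files

# Token, lexicon and KenLM files prepared for LibriSpeech.
files = download_pretrained_files("librispeech-4-gram")

decoder = ctc_decoder(
    lexicon=files.lexicon,
    tokens=files.tokens,
    lm=files.lm,
    nbest=1,
    beam_size=50,
)

# (batch, frames, num_tokens) log-probabilities; random placeholder values here.
with open(files.tokens) as f:
    num_tokens = len(f.read().splitlines())
emission = torch.randn(1, 100, num_tokens).log_softmax(dim=-1)

hypotheses = decoder(emission)
print(" ".join(hypotheses[0][0].words))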

(BETA) New Beamforming Modules and Methods

        To improve flexibility in usage, the release adds two new beamforming modules under torchaudio.transforms: SoudenMVDR and RTFMVDR. The main differences from MVDR are:

• Use power spectral density (PSD) and relative transfer function (RTF) matrices as inputs instead of time-frequency masks. The module can be integrated with neural networks that directly predict complex-valued STFT coefficients of speech and noise
• Add 'reference_channel' as an input argument in the forward method, to allow users to select the reference channel in model training or dynamically change the reference channel in inference
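A hedged sketch of the SoudenMVDR interface follows; the spectrogram and PSD matrices are random placeholders for what would normally come from an STFT and a mask-based PSD estimator:

import torch
import torchaudio

batch, channels, freq, frames = 2, 4, 257, 100

# Multi-channel complex STFT of the noisy mixture (placeholder values).
specgram = torch.randn(batch, channels, freq, frames, dtype=torch.cfloat)

def fake_psd():
    # Hermitian PSD matrices built as X X^H; in practice these are estimated
    # from time-frequency masks predicted by a neural network.
    x = torch.randn(batch, freq, channels, channels, dtype=torch.cfloat)
    return x @ x.conj().transpose(-2, -1)

mvdr = torchaudio.transforms.SoudenMVDR()
enhanced = mvdr(specgram, fake_psd(), fake_psd(), reference_channel=0)
print(enhanced.shape)  # (batch, freq, frames): single-channel enhanced STFT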

Besides the two modules, new function-level beamforming methods are added under torchaudio.functional.

        For usage details, please check out the documentation at torchaudio.transforms and torchaudio.functional and the Speech Enhancement with MVDR Beamforming tutorial.

TorchText v0.13

GLUE Datasets

        We increased the number of datasets in TorchText from 22 to 30 by adding the remaining 8 datasets from the GLUE benchmark (SST-2 was already supported). The complete list of GLUE datasets is as follows:

• CoLA (paper): Single sentence binary classification acceptability task
• SST-2 (paper): Single sentence binary classification sentiment task
• MRPC (paper): Dual sentence binary classification paraphrase task
• QQP: Dual sentence binary classification paraphrase task
• STS-B (paper): Single sentence to float regression sentence similarity task
• MNLI (paper): Sentence ternary classification NLI task
• QNLI (paper): Sentence binary classification QA and NLI tasks
• RTE (paper): Dual sentence binary classification NLI task
• WNLI (paper): Dual sentence binary classification coreference and NLI tasks
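As a brief, hedged illustration of the dataset interface (assuming the usual torchtext.datasets entry points; the exact tuple layout of each dataset is described in its documentation):

from torchtext.datasets import SST2

train_iter = SST2(split="train")
sample = next(iter(train_iter))  # a raw example, e.g. (text, label) for SST-2
print(sample)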

        Scriptable BERT Tokenizer

TorchText has extended support for scriptable tokenizers by adding the WordPiece tokenizer used in BERT. It is one of the commonly used algorithms for splitting input text into sub-word units and was introduced in Japanese and Korean Voice Search (Schuster et al., 2012).

TorchScriptability support allows users to embed BERT text preprocessing natively in C++ without needing a Python runtime. As TorchText now supports the CMAKE build system to natively link torchtext binaries with application code, users can easily integrate BERT tokenizers for deployment needs.
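For illustration only, a sketch of using and scripting the WordPiece tokenizer; the vocabulary path is a placeholder for a BERT vocab file:

import torch
from torchtext.transforms import BERTTokenizer

VOCAB_FILE = "bert-base-uncased-vocab.txt"  # placeholder path to a BERT vocab file
tokenizer = BERTTokenizer(vocab_path=VOCAB_FILE, do_lower_case=True, return_tokens=True)
print(tokenizer("Hello World, How are you!"))

# The tokenizer is TorchScript-able, so it can be serialized for C++ deployment.
scripted = torch.jit.script(tokenizer)
print(scripted("Hello World, How are you!"))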

For usage details, please refer to the corresponding documentation.

TorchRec v0.2.0

EmbeddingModule + DLRM benchmarks

We added a set of benchmarking tests showing the performance characteristics of TorchRec’s base modules and of research models built out of TorchRec.

TwoTower Retrieval Example, with FAISS

We provide an example demonstrating training a distributed TwoTower (i.e. User-Item) Retrieval model that is sharded using TorchRec. The projected item embeddings are added to an IVFPQ FAISS index for candidate generation. The retrieval model and KNN lookup are bundled in a PyTorch model for efficient end-to-end retrieval.

Integrations

We demonstrate that TorchRec works out of the box with many components commonly used alongside PyTorch models in production-like systems, such as

• Training a TorchRec model on Ray Clusters utilizing the Torchx Ray scheduler
• Preprocessing and DataLoading with NVTabular on DLRM
• Training a TorchRec model with on-the-fly preprocessing with TorchArrow showcasing RecSys domain UDFs

        Sequential Embeddings Example: Bert4Rec

We provide an example, using TorchRec, that reimplements the BERT4Rec paper, showcasing EmbeddingCollection for non-pooled embeddings. Using DistributedModelParallel we see a 35% QPS gain over conventional data parallelism.

(Beta) Planner

The TorchRec library includes a built-in planner that selects a near-optimal sharding plan for a given model. The planner attempts to identify the best sharding plan by evaluating a series of proposals which are statically analyzed and fed into an integer partitioner. The planner is able to automatically adjust plans for a wide range of hardware setups, allowing users to scale performance seamlessly from a local development environment to large-scale production hardware. See this notebook for a more detailed tutorial.

(Beta) Inference

TorchRec Inference is a C++ library that supports multi-GPU inference. The TorchRec library is used to shard models written and packaged in Python via torch.package (an alternative to TorchScript). The torch.deploy library is used to serve inference from C++ by launching multiple Python interpreters carrying the packaged model, thus bypassing the GIL. Two models are provided as examples: DLRM multi-GPU (sharded via TorchRec) and DLRM single-GPU.

(Beta) RecMetrics

RecMetrics is a metrics library that collects common utilities and optimizations for recommendation models. It extends torchmetrics and includes:

• A centralized metrics module that allows users to add new metrics
• Commonly used metrics, including AUC, Calibration, CTR, MSE/RMSE, NE & Throughput
• Optimization for metrics related operations to reduce the overhead of metric computation
• Checkpointing

        (Prototype) Single process Batched + Fused Embeddings

Previously, TorchRec’s abstractions (EmbeddingBagCollection/EmbeddingCollection) over FBGEMM kernels, which provide benefits such as table batching, optimizer fusion, and UVM placement, could only be used in conjunction with DistributedModelParallel. We’ve decoupled these notions from sharding, and introduced the FusedEmbeddingBagCollection, which can be used as a standalone module, with all of the above features, and can also be sharded.

TorchX v0.2.0

        TorchX is a job launcher that makes it easier to run PyTorch in distributed training clusters with many scheduler integrations including Kubernetes and Slurm. We’re excited to release TorchX 0.2.0 with a number of improvements. TorchX is currently being used in production in both on-premise and cloud environments.

Check out the quickstart to start launching local and remote jobs.

        Workspaces

TorchX now supports workspaces, which allow users to easily launch training jobs using their local workspace. TorchX can automatically build a patch with your local training code on top of a base image to minimize iteration time and time to training.

.torchxconfig

Specifying options in .torchxconfig saves you from having to type long CLI commands each time you launch a job. You can also define project-level generic configs and drop a config file in your home directory for user-level overrides.

Expanded Scheduler Support

        TorchX now supports AWS Batch and Ray (experimental) schedulers in addition to our existing integrations.

Distributed Training On All Schedulers

        The TorchX dist.ddp component now works on all schedulers without any configuration. Distributed training workers will automatically discover each other when using torchelastic via the builtin dist.ddp component.

Hyper Parameter Optimization

        TorchX integrates with Ax to let you scale hyper-parameter optimizations (HPO) by launching the search trials onto remote clusters.

File and Device Mounts

TorchX now supports remote filesystem mounts and custom devices. This enables your PyTorch jobs to efficiently access cloud storage such as NFS or Lustre. The device mounts enable the use of network accelerators like InfiniBand and custom inference/training accelerators.

FBGEMM v0.2.0

        The FBGEMM library contains optimized kernels meant to improve the performance of PyTorch workloads. We’ve added a number of new features and optimizations over the last few months that we are excited to report.

Inference Table Batched Embedding (TBE)

        The table batched embedding bag (TBE) operator is an important base operation for embedding lookup for recommendation system inference on GPU. We added the following enhancements for performance and flexibility:

Alignment restriction removed

• Previously, embedding dimension * data type size had to be a multiple of 4 bytes; now the requirement is 1 byte.

        Unified Virtual Memory (UVM) caching kernel optimizations

• UVM caching kernels now scale linearly with the number of tables using UVM caching. Previously, the overhead was similar to having all tables use UVM caching
• UVM caching kernel overhead is much smaller than before

        Inference FP8 Table Batched Embedding (TBE)

The table batched embedding bag (TBE) previously supported FP32, FP16, INT8, INT4, and INT2 embedding weight types. While these weight types work well in many models, we integrate FP8 weight types (in both GPU and CPU operations) to allow for numerical and performance evaluations of FP8 in our models. Compared to INT8, FP8 does not require the additional bias and scale storage and calculations. Additionally, the next-generation H100 GPUs have FP8 support on Tensor Cores (mainly matmul ops).

Jagged Tensor Kernels

        We added optimized kernels to speed up TorchRec JaggedTensor. The purpose of JaggedTensor is to handle the case where one dimension of the input data is “jagged”, meaning that each consecutive row in a given dimension may be a different length, which is often the case with sparse feature inputs in recommendation systems. The internal representation is shown below:

[Figure: JaggedTensor internal representation]

        We added ops for converting jagged tensors from sparse to dense formats and back, performing matrix multiplications with jagged tensors, and elementwise ops.

Optimized permute102-baddbmm-permute102

It is difficult to fuse various matrix multiplications where the batch size is not the batch size of the model; switching the batch dimension is a quick solution. We created the permute102_baddbmm_permute102 operation that switches the first and the second dimension, performs the batched matrix multiplication and then switches the dimensions back. Currently we only support the forward pass with the FP16 data type and will support FP32 and the backward pass in the future.

Optimized index_select for dim 0 index selection

        index_select is normally used as part of a sparse operation. While PyTorch supports a generic index_select for an arbitrary-dimension index selection, its performance for a special case like the dim 0 index selection is suboptimal. For this reason, we implement a specialized index_select for dim 0. In some cases, we have observed 1.4x performance gain from FBGEMM’s index_select compared to the one from PyTorch (using uniform index distribution).

        + +

        More about the implementation of influential instances can be found on our GitHub page and tutorials.

        + +

Thanks for reading! If you’re interested in these updates and want to join the PyTorch community, we encourage you to join the discussion forums and open GitHub issues. To get the latest news from PyTorch, follow us on Twitter, Medium, YouTube, and LinkedIn.

        + +

        Cheers!

        + +

        Team PyTorch

PyTorch 1.12: TorchArrow, Functional API for Modules and nvFuser, are now available

by Team PyTorch

We are excited to announce the release of PyTorch 1.12 (release note)! This release is composed of over 3,124 commits from 433 contributors. Along with 1.12, we are releasing beta versions of AWS S3 Integration, PyTorch Vision Models on Channels Last on CPU, Empowering PyTorch on Intel® Xeon® Scalable processors with Bfloat16, and the FSDP API. We want to sincerely thank our dedicated community for your contributions.

        + +

        Summary:

• Functional APIs to functionally apply module computation with a given set of parameters
• Complex32 and Complex Convolutions in PyTorch
• DataPipes from TorchData fully backward compatible with DataLoader
• functorch with improved coverage for APIs
• nvFuser, a deep learning compiler for PyTorch
• Changes to float32 matrix multiplication precision on Ampere and later CUDA hardware
• TorchArrow, a new beta library for machine learning preprocessing over batch data

        Frontend APIs

Introducing TorchArrow

We’ve got a new Beta release ready for you to try and use: TorchArrow. This is a library for machine learning preprocessing over batch data. It features a performant, Pandas-style, easy-to-use API to speed up your preprocessing workflows and development.

        Currently, it provides a Python DataFrame interface with the following features:

• High-performance CPU backend, vectorized and extensible User-Defined Functions (UDFs) with Velox
• Seamless handoff with PyTorch or other model authoring, such as Tensor collation and easily plugging into PyTorch DataLoader and DataPipes
• Zero copy for external readers via Arrow in-memory columnar format

        For more details, please find our 10-min tutorial, installation instructions, API documentation, and a prototype for data preprocessing in TorchRec.
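As a rough, hedged sketch of the DataFrame interface (a toy example with made-up data; exact method availability is covered in the API documentation):

import torcharrow as ta

# A tiny DataFrame of hypothetical interaction data; column types are inferred.
df = ta.dataframe({"user_id": [1, 2, 3], "score": [0.5, 1.5, 2.5]})

# Pandas-style, eager column operations run on the Velox CPU backend.
print(df["score"].mean())
print(df["score"] + 1.0)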

(Beta) Functional API for Modules

        PyTorch 1.12 introduces a new beta feature to functionally apply Module computation with a given set of parameters. Sometimes, the traditional PyTorch Module usage pattern that maintains a static set of parameters internally is too restrictive. This is often the case when implementing algorithms for meta-learning, where multiple sets of parameters may need to be maintained across optimizer steps.

        + +

        The new torch.nn.utils.stateless.functional_call() API allows for:

• Module computation with full flexibility over the set of parameters used
• No need to reimplement your module in a functional way
• Any parameter or buffer present in the module can be swapped with an externally-defined value for use in the call. Naming for referencing parameters / buffers follows the fully-qualified form in the module’s state_dict()

        Example:

import torch
from torch import nn
from torch.nn.utils.stateless import functional_call

class MyModule(nn.Module):
    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(3, 3)
        self.bn = nn.BatchNorm1d(3)
        self.fc2 = nn.Linear(3, 3)

    def forward(self, x):
        return self.fc2(self.bn(self.fc1(x)))

m = MyModule()

# Define parameter / buffer values to use during module computation.
my_weight = torch.randn(3, 3, requires_grad=True)
my_bias = torch.tensor([1., 2., 3.], requires_grad=True)
params_and_buffers = {
    'fc1.weight': my_weight,
    'fc1.bias': my_bias,
    # Custom buffer values can be used too.
    'bn.running_mean': torch.randn(3),
}

# Apply module computation to the input with the specified parameters / buffers.
inp = torch.randn(5, 3)
output = functional_call(m, params_and_buffers, inp)

(Beta) Complex32 and Complex Convolutions in PyTorch

PyTorch today natively supports complex numbers, complex autograd, complex modules, and numerous complex operations, including linear algebra and Fast Fourier Transform (FFT) operators. Many libraries, including torchaudio and ESPNet, already make use of complex numbers in PyTorch, and PyTorch 1.12 further extends complex functionality with complex convolutions and the experimental complex32 (“complex half”) data type that enables half precision FFT operations. Due to bugs in the CUDA 11.3 package, we recommend using the CUDA 11.6 wheels if you are using complex numbers.

(Beta) Forward-mode Automatic Differentiation

        Forward-mode AD allows the computation of directional derivatives (or equivalently, Jacobian-vector products) eagerly in the forward pass. PyTorch 1.12 significantly improves the operator coverage for forward-mode AD. See our tutorial for more information.
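A minimal sketch of the torch.autograd.forward_ad workflow:

import torch
import torch.autograd.forward_ad as fwAD

def f(x):
    return x.sin().sum()

primal = torch.randn(3)
tangent = torch.randn(3)  # direction of the directional derivative

with fwAD.dual_level():
    dual_input = fwAD.make_dual(primal, tangent)
    out = f(dual_input)
    jvp = fwAD.unpack_dual(out).tangent  # Jacobian-vector product, computed in the forward pass

print(jvp)  # equals primal.cos().dot(tangent)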

TorchData

BC DataLoader + DataPipe

        `DataPipe` from TorchData becomes fully backward compatible with the existing `DataLoader` regarding shuffle determinism and dynamic sharding in both multiprocessing and distributed environments.
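For example, a DataPipe graph can be handed directly to DataLoader, which now handles shuffling and sharding across its workers (a minimal sketch):

from torch.utils.data import DataLoader
from torchdata.datapipes.iter import IterableWrapper

# shuffle() defers to DataLoader's shuffle/seed handling, and sharding_filter()
# lets DataLoader split the work across its worker processes.
dp = IterableWrapper(range(10)).shuffle().sharding_filter()

if __name__ == "__main__":
    dl = DataLoader(dp, batch_size=2, shuffle=True, num_workers=2)
    for batch in dl:
        print(batch)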

(Beta) AWS S3 Integration

DataPipes based on AWSSDK have been integrated into TorchData. They provide the following features backed by the native AWSSDK:

• Retrieve list of urls from each S3 bucket based on prefix
  • Support timeout to prevent hanging indefinitely
  • Support to specify S3 bucket region
• Load data from S3 urls
  • Support buffered and multi-part download
  • Support to specify S3 bucket region
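As a hedged sketch (the bucket, prefix and region are placeholders, and the functional DataPipe names below assume the AWSSDK-backed S3 DataPipes shipped with this release):

from torchdata.datapipes.iter import IterableWrapper

urls = IterableWrapper(["s3://my-bucket/my-prefix/"])   # hypothetical bucket/prefix
files = urls.list_files_by_s3(region="us-east-1")       # expand the prefix into object URLs
data = files.load_files_by_s3(region="us-east-1")       # stream the object contents

for url, stream in data:
    print(url, stream.read(16))
    break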

AWS native DataPipes are still in the beta phase, and we will keep tuning them to improve their performance.

(Prototype) DataLoader2

DataLoader2 is now available in prototype mode. We are introducing new ways to interact between DataPipes, the DataLoading API, and backends (aka ReadingServices). The feature is stable in terms of API, but not yet functionally complete. We welcome early adopters, feedback, and potential contributors.

For more details, please check out the link.

functorch

Inspired by Google JAX, functorch is a library that offers composable vmap (vectorization) and autodiff transforms. It enables advanced autodiff use cases that would otherwise be tricky to express in PyTorch.

        We’re excited to announce functorch 0.2.0 with a number of improvements and new experimental features.

Significantly improved coverage

        We significantly improved coverage for functorch.jvp (our forward-mode autodiff API) and other APIs that rely on it (functorch.{jacfwd, hessian}).
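A small sketch of the transforms in action:

import torch
from functorch import vmap, grad, jacfwd, hessian

def f(x):
    return (x ** 3).sum()

x = torch.randn(4)
print(grad(f)(x))     # reverse-mode gradient: 3 * x**2
print(jacfwd(f)(x))   # same quantity computed with forward-mode AD
print(hessian(f)(x))  # diagonal matrix with entries 6 * x

# Transforms compose, e.g. per-sample gradients over a batch of inputs.
batched_x = torch.randn(8, 4)
print(vmap(grad(f))(batched_x).shape)  # torch.Size([8, 4])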

(Prototype) functorch.experimental.functionalize

Given a function f, functionalize(f) returns a new function without mutations (with caveats). This is useful for constructing traces of PyTorch functions without in-place operations. For example, you can use make_fx(functionalize(f)) to construct a mutation-free trace of a PyTorch function. To learn more, please see the documentation.
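A minimal sketch, assuming the functorch 0.2 entry points:

import torch
from functorch import make_fx
from functorch.experimental import functionalize

def f(x):
    y = x.clone()
    y.add_(1)   # in-place mutation
    return y * 2

x = torch.randn(3)

# Trace a mutation-free version of f; the add_ above becomes an out-of-place op.
graph = make_fx(functionalize(f))(x)
print(graph.code)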

        + +

        For more details, please see our installation instructions, documentation, tutorials, and release notes.

Performance Improvements

Introducing nvFuser, a deep learning compiler for PyTorch

In PyTorch 1.12, TorchScript is updating its default fuser (for Volta and later CUDA accelerators) to nvFuser, which supports a wider range of operations and is faster than NNC, the previous fuser for CUDA devices. A soon-to-be-published blog post will elaborate on nvFuser and show how it speeds up training on a variety of networks.

        + +

        See the nvFuser documentation for more details on usage and debugging.
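Since nvFuser is now the default TorchScript fuser on supported GPUs, simply scripting a pointwise-heavy function is enough to exercise it; a small sketch (no extra configuration assumed):

import torch

@torch.jit.script
def fused_gelu_bias(x, bias):
    return torch.nn.functional.gelu(x + bias)

if torch.cuda.is_available():
    x = torch.randn(1024, 1024, device="cuda")
    bias = torch.randn(1024, device="cuda")
    for _ in range(3):  # warm-up runs let the profiling executor fuse the hot path
        out = fused_gelu_bias(x, bias)
    print(out.shape)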

Changes to float32 matrix multiplication precision on Ampere and later CUDA hardware

PyTorch supports a variety of “mixed precision” techniques, like the torch.amp (Automatic Mixed Precision) module and performing float32 matrix multiplications using the TensorFloat32 datatype on Ampere and later CUDA hardware for faster internal computations. In PyTorch 1.12 we’re changing the default behavior of float32 matrix multiplications to always use full IEEE fp32 precision, which is more precise but slower than using the TensorFloat32 datatype for internal computation. For devices with a particularly high ratio of TensorFloat32 to float32 throughput such as A100, this change in defaults can result in a large slowdown.

If you’ve been using TensorFloat32 matrix multiplications then you can continue to do so by setting torch.backends.cuda.matmul.allow_tf32 = True, which has been supported since PyTorch 1.7. Starting in PyTorch 1.12, the new matmul precision API can be used, too: torch.set_float32_matmul_precision("highest"|"high"|"medium").

        + +

        To reiterate, PyTorch’s new default is “highest” precision for all device types. We think this provides better consistency across device types for matrix multiplications. Documentation for the new precision API can be found here. Setting the “high” or “medium” precision types will enable TensorFloat32 on Ampere and later CUDA devices. If you’re updating to PyTorch 1.12 then to preserve the current behavior and faster performance of matrix multiplications on Ampere devices, set precision to “high”.
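Concretely:

import torch

# Opt back into TensorFloat32 matmuls on Ampere and later GPUs, either via
# the new matmul precision API...
torch.set_float32_matmul_precision("high")

# ...or via the pre-1.12 flag, which remains supported.
torch.backends.cuda.matmul.allow_tf32 = True

device = "cuda" if torch.cuda.is_available() else "cpu"
a = torch.randn(1024, 1024, device=device)
b = torch.randn(1024, 1024, device=device)
c = a @ b  # uses TF32 internally on capable CUDA devices under "high"/"medium"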

        + +

Using mixed precision techniques is essential for training many modern deep learning networks efficiently, and if you’re already using torch.amp this change is unlikely to affect you. If you’re not familiar with mixed precision training then see our soon-to-be-published “What Every User Should Know About Mixed Precision Training in PyTorch” blog post.

(Beta) Accelerating PyTorch Vision Models with Channels Last on CPU

Memory formats have a significant impact on performance when running vision models; generally, Channels Last is more favorable from a performance perspective due to better data locality. 1.12 includes fundamental concepts of memory formats and demonstrates performance benefits using Channels Last on popular PyTorch vision models on Intel® Xeon® Scalable processors.

• Enables Channels Last memory format support for the commonly used operators in the CV domain on CPU, applicable for both inference and training
• Provides native level optimization on Channels Last kernels from ATen, applicable for both AVX2 and AVX512
• Delivers 1.3x to 1.8x inference performance gain over Channels First for TorchVision models on Intel® Xeon® Ice Lake (or newer) CPUs
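Enabling Channels Last for inference is a two-line change; a minimal sketch:

import torch
import torchvision

model = torchvision.models.resnet50(weights=None).eval()
x = torch.randn(1, 3, 224, 224)

# Convert both the model and the input to the channels-last memory format.
model = model.to(memory_format=torch.channels_last)
x = x.contiguous(memory_format=torch.channels_last)

with torch.no_grad():
    out = model(x)
print(out.shape)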

        (Beta) Empowering PyTorch on Intel® Xeon® Scalable processors with Bfloat16

Reduced precision numeric formats like bfloat16 improve PyTorch performance across multiple deep learning training workloads. PyTorch 1.12 includes the latest software enhancements on bfloat16 which apply to a broader scope of user scenarios and showcase even higher performance gains. The main improvements include:

• 2x hardware compute throughput vs. float32 with the new bfloat16 native instruction VDPBF16PS, introduced on Intel® Xeon® Cooper Lake CPUs
• 1/2 memory footprint of float32, faster speed for memory bandwidth intensive operators
• 1.4x to 2.2x inference performance gain over float32 for TorchVision models on Intel® Xeon® Cooper Lake (or newer) CPUs
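A minimal sketch of bfloat16 inference on CPU via autocast:

import torch
import torchvision

model = torchvision.models.resnet50(weights=None).eval()
x = torch.randn(1, 3, 224, 224)

# On Cooper Lake and newer Xeons, the bfloat16 kernels use the VDPBF16PS instruction.
with torch.no_grad(), torch.autocast(device_type="cpu", dtype=torch.bfloat16):
    out = model(x)
print(out.dtype)  # torch.bfloat16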

        (Prototype) Introducing Accelerated PyTorch Training on Mac

        + +

        With the PyTorch 1.12 release, developers and researchers can now take advantage of Apple silicon GPUs for significantly faster model training. This unlocks the ability to perform machine learning workflows like prototyping and fine-tuning locally, right on Mac. Accelerated GPU training is enabled using Apple’s Metal Performance Shaders (MPS) as a backend. The benefits include performance speedup from accelerated GPU training and the ability to train larger networks or batch sizes locally. Learn more here.
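Using the new backend is as simple as selecting the "mps" device (a minimal sketch):

import torch

device = "mps" if torch.backends.mps.is_available() else "cpu"

model = torch.nn.Linear(128, 64).to(device)
x = torch.randn(32, 128, device=device)
print(model(x).device)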

Accelerated GPU training and evaluation speedups over CPU-only (times faster)

        Alongside the new MPS device support, the M1 binaries for Core and Domain libraries that have been available for the last few releases are now an official prototype feature. These binaries can be used to run PyTorch natively on Apple Silicon.

(Prototype) BetterTransformer: Fastpath execution for Transformer Encoder Inference

PyTorch now supports CPU and GPU fastpath implementations (“BetterTransformer”) for several Transformer Encoder modules including TransformerEncoder, TransformerEncoderLayer, and MultiHeadAttention (MHA). The BetterTransformer fastpath architecture is consistently faster – 2x for many common execution scenarios, depending on model and input characteristics. The new BetterTransformer-enabled modules are API compatible with previous releases of the PyTorch Transformer API and will accelerate existing models if they meet fastpath execution requirements, as well as read models trained with previous versions of PyTorch. PyTorch 1.12 includes:

• BetterTransformer integration for Torchtext’s pretrained RoBERTa and XLM-R models
• Torchtext which builds on the PyTorch Transformer API
• Fastpath execution for improved performance by reducing execution overheads with fused kernels that combine multiple operators into a single kernel
• Option to achieve additional speedups by taking advantage of data sparsity during the processing of padding tokens in natural-language processing (by setting enable_nested_tensor=True when creating a TransformerEncoder)
• Diagnostics to help users understand why fastpath execution did not occur
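A minimal sketch of an encoder that can take the fastpath during inference:

import torch
import torch.nn as nn

layer = nn.TransformerEncoderLayer(d_model=512, nhead=8, batch_first=True)
# enable_nested_tensor=True lets the encoder exploit sparsity in padded batches.
encoder = nn.TransformerEncoder(layer, num_layers=6, enable_nested_tensor=True).eval()

src = torch.rand(32, 10, 512)                          # (batch, seq, feature)
padding_mask = torch.zeros(32, 10, dtype=torch.bool)   # True marks padded positions

with torch.inference_mode():
    out = encoder(src, src_key_padding_mask=padding_mask)
print(out.shape)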

Distributed

        (Beta) Fully Sharded Data Parallel (FSDP) API

        + +

        FSDP API helps easily scale large model training by sharding a model’s parameters, gradients and optimizer states across data parallel workers while maintaining the simplicity of data parallelism. The prototype version was released in PyTorch 1.11 with a minimum set of features that helped scaling tests of models with up to 1T parameters.

In this beta release, the FSDP API added the following features to support various production workloads. Highlights of the newly added features in this beta release include:

1. Universal sharding strategy API - Users can easily change between sharding strategies with a single line change, and thus compare and use DDP (only data sharding), FSDP (full model and data sharding), or Zero2 (only sharding of optimizer and gradients) to optimize memory and performance for their specific training needs
2. Fine grained mixed precision policies - Users can specify a mix of half and full data types (bfloat16, fp16 or fp32) for model parameters, gradient communication, and buffers via mixed precision policies. Models are automatically saved in fp32 to allow for maximum portability
3. Transformer auto wrapping policy - allows for optimal wrapping of Transformer based models by registering the model’s layer class, and thus accelerated training performance
4. Faster model initialization using device_id init - initialization is performed in a streaming fashion to avoid OOM issues and optimize init performance vs CPU init
5. Rank0 streaming for full model saving of larger models - Fully sharded models can be saved by all GPUs streaming their shards to the rank 0 GPU, and the model is built in full state on the rank 0 CPU for saving
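A condensed sketch of wrapping a model with several of these options; it assumes a CUDA device and an already-initialized process group (e.g. via torchrun), and the model is a placeholder:

import torch
from torch.distributed.fsdp import (
    FullyShardedDataParallel as FSDP,
    ShardingStrategy,
    MixedPrecision,
)

model = torch.nn.Linear(1024, 1024).cuda()  # placeholder model

fsdp_model = FSDP(
    model,
    sharding_strategy=ShardingStrategy.SHARD_GRAD_OP,   # Zero2-style sharding
    mixed_precision=MixedPrecision(
        param_dtype=torch.bfloat16,
        reduce_dtype=torch.bfloat16,
        buffer_dtype=torch.bfloat16,
    ),
    device_id=torch.cuda.current_device(),              # streaming init on the target GPU
)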

For more details and example code, please check out the documentation and the tutorial.

        + +

Thanks for reading! If you’re interested in these updates and want to join the PyTorch community, we encourage you to join the discussion forums and open GitHub issues. To get the latest news from PyTorch, follow us on Twitter, Medium, YouTube, and LinkedIn.

        + +

        Cheers!

        + +

        Team PyTorch

New Releases: PyTorch 1.2, torchtext 0.4, torchaudio 0.3, and torchvision 0.4

by Team PyTorch

        Since the release of PyTorch 1.0, we’ve seen the community expand to add new tools, contribute to a growing set of models available in the PyTorch Hub, and continually increase usage in both research and production.

        + +

        From a core perspective, PyTorch has continued to add features to support both research and production usage, including the ability to bridge these two worlds via TorchScript. Today, we are excited to announce that we have four new releases including PyTorch 1.2, torchvision 0.4, torchaudio 0.3, and torchtext 0.4. You can get started now with any of these releases at pytorch.org.

PyTorch 1.2

        With PyTorch 1.2, the open source ML framework takes a major step forward for production usage with the addition of an improved and more polished TorchScript environment. These improvements make it even easier to ship production models, expand support for exporting ONNX formatted models, and enhance module level support for Transformers. In addition to these new features, TensorBoard is now no longer experimental - you can simply type from torch.utils.tensorboard import SummaryWriter to get started.
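For example (the log directory is a hypothetical path):

from torch.utils.tensorboard import SummaryWriter

writer = SummaryWriter(log_dir="runs/example")  # hypothetical log directory
for step in range(100):
    writer.add_scalar("train/loss", 1.0 / (step + 1), step)
writer.close()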

TorchScript Improvements

Since its release in PyTorch 1.0, TorchScript has provided a path to production for eager PyTorch models. The TorchScript compiler converts PyTorch models to a statically typed graph representation, opening up opportunities for optimization and execution in constrained environments where Python is not available. You can incrementally convert your model to TorchScript, mixing compiled code seamlessly with Python.

        + +

        PyTorch 1.2 significantly expands TorchScript’s support for the subset of Python used in PyTorch models and delivers a new, easier-to-use API for compiling your models to TorchScript. See the migration guide for details. Below is an example usage of the new API:

import torch

class MyModule(torch.nn.Module):
    def __init__(self, N, M):
        super(MyModule, self).__init__()
        self.weight = torch.nn.Parameter(torch.rand(N, M))

    def forward(self, input):
        if input.sum() > 0:
            output = self.weight.mv(input)
        else:
            output = self.weight + input
        return output

# Compile the model code to a static representation
my_script_module = torch.jit.script(MyModule(3, 4))

# Save the compiled code and model data so it can be loaded elsewhere
my_script_module.save("my_script_module.pt")

To learn more, see our Introduction to TorchScript and Loading a PyTorch Model in C++ tutorials.

Expanded ONNX Export

The ONNX community continues to grow with an open governance structure and additional steering committee members, special interest groups (SIGs), and working groups (WGs). In collaboration with Microsoft, we’ve added full support to export ONNX Opset versions 7 (v1.2), 8 (v1.3), 9 (v1.4) and 10 (v1.5). We have also enhanced the constant folding pass to support Opset 10, the latest available version of ONNX. ScriptModule has also been improved, including support for multiple outputs, tensor factories, and tuples as inputs and outputs. Additionally, users are now able to register their own symbolic to export custom ops, and specify the dynamic dimensions of inputs during export. Here is a summary of all of the major improvements:

• Support for multiple Opsets including the ability to export dropout, slice, flip, and interpolate in Opset 10.
• Improvements to ScriptModule including support for multiple outputs, tensor factories, and tuples as inputs and outputs.
• More than a dozen additional PyTorch operators supported including the ability to export a custom operator.
• Many bug fixes and test infra improvements.

        You can try out the latest tutorial here, contributed by @lara-hdr at Microsoft. A big thank you to the entire Microsoft team for all of their hard work to make this release happen!

nn.Transformer

        In PyTorch 1.2, we now include a standard nn.Transformer module, based on the paper “Attention is All You Need”. The nn.Transformer module relies entirely on an attention mechanism to draw global dependencies between input and output. The individual components of the nn.Transformer module are designed so they can be adopted independently. For example, the nn.TransformerEncoder can be used by itself, without the larger nn.Transformer. The new APIs include:

• nn.Transformer
• nn.TransformerEncoder and nn.TransformerEncoderLayer
• nn.TransformerDecoder and nn.TransformerDecoderLayer
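A minimal usage sketch:

import torch
import torch.nn as nn

transformer = nn.Transformer(d_model=512, nhead=8,
                             num_encoder_layers=6, num_decoder_layers=6)

src = torch.rand(10, 32, 512)  # (source length, batch, feature)
tgt = torch.rand(20, 32, 512)  # (target length, batch, feature)
out = transformer(src, tgt)
print(out.shape)  # torch.Size([20, 32, 512])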

        See the Transformer Layers documentation for more information. See here for the full PyTorch 1.2 release notes.

Domain API Library Updates

PyTorch domain libraries like torchvision, torchtext, and torchaudio provide convenient access to common datasets, models, and transforms that can be used to quickly create a state-of-the-art baseline. Moreover, they also provide common abstractions to reduce boilerplate code that users might have to otherwise repeatedly write. Since research domains have distinct requirements, an ecosystem of specialized libraries called domain APIs (DAPI) has emerged around PyTorch to simplify the development of new and existing algorithms in a number of fields. We’re excited to release three updated DAPI libraries for text, audio, and vision that complement the PyTorch 1.2 core release.

Torchaudio 0.3 with Kaldi Compatibility, New Transforms

        Torchaudio specializes in machine understanding of audio waveforms. It is an ML library that provides relevant signal processing functionality (but is not a general signal processing library). It leverages PyTorch’s GPU support to provide many tools and transformations for waveforms to make data loading and standardization easier and more readable. For example, it offers data loaders for waveforms using sox, and transformations such as spectrograms, resampling, and mu-law encoding and decoding.

        + +

        We are happy to announce the availability of torchaudio 0.3.0, with a focus on standardization and complex numbers, a transformation (resample) and two new functionals (phase_vocoder, ISTFT), Kaldi compatibility, and a new tutorial. Torchaudio was redesigned to be an extension of PyTorch and a part of the domain APIs (DAPI) ecosystem.

Standardization

        Significant effort in solving machine learning problems goes into data preparation. In this new release, we’ve updated torchaudio’s interfaces for its transformations to standardize around the following vocabulary and conventions.

        + +

        Tensors are assumed to have channel as the first dimension and time as the last dimension (when applicable). This makes it consistent with PyTorch’s dimensions. For size names, the prefix n_ is used (e.g. “a tensor of size (n_freq, n_mel)”) whereas dimension names do not have this prefix (e.g. “a tensor of dimension (channel, time)”). The input of all transforms and functions now assumes channel first. This is done to be consistent with PyTorch, which has channel followed by the number of samples. The channel parameter of all transforms and functions is now deprecated.

        + +

        The output of STFT is (channel, frequency, time, 2), meaning for each channel, the columns are the Fourier transform of a certain window, so as we travel horizontally we can see each column (the Fourier transformed waveform) change over time. This matches the output of librosa so we no longer need to transpose in our test comparisons with Spectrogram, MelScale, MelSpectrogram, and MFCC. Moreover, because of these new conventions, we deprecated LC2CL and BLC2CBL which were used to transfer from one shape of signal to another.

        + +

        As part of this release, we’re also introducing support for complex numbers via tensors of dimension (…, 2), and providing magphase to convert such a tensor into its magnitude and phase, and similarly complex_norm and angle.

        + +

        The details of the standardization are provided in the README.

Functionals, Transformations, and Kaldi Compatibility

        Prior to the standardization, we separated state and computation into torchaudio.transforms and torchaudio.functional.

        + +

        As part of the transforms, we’re adding a new transformation in 0.3.0: Resample. Resample can upsample or downsample a waveform to a different frequency.
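For example (one second of random audio stands in for a real waveform):

import torch
import torchaudio

waveform = torch.randn(1, 44100)  # one second of hypothetical audio at 44.1 kHz
resample = torchaudio.transforms.Resample(orig_freq=44100, new_freq=16000)
print(resample(waveform).shape)   # roughly (1, 16000)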

        + +

As part of the functionals, we’re introducing: phase_vocoder, a phase vocoder to change the speed of a waveform without changing its pitch, and ISTFT, the inverse STFT implemented to be compatible with the STFT provided by PyTorch. This separation allows us to make the functionals weakly scriptable and to utilize JIT in 0.3.0. We thus have JIT and CUDA support for the following transformations: Spectrogram, AmplitudeToDB (previously named SpectrogramToDB), MelScale, MelSpectrogram, MFCC, MuLawEncoding, MuLawDecoding (previously named MuLawExpanding).

        + +

        We now also provide a compatibility interface with Kaldi to ease onboarding and reduce a user’s code dependency on Kaldi. We now have an interface for spectrogram, fbank, and resample_waveform.

New Tutorial

        To showcase the new conventions and transformations, we have a new tutorial demonstrating how to preprocess waveforms using torchaudio. This tutorial walks through an example of loading a waveform and applying some of the available transformations to it.

        + +

        We are excited to see an active community around torchaudio and eager to further grow and support it. We encourage you to go ahead and experiment for yourself with this tutorial and the two datasets that are available: VCTK and YESNO! They have an interface to download the datasets and preprocess them in a convenient format. You can find the details in the release notes here.

Torchtext 0.4 with supervised learning datasets

        A key focus area of torchtext is to provide the fundamental elements to help accelerate NLP research. This includes easy access to commonly used datasets and basic preprocessing pipelines for working on raw text based data. The torchtext 0.4.0 release includes several popular supervised learning baselines with “one-command” data loading. A tutorial is included to show how to use the new datasets for text classification analysis. We also added and improved on a few functions such as get_tokenizer and build_vocab_from_iterator to make it easier to implement future datasets. Additional examples can be found here.

        + +

        Text classification is an important task in Natural Language Processing with many applications, such as sentiment analysis. The new release includes several popular text classification datasets for supervised learning including:

• AG_NEWS
• SogouNews
• DBpedia
• YelpReviewPolarity
• YelpReviewFull
• YahooAnswers
• AmazonReviewPolarity
• AmazonReviewFull

        Each dataset comes with two parts (train vs. test), and can be easily loaded with a single command. The datasets also support an ngrams feature to capture the partial information about the local word order. Take a look at the tutorial here to learn more about how to use the new datasets for supervised problems such as text classification analysis.

from torchtext.datasets.text_classification import DATASETS
train_dataset, test_dataset = DATASETS['AG_NEWS'](ngrams=2)

        In addition to the domain library, PyTorch provides many tools to make data loading easy. Users now can load and preprocess the text classification datasets with some well supported tools, like torch.utils.data.DataLoader and torch.utils.data.IterableDataset. Here are a few lines to wrap the data with DataLoader. More examples can be found here.

from torch.utils.data import DataLoader
data = DataLoader(train_dataset, collate_fn=generate_batch)

        Check out the release notes here to learn more and try out the tutorial here.

Torchvision 0.4 with Support for Video

        Video is now a first-class citizen in torchvision, with support for data loading, datasets, pre-trained models, and transforms. The 0.4 release of torchvision includes:

• Efficient IO primitives for reading/writing video files (including audio), with support for arbitrary encodings and formats.
• Standard video datasets, compatible with torch.utils.data.Dataset and torch.utils.data.DataLoader.
• Pre-trained models built on the Kinetics-400 dataset for action classification on videos (including the training scripts).
• Reference training scripts for training your own video models.

We wanted working with video data in PyTorch to be as straightforward as possible, without compromising too much on performance. As such, we avoid the steps that would require re-encoding the videos beforehand, as it would involve:

• A preprocessing step which duplicates the dataset in order to re-encode it.
• An overhead in time and space because this re-encoding is time-consuming.
• Generally, an external script should be used to perform the re-encoding.

        Additionally, we provide APIs such as the utility class, VideoClips, that simplifies the task of enumerating all possible clips of fixed size in a list of video files by creating an index of all clips in a set of videos. It also allows you to specify a fixed frame-rate for the videos. An example of the API is provided below:

from torchvision.datasets.video_utils import VideoClips

class MyVideoDataset(object):
    def __init__(self, video_paths):
        self.video_clips = VideoClips(video_paths,
                                      clip_length_in_frames=16,
                                      frames_between_clips=1,
                                      frame_rate=15)

    def __getitem__(self, idx):
        video, audio, info, video_idx = self.video_clips.get_clip(idx)
        return video, audio

    def __len__(self):
        return self.video_clips.num_clips()

        Most of the user-facing API is in Python, similar to PyTorch, which makes it easily extensible. Plus, the underlying implementation is fast — torchvision decodes as little as possible from the video on-the-fly in order to return a clip from the video.

        + +

        Check out the torchvision 0.4 release notes here for more details.

        + +

        We look forward to continuing our collaboration with the community and hearing your feedback as we further improve and expand the PyTorch deep learning platform.

        + +

        We’d like to thank the entire PyTorch team and the community for all of the contributions to this work!

PyTorch 1.6 now includes Stochastic Weight Averaging

by Pavel Izmailov, Andrew Gordon Wilson and Vincent Quenneville-Belair

        Do you use stochastic gradient descent (SGD) or Adam? Regardless of the procedure you use to train your neural network, you can likely achieve significantly better generalization at virtually no additional cost with a simple new technique now natively supported in PyTorch 1.6, Stochastic Weight Averaging (SWA) [1]. Even if you have already trained your model, it’s easy to realize the benefits of SWA by running SWA for a small number of epochs starting with a pre-trained model. Again and again, researchers are discovering that SWA improves the performance of well-tuned models in a wide array of practical applications with little cost or effort!

        + +

        SWA has a wide range of applications and features:

        +
          +
        • SWA significantly improves performance compared to standard training techniques in computer vision (e.g., VGG, ResNets, Wide ResNets and DenseNets on ImageNet and CIFAR benchmarks [1, 2]).
        • +
        • SWA provides state-of-the-art performance on key benchmarks in semi-supervised learning and domain adaptation [2].
        • +
        • SWA was shown to improve performance in language modeling (e.g., AWD-LSTM on WikiText-2 [4]) and policy-gradient methods in deep reinforcement learning [3].
        • +
        • SWAG, an extension of SWA, can approximate Bayesian model averaging in Bayesian deep learning and achieves state-of-the-art uncertainty calibration results in various settings. Moreover, its recent generalization MultiSWAG provides significant additional performance gains and mitigates double-descent [4, 10]. Another approach, Subspace Inference, approximates the Bayesian posterior in a small subspace of the parameter space around the SWA solution [5].
        • +
        • SWA for low precision training, SWALP, can match the performance of full-precision SGD training, even with all numbers quantized down to 8 bits, including gradient accumulators [6].
        • +
        • SWA in parallel, SWAP, was shown to greatly speed up the training of neural networks by using large batch sizes and, in particular, set a record by training a neural network to 94% accuracy on CIFAR-10 in 27 seconds [11].
        • +
        + +
        + +
        + +

        Figure 1. Illustrations of SWA and SGD with a Preactivation ResNet-164 on CIFAR-100 [1]. Left: test error surface for three FGE samples and the corresponding SWA solution (averaging in weight space). Middle and Right: test error and train loss surfaces showing the weights proposed by SGD (at convergence) and SWA, starting from the same initialization of SGD after 125 training epochs. Please see [1] for details on how these figures were constructed.

        + +

        In short, SWA performs an equal average of the weights traversed by SGD (or any stochastic optimizer) with a modified learning rate schedule (see the left panel of Figure 1.). SWA solutions end up in the center of a wide flat region of loss, while SGD tends to converge to the boundary of the low-loss region, making it susceptible to the shift between train and test error surfaces (see the middle and right panels of Figure 1). We emphasize that SWA can be used with any optimizer, such as Adam, and is not specific to SGD.

        + +

        Previously, SWA was in PyTorch contrib. In PyTorch 1.6, we provide a new convenient implementation of SWA in torch.optim.swa_utils.

        + +

        Is this just Averaged SGD?

        + +

        At a high level, averaging SGD iterates dates back several decades in convex optimization [7, 8], where it is sometimes referred to as Polyak-Ruppert averaging, or averaged SGD. But the details matter. Averaged SGD is often used in conjunction with a decaying learning rate, and an exponential moving average (EMA), typically for convex optimization. In convex optimization, the focus has been on improved rates of convergence. In deep learning, this form of averaged SGD smooths the trajectory of SGD iterates but does not perform very differently.

        + +

        By contrast, SWA uses an equal average of SGD iterates with a modified cyclical or high constant learning rate and exploits the flatness of training objectives [8] specific to deep learning for improved generalization.

        + +

        How does Stochastic Weight Averaging Work?

        + +

        There are two important ingredients that make SWA work. First, SWA uses a modified learning rate schedule so that SGD (or other optimizers such as Adam) continues to bounce around the optimum and explore diverse models instead of simply converging to a single solution. For example, we can use the standard decaying learning rate strategy for the first 75% of training time and then set the learning rate to a reasonably high constant value for the remaining 25% of the time (see Figure 2 below). The second ingredient is to take an average of the weights (typically an equal average) of the networks traversed by SGD. For example, we can maintain a running average of the weights obtained at the end of every epoch within the last 25% of training time (see Figure 2). After training is complete, we then set the weights of the network to the computed SWA averages.
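        To make the second ingredient concrete, the running equal average can be written as a one-line update, swa <- swa + (w - swa) / (n + 1). The snippet below is an illustrative sketch of that update, not the torch.optim.swa_utils implementation; the model here is a stand-in and the helper names are hypothetical.

        import torch

        model = torch.nn.Linear(10, 2)   # stand-in for the network being trained

        # Running equal average of the weights: swa <- swa + (w - swa) / (n + 1).
        swa_state = {name: torch.zeros_like(p) for name, p in model.named_parameters()}
        n_averaged = 0

        def update_swa(model):
            """Fold the current weights into the running average
            (call once per epoch during the SWA phase)."""
            global n_averaged
            with torch.no_grad():
                for name, p in model.named_parameters():
                    swa_state[name] += (p - swa_state[name]) / (n_averaged + 1)
            n_averaged += 1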

        + +
        + +
        + +

        Figure 2. Illustration of the learning rate schedule adopted by SWA. Standard decaying schedule is used for the first 75% of the training and then a high constant value is used for the remaining 25%. The SWA averages are formed during the last 25% of training.

        + +

        One important detail is batch normalization. Batch normalization layers compute running statistics of activations during training. Note that the SWA averages of the weights are never used to make predictions during training, so the batch normalization layers do not have the activation statistics computed at the end of training. We can compute these statistics by doing a single forward pass on the train data with the SWA model.

        + +

        While we focus on SGD for simplicity in the description above, SWA can be combined with any optimizer. You can also use cyclical learning rates instead of a high constant value (see e.g., [2]).

        + +

        How to use SWA in PyTorch?

        + +

        In torch.optim.swa_utils we implement all the SWA ingredients to make it convenient to use SWA with any model. In particular, we implement AveragedModel class for SWA models, SWALR learning rate scheduler, and update_bn utility function to update SWA batch normalization statistics at the end of training.

        + +

        In the example below, swa_model is the SWA model that accumulates the averages of the weights. We train the model for a total of 300 epochs, and we switch to the SWA learning rate schedule and start to collect SWA averages of the parameters at epoch 160.

        + +
        from torch.optim.swa_utils import AveragedModel, SWALR
        +from torch.optim.lr_scheduler import CosineAnnealingLR
        +
        +loader, optimizer, model, loss_fn = ...
        +swa_model = AveragedModel(model)
        +scheduler = CosineAnnealingLR(optimizer, T_max=300)
        +swa_start = 160
        +swa_scheduler = SWALR(optimizer, swa_lr=0.05)
        +
        +for epoch in range(300):
        +    for input, target in loader:
        +        optimizer.zero_grad()
        +        loss_fn(model(input), target).backward()
        +        optimizer.step()
        +    if epoch > swa_start:
        +        # SWA phase: accumulate the running average of the weights
        +        # and keep the learning rate at the constant SWA value.
        +        swa_model.update_parameters(model)
        +        swa_scheduler.step()
        +    else:
        +        scheduler.step()
        +
        +# Update bn statistics for the swa_model at the end
        +torch.optim.swa_utils.update_bn(loader, swa_model)
        +# Use swa_model to make predictions on test data
        +preds = swa_model(test_input)
        +
        + +

        Next, we explain each component of torch.optim.swa_utils in detail.

        + +

        AveragedModel class serves to compute the weights of the SWA model. You can create an averaged model by running swa_model = AveragedModel(model). You can then update the parameters of the averaged model by swa_model.update_parameters(model). By default, AveragedModel computes a running equal average of the parameters that you provide, but you can also use custom averaging functions with the avg_fn parameter. In the following example, ema_model computes an exponential moving average.

        + +
        ema_avg = lambda averaged_model_parameter, model_parameter, num_averaged: \
        +    0.1 * averaged_model_parameter + 0.9 * model_parameter
        +ema_model = torch.optim.swa_utils.AveragedModel(model, avg_fn=ema_avg)
        +
        + +

        In practice, we find an equal average with the modified learning rate schedule in Figure 2 provides the best performance.

        + +

        SWALR is a learning rate scheduler that anneals the learning rate to a fixed value, and then keeps it constant. For example, the following code creates a scheduler that linearly anneals the learning rate from its initial value to 0.05 in 5 epochs within each parameter group.

        + +
        swa_scheduler = torch.optim.swa_utils.SWALR(optimizer,
        +    anneal_strategy="linear", anneal_epochs=5, swa_lr=0.05)
        +
        +
        +

        We also implement cosine annealing to a fixed value (anneal_strategy="cos"). In practice, we typically switch to SWALR at epoch swa_start (e.g. after 75% of the training epochs), and simultaneously start to compute the running averages of the weights:

        + +
        scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=100)
        +swa_start = 75
        +for epoch in range(100):
        +    # <train epoch>
        +    if epoch > swa_start:
        +        swa_model.update_parameters(model)
        +        swa_scheduler.step()
        +    else:
        +        scheduler.step()
        +
        + +

        Finally, update_bn is a utility function that computes the batchnorm statistics for the SWA model on a given dataloader loader:

        +
        torch.optim.swa_utils.update_bn(loader, swa_model) 
        +
        +

        update_bn applies the swa_model to every element in the dataloader and computes the activation statistics for each batch normalization layer in the model.
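        In other words, this amounts to resetting the BatchNorm running statistics and doing one pass over the data in training mode with the averaged weights. The function below is a rough, illustrative equivalent for intuition only, not the torch.optim.swa_utils.update_bn code; the function name and device argument are made up for this sketch.

        import torch

        def update_bn_sketch(loader, swa_model, device="cpu"):
            """Re-estimate BatchNorm running statistics with the averaged weights."""
            bn_types = (torch.nn.BatchNorm1d, torch.nn.BatchNorm2d, torch.nn.BatchNorm3d)
            for module in swa_model.modules():
                if isinstance(module, bn_types):
                    module.reset_running_stats()
            was_training = swa_model.training
            swa_model.train()                     # train mode so BN updates its statistics
            with torch.no_grad():
                for input, _ in loader:           # labels are not needed for this pass
                    swa_model(input.to(device))
            swa_model.train(was_training)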

        + +

        Once you have computed the SWA averages and updated the batch normalization layers, you can apply swa_model to make predictions on test data.

        + +

        Why does it work?

        + +

        There are large flat regions of the loss surface [9]. In Figure 3 below, we show a visualization of the loss surface in a subspace of the parameter space containing a path connecting two independently trained SGD solutions, such that the loss is similarly low at every point along the path. SGD converges near the boundary of these regions because there isn’t much gradient signal to move inside, as the points in the region all have similarly low values of loss. By increasing the learning rate, SWA spins around this flat region, and then by averaging the iterates, moves towards the center of the flat region.

        + +
        + +
        + +

        Figure 3: visualization of mode connectivity for ResNet-20 with no skip connections on CIFAR-10 dataset. The visualization is created in collaboration with Javier Ideami (https://losslandscape.com/). For more details, see this blogpost.

        + +

        We expect solutions that are centered in the flat region of the loss to generalize better than those near the boundary. Indeed, train and test error surfaces are not perfectly aligned in the weight space. Solutions that are centered in the flat region are not as susceptible to the shifts between train and test error surfaces as those near the boundary. In Figure 4 below, we show the train loss and test error surfaces along the direction connecting the SWA and SGD solutions. As you can see, while the SWA solution has a higher train loss compared to the SGD solution, it is centered in a region of low loss and has a substantially better test error.

        + +
        + +
        + +

        Figure 4. Train loss and test error along the line connecting the SWA solution (circle) and SGD solution (square). The SWA solution is centered in a wide region of low train loss, while the SGD solution lies near the boundary. Because of the shift between train loss and test error surfaces, the SWA solution leads to much better generalization.

        + +

        What are the results achieved with SWA?

        + +

        We release a GitHub repo with examples using the PyTorch implementation of SWA for training DNNs. These examples can be used to achieve the following results on CIFAR-100:

                     VGG-16        ResNet-164    WideResNet-28x10
            SGD      72.8 ± 0.3    78.4 ± 0.3    81.0 ± 0.3
            SWA      74.4 ± 0.3    79.8 ± 0.4    82.5 ± 0.2
        + +

        Semi-Supervised Learning

        + +

        In a follow-up paper SWA was applied to semi-supervised learning, where it improved the best reported results in multiple settings [2]. For example, with SWA you can get 95% accuracy on CIFAR-10 if you only have the training labels for 4k training data points (the previous best reported result on this problem was 93.7%). This paper also explores averaging multiple times within epochs, which can accelerate convergence and find still flatter solutions in a given time.

        + +
        + +
        +

        Figure 5. Performance of fast-SWA on semi-supervised learning with CIFAR-10. fast-SWA achieves record results in every setting considered.

        + +

        Reinforcement Learning

        + +

        In another follow-up paper SWA was shown to improve the performance of policy gradient methods A2C and DDPG on several Atari games and MuJoCo environments [3]. This application is also an instance of where SWA is used with Adam. Recall that SWA is not specific to SGD and can benefit essentially any optimizer.

            Environment Name    A2C                A2C + SWA
            Breakout            522 ± 34           703 ± 60
            Qbert               18777 ± 778        21272 ± 655
            SpaceInvaders       7727 ± 1121        21676 ± 8897
            Seaquest            1779 ± 4           1795 ± 4
            BeamRider           9999 ± 402         11321 ± 1065
            CrazyClimber        147030 ± 10239     139752 ± 11618
        + +

        Low Precision Training

        + +

        We can filter through quantization noise by combining weights that have been rounded down with weights that have been rounded up. Moreover, by averaging weights to find a flat region of the loss surface, large perturbations of the weights will not affect the quality of the solution (Figures 9 and 10). Recent work shows that by adapting SWA to the low precision setting, in a method called SWALP, one can match the performance of full-precision SGD even with all training in 8 bits [6]. This is a practically important result, given that (1) SGD training in 8 bits performs notably worse than full precision SGD, and (2) low precision training is significantly harder than making predictions in low precision after training (the usual setting). For example, a ResNet-164 trained on CIFAR-100 with float (16-bit) SGD achieves 22.2% error, while 8-bit SGD achieves 24.0% error. By contrast, SWALP with 8-bit training achieves 21.8% error.

        + +
        + +
        +

        Figure 9. Quantizing a solution leads to a perturbation of the weights which has a greater effect on the quality of the sharp solution (left) compared to wide solution (right).

        + +
        + +
        +

        Figure 10. The difference between standard low precision training and SWALP.

        + +

        Another work, SQWA, presents an approach for quantization and fine-tuning of neural networks in low precision [12]. In particular, SQWA achieved state-of-the-art results for DNNs quantized to 2 bits on CIFAR-100 and ImageNet.

        + +

        Calibration and Uncertainty Estimates

        + +

        By finding a centered solution in the loss surface, SWA can also improve calibration and uncertainty representation. Indeed, SWA can be viewed as an approximation to an ensemble, resembling a Bayesian model average, but with a single model [1].

        + +

        SWA can be viewed as taking the first moment of SGD iterates with a modified learning rate schedule. We can directly generalize SWA by also taking the second moment of iterates to form a Gaussian approximate posterior over the weights, further characterizing the loss geometry with SGD iterates. This approach, SWA-Gaussian (SWAG), is a simple, scalable and convenient approach to uncertainty estimation and calibration in Bayesian deep learning [4]. The SWAG distribution approximates the shape of the true posterior: Figure 6 below shows the SWAG distribution and the posterior log-density for ResNet-20 on CIFAR-10.
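        As a rough sketch of the idea only (SWAG is not part of torch.optim; see the authors' code linked below for the real implementation), a diagonal SWAG posterior can be formed by tracking first and second moments of the flattened weights collected during the SWA phase and then sampling from the resulting Gaussian. All names below, including num_params and the helper functions, are illustrative assumptions.

        import torch

        # Diagonal-SWAG sketch: track running moments of the flattened weights,
        # then sample w ~ N(mean, diag(sq_mean - mean**2)) at test time.
        num_params = 1000                      # assumed; use the size of your flattened model weights
        mean = torch.zeros(num_params)
        sq_mean = torch.zeros(num_params)
        n = 0

        def collect(flat_w):
            """Fold one weight snapshot (a 1-D tensor) into the running moments."""
            global n
            mean.mul_(n / (n + 1)).add_(flat_w / (n + 1))
            sq_mean.mul_(n / (n + 1)).add_(flat_w.pow(2) / (n + 1))
            n += 1

        def sample():
            """Draw one weight vector from the diagonal Gaussian posterior."""
            var = torch.clamp(sq_mean - mean.pow(2), min=1e-30)
            return mean + var.sqrt() * torch.randn(num_params)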

        + +
        + +
        +

        Figure 6. SWAG posterior approximation and the loss surface for a ResNet-20 without skip-connections trained on CIFAR-10 in the subspace formed by the two largest eigenvalues of the SWAG covariance matrix. The shape of SWAG distribution is aligned with the posterior: the peaks of the two distributions coincide, and both distributions are wider in one direction than in the orthogonal direction. Visualization created in collaboration with Javier Ideami.

        + +

        Empirically, SWAG performs on par or better than popular alternatives including MC dropout, KFAC Laplace, and temperature scaling on uncertainty quantification, out-of-distribution detection, calibration and transfer learning in computer vision tasks. Code for SWAG is available here.

        + +
        + +
        +

        Figure 7. MultiSWAG generalizes SWAG and deep ensembles, to perform Bayesian model averaging over multiple basins of attraction, leading to significantly improved performance. By contrast, as shown here, deep ensembles select different modes, while standard variational inference (VI) marginalizes (model averages) within a single basin.

        + +

        MultiSWAG [10] uses multiple independent SWAG models to form a mixture of Gaussians as an approximate posterior distribution. Different basins of attraction contain highly complementary explanations of the data. Accordingly, marginalizing over these multiple basins provides a significant boost in accuracy and uncertainty representation. MultiSWAG can be viewed as a generalization of deep ensembles, but with performance improvements.

        + +

        Indeed, we see in Figure 8 that MultiSWAG entirely mitigates double descent – more flexible models have monotonically improving performance – and provides significantly improved generalization over SGD. For example, when the ResNet-18 has layers of width 20, Multi-SWAG achieves under 30% error whereas SGD achieves over 45%, more than a 15% gap!

        + +
        + +
        +

        Figure 8. SGD, SWAG, and Multi-SWAG on CIFAR-100 for a ResNet-18 with varying widths. We see Multi-SWAG in particular mitigates double descent and provides significant accuracy improvements over SGD.

        + +

        Reference [10] also considers Multi-SWA, which uses multiple independently trained SWA solutions in an ensemble, providing performance improvements over deep ensembles without any additional computational cost. Code for MultiSWA and MultiSWAG is available here.

        + +

        Another method, Subspace Inference, constructs a low-dimensional subspace around the SWA solution and marginalizes the weights in this subspace to approximate the Bayesian model average [5]. Subspace Inference uses the statistics from the SGD iterates to construct both the SWA solution and the subspace. The method achieves strong performance in terms of prediction accuracy and uncertainty calibration both in classification and regression problems. Code is available here.

        + +

        Try it Out!

        + +

        One of the greatest open questions in deep learning is why SGD manages to find good solutions, given that the training objectives are highly multimodal, and there are many settings of parameters that achieve no training loss but poor generalization. By understanding geometric features such as flatness, which relate to generalization, we can begin to resolve these questions and build optimizers that provide even better generalization, and many other useful features, such as uncertainty representation. We have presented SWA, a simple drop-in replacement for standard optimizers such as SGD and Adam, which can in principle, benefit anyone training a deep neural network. SWA has been demonstrated to have a strong performance in several areas, including computer vision, semi-supervised learning, reinforcement learning, uncertainty representation, calibration, Bayesian model averaging, and low precision training.

        + +

        We encourage you to try out SWA! SWA is now as easy as any standard training in PyTorch. And even if you have already trained your model, you can use SWA to significantly improve performance by running it for a small number of epochs from a pre-trained model.

        + +

        [1] Averaging Weights Leads to Wider Optima and Better Generalization; Pavel Izmailov, Dmitry Podoprikhin, Timur Garipov, Dmitry Vetrov, Andrew Gordon Wilson; Uncertainty in Artificial Intelligence (UAI), 2018.

        + +

        [2] There Are Many Consistent Explanations of Unlabeled Data: Why You Should Average; Ben Athiwaratkun, Marc Finzi, Pavel Izmailov, Andrew Gordon Wilson; International Conference on Learning Representations (ICLR), 2019.

        + +

        [3] Improving Stability in Deep Reinforcement Learning with Weight Averaging; Evgenii Nikishin, Pavel Izmailov, Ben Athiwaratkun, Dmitrii Podoprikhin, Timur Garipov, Pavel Shvechikov, Dmitry Vetrov, Andrew Gordon Wilson; UAI 2018 Workshop: Uncertainty in Deep Learning, 2018.

        + +

        [4] A Simple Baseline for Bayesian Uncertainty in Deep Learning; Wesley Maddox, Timur Garipov, Pavel Izmailov, Andrew Gordon Wilson; Neural Information Processing Systems (NeurIPS), 2019.

        + +

        [5] Subspace Inference for Bayesian Deep Learning; Pavel Izmailov, Wesley Maddox, Polina Kirichenko, Timur Garipov, Dmitry Vetrov, Andrew Gordon Wilson; Uncertainty in Artificial Intelligence (UAI), 2019.

        + +

        [6] SWALP: Stochastic Weight Averaging in Low Precision Training; Guandao Yang, Tianyi Zhang, Polina Kirichenko, Junwen Bai, Andrew Gordon Wilson, Christopher De Sa; International Conference on Machine Learning (ICML), 2019.

        + +

        [7] Efficient Estimations from a Slowly Convergent Robbins-Monro Process; David Ruppert; Technical report, Cornell University Operations Research and Industrial Engineering, 1988.

        + +

        [8] Acceleration of Stochastic Approximation by Averaging; Boris T. Polyak and Anatoli B. Juditsky; SIAM Journal on Control and Optimization, 30(4):838–855, 1992.

        + +

        [9] Loss Surfaces, Mode Connectivity, and Fast Ensembling of DNNs; Timur Garipov, Pavel Izmailov, Dmitrii Podoprikhin, Dmitry Vetrov, Andrew Gordon Wilson; Neural Information Processing Systems (NeurIPS), 2018.

        + +

        [10] Bayesian Deep Learning and a Probabilistic Perspective of Generalization; Andrew Gordon Wilson, Pavel Izmailov; arXiv preprint, 2020.

        + +

        [11] Stochastic Weight Averaging in Parallel: Large-Batch Training That Generalizes Well; Vipul Gupta, Santiago Akle Serrano, and Dennis DeCoste; International Conference on Learning Representations (ICLR), 2019.

        + +

        [12] SQWA: Stochastic Quantized Weight Averaging for Improving the Generalization Capability of Low-Precision Deep Neural Networks; Sungho Shin, Yoonho Boo, and Wonyong Sung; arXiv preprint, 2020.

diff --git a/blog/pytorch-1.6-released/index.html b/blog/pytorch-1.6-released/index.html new file mode 100644 index 000000000000..8a42771a7d92 --- /dev/null +++ b/blog/pytorch-1.6-released/index.html @@ -0,0 +1,880 @@

        PyTorch 1.6 released w/ Native AMP Support, Microsoft joins as maintainers for Windows | PyTorch

        by Team PyTorch

        +

        Today, we’re announcing the availability of PyTorch 1.6, along with updated domain libraries. We are also excited to announce the team at Microsoft is now maintaining Windows builds and binaries and will also be supporting the community on GitHub as well as the PyTorch Windows discussion forums.

        + +

        The PyTorch 1.6 release includes a number of new APIs, tools for performance improvement and profiling, as well as major updates to both distributed data parallel (DDP) and remote procedure call (RPC) based distributed training. +A few of the highlights include:

        + +
          +
        1. Automatic mixed precision (AMP) training is now natively supported and a stable feature (see here for more details) - thanks to NVIDIA’s contributions;
        2. Native TensorPipe support now added for tensor-aware, point-to-point communication primitives built specifically for machine learning;
        3. Added support for complex tensors to the frontend API surface;
        4. New profiling tools providing tensor-level memory consumption information;
        5. Numerous improvements and new features for both distributed data parallel (DDP) training and the remote procedure call (RPC) packages.
        + +

        Additionally, from this release onward, features will be classified as Stable, Beta and Prototype. Prototype features are not included as part of the binary distribution and are instead available through either building from source, using nightlies or via compiler flag. You can learn more about what this change means in the post here. You can also find the full release notes here.

        + +

        Performance & Profiling

        + +

        [Stable] Automatic Mixed Precision (AMP) Training

        + +

        AMP allows users to easily enable automatic mixed precision training, which can deliver higher performance and memory savings of up to 50% on Tensor Core GPUs. Using the natively supported torch.cuda.amp API, AMP provides convenience methods for mixed precision, where some operations use the torch.float32 (float) datatype and other operations use torch.float16 (half). Some ops, like linear layers and convolutions, are much faster in float16. Other ops, like reductions, often require the dynamic range of float32. Mixed precision tries to match each op to its appropriate datatype.

        + +
          +
        • Design doc (Link)
        • +
        • Documentation (Link)
        • +
        • Usage examples (Link)
        • +
        + +
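        As a quick orientation (the linked usage examples above are the authoritative reference), a typical torch.cuda.amp training loop looks roughly like the sketch below. The model, optimizer, loss and data here are stand-ins, and the snippet requires a CUDA device with Tensor Cores to see the benefit.

        import torch

        model = torch.nn.Linear(128, 10).cuda()            # stand-in model
        optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
        loss_fn = torch.nn.CrossEntropyLoss()
        data = [(torch.randn(32, 128).cuda(), torch.randint(0, 10, (32,)).cuda()) for _ in range(4)]

        scaler = torch.cuda.amp.GradScaler()

        for input, target in data:
            optimizer.zero_grad()
            # Run the forward pass under autocast so eligible ops execute in float16.
            with torch.cuda.amp.autocast():
                loss = loss_fn(model(input), target)
            # Scale the loss to avoid float16 gradient underflow, then step and update the scale.
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()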

        [Beta] Fork/Join Parallelism

        + +

        This release adds support for a language-level construct as well as runtime support for coarse-grained parallelism in TorchScript code. This support is useful for situations such as running models in an ensemble in parallel, or running bidirectional components of recurrent nets in parallel, and allows the ability to unlock the computational power of parallel architectures (e.g. many-core CPUs) for task level parallelism.

        + +

        Parallel execution of TorchScript programs is enabled through two primitives: torch.jit.fork and torch.jit.wait. In the below example, we parallelize execution of foo:

        + +
        import torch
        +from typing import List
        +
        +def foo(x):
        +    return torch.neg(x)
        +
        +@torch.jit.script
        +def example(x):
        +    futures = [torch.jit.fork(foo, x) for _ in range(100)]
        +    results = [torch.jit.wait(future) for future in futures]
        +    return torch.sum(torch.stack(results))
        +
        +print(example(torch.ones([])))
        +
        + +
          +
        • Documentation (Link)
        • +
        + +

        [Beta] Memory Profiler

        + +

        The torch.autograd.profiler API now includes a memory profiler that lets you inspect the tensor memory cost of different operators inside your CPU and GPU models.

        + +

        Here is an example usage of the API:

        + +
        import torch
        +import torchvision.models as models
        +import torch.autograd.profiler as profiler
        +
        +model = models.resnet18()
        +inputs = torch.randn(5, 3, 224, 224)
        +with profiler.profile(profile_memory=True, record_shapes=True) as prof:
        +    model(inputs)
        +
        +# NOTE: some columns were removed for brevity
        +print(prof.key_averages().table(sort_by="self_cpu_memory_usage", row_limit=10))
        +# ---------------------------  ---------------  ---------------  ---------------
        +# Name                         CPU Mem          Self CPU Mem     Number of Calls
        +# ---------------------------  ---------------  ---------------  ---------------
        +# empty                        94.79 Mb         94.79 Mb         123
        +# resize_                      11.48 Mb         11.48 Mb         2
        +# addmm                        19.53 Kb         19.53 Kb         1
        +# empty_strided                4 b              4 b              1
        +# conv2d                       47.37 Mb         0 b              20
        +# ---------------------------  ---------------  ---------------  ---------------
        +
        + + + +

        Distributed Training & RPC

        + +

        [Beta] TensorPipe backend for RPC

        + +

        PyTorch 1.6 introduces a new backend for the RPC module which leverages the TensorPipe library, a tensor-aware point-to-point communication primitive targeted at machine learning, intended to complement the current primitives for distributed training in PyTorch (Gloo, MPI, …) which are collective and blocking. The pairwise and asynchronous nature of TensorPipe lends itself to new networking paradigms that go beyond data parallel: client-server approaches (e.g., parameter server for embeddings, actor-learner separation in Impala-style RL, …) and model and pipeline parallel training (think GPipe), gossip SGD, etc.

        + +
        # One-line change needed to opt in
        +torch.distributed.rpc.init_rpc(
        +    ...
        +    backend=torch.distributed.rpc.BackendType.TENSORPIPE,
        +)
        +
        +# No changes to the rest of the RPC API
        +torch.distributed.rpc.rpc_sync(...)
        +
        + +
          +
        • Design doc (Link)
        • +
        • Documentation (Link)
        • +
        + +

        [Beta] DDP+RPC

        + +

        PyTorch Distributed supports two powerful paradigms: DDP for full sync data parallel training of models and the RPC framework which allows for distributed model parallelism. Previously, these two features worked independently and users couldn’t mix and match these to try out hybrid parallelism paradigms.

        + +

        Starting in PyTorch 1.6, we’ve enabled DDP and RPC to work together seamlessly so that users can combine these two techniques to achieve both data parallelism and model parallelism. An example is where users would like to place large embedding tables on parameter servers and use the RPC framework for embedding lookups, but store smaller dense parameters on trainers and use DDP to synchronize the dense parameters. Below is a simple code snippet.

        + +
        # On each trainer
        +
        +remote_emb = create_emb(on="ps", ...)
        +ddp_model = DDP(dense_model)
        +
        +for data in batch:
        +   with torch.distributed.autograd.context():
        +      res = remote_emb(data)
        +      loss = ddp_model(res)
        +      torch.distributed.autograd.backward([loss])
        +
        + +
          +
        • DDP+RPC Tutorial (Link)
        • +
        • Documentation (Link)
        • +
        • Usage Examples (Link)
        • +
        + +

        [Beta] RPC - Asynchronous User Functions

        + +

        RPC Asynchronous User Functions supports the ability to yield and resume on the server side when executing a user-defined function. Prior to this feature, when a callee processes a request, one RPC thread waits until the user function returns. If the user function contains IO (e.g., nested RPC) or signaling (e.g., waiting for another request to unblock), the corresponding RPC thread would sit idle waiting for these events. As a result, some applications have to use a very large number of threads and send additional RPC requests, which can potentially lead to performance degradation. To make a user function yield on such events, applications need to: 1) Decorate the function with the @rpc.functions.async_execution decorator; and 2) Let the function return a torch.futures.Future and install the resume logic as callbacks on the Future object. See below for an example:

        + +
        @rpc.functions.async_execution
        +def async_add_chained(to, x, y, z):
        +    return rpc.rpc_async(to, torch.add, args=(x, y)).then(
        +        lambda fut: fut.wait() + z
        +    )
        +
        +ret = rpc.rpc_sync(
        +    "worker1", 
        +    async_add_chained, 
        +    args=("worker2", torch.ones(2), 1, 1)
        +)
        +        
        +print(ret)  # prints tensor([3., 3.])
        +
        + +
          +
        • Tutorial for performant batch RPC using Asynchronous User Functions
        • +
        • Documentation (Link)
        • +
        • Usage examples (Link)
        • +
        + +

        Frontend API Updates

        + +

        [Beta] Complex Numbers

        + +

        The PyTorch 1.6 release brings beta level support for complex tensors, including the torch.complex64 and torch.complex128 dtypes. A complex number is a number that can be expressed in the form a + bj, where a and b are real numbers, and j is a solution of the equation x^2 = −1. Complex numbers frequently occur in mathematics and engineering, especially in signal processing, and complex neural networks are an active area of research. The beta release of complex tensors will support common PyTorch and complex tensor functionality, plus functions needed by Torchaudio, ESPnet and others. While this is an early version of this feature, and we expect it to improve over time, the overall goal is to provide a NumPy compatible user experience that leverages PyTorch’s ability to run on accelerators and work with autograd to better support the scientific community.
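        For a feel of the API, the snippet below creates and manipulates complex tensors with the new dtypes; it is a minimal illustration rather than an exhaustive tour, and prints are included only to show what each call returns.

        import torch

        # Create complex tensors with the new dtypes.
        x = torch.tensor([1 + 2j, 3 - 4j], dtype=torch.complex64)
        y = torch.tensor([0 + 1j, 2 + 3j], dtype=torch.complex128)

        # Real/imaginary parts, conjugate and magnitude behave as in NumPy.
        print(x.real, x.imag)
        print(x.conj())
        print(y.abs())

        # A trailing pair of real channels can be reinterpreted as complex values and back.
        z = torch.view_as_complex(torch.randn(4, 2))
        print(torch.view_as_real(z).shape)   # torch.Size([4, 2])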

        + +

        Mobile Updates

        + +

        PyTorch 1.6 brings increased performance and general stability for mobile on-device inference. We squashed a few bugs, continued maintenance, and added a few new features while improving fp32 and int8 performance across a large variety of ML models for inference on the CPU backend.

        + +

        [Beta] Mobile Features and Performance

        + +
          +
        • Stateless and stateful XNNPACK Conv and Linear operators
        • +
        • Stateless MaxPool2d + JIT optimization passes
        • +
        • JIT pass optimizations: Conv + BatchNorm fusion, graph rewrite to replace conv2d/linear with xnnpack ops, relu/hardtanh fusion, dropout removal
        • +
        • QNNPACK integration removes requantization scale constraint
        • +
        • Per-channel quantization for conv, linear and dynamic linear
        • +
        • Disable tracing for mobile client to save ~600 KB on full-jit builds
        • +
        + +

        Updated Domain Libraries

        + +

        torchvision 0.7

        + +

        torchvision 0.7 introduces two new pretrained semantic segmentation models, FCN ResNet50 and DeepLabV3 ResNet50, both trained on COCO and using smaller memory footprints than the ResNet101 backbone. We also introduced support for AMP (Automatic Mixed Precision) autocasting for torchvision models and operators, which automatically selects the floating point precision for different GPU operations to improve performance while maintaining accuracy.

        + +
          +
        • Release notes (Link)
        • +
        + +
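        For instance, the new segmentation models can be instantiated directly from torchvision.models. The snippet below is a minimal sketch: the input is a dummy batch (real images should be normalized first), and pretrained weights download on first use.

        import torch
        import torchvision

        # New in torchvision 0.7: segmentation models with a ResNet50 backbone.
        model = torchvision.models.segmentation.fcn_resnet50(pretrained=True).eval()
        # model = torchvision.models.segmentation.deeplabv3_resnet50(pretrained=True).eval()

        image = torch.rand(1, 3, 520, 520)        # dummy batch; normalize real images first
        with torch.no_grad():
            out = model(image)["out"]             # per-pixel class scores, shape [1, 21, 520, 520]
        print(out.shape)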

        torchaudio 0.6

        + +

        torchaudio now officially supports Windows. This release also introduces a new model module (with wav2letter included), new functionals (contrast, cvm, dcshift, overdrive, vad, phaser, flanger, biquad), datasets (GTZAN, CMU), and a new optional sox backend with support for TorchScript.

        + +
          +
        • Release notes (Link)
        • +
        + +
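        As a small example of the new model module, the sketch below instantiates the bundled wav2letter model on a dummy waveform; the shapes are illustrative and the release notes above are the authoritative reference for the full API.

        import torch
        from torchaudio.models import Wav2Letter

        # Wav2Letter ships with torchaudio's new model module.
        model = Wav2Letter(num_classes=40, input_type="waveform", num_features=1)
        waveform = torch.randn(1, 1, 16000)   # (batch, channel, time), dummy audio
        logits = model(waveform)              # (batch, num_classes, time) log-probabilities
        print(logits.shape)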

        Additional updates

        + +

        HACKATHON

        + +

        The Global PyTorch Summer Hackathon is back! This year, teams can compete in three categories virtually:

        + +
          +
        1. PyTorch Developer Tools: Tools or libraries designed to improve productivity and efficiency of PyTorch for researchers and developers
        2. Web/Mobile Applications powered by PyTorch: Applications with web/mobile interfaces and/or embedded devices powered by PyTorch
        3. PyTorch Responsible AI Development Tools: Tools, libraries, or web/mobile apps for responsible AI development
        + +

        This is a great opportunity to connect with the community and practice your machine learning skills.

        + + + +

        LPCV Challenge

        + +

        The 2020 CVPR Low-Power Vision Challenge (LPCV) - Online Track for UAV video submission deadline is coming up shortly. You have until July 31, 2020 to build a system that can discover and recognize characters in video captured by an unmanned aerial vehicle (UAV) accurately using PyTorch and Raspberry Pi 3B+.

        + +

        Prototype Features

        + +

        To reiterate, Prototype features in PyTorch are early features that we are looking to gather feedback on, gauge the usefulness of and improve ahead of graduating them to Beta or Stable. The following features are not part of the PyTorch 1.6 release and instead are available in nightlies with separate docs/tutorials to help facilitate early usage and feedback.

        + +

        Distributed RPC/Profiler

        +

        Allow users to profile training jobs that use torch.distributed.rpc using the autograd profiler, and remotely invoke the profiler in order to collect profiling information across different nodes. The RFC can be found here and a short recipe on how to use this feature can be found here.

        + +

        TorchScript Module Freezing

        +

        Module Freezing is the process of inlining module parameters and attribute values into the TorchScript internal representation. Parameter and attribute values are treated as final values and cannot be modified in the frozen module. The PR for this feature can be found here and a short tutorial on how to use this feature can be found here.

        + +

        Graph Mode Quantization

        +

        Eager mode quantization requires users to make changes to their model: activations must be quantized explicitly, modules must be fused manually, uses of torch ops need to be rewritten with Functional modules, and quantization of functionals is not supported. If we can trace or script the model, then quantization can be done automatically with graph mode quantization, without any of the complexities of eager mode, and it is configurable through a qconfig_dict. A tutorial on how to use this feature can be found here.

        + +

        Quantization Numerical Suite

        +

        Quantization is good when it works, but it’s difficult to know what’s wrong when it doesn’t satisfy the expected accuracy. A prototype is now available for a Numerical Suite that measures comparison statistics between quantized modules and float modules. This is available for testing in eager mode, on CPU only, with more support coming. A tutorial on how to use this feature can be found here.

        + +

        Cheers!

        + +

        Team PyTorch

        + +
diff --git a/blog/pytorch-1.7-released/index.html b/blog/pytorch-1.7-released/index.html new file mode 100644 index 000000000000..2300cf04c296 --- /dev/null +++ b/blog/pytorch-1.7-released/index.html @@ -0,0 +1,964 @@

        PyTorch 1.7 released w/ CUDA 11, New APIs for FFTs, Windows support for Distributed training and more | PyTorch

        by Team PyTorch

        +

        Today, we’re announcing the availability of PyTorch 1.7, along with updated domain libraries. The PyTorch 1.7 release includes a number of new APIs including support for NumPy-Compatible FFT operations, profiling tools and major updates to both distributed data parallel (DDP) and remote procedure call (RPC) based distributed training. In addition, several features moved to stable including custom C++ Classes, the memory profiler, extensions via custom tensor-like objects, user async functions in RPC and a number of other features in torch.distributed such as Per-RPC timeout, DDP dynamic bucketing and RRef helper.

        + +

        A few of the highlights include:

        +
          +
        • CUDA 11 is now officially supported with binaries available at PyTorch.org
        • +
        • Updates and additions to profiling and performance for RPC, TorchScript and Stack traces in the autograd profiler
        • +
        • (Beta) Support for NumPy compatible Fast Fourier transforms (FFT) via torch.fft
        • +
        • (Prototype) Support for Nvidia A100 generation GPUs and native TF32 format
        • +
        • (Prototype) Distributed training on Windows now supported
        • +
        • torchvision +
            +
          • (Stable) Transforms now support Tensor inputs, batch computation, GPU, and TorchScript
          • +
          • (Stable) Native image I/O for JPEG and PNG formats
          • +
          • (Beta) New Video Reader API
          • +
          +
        • +
        • torchaudio +
            +
          • (Stable) Added support for speech rec (wav2letter), text to speech (WaveRNN) and source separation (ConvTasNet)
          • +
          +
        • +
        + +

        To reiterate, starting PyTorch 1.6, features are now classified as stable, beta and prototype. You can see the detailed announcement here. Note that the prototype features listed in this blog are available as part of this release.

        + +

        Find the full release notes here.

        + +

        Front End APIs

        +

        [Beta] NumPy Compatible torch.fft module

        +

        FFT-related functionality is commonly used in a variety of scientific fields like signal processing. While PyTorch has historically supported a few FFT-related functions, the 1.7 release adds a new torch.fft module that implements FFT-related functions with the same API as NumPy.

        + +

        This new module must be imported to be used in the 1.7 release, since its name conflicts with the historic (and now deprecated) torch.fft function.

        + +

        Example usage:

        +
        >>> import torch.fft
        +>>> t = torch.arange(4)
        +>>> t
        +tensor([0, 1, 2, 3])
        +
        +>>> torch.fft.fft(t)
        +tensor([ 6.+0.j, -2.+2.j, -2.+0.j, -2.-2.j])
        +
        +>>> t = torch.tensor([0.+1.j, 2.+3.j, 4.+5.j, 6.+7.j])
        +>>> torch.fft.fft(t)
        +tensor([12.+16.j, -8.+0.j, -4.-4.j,  0.-8.j])
        +
        + + + +

        [Beta] C++ Support for Transformer NN Modules

        +

        Since PyTorch 1.5, we’ve continued to maintain parity between the python and C++ frontend APIs. This update allows developers to use the nn.transformer module abstraction from the C++ Frontend. Moreover, developers no longer need to save a module from python/JIT and load it into C++, as it can now be used in C++ directly.

        + + +

        [Beta] torch.set_deterministic

        +

        Reproducibility (bit-for-bit determinism) may help identify errors when debugging or testing a program. To facilitate reproducibility, PyTorch 1.7 adds the torch.set_deterministic(bool) function that can direct PyTorch operators to select deterministic algorithms when available, and to throw a runtime error if an operation may result in nondeterministic behavior. The flag this function controls defaults to false, so there is no change in behavior by default: PyTorch may implement its operations nondeterministically unless you opt in.
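        A minimal opt-in looks like the sketch below; note that the function name is the PyTorch 1.7 spelling and was renamed in later releases, so treat it as version-specific.

        import torch

        # Opt in to deterministic algorithms (PyTorch 1.7 API; renamed in later releases).
        torch.set_deterministic(True)
        torch.manual_seed(0)          # determinism also requires seeding the RNGs

        x = torch.randn(8, 8)
        y = x @ x                     # ops with deterministic implementations run as usual;
                                      # an op without one would raise a RuntimeError instead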

        + +

        More precisely, when this flag is true:

        +
          +
        • Operations known to not have a deterministic implementation throw a runtime error;
        • +
        • Operations with deterministic variants use those variants (usually with a performance penalty versus the non-deterministic version); and
        • +
        • torch.backends.cudnn.deterministic = True is set.
        • +
        + +

        Note that this is necessary, but not sufficient, for determinism within a single run of a PyTorch program. Other sources of randomness like random number generators, unknown operations, or asynchronous or distributed computation may still cause nondeterministic behavior.

        + +

        See the documentation for torch.set_deterministic(bool) for the list of affected operations.

        + + +

        Performance & Profiling

        +

        [Beta] Stack traces added to profiler

        +

        Users can now see not only operator name/inputs in the profiler output table but also where the operator is in the code. The workflow requires very little change to take advantage of this capability. The user uses the autograd profiler as before but with optional new parameters: with_stack and group_by_stack_n. Caution: regular profiling runs should not use this feature as it adds significant overhead.
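        Roughly, the opt-in looks like the sketch below; the exact columns of the printed table depend on your build, and the model here is just a convenient example.

        import torch
        import torchvision.models as models

        model = models.resnet18()
        inputs = torch.randn(5, 3, 224, 224)

        # Record Python stack traces alongside operator events (adds overhead).
        with torch.autograd.profiler.profile(with_stack=True, record_shapes=True) as prof:
            model(inputs)

        # Group events by the top 5 stack frames so the table shows where each op was called.
        print(prof.key_averages(group_by_stack_n=5).table(sort_by="self_cpu_time_total", row_limit=10))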

        + + +

        Distributed Training & RPC

        +

        [Stable] TorchElastic now bundled into PyTorch docker image

        +

        Torchelastic offers a strict superset of the current torch.distributed.launch CLI with added features for fault-tolerance and elasticity. If the user is not interested in fault-tolerance, they can get exact functionality/behavior parity by setting max_restarts=0, with the added convenience of auto-assigned RANK and MASTER_ADDR|PORT (versus manually specified in torch.distributed.launch).

        + +

        By bundling torchelastic in the same docker image as PyTorch, users can start experimenting with TorchElastic right-away without having to separately install torchelastic. In addition to convenience, this work is a nice-to-have when adding support for elastic parameters in the existing Kubeflow’s distributed PyTorch operators.

        + + +

        [Beta] Support for uneven dataset inputs in DDP

        +

        PyTorch 1.7 introduces a new context manager to be used in conjunction with models trained using torch.nn.parallel.DistributedDataParallel to enable training with uneven dataset sizes across different processes. This feature enables greater flexibility when using DDP and prevents the user from having to manually ensure dataset sizes are the same across different processes. With this context manager, DDP will handle uneven dataset sizes automatically, which can prevent errors or hangs at the end of training.
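        A minimal sketch of the usage is below, assuming the usual DDP setup has already been done: net is a DistributedDataParallel instance and loader, optimizer and loss_fn are the per-rank objects from that setup. The context manager referred to above is DistributedDataParallel.join().

        # Each rank may have a different number of batches in `loader`; wrapping the loop
        # in DDP's join() context manager lets ranks that finish early shadow the
        # collective communication of the ranks that are still training.
        with net.join():
            for input, target in loader:
                optimizer.zero_grad()
                loss = loss_fn(net(input), target)
                loss.backward()
                optimizer.step()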

        + + +

        [Beta] NCCL Reliability - Async Error/Timeout Handling

        +

        In the past, NCCL training runs would hang indefinitely due to stuck collectives, leading to a very unpleasant experience for users. This feature will abort stuck collectives and throw an exception/crash the process if a potential hang is detected. When used with something like torchelastic (which can recover the training process from the last checkpoint), users can have much greater reliability for distributed training. This feature is completely opt-in and sits behind an environment variable that needs to be explicitly set in order to enable this functionality (otherwise users will see the same behavior as before).

        + + +

        [Beta] TorchScript rpc_remote and rpc_sync

        +

        torch.distributed.rpc.rpc_async has been available in TorchScript in prior releases. For PyTorch 1.7, this functionality is extended to the remaining two core RPC APIs, torch.distributed.rpc.rpc_sync and torch.distributed.rpc.remote. This completes the major RPC APIs targeted for TorchScript support; it allows users to use the existing python RPC APIs within TorchScript (in a script function or script method, which releases the python Global Interpreter Lock) and could improve application performance in multithreaded environments.

        + + +

        [Beta] Distributed optimizer with TorchScript support

        +

        PyTorch provides a broad set of optimizers for training algorithms, and these have been used repeatedly as part of the python API. However, users often want to use multithreaded training instead of multiprocess training, as it provides better resource utilization and efficiency in the context of large scale distributed training (e.g. Distributed Model Parallel) or any RPC-based training application. Users couldn’t do this with the distributed optimizer before because the python Global Interpreter Lock (GIL) limitation first had to be removed.

        + +

        In PyTorch 1.7, we are enabling TorchScript support in the distributed optimizer to remove the GIL and make it possible to run the optimizer in multithreaded applications. The new distributed optimizer has the exact same interface as before, but it automatically converts optimizers within each worker into TorchScript to make each of them GIL-free. This is done by leveraging a functional optimizer concept and allowing the distributed optimizer to convert the computational portion of the optimizer into TorchScript. This helps use cases like distributed model parallel training and improves performance when using multithreading.

        + +

        Currently, the only optimizer that supports automatic conversion with TorchScript is Adagrad; all other optimizers will still work as before, without TorchScript support. We are working on expanding the coverage to all PyTorch optimizers and expect more to come in future releases. Enabling TorchScript support is automatic and exactly the same as using the existing python APIs; here is an example of how to use this:

        + +
        import torch.distributed.autograd as dist_autograd
        +import torch.distributed.rpc as rpc
        +from torch import optim
        +from torch.distributed.optim import DistributedOptimizer
        +
        +with dist_autograd.context() as context_id:
        +  # Forward pass.
        +  rref1 = rpc.remote("worker1", torch.add, args=(torch.ones(2), 3))
        +  rref2 = rpc.remote("worker1", torch.add, args=(torch.ones(2), 1))
        +  loss = rref1.to_here() + rref2.to_here()
        +
        +  # Backward pass.
        +  dist_autograd.backward(context_id, [loss.sum()])
        +
        +  # Optimizer, pass in optim.Adagrad, DistributedOptimizer will
        +  # automatically convert/compile it to TorchScript (GIL-free)
        +  dist_optim = DistributedOptimizer(
        +     optim.Adagrad,
        +     [rref1, rref2],
        +     lr=0.05,
        +  )
        +  dist_optim.step(context_id)
        +
        + + +

        [Beta] Enhancements to RPC-based Profiling

        +

        Support for using the PyTorch profiler in conjunction with the RPC framework was first introduced in PyTorch 1.6. In PyTorch 1.7, the following enhancements have been made:

        +
          +
        • Implemented better support for profiling TorchScript functions over RPC
        • +
        • Achieved parity in terms of profiler features that work with RPC
        • +
        • Added support for asynchronous RPC functions on the server-side (functions decorated with rpc.functions.async_execution).
        • +
        + +

        Users are now able to use familiar profiling tools such as torch.autograd.profiler.profile() and torch.autograd.profiler.record_function, and this works transparently with the RPC framework with full feature support, including profiling asynchronous functions and TorchScript functions.

        + + +

        [Prototype] Windows support for Distributed Training

        +

        PyTorch 1.7 brings prototype support for DistributedDataParallel and collective communications on the Windows platform. In this release, the support only covers Gloo-based ProcessGroup and FileStore.

        + +

        To use this feature across multiple machines, please provide a file from a shared file system in init_process_group.

        + +
        # initialize the process group
        +dist.init_process_group(
        +    "gloo",
        +    # multi-machine example:
        +    # init_method = "file://////{machine}/{share_folder}/file"
        +    init_method="file:///{your local file path}",
        +    rank=rank,
        +    world_size=world_size
        +)
        +
        +model = DistributedDataParallel(local_model, device_ids=[rank])
        +
        + + +

        Mobile

        +

        PyTorch Mobile supports both iOS and Android with binary packages available in Cocoapods and JCenter respectively. You can learn more about PyTorch Mobile here.

        + +

        [Beta] PyTorch Mobile Caching allocator for performance improvements

        +

        On some mobile platforms, such as Pixel, we observed that memory is returned to the system more aggressively. This results in frequent page faults, as PyTorch, being a functional framework, does not maintain state for operators: for most ops, outputs are allocated dynamically on each execution. To ameliorate the resulting performance penalties, PyTorch 1.7 provides a simple caching allocator for CPU. The allocator caches allocations by tensor size and is currently available only via the PyTorch C++ API. The caching allocator itself is owned by the client, and thus its lifetime is also maintained by client code. Such a client-owned caching allocator can then be used with the scoped guard c10::WithCPUCachingAllocatorGuard to enable the use of cached allocations within that scope. Example usage:

        + +
        #include <c10/mobile/CPUCachingAllocator.h>
        +.....
        +c10::CPUCachingAllocator caching_allocator;
        +  // Owned by client code. Can be a member of some client class so as to tie the
        +  // the lifetime of caching allocator to that of the class.
        +.....
        +{
        +  c10::optional<c10::WithCPUCachingAllocatorGuard> caching_allocator_guard;
        +  if (FLAGS_use_caching_allocator) {
        +    caching_allocator_guard.emplace(&caching_allocator);
        +  }
        +  ....
        +  model.forward(..);
        +}
        +...
        +
        +

        NOTE: Caching allocator is only available on mobile builds, thus the use of caching allocator outside of mobile builds won’t be effective.

        + + +

torchvision

[Stable] Transforms now support Tensor inputs, batch computation, GPU, and TorchScript

torchvision transforms now inherit from nn.Module and can be torchscripted and applied on torch Tensor inputs as well as on PIL images. They also support Tensors with batch dimensions and work seamlessly on CPU/GPU devices:

import torch
import torchvision.transforms as T

# to fix random seed, use torch.manual_seed
# instead of random.seed
torch.manual_seed(12)

transforms = torch.nn.Sequential(
    T.RandomCrop(224),
    T.RandomHorizontalFlip(p=0.3),
    T.ConvertImageDtype(torch.float),
    T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
)
scripted_transforms = torch.jit.script(transforms)
# Note: we can similarly use T.Compose to define transforms
# transforms = T.Compose([...]) and
# scripted_transforms = torch.jit.script(torch.nn.Sequential(*transforms.transforms))

tensor_image = torch.randint(0, 256, size=(3, 256, 256), dtype=torch.uint8)
# works directly on Tensors
out_image1 = transforms(tensor_image)
# on the GPU
out_image1_cuda = transforms(tensor_image.cuda())
# with batches
batched_image = torch.randint(0, 256, size=(4, 3, 256, 256), dtype=torch.uint8)
out_image_batched = transforms(batched_image)
# and has torchscript support
out_image2 = scripted_transforms(tensor_image)

These improvements enable the following new features:

• support for GPU acceleration
• batched transformations, e.g. as needed for videos
• transforming multi-band torch tensor images (with more than 3-4 channels)
• torchscripting transforms together with your model for deployment

Note: exceptions for TorchScript support include Compose, RandomChoice, RandomOrder, Lambda, and transforms applied on PIL images, such as ToPILImage.

[Stable] Native image IO for JPEG and PNG formats

torchvision 0.8.0 introduces native image reading and writing operations for JPEG and PNG formats. Those operators support TorchScript and return CxHxW tensors in uint8 format, and can thus now be part of your model for deployment in C++ environments.

from torchvision.io import read_image

# tensor_image is a CxHxW uint8 Tensor
tensor_image = read_image('path_to_image.jpeg')

# or equivalently
from torchvision.io import read_file, decode_image
# raw_data is a 1d uint8 Tensor with the raw bytes
raw_data = read_file('path_to_image.jpeg')
tensor_image = decode_image(raw_data)

# all operators are torchscriptable and can be
# serialized together with your model torchscript code
scripted_read_image = torch.jit.script(read_image)

[Stable] RetinaNet detection model

This release adds pretrained models for RetinaNet with a ResNet50 backbone from Focal Loss for Dense Object Detection.
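As a quick, hedged sketch (not part of the original announcement; the input sizes are arbitrary), the pretrained model can be loaded like any other torchvision detection model:

import torch
import torchvision

x = [torch.rand(3, 300, 400), torch.rand(3, 500, 400)]
m_retinanet = torchvision.models.detection.retinanet_resnet50_fpn(pretrained=True)
m_retinanet.eval()
# each element of predictions is a dict with "boxes", "scores" and "labels"
predictions = m_retinanet(x)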

[Beta] New Video Reader API

This release introduces a new video reading abstraction, which gives more fine-grained control of iteration over videos. It supports image and audio, and implements an iterator interface so that it is interoperable with other Python libraries such as itertools.

from torchvision.io import VideoReader

# stream indicates if reading from audio or video
reader = VideoReader('path_to_video.mp4', stream='video')
# can change the stream after construction
# via reader.set_current_stream

# to read all frames in a video starting at 2 seconds
for frame in reader.seek(2):
    # frame is a dict with "data" and "pts" metadata
    print(frame["data"], frame["pts"])

# because reader is an iterator you can combine it with
# itertools
from itertools import takewhile, islice
# read 10 frames starting from 2 seconds
for frame in islice(reader.seek(2), 10):
    pass

# or to return all frames between 2 and 5 seconds
for frame in takewhile(lambda x: x["pts"] < 5, reader):
    pass

Notes:

• In order to use the Video Reader API beta, you must compile torchvision from source and have ffmpeg installed on your system.
• The VideoReader API is currently released as beta and its API may change following user feedback.

torchaudio

With this release, torchaudio is expanding its support for models and end-to-end applications, adding a wav2letter training pipeline and end-to-end text-to-speech and source separation pipelines. Please file an issue on github to provide feedback on them.

[Stable] Speech Recognition

Building on the addition of the wav2letter model for speech recognition in the last release, we’ve now added an example wav2letter training pipeline with the LibriSpeech dataset.

[Stable] Text-to-speech

With the goal of supporting text-to-speech applications, we added a vocoder based on the WaveRNN model, following the implementation from this repository. The original implementation was introduced in “Efficient Neural Audio Synthesis”. We also provide an example WaveRNN training pipeline that uses the LibriTTS dataset added to torchaudio in this release.

[Stable] Source Separation

With the addition of the ConvTasNet model, based on the paper “Conv-TasNet: Surpassing Ideal Time-Frequency Magnitude Masking for Speech Separation,” torchaudio now also supports source separation. An example ConvTasNet training pipeline is provided with the wsj-mix dataset.
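As a rough sketch (not from the release notes; the constructor argument and tensor shapes below are assumptions about the default torchaudio.models.ConvTasNet configuration), the model can be instantiated and run on a batch of mixtures:

import torch
from torchaudio.models import ConvTasNet

# assumed default configuration: separate 2 sources from a single-channel mixture
model = ConvTasNet(num_sources=2)
model.eval()

# batch of 3 single-channel mixtures of 32000 samples (shapes are illustrative)
mixture = torch.randn(3, 1, 32000)
with torch.no_grad():
    separated = model(mixture)  # expected shape: (3, 2, 32000)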

Cheers!

Team PyTorch

New PyTorch library releases including TorchVision Mobile, TorchAudio I/O, and more

by Team PyTorch

Today, we are announcing updates to a number of PyTorch libraries, alongside the PyTorch 1.8 release. The updates include new releases for the domain libraries including TorchVision, TorchText and TorchAudio, as well as a new version of TorchCSPRNG. These releases include a number of new features and improvements and, along with the PyTorch 1.8 release, provide a broad set of updates for the PyTorch community to build on and leverage.

Some highlights include:

• TorchVision - Added support for PyTorch Mobile including Detectron2Go (D2Go), auto-augmentation of data during training, on-the-fly type conversion, and AMP autocasting.
• TorchAudio - Major improvements to I/O, including defaulting to the sox_io backend and file-like object support. Added the Kaldi Pitch feature and support for a CMake-based build, allowing TorchAudio to better support no-Python environments.
• TorchText - Updated the dataset loading API to be compatible with standard PyTorch data loading utilities.
• TorchCSPRNG - Support for cryptographically secure pseudorandom number generators for PyTorch is now stable, with new APIs for AES128 ECB/CTR and CUDA support on Windows.

Please note that, starting in PyTorch 1.6, features are classified as Stable, Beta, and Prototype. Prototype features are not included as part of the binary distribution and are instead available through building from source, using nightlies, or via a compiler flag. You can see the detailed announcement here.

TorchVision 0.9.0

[Stable] TorchVision Mobile: Operators, Android Binaries, and Tutorial

We are excited to announce the first on-device support and binaries for a PyTorch domain library. We have seen significant appetite in both research and industry for on-device vision support to allow low latency, privacy friendly, and resource efficient mobile vision experiences. You can follow this new tutorial to build your own Android object detection app using TorchVision operators, D2Go, or your own custom operators and model.

[Stable] New Mobile models for Classification, Object Detection and Semantic Segmentation

We have added support for the MobileNetV3 architecture and provided pre-trained weights for Classification, Object Detection and Segmentation. It is easy to get up and running with these models: just import and load them as you would any torchvision model:

import torch
import torchvision

# Classification
x = torch.rand(1, 3, 224, 224)
m_classifier = torchvision.models.mobilenet_v3_large(pretrained=True)
m_classifier.eval()
predictions = m_classifier(x)

# Quantized Classification
x = torch.rand(1, 3, 224, 224)
m_classifier = torchvision.models.quantization.mobilenet_v3_large(pretrained=True)
m_classifier.eval()
predictions = m_classifier(x)

# Object Detection: Highly Accurate High Resolution Mobile Model
x = [torch.rand(3, 300, 400), torch.rand(3, 500, 400)]
m_detector = torchvision.models.detection.fasterrcnn_mobilenet_v3_large_fpn(pretrained=True)
m_detector.eval()
predictions = m_detector(x)

# Semantic Segmentation: Highly Accurate Mobile Model
x = torch.rand(1, 3, 520, 520)
m_segmenter = torchvision.models.segmentation.deeplabv3_mobilenet_v3_large(pretrained=True)
m_segmenter.eval()
predictions = m_segmenter(x)

These models are highly competitive with TorchVision’s existing models on resource efficiency, speed, and accuracy. See our release notes for detailed performance metrics.

[Stable] AutoAugment

AutoAugment is a common Data Augmentation technique that can increase the accuracy of Scene Classification models. Though the data augmentation policies are directly linked to their trained dataset, empirical studies show that ImageNet policies provide significant improvements when applied to other datasets. We’ve implemented 3 policies learned on the following datasets: ImageNet, CIFAR10 and SVHN. These can be used standalone or mixed-and-matched with existing transforms:

from torchvision import transforms

t = transforms.AutoAugment()
transformed = t(image)

transform = transforms.Compose([
   transforms.Resize(256),
   transforms.AutoAugment(),
   transforms.ToTensor()])

Other New Features for TorchVision

• [Stable] All read and decode methods in the io.image package now support:
  • Palette, Grayscale Alpha and RGB Alpha image types during PNG decoding
  • On-the-fly conversion of images from one type to the other during read
• [Stable] WiderFace dataset
• [Stable] Improved FasterRCNN speed and accuracy by introducing a score threshold on RPN
• [Stable] Modulation input for DeformConv2D
• [Stable] Option to write audio to a video file
• [Stable] Utility to draw bounding boxes
• [Beta] Autocast support in all Operators

Find the full TorchVision release notes here.

TorchAudio 0.8.0

I/O Improvements

We have continued our work from the previous release to improve TorchAudio’s I/O support, including:
• [Stable] Changed the default backend to “sox_io” (for Linux/macOS), and updated the “soundfile” backend’s interface to align with that of “sox_io”. The legacy backend and interface are still accessible, though using them is strongly discouraged.
• [Stable] File-like object support in the “sox_io” backend, the “soundfile” backend and sox_effects.
• [Stable] New options to change the format, encoding, and bits_per_sample when saving.
• [Stable] Added GSM, HTK, AMB, AMR-NB and AMR-WB format support to the “sox_io” backend.
• [Beta] A new functional.apply_codec function which can degrade audio data by applying audio codecs supported by the “sox_io” backend in an in-memory fashion.

Here are some examples of features landed in this release:

import io
import boto3
import requests
import torchaudio

# Load audio over HTTP
with requests.get(URL, stream=True) as response:
    waveform, sample_rate = torchaudio.load(response.raw)

# Save to a bytes buffer as 16-bit signed PCM
buffer_ = io.BytesIO()
torchaudio.save(
    buffer_, waveform, sample_rate,
    format="wav", encoding="PCM_S", bits_per_sample=16)

# Apply effects while loading audio from S3
client = boto3.client('s3')
response = client.get_object(Bucket=S3_BUCKET, Key=S3_KEY)
waveform, sample_rate = torchaudio.sox_effects.apply_effects_file(
    response['Body'],
    [["lowpass", "-1", "300"], ["rate", "8000"]])

# Apply GSM codec to Tensor
encoded = torchaudio.functional.apply_codec(
    waveform, sample_rate, format="gsm")

        Check out the revamped audio preprocessing tutorial, Audio Manipulation with TorchAudio.

[Stable] Switch to CMake-based build

In the previous version of TorchAudio, CMake was used to build third-party dependencies. Starting in 0.8.0, TorchAudio uses CMake to build its C++ extension as well. This opens the door to integrating TorchAudio in non-Python environments (such as C++ applications and mobile). We will continue working on adding example applications and mobile integrations.

[Beta] Improved and New Audio Transforms

We have added two widely requested operators in this release: the SpectralCentroid transform and the Kaldi Pitch feature extraction (detailed in “A pitch extraction algorithm tuned for automatic speech recognition”). We’ve also exposed a normalization method to Mel transforms, and additional STFT arguments to Spectrogram. We would like to ask our community to continue to raise feature requests for core audio processing features like these!
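A hedged sketch of the new operators follows (it assumes a torchaudio 0.8 build that includes the Kaldi extension; the sample rate and random waveform are illustrative):

import torch
import torchaudio

sample_rate = 16000
waveform = torch.randn(1, sample_rate)  # one second of synthetic audio

# spectral centroid over time
centroid = torchaudio.transforms.SpectralCentroid(sample_rate=sample_rate)(waveform)

# Kaldi-compatible pitch feature (requires a build with the Kaldi extension)
pitch_feature = torchaudio.functional.compute_kaldi_pitch(waveform, sample_rate)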

Community Contributions

We had more contributions from the open source community in this release than ever before, including several completely new features. We would like to extend our sincere thanks to the community. Please check out the newly added CONTRIBUTING.md for ways to contribute code, and remember that reporting bugs and requesting features are just as valuable. We will continue posting well-scoped work items as issues labeled “help-wanted” and “contributions-welcome” for anyone who would like to contribute code, and are happy to coach new contributors through the contribution process.

Find the full TorchAudio release notes here.

TorchText 0.9.0

[Beta] Dataset API Updates

In this release, we are updating TorchText’s dataset API to be compatible with PyTorch data utilities, such as DataLoader, and are deprecating TorchText’s custom data abstractions such as Field. The updated datasets are simple string-by-string iterators over the data. For guidance on migrating from the legacy abstractions to the modern PyTorch data utilities, please refer to our migration guide.

The text datasets listed below have been updated as part of this work. For examples of how to use these datasets, please refer to our end-to-end text classification tutorial, or see the brief sketch after the list below.

• Language modeling: WikiText2, WikiText103, PennTreebank, EnWik9
• Text classification: AG_NEWS, SogouNews, DBpedia, YelpReviewPolarity, YelpReviewFull, YahooAnswers, AmazonReviewPolarity, AmazonReviewFull, IMDB
• Sequence tagging: UDPOS, CoNLL2000Chunking
• Translation: IWSLT2016, IWSLT2017
• Question answering: SQuAD1, SQuAD2
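A brief, hedged sketch of the iterator-style API (the dataset choice and the field order of the yielded items are illustrative):

from torchtext.datasets import AG_NEWS
from torch.utils.data import DataLoader

# each item is yielded as a raw (label, text) pair
train_iter = AG_NEWS(split='train')
label, text = next(iter(train_iter))

# the same iterator should plug into the standard PyTorch DataLoader
dataloader = DataLoader(AG_NEWS(split='train'), batch_size=8, shuffle=False)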

        Find the full TorchText release notes here.

[Stable] TorchCSPRNG 0.2.0

In August 2020, we released TorchCSPRNG, a PyTorch C++/CUDA extension that provides cryptographically secure pseudorandom number generators for PyTorch. Today, we are releasing version 0.2.0 and designating the library as stable. This release includes a new API for encrypt/decrypt with AES128 ECB/CTR, as well as CUDA 11 and Windows CUDA support.

Find the full TorchCSPRNG release notes here.

        Thanks for reading, and if you are excited about these updates and want to participate in the future of PyTorch, we encourage you to join the discussion forums and open GitHub issues.

Cheers!

Team PyTorch

PyTorch 1.8 Release, including Compiler and Distributed Training updates, and New Mobile Tutorials

by Team PyTorch

We are excited to announce the availability of PyTorch 1.8. This release is composed of more than 3,000 commits since 1.7. It includes major updates and new features for compilation, code optimization, frontend APIs for scientific computing, and AMD ROCm support through binaries that are available via pytorch.org. It also provides improved features for large-scale training for pipeline and model parallelism, and gradient compression. A few of the highlights include:

1. Support for doing python to python functional transformations via torch.fx;
2. Added or stabilized APIs to support FFTs (torch.fft) and Linear Algebra functions (torch.linalg), added support for autograd for complex tensors, and updates to improve performance for calculating hessians and jacobians; and
3. Significant updates and improvements to distributed training, including: improved NCCL reliability; pipeline parallelism support; RPC profiling; and support for communication hooks adding gradient compression.

See the full release notes here.

        Along with 1.8, we are also releasing major updates to PyTorch libraries including TorchCSPRNG, TorchVision, TorchText and TorchAudio. For more on the library releases, see the post here. As previously noted, features in PyTorch releases are classified as Stable, Beta and Prototype. You can learn more about the definitions in the post here.

New and Updated APIs

The PyTorch 1.8 release brings a host of new and updated API surfaces, ranging from additional APIs for NumPy compatibility to ways to improve and scale your code for performance at both inference and training time. Here is a brief summary of the major features coming in this release:

[Stable] Torch.fft support for high performance NumPy style FFTs

As part of PyTorch’s goal to support scientific computing, we have invested in improving our FFT support and with PyTorch 1.8, we are releasing the torch.fft module. This module implements the same functions as NumPy’s np.fft module, but with support for hardware acceleration and autograd.
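A small, hedged sketch of the NumPy-style API (the signal is random and the tolerance is arbitrary):

import torch

t = torch.randn(64)
freq = torch.fft.fft(t)           # complex frequency-domain representation
roundtrip = torch.fft.ifft(freq)  # back to the (complex-typed) signal

# real-input helpers such as torch.fft.rfft are also available
assert torch.allclose(roundtrip.real, t, atol=1e-5)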

[Beta] Support for NumPy style linear algebra functions via torch.linalg

The torch.linalg module, modeled after NumPy’s np.linalg module, brings NumPy-style support for common linear algebra operations including Cholesky decompositions, determinants, eigenvalues and many others.
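A hedged sketch of a few of the NumPy-style functions (the matrix is random; making it positive definite is only needed for the Cholesky call):

import torch

a = torch.randn(3, 3)
spd = a @ a.T + 3 * torch.eye(3)   # a symmetric positive-definite matrix

l = torch.linalg.cholesky(spd)     # Cholesky factor
d = torch.linalg.det(spd)          # determinant
n = torch.linalg.norm(spd)         # Frobenius norm by default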

[Beta] Python code Transformations with FX

FX allows you to write transformations of the form transform(input_module : nn.Module) -> nn.Module, where you can feed in a Module instance and get a transformed Module instance out of it.
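A minimal sketch of such a transform (the toy module and the add-to-mul rewrite are illustrative, not taken from the release notes):

import torch
import torch.fx

class M(torch.nn.Module):
    def forward(self, x, y):
        return torch.add(x, y)

def transform(m: torch.nn.Module) -> torch.nn.Module:
    gm = torch.fx.symbolic_trace(m)      # capture the module as a graph
    for node in gm.graph.nodes:
        # rewrite every call to torch.add into a call to torch.mul
        if node.op == 'call_function' and node.target == torch.add:
            node.target = torch.mul
    gm.graph.lint()
    gm.recompile()
    return gm

transformed = transform(M())
print(transformed(torch.ones(2), torch.full((2,), 3.0)))  # tensor([3., 3.])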

This kind of functionality is applicable in many scenarios. For example, the FX-based Graph Mode Quantization product is releasing as a prototype contemporaneously with FX. Graph Mode Quantization automates the process of quantizing a neural net and does so by leveraging FX’s program capture, analysis and transformation facilities. We are also developing many other transformation products with FX and we are excited to share this powerful toolkit with the community.

Because FX transforms consume and produce nn.Module instances, they can be used within many existing PyTorch workflows. This includes workflows that, for example, train in Python then deploy via TorchScript.

You can read more about FX in the official documentation. You can also find several examples of program transformations implemented using torch.fx here. We are constantly improving FX and invite you to share any feedback you have about the toolkit on the forums or issue tracker.

We’d like to acknowledge TorchScript tracing, Apache MXNet hybridize, and more recently JAX as influences for program acquisition via tracing. We’d also like to acknowledge Caffe2, JAX, and TensorFlow as inspiration for the value of simple, directed dataflow graph program representations and transformations over those representations.

Distributed Training

The PyTorch 1.8 release added a number of new features as well as improvements to reliability and usability. Concretely: stable-level async error/timeout handling was added to improve NCCL reliability, and RPC-based profiling is now stable. Additionally, we have added support for pipeline parallelism as well as gradient compression through the use of communication hooks in DDP. Details are below:

[Beta] Pipeline Parallelism

As machine learning models continue to grow in size, traditional Distributed DataParallel (DDP) training no longer scales as these models don’t fit on a single GPU device. The new pipeline parallelism feature provides an easy to use PyTorch API to leverage pipeline parallelism as part of your training loop.
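A rough sketch based on the torch.distributed.pipeline.sync.Pipe API as documented for this release; it assumes two GPUs, and the layer sizes, worker name and port are illustrative:

import os
import torch
import torch.nn as nn
from torch.distributed import rpc
from torch.distributed.pipeline.sync import Pipe

# Pipe currently relies on the RPC framework, even in a single-process setup
os.environ['MASTER_ADDR'] = 'localhost'
os.environ['MASTER_PORT'] = '29500'
rpc.init_rpc("worker", rank=0, world_size=1)

# a toy two-stage model with one stage per GPU
fc1 = nn.Linear(16, 8).cuda(0)
fc2 = nn.Linear(8, 4).cuda(1)
model = Pipe(nn.Sequential(fc1, fc2), chunks=8)

x = torch.rand(32, 16).cuda(0)
output_rref = model(x)              # forward returns an RRef to the output
output = output_rref.local_value()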

[Beta] DDP Communication Hook

The DDP communication hook is a generic interface to control how to communicate gradients across workers by overriding the vanilla allreduce in DistributedDataParallel. A few built-in communication hooks are provided including PowerSGD, and users can easily apply any of these hooks to optimize communication. Additionally, the communication hook interface can also support user-defined communication strategies for more advanced use cases.
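A hedged sketch of registering one of the built-in hooks (my_model and rank are placeholders, and the process group is assumed to be initialized already):

import torch
import torch.distributed as dist
from torch.distributed.algorithms.ddp_comm_hooks import default_hooks
from torch.nn.parallel import DistributedDataParallel as DDP

# assumes dist.init_process_group("nccl", rank=rank, world_size=world_size)
# has already been called; my_model is a placeholder nn.Module
ddp_model = DDP(my_model.cuda(rank), device_ids=[rank])

# compress gradients to fp16 before allreduce, decompress after
ddp_model.register_comm_hook(state=None, hook=default_hooks.fp16_compress_hook)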

Additional Prototype Features for Distributed Training

In addition to the major stable and beta distributed training features in this release, we also have a number of prototype features available in our nightlies to try out and provide feedback. We have linked in the draft docs below for reference:

• (Prototype) ZeroRedundancyOptimizer - Based on and in partnership with the Microsoft DeepSpeed team, this feature helps reduce per-process memory footprint by sharding optimizer states across all participating processes in the ProcessGroup gang. Refer to this documentation for more details.
• (Prototype) Process Group NCCL Send/Recv - The NCCL send/recv API was introduced in v2.7 and this feature adds support for it in NCCL process groups. This feature will provide an option for users to implement collective operations at the Python layer instead of the C++ layer. Refer to this documentation and code examples to learn more.
• (Prototype) CUDA-support in RPC using TensorPipe - This feature should bring consequent speed improvements for users of PyTorch RPC with multi-GPU machines, as TensorPipe will automatically leverage NVLink when available, and avoid costly copies to and from host memory when exchanging GPU tensors between processes. When not on the same machine, TensorPipe will fall back to copying the tensor to host memory and sending it as a regular CPU tensor. This will also improve the user experience as users will be able to treat GPU tensors like regular CPU tensors in their code. Refer to this documentation for more details.
• (Prototype) Remote Module - This feature allows users to operate a module on a remote worker as if using a local module, where the RPCs are transparent to the user. In the past, this functionality was implemented in an ad-hoc way; overall, this feature will improve the usability of model parallelism on PyTorch. Refer to this documentation for more details.

PyTorch Mobile

Support for PyTorch Mobile is expanding with a new set of tutorials to help new users launch models on-device quicker and give existing users a tool to get more out of our framework. These include:

Our new demo apps also include examples of image segmentation, object detection, neural machine translation, question answering, and vision transformers. They are available on both iOS and Android:

In addition to performance improvements on CPU for MobileNetV3 and other models, we also revamped our Android GPU backend prototype for broader model coverage and faster inferencing:

Lastly, we are launching the PyTorch Mobile Lite Interpreter as a prototype feature in this release. The Lite Interpreter allows users to reduce the runtime binary size. Please try these out and send us your feedback on the PyTorch Forums. All our latest updates can be found on the PyTorch Mobile page.

[Prototype] PyTorch Mobile Lite Interpreter

PyTorch Lite Interpreter is a streamlined version of the PyTorch runtime that can execute PyTorch programs on resource-constrained devices, with a reduced binary size footprint. This prototype feature reduces binary sizes by up to 70% compared to the current on-device runtime in the current release.

Performance Optimization

In 1.8, we are releasing support for benchmark utils to enable users to better monitor performance. We are also opening up a new automated quantization API. See the details below:

(Beta) Benchmark utils

Benchmark utils allows users to take accurate performance measurements, and provides composable tools to help with both benchmark formulation and post processing. This is expected to be helpful for contributors to PyTorch to quickly understand how their contributions are impacting PyTorch performance.

Example:

from torch.utils.benchmark import Timer

results = []
for num_threads in [1, 2, 4]:
    timer = Timer(
        stmt="torch.add(x, y, out=out)",
        setup="""
            n = 1024
            x = torch.ones((n, n))
            y = torch.ones((n, 1))
            out = torch.empty((n, n))
        """,
        num_threads=num_threads,
    )
    results.append(timer.blocked_autorange(min_run_time=5))
    print(
        f"{num_threads} thread{'s' if num_threads > 1 else ' ':<4}"
        f"{results[-1].median * 1e6:>4.0f} us   " +
        (f"({results[0].median / results[-1].median:.1f}x)" if num_threads > 1 else '')
    )

1 thread     376 us
2 threads    189 us   (2.0x)
4 threads     99 us   (3.8x)

(Prototype) FX Graph Mode Quantization

FX Graph Mode Quantization is the new automated quantization API in PyTorch. It improves upon Eager Mode Quantization by adding support for functionals and automating the quantization process, although people might need to refactor the model to make the model compatible with FX Graph Mode Quantization (symbolically traceable with torch.fx).
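A rough sketch of post-training static quantization with the prototype API as it stood at this release (the toy model and calibration data are illustrative):

import torch
from torch.quantization import get_default_qconfig
from torch.quantization.quantize_fx import prepare_fx, convert_fx

float_model = torch.nn.Sequential(torch.nn.Linear(16, 8), torch.nn.ReLU()).eval()
qconfig_dict = {"": get_default_qconfig("fbgemm")}

# insert observers, run representative data through, then convert
prepared = prepare_fx(float_model, qconfig_dict)
with torch.no_grad():
    prepared(torch.randn(4, 16))   # calibration pass with illustrative data
quantized = convert_fx(prepared)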

Hardware Support

[Beta] Ability to Extend the PyTorch Dispatcher for a new backend in C++

In PyTorch 1.8, you can now create new out-of-tree devices that live outside the pytorch/pytorch repo. The tutorial linked below shows how to register your device and keep it in sync with native PyTorch devices.

[Beta] AMD GPU Binaries Now Available

Starting in PyTorch 1.8, we have added support for ROCm wheels, providing an easy onboarding to using AMD GPUs. You can simply go to the standard PyTorch installation selector, choose ROCm as an installation option, and execute the provided command.

        Thanks for reading, and if you are excited about these updates and want to participate in the future of PyTorch, we encourage you to join the discussion forums and open GitHub issues.

Cheers!

Team PyTorch

New PyTorch Library Releases in PyTorch 1.9, including TorchVision, TorchAudio, and more

by Team PyTorch

Today, we are announcing updates to a number of PyTorch libraries, alongside the PyTorch 1.9 release. The updates include new releases for the domain libraries including TorchVision, TorchText and TorchAudio. These releases, along with the PyTorch 1.9 release, include a number of new features and improvements that will provide a broad set of updates for the PyTorch community.

Some highlights include:

• TorchVision - Added new SSD and SSDLite models, quantized kernels for object detection, GPU JPEG decoding, and iOS support. See release notes here.
• TorchAudio - Added wav2vec 2.0 model deployable in non-Python environments (including C++, Android, and iOS). Many performance improvements in lfilter, spectral operations, and resampling. Added options for quality control in sampling (i.e. Kaiser window support). Initiated the migration of complex tensor operations. Improved autograd support. See release notes here.
• TorchText - Added a new high-performance Vocab module that provides common functional APIs for NLP workflows. See release notes here.

We’d like to thank the community for their support and work on this latest release.

Features in PyTorch releases are classified as Stable, Beta, and Prototype. You can learn more about the definitions in this blog post.

TorchVision 0.10

(Stable) Quantized kernels for object detection

The forward pass of the nms and roi_align operators now supports tensors with a quantized dtype, which can help lower the memory footprint of object detection models, particularly on mobile environments. For more details, refer to the documentation.

(Stable) Speed optimizations for Tensor transforms

The resize and flip transforms have been optimized and their runtime improved by up to 5x on the CPU.

(Stable) Documentation improvements

Significant improvements were made to the documentation. In particular, a new gallery of examples is available. These examples visually illustrate how each transform acts on an image, and also properly document and illustrate the output of the segmentation models.

The example gallery will be extended in the future to provide more comprehensive examples and serve as a reference for common torchvision tasks. For more details, refer to the documentation.

(Beta) New models for detection

SSD and SSDlite are two popular object detection architectures that are efficient in terms of speed and provide good results for low resolution pictures. In this release, we provide implementations for the original SSD model with a VGG16 backbone and for its mobile-friendly variant SSDlite with a MobileNetV3-Large backbone.

The models were pre-trained on COCO train2017 and can be used as follows:
import torch
import torchvision

# Original SSD variant
x = [torch.rand(3, 300, 300), torch.rand(3, 500, 400)]
m_detector = torchvision.models.detection.ssd300_vgg16(pretrained=True)
m_detector.eval()
predictions = m_detector(x)

# Mobile-friendly SSDlite variant
x = [torch.rand(3, 320, 320), torch.rand(3, 500, 400)]
m_detector = torchvision.models.detection.ssdlite320_mobilenet_v3_large(pretrained=True)
m_detector.eval()
predictions = m_detector(x)

        The following accuracies can be obtained on COCO val2017 (full results available in #3403 and #3757):

Model                          mAP    mAP@50   mAP@75
SSD300 VGG16                   25.1   41.5     26.2
SSDlite320 MobileNetV3-Large   21.3   34.3     22.1

        For more details, refer to the documentation.

(Beta) JPEG decoding on the GPU

Decoding jpegs is now possible on GPUs with the use of nvjpeg, which should be readily available in your CUDA setup. The decoding time of a single image should be about 2 to 3 times faster than with libjpeg on CPU. While the resulting tensor will be stored on the GPU device, the input raw tensor still needs to reside on the host (CPU), because the first stages of the decoding process take place on the host:

from torchvision.io.image import read_file, decode_jpeg

data = read_file('path_to_image.jpg')   # raw data is on CPU
img = decode_jpeg(data, device='cuda')  # decoded image is on GPU

For more details, see the documentation.

(Beta) iOS support

TorchVision 0.10 now provides pre-compiled iOS binaries for its C++ operators, which means you can run Faster R-CNN and Mask R-CNN on iOS. An example app on how to build a program leveraging those ops can be found here.

TorchAudio 0.9.0

(Stable) Complex Tensor Migration

TorchAudio has functions that handle complex-valued tensors. These functions follow a convention to use an extra dimension to represent real and imaginary parts. In PyTorch 1.6, the native complex type was introduced. As its API is getting stable, torchaudio has started to migrate to the native complex type.

In this release, we added support for native complex tensors, and you can opt-in to use them. Using the native complex types, we have verified that affected functions continue to support autograd and TorchScript; moreover, switching to native complex types improves their performance. For more details, refer to pytorch/audio#1337.

(Stable) Filtering Improvement

In release 0.8, we added the C++ implementation of the core part of lfilter for CPU, which improved the performance. In this release, we optimized some internal operations of the CPU implementation for further performance improvement. We also added autograd support to both CPU and GPU. Now lfilter and all the biquad filters (biquad, band_biquad, bass_biquad, treble_biquad, allpass_biquad, lowpass_biquad, highpass_biquad, bandpass_biquad, equalizer_biquad and bandreject_biquad) benefit from the performance improvement and support autograd. We also moved the implementation of overdrive to C++ for performance improvement. For more details, refer to the documentation.
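A small, hedged sketch showing gradients flowing through one of the biquad filters (the waveform and filter settings are illustrative):

import torch
import torchaudio.functional as F

waveform = torch.randn(1, 16000, requires_grad=True)  # one second of synthetic audio
filtered = F.lowpass_biquad(waveform, sample_rate=16000, cutoff_freq=1000.0)

# autograd now flows through the filter implementation
filtered.sum().backward()
print(waveform.grad.shape)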

(Stable) Improved Autograd Support

Along with the work of Complex Tensor Migration and Filtering Improvement, we also added autograd tests to transforms. lfilter, biquad and its variants, and most transforms are now guaranteed to support autograd. For more details, refer to the release note.

(Stable) Improved Windows Support

Torchaudio implements some operations in C++ for reasons such as performance and integration with third-party libraries. These C++ components were previously only available on Linux and macOS. In this release, we have added support for Windows. With this, the efficient filtering implementation mentioned above is also available on Windows.

However, please note that not all the C++ components are available for Windows. The “sox_io” backend and torchaudio.functional.compute_kaldi_pitch are not supported.

(Stable) I/O Functions Migration

Since the 0.6 release, we have continuously improved I/O functionality. Specifically, in 0.8 we changed the default backend from “sox” to “sox_io” and applied the same switch to the API of the “soundfile” backend. The 0.9 release concludes this migration by removing the deprecated backends. For more details, please refer to #903.

(Beta) Wav2Vec2.0 Model

We have added the model architectures from Wav2Vec2.0. You can import fine-tuned model parameters published on fairseq and Hugging Face Hub. Our model definition supports TorchScript, and it is possible to deploy the model to non-Python environments, such as C++, Android and iOS.

The following code snippets illustrate such a use case. Please check out our C++ example directory for the complete example. Currently, it is designed for running inference. If you would like more support for training, please file a feature request.
# Import fine-tuned model from Hugging Face Hub
from transformers import Wav2Vec2ForCTC
from torchaudio.models.wav2vec2.utils import import_huggingface_model

original = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")
imported = import_huggingface_model(original)

# Import fine-tuned model from fairseq
import fairseq
from torchaudio.models.wav2vec2.utils import import_fairseq_model

original, _, _ = fairseq.checkpoint_utils.load_model_ensemble_and_task(
    ["wav2vec_small_960h.pt"], arg_overrides={'data': "<data_dir>"})
imported = import_fairseq_model(original[0].w2v_encoder)

# Build uninitialized model and load state dict
import torch
from torchaudio.models import wav2vec2_base
from torch.utils.mobile_optimizer import optimize_for_mobile

model = wav2vec2_base(num_out=32)
model.load_state_dict(imported.state_dict())

# Quantize / script / optimize for mobile
quantized_model = torch.quantization.quantize_dynamic(
    model, qconfig_spec={torch.nn.Linear}, dtype=torch.qint8)
scripted_model = torch.jit.script(quantized_model)
optimized_model = optimize_for_mobile(scripted_model)
optimized_model.save("model_for_deployment.pt")

For more details, see the documentation.

(Beta) Resampling Improvement

In release 0.8, we vectorized the operation in torchaudio.compliance.kaldi.resample_waveform, which improved the performance of resample_waveform and torchaudio.transforms.Resample. In this release, we have further revised the way the resampling algorithm is implemented.

We have:

• Added Kaiser window support for a wider range of resampling quality.
• Added a rolloff parameter for anti-aliasing control.
• Added a mechanism to precompute the kernel and cache it in torchaudio.transforms.Resample for even faster operation.
• Moved the implementation from torchaudio.compliance.kaldi.resample_waveform to torchaudio.functional.resample and deprecated torchaudio.compliance.kaldi.resample_waveform.

For more details, see the documentation.

(Prototype) RNN Transducer Loss

The RNN transducer loss is used in training RNN transducer models, which is a popular architecture for speech recognition tasks. The prototype loss in torchaudio currently supports autograd, torchscript, float16 and float32, and can also be run on both CPU and CUDA. For more details, please refer to the documentation.

TorchText 0.10.0

(Beta) New Vocab Module

In this release, we introduce a new Vocab module that replaces the current Vocab class. The new Vocab provides common functional APIs for NLP workflows. This module is backed by an efficient C++ implementation that reduces batch look-up time by up to ~85% (refer to the summary of #1248 and #1290 for further information on benchmarks), and provides support for TorchScript. We provide accompanying factory functions that can be used to build the Vocab object either through a python ordered dictionary or an Iterator that yields lists of tokens.

# creating Vocab from text file
import io
from torchtext.vocab import build_vocab_from_iterator
# generator that yields lists of tokens
def yield_tokens(file_path):
   with io.open(file_path, encoding = 'utf-8') as f:
      for line in f:
          yield line.strip().split()
# get Vocab object
vocab_obj = build_vocab_from_iterator(yield_tokens(file_path), specials=["<unk>"])

# creating Vocab through ordered dict
from torchtext.vocab import vocab
from collections import Counter, OrderedDict
counter = Counter(["a", "a", "b", "b", "b"])
sorted_by_freq_tuples = sorted(counter.items(), key=lambda x: x[1], reverse=True)
ordered_dict = OrderedDict(sorted_by_freq_tuples)
vocab_obj = vocab(ordered_dict)

# common API usage

# look-up index
vocab_obj["a"]

# batch look-up indices
vocab_obj.lookup_indices(["a", "b"])
# supports the forward API of PyTorch nn Modules
vocab_obj(["a", "b"])

# batch look-up tokens
vocab_obj.lookup_tokens([0, 1])

# set default index to return when token not found
vocab_obj.set_default_index(0)
vocab_obj["out_of_vocabulary"]  # prints 0

For more details, refer to the documentation.

Thanks for reading. If you’re interested in these updates and want to join the PyTorch community, we encourage you to join the discussion forums and open GitHub issues. To get the latest news from PyTorch, follow us on Facebook, Twitter, Medium, YouTube or LinkedIn.

Cheers!

-Team PyTorch

PyTorch 1.9 Release, including torch.linalg and Mobile Interpreter

by Team PyTorch

We are excited to announce the release of PyTorch 1.9. The release is composed of more than 3,400 commits since 1.8, made by 398 contributors. The release notes are available here. Highlights include:

1. Major improvements to support scientific computing, including torch.linalg, torch.special, and Complex Autograd
2. Major improvements in on-device binary size with Mobile Interpreter
3. Native support for elastic fault-tolerant training through the upstreaming of TorchElastic into PyTorch Core
4. Major updates to the PyTorch RPC framework to support large scale distributed training with GPU support
5. New APIs to optimize performance and packaging for model inference deployment
6. Support for Distributed training, GPU utilization and SM efficiency in the PyTorch Profiler

Along with 1.9, we are also releasing major updates to the PyTorch libraries, which you can read about in this blog post.

We’d like to thank the community for their support and work on this latest release. We’d especially like to thank Quansight and Microsoft for their contributions.

Features in PyTorch releases are classified as Stable, Beta, and Prototype. You can learn more about the definitions in this blog post.

Frontend APIs

(Stable) torch.linalg

In 1.9, the torch.linalg module is moving to a stable release. Linear algebra is essential to deep learning and scientific computing, and the torch.linalg module extends PyTorch’s support for it with implementations of every function from NumPy’s linear algebra module (now with support for accelerators and autograd) and more, like torch.linalg.matrix_norm and torch.linalg.householder_product. This makes the module immediately familiar to users who have worked with NumPy. Refer to the documentation here.

We plan to publish another blog post with more details on the torch.linalg module next week!

(Stable) Complex Autograd

The Complex Autograd feature, released as a beta in PyTorch 1.8, is now stable. Since the beta release, we have extended support for Complex Autograd to over 98% of operators in PyTorch 1.9, improved testing for complex operators by adding more OpInfos, and added greater validation through TorchAudio migration to native complex tensors (refer to this issue).

This feature provides users the functionality to calculate complex gradients and optimize real valued loss functions with complex variables. This is a required feature for multiple current and downstream prospective users of complex numbers in PyTorch like TorchAudio, ESPNet, Asteroid, and FastMRI. Refer to the documentation for more details.

(Stable) torch.use_deterministic_algorithms()

To help with debugging and writing reproducible programs, PyTorch 1.9 includes a torch.use_deterministic_algorithms option. When this setting is enabled, operations will behave deterministically, if possible, or throw a runtime error if they might behave nondeterministically. Here are a couple of examples:
>>> a = torch.randn(100, 100, 100, device='cuda').to_sparse()
>>> b = torch.randn(100, 100, 100, device='cuda')

# Sparse-dense CUDA bmm is usually nondeterministic
>>> torch.bmm(a, b).eq(torch.bmm(a, b)).all().item()
False

>>> torch.use_deterministic_algorithms(True)

# Now torch.bmm gives the same result each time, but with reduced performance
>>> torch.bmm(a, b).eq(torch.bmm(a, b)).all().item()
True

# CUDA kthvalue has no deterministic algorithm, so it throws a runtime error
>>> torch.zeros(10000, device='cuda').kthvalue(1)
RuntimeError: kthvalue CUDA does not have a deterministic implementation...

        PyTorch 1.9 adds deterministic implementations for a number of indexing operations, too, including index_add, index_copy, and index_put with accum=False. For more details, refer to the documentation and reproducibility note.

(Beta) torch.special

A torch.special module, analogous to SciPy’s special module, is now available in beta. This module contains many functions useful for scientific computing and working with distributions such as iv, ive, erfcx, logerfc, and logerfcx. Refer to the documentation for more details.
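A brief, hedged sketch; it assumes that the particular functions shown here (expit, erfc and gammaln) are among those available in the beta module:

import torch

x = torch.linspace(-3, 3, 5)
print(torch.special.expit(x))    # logistic sigmoid
print(torch.special.erfc(x))     # complementary error function
print(torch.special.gammaln(torch.tensor([0.5, 1.0, 5.0])))  # log-gamma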

(Beta) nn.Module parameterization

nn.Module parameterization allows users to parametrize any parameter or buffer of an nn.Module without modifying the nn.Module itself. It allows you to constrain the space in which your parameters live without the need for special optimization methods.

This also contains a new implementation of the spectral_norm parametrization for PyTorch 1.9. More parametrizations will be added to this feature (weight_norm, matrix constraints and part of pruning) for the feature to become stable in 1.10. For more details, refer to the documentation and tutorial.
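A minimal sketch (the Symmetric parametrization below is an illustrative constraint, not a built-in one):

import torch
import torch.nn as nn
import torch.nn.utils.parametrize as parametrize

class Symmetric(nn.Module):
    def forward(self, X):
        # constrain a square weight matrix to be symmetric
        return X.triu() + X.triu(1).transpose(-1, -2)

layer = nn.Linear(3, 3)
parametrize.register_parametrization(layer, "weight", Symmetric())
print(torch.allclose(layer.weight, layer.weight.T))  # True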

PyTorch Mobile

(Beta) Mobile Interpreter

We are releasing Mobile Interpreter, a streamlined version of the PyTorch runtime, in beta. The Interpreter will execute PyTorch programs on edge devices, with a reduced binary size footprint.

Mobile Interpreter is one of the top requested features for PyTorch Mobile. This new release will significantly reduce binary size compared with the current on-device runtime. In order for you to get the binary size improvements with our interpreter (which can reduce the binary size up to ~75% for a typical application), follow these instructions. As an example, using Mobile Interpreter, we can reach 2.6 MB compressed with MobileNetV2 in arm64-v7a Android. With this latest release we are making it much simpler to integrate the interpreter by providing pre-built libraries for iOS and Android.

TorchVision Library

Starting from 1.9, users can use the TorchVision library in their iOS/Android apps. The TorchVision library contains the C++ TorchVision ops and needs to be linked together with the main PyTorch library for iOS; for Android it can be added as a gradle dependency. This allows using TorchVision prebuilt MaskRCNN operators for object detection and segmentation. To learn more about the library, please refer to our tutorials and demo apps.

        Demo apps

        + +

        We are releasing a new video app based on PyTorch Video library and an updated speech recognition app based on the latest torchaudio, wave2vec model. Both are available on iOS and Android. In addition, we have updated the seven Computer Vision and three Natural Language Processing demo apps, including the HuggingFace DistilBERT, and the DeiT vision transformer models, with PyTorch Mobile v1.9. With the addition of these two apps, we now offer a full suite of demo apps covering image, text, audio, and video. To get started check out our iOS demo apps and Android demo apps.

        + +

        Distributed Training

        + +

        (Beta) TorchElastic is now part of core

        + +

        TorchElastic, which was open sourced over a year ago in the pytorch/elastic github repository, is a runner and coordinator for PyTorch worker processes. Since then, it has been adopted by various distributed torch use-cases: 1) deepspeech.pytorch 2) pytorch-lightning 3) Kubernetes CRD. Now, it is part of PyTorch core.

        + +

As its name suggests, the core function of TorchElastic is to gracefully handle scaling events. A notable corollary of elasticity is that peer discovery and rank assignment are built into TorchElastic, enabling users to run distributed training on preemptible instances without requiring a gang scheduler. As a side note, etcd used to be a hard dependency of TorchElastic. With the upstreaming to core, this is no longer the case, since we have added a “standalone” rendezvous based on c10d::Store. For more details, refer to the documentation.

        + +

        (Beta) Distributed Training Updates

        + +

        In addition to TorchElastic, there are a number of beta features available in the distributed package:

        + +
        • (Beta) CUDA support is available in RPC: Compared to CPU RPC and general-purpose RPC frameworks, CUDA RPC is a much more efficient way for P2P tensor communication. It is built on top of TensorPipe, which can automatically choose a communication channel for each tensor based on the tensor's device type and the channels available on both the caller and the callee. Existing TensorPipe channels cover NVLink, InfiniBand, SHM, CMA, TCP, etc. See this recipe for how CUDA RPC helps to attain a 34x speedup compared to CPU RPC.

        • (Beta) ZeroRedundancyOptimizer: ZeroRedundancyOptimizer can be used in conjunction with DistributedDataParallel to reduce the size of per-process optimizer states. The idea comes from the DeepSpeed/ZeRO project and Marian, where the optimizer in each process owns a shard of the model parameters and their corresponding optimizer states. When running step(), each optimizer only updates its own shard of parameters, and then uses collective communication to synchronize updated parameters across all processes (see the sketch after this list). Refer to this documentation and this tutorial to learn more.

        • (Beta) Support for profiling distributed collectives: PyTorch's profiler tools, torch.profiler and torch.autograd.profiler, are able to profile distributed collective and point-to-point communication primitives, including allreduce, alltoall, allgather, send/recv, etc. This is enabled for all backends supported natively by PyTorch: gloo, mpi, and nccl. It can be used to debug performance issues, analyze traces that contain distributed communication, and gain insight into the performance of applications that use distributed training. To learn more, refer to this documentation.
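As a rough sketch of the ZeroRedundancyOptimizer usage described above (assuming the script is launched with torchrun and the default process group is already initialized; the model and sizes are made up for illustration):

import torch
import torch.nn as nn
from torch.distributed.optim import ZeroRedundancyOptimizer
from torch.nn.parallel import DistributedDataParallel as DDP

def train_step(rank):
    # wrap the local model with DDP as usual
    model = DDP(nn.Linear(2000, 2000).to(rank), device_ids=[rank])
    # each rank only materializes optimizer state for its own shard of parameters
    optimizer = ZeroRedundancyOptimizer(
        model.parameters(),
        optimizer_class=torch.optim.Adam,
        lr=1e-3,
    )
    loss = model(torch.randn(20, 2000, device=rank)).sum()
    loss.backward()
    # updates the local shard, then broadcasts the updated parameters to peers
    optimizer.step()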

        Performance Optimization and Tooling

        + +

        (Stable) Freezing API

        + +

Module Freezing is the process of inlining module parameters and attribute values as constants into the TorchScript internal representation. This allows further optimization and specialization of your program, both for TorchScript optimizations and lowering to other backends. It is used by the optimize_for_mobile API, ONNX, and others.

        + +

Freezing is recommended for model deployment. It helps TorchScript JIT optimizations optimize away the overhead and bookkeeping that is necessary for training, tuning, or debugging PyTorch models. It enables graph fusions that are not semantically valid on non-frozen graphs, such as fusing Conv-BN. For more details, refer to the documentation.
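A minimal sketch of the freezing workflow (the toy Conv-BN module here is only for illustration):

import torch
import torch.nn as nn

class ConvBN(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv = nn.Conv2d(3, 8, 3)
        self.bn = nn.BatchNorm2d(8)

    def forward(self, x):
        return self.bn(self.conv(x))

model = ConvBN().eval()              # freezing requires eval() mode
scripted = torch.jit.script(model)
frozen = torch.jit.freeze(scripted)  # inlines weights/attributes as constants
print(frozen.graph)                  # Conv-BN can now be folded into a single op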

        + +

        (Beta) PyTorch Profiler

        + +

        The new PyTorch Profiler graduates to beta and leverages Kineto for GPU profiling, TensorBoard for visualization and is now the standard across our tutorials and documentation.

        + +

PyTorch 1.9 extends support for the new torch.profiler API to more builds, including Windows and Mac, and it is recommended in most cases instead of the previous torch.autograd.profiler API. The new API supports existing profiler features, integrates with the CUPTI library (Linux-only) to trace on-device CUDA kernels, and provides support for long-running jobs, e.g.:

        + +
import torch
from torch.profiler import profile, ProfilerActivity

# `model` and `inputs` are assumed to be defined elsewhere
def trace_handler(p):
    output = p.key_averages().table(sort_by="self_cuda_time_total", row_limit=10)
    print(output)
    p.export_chrome_trace("/tmp/trace_" + str(p.step_num) + ".json")

with profile(
    activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
    # schedule argument specifies the iterations on which the profiler is active
    schedule=torch.profiler.schedule(
        wait=1,
        warmup=1,
        active=2),
    # on_trace_ready argument specifies the handler for the traces
    on_trace_ready=trace_handler
) as p:
    for idx in range(8):
        model(inputs)
        # profiler will trace iterations 2 and 3, and then 6 and 7 (counting from zero)
        p.step()
        + +

        More usage examples can be found on the profiler recipe page.

        + +

        The PyTorch Profiler Tensorboard plugin has new features for:

        • Distributed Training summary view with communications overview for NCCL
        • GPU Utilization and SM Efficiency in Trace view and GPU operators view
        • Memory Profiling view
        • Jump to source when launched from Microsoft VSCode
        • Ability to load traces from cloud object storage systems
        + +

        (Beta) Inference Mode API

        + +

        Inference Mode API allows significant speed-up for inference workloads while remaining safe and ensuring no incorrect gradients can ever be computed. It offers the best possible performance when no autograd is required. For more details, refer to the documentation for inference mode itself and the documentation explaining when to use it and the difference with no_grad mode.
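For example, a minimal sketch of wrapping an inference call (the model and input here are placeholders):

import torch

model = torch.nn.Linear(128, 10).eval()
x = torch.randn(32, 128)

# Tensors created inside the block are "inference tensors": autograd never
# records them, avoiding version-counter and view-tracking overhead.
with torch.inference_mode():
    out = model(x)

print(out.requires_grad)  # False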

        + +

        (Beta) torch.package

        + +

        torch.package is a new way to package PyTorch models in a self-contained, stable format. A package will include both the model’s data (e.g. parameters, buffers) and its code (model architecture). Packaging a model with its full set of Python dependencies, combined with a description of a conda environment with pinned versions, can be used to easily reproduce training. Representing a model in a self-contained artifact will also allow it to be published and transferred throughout a production ML pipeline while retaining the flexibility of a pure-Python representation. For more details, refer to the documentation.
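A minimal sketch of packaging and re-importing a model with torch.package (the archive name and the choice to extern torch are illustrative assumptions):

import torch
from torch.package import PackageExporter, PackageImporter

model = torch.nn.Linear(4, 4)

# Write the model's code and weights into one self-contained archive.
with PackageExporter("linear_model.pt") as exporter:
    exporter.extern("torch.**")  # resolve torch from the loading environment
    exporter.save_pickle("model", "model.pkl", model)

# Load it back, potentially in a different process or environment.
importer = PackageImporter("linear_model.pt")
loaded = importer.load_pickle("model", "model.pkl")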

        + +

        (Prototype) prepare_for_inference

        + +

prepare_for_inference is a new prototype feature that takes in a module and performs graph-level optimizations to improve inference performance, depending on the device. It is meant to be a PyTorch-native option that requires minimal changes to users' workflows. For more details, see the documentation for the TorchScript version here or the FX version here.

        + +

        (Prototype) Profile-directed typing in TorchScript

        + +

TorchScript has a hard requirement for source code to have type annotations in order for compilation to be successful. For a long time, it was only possible to add missing or incorrect type annotations through trial and error (i.e., by fixing the type-checking errors generated by torch.jit.script one by one), which was inefficient and time-consuming. Now, we have enabled profile-directed typing for torch.jit.script by leveraging existing tools like MonkeyType, which makes the process much easier, faster, and more efficient. For more details, refer to the documentation.

        + +

        Thanks for reading. If you’re interested in these updates and want to join the PyTorch community, we encourage you to join the discussion forums and open GitHub issues. To get the latest news from PyTorch, follow us on Facebook, Twitter, Medium, YouTube, or LinkedIn.

        + +

        Cheers!

        + +

        Team PyTorch

diff --git a/blog/pytorch-2-1/index.html b/blog/pytorch-2-1/index.html
new file mode 100644
index 000000000000..98267c55bb9b
--- /dev/null
+++ b/blog/pytorch-2-1/index.html
PyTorch 2.1: automatic dynamic shape compilation, distributed checkpointing | PyTorch

        + by + + Team PyTorch + +

        +

        We are excited to announce the release of PyTorch® 2.1 (release note)! PyTorch 2.1 offers automatic dynamic shape support in torch.compile, torch.distributed.checkpoint for saving/loading distributed training jobs on multiple ranks in parallel, and torch.compile support for the NumPy API.

        + +

        In addition, this release offers numerous performance improvements (e.g. CPU inductor improvements, AVX512 support, scaled-dot-product-attention support) as well as a prototype release of torch.export, a sound full-graph capture mechanism, and torch.export-based quantization.

        + +

        Along with 2.1, we are also releasing a series of updates to the PyTorch domain libraries. More details can be found in the library updates blog. 

        + +

This release is composed of 6,682 commits from 784 contributors since 2.0. We want to sincerely thank our dedicated community for your contributions. As always, we encourage you to try these out and report any issues as we improve 2.1. More information about how to get started with the PyTorch 2-series can be found at our Getting Started page.

        + +

        Summary: 

        • torch.compile now includes automatic support for detecting and minimizing recompilations due to tensor shape changes using automatic dynamic shapes.
        • torch.distributed.checkpoint enables saving and loading models from multiple ranks in parallel, as well as resharding due to changes in cluster topology.
        • torch.compile can now compile NumPy operations via translating them into PyTorch-equivalent operations.
        • torch.compile now includes improved support for Python 3.11.
        • New CPU performance features include inductor improvements (e.g. bfloat16 support and dynamic shapes), AVX512 kernel support, and scaled-dot-product-attention kernels.
        • torch.export, a sound full-graph capture mechanism, is introduced as a prototype feature, as well as torch.export-based quantization.
        • torch.sparse now includes prototype support for semi-structured (2:4) sparsity on NVIDIA® GPUs.
        Stable | Beta | Prototype | Performance Improvements
         | Automatic Dynamic Shapes | torch.export() | AVX512 kernel support
         | torch.distributed.checkpoint | torch.export-based Quantization | CPU optimizations for scaled-dot-product-attention (SDPA)
         | torch.compile + NumPy | semi-structured (2:4) sparsity | CPU optimizations for bfloat16
         | torch.compile + Python 3.11 | cpp_wrapper for torchinductor |
         | torch.compile + autograd.Function |  |
         | third-party device integration: PrivateUse1 |  |
        + +

        *To see a full list of public 2.1, 2.0, and 1.13 feature submissions click here.

        + +

        Beta Features

        + +

        (Beta) Automatic Dynamic Shapes

        + +

        Dynamic shapes is functionality built into torch.compile that can minimize recompilations by tracking and generating code based on the symbolic shape of a tensor rather than the static shape (e.g. [B, 128, 4] rather than [64, 128, 4]). This allows torch.compile to generate a single kernel that can work for many sizes, at only a modest cost to efficiency. Dynamic shapes has been greatly stabilized in PyTorch 2.1, and is now automatically enabled if torch.compile notices recompilation due to varying input shapes. You can disable automatic dynamic by passing dynamic=False to torch.compile, or by setting torch._dynamo.config.automatic_dynamic_shapes = False.
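A small sketch of the behavior described above (the function and shapes are arbitrary):

import torch

@torch.compile  # automatic dynamic shapes is on by default in 2.1
def f(x):
    return torch.nn.functional.relu(x) * x.shape[0]

f(torch.randn(8, 128))   # first call compiles with static shapes
f(torch.randn(16, 128))  # shape changed: recompiles with a symbolic batch dimension
f(torch.randn(32, 128))  # further batch sizes reuse the dynamic kernel

# Opt out and force static-shape specialization instead:
g = torch.compile(lambda x: x.sin(), dynamic=False)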

        + +

        In PyTorch 2.1, we have shown good performance with dynamic shapes enabled on a variety of model types, including large language models, on both CUDA and CPU.

        + +

        For more information on dynamic shapes, see this documentation.

        + +

        [Beta] torch.distributed.checkpoint

        + +

        torch.distributed.checkpoint enables saving and loading models from multiple ranks in parallel. In addition, checkpointing automatically handles fully-qualified-name (FQN) mappings across models and optimizers, enabling load-time resharding across differing cluster topologies.
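A minimal sketch of the save/load flow (assuming the default process group is already initialized and `model` is the module being checkpointed; the checkpoint path is arbitrary):

import torch.distributed.checkpoint as dcp

state_dict = {"model": model.state_dict()}

# every rank writes its shard of the state in parallel
dcp.save_state_dict(
    state_dict=state_dict,
    storage_writer=dcp.FileSystemWriter("/tmp/checkpoint"),
)

# later, possibly with a different number of ranks, load (and reshard) in place
dcp.load_state_dict(
    state_dict=state_dict,
    storage_reader=dcp.FileSystemReader("/tmp/checkpoint"),
)
model.load_state_dict(state_dict["model"])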

        + +

        For more information, see torch.distributed.checkpoint documentation and tutorial.

        + +

        [Beta] torch.compile + NumPy

        + +

        torch.compile now understands how to compile NumPy operations via translating them into PyTorch-equivalent operations.  Because this integration operates in a device-agnostic manner, you can now GPU-accelerate NumPy programs – or even mixed NumPy/PyTorch programs – just by using torch.compile.
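For example, a plain NumPy function can be compiled directly (a small sketch; the function itself is arbitrary):

import numpy as np
import torch

@torch.compile
def numpy_fn(x: np.ndarray, y: np.ndarray) -> np.ndarray:
    # ordinary NumPy code: under torch.compile it is traced into PyTorch ops
    # and can therefore run on an available GPU
    return np.sum(x * y, axis=-1)

x = np.random.randn(1024, 64).astype(np.float32)
print(numpy_fn(x, x).shape)  # (1024,)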

        + +

        Please see this section in the torch.compile FAQ for more information about torch.compile + NumPy interaction, and follow the PyTorch Blog for a forthcoming blog about this feature.

        + +

        [Beta] torch.compile + Python 3.11

        + +

        torch.compile previously only supported Python versions 3.8-3.10. Users can now optimize models with torch.compile in Python 3.11.

        + +

        [Beta] torch.compile + autograd.Function

        + +

torch.compile can now trace and optimize the backward function of user-defined autograd Functions, which unlocks training optimizations for models that make heavier use of extension mechanisms.
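A small sketch of what this enables (the custom Function below is a toy example):

import torch

class MyReLU(torch.autograd.Function):
    @staticmethod
    def forward(ctx, x):
        ctx.save_for_backward(x)
        return x.clamp(min=0)

    @staticmethod
    def backward(ctx, grad_out):
        (x,) = ctx.saved_tensors
        return grad_out * (x > 0)

@torch.compile
def f(x):
    return MyReLU.apply(x).sum()

x = torch.randn(8, requires_grad=True)
f(x).backward()  # both the forward and the custom backward are traced and optimized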

        + +

        [Beta] Improved third-party device support: PrivateUse1

        + +

        Third-party device types can now be registered to PyTorch using the privateuse1 dispatch key.  This allows device extensions to register new kernels to PyTorch and to associate them with the new key, allowing user code to work equivalently to built-in device types.  For example, to register “my_hardware_device”, one can do the following:

        + +
# assumes a backend for "my_hardware_device" has been registered by a PyTorch device extension
import torch

torch.rename_privateuse1_backend("my_hardware_device")
torch.utils.generate_methods_for_privateuse1_backend()
x = torch.randn((2, 3), device='my_hardware_device')
y = x + x  # runs the add kernel on 'my_hardware_device'
        + +

        To validate this feature, the OSS team from Ascend NPU has successfully integrated torch_npu into pytorch as a plug-in through the PrivateUse1 functionality.

        + +

        For more information, please see the PrivateUse1 tutorial here.

        + +

        Prototype Features

        + +

        [Prototype] torch.export()

        + +

        torch.export() provides a sound tracing mechanism to capture a full graph from a PyTorch program based on new technologies provided by PT2.0.

        + +

        Users can extract a clean representation (Export IR) of a PyTorch program in the form of a dataflow graph, consisting of mostly straight-line calls to PyTorch operators. Export IR can then be transformed, serialized, saved to file, transferred, loaded back for execution in an environment with or without Python.
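A minimal sketch of capturing a module with the prototype API (the module is a toy example):

import torch

class M(torch.nn.Module):
    def forward(self, x):
        return torch.relu(x) + 1

example_inputs = (torch.randn(3, 4),)
ep = torch.export.export(M(), example_inputs)  # returns an ExportedProgram
print(ep)  # Export IR: a flat dataflow graph of ATen operator calls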

        + +

        For more information, please see the tutorial here.

        + +

        [Prototype] torch.export-based Quantization

        + +

        torch.ao.quantization now supports quantization on PyTorch 2 torch.export-based flows.  This includes support for built-in XNNPACK and X64Inductor Quantizer, as well as the ability to specify one’s own Quantizer.

        + +

        For an explanation on post-training static quantization with torch.export, see this tutorial, for quantization-aware training for static quantization with torch.export, see this tutorial.

        + +

        For an explanation on how to write one’s own Quantizer, see this tutorial.

        + +

        [Prototype] semi-structured (2:4) sparsity for NVIDIA® GPUs

        + +

torch.sparse now supports creating and accelerating compute over semi-structured sparse (2:4) tensors. For more information on the format, see this blog from NVIDIA. A minimal example introducing semi-structured sparsity is as follows:

        + +
import torch
from torch import nn
from torch.sparse import to_sparse_semi_structured

x = torch.rand(64, 64).half().cuda()
# 2:4 pattern: two non-zero values in every group of four elements
mask = torch.tensor([0, 0, 1, 1]).tile((64, 16)).cuda().bool()
linear = nn.Linear(64, 64).half().cuda()

linear.weight = nn.Parameter(to_sparse_semi_structured(linear.weight.masked_fill(~mask, 0)))
linear(x)
        + +

        To learn more, please see the documentation and accompanying tutorial.

        + +

        [Prototype] cpp_wrapper for torchinductor

        + +

        cpp_wrapper can reduce the Python overhead for invoking kernels in torchinductor by generating the kernel wrapper code in C++. This feature is still in the prototype phase; it does not support all programs that successfully compile in PT2 today. Please file issues if you discover limitations for your use case to help us prioritize.

        + +

        The API to turn this feature on is:

        +
import torch
import torch._inductor.config as config
config.cpp_wrapper = True
        +
        + +

        For more information, please see the tutorial.

        + +

        Performance Improvements

        + +

        AVX512 kernel support

        + +

        In PyTorch 2.0, AVX2 kernels would be used even if the CPU supported AVX512 instructions.  Now, PyTorch defaults to using AVX512 CPU kernels if the CPU supports those instructions, equivalent to setting ATEN_CPU_CAPABILITY=avx512 in previous releases.  The previous behavior can be enabled by setting ATEN_CPU_CAPABILITY=avx2.

        + +

        CPU optimizations for scaled-dot-product-attention (SDPA)

        + +

Previous versions of PyTorch provided optimized CUDA implementations for transformer primitives via torch.nn.functional.scaled_dot_product_attention. PyTorch 2.1 includes optimized FlashAttention-based CPU routines.

        + +

        See the documentation here.

        + +

        CPU optimizations for bfloat16

        + +

        PyTorch 2.1 includes CPU optimizations for bfloat16, including improved vectorization support and torchinductor codegen.

        + +
diff --git a/blog/pytorch-2-7-intel-gpus/index.html b/blog/pytorch-2-7-intel-gpus/index.html
new file mode 100644
index 000000000000..f8908107fab1
--- /dev/null
+++ b/blog/pytorch-2-7-intel-gpus/index.html

        April 25, 2025

        Accelerate PyTorch 2.7 on Intel® GPUs

        + by + + the Intel PyTorch Team + +

        +

PyTorch 2.7 continues to deliver significant functionality and performance enhancements on Intel® GPU architectures to streamline AI workflows. Application developers and researchers seeking to fine-tune, run inference on, and develop PyTorch models on Intel GPUs will now have a consistent user experience across various operating systems, including Windows, Linux and Windows Subsystem for Linux (WSL2). This is made possible through improved installation, eager mode script debugging, a performance profiler, and graph model (torch.compile) deployment. As a result, developers have greater options with a unified GPU programming paradigm for both front-end and back-end development.

        + +

        Incremental improvements of Intel GPU support in PyTorch

        + +

        Since PyTorch 2.4, we’ve made steady improvements to Intel GPU support with each release. With PyTorch 2.7, we are excited to share that we have established a solid foundation to have Intel GPU work in both graph mode (torch.compile) and eager mode on Windows and Linux. This includes a wide range of Intel GPU products, many of which you may already access. We hope these enhancements will unlock more ubiquitous hardware for your AI research and development.

        + + + +

        Check out the detailed advancements in these related release blogs: PyTorch 2.4, PyTorch 2.5, and PyTorch 2.6.

        + +

        What’s New in PyTorch 2.7

        + +

        These are the features in PyTorch 2.7 that were added to help accelerate performance on Intel GPUs.

        + +
        • Improve scaled dot-product attention (SDPA) inference performance with bfloat16 and float16 to accelerate attention-based models on Intel GPUs. With the new SDPA optimization for Intel GPUs on PyTorch 2.7, Stable Diffusion float16 inference achieved up to a 3x gain over the PyTorch 2.6 release on Intel® Arc™ B580 Graphics and Intel® Core™ Ultra 7 Processor 258V with Intel® Arc™ Graphics 140V in eager mode. See Figure 1 below.
        + +


        + +

        Figure 1. PyTorch 2.7 Stable Diffusion Performance Gains Over PyTorch 2.6

        + +
        • Enable torch.compile on Windows 11 for Intel GPUs, delivering the same performance advantages over eager mode as on Linux. With this, Intel GPUs became the first accelerator to support torch.compile on Windows. Refer to the Windows tutorial for details. The latest performance data, measured with the PyTorch Dynamo Benchmarking Suite using Intel® Arc™ B580 Graphics on Windows, showcases the torch.compile speedup ratio over eager mode, as shown in Figure 2. Both training and inference achieved similar significant improvements.
        + +


        + +

        Figure 2. Torch.compile Performance Gains Over Eager Mode on Windows

        + +
          +
        • Optimize the performance of PyTorch 2 Export Post Training Quantization (PT2E) on Intel GPU to provide full graph mode quantization pipelines with enhanced computational efficiency. Refer to PT2E tutorial for details.
        • +
        • Enable AOTInductor and torch.export on Linux to simplify deployment workflows. Refer to AOTInductor tutorial for details.
        • +
        • Enable profiler on both Windows and Linux to facilitate model performance analysis. Refer to the PyTorch profiler tutorial for details.
        • +
        + +

        Review the Getting Started on Intel GPU Guide for a tour of the environment setup and a quick start on Intel GPUs.

        + +

        Future Work

        + +

        Looking ahead, we will continue the Intel GPU upstream efforts in future PyTorch releases to:

        + +
          +
        • Attain state-of-the-art PyTorch-native performance to showcase competitive GEMM computational efficiency for torch.compile, and enhance performance for LLM models through FlexAttention and lower precision data types.
        • +
        • Broaden feature compatibility by delivering distributed XCCL backend support for Intel® Data Center GPU Max Series.
        • +
        • Expand accelerator support across core PyTorch ecosystem components including torchao, torchtune, and torchtitan.
        • +
        + +

        Follow along in the PyTorch Dev Discussion to learn more about Intel GPU & CPU enabling status and features. As we get further along, we will create tickets on GitHub to document our progress.

        + +

        Summary

        + +

        In this blog, we reviewed the Intel GPU upstream progress starting in PyTorch 2.4 and highlighted the new features of PyTorch 2.7 that accelerate AI workload performance across various Intel GPUs. These new features, especially SDPA on Windows, achieved up to 3x inference (Stable Diffusion, float16) gain over PyTorch 2.6 release on Intel Arc B580 Graphics and Intel Core Ultra 7 Processor 258V with Intel Arc Graphics 140V. Also, torch.compile on Windows delivers similar performance advantages over eager mode on Dynamo benchmarks as on Linux.

        + +

        Acknowledgments

        + +

        We want to thank the following PyTorch maintainers for their technical discussions and insights: Nikita Shulga, Jason Ansel, Andrey Talman, Alban Desmaison, and Bin Bao.

        + +

        We also thank collaborators from PyTorch for their professional support and guidance.

        + +

        Product and Performance Information

        + +

        Measurement on Intel Core Ultra 7 258V: 2200 MHz, 8 Core(s), 8 Logical Processor(s) with Intel Arc 140V GPU (16GB), GPU memory 18.0 GB, using Intel Graphics Driver 32.0.101.6647 (WHQL Certified), Windows 11 Pro - 24H2. And Intel Core Ultra 5 245KF: 4200 MHz, 14 Core(s), 14 Logical Processor(s), Intel Arc B580 Graphics, dedicated GPU memory 12.0 GB, shared GPU memory 15.8 GB, using Intel Graphics Driver 32.0.101.6647 (WHQL Certified), Windows 11 Enterprise LTSC - 24H2. Test by Intel on Apr 8th, 2025.

        + +

        Notices and Disclaimers

        + +

        Performance varies by use, configuration and other factors. Learn more on the Performance Index site. Performance results are based on testing as of dates shown in configurations and may not reflect all publicly available updates.  See backup for configuration details.  No product or component can be absolutely secure. Your costs and results may vary. Intel technologies may require enabled hardware, software or service activation.

        + +

        Intel Corporation. Intel, the Intel logo, and other Intel marks are trademarks of Intel Corporation or its subsidiaries. Other names and brands may be claimed as the property of others.

        + +

        AI Disclaimer

        + +

        AI features may require software purchase, subscription or enablement by a software or platform provider, or may have specific configuration or compatibility requirements. Details at www.intel.com/AIPC. Results may vary.

        + +
diff --git a/blog/pytorch-2-7/index.html b/blog/pytorch-2-7/index.html
new file mode 100644
index 000000000000..c99f25779361
--- /dev/null
+++ b/blog/pytorch-2-7/index.html

        April 23, 2025

        PyTorch 2.7 Release

        + by + + Team PyTorch + +

        +

        We are excited to announce the release of PyTorch® 2.7 (release notes)! This release features:

        + +
        • support for the NVIDIA Blackwell GPU architecture and pre-built wheels for CUDA 12.8 across Linux x86 and arm64 architectures.
        • torch.compile support for Torch Function Modes, which enables users to override any torch.* operation to implement custom user-defined behavior.
        • Mega Cache, which allows users to have end-to-end portable caching for torch.
        • new features for FlexAttention: LLM first token processing, LLM throughput mode optimization, and Flex Attention for inference.
        + +

        This release is composed of 3262 commits from 457 contributors since PyTorch 2.6. We want to sincerely thank our dedicated community for your contributions. As always, we encourage you to try these out and report any issues as we improve 2.7. More information about how to get started with the PyTorch 2-series can be found at our Getting Started page.

        Beta | Prototype
        Torch.Compile support for Torch Function Modes | NVIDIA Blackwell Architecture Support
        Mega Cache | PyTorch Native Context Parallel
         | Enhancing Intel GPU Acceleration
         | FlexAttention LLM first token processing on x86 CPUs
         | FlexAttention LLM throughput mode optimization on x86 CPUs
         | Foreach Map
         | Flex Attention for Inference
         | Prologue Fusion Support in Inductor
        + +

        *To see a full list of public feature submissions click here.

        + +

        BETA FEATURES

        + +

        [Beta] Torch.Compile support for Torch Function Modes

        + +

This feature enables users to override any torch.* operation to implement custom user-defined behavior. For example, ops can be rewritten to accommodate a specific backend. This is used in FlexAttention to rewrite indexing ops.

        + +

        See the tutorial for more information.

        + +

        [Beta] Mega Cache

        + +

        Mega Cache allows users to have end-to-end portable caching for torch. The intended use case is after compiling and executing a model, the user calls torch.compiler.save_cache_artifacts() which will return the compiler artifacts in a portable form. Later, potentially on a different machine, the user may call torch.compiler.load_cache_artifacts() with these artifacts to pre-populate the torch.compile caches in order to jump-start their cache.
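A rough sketch based on the two calls named above (the exact return value of save_cache_artifacts() is an assumption here; see the tutorial for the authoritative signature):

import torch

model = torch.compile(torch.nn.Linear(64, 64))
model(torch.randn(8, 64))  # compile and run once to populate the caches

artifacts = torch.compiler.save_cache_artifacts()  # portable compiler artifacts
if artifacts is not None:
    artifact_bytes, cache_info = artifacts  # assumed (bytes, metadata) pair
    # later, potentially on a different machine:
    torch.compiler.load_cache_artifacts(artifact_bytes)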

        + +

        See the tutorial for more information.

        + +

        PROTOTYPE FEATURES

        + +

        [Prototype] NVIDIA Blackwell Architecture Support

        + +

        PyTorch 2.7 introduces support for NVIDIA’s new Blackwell GPU architecture and ships pre-built wheels for CUDA 12.8. For more details on CUDA 12.8 see CUDA Toolkit Release.

        + +
        • Core components and libraries including cuDNN, NCCL, and CUTLASS have been upgraded to ensure compatibility with Blackwell platforms.
        • PyTorch 2.7 includes Triton 3.3, which adds support for the Blackwell architecture with torch.compile compatibility.
        • To utilize these new features, install PyTorch with CUDA 12.8 using: pip install torch==2.7.0 --index-url https://download.pytorch.org/whl/cu128
        + +

        More context can also be found here.

        + +

        [Prototype] PyTorch Native Context Parallel

        + +

The PyTorch Context Parallel API allows users to create a Python context so that every torch.nn.functional.scaled_dot_product_attention() call within it will run with context parallelism. Currently, PyTorch Context Parallel supports 3 attention backends: 1. Flash attention; 2. Efficient attention; and 3. cuDNN attention.

        + +

        As an example, this is used within TorchTitan as the Context Parallel solution for LLM training.

        + +

        See tutorial here.

        + +

        [Prototype] Enhancing Intel GPU Acceleration

        + +

        This latest release introduces enhanced performance optimizations for Intel GPU architectures. These improvements accelerate workloads across various Intel GPUs through the following key enhancements:

        + +
        • Enable torch.compile on Windows 11 for Intel GPUs, delivering the same performance advantages over eager mode as on Linux.
        • Optimize the performance of PyTorch 2 Export Post Training Quantization (PT2E) on Intel GPU to provide a full graph mode quantization pipeline with enhanced computational efficiency.
        • Improve Scaled Dot-Product Attention (SDPA) inference performance with bfloat16 and float16 to accelerate attention-based models on Intel GPUs.
        • Enable AOTInductor and torch.export on Linux to simplify deployment workflows.
        • Implement more ATen operators to enhance the continuity of operator execution on Intel GPU and increase eager mode performance on Intel GPU.
        • Enable the profiler on both Windows and Linux to facilitate model performance analysis.
        • Expand Intel GPU support to Intel® Core™ Ultra Series 2 with Intel® Arc™ Graphics, and Intel® Arc™ B-Series graphics on both Windows and Linux.
        + +

        For more information regarding Intel GPU support, please refer to Getting Started Guide.

        + +

        See also the tutorials here and here.

        + +

        [Prototype] FlexAttention LLM first token processing on x86 CPUs

        + +

        FlexAttention x86 CPU support was first introduced in PyTorch 2.6, offering optimized implementations — such as PageAttention, which is critical for LLM inference—via the TorchInductor C++ backend. In PyTorch 2.7, more attention variants for first token processing of LLMs are supported. With this feature, users can have a smoother experience running FlexAttention on x86 CPUs, replacing specific scaled_dot_product_attention operators with a unified FlexAttention API, and benefiting from general support and good performance when using torch.compile.

        + +

        [Prototype] FlexAttention LLM throughput mode optimization

        + +

        The performance of FlexAttention on x86 CPUs for LLM inference throughput scenarios has been further improved by adopting the new C++ micro-GEMM template ability. This addresses the performance bottlenecks for large batch size scenarios present in PyTorch 2.6. With this enhancement, users can transparently benefit from better performance and a smoother experience when using FlexAttention APIs and torch.compile for LLM throughput serving on x86 CPUs.

        + +

        [Prototype] Foreach Map

        + +

This feature uses torch.compile to allow users to apply any pointwise or user-defined function (e.g. torch.add) to lists of tensors, akin to the existing torch._foreach_* ops. The main advantage over the existing torch._foreach_* ops is that any mix of scalars or lists of tensors can be supplied as arguments, and even user-defined Python functions can be lifted to apply to lists of tensors. torch.compile will automatically generate a horizontally fused kernel for optimal performance.

        + +

        See tutorial here.

        + +

        [Prototype] Flex Attention for Inference

        + +

In release 2.5.0, FlexAttention (torch.nn.attention.flex_attention) was introduced for ML researchers who’d like to customize their attention kernels without writing kernel code. This update introduces a decoding backend optimized for inference, supporting GQA and PagedAttention, along with feature updates including nested jagged tensor support, performance tuning guides and trainable biases support.

        + +

        [Prototype] Prologue Fusion Support in Inductor

        + +

        Prologue fusion optimizes matrix multiplication (matmul) operations by fusing operations that come before the matmul into the matmul kernel itself, improving performance by reducing global memory bandwidth.

        + +
diff --git a/blog/pytorch-2-paper-tutorial/index.html b/blog/pytorch-2-paper-tutorial/index.html
new file mode 100644
index 000000000000..f6b890fd96f9
--- /dev/null
+++ b/blog/pytorch-2-paper-tutorial/index.html

        February 06, 2024

        PyTorch 2 paper and tutorial @ ASPLOS 2024

        + by + + Team PyTorch + +

        +

        The PyTorch team is excited to share that our paper on PyTorch 2 has been accepted for presentation at the ACM International Conference on Architectural Support for Programming Languages and Operating Systems (ASPLOS), scheduled to take place from April 27 to May 1, 2024, in San Diego, CA, USA.

        + +

        The paper delves into the implementation of torch.compile and highlights the key technologies driving it, including TorchDynamo (graph capture), TorchInductor (backend compiler), and Dynamic Shape support.

        + +

        During the ASPLOS conference, we’ll be conducting a tutorial on Saturday, April 27, focusing on the inner workings of PyTorch 2 and how systems researchers can leverage and build upon it. Stay tuned for more details as the event approaches – we look forward to your participation!

        + +

        A preview of the paper is attached below:

        + +

        Title: PyTorch 2: Faster Machine Learning Through Dynamic Python Bytecode Transformation and Graph Compilation. Full Paper PDF

        + +

        Abstract

        +

        This paper introduces two extensions to the popular PyTorch machine learning framework, TorchDynamo and TorchInductor, which implement the torch.compile feature released in PyTorch 2. TorchDynamo is a Python-level just-in-time (JIT) compiler that enables graph compilation in PyTorch programs without sacrificing the flexibility of Python. It achieves this by dynamically modifying Python bytecode before execution and extracting sequences of PyTorch operations into an FX graph, which is then JIT compiled using one of many extensible backends. TorchInductor is the default compiler backend for TorchDynamo, which translates PyTorch programs into OpenAI’s Triton for GPUs and C++ for CPUs. Results show that TorchDynamo is able to capture graphs more robustly than prior approaches while adding minimal overhead, and TorchInductor is able to provide a 2.27x inference and 1.41x training geometric mean speedup on an NVIDIA A100 GPU across 180+ real-world models, which outperforms six other compilers. These extensions provide a new way to apply optimizations through compilers in eager mode frameworks like PyTorch.

        + +

        Authors

        + +

        Jason Ansel (Meta); Edward Yang (Meta); Horace He (Meta); Natalia Gimelshein (OpenAI); Animesh Jain (Meta); Michael Voznesensky (Meta); Bin Bao (Meta); Peter Bell (Quansight); David Berard (Meta); Evgeni Burovski Quansight; Geeta Chauhan (Meta); Anjali Chourdia (Meta); Will Constable (Meta); Alban Desmaison (Meta); Zachary DeVito (Meta); Elias Ellison (Meta); Will Feng (Meta); Jiong Gong (Intel); Michael Gschwind (Meta); Brian Hirsh (Meta); Sherlock Huang (Meta); Kshiteej Kalambarkar (Quansight); Laurent Kirsch (Meta); Michael Lazos (Meta); Mario Lezcano (Quansight); Yanbo Liang (Meta); Jason Liang (Meta); Yinghai Lu (Meta); CK Luk (Meta); Bert Maher (Meta); Yunjie Pan (University of Michigan); Christian Puhrsch (Meta); Matthias Reso (Meta); Mark Saroufim (Meta); Marcos Yukio Siraichi (Quansight); Helen Suk (Meta); Michael Suo (Meta); Phil Tillet (OpenAI); Eikan Wang (Intel); Xiaodong Wang (Meta); William Wen (Meta); Shunting Zhang (Meta); Xu Zhao (Meta); Keren Zhou (OpenAI & George Mason University); Richard Zou (Meta); Ajit Mathews (Meta); Gregory Chanan (Meta); Peng Wu (Meta); Soumith Chintala (Meta)

        + +

        ASPLOS’24 - Full Day Tutorial Schedule

        + +

Full schedule for the ASPLOS’24 PyTorch 2 Tutorial on Saturday, April 27th is available here

        + +
diff --git a/blog/pytorch-2.0-release/index.html b/blog/pytorch-2.0-release/index.html
new file mode 100644
index 000000000000..db84b184157f
--- /dev/null
+++ b/blog/pytorch-2.0-release/index.html
PyTorch 2.0: Our next generation release that is faster, more Pythonic and Dynamic as ever | PyTorch

        + by + + Team PyTorch + +

        +

        We are excited to announce the release of PyTorch® 2.0 which we highlighted during the PyTorch Conference on 12/2/22! PyTorch 2.0 offers the same eager-mode development and user experience, while fundamentally changing and supercharging how PyTorch operates at compiler level under the hood with faster performance and support for Dynamic Shapes and Distributed.

        + +

This next-generation release includes a Stable version of Accelerated Transformers (formerly called Better Transformer); Beta includes torch.compile as the main API for PyTorch 2.0, the scaled_dot_product_attention function as part of torch.nn.functional, the MPS backend, and functorch APIs in the torch.func module; and other Beta/Prototype improvements across various inference, performance and training optimization features on GPUs and CPUs. For a comprehensive introduction and technical overview of torch.compile, please visit the 2.0 Get Started page.

        + +

        Along with 2.0, we are also releasing a series of beta updates to the PyTorch domain libraries, including those that are in-tree, and separate libraries including TorchAudio, TorchVision, and TorchText. An update for TorchX is also being released as it moves to community supported mode. More details can be found in this library blog.

        + +

        This release is composed of over 4,541 commits and 428 contributors since 1.13.1. We want to sincerely thank our dedicated community for your contributions. As always, we encourage you to try these out and report any issues as we improve 2.0 and the overall 2-series this year.

        + +

        Summary:

        +
        • torch.compile is the main API for PyTorch 2.0, which wraps your model and returns a compiled model. It is a fully additive (and optional) feature and hence 2.0 is 100% backward compatible by definition.
        • As an underpinning technology of torch.compile, TorchInductor with NVIDIA and AMD GPUs will rely on the OpenAI Triton deep learning compiler to generate performant code and hide low-level hardware details. OpenAI Triton-generated kernels achieve performance that’s on par with hand-written kernels and specialized CUDA libraries such as cuBLAS.
        • Accelerated Transformers introduce high-performance support for training and inference using a custom kernel architecture for scaled dot product attention (SDPA). The API is integrated with torch.compile() and model developers may also use the scaled dot product attention kernels directly by calling the new scaled_dot_product_attention() operator.
        • Metal Performance Shaders (MPS) backend provides GPU-accelerated PyTorch training on Mac platforms with added support for the Top 60 most used ops, bringing coverage to over 300 operators.
        • Amazon AWS optimizes PyTorch CPU inference on AWS Graviton3-based C7g instances. PyTorch 2.0 improves inference performance on Graviton compared to previous releases, including improvements for ResNet50 and BERT.
        • New prototype features and technologies across TensorParallel, DTensor, 2D parallel, TorchDynamo, AOTAutograd, PrimTorch and TorchInductor.
        Stable | Beta | Prototype | Performance Improvements
        Accelerated PT 2 Transformers | torch.compile | DTensor | CUDA support for 11.7 & 11.8 (deprecating CUDA 11.6)
         | PyTorch MPS Backend | TensorParallel | Python 3.8 (deprecating Python 3.7)
         | Scaled dot product attention | 2D Parallel | AWS Graviton3
         | functorch | torch.compile (dynamic=True) |
         | Dispatchable Collectives |  |
         | torch.set_default & torch.device |  |
         | X86 quantization backend |  |
         | GNN inference and training performance |  |
        + +

        *To see a full list of public 2.0, 1.13 and 1.12 feature submissions click here.

        + +

        Stable Features

        + +

        [Stable] Accelerated PyTorch 2 Transformers

        + +

The PyTorch 2.0 release includes a new high-performance implementation of the PyTorch Transformer API. In releasing Accelerated PT2 Transformers, our goal is to make training and deployment of state-of-the-art Transformer models affordable across the industry. This release introduces high-performance support for training and inference using a custom kernel architecture for scaled dot product attention (SDPA), extending the inference “fastpath” architecture, previously known as “Better Transformer.”

        + +

        Similar to the “fastpath” architecture, custom kernels are fully integrated into the PyTorch Transformer API – thus, using the native Transformer and MultiHeadAttention API will enable users to:

        + +
          +
        • transparently see significant speed improvements;
        • +
        • support many more use cases including models using Cross-Attention, Transformer Decoders, and for training models; and
        • +
        • continue to use fastpath inference for fixed and variable sequence length Transformer Encoder and Self Attention use cases.
        • +
        + +

        To take full advantage of different hardware models and Transformer use cases, multiple SDPA custom kernels are supported (see below), with custom kernel selection logic that will pick the highest-performance kernel for a given model and hardware type. In addition to the existing Transformer API, model developers may also use the scaled dot product attention kernels directly by calling the new scaled_dot_product_attention() operator. Accelerated PyTorch 2 Transformers are integrated with torch.compile() . To use your model while benefiting from the additional acceleration of PT2-compilation (for inference or training), pre-process the model with model = torch.compile(model).

        + +

        We have achieved major speedups for training transformer models and in particular large language models with Accelerated PyTorch 2 Transformers using a combination of custom kernels and torch.compile().

        + +

        alt_text +Figure: Using scaled dot product attention with custom kernels and torch.compile delivers significant speedups for training large language models, such as for nanoGPT shown here.

        + +

        Beta Features

        + +

        [Beta] torch.compile

        + +

        torch.compile is the main API for PyTorch 2.0, which wraps your model and returns a compiled model. It is a fully additive (and optional) feature and hence 2.0 is 100% backward compatible by definition.
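In its simplest form (a minimal sketch; the model is a placeholder):

import torch
import torch.nn as nn

model = nn.Sequential(nn.Linear(128, 256), nn.ReLU(), nn.Linear(256, 10))
compiled_model = torch.compile(model)  # returns an optimized wrapper sharing the same weights

x = torch.randn(16, 128)
out = compiled_model(x)  # the first call triggers compilation; later calls reuse it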

        + +

        Underpinning torch.compile are new technologies – TorchDynamo, AOTAutograd, PrimTorch and TorchInductor:

        +
          +
        • TorchDynamo captures PyTorch programs safely using Python Frame Evaluation Hooks and is a significant innovation that was a result of 5 years of our R&D into safe graph capture.
        • +
        • AOTAutograd overloads PyTorch’s autograd engine as a tracing autodiff for generating ahead-of-time backward traces.
        • +
        • PrimTorch canonicalizes ~2000+ PyTorch operators down to a closed set of ~250 primitive operators that developers can target to build a complete PyTorch backend. This substantially lowers the barrier of writing a PyTorch feature or backend.
        • +
        • TorchInductor is a deep learning compiler that generates fast code for multiple accelerators and backends. For NVIDIA and AMD GPUs, it uses OpenAI Triton as a key building block. For intel CPUs, we generate C++ code using multithreading, vectorized instructions and offloading appropriate operations to mkldnn when possible.
        • +
        + +

With all the new technologies, torch.compile is able to work 93% of the time across 165 open-source models and runs 20% faster on average at float32 precision and 36% faster on average at AMP precision.

        + +

        For more information, please refer to https://pytorch.org/get-started/pytorch-2.0/ and for TorchInductor CPU with Intel here.

        + +

        [Beta] PyTorch MPS Backend

        + +

        MPS backend provides GPU-accelerated PyTorch training on Mac platforms. This release brings improved correctness, stability, and operator coverage.

        + +

        MPS backend now includes support for the Top 60 most used ops, along with the most frequently requested operations by the community, bringing coverage to over 300 operators. The major focus of the release was to enable full OpInfo-based forward and gradient mode testing to address silent correctness issues. These changes have resulted in wider adoption of MPS backend by 3rd party networks such as Stable Diffusion, YoloV5, WhisperAI, along with increased coverage for Torchbench networks and Basic tutorials. We encourage developers to update to the latest macOS release to see the best performance and stability on the MPS backend.

        + +

        Links

        + +
        1. MPS Backend
        2. Developer information
        3. Accelerated PyTorch training on Mac
        4. Metal, Metal Performance Shaders & Metal Performance Shaders Graph
        + +

        [Beta] Scaled dot product attention 2.0

        + +

        We are thrilled to announce the release of PyTorch 2.0, which introduces a powerful scaled dot product attention function as part of torch.nn.functional. This function includes multiple implementations that can be seamlessly applied depending on the input and hardware in use.

        + +

        In previous versions of PyTorch, you had to rely on third-party implementations and install separate packages to take advantage of memory-optimized algorithms like FlashAttention. With PyTorch 2.0, all these implementations are readily available by default.

        + +

        These implementations include FlashAttention from HazyResearch, Memory-Efficient Attention from the xFormers project, and a native C++ implementation that is ideal for non-CUDA devices or when high-precision is required.

        + +

        PyTorch 2.0 will automatically select the optimal implementation for your use case, but you can also toggle them individually for finer-grained control. Additionally, the scaled dot product attention function can be used to build common transformer architecture components.
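A small sketch of calling the new operator and, on CUDA, restricting which kernels may be used (the shapes and dtypes are illustrative):

import torch
import torch.nn.functional as F

device = "cuda" if torch.cuda.is_available() else "cpu"
dtype = torch.float16 if device == "cuda" else torch.float32
q, k, v = (torch.randn(2, 8, 1024, 64, device=device, dtype=dtype) for _ in range(3))

# Automatically dispatches to the fastest available implementation for these inputs.
out = F.scaled_dot_product_attention(q, k, v, is_causal=True)

# Optionally restrict the allowed implementations (CUDA only):
if device == "cuda":
    with torch.backends.cuda.sdp_kernel(enable_flash=True, enable_math=False, enable_mem_efficient=False):
        out = F.scaled_dot_product_attention(q, k, v, is_causal=True)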

        + +

        Learn more with the documentation and this tutorial.

        + +

        [Beta] functorch -> torch.func

        + +

        Inspired by Google JAX, functorch is a library that offers composable vmap (vectorization) and autodiff transforms. It enables advanced autodiff use cases that would otherwise be tricky to express in PyTorch. Examples include:

        + + +

        We’re excited to announce that, as the final step of upstreaming and integrating functorch into PyTorch, the functorch APIs are now available in the torch.func module. Our function transform APIs are identical to before, but we have changed how the interaction with NN modules work. Please see the docs and the migration guide for more details.

        + +

        Furthermore, we have added support for torch.autograd.Function: one is now able to apply function transformations (e.g. vmap, grad, jvp) over torch.autograd.Function.

        + +

        [Beta] Dispatchable Collectives

        + +

        Dispatchable collectives is an improvement to the existing init_process_group() API which changes backend to an optional argument. For users, the main advantage of this feature is that it will allow them to write code that can run on both GPU and CPU machines without having to change the backend specification. The dispatchability feature will also make it easier for users to support both GPU and CPU collectives, as they will no longer need to specify the backend manually (e.g. “NCCL” or “GLOO”). Existing backend specifications by users will be honored and will not require change.

        + +

        Usage example:

import torch.distributed as dist

# old
dist.init_process_group(backend="nccl", ...)
dist.all_reduce(...) # with CUDA tensors works
dist.all_reduce(...) # with CPU tensors does not work

# new
dist.init_process_group(...) # backend is optional
dist.all_reduce(...) # with CUDA tensors works
dist.all_reduce(...) # with CPU tensors works
        + +

        Learn more here.

        + +

        [Beta] torch.set_default_device and torch.device as context manager

        + +

torch.set_default_device allows users to change the default device that factory functions in PyTorch allocate on. For example, if you call torch.set_default_device('cuda'), a call to torch.empty(2) will allocate on CUDA (rather than on CPU). You can also use torch.device as a context manager to change the default device on a local basis. This resolves a long-standing feature request, dating from PyTorch's initial release, for a way to do this.
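A quick sketch of both forms (illustrative only, and assuming a CUDA-capable machine):

import torch

torch.set_default_device("cuda")
print(torch.empty(2).device)   # cuda:0

# Temporarily override the default for a local block of code.
with torch.device("cpu"):
    print(torch.empty(2).device)   # cpu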

        + +

        Learn more here.

        + +

        [Beta] “X86” as the new default quantization backend for x86 CPU

        + +

The new X86 quantization backend, which utilizes FBGEMM and oneDNN kernel libraries, replaces FBGEMM as the default quantization backend for x86 CPU platforms. It offers improved int8 inference performance compared to the original FBGEMM backend by leveraging the strengths of both libraries, with a 1.3X – 2X inference performance speedup measured on 40+ deep learning models. The new backend is functionally compatible with the original FBGEMM backend.

        + +

        Table: Geomean Speedup of X86 Quantization Backend vs. FBGEMM Backend

|  | 1 core/instance | 2 cores/instance | 4 cores/instance | 1 socket (32 cores)/instance |
|---|---|---|---|---|
| Intel(R) Xeon(R) Platinum 8358 CPU @ 2.60GHz | 1.76X | 1.80X | 2.04X | 1.34X |
        + +

        By default, users on x86 platforms will utilize the x86 quantization backend and their PyTorch programs will remain unchanged when using the default backend. Alternatively, users have the option to specify “X86” as the quantization backend explicitly. Example code is shown below:

        + +
import torch
from torch.ao.quantization import get_default_qconfig_mapping
from torch.quantization.quantize_fx import prepare_fx, convert_fx

# get default configuration
qconfig_mapping = get_default_qconfig_mapping()

# or explicitly specify the backend
# qengine = 'x86'
# torch.backends.quantized.engine = qengine
# qconfig_mapping = get_default_qconfig_mapping(qengine)

# construct fp32 model
model_fp32 = ...

# prepare
prepared_model = prepare_fx(model_fp32, qconfig_mapping, example_inputs=x)

# calibrate
...

# convert
quantized_model = convert_fx(prepared_model)
        + +

        Find more information: https://github.com/pytorch/pytorch/issues/83888 and https://www.intel.com/content/www/us/en/developer/articles/technical/accelerate-pytorch-int8-inf-with-new-x86-backend.html.

        + +

        [Beta] GNN inference and training optimization on CPU

        + +

PyTorch 2.0 includes several critical optimizations to improve GNN inference and training performance on CPU. Before 2.0, GNN models from PyG suffered from low efficiency on CPU due to a lack of performance tuning for several critical kernels (scatter/gather, etc.) and the lack of GNN-related sparse matrix multiplication ops. Specifically, the optimizations include:

• scatter_reduce: performance hotspot in Message Passing when the edge index is stored in Coordinate format (COO).
• gather: backward of scatter_reduce, specially tuned for the GNN compute when the index is an expanded tensor.
• torch.sparse.mm with reduce flag: performance hotspot in Message Passing when the edge index is stored in Compressed Sparse Row (CSR) format. Supported reduce flags: sum, mean, amax, amin.

On PyG benchmarks/examples and OGB benchmarks, a 1.12x–4.07x performance speedup is measured for 2.0 compared with 1.13.1 for single-node inference and training.

| Model-Dataset | Option | Speedup Ratio |
|---|---|---|
| GCN-Reddit (inference) | 512-2-64-dense | 1.22x |
| | 1024-3-128-dense | 1.25x |
| | 512-2-64-sparse | 1.31x |
| | 1024-3-128-sparse | 1.68x |
| GraphSage-ogbn-products (inference) | 512-2-64-dense | 1.22x |
| | 1024-3-128-dense | 1.15x |
| | 512-2-64-sparse | 1.20x |
| | 1024-3-128-sparse | 1.33x |
| | full-batch-sparse | 4.07x |
| GCN-PROTEINS (training) | 3-32 | 1.67x |
| GCN-REDDIT-BINARY (training) | 3-32 | 1.67x |
| GCN-Reddit (training) | 512-2-64-dense | 1.20x |
| | 1024-3-128-dense | 1.12x |
        + +

        Learn more: PyG CPU Performance Optimization.

        + +

        [Beta] Accelerating inference on CPU with PyTorch by leveraging oneDNN Graph

        + +

        oneDNN Graph API extends oneDNN with a flexible graph API to maximize the optimization opportunity for generating efficient code on AI hardware.

• It automatically identifies the graph partitions to be accelerated via fusion.
• The fusion patterns focus on fusing compute-intensive operations such as convolution, matmul and their neighbor operations for both inference and training use cases.
• Although work is ongoing to integrate oneDNN Graph with TorchDynamo as well, its integration with the PyTorch JIT Fuser attained beta status in PyTorch 2.0 for Float32 & BFloat16 inference (on machines that support the AVX512_BF16 ISA).

        From a developer’s/researcher’s perspective, the usage is quite simple & intuitive, with the only change in code being an API invocation:

• To leverage oneDNN Graph with JIT tracing, a model is profiled with an example input.
• The context manager with torch.jit.fuser("fuser3"): can also be used instead of invoking torch.jit.enable_onednn_fusion(True).
• For accelerating BFloat16 inference, we rely on eager-mode AMP (Automatic Mixed Precision) support in PyTorch and disable JIT mode's AMP, as the two are currently divergent:
import torch

# Assuming we have a model of the name 'model'
example_input = torch.rand(1, 3, 224, 224)

# enable oneDNN Graph
torch.jit.enable_onednn_fusion(True)
# Disable AMP for JIT
torch._C._jit_set_autocast_mode(False)
with torch.no_grad(), torch.cpu.amp.autocast():
    model = torch.jit.trace(model, (example_input))
    model = torch.jit.freeze(model)
    # 2 warm-ups (2 for tracing/scripting with an example, 3 without an example)
    model(example_input)
    model(example_input)

    # speedup would be observed in subsequent runs.
    model(example_input)
        + +

        Learn more here.

        + +

        Prototype Features

        + +

        Distributed API

        + +

        [Prototype] DTensor

        + +

PyTorch DistributedTensor (DTensor) is a prototyping effort with distributed tensor primitives to allow easier distributed computation authoring in the SPMD (Single Program Multiple Devices) paradigm. The primitives are simple but powerful when used to express tensor distributions with both sharded and replicated parallelism strategies. PyTorch DTensor empowered PyTorch Tensor Parallelism along with other advanced parallelism explorations. In addition, it also offers a uniform way to save/load state_dict for distributed checkpointing purposes, even when there are complex tensor distribution strategies such as combining tensor parallelism with parameter sharding in FSDP. More details can be found in this RFC and the DTensor examples notebook.
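For illustration only, here is a minimal sketch under the prototype API as of the 2.0 release; the torch.distributed._tensor module path and the gloo/CPU setup are assumptions, and prototype APIs may change:

import torch
import torch.distributed as dist
from torch.distributed._tensor import DeviceMesh, Shard, distribute_tensor

# Assumes the script is launched with torchrun so rank/world size are set.
dist.init_process_group("gloo")

# 1-D device mesh over all ranks.
mesh = DeviceMesh("cpu", list(range(dist.get_world_size())))

# Shard a tensor along dim 0 across the mesh; each rank holds one shard.
big_tensor = torch.randn(8, 16)
dtensor = distribute_tensor(big_tensor, mesh, placements=[Shard(0)])
print(dtensor.to_local().shape)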

        + +

        [Prototype] TensorParallel

        + +

We now support DTensor-based Tensor Parallel, with which users can distribute their model parameters across different GPU devices. We also support Pairwise Parallel, which shards two concatenated linear layers in a column-wise and row-wise style respectively, so that only one collective (all-reduce/reduce-scatter) is needed in the end.

        + +

        [Prototype] 2D Parallel

        + +

        We implemented the integration of the aforementioned TP with FullyShardedDataParallel(FSDP) as 2D parallel to further scale large model training. More details can be found in this slide.

        + +

        [Prototype] torch.compile(dynamic=True)

        + +

Experimental support for PT2 compilation with dynamic shapes is available in this release. Inference compilation with inductor for simple models is supported (a minimal usage sketch follows the list below), but there are a lot of limitations:

• Training available in a future release (this is partially fixed in nightlies!)
• Minifier available in a future release.
• It is easy to end up in a situation where the dimension you wanted to be dynamic gets specialized anyway. Some of these issues are fixed in nightlies, others are not.
• We do not appropriately propagate Inductor guards to the top level; this is tracked at #96296.
• Data-dependent operations like nonzero still require a graph break.
• Dynamic does not work with non-standard modes like reduce-overhead or max-autotune.
• There are many bugs in Inductor compilation. To track known bugs, check the dynamic shapes label on the PyTorch issue tracker.
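As referenced above, a minimal sketch (illustrative only) of opting into dynamic-shape compilation so that varying batch sizes can reuse the same compiled graph:

import torch

def fn(x):
    return torch.nn.functional.relu(x) * x.shape[0]

# dynamic=True asks the compiler to avoid specializing on input sizes
# where possible.
compiled_fn = torch.compile(fn, dynamic=True)

for batch in (2, 8, 32):
    print(batch, compiled_fn(torch.randn(batch, 16)).shape)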

        For the latest and greatest news about dynamic shapes support on master, check out our status reports.

        + +

        Highlights/Performance Improvements

        + +

Deprecation of CUDA 11.6 and Python 3.7 support for PyTorch 2.0

        + +

If you are still using or depending on CUDA 11.6 or Python 3.7 builds, we strongly recommend moving to at least CUDA 11.7 and Python 3.8, as these are the minimum versions required for PyTorch 2.0. For more detail, please refer to the Release Compatibility Matrix for PyTorch releases.

        + +

        Python 3.11 support on Anaconda Platform

        + +

Due to the lack of Python 3.11 support for packages that PyTorch depends on (including NumPy, SciPy, SymPy, Pillow and others) on the Anaconda platform, we will not be releasing Conda binaries compiled with Python 3.11 for PyTorch Release 2.0. Pip packages with Python 3.11 support will be released, so if you intend to use PyTorch 2.0 with Python 3.11 please use our Pip packages. Please note: Conda packages with Python 3.11 support will be made available on our nightly channel. We are also planning to release Conda Python 3.11 binaries as part of a future release once Anaconda provides these key dependencies. More information and instructions on how to download the Pip packages can be found here.

        + +

        Optimized PyTorch Inference with AWS Graviton processors

        + +

The optimizations focused on a few key areas: GEMM kernels, bfloat16 support, primitive caching, and the memory allocator. For aarch64 platforms, PyTorch supports Arm Compute Library (ACL) GEMM kernels via the mkldnn (oneDNN) backend. The ACL library provides Neon/SVE GEMM kernels for fp32 and bfloat16 formats. The bfloat16 support on c7g allows efficient deployment of bfloat16-trained, AMP (Automatic Mixed Precision)-trained, or even standard fp32-trained models. Standard fp32 models leverage bfloat16 kernels via the oneDNN fast math mode, without any model quantization. Next, we implemented primitive caching for conv, matmul, and inner product operators. More information on the updated PyTorch user guide, the upcoming 2.0 release improvements, and TorchBench benchmark details can be found here.

        + +
        +
        +
        +
        + + +
        +
        +
        +
        +

        Docs

        +

        Access comprehensive developer documentation for PyTorch

        + View Docs +
        + +
        +

        Tutorials

        +

        Get in-depth tutorials for beginners and advanced developers

        + View Tutorials +
        + +
        +

        Resources

        +

        Find development resources and get your questions answered

        + View Resources +
        +
        +
        +
        + +
        + +
        + +
        +
        +
        +
        + + +
        +
        +
        + + +
diff --git a/blog/pytorch-2.0-xla-path-forward/index.html b/blog/pytorch-2.0-xla-path-forward/index.html new file mode 100644 index 000000000000..f33aa310416e --- /dev/null +++ b/blog/pytorch-2.0-xla-path-forward/index.html @@ -0,0 +1,662 @@

        April 03, 2023

PyTorch & OpenXLA: The Path Forward

by Milad Mohammadi, Jack Cao, Shauheen Zahirazami, Joe Spisak, and Jiewen Tan

        As we celebrate the release of OpenXLA, PyTorch 2.0, and PyTorch/XLA 2.0, it’s worth taking a step back and sharing where we see it all going in the short to medium term. With PyTorch adoption leading in the AI space and XLA supporting best-in-class compiler features, PyTorch/XLA is well positioned to provide a cutting edge development stack for both model training and inference. To achieve this, we see investments in three main areas:

        + +
• Training Large Models - Large language models (LLM) and diffusion models have quickly risen in popularity and many cutting edge applications today are built on them. Further to this, training these models requires scale and more specifically the ability to train across thousands of accelerators. To achieve this we are investing in features such as AMP for mixed precision training, PjRt for increased runtime performance, SPMD / FSDP for efficient model sharding, Dynamic Shapes to enable new research approaches, faster data loading through Ray and tf.data, and a toolchain that packages all of these features together into a seamless workflow. Some of these features are already available in experimental or beta stages, and others are coming up this year, with many heavily leveraging the underlying OpenXLA compiler stack.
• Model Inference - With large models continuing to grow in size and computational cost, deployment becomes the next challenge as these models continue to find their way into applications. With the introduction of Dynamo in the PyTorch 2.0 release, PyTorch/XLA delivers performance-competitive inference. We are, however, incorporating additional inference-oriented features, including model serving support, Dynamo for sharded large models, and quantization via Torch.Export and StableHLO.
• Ecosystem integration - We are expanding integration with Hugging Face and PyTorch Lightning so users can take advantage of upcoming PyTorch/XLA cutting edge features (e.g. FSDP support in Hugging Face) and the downstream OpenXLA features (e.g. Quantization) through familiar APIs.

        Additionally, PyTorch/XLA is set to migrate to the open source OpenXLA as its default downstream compiler; allowing the PyTorch community to gain access to a leading, framework-agnostic compiler stack that enjoys industry-wide contribution and innovation. To achieve this, we will begin supporting StableHLO. As a result, OpenXLA will replace the existing TF:XLA dependency, overall streamlining the dependencies and creating leverage from the broader compiler ecosystem. PyTorch/XLA will also sunset the XRT runtime after migration. You can see the resulting high level stack below with the TensorFlow dependency stricken out:

        + +

        the upcoming PyTorch/XLA features and integrations

        + +

        Figure: the upcoming PyTorch/XLA features and integrations are illustrated here

        + +

        We cannot be more excited about what’s ahead for PyTorch/XLA and invite the community to join us. PyTorch/XLA is developed fully in open source so please file issues, submit pull requests, and send RFCs to GitHub such that we can openly collaborate. You can also try out PyTorch/XLA for yourself on various XLA devices including TPUs and GPUs.

        + +

        Cheers,
The PyTorch/XLA Team at Google

        + +
diff --git a/blog/pytorch-2.0-xla/index.html b/blog/pytorch-2.0-xla/index.html new file mode 100644 index 000000000000..6c58557f7595 --- /dev/null +++ b/blog/pytorch-2.0-xla/index.html @@ -0,0 +1,837 @@

PyTorch 2.0 & XLA—The Latest Cutting Edge Features | PyTorch

by Jack Cao, Milad Mohammadi, Alex Wertheim, Yeounoh Chung, Joe Spisak, Will Cromar, Shauheen Zahirazami

        Today, we are excited to share our latest work for PyTorch/XLA 2.0. The release of PyTorch 2.0 is yet another major milestone for this storied community and we are excited to continue to be part of it. When the PyTorch/XLA project started in 2018 between Google and Meta, the focus was on bringing cutting edge Cloud TPUs to help support the PyTorch community. Along the way, others in the community such as Amazon joined the project and very quickly the community expanded. We are excited about XLA’s direction and the benefits this project continues to bring to the PyTorch community. In this blog we’d like to showcase some key features that have been in development, show code snippets, and illustrate the benefit through some benchmarks.

        + +

        TorchDynamo / torch.compile (Experimental)

        + +

        TorchDynamo (Dynamo) is a Python-level JIT compiler designed to make unmodified PyTorch programs faster. It provides a clean API for compiler backends to hook in; its biggest feature is to dynamically modify Python bytecode just before execution. In the PyTorch/XLA 2.0 release, an experimental backend for Dynamo is provided for both inference and training.
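To show what "hooking in" looks like in general (a generic sketch for illustration, not the XLA backend described in this post), a custom Dynamo backend is simply a callable that receives the captured FX graph module and returns a callable:

import torch

def my_backend(gm, example_inputs):
    # Inspect the FX graph Dynamo captured, then run it unchanged.
    gm.graph.print_tabular()
    return gm.forward

@torch.compile(backend=my_backend)
def fn(x):
    return torch.sin(x) + x

fn(torch.randn(4))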

        + +

        Dynamo provides a Torch FX (FX) graph when it recognizes a model pattern and PyTorch/XLA uses a Lazy Tensor approach to compile the FX graph and return the compiled function. To get more insight regarding the technical details about PyTorch/XLA’s dynamo implementation, check out this dev-discuss post and dynamo doc.

        + +

        Here is a small code example of running ResNet18 with torch.compile:

        + +
import torch
import torchvision
import torch_xla.core.xla_model as xm

def eval_model(loader):
  device = xm.xla_device()
  xla_resnet18 = torchvision.models.resnet18().to(device)
  xla_resnet18.eval()
  dynamo_resnet18 = torch.compile(
      xla_resnet18, backend='torchxla_trace_once')
  for data, _ in loader:
    output = dynamo_resnet18(data)
        + +

With torch.compile, PyTorch/XLA only traces the ResNet18 model once during init time and executes the compiled binary every time dynamo_resnet18 is invoked, instead of tracing the model at every step. To illustrate the benefits of Dynamo+XLA, below is an inference speedup analysis comparing Dynamo and LazyTensor (without Dynamo) using TorchBench on a Cloud TPU v4-8, where the y-axis is the speedup multiplier.

        + +

        Inference Speedup - PyTorch/XLA Dynamo on TPU

        + +

        Dynamo for training is in the development stage with its implementation being at an earlier stage than inference. Developers are welcome to test this early feature, however, in the 2.0 release, PyTorch/XLA supports the forward and backward pass graphs and not the optimizer graph; the optimizer graph is available in the nightly builds and will land in the PyTorch/XLA 2.1 release. Below is an example of what training looks like using the ResNet18 example with torch.compile:

        + +
import torch
import torchvision
import torch_xla.core.xla_model as xm

def train_model(model, data, target):
  loss_fn = torch.nn.CrossEntropyLoss()
  pred = model(data)
  loss = loss_fn(pred, target)
  loss.backward()
  return pred

def train_model_main(loader):
  device = xm.xla_device()
  xla_resnet18 = torchvision.models.resnet18().to(device)
  xla_resnet18.train()
  dynamo_train_model = torch.compile(
        train_model, backend='aot_torchxla_trace_once')
  for data, target in loader:
    output = dynamo_train_model(xla_resnet18, data, target)
        + +

Note that the backend for training is aot_torchxla_trace_once (the API will be updated for the stable release) whereas the inference backend is torchxla_trace_once (name subject to change). We expect to extract and execute 3 graphs per training step, instead of the 1 graph per training step produced by the Lazy Tensor approach. Below is a training speedup analysis comparing Dynamo and Lazy using TorchBench on Cloud TPU v4-8.

        + +

        Training Speedup - PyTorch/XLA Dynamo on TPU

        + +

        PJRT Runtime (Beta)

        + +

PyTorch/XLA is migrating from XRT to the new PJRT runtime. PJRT is a better-maintained stack, with demonstrated performance advantages, including, on average, a 35% performance improvement for training on TorchBench 2.0 models. It also supports a richer set of features enabling technologies like SPMD. In the PyTorch/XLA 2.0 release, PJRT is the default runtime for TPU and CPU; GPU support is in an experimental state. The PJRT features included in the PyTorch/XLA 2.0 release are:

        + +
• TPU runtime implementation in libtpu using the PJRT Plugin API improves performance by up to 30%
• torch.distributed support for TPU v2 and v3, including pjrt:// init_method (Experimental)
• Single-host GPU support. Multi-host support coming soon. (Experimental)

        Switching to PJRT requires no change (or minimal change for GPUs) to user code (see pjrt.md for more details). Runtime configuration is as simple as setting the PJRT_DEVICE environment variable to the local device type (i.e. TPU, GPU, CPU). Below are examples of using PJRT runtimes on different devices.

        + +
# TPU Device
PJRT_DEVICE=TPU python3 xla/test/test_train_mp_imagenet.py --fake_data --batch_size=256 --num_epochs=1

# TPU Pod Device
gcloud alpha compute tpus tpu-vm ssh $USER-pjrt --zone=us-central2-b --project=$PROJECT --worker=all --command="git clone --depth=1 --branch r2.0 https://github.com/pytorch/xla.git"

gcloud alpha compute tpus tpu-vm ssh $USER-pjrt --zone=us-central2-b --project=$PROJECT --worker=all --command="PJRT_DEVICE=TPU python3 xla/test/test_train_mp_imagenet.py --fake_data --batch_size=256 --num_epochs=1"

# GPU Device (Experimental)
PJRT_DEVICE=GPU GPU_NUM_DEVICES=4 python3 xla/test/test_train_mp_imagenet.py --fake_data --batch_size=128 --num_epochs=1
        + +

        Below is a performance comparison between XRT and PJRT by task on TorchBench 2.0 on v4-8 TPU. To learn more about PJRT vs. XRT please review the documentation.

        + +

        TorchBench Training Time

        + +

        Parallelization

        + +

        GSPMD (Experimental)

        + +

        We are delighted to introduce General and Scalable Parallelization for ML Computation Graphs (GSPMD) in PyTorch as a new experimental data & model sharding solution. GSPMD provides automatic parallelization for common ML workloads, allowing developers to write PyTorch programs as if on a single large device and without custom sharded computation ops and/or collective communication ops. The XLA compiler transforms the single device program into a partitioned one with proper collectives, based on the user provided sharding hints. The API (RFC) will be available in the PyTorch/XLA 2.0 release as an experimental feature on a single TPU VM host.

        + +

        Next Steps for GSPMD

        + +

        GSPMD is experimental in 2.0 release. To bring it to Stable status, we plan to address a number of feature gaps and known issues in the following releases, including multi-host support, DTensor integration, partial replication sharding, asynchronous data loading, and checkpointing.

        + +

        FSDP (Beta)

        + +

PyTorch/XLA introduced fully sharded data parallel (FSDP) experimental support in version 1.12. This feature is a parallel representation of PyTorch FSDP, and there are subtle differences in how XLA and upstream CUDA kernels are set up. auto_wrap_policy is a new argument that enables developers to automatically specify conditions for propagating partitioning specifications to neural network submodules. An auto_wrap_policy may simply be passed in as an argument when wrapping a model with FSDP. Two auto_wrap_policy callables worth noting are: size_based_auto_wrap_policy and transformer_auto_wrap_policy.

        + +

        size_based_auto_wrap_policy enables users to wrap submodules with a minimum number of parameters. The example below wraps model submodules having at least 10M parameters.

        + +
# size_based_auto_wrap_policy comes from the FSDP wrap utilities.
from functools import partial

auto_wrap_policy = partial(size_based_auto_wrap_policy, min_num_params=1e7)
        + +

transformer_auto_wrap_policy enables users to wrap all submodules that match a specific layer type. The example below wraps model submodules of type torch.nn.Conv2d. To learn more, review this ResNet example by Ronghang Hu.

        + +
auto_wrap_policy = partial(transformer_auto_wrap_policy, transformer_layer_cls={torch.nn.Conv2d})
        + +

        PyTorch/XLA FSDP is now integrated in HuggingFace trainer class (PR) enabling users to train much larger models on PyTorch/XLA (official Hugging Face documentation). A 16B parameters GPT2 model trained on Cloud TPU v4-64 with this FSDP configuration achieved 39% hardware utilization.

| TPU Accelerator - Num Devices | v4-64 |
|---|---|
| GPT2 Parameter Count | 16B |
| Layers Wrapped with FSDP | GPT2Block |
| TFLOPs / Chip | 275 |
| PFLOPs / Step | 50 |
| Hardware Utilization | 39% |
        + +

        Differences Between FSDP & GSPMD

        + +

        FSDP is a data parallelism technique that reduces device memory footprint by storing model parameters, optimizer states, and gradients all sharded. Note that the actual computation is still local to the device and requires all-gathering the sharded model parameters for both forward and backward passes, hence the name “data parallel”. FSDP is one of the newest additions to PyTorch/XLA to scale large model training.

        + +

        GSPMD on the other hand, is a general parallelization system that enables various types of parallelisms, including both data and model parallelisms. PyTorch/XLA provides a sharding annotation API and XLAShardedTensor abstraction, so a user can annotate any tensor with sharding specs in the PyTorch program. Developers don’t need to manually implement sharded computations or inject collective communications ops to get it right. The XLA compiler does the work so that each computation can run in a distributed manner on multiple devices.

        + +

        Examples & Preliminary Results

        + +

        To learn about PyTorch/XLA parallelism sharding API, visit our RFC and see the Sample Code references. Below is a simple example to enable data and model parallelism.

        + +
model = SimpleLinear().to(xm.xla_device())
# Sharding annotate the linear layer weights.
xs.mark_sharding(model.fc1.weight, mesh, partition_spec)
# Training loop
model.train()
for step, (data, target) in enumerate(loader):
  optimizer.zero_grad()
  data = data.to(xm.xla_device())
  target = target.to(xm.xla_device())
  # Sharding annotate input data; we can shard any input
  # dimension. Sharding the batch dimension enables
  # data parallelism, sharding the feature dimension enables
  # spatial partitioning.
  xs.mark_sharding(data, mesh, partition_spec)
  output = model(data)
  loss = loss_fn(output, target)
  loss.backward()
  optimizer.step()
  xm.mark_step()
        + +

        The following graph highlights the memory efficiency benefits of PyTorch/XLA FSDP and SPMD on Cloud TPU v4-8 running ResNet50.

        + +

        Batch Size Scaling with Spatial Partitioning

        + +

        Closing Thoughts…

        + +

        We are excited to bring these features to the PyTorch community, and this is really just the beginning. Areas like dynamic shapes, deeper support for OpenXLA and many others are in development and we plan to put out more blogs to dive into the details. PyTorch/XLA is developed fully open source and we invite you to join the community of developers by filing issues, submitting pull requests, and sending RFCs on GitHub. You can try PyTorch/XLA on a variety of XLA devices including TPUs and GPUs. Here is how to get started.

        + +

        Congratulations again to the PyTorch community on this milestone!

        + +

        Cheers,

        + +

        The PyTorch Team at Google

        + +
diff --git a/blog/pytorch-adds-new-dev-tools/index.html b/blog/pytorch-adds-new-dev-tools/index.html new file mode 100644 index 000000000000..038d3dcb0045 --- /dev/null +++ b/blog/pytorch-adds-new-dev-tools/index.html @@ -0,0 +1,717 @@

PyTorch adds new dev tools as it hits production scale | PyTorch

by The PyTorch Team

        This is a partial re-post of the original blog post on the Facebook AI Blog. The full post can be viewed here

        + +

        Since its release just a few months ago, PyTorch 1.0 has been rapidly adopted as a powerful, flexible deep learning platform that enables engineers and researchers to move quickly from research to production. We are highlighting some of the ways the AI engineering and research community is using PyTorch 1.0. We’re also sharing new details about the latest release, PyTorch 1.1, and showcasing some of the new development tools created by the community.

        + +

        Building on the initial launch of PyTorch in 2017, we partnered with the AI community to ship the stable release of PyTorch 1.0 last December. Along with enhanced production-oriented capabilities and deep integration with leading cloud platforms, PyTorch 1.0 expands on the open source library’s core features, with the addition of PyTorch JIT (Just in time compilation) that seamlessly transitions between eager mode and graph mode to provide both flexibility and speed.
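As a small illustrative sketch (not from the original post) of the eager-to-graph transition that TorchScript enables:

import torch

@torch.jit.script
def my_relu(x):
    # TorchScript compiles this function into a graph representation,
    # while the undecorated Python version would run eagerly.
    return torch.where(x > 0, x, torch.zeros_like(x))

print(my_relu(torch.randn(3)))
print(my_relu.graph)  # inspect the captured graph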

        + +

        Leading businesses across industries are beginning to use PyTorch to both facilitate their research and then also deploy at large scale for applications such as translation, computer vision, conversational interfaces, pharmaceutical research, factory optimization, and automated driving research. Community adoption of PyTorch has also continued to expand. Stanford, UC Berkeley, Caltech, and other universities are using PyTorch as a fundamental tool for their machine learning (ML) courses; new ecosystem projects have launched to support development on PyTorch; and major cloud platforms have expanded their integration with PyTorch.

        + +

        Using PyTorch across industries

        + +

        Many leading businesses are moving to PyTorch 1.0 to accelerate development and deployment of new AI systems. Here are some examples:

        + +
• Airbnb leveraged PyTorch’s rich libraries and APIs for conversational AI and deployed a Smart Reply to help the company’s service agents respond more effectively to customers.
• ATOM is building a platform to generate and optimize new drug candidates significantly faster and with greater success than conventional processes. Using machine learning frameworks such as PyTorch, ATOM was able to design a variational autoencoder for representing diverse chemical structures and designing new drug candidates.
• Genentech is utilizing PyTorch’s flexible control structures and dynamic graphs to train deep learning models that will aid in the development of individualized cancer therapy.
• Microsoft is using PyTorch across its organization to develop ML models at scale and deploy them via the ONNX Runtime. Using PyTorch, Microsoft Cognition has built distributed language models that scale to billions of words and are now in production in offerings such as Cognitive Services.
• Toyota Research Institute (TRI) is developing a two-pronged approach toward automated driving with Toyota Guardian and Toyota Chauffeur technologies. The Machine Learning Team at TRI is creating new deep learning algorithms to leverage Toyota’s 10 million sales per year data advantage. The flexibility of PyTorch has vastly accelerated their pace of exploration and its new production features will enable faster deployment towards their safety critical applications.
        + +

        Following the release of PyTorch 1.0 in December 2018, we’re now announcing the availability of v1.1, which improves performance, adds new model understanding and visualization tools to improve usability, and provides new APIs.

        + +

        Key features of PyTorch v1.1 include:

        + +
• TensorBoard: First-class and native support for visualization and model debugging with TensorBoard, a web application suite for inspecting and understanding training runs and graphs. PyTorch now natively supports TensorBoard with a simple “from torch.utils.tensorboard import SummaryWriter” command (a short sketch follows this list).
• JIT compiler: Improvements to just-in-time (JIT) compilation. These include various bug fixes as well as expanded capabilities in TorchScript, such as support for dictionaries, user classes, and attributes.
• New APIs: Support for Boolean tensors and better support for custom recurrent neural networks.
• Distributed Training: Improved performance for common models such as CNNs, added support for multi-device modules including the ability to split models across GPUs while still using Distributed Data Parallel (DDP), and support for modules where not all parameters are used in every iteration (e.g. control flow, like adaptive softmax, etc.). See the latest tutorials here.
        + +
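As mentioned in the TensorBoard item above, a minimal sketch of logging scalars (illustrative; the run directory and tag names are arbitrary):

import torch
from torch.utils.tensorboard import SummaryWriter

writer = SummaryWriter("runs/demo")
for step in range(100):
    # In real training this would be the loss; here it is a random value.
    writer.add_scalar("train/loss", torch.rand(1).item(), step)
writer.close()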

        We’ve also continued to partner with the community to foster projects and tools aimed at supporting ML engineers for needs ranging from improved model understanding to auto-tuning using AutoML methods. With the release of Ax and BoTorch (below), we will be sharing some of our core algorithms, including meta-learning for efficiently optimizing hyperparameters from based on historical tasks. We are excited to see this work open-sourced for the community to build on.

        + +

        This ecosystem includes open source projects and tools that have been deployed at production scale, as well as products and services from our partnership with industry leaders who share our vision of an open and collaborative AI community. Here are a few of the latest tools:

        + +
• BoTorch: BoTorch is a research framework built on top of PyTorch to provide Bayesian optimization, a sample-efficient technique for sequential optimization of costly-to-evaluate black-box functions.
• Ax: Ax is an ML platform for managing adaptive experiments. It enables researchers and engineers to systematically explore large configuration spaces in order to optimize machine learning models, infrastructure, and products.
• PyTorch-BigGraph: PBG is a distributed system for creating embeddings of very large graphs with billions of entities and trillions of edges. It includes support for sharding and negative sampling and it offers sample use cases based on Wikidata embeddings.
• Google AI Platform Notebooks: AI Platform Notebooks is a new, hosted JupyterLab service from Google Cloud Platform. Data scientists can quickly create virtual machines running JupyterLab with the latest version of PyTorch preinstalled. It is also tightly integrated with GCP services such as BigQuery, Cloud Dataproc, Cloud Dataflow, and AI Factory, making it easy to execute the full ML cycle without ever leaving JupyterLab.
        + +

        We’re also excited to see many interesting new projects from the broader PyTorch community. Highlights include:

        + +
• BigGAN-PyTorch: This is a full PyTorch reimplementation that uses gradient accumulation to provide the benefits of big batches on as few as four GPUs.
• GeomLoss: A Python API that defines PyTorch layers for geometric loss functions between sampled measures, images, and volumes. It includes MMD, Wasserstein, Sinkhorn, and more.
• PyTorch Geometric: A deep learning extension library for PyTorch that offers several methods for deep learning on graphs and other irregular structures (also known as geometric deep learning) from a variety of published papers.
• Curve-GCN: A real-time, interactive image annotation approach that uses an end-to-end-trained graph convolutional network (GCN). It supports object annotation by either polygons or splines, facilitating labeling efficiency for both line-based and curved objects. Curve-GCN runs 10x faster than traditional methods, such as Polygon-RNN++.
        + +

        Udacity, fast.ai, and others develop new PyTorch resources

        + +

        PyTorch is ideal for teaching ML development because it enables rapid experimentation through its flexible, dynamic programming environment and user-friendly Pythonic interface. In addition, Google Colab now offers an interactive Jupyter Notebook environment that natively supports PyTorch, allowing developers to run any PyTorch tutorial immediately with free CPU and GPU resources.

        + +

        University-level classes — including Stanford NLP, UC Berkeley Computer Vision, and Caltech Robotics courses — are now being taught on PyTorch. In addition, massive open online courses (MOOCs) are training thousands of new PyTorch developers.

        + +

        Today, we’re announcing a new Udacity course, building upon the Intro to Deep Learning course launched last year. This new course, led by Andrew Trask of Oxford University and OpenMined, covers important concepts around privacy in AI, including methods such as differential privacy and federated learning. Facebook will also be providing scholarships to support students as they continue their ML education in Udacity’s full Nanodegree programs.

        + +

        The fast.ai community is also continuing to invest energy and resources in PyTorch. In June, fast.ai will launch a new course called Deep Learning from the Foundations, which will show developers how to go all the way from writing matrix multiplication from scratch to how to train and implement a state-of-the-art ImageNet model. The course will include deep dives into the underlying implementation of methods in the PyTorch and fast.ai libraries, and will use the code to explain and illustrate the academic papers that underlie these methods.

        + +

        As part of the course, fast.ai will also release new software modules, including fastai.audio, which brings the power of fast.ai’s deep abstractions and curated algorithms to the new PyTorch.audio module, and show how fastai.vision can be used to create stunning high-resolution videos from material such as old classic movies, and from cutting-edge microscopy sequences through a collaboration with the Salk Institute. In addition, fast.ai is contributing its new X-ResNet module, including a suite of models pretrained on ImageNet.

        + +

        Getting started with PyTorch

        + +

        Everyone in the AI community — including those new to ML development as well as researchers and engineers looking for ways to accelerate their end-to-end workflows — can experiment with PyTorch instantly by visiting pytorch.org and launching a tutorial in Colab. There are also many easy ways to get started both locally and on popular cloud platforms.

        + +
diff --git a/blog/pytorch-adds-new-tools-and-libraries-welcomes-preferred-networks-to-its-community/index.html b/blog/pytorch-adds-new-tools-and-libraries-welcomes-preferred-networks-to-its-community/index.html new file mode 100644 index 000000000000..538709e16e69 --- /dev/null +++ b/blog/pytorch-adds-new-tools-and-libraries-welcomes-preferred-networks-to-its-community/index.html @@ -0,0 +1,710 @@

PyTorch adds new tools and libraries, welcomes Preferred Networks to its community | PyTorch

by Team PyTorch

        PyTorch continues to be used for the latest state-of-the-art research on display at the NeurIPS conference next week, making up nearly 70% of papers that cite a framework. In addition, we’re excited to welcome Preferred Networks, the maintainers of the Chainer framework, to the PyTorch community. Their teams are moving fully over to PyTorch for developing their ML capabilities and services.

        + +

        This growth underpins PyTorch’s focus on building for the needs of the research community, and increasingly, supporting the full workflow from research to production deployment. To further support researchers and developers, we’re launching a number of new tools and libraries for large scale computer vision and elastic fault tolerant training. Learn more on GitHub and at our NeurIPS booth.

        + +

        Preferred Networks joins the PyTorch community

        + +

        Preferred Networks, Inc. (PFN) announced plans to move its deep learning framework from Chainer to PyTorch. As part of this change, PFN will collaborate with the PyTorch community and contributors, including people from Facebook, Microsoft, CMU, and NYU, to participate in the development of PyTorch.

        + +

        PFN developed Chainer, a deep learning framework that introduced the concept of define-by-run (also referred to as eager execution), to support and speed up its deep learning development. Chainer has been used at PFN since 2015 to rapidly solve real-world problems with the latest, cutting-edge technology. Chainer was also one of the inspirations for PyTorch’s initial design, as outlined in the PyTorch NeurIPS paper.

        + +

        PFN has driven innovative work with CuPy, ImageNet in 15 minutes, Optuna, and other projects that have pushed the boundaries of design and engineering. As part of the PyTorch community, PFN brings with them creative engineering capabilities and experience to help take the framework forward. In addition, PFN’s migration to PyTorch will allow it to efficiently incorporate the latest research results to accelerate its R&D activities, given PyTorch’s broad adoption with researchers, and to collaborate with the community to add support for PyTorch on MN-Core, a deep learning processor currently in development.

        + +

        We are excited to welcome PFN to the PyTorch community, and to jointly work towards the common goal of furthering advances in deep learning technology. Learn more about the PFN’s migration to PyTorch here.

        + +

        Tools for elastic training and large scale computer vision

        + +

        PyTorch Elastic (Experimental)

        + +

        Large scale model training is becoming commonplace with architectures like BERT and the growth of model parameter counts into the billions or even tens of billions. To achieve convergence at this scale in a reasonable amount of time, the use of distributed training is needed.

        + +

The current PyTorch Distributed Data Parallel (DDP) module enables data parallel training where each process trains the same model but on different shards of data. It enables bulk synchronous, multi-host, multi-GPU/CPU execution of ML training. However, DDP has several shortcomings: jobs cannot start without acquiring all the requested nodes; jobs cannot continue after a node fails due to an error or transient issue; jobs cannot incorporate a node that joined later; and lastly, progress cannot be made in the presence of a slow/stuck node.

        + +

        The focus of PyTorch Elastic, which uses Elastic Distributed Data Parallelism, is to address these issues and build a generic framework/APIs for PyTorch to enable reliable and elastic execution of these data parallel training workloads. It will provide better programmability, higher resilience to failures of all kinds, higher-efficiency and larger-scale training compared with pure DDP.

        + +

        Elasticity, in this case, means both: 1) the ability for a job to continue after node failure (by running with fewer nodes and/or by incorporating a new host and transferring state to it); and 2) the ability to add/remove nodes dynamically due to resource availability changes or bottlenecks.

        + +

        While this feature is still experimental, you can try it out on AWS EC2, with the instructions here. Additionally, the PyTorch distributed team is working closely with teams across AWS to support PyTorch Elastic training within services such as Amazon Sagemaker and Elastic Kubernetes Service (EKS). Look for additional updates in the near future.

        + +

        New Classification Framework

        + +

        Image and video classification are at the core of content understanding. To that end, you can now leverage a new end-to-end framework for large-scale training of state-of-the-art image and video classification models. It allows researchers to quickly prototype and iterate on large distributed training jobs at the scale of billions of images. Advantages include:

        + +
• Ease of use - This framework features a modular, flexible design that allows anyone to train machine learning models on top of PyTorch using very simple abstractions. The system also has out-of-the-box integration with AWS on PyTorch Elastic, facilitating research at scale and making it simple to move between research and production.
• High performance - Researchers can use the framework to train models such as Resnet50 on ImageNet in as little as 15 minutes.
        + +

        You can learn more at the NeurIPS Expo workshop on Multi-Modal research to production or get started with the PyTorch Elastic Imagenet example here.

        + +

        Come see us at NeurIPS

        + +

        The PyTorch team will be hosting workshops at NeurIPS during the industry expo on 12/8. Join the sessions below to learn more, and visit the team at the PyTorch booth on the show floor and during the Poster Session. At the booth, we’ll be walking through an interactive demo of PyTorch running fast neural style transfer on a Cloud TPU - here’s a sneak peek.

        + +

        We’re also publishing a paper that details the principles that drove the implementation of PyTorch and how they’re reflected in its architecture.

        + +

        Multi-modal Research to Production - This workshop will dive into a number of modalities such as computer vision (large scale image classification and instance segmentation) and Translation and Speech (seq-to-seq Transformers) from the lens of taking cutting edge research to production. Lastly, we will also walk through how to use the latest APIs in PyTorch to take eager mode developed models into graph mode via Torchscript and quantize them for scale production deployment on servers or mobile devices. Libraries used include:

        + +
• Classification Framework - a newly open sourced PyTorch framework developed by Facebook AI for research on large-scale image and video classification. It allows researchers to quickly prototype and iterate on large distributed training jobs. Models built on the framework can be seamlessly deployed to production.
• Detectron2 - the recently released object detection library built by the Facebook AI Research computer vision team. We will articulate the improvements over the previous version including: 1) Support for latest models and new tasks; 2) Increased flexibility, to enable new computer vision research; 3) Maintainable and scalable, to support production use cases.
• Fairseq - general purpose sequence-to-sequence library, can be used in many applications, including (unsupervised) translation, summarization, dialog and speech recognition.
        + +

        Responsible and Reproducible AI - This workshop on Responsible and Reproducible AI will dive into important areas that are shaping the future of how we interpret, reproduce research, and build AI with privacy in mind. We will cover major challenges, walk through solutions, and finish each talk with a hands-on tutorial.

        + +
• Reproducibility: As the number of research papers submitted to arXiv and conferences skyrockets, scaling reproducibility becomes difficult. We must address the following challenges: aid extensibility by standardizing code bases, democratize paper implementation by writing hardware agnostic code, and facilitate results validation by documenting “tricks” authors use to make their complex systems function. To offer solutions, we will dive into tools like PyTorch Hub and PyTorch Lightning, which are used by some of the top researchers in the world to reproduce the state of the art.
• Interpretability: With the increase in model complexity and the resulting lack of transparency, model interpretability methods have become increasingly important. Model understanding is both an active area of research as well as an area of focus for practical applications across industries using machine learning. To get hands on, we will use the recently released Captum library that provides state-of-the-art algorithms to provide researchers and developers with an easy way to understand the importance of neurons/layers and the predictions made by our models.
• Private AI: Practical applications of ML via cloud-based or machine-learning-as-a-service platforms pose a range of security and privacy challenges. There are a number of technical approaches being studied including: homomorphic encryption, secure multi-party computation, trusted execution environments, on-device computation, and differential privacy. To provide an immersive understanding of how some of these technologies are applied, we will use the CrypTen project, which provides a community based research platform to take the field of Private AI forward.
        + +

        We’d like to thank the entire PyTorch team and the community for all their contributions to this work.

        + +

        Cheers!

        + +

        Team PyTorch

        + +
diff --git a/blog/pytorch-at-gtc/index.html b/blog/pytorch-at-gtc/index.html
new file mode 100644
index 000000000000..a332dbeb7bcb
--- /dev/null
+++ b/blog/pytorch-at-gtc/index.html
@@ -0,0 +1,740 @@
PyTorch at GTC 2025 | PyTorch

        March 16, 2025

        PyTorch at GTC 2025

        by Team PyTorch at NVIDIA

        GTC is coming back to San Jose on March 17–21, 2025. Join PyTorch Foundation members Arm, AWS, Google Cloud, IBM, Lightning AI, Meta, Microsoft Azure, Snowflake, and thousands of developers as we celebrate PyTorch. Together learn how AI & accelerated computing are helping humanity solve our most complex challenges.

        + +

        Join in person with discounted GTC registration for PyTorch Foundation or watch online with free registration.

        + +

        book cover

        + +

        Scaling Open Source AI: From Foundation Models to Ecosystem Success

        + +

        Hear from PyTorch Foundation’s Executive Director Matt White and panelists from UC Berkeley, Meta, NVIDIA, and Sequoia Capital on how open source is transforming AI development, bringing together experts from industry, academia, and venture capital to discuss the technical and business aspects of collaborative open source AI development. They’ll examine how open source projects like PyTorch, vLLM, Ray, and NVIDIA’s NeMo are accelerating AI innovation while creating new opportunities for businesses and researchers. They’ll share real-world experiences from PyTorch’s development, Berkeley’s research initiatives, and successful AI startups. Take away valuable insights into the technical and business aspects of open source AI. – Monday, Mar 17, 10:00 AM - 11:00 AM PDT

        + +

        PyTorch @ GTC

        + +

        The Performance of CUDA with the Flexibility of PyTorch
        +Mark Saroufim, Software Engineer, Meta Platforms

        + +

        This talk explores how PyTorch users are also becoming CUDA developers. We’ll start with motivating examples from eager mode, the launch of torch.compile, and the more recent trend of kernel zoos. We will share details on how we went about integrating low-bit matmuls in torchao and the torch.compile CUTLASS backend. We’ll also discuss how you can define, build, and package your own custom ops in PyTorch so you get the raw performance of CUDA while maintaining the flexibility of PyTorch.

        + +

        Make My PyTorch Model Fast, and Show Me How You Did It
        +Thomas Viehmann, Principal Research Engineer, Lightning AI
        +Luca Antiga, CTO, Lightning AI

        + +

        PyTorch is popular in deep learning and LLMs for its richness and ease of expression. To make the most of compute resources, PyTorch models benefit from nontrivial optimizations, but this means losing some of their ease and understandability. Learn how with Thunder, a PyTorch-to-Python compiler focused on usability, understandability, and extensibility, you can optimize and transform (i.e., distribute across many machines) models while: leaving the PyTorch code unchanged; targeting a variety of models without needing to adapt to each of them; understanding each transformation step, because the results are presented as simple Python code; and accessing powerful extension code for your own optimizations with just one or a few lines of code. We’ll show how the combination of Thunder transforms and the NVIDIA stack (NVFuser, cuDNN, Apex) delivers optimized performance in training and inference on a variety of models.

        + +

        FlexAttention: The Flexibility of PyTorch With the Performance of FlashAttention
        +Driss Guessous, Machine Learning Engineer, Meta Platforms

        + +

        Introducing FlexAttention: a novel PyTorch API that enables custom, user-defined attention mechanisms with performance comparable to state-of-the-art solutions. By leveraging the PyTorch compiler stack, FlexAttention supports dynamic modifications to attention scores within SDPA, achieving both runtime and memory efficiency through kernel fusion with the FlashAttention algorithm. Our benchmarks on A100 GPUs show FlexAttention achieves 90% of FlashAttention2’s performance in forward passes and 85% in backward passes. On H100 GPUs, FlexAttention’s forward performance averages 85% of FlashAttention3 and is ~25% faster than FlashAttention2, while backward performance averages 76% of FlashAttention3 and is ~3% faster than FlashAttention2. Explore how FlexAttention balances near-state-of-the-art performance with unparalleled flexibility, empowering researchers to rapidly iterate on attention mechanisms without sacrificing efficiency.
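        As a concrete illustration of the kind of user-defined score modification the abstract describes, here is a minimal sketch using the flex_attention API (this is our own example, not code from the talk; it assumes PyTorch 2.5 or later, and the bias term and tensor shapes are purely illustrative):

import torch
from torch.nn.attention.flex_attention import flex_attention

# Hypothetical score modification: add a simple relative-position bias to the
# attention scores; FlexAttention fuses this into the attention kernel when
# compiled, and falls back to a reference implementation in eager mode.
def rel_bias(score, b, h, q_idx, kv_idx):
    return score + 0.01 * (kv_idx - q_idx)

# Illustrative shapes: (batch, heads, seq_len, head_dim)
q, k, v = (torch.randn(1, 8, 128, 64) for _ in range(3))
out = flex_attention(q, k, v, score_mod=rel_bias)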

        + +

        Keep Your GPUs Going Brrr: Crushing Whitespace in Model Training
        +Syed Ahmed, Senior Software Engineer, NVIDIA
        +Alban Desmaison, Research Engineer, Meta
        +Aidyn Aitzhan, Senior Software Engineer, NVIDIA

        + +

        Substantial progress has recently been made on the compute-intensive portions of model training, such as high-performing attention variants. While invaluable, this progress exposes previously hidden bottlenecks in model training, such as redundant copies during collectives and data loading time. We’ll present recent improvements in PyTorch achieved through Meta/NVIDIA collaboration to tackle these newly exposed bottlenecks and how practitioners can leverage them.

        + +

        Accelerated Python: The Community and Ecosystem
        +Andy Terrel, CUDA Python Product Lead, NVIDIA
        +Jeremy Tanner, Open Source Programs, NVIDIA
        +Anshuman Bhat, CUDA Product Management, NVIDIA

        + +

        Python is everywhere. Simulation, data science, and Gen AI all depend on it. Unfortunately, the dizzying array of tools leaves a newcomer baffled at where to start. We’ll take you on a guided tour of the vibrant community and ecosystem surrounding accelerated Python programming. Explore a variety of tools, libraries, and frameworks that enable efficient computation and performance optimization in Python, including CUDA Python, RAPIDS, Warp, and Legate. We’ll also discuss integration points with PyData, PyTorch, and JAX communities. Learn about collaborative efforts within the community, including open source projects and contributions that drive innovation in accelerated computing. We’ll discuss best practices for leveraging these frameworks to enhance productivity in developing AI-driven applications and conducting large-scale data analyses.

        + +

        Supercharge large scale AI with Google Cloud AI hypercomputer (Presented by Google Cloud)
        +Deepak Patil, Product Manager, Google Cloud
        +Rajesh Anantharaman, Product Management Lead, ML Software, Google Cloud

        + +

        Unlock the potential of your large-scale AI workloads with Google Cloud AI Hypercomputer – a supercomputing architecture designed for maximum performance and efficiency. In this session, we will deep dive into PyTorch and JAX stacks on Google Cloud on NVIDIA GPUs, and showcase capabilities for high performance foundation model building on Google Cloud.

        + +

        Peering Into the Future: What AI and Graph Networks Can Mean for the Future of Financial Analysis
        +Siddharth Samsi, Sr. Solutions Architect, NVIDIA
        +Sudeep Kesh, Chief Innovation Officer, S&P Global

        + +

        Artificial Intelligence, agentic systems, and graph neural networks (GNNs) are providing the new frontier to assess, monitor, and estimate opportunities and risks across work portfolios within financial services. Although many of these technologies are still developing, organizations are eager to understand their potential. See how S&P Global and NVIDIA are working together to find practical ways to learn and integrate such capabilities, ranging from forecasting corporate debt issuance to understanding capital markets at a deeper level. We’ll show a graph representation of market data using the PyTorch-Geometric library and a dataset of issuances spanning three decades and across financial and non-financial industries. Technical developments include generation of a bipartite graph and link-prediction GNN forecasting. We’ll address data preprocessing, pipelines, model training, and how these technologies can broaden capabilities in an increasingly complex world.

        + +

        Unlock Deep Learning Performance on Blackwell With cuDNN
        +Yang Xu (Enterprise Products), DL Software Engineering Manager, NVIDIA

        + +

        Since its launch, cuDNN, a library for GPU-accelerating deep learning (DL) primitives, has been powering many AI applications in domains such as conversational AI, recommender systems, and speech recognition, among others. cuDNN remains a core library for DL primitives in popular frameworks such as PyTorch, JAX, TensorFlow, and many more, while covering training, fine-tuning, and inference use cases. Even in the rapidly evolving space of Gen AI — be it Llama, Gemma, or mixture-of-experts variants requiring complex DL primitives such as flash attention variants — cuDNN is powering them all. Learn about new and updated cuDNN APIs pertaining to Blackwell’s microscaling format, and how to program against those APIs. We’ll deep dive into leveraging its graph APIs to build some fusion patterns, such as matmul fusion patterns and fused flash attention from state-of-the-art models. Understand how new CUDA graph support in cuDNN, not to be confused with the cuDNN graph API, can be exploited to avoid rebuilding CUDA graphs, offering an alternative to CUDA graph capture with real-world framework usage.

        + +

        Train and Serve AI Systems Fast With the Lightning AI Open-Source Stack (Presented by Lightning AI)
        +Luca Antiga, CTO, Lightning AI

        + +

        See how the Lightning stack can cover the full life cycle, from data preparation to deployment, with practical examples and particular focus on distributed training and high-performance inference. We’ll show examples that focus on new features like support for multi-dimensional parallelism through DTensors, as well as quantization through torchao.

        + +

        Connect With Experts (Interactive Sessions)

        + +

        Meet the Experts From Deep Learning Framework Teams
        +Eddie Yan, Technical Lead of PyTorch, NVIDIA
        +Masaki Kozuki, Senior Software Engineer in PyTorch, NVIDIA
        +Patrick Wang (Enterprise Products), Software Engineer in PyTorch, NVIDIA
        +Mike Ruberry, Distinguished Engineer in Deep Learning Frameworks, NVIDIA
        +Rishi Puri, Sr. Deep Learning Engineer and Lead for PyTorch Geometric, NVIDIA

        + +

        Training Labs

        + +

        Kernel Optimization for AI and Beyond: Unlocking the Power of Nsight Compute
        +Felix Schmitt, Sr. System Software Engineer, NVIDIA
        +Peter Labus, Senior System Software Engineer, NVIDIA

        + +

        Learn how to unlock the full potential of NVIDIA GPUs with the powerful profiling and analysis capabilities of Nsight Compute. AI workloads are rapidly increasing the demand for GPU computing, and ensuring that they efficiently utilize all available GPU resources is essential. Nsight Compute is the most powerful tool for understanding kernel execution behavior and performance. Learn how to configure and launch profiles customized for your needs, including advice on profiling accelerated Python applications, AI frameworks like PyTorch, and optimizing Tensor Core utilization essential to modern AI performance. Learn how to debug your kernel and use the expert system built into Nsight Compute, known as “Guided Analysis,” that automatically detects common issues and directs you to the most relevant performance data all the way down to the source code level.

        + +

        Make Retrieval Better: Fine-Tuning an Embedding Model for Domain-Specific RAG
        +Gabriel Moreira, Sr. Research Scientist, NVIDIA
        +Ronay Ak, Sr. Data Scientist, NVIDIA

        + +

        LLMs power AI applications like conversational chatbots and content generators, but are constrained by their training data. This can lead to hallucinations when content generation requires up-to-date or domain-specific information. Retrieval-augmented generation (RAG) addresses this issue by enabling LLMs to access external context without modifying model parameters. Embedding (dense retrieval) models are a key component of a RAG pipeline for retrieving context relevant to the LLM. However, an embedding model’s ability to capture the unique characteristics of custom data hinges on the quality and domain relevance of its training data. Fine-tuning embedding models is gaining interest as a way to provide more accurate and relevant responses tailored to users’ specific domains.

        + +

        In this lab, you’ll learn to generate a synthetic dataset with question-context pairs from a domain-specific corpus, and process the data for fine-tuning. Then, fine-tune a text embedding model using synthetic data and evaluate it.

        + +

        Poster Presentations

        + +

        Single-View X-Ray 3D Reconstruction Using Neural Back Projection and Frustum Resampling
        +Tran Minh Quan, Developer Technologist, NVIDIA

        + +

        Enable Novel Applications in the New AI Area in Medicine: Accelerated Feature Computation for Pathology Slides
        +Nils Bruenggel, Principal Software Engineer, Roche Diagnostics Int. AG

diff --git a/blog/pytorch-compile-to-speed-up-inference/index.html b/blog/pytorch-compile-to-speed-up-inference/index.html
new file mode 100644
index 000000000000..ec5f7a030775
--- /dev/null
+++ b/blog/pytorch-compile-to-speed-up-inference/index.html
@@ -0,0 +1,803 @@
PyTorch compile to speed up inference on Llama 2 | PyTorch

        by IBM Research: Antoni Viros i Martin, Brian Vaughan, Davis Wertheimer, Joshua Rosenkranz, Mudhakar Srivatsa, Nelson Mimura Gonzalez, Raghu Ganti, Supriyo Chakraborty, Zhuoran Liu; Meta: Geeta Chauhan, Hamid Shojanazeri

        +

        In this blog, we discuss how to improve the inference latencies of the Llama 2 family of models using PyTorch native optimizations such as native fast kernels, compile transformations from torch compile, and tensor parallel for distributed inference. Our approach results in 29ms/token latency for single-user requests on the 70B Llama model (as measured on 8 A100 GPUs). We are excited to share our findings with the community and make our code available here.

        + +

        Background

        + +

        We are amid a generative AI revolution with large language models of tens of billions of parameters becoming commoditized and available for use. However, it is well recognized in the community that deploying these large models in a cost-efficient manner remains a key challenge. Many different approaches have been attempted with varying degrees of success and offering different trade-offs. Hardware-specific optimizations (e.g., Faster Transformer from NVIDIA) are restricted to specific target hardware whereas approaches that rely on layers of abstraction (e.g., ONNX) enable arbitrary models but suffer from loss of efficiency. With the introduction of PyTorch compile last year, IBM and the PyTorch team started exploring the use of model compilation for inference optimizations with the goal of reducing the latency per token for generative models.

        + +

        Model Choice

        + +

        We chose to benchmark on the Llama 2 family of models, given their popularity. The models that we are interested in, and their hyperparameters relevant to this blog, are given in the table below:

        Model size | Hidden dimension | Num heads | Num layers | Attention type
        7B         | 4096             | 32        | 32         | MHA
        13B        | 5120             | 40        | 40         | MHA
        70B        | 8192             | 64        | 80         | GQA

        These models are decoder-only, which means that tokens are generated sequentially; generation is typically sped up using KV caching. We take a similar approach in our latency and throughput measurements.
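        To make the sequential-generation point concrete, here is a toy greedy-decode sketch (not the benchmark code): the model is assumed to return logits plus an opaque KV-cache object, so later steps only process the newest token.

import torch

def greedy_generate(model, input_ids, max_new_tokens=50):
    # `model` is assumed to return (logits, kv_cache); after the first step we
    # feed only the most recent token and reuse the cached keys/values.
    tokens, kv_cache = input_ids, None
    for _ in range(max_new_tokens):
        step_input = tokens if kv_cache is None else tokens[:, -1:]
        logits, kv_cache = model(step_input, kv_cache)
        next_token = logits[:, -1].argmax(dim=-1, keepdim=True)  # greedy decode
        tokens = torch.cat([tokens, next_token], dim=-1)
    return tokens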

        + +

        Inference Approach

        + +

        Our goal for inference is to provide a path for achieving the best possible latencies rapidly, to keep up with the velocity with which new model architectures are emerging in the community. A PyTorch native approach is appealing as it allows for the maximum flexibility in terms of “coverage” of models. We note that there are four orthogonal techniques that provide acceleration in inference: (a) kernel fusion using compile, (b) faster kernels, (c) tensor parallel for larger models, and (d) quantization. In our approach, we use the first three of these four levers - compile working natively with faster kernels from SDPA and a custom tensor parallel implementation - which together achieve inference latencies of 29ms/token on a 70B model, as measured on 8 NVIDIA A100 GPUs with a single user.

        + +

        Compile all the way!

        + +

        PyTorch compile leverages tracing and graph capture to reduce CPU overhead, and in an ideal scenario results in a single graph execution/instruction from CPU to GPU. However, compile often introduces graph breaks due to model architecture and ops that it does not support. For example, complex operations such as einops are not supported by compile today. Similarly, tensor parallel inference can introduce graph breaks at each layer, since compile requires the tensor parallel implementation to use traceable communication collectives. If these graph breaks are not removed, the performance of the compiled artifacts is hampered and could even be lower than eager mode execution. To get the full benefit of the compiled artifacts, the graph breaks need to be removed.
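        A quick way to make graph breaks visible (a sketch for illustration, not the tooling we used) is to compile with fullgraph=True, which raises an error at the first break instead of silently splitting the graph; the toy function below is hypothetical.

import torch

def toy_forward(x):
    return torch.nn.functional.gelu(x @ x.t())

# fullgraph=True turns any graph break into a hard error, which is useful when
# hunting down unsupported ops such as complex-number functions.
compiled = torch.compile(toy_forward, fullgraph=True)
out = compiled(torch.randn(8, 8))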

        + +

        Below, we describe how we went about doing this for the 70B Llama 2 model and the challenges we had to overcome to get compile to work all the way through.

        + +

        Our first attempt was to use torch.compile to compile the out-of-the-box Llama 2 model, but it failed because complex ops were not supported. Using TORCH_COMPILE_DEBUG=1, we identified that the RoPE positional encoding was using complex number functions, resulting in graph breaks and significant slowdowns. We rewrote the RoPE function to bypass torch.einsum (the original implementation uses torch.polar, which also conflicts with compile) and use torch.cos and torch.sin instead.

        + +
self.cached_freqs[dev_idx][alpha] = torch.stack(
    [
        torch.cos(freqs),
        -torch.sin(freqs),
        torch.sin(freqs),
        torch.cos(freqs),
    ],
    dim=2,
).view(*freqs.shape, 2, 2)
        + +

        Our implementation of the frequencies computation

        +

        +
t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype)
t = t / self.scaling_factor

freqs = torch.einsum("i,j->ij", t, self.inv_freq)
# Different from paper, but it uses a different permutation in order to obtain the same calculation
emb = torch.cat((freqs, freqs), dim=-1)
self.register_buffer("cos_cached", emb.cos()[None, None, :, :].to(dtype), persistent=False)
self.register_buffer("sin_cached", emb.sin()[None, None, :, :].to(dtype), persistent=False)
        + +

        Hugging Face implementation of the frequencies computation

        + +

        Once RoPE was fixed, we were able to get 7B and 13B models to compile without ANY graph breaks on a single A100 GPU.

        + +

        We used SDPA, PyTorch’s native implementation of efficient attention computation, with tracing enabled (for compile). To avoid graph breaks caused by forcing a single algorithm choice through a Python context manager (the recommended way), we had to use the torch.backends.cuda.enable_*_sdp functions instead.

        + +
attn = torch.nn.functional.scaled_dot_product_attention(
    queries,
    keys_e,
    values_e,
    attn_mask=attn_mask,
    dropout_p=self.p_dropout if self.training else 0.0,
    is_causal=is_causal_mask,
)
        + +

        Attention computation using SDPA
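        For illustration, here is a minimal sketch of the global backend selection mentioned above (our own example of the enable_*_sdp calls; the exact code in our repository may differ): pin SDPA to the flash kernel without wrapping the call in a context manager that compile would have to trace.

import torch

# Globally restrict SDPA to the flash backend instead of using a per-call
# context manager, which introduced graph breaks under torch.compile.
torch.backends.cuda.enable_flash_sdp(True)
torch.backends.cuda.enable_mem_efficient_sdp(False)
torch.backends.cuda.enable_math_sdp(False)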

        + +

        Next, we ran the same steps for the larger 70B model and found that, even at half precision, the model does not fit on a single GPU and requires tensor parallel inference. Using torch.compile for the 70B model resulted in 162 graph breaks due to two all-reduces per layer, one all-gather for the forward embedding, and one all-gather for the reverse embedding. Because of this, we saw no significant improvement in inference latencies. We could not use the distributed tensor implementation from PyTorch at the time of writing this blog as it did not support compile. We rewrote the tensor parallel code from scratch so that it depends only on traceable collectives to make it work with compile. After this last change, the PyTorch compiler did not introduce any graph breaks and we saw a significant speedup in inference latencies. Specifically, we measured latencies for the Llama 70B model at 29ms/token when using 8 A100 GPUs, a 2.4x improvement over unoptimized inference.
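        To illustrate where the per-layer all-reduces come from, here is a highly simplified sketch of a tensor-parallel linear layer (not our actual implementation): the weight is sharded along the input dimension and partial results are summed across ranks. Note that the in-place dist.all_reduce shown here for clarity is exactly the kind of collective that would itself need to be replaced by a traceable functional collective for compile to capture a single graph.

import torch
import torch.distributed as dist

class ShardedLinear(torch.nn.Module):
    # Each rank holds a (out_features, in_features // world_size) weight shard;
    # the partial outputs are summed with one all-reduce per sharded layer.
    def __init__(self, in_features, out_features, world_size):
        super().__init__()
        self.weight = torch.nn.Parameter(
            torch.empty(out_features, in_features // world_size)
        )
        torch.nn.init.normal_(self.weight, std=0.02)

    def forward(self, x_shard):
        partial = torch.nn.functional.linear(x_shard, self.weight)
        dist.all_reduce(partial)  # sum partial results across ranks
        return partial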

        + +

        Serving aspects

        + +

        Finally, a point to note here is that simply running compile on a model is not sufficient to serve it in a production setting. To realize the above performance at high throughput, we need to support dynamic batching and nested tensors, as well as have a warm-up phase where we pre-compile for bucketized sequence lengths. We are working on these aspects to realize such performance in a production setting.
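        As a sketch of what such a warm-up phase might look like (hypothetical bucket sizes and a stand-in model, not our production serving code):

import torch

# Stand-in for the real decoder; the real model would be loaded from a checkpoint.
model = torch.nn.Sequential(
    torch.nn.Embedding(32000, 1024), torch.nn.Linear(1024, 32000)
)
compiled_model = torch.compile(model, dynamic=False)

# Pre-compile once per sequence-length bucket so requests at serve time
# do not pay the compilation cost on the fly.
with torch.inference_mode():
    for seq_len in (512, 1024, 2048, 4096):
        dummy = torch.randint(0, 32000, (1, seq_len))
        compiled_model(dummy)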

        + +

        Experiments and Measurements

        + +

        We use nodes with 8 NVIDIA A100 GPUs with 80GB of memory for all our measurements, in two different environments (IBM Cloud and AWS, both running OpenShift). First, we compare the various techniques: eager mode, with the SDPA Flash kernel, with compile, and with compile and SDPA. For the 70B model, we run it in tensor parallel mode with compile and SDPA. For this experiment, we use 512 tokens as the input length with 50 tokens generated. For the 7B and 13B models, we use a single A100 to measure latencies, whereas we use 8 A100s for the 70B model. In addition, for the 70B model we use the reduce-overhead option in PyTorch compile, which uses CUDA graphs to reduce CPU-to-GPU kernel launch overheads; the use of CUDA graphs in the 7B and 13B models did not show any benefits (and is thus not reported here). We observe from Figure 1 that compile and SDPA provide very low latencies, with the 70B Llama 2 model at 29ms/token.
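        For reference, a minimal sketch of enabling the reduce-overhead mode (stand-in module, not the benchmark harness):

import torch

block = torch.nn.Linear(4096, 4096)
# mode="reduce-overhead" asks the compiler to use CUDA graphs to cut
# kernel-launch overhead, which helped the 70B model but not 7B/13B.
fast_block = torch.compile(block, mode="reduce-overhead")

with torch.inference_mode():
    y = fast_block(torch.randn(1, 4096))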

        + +


        + +

        Fig. 1: Median latency across different techniques with sequence length 512 (measured on IBM Cloud A100 servers)

        + +

        Next, we examine the impact of sequence length, where we increase it from 1024 to 4096 and observe that the median latency per token increases sub-linearly, demonstrating that when we increase context to large documents, we do not sacrifice response times.

        + +


        + +

        Fig. 2: Median latency for compile+SDPA with different sequence lengths (Measured on A100s on AWS)

        + +

        Finally, with increased batch sizes, we observe that the response latencies increase sub-linearly. For the 13B model, at batch size 8, we encounter an OOM. For the 70B model, given that it is running on 8 GPUs with tensor parallel, we do not see any such OOM issues.

        + +


        + +

        Fig. 3: Median latency for compile+SDPA with different batch sizes and sequence length fixed at 4096 (Measured on A100s on AWS)

        + +

        Final Thoughts

        + +

        We have demonstrated how a PyTorch-native compile pathway achieves ultra-low latencies for 70B model inference. The next steps are to enable dynamic batching and nested tensors with the above levers.

        + +

        Special thanks to Edward Yang, Elias Ellison, Driss Guessous, Will Feng, Will Constable, Horace He, Less Wright, and Andrew Gu from Team PyTorch, whose PR reviews and code contributions made it possible for us to realize these latencies using a PyTorch native approach. We thank the broader Team PyTorch, which has been tirelessly working to make PyTorch better, with special shout-outs to the SDPA team for enabling tracing and compile on fast kernels, and the compile team that has been closely guiding us on how to work around as well as fix issues (including identifying and raising NVIDIA driver bugs in CUDA graphs).

        + +

        Inference latency has been one of the roadblocks for LLM adoption in critical enterprise workflows, but another major one is the need for safety, trustworthiness and governance. IBM’s guide for AI safety and LLM risk can be found here and Meta’s responsible user guide for LLaMa can be found here.

        + +

diff --git a/blog/pytorch-conference-2023/index.html b/blog/pytorch-conference-2023/index.html
new file mode 100644
index 000000000000..78a1658b0766
--- /dev/null
+++ b/blog/pytorch-conference-2023/index.html
@@ -0,0 +1,688 @@
PyTorch Conference 2023: Join us in San Francisco October 16-17 | PyTorch

        by Team PyTorch

        +

        PyTorch Conference 2023

        + +

        We’re thrilled to announce the upcoming PyTorch Conference 2023! On October 16-17, the conference will showcase PyTorch 2.1, the next-generation release of the popular machine learning framework. As part of the Linux Foundation, the PyTorch Foundation Conference continues the tradition of bringing together leading researchers, developers, and academic communities to advance the education and development of end-to-end machine learning.

        + +

        The conference agenda features an engaging lineup of events, including an opening reception, community and partner discussions, informative panels, poster sessions, enlightening use cases and community stories, as well as discussions on the latest trends in machine learning and deep learning development and deployment.

        + +

        Call for Proposals

        + +

        We are now accepting speaker proposals for the conference until July 21. The program committee will carefully review all submissions, and selected speakers will be notified by August 8. We strongly encourage both experienced and first-time speakers to submit their proposals. This conference provides an excellent opportunity to connect with the PyTorch community, share your ideas, and showcase your work.

        + +

        When preparing your proposal, please consider the following guidelines:

        + +
          +
        • What are you hoping to get from your presentation?
        • +
        • What do you expect the audience to gain from your presentation?
        • +
        • How will your presentation help better the open source ecosystem?
        • +
        + +

        To help you shape your proposal, here are some suggested topics for the conference:

        + +
          +
        • Deployments on AWS, Azure
        • +
        • Use cases and real-world applications
        • +
        • Foundational models
        • +
        • AI practices
        • +
        • Production considerations
        • +
        • PyTorch 2.X features and updates
        • +
        • Training techniques and best practices
        • +
        • Inference methodologies
        • +
        • Hardware advancements and optimizations
        • +
        • Edge computing applications
        • +
        • Scalability solutions
        • +
        • Latest research breakthroughs
        • +
        • Optimization strategies
        • +
        • Extending PyTorch through customizations and plugins
        • +
        + +

        We kindly request that you refrain from submitting sales or marketing pitches and avoid discussing unlicensed or closed-source technologies. Such talks tend to detract from the integrity of our events and are not well-received by conference attendees.

        + +

        Register Today

        + +

        Registration is now open! Get your ticket today and secure your spot: https://events.linuxfoundation.org/pytorch-conference/register/

        + +

        Thank you for your interest, and we look forward to a successful PyTorch Conference 2023!

        + +
diff --git a/blog/pytorch-conference-2024-recap/index.html b/blog/pytorch-conference-2024-recap/index.html
new file mode 100644
index 000000000000..c3f3f86f4e4f
--- /dev/null
+++ b/blog/pytorch-conference-2024-recap/index.html
@@ -0,0 +1,1174 @@
PyTorch Conference 2024 Recap: On Fire 🔥 | PyTorch

        October 02, 2024

        PyTorch Conference 2024 Recap: On Fire 🔥

        by Team PyTorch

        women dancing with fire

        + +

        The 2024 PyTorch Conference in San Francisco gathered nearly 1,500 AI researchers, developers, and enthusiasts. Over two days, the event featured engaging discussions, insightful keynotes, and hands-on sessions focused on artificial intelligence (AI) and advancements in PyTorch, the leading open-source machine learning framework. Attendees delved into the future of generative AI, Large Language Models (LLMs), and the crucial role open-source technology plays in driving AI innovation. Here’s a recap of the key themes, highlights, and major takeaways from this year’s conference.

        + +

        Key Themes of the PyTorch Conference 2024

        + +

        Three core themes emerged throughout the conference:

        + +
          +
        1. Generative AI and LLMs: Many sessions focused on how PyTorch continues to evolve as a primary framework for Large Language Models and Generative AI applications. From scaling these models to optimizing their performance on various hardware platforms, the conference showcased the ongoing advancements and challenges in LLM architecture.
        2. +
        3. Democratizing AI Through Open Source: One of the recurring themes was the importance of open source tools and communities in shaping the future of AI. PyTorch is committed to inclusivity, ease of use, and accessibility to developers of all levels, with a focus on bringing AI to an even larger global audience.
        4. +
        5. Distributed and Edge Computing: Distributed computing and edge deployment appeared in many discussions, highlighting how PyTorch is being used to drive AI to the edge. The focus on edge accelerators, scalable training, and inference showcased how PyTorch enables the deployment of powerful models across diverse environments, from the cloud to on-device applications.
        6. +
        + +

        panel of people on a conference stage

        + +

        Watch the Sessions from PyTorch Conference

        + +

        The PyTorch Conference featured keynote sessions from top AI leaders and interesting lightning talks. You can view all of the conference sessions on our YouTube channel.

        + +
        + +
        + + + +

        PyTorch Conference Startup Showcase

        + +

        man speaking at a conference

        + +

        New this year, the Startup Showcase was an exciting addition to the PyTorch Conference. Featuring early-stage founders pitching their AI startups to a panel of top venture capitalists, this event showcased the next generation of AI-driven innovation. The finalists for the inaugural PyTorch Conference Startup Showcase included Remix Inc., Cartesia, OpenBabylon, Remyx AI, A2 Labs, Inc., QuicSnap, Iso AI, CTGT, and Creao.ai, representing some of the most innovative AI/ML startups in the industry. Attendees got a front-row seat to see cutting-edge AI startups in action, while top VCs from the AI industry evaluated the pitches.

        + +

        Congratulations to the PyTorch Conference Startup Showcase winner, CTGT! Deep learning can be opaque and biased, which limits its potential in crucial areas like healthcare and finance. CTGT is changing the game by enhancing data lineage in LLMs and cutting hallucinations. They’re empowering companies to create customized models using 500x less compute.

        + +

        View the Startup Showcase

        + +

        Mini-Summits

        + +

        The DL Compiler Mini-Summit offered attendees a deep dive into the advances in deep learning (DL) compilers that are transforming AI workloads.

        + +

        View the DL Compiler Mini-Summit

        + +

        People watching an event

        + +

        The Fine-Tuning Mini-Summit brought together a thriving community of researchers, developers, practitioners and hobbyists which focuses on topics ranging from memory efficiency, parameter-efficient fine-tuning and quantization to performance at scale and reproducible evaluations.

        + +

        View the Fine-Tuning Mini-Summit

        + +

        Major Takeaways from the PyTorch Conference 2024

        + +

        Matt giving his keynote

        + +
          +
        1. LLMs are Here to Stay: LLMs were a focal point of the event, reaffirming their pivotal role in the future of AI. As these models continue to scale, PyTorch remains the preferred framework for developing, training, and deploying them across various platforms and industries.
        2. +
        3. Open Source Drives Innovation: A key takeaway from the conference was that open-source tools like PyTorch are vital for democratizing AI. This community-driven approach accelerates innovation, enabling researchers and developers globally to collaborate and contribute to faster advancements and more accessible AI technologies.
        4. +
        5. Ethics and Sustainability Matter: The focus on ethical AI development was a significant takeaway. Talks on the inclusivity of computer vision models, the environmental impacts of AI infrastructure, and the need for transparent, unbiased AI models highlighted the growing importance of ethical considerations in the future of AI.
        6. +
        7. PyTorch Expands Beyond the Cloud: With several sessions dedicated to edge AI and distributed computing, the conference showcased how PyTorch is expanding beyond cloud-based applications into edge devices and diverse computing environments. This shift is crucial as AI advances into areas like autonomous vehicles, mobile applications, and IoT devices.
        8. +
        + +

        Thank You to Our Sponsors

        + +

        A crowd of people at a conference

        + +

        Sponsor logos

        + +

        We would like to thank each of the sponsors that made the PyTorch Conference 2024 possible. These include:

        + +

        Diamond Sponsors:

        + +
          +
        • AMD
        • +
        • Cloud Native Computing Foundation
        • +
        • IBM
        • +
        • Intel – PyTorch
        • +
        • Lightning.ai
        • +
        • Meta – PyTorch
        • +
        + +

        Platinum Sponsors:

        + +
          +
        • Arm
        • +
        • Google
        • +
        • Lambda Labs
        • +
        • Nvidia
        • +
        + +

        Silver Sponsors:

        + +
          +
        • Anyscale – PyTorch
        • +
        • Baseten
        • +
        • Chainguard
        • +
        • Databricks
        • +
        • Fal
        • +
        • FuriosaAi
        • +
        • HPE
        • +
        • Jane Street
        • +
        • Microsoft – PyTorch
        • +
        • MinIO
        • +
        • Outerbounds
        • +
        • Together.AI
        • +
        + +

        Bronze Sponsors:

        + +
          +
        • d-Matrix
        • +
        • MemVerge
        • +
        • Perforated AI
        • +
        • Quansight
        • +
        • Rotational Labs
        • +
        • ScaleGenAI
        • +
        + +

        Special Event Sponsors:

        + +
          +
        • PyTorch Flare Party: Hugging Face
        • +
        • Startup Showcase: Mayfield
        • +
        • Diversity Scholarship: AWS
        • +
        • Women and Non-Binary in PyTorch Lunch: Google
        • +
        • Happy Hour Reception: Lightning.AI
        • +
        + +

        Thank you for your continued support in advancing the PyTorch ecosystem and helping to shape the future of AI!

        + +

        Save the Date

        + +

        See you next year for the PyTorch Conference in San Francisco at the Palace of Fine Arts from October 22-23, 2025.

        + +
diff --git a/blog/pytorch-developer-day-2020/index.html b/blog/pytorch-developer-day-2020/index.html
new file mode 100644
index 000000000000..fdfee3379641
--- /dev/null
+++ b/blog/pytorch-developer-day-2020/index.html
@@ -0,0 +1,664 @@
Announcing PyTorch Developer Day 2020 | PyTorch

        November 01, 2020

        Announcing PyTorch Developer Day 2020

        by Team PyTorch

        Starting this year, we plan to host two separate events for PyTorch: one for developers and users to discuss core technical development, ideas and roadmaps called “Developer Day”, and another for the PyTorch ecosystem and industry communities to showcase their work and discover opportunities to collaborate called “Ecosystem Day” (scheduled for early 2021).

        + +
        + +
        + +

        The PyTorch Developer Day (#PTD2) is kicking off on November 12, 2020, 8AM PST with a full day of technical talks on a variety of topics, including updates to the core framework, new tools and libraries to support development across a variety of domains. You’ll also see talks covering the latest research around systems and tooling in ML.

        + +

        For Developer Day, we have an online networking event limited to PyTorch maintainers and contributors, long-time stakeholders, and experts in areas relevant to PyTorch’s future. Conversations from the networking event will strongly shape the future of PyTorch. Hence, invitations are required to attend the networking event.

        + +

        All talks will be livestreamed and available to the public.

        + + +

        Visit the event website to learn more. We look forward to welcoming you to PyTorch Developer Day on November 12th!

        + +

        Thank you,

        + +

        The PyTorch team

        + +
diff --git a/blog/pytorch-developer-day-2021/index.html b/blog/pytorch-developer-day-2021/index.html
new file mode 100644
index 000000000000..960157ea6d74
--- /dev/null
+++ b/blog/pytorch-developer-day-2021/index.html
@@ -0,0 +1,666 @@
Announcing PyTorch Developer Day 2021 | PyTorch

        August 23, 2021

        Announcing PyTorch Developer Day 2021

        by Team PyTorch

        We are excited to announce PyTorch Developer Day (#PTD2), taking place virtually on December 1 and 2, 2021. Developer Day is designed for developers and users to discuss core technical developments, ideas, and roadmaps.

        + +
        + +
        + +

        Event Details

        +

        Technical Talks Live Stream - December 1, 2021

        + +

        Join us for technical talks on a variety of topics, including updates to the core framework, new tools and libraries to support development across a variety of domains, responsible AI and industry use cases. All talks will take place on December 1 and will be live streamed on PyTorch channels.

        + +

        Stay up to date by following us on our social channels: Twitter, Facebook, or LinkedIn.

        + +

        Poster Exhibition & Networking - December 2, 2021

        + +

        On the second day, we’ll be hosting an online poster exhibition on Gather.Town. There will be opportunities to meet the authors, learn more about their PyTorch projects, and network with the community. This poster and networking event is limited to PyTorch maintainers and contributors, long-time stakeholders, and experts in areas relevant to PyTorch’s future. Conversations from the networking event will strongly shape the future of PyTorch. As such, invitations are required to attend the networking event.

        + +

        Call for Content Now Open

        + +

        Submit your poster abstracts today! Please send us the title and brief summary of your project, tools and libraries that could benefit PyTorch researchers in academia and industry, application developers, and ML engineers for consideration. The focus must be on academic papers, machine learning research, or open-source projects related to PyTorch development, Responsible AI or Mobile. Please no sales pitches. Deadline for submission is September 24, 2021.

        + +

        Visit the event website for more information and we look forward to having you at PyTorch Developer Day.

        + +
diff --git a/blog/pytorch-docathon-h2-2023-wrap/index.html b/blog/pytorch-docathon-h2-2023-wrap/index.html
new file mode 100644
index 000000000000..48969e8399e5
--- /dev/null
+++ b/blog/pytorch-docathon-h2-2023-wrap/index.html
@@ -0,0 +1,666 @@
🎉 PyTorch Docathon H2 2023 Wrap-up 🎉 | PyTorch

        November 16, 2023

        🎉 PyTorch Docathon H2 2023 Wrap-up 🎉

        by Team PyTorch

        We are thrilled to announce the successful completion of the Fall 2023 PyTorch Docathon! The event was a resounding success, and we want to extend our heartfelt gratitude to all the participants who made it possible. The dedication, expertise, and tireless efforts of our open-source contributors have once again helped us improve PyTorch documentation.

        + +

        This Docathon ran from Nov 1 through Nov 15 with more than 170 registrants. The energy and enthusiasm were palpable, and entrants were judged on the difficulty of their submissions, which resulted in over TBA merged pull requests. We have fixed the PyTorch docstrings and made them compatible with the PEP 257 Python Docstring Conventions guidelines. We have also fixed multiple bugs in the pytorch/tutorials repo.

        + +

        We want to give a special shout-out to our top contributors, who went above and beyond during this event. Your dedication and expertise have been invaluable in enhancing the PyTorch documentation and empowering developers worldwide.

        + +

        Meet the top contributors:

        + + + +

        You can see the full docathon leaderboard published here.

        + +

        As we bring this Docathon to a close, we encourage each and every one of you to stay inspired and keep contributing to PyTorch documentation and code, and pushing the boundaries of what’s possible with PyTorch. Your collective efforts are shaping the landscape of deep learning and fostering innovation in the PyTorch community.

        + +

        Thank you again for your participation and support. We look forward to seeing what you will achieve next!

        + +

        Team PyTorch

        + +
diff --git a/blog/pytorch-docathon-h2-2024-wrap-up/index.html b/blog/pytorch-docathon-h2-2024-wrap-up/index.html
new file mode 100644
index 000000000000..af6ffaf327e5
--- /dev/null
+++ b/blog/pytorch-docathon-h2-2024-wrap-up/index.html
@@ -0,0 +1,666 @@
🎉 PyTorch Docathon H1 2024 Wrap-up 🎉 | PyTorch

        by Team PyTorch

        +

        We are thrilled to announce the successful completion of the H1 2024 PyTorch Docathon! The event was a resounding success, and we want to extend our heartfelt gratitude to all the participants who made it possible. The dedication, expertise, and tireless efforts of our open-source contributors have once again helped us improve PyTorch documentation.

        + +

        This Docathon ran from June 4 through June 20 with more than 176 registrants. The energy and enthusiasm were palpable, and entrants were judged on the difficulty of their submissions, which resulted in over 50 merged pull requests.

        + +

        We want to give a special shout-out to our top contributors, who went above and beyond during this event. Your dedication and expertise have been invaluable in enhancing the PyTorch documentation and empowering developers worldwide.

        + +

        Meet the top contributors

        + + + +

        For the full list of participants, see here.

        + +

        As we bring this Docathon to a close, we encourage each and every one of you to stay inspired and keep contributing to PyTorch documentation and code, and pushing the boundaries of what’s possible with PyTorch. Your collective efforts are shaping the landscape of deep learning and fostering innovation in the PyTorch community.

        + +

        Thank you again for your participation and support. We look forward to seeing what you will achieve next!

        + +

        Team PyTorch

        + +
diff --git a/blog/pytorch-documentary/index.html b/blog/pytorch-documentary/index.html
new file mode 100644
index 000000000000..95dcbd61e586
--- /dev/null
+++ b/blog/pytorch-documentary/index.html
@@ -0,0 +1,689 @@
Powering the AI Revolution: The PyTorch Documentary | PyTorch

        by The PyTorch Foundation

        +

        Now live: The official PyTorch Documentary! This film unveils the authentic narrative of PyTorch’s inception, attributing its existence to a dedicated group of unsung heroes driving technological innovation.

        + +

        The documentary shares the strength of the PyTorch community, resonating with our communities across the globe. We hope this story of PyTorch inspires greater contributions, attracts more contributors to the project, and fosters widespread recognition of PyTorch’s significance in the open source community.

        + + + +

        We couldn’t have produced this without the support of our PyTorch Foundation members and sponsors:

        + +

        company logos

        + +

        AMD

        + +

        “PyTorch’s growth and adoption in the AI community is a testament to open collaboration. The collective efforts of all the contributors have helped propel PyTorch as one of the most widely adopted AI frameworks in the industry. AMD is proud to be a part of this movement - making sure that the future of AI is open - and we are excited to continue contributing to this vibrant ecosystem.”


        – Niles Burbank, AMD


        AWS


        “The release of the PyTorch Documentary showcases the innovation and real-world impact of one of the most widely adopted open source machine learning frameworks. By supporting and contributing to the PyTorch community, AWS helps enable cutting-edge machine learning research that drives advancements in AI capabilities. We are excited about the documentary as it highlights the power of collaboration in propelling PyTorch to the forefront of machine learning and empowering developers and data scientists to create groundbreaking models. At AWS, we celebrate frameworks like PyTorch that foster environments where open source machine learning technologies can grow and benefit the community at-large, as well as our customers.”


        – Brian Granger, AWS


        Google Cloud


        “Google recognizes the impact of PyTorch on the AI community, providing researchers and developers with powerful, flexible tools for innovation. This documentary not only celebrates the remarkable achievements of the PyTorch community but also highlights the collaborative spirit driving advancements in AI. We look forward to continuing our support for PyTorch and fostering an open ecosystem that accelerates machine learning research and application.”


        – Dwarak Rajagopal, Google


        Meta


        “We have been so impressed with the growth and collaboration that PyTorch has created over the years. From very humble beginnings at Meta to a cornerstone in AI research and development, the documentary showcases the dedication of our contributors since the start. It’s an honor to be a part of something so impactful, and now it’s been documented for our community to take part in.”


        – Soumith Chintala, Meta


        Microsoft Azure


        “We’re truly excited about the premiere of the PyTorch Documentary. At Microsoft, PyTorch has been our default deep learning framework for building AI solutions including Microsoft Copilot. Additionally, we have made significant investments to create an optimized environment for our customers to develop, train, fine-tune and deploy their PyTorch workloads on Azure and Windows, furthering our commitment to democratize AI.”


        – Eric Boyd, Microsoft


        PyTorch Foundation


        “The release of the PyTorch documentary marks a significant milestone for our community, showcasing the incredible journey and rapid evolution of PyTorch. We are excited to share these stories and achievements with the world, and we look forward to continuing to foster innovation and growth of the PyTorch community and PyTorch’s evolving ecosystem.”


        – Matt White, PyTorch Foundation

        diff --git a/blog/pytorch-ecosystem/index.html b/blog/pytorch-ecosystem/index.html new file mode 100644 index 000000000000..cd029035fe8a --- /dev/null +++ b/blog/pytorch-ecosystem/index.html @@ -0,0 +1,745 @@

        PyTorch Adds New Ecosystem Projects for Encrypted AI and Quantum Computing, Expands PyTorch Hub | PyTorch

        by Team PyTorch


        The PyTorch ecosystem includes projects, tools, models and libraries from a broad community of researchers in academia and industry, application developers, and ML engineers. The goal of this ecosystem is to support, accelerate, and aid in your exploration with PyTorch and help you push the state of the art, no matter what field you are exploring. Similarly, we are expanding the recently launched PyTorch Hub to further help you discover and reproduce the latest research.


        In this post, we’ll highlight some of the projects that have been added to the PyTorch ecosystem this year and provide some context on the criteria we use to evaluate community projects. We’ll also provide an update on the fast-growing PyTorch Hub and share details on our upcoming PyTorch Summer Hackathon.


        Recently added ecosystem projects


        From private AI to quantum computing, we’ve seen the community continue to expand into new and interesting areas. The latest projects include:

        • Advertorch: A Python toolbox for adversarial robustness research. The primary functionalities are implemented in PyTorch. Specifically, AdverTorch contains modules for generating adversarial perturbations and defending against adversarial examples, as well as scripts for adversarial training.
        • botorch: A modular and easily extensible interface for composing Bayesian optimization primitives, including probabilistic models, acquisition functions, and optimizers.
        • Skorch: A high-level library for PyTorch that provides full scikit-learn compatibility (a minimal usage sketch follows this list).
        • PyTorch Geometric: A library for deep learning on irregular input data such as graphs, point clouds, and manifolds.
        • PySyft: A Python library for encrypted, privacy preserving deep learning.
        • PennyLane: A library for quantum ML, automatic differentiation, and optimization of hybrid quantum-classical computations.
        • Flair: A very simple framework for state-of-the-art natural language processing (NLP).
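        To make Skorch's scikit-learn compatibility concrete, here is a minimal usage sketch that is not taken from the original post; the classifier module and the random data are illustrative, and the defaults shown are worth checking against the skorch documentation.

        import numpy as np
        import torch.nn as nn
        from skorch import NeuralNetClassifier

        # A small PyTorch module that skorch will wrap with a scikit-learn interface.
        class MyClassifier(nn.Module):
            def __init__(self, num_features=20, num_classes=2):
                super().__init__()
                self.net = nn.Sequential(
                    nn.Linear(num_features, 32),
                    nn.ReLU(),
                    nn.Linear(32, num_classes),
                )

            def forward(self, x):
                return self.net(x)

        # skorch expects float32 features and int64 labels for classification.
        X = np.random.randn(100, 20).astype(np.float32)
        y = np.random.randint(0, 2, size=100).astype(np.int64)

        net = NeuralNetClassifier(MyClassifier, criterion=nn.CrossEntropyLoss, max_epochs=5, lr=0.05)
        net.fit(X, y)              # scikit-learn style training
        print(net.predict(X[:5]))  # scikit-learn style inference

        Because the wrapper follows the estimator API, the same object also drops into scikit-learn utilities such as GridSearchCV.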

        What makes a great project?


        When we review project submissions for the PyTorch ecosystem, we take into account a number of factors that we feel are important and that we would want in the projects we use ourselves. Some of these criteria include:

        1. Well-tested: Users should be confident that ecosystem projects will work well with PyTorch, and include support for CI to ensure that testing is occurring on a continuous basis and the project can run on the latest version of PyTorch.
        2. Clear utility: Users should understand where each project fits within the PyTorch ecosystem and the value it brings.
        3. Permissive licensing: Users must be able to utilize ecosystem projects without licensing concerns (e.g., BSD-3, Apache-2, and MIT licenses).
        4. Easy onboarding: Projects need to have support for binary installation options (pip/Conda), clear documentation and a rich set of tutorials (ideally built into Jupyter notebooks).
        5. Ongoing maintenance: Project authors need to be committed to supporting and maintaining their projects.
        6. Community: Projects should have (or be on track to building) an active, broad-based community.

        If you would like to have your project included in the PyTorch ecosystem and featured on pytorch.org/ecosystem, please complete the form here. If you’ve previously submitted a project for consideration and haven’t heard back, we promise to get back to you as soon as we can - we’ve received a lot of submissions!


        PyTorch Hub for reproducible research | New models


        Since launching the PyTorch Hub in beta, we’ve received a lot of interest from the community including the contribution of many new models. Some of the latest include U-Net for Brain MRI contributed by researchers at Duke University, Single Shot Detection from NVIDIA and Transformer-XL from HuggingFace.


        We’ve seen organic integration of the PyTorch Hub by folks like paperswithcode, making it even easier for you to try out the state of the art in AI research. In addition, companies like Seldon provide production-level support for PyTorch Hub models on top of Kubernetes.
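        For readers who have not tried the Hub yet, consuming a model takes only a couple of lines. The sketch below is illustrative rather than part of the original post; it uses the familiar pytorch/vision repository, and the pretrained=True flag reflects the convention at the time (newer torchvision releases use a weights argument instead).

        import torch

        # Discover the entry points a repository exposes through its hubconf.py.
        print(torch.hub.list('pytorch/vision'))

        # Download (and cache) the model definition plus weights, then run a dummy forward pass.
        model = torch.hub.load('pytorch/vision', 'resnet18', pretrained=True)
        model.eval()
        with torch.no_grad():
            out = model(torch.randn(1, 3, 224, 224))
        print(out.shape)  # torch.Size([1, 1000])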


        What are the benefits of contributing a model in the PyTorch Hub?

        • Compatibility: PyTorch Hub models are prioritized first for testing by the TorchScript and Cloud TPU teams, and used as baselines for researchers across a number of fields.
        • Visibility: Models in the Hub will be promoted on pytorch.org as well as on paperswithcode.
        • Ease of testing and reproducibility: Each model comes with code, clear preprocessing requirements, and methods/dependencies to run. There is also tight integration with Google Colab, making it a true single click to get started.

        PyTorch Hub contributions welcome!


        We are actively looking to grow the PyTorch Hub and welcome contributions. You don’t need to be an original paper author to contribute, and we’d love to see the number of domains and fields broaden. So what types of contributions are we looking for?

        • Artifacts of a published or an arXiv paper (or something of a similar nature that serves a different audience, such as ULMFit) that a large audience would need,

        AND

        • Reproduces the published results (or better).

        Overall these models are aimed at researchers either trying to reproduce a baseline, or trying to build downstream research on top of the model (such as feature-extraction or fine-tuning) as well as researchers looking for a demo of the paper for subjective evaluation. Please keep this audience in mind when contributing.
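        For contributors, the mechanics are lightweight: a hubconf.py file at the top level of your repository declares its pip dependencies and exposes one entry-point function per model. The sketch below is a hedged illustration only; the my_unet name and the checkpoint URL are placeholders, not anything actually published on the Hub.

        # hubconf.py at the repository root
        dependencies = ['torch']  # pip packages torch.hub should verify before loading

        import torch
        from torch import nn

        def my_unet(pretrained=False, **kwargs):
            """Illustrative entry point: must return an nn.Module, optionally with weights."""
            model = nn.Sequential(nn.Conv2d(3, 8, 3, padding=1), nn.ReLU())  # stand-in for a real model
            if pretrained:
                # Placeholder URL -- point this at the weights you actually release.
                state = torch.hub.load_state_dict_from_url(
                    'https://example.com/my_unet_weights.pth', progress=True)
                model.load_state_dict(state)
            return model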


        If you are short on inspiration or would just like to find out what the SOTA is in any given field or domain, check out the Paperswithcode state-of-the-art gallery.


        PyTorch Summer Hackathon


        We’ll be hosting the first PyTorch Summer Hackathon next month. We invite you to apply to participate in the in-person hackathon on August 8th to 9th at Facebook’s Menlo Park campus. We’ll be bringing the community together to work on innovative ML projects that can solve a broad range of complex challenges.


        Applications will be reviewed and accepted on a rolling basis until spaces are filled. For those who cannot join this Hackathon in person, we’ll be following up soon with other ways to participate.


        Please visit this link to apply.


        Thank you for being part of the PyTorch community!


        -Team PyTorch

        diff --git a/blog/pytorch-edge/index.html b/blog/pytorch-edge/index.html new file mode 100644 index 000000000000..3fb263519075 --- /dev/null +++ b/blog/pytorch-edge/index.html @@ -0,0 +1,675 @@

        PyTorch Edge: Enabling On-Device Inference Across Mobile and Edge Devices with ExecuTorch | PyTorch

        by the PyTorch Edge Team


        We are excited to announce ExecuTorch, our all-new solution for enabling on-device inference capabilities across mobile and edge devices with the backing of industry leaders like Arm, Apple, and Qualcomm Innovation Center.


        As part of PyTorch Edge’s vision for the future of the on-device AI stack and ecosystem, ExecuTorch addresses the fragmentation in the on-device AI ecosystem. It offers a design that provides extension points for seamless third-party integration to accelerate ML models on specialized hardware. Our partners have contributed custom delegate implementations to optimize model inference execution on their respective hardware platforms.


        We have created extensive documentation that provides more details about ExecuTorch’s architecture, its high-level components, example ML models running on ExecuTorch, and end-to-end tutorials for exporting and running a model on various hardware devices. We are excited to see all of the innovative use cases of ExecuTorch built by the community.
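        As a rough, hedged sketch of the flow those tutorials walk through (not code from this post): a model is first captured with torch.export from core PyTorch, and the captured program is then lowered and serialized with the ExecuTorch export APIs. The executorch.exir calls in the comments below are an assumption to check against the ExecuTorch documentation for your version.

        import torch

        class TinyModel(torch.nn.Module):
            def __init__(self):
                super().__init__()
                self.linear = torch.nn.Linear(16, 4)

            def forward(self, x):
                return torch.relu(self.linear(x))

        model = TinyModel().eval()
        example_inputs = (torch.randn(1, 16),)

        # Step 1: capture a full graph of the model with torch.export (core PyTorch 2.x).
        exported_program = torch.export.export(model, example_inputs)
        print(exported_program)

        # Step 2 (assumed ExecuTorch lowering -- verify the exact API in the ExecuTorch docs):
        # from executorch.exir import to_edge
        # edge_program = to_edge(exported_program)
        # with open("tiny_model.pte", "wb") as f:
        #     f.write(edge_program.to_executorch().buffer)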


        Key Components of ExecuTorch


        ExecuTorch offers a compact runtime with a lightweight operator registry to cover the PyTorch ecosystem of models, and a streamlined path to execute PyTorch programs on edge devices. These devices range from mobile phones to embedded hardware powered by specific delegates built by our partners. In addition, ExecuTorch ships with a Software Developer Kit (SDK) and toolchain that provide an ergonomic UX for ML developers to go from model authoring to training and device delegation in a single PyTorch workflow. This suite of tools enables ML developers to perform on-device model profiling and gives them better ways to debug the original PyTorch model.


        ExecuTorch is architected from the ground up in a composable manner to allow ML developers to make decisions on what components to leverage as well as entry points to extend them if needed. This design provides the following benefits to the ML community:

        • Portability: Compatibility with a wide variety of computing platforms, from high-end mobile phones to highly constrained embedded systems and microcontrollers.
        • Productivity: Enabling developers to use the same toolchains and SDK from PyTorch model authoring and conversion, to debugging and deployment to a wide variety of platforms, resulting in productivity gains.
        • Performance: Providing end users with a seamless and high-performance experience due to a lightweight runtime as well as its ability to utilize full hardware capabilities, including general purpose CPUs and specialized purpose microprocessors such as NPUs and DSPs.

        PyTorch Edge: from PyTorch Mobile to ExecuTorch


        Bringing research and production environments closer together is a fundamental goal of PyTorch. ML engineers increasingly use PyTorch to author and deploy machine learning models in highly dynamic and ever-evolving environments, from servers to edge devices such as mobile phones and embedded hardware.


        With the increasing adoption of AI in Augmented Reality (AR), Virtual Reality (VR), Mixed Reality (MR), Mobile, IoT and other domains, there is a growing need for an end-to-end on-device solution that is extensible, modular, and aligned with the PyTorch stack.


        PyTorch Edge builds on the same fundamental principle of improving research to production by enabling the deployment of various ML models (spanning vision, speech, NLP, translation, ranking, integrity and content creation tasks) to edge devices via a low-friction development and deployment process. It provides a framework stack that spans the universe of on-device use-cases that the PyTorch community cares about.


        PyTorch Edge provides the portability of core components that is required to reach a wide spectrum of devices, which are characterized by differing hardware configurations, performance, and efficiency. Such portability is achieved by allowing optimizations that are custom developed for the target use cases, and by supporting developer productivity via well defined entry points, representations, and tools that tie all of this together into a thriving ecosystem.


        PyTorch Edge is the future of the on-device AI stack and ecosystem for PyTorch. We are excited to see what the community builds with ExecuTorch’s on-device inference capabilities across mobile and edge devices backed by our industry partner delegates.


        Learn more about PyTorch Edge and ExecuTorch.

        diff --git a/blog/pytorch-enterprise-support-update/index.html b/blog/pytorch-enterprise-support-update/index.html new file mode 100644 index 000000000000..fcfc324e7cb0 --- /dev/null +++ b/blog/pytorch-enterprise-support-update/index.html @@ -0,0 +1,655 @@

        PyTorch Enterprise Support Program Update | PyTorch

        November 10, 2022


        PyTorch Enterprise Support Program Update


        by Team PyTorch


        On May 25, 2021, we announced the PyTorch Enterprise Support Program (ESP) that enabled providers to develop and offer tailored enterprise-grade support to their customers.


        The program enabled Program-certified service providers to develop and offer tailored enterprise-grade support to their customers through contributions of hotfixes and other improvements requested by PyTorch enterprise users who were developing models in production at scale for mission-critical applications. However, after evaluating community feedback, we found that ongoing ESP support was not necessary at this time, and we will immediately divert these resources to other areas to improve the user experience for the entire community.


        Today, we are removing the PyTorch long-term support (LTS 1.8.2) download link from the “Start Locally” download option on the “Get Started” page in order to simplify the user experience. PyTorch v1.8.2 can still be downloaded from the previous versions page. Please note that it is supported only for Python while it is being deprecated. If there are any updates to ESP/LTS, we will cover them in future blog posts.


        Please reach out to marketing@pytorch.org with any questions.

        diff --git a/blog/pytorch-feature-classification-changes/index.html b/blog/pytorch-feature-classification-changes/index.html new file mode 100644 index 000000000000..034e138eae36 --- /dev/null +++ b/blog/pytorch-feature-classification-changes/index.html @@ -0,0 +1,690 @@

        PyTorch feature classification changes | PyTorch

        July 28, 2020


        PyTorch feature classification changes


        by Team PyTorch


        Traditionally, features in PyTorch were classified as either stable or experimental, with an implicit third option of testing bleeding edge features by building master or through installing nightly builds (available via prebuilt whls). This has, in a few cases, caused some confusion around the level of readiness, commitment to the feature, and backward compatibility that can be expected from a user perspective. Moving forward, we’d like to better classify the three types of features as well as define explicitly here what each means from a user perspective.


        New Feature Designations


        We will continue to have three designations for features but, as mentioned, with a few changes: Stable, Beta (previously Experimental) and Prototype (previously Nightlies). Below is a brief description of each and a comment on the backward compatibility expected:


        Stable


        Nothing changes here. A stable feature means that the user value-add is or has been proven, the API isn’t expected to change, the feature is performant and all documentation exists to support end user adoption.


        Level of commitment: We expect to maintain these features long term; generally there should be no major performance limitations or gaps in documentation, and we also expect to maintain backwards compatibility (although breaking changes can happen, and notice will be given one release ahead of time).


        Beta


        We previously called these features ‘Experimental’ and we found that this created confusion amongst some of the users. In the case of Beta level features, the value add, similar to a Stable feature, has been proven (e.g. pruning is a commonly used technique for reducing the number of parameters in NN models, independent of the implementation details of our particular choices) and the feature generally works and is documented. This feature is tagged as Beta because the API may change based on user feedback, because the performance needs to improve, or because coverage across operators is not yet complete.


        Level of commitment: We are committing to seeing the feature through to the Stable classification. We are however not committing to Backwards Compatibility. Users can depend on us providing a solution for problems in this area going forward, but the APIs and performance characteristics of this feature may change.


        Prototype


        Previously these were features that were known about by developers who paid close attention to RFCs and to features that land in master. These features are part of the release and are available as part of binary distributions like PyPI or Conda. We would like to get high bandwidth partner feedback ahead of a real release in order to gauge utility and any changes we need to make to the UX. For each prototype feature, a pointer to draft docs or other instructions will be provided.


        Level of commitment: We are committing to gathering high bandwidth feedback only. Based on this feedback and potential further engagement between community members, we as a community will decide if we want to upgrade the level of commitment or to fail fast. Additionally, while some of these features might be more speculative (e.g. new Frontend APIs), others have obvious utility (e.g. model optimization) but may be in a state where gathering feedback outside of high bandwidth channels is not practical, e.g. the feature may be in an earlier state, may be moving fast (PRs are landing too quickly to catch a major release) and/or generally active development is underway.


        What changes for current features?


        First and foremost, you can find these designations on pytorch.org/docs. We will also be linking any early stage features here for clarity.


        Additionally, the following features will be reclassified under this new rubric:

        1. High Level Autograd APIs: Beta (was Experimental)
        2. Eager Mode Quantization: Beta (was Experimental)
        3. Named Tensors: Prototype (was Experimental)
        4. TorchScript/RPC: Prototype (was Experimental)
        5. Channels Last Memory Layout: Beta (was Experimental)
        6. Custom C++ Classes: Beta (was Experimental)
        7. PyTorch Mobile: Beta (was Experimental)
        8. Java Bindings: Beta (was Experimental)
        9. Torch.Sparse: Beta (was Experimental)

        Cheers,


        Joe, Greg, Woo & Jessica

        diff --git a/blog/pytorch-for-amd-rocm-platform-now-available-as-python-package/index.html b/blog/pytorch-for-amd-rocm-platform-now-available-as-python-package/index.html new file mode 100644 index 000000000000..408b7a184097 --- /dev/null +++ b/blog/pytorch-for-amd-rocm-platform-now-available-as-python-package/index.html @@ -0,0 +1,715 @@

        PyTorch for AMD ROCm™ Platform now available as Python package | PyTorch

        by Niles Burbank – Director PM at AMD, Mayank Daga – Director, Deep Learning Software at AMD


        With the PyTorch 1.8 release, we are delighted to announce a new installation option for users of PyTorch on the ROCm™ open software platform. An installable Python package is now hosted on pytorch.org, along with instructions for local installation in the same simple, selectable format as PyTorch packages for CPU-only configurations and other GPU platforms. PyTorch on ROCm includes full capability for mixed-precision and large-scale training using AMD’s MIOpen & RCCL libraries. This provides a new option for data scientists, researchers, students, and others in the community to get started with accelerated PyTorch using AMD GPUs.

        The ROCm Ecosystem


        ROCm is AMD’s open source software platform for GPU-accelerated high performance computing and machine learning. Since the original ROCm release in 2016, the ROCm platform has evolved to support additional libraries and tools, a wider set of Linux® distributions, and a range of new GPUs. This includes the AMD Instinct™ MI100, the first GPU based on AMD CDNA™ architecture.


        The ROCm ecosystem has an established history of support for PyTorch, which was initially implemented as a fork of the PyTorch project, and more recently through ROCm support in the upstream PyTorch code. PyTorch users can install PyTorch for ROCm using AMD’s public PyTorch docker image, and can of course build PyTorch for ROCm from source. With PyTorch 1.8, these existing installation options are now complemented by the availability of an installable Python package.


        The primary focus of ROCm has always been high performance computing at scale. The combined capabilities of ROCm and AMD’s Instinct family of data center GPUs are particularly suited to the challenges of HPC at data center scale. PyTorch is a natural fit for this environment, as HPC and ML workflows become more intertwined.


        Getting started with PyTorch for ROCm


        The scope for this build of PyTorch is AMD GPUs with ROCm support, running on Linux. The GPUs supported by ROCm include all of AMD’s Instinct family of compute-focused data center GPUs, along with some other select GPUs. A current list of supported GPUs can be found in the ROCm Github repository. After confirming that the target system includes supported GPUs and the current 4.0.1 release of ROCm, installation of PyTorch follows the same simple Pip-based installation as any other Python package. As with PyTorch builds for other platforms, the configurator at https://pytorch.org/get-started/locally/ provides the specific command line to be run.
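        Once installed, a quick sanity check can confirm the ROCm build is active. This snippet is illustrative rather than from the original post, and it assumes the ROCm build's behavior of reusing the torch.cuda interface and populating torch.version.hip.

        import torch

        print(torch.__version__)
        print("HIP/ROCm version:", torch.version.hip)       # None on CPU-only or CUDA builds
        print("GPU available:", torch.cuda.is_available())  # True when a supported AMD GPU is visible

        if torch.cuda.is_available():
            # On ROCm builds the "cuda" device string maps to the AMD GPU.
            x = torch.randn(1024, 1024, device="cuda")
            y = x @ x
            print(y.device, y.shape)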


        PyTorch for ROCm is built from the upstream PyTorch repository, and is a full featured implementation. Notably, it includes support for distributed training across multiple GPUs and supports accelerated mixed precision training.


        More information


        • A list of ROCm supported GPUs and operating systems: https://github.com/RadeonOpenCompute/ROCm
        • General documentation on the ROCm platform: https://rocmdocs.amd.com/en/latest/
        • ROCm Learning Center: https://developer.amd.com/resources/rocm-resources/rocm-learning-center/
        • General information on AMD’s offerings for HPC and ML: https://amd.com/hpc


        Feedback


        An engaged user base is a tremendously important part of the PyTorch ecosystem. We would be deeply appreciative of feedback on the PyTorch for ROCm experience in the PyTorch discussion forum and, where appropriate, reporting any issues via Github.

        diff --git a/blog/pytorch-hackathon-2021/index.html b/blog/pytorch-hackathon-2021/index.html new file mode 100644 index 000000000000..39356cb8954b --- /dev/null +++ b/blog/pytorch-hackathon-2021/index.html @@ -0,0 +1,695 @@

        Announcing PyTorch Annual Hackathon 2021 | PyTorch

        September 08, 2021


        Announcing PyTorch Annual Hackathon 2021


        by Team PyTorch


        We’re excited to announce the PyTorch Annual Hackathon 2021! This year, we’re looking to support the community in creating innovative PyTorch tools, libraries, and applications. 2021 is the third year we’re hosting this Hackathon, and we welcome you to join the PyTorch community and put your machine learning skills into action. Submissions start on September 8 and end on November 3. Good luck to everyone!


        Submission Categories


        You can enter your PyTorch projects into three categories:

        • PyTorch Responsible AI Development Tools & Libraries - Build an AI development tool or library that helps develop AI models and applications responsibly. These tools, libraries, and apps need to support a researcher or developer to factor in fairness, security, and privacy throughout the entire machine learning development process of data gathering, model training, model validation, inferences, monitoring, and more.
        • Web and Mobile Applications Powered by PyTorch - Build an application with the web, mobile interface, and/or embedded device powered by PyTorch so the end users can interact with it. The submission must be built on PyTorch or use PyTorch-based libraries such as torchvision, torchtext, and fast.ai.
        • PyTorch Developer Tools & Libraries - Build a creative, useful, and well-implemented tool or library for improving the productivity and efficiency of PyTorch researchers and developers. The submission must be a machine learning algorithm, model, or application built using PyTorch or PyTorch-based libraries.

        Prizes


        Submissions will be judged on the idea’s quality, originality, implementation, and potential impact.

        • First-Place Winners in each category of the Hackathon will receive $5,000 in cash, along with a 30-minute call with the PyTorch development team.
        • Second-Place Winners will receive $3,000.
        • Third-Place Winners will receive $2,000.

        All winners will also receive the opportunity to create blog posts that will be featured throughout PyTorch channels as well as an exclusive Github badge. Honorable Mentions will also be awarded to the following three highest-scoring entries in each category and will receive $1,000 each.


        Cloud Computing Credits


        Request $100 in credits from Amazon Web Services or Google Cloud for your computing costs. Please allow 3 business days for your request to be reviewed. Credits will be provided to verified registrants until the supplies run out. For more information, see https://pytorch2021.devpost.com/details/sponsors.


        2020 Winning Projects


        DeMask won first place in the PyTorch Developer Tools category. Built using Asteroid, a PyTorch-based audio source separation toolkit, DeMask is an end-to-end model for enhancing speech while wearing face masks.


        Q&Aid won first place in the Web/Mobile Applications Powered by PyTorch category. Backed by PyTorch core algorithms and models, Q&Aid is a conceptual health care chatbot aimed at making health care diagnoses and facilitating communication between patients and doctors.


        FairTorch won first place in the PyTorch Responsible AI Development Tools category. FairTorch is a PyTorch fairness library that lets developers add constraints to their models to equalize metrics across subgroups by simply adding a few lines of code.


        How to Join


        If you’re interested in joining this year’s PyTorch Hackathon, register at http://pytorch2021.devpost.com.

        diff --git a/blog/pytorch-landscape/index.html b/blog/pytorch-landscape/index.html new file mode 100644 index 000000000000..0c1c2a1c6d27 --- /dev/null +++ b/blog/pytorch-landscape/index.html @@ -0,0 +1,680 @@

        Introducing the New PyTorch Landscape: Your Guide to the PyTorch Ecosystem | PyTorch

        by Team PyTorch


        We’re excited to reveal our brand new PyTorch Landscape. The PyTorch Landscape helps researchers, developers, and organizations easily locate useful, curated, community-built tools that augment the PyTorch core framework.


        landscape banner


        What the Landscape Offers


        The Landscape visually organizes projects into three categories—Modeling, Training, and Optimizations—making finding relevant frameworks, libraries, and projects easy. Users can quickly locate curated, valuable tools for a variety of use cases that complement the PyTorch framework. Each tool that is part of the Landscape has been reviewed and vetted by PyTorch project experts. The projects in the Landscape are considered to be mature and healthy and provide valuable capabilities that complement the PyTorch framework in their respective domains.


        Explore the AI Landscape


        The Explore page presents platforms, tools, and libraries, each with a logo, description, and links to GitHub and further details. This categorized, visual approach simplifies discovery and provides quick access to essential technologies.


        Guide Page: A Closer Look


        For deeper insights, the Guide page expands on each project, highlighting methodologies and trends shaping AI development, from adversarial robustness to self-supervised learning. There are also project statistics provided for each project, including metrics such as number of stars, contributors, commit history, languages used, license, and other valuable metrics that provide an in-depth understanding of the project and how it may be used.


        Tracking AI’s Growth: The Stats Page


        The Stats page provides insights into AI development trends, tracking repository activity, programming languages, and industry funding data.

        • Repositories: 117 repositories, 20.5k contributors, and 797.2k stars across 815MB of source code.
        • Development Trends: Weekly commit activity over the last year.
        • Licensing Breakdown: Repositories are categorized by license type.
        • Funding & Acquisitions: Insights into investment trends, including funding rounds and acquisitions.

        Why Use the PyTorch Landscape?


        Finding useful and high quality open source projects that complement the PyTorch core system can be overwhelming. The PyTorch Landscape offers a clear, accessible way to explore the ecosystem of community-built tools, whether you’re researching, building models, or making strategic decisions.


        Stay ahead with the PyTorch Landscape — your guide to the PyTorch Ecosystem.


        Want to Contribute a Project to the PyTorch Landscape?


        Have you built a useful open source tool that you would like to share with the PyTorch community? Then help us grow the Ecosystem by contributing your tool! You can find the instructions to apply here. We welcome all contributions from the community!

        diff --git a/blog/pytorch-library-updates-new-model-serving-library/index.html b/blog/pytorch-library-updates-new-model-serving-library/index.html new file mode 100644 index 000000000000..6619b31965d5 --- /dev/null +++ b/blog/pytorch-library-updates-new-model-serving-library/index.html @@ -0,0 +1,728 @@

        PyTorch library updates including new model serving library | PyTorch

        by Team PyTorch


        Along with the PyTorch 1.5 release, we are announcing new libraries for high-performance PyTorch model serving and tight integration with TorchElastic and Kubernetes. Additionally, we are releasing updated packages for torch_xla (Google Cloud TPUs), torchaudio, torchvision, and torchtext. All of these new libraries and enhanced capabilities are available today and accompany all of the core features released in PyTorch 1.5.


        TorchServe (Experimental)


        TorchServe is a flexible and easy to use library for serving PyTorch models in production performantly at scale. It is cloud and environment agnostic and supports features such as multi-model serving, logging, metrics, and the creation of RESTful endpoints for application integration. TorchServe was jointly developed by engineers from Facebook and AWS with feedback and engagement from the broader PyTorch community. The experimental release of TorchServe is available today. Some of the highlights include:

        • Support for both Python-based and TorchScript-based models
        • Default handlers for common use cases (e.g., image segmentation, text classification) as well as the ability to write custom handlers for other use cases (a minimal handler sketch follows this list)
        • Model versioning, the ability to run multiple versions of a model at the same time, and the ability to roll back to an earlier version
        • The ability to package a model, learning weights, and supporting files (e.g., class mappings, vocabularies) into a single, persistent artifact (a.k.a. the “model archive”)
        • Robust management capability, allowing full configuration of models, versions, and individual worker threads via command line, config file, or run-time API
        • Automatic batching of individual inferences across HTTP requests
        • Logging including common metrics, and the ability to incorporate custom metrics
        • Ready-made Dockerfile for easy deployment
        • HTTPS support for secure deployment
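        As a hedged illustration of the custom-handler extension point mentioned above (not code from this post), the sketch below subclasses the BaseHandler class shipped with TorchServe. The featurization is a deliberate stand-in, and the exact handler contract should be checked against the TorchServe documentation for your version.

        import torch
        from ts.torch_handler.base_handler import BaseHandler

        class SentimentHandler(BaseHandler):
            """Illustrative handler: raw text in, predicted label index out."""

            def preprocess(self, data):
                # Each request row carries its payload under "data" or "body".
                texts = [row.get("data") or row.get("body") for row in data]
                # Stand-in featurization -- replace with your real tokenizer/preprocessing.
                return torch.tensor([[float(len(str(t)))] for t in texts])

            def inference(self, inputs):
                # self.model is loaded by BaseHandler.initialize from the model archive.
                with torch.no_grad():
                    return self.model(inputs)

            def postprocess(self, outputs):
                return outputs.argmax(dim=-1).tolist()  # one prediction per request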

        To learn more about the APIs and the design of this feature, see the links below:

        • See for a full multi-node deployment reference architecture.
        • The full documentation can be found here.

        TorchElastic integration with Kubernetes (Experimental)


        TorchElastic is a proven library for training large scale deep neural networks at scale within companies like Facebook, where having the ability to dynamically adapt to server availability and scale as new compute resources come online is critical. Kubernetes enables customers using machine learning frameworks like PyTorch to run training jobs distributed across fleets of powerful GPU instances like the Amazon EC2 P3. Distributed training jobs, however, are not fault-tolerant, and a job cannot continue if a node failure or reclamation interrupts training. Further, jobs cannot start without acquiring all required resources, or scale up and down without being restarted. This lack of resiliency and flexibility results in increased training time and costs from idle resources. TorchElastic addresses these limitations by enabling distributed training jobs to be executed in a fault-tolerant and elastic manner. Until today, Kubernetes users needed to manage Pods and Services required for TorchElastic training jobs manually.


        Through the joint collaboration of engineers at Facebook and AWS, TorchElastic, adding elasticity and fault tolerance, is now supported using vanilla Kubernetes and through the managed EKS service from AWS.


        To learn more see the TorchElastic repo for the controller implementation and docs on how to use it.


        torch_xla 1.5 now available


        torch_xla is a Python package that uses the XLA linear algebra compiler to accelerate the PyTorch deep learning framework on Cloud TPUs and Cloud TPU Pods. torch_xla aims to give PyTorch users the ability to do everything they can do on GPUs on Cloud TPUs as well while minimizing changes to the user experience. The project began with a conversation at NeurIPS 2017 and gathered momentum in 2018 when teams from Facebook and Google came together to create a proof of concept. We announced this collaboration at PTDC 2018 and made the PyTorch/XLA integration broadly available at PTDC 2019. The project already has 28 contributors, nearly 2k commits, and a repo that has been forked more than 100 times.
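        For readers new to torch_xla, the sketch below shows the minimal changes relative to ordinary PyTorch code; it is illustrative (not from the original post) and assumes a Colab or Cloud TPU runtime with the torch_xla package installed.

        import torch
        import torch.nn.functional as F
        import torch_xla.core.xla_model as xm

        device = xm.xla_device()                      # an available Cloud TPU core
        model = torch.nn.Linear(10, 2).to(device)
        optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

        x = torch.randn(8, 10, device=device)
        y = torch.randint(0, 2, (8,), device=device)

        loss = F.cross_entropy(model(x), y)
        loss.backward()
        xm.optimizer_step(optimizer, barrier=True)    # steps the optimizer and syncs the XLA graph
        print(loss.item())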


        This release of torch_xla is aligned and tested with PyTorch 1.5 to reduce friction for developers and to provide a stable and mature PyTorch/XLA stack for training models using Cloud TPU hardware. You can try it for free in your browser on an 8-core Cloud TPU device with Google Colab, and you can use it at a much larger scale on Google Cloud.


        See the full torch_xla release notes here. Full docs and tutorials can be found here and here.


        PyTorch Domain Libraries


        torchaudio, torchvision, and torchtext complement PyTorch with common datasets, models, and transforms in each domain area. We’re excited to share new releases for all three domain libraries alongside PyTorch 1.5 and the rest of the library updates. For this release, all three domain libraries are removing support for Python2 and will support Python3 only.


        torchaudio 0.5


        The torchaudio 0.5 release includes new transforms, functionals, and datasets. Highlights for the release include:

        • Added the Griffin-Lim functional and transform, InverseMelScale and Vol transforms, and DB_to_amplitude (a short Griffin-Lim sketch follows this list).
        • Added support for allpass, fade, bandpass, bandreject, band, treble, deemph, and riaa filters and transformations.
        • New datasets added including LJSpeech and SpeechCommands datasets.

        See the release full notes here and full docs can be found here.


        torchvision 0.6


        The torchvision 0.6 release includes updates to datasets, models and a significant number of bug fixes. Highlights include:

        • Faster R-CNN now supports negative samples which allows the feeding of images without annotations at training time (a minimal sketch follows this list).
        • Added aligned flag to RoIAlign to match Detectron2.
        • Refactored abstractions for C++ video decoder

        See the release full notes here and full docs can be found here.


        torchtext 0.6


        The torchtext 0.6 release includes a number of bug fixes and improvements to documentation. Based on user’s feedback, dataset abstractions are currently being redesigned also. Highlights for the release include:

        • Fixed an issue related to the SentencePiece dependency in conda package.
        • Added support for the experimental IMDB dataset to allow a custom vocab.
        • A number of documentation updates including adding a code of conduct and a deduplication of the docs on the torchtext site.

        Your feedback and discussions on the experimental datasets API are welcomed. You can send them to issue #664. We would also like to highlight the pull request here where the latest dataset abstraction is applied to the text classification datasets. The feedback can be beneficial to finalizing this abstraction.


        See the release full notes here and full docs can be found here.


        We’d like to thank the entire PyTorch team, the Amazon team and the community for all their contributions to this work.


        Cheers!


        Team PyTorch

        diff --git a/blog/pytorch-native-architecture-optimization/index.html b/blog/pytorch-native-architecture-optimization/index.html new file mode 100644 index 000000000000..fb16059c504c --- /dev/null +++ b/blog/pytorch-native-architecture-optimization/index.html @@ -0,0 +1,1194 @@

        PyTorch Native Architecture Optimization: torchao | PyTorch

        by Team PyTorch


        We’re happy to officially launch torchao, a PyTorch native library that makes models faster and smaller by leveraging low bit dtypes, quantization and sparsity. torchao is an accessible toolkit of techniques written (mostly) in easy to read PyTorch code spanning both inference and training. This blog will help you pick which techniques matter for your workloads.


        We benchmarked our techniques on popular GenAI models like LLama 3 and Diffusion models and saw minimal drops in accuracy. Unless otherwise noted the baselines are bf16 run on A100 80GB GPU.


        Our topline metrics for llama 3 are

        • 97% speedup for Llama 3 8B inference using autoquant with int4 weight only quantization and hqq
        • 73% peak VRAM reduction for Llama 3.1 8B inference at 128K context length with a quantized KV cache
        • 50% speedup for Llama 3 70B pretraining using float8 training on H100
        • 30% peak VRAM reduction for Llama 3 8B using 4 bit quantized optimizers.

        Our topline metrics for diffusion model inference

        • 53% speedup using float8 dynamic quantization inference with float8 row-wise scaling on flux1.dev on H100
        • 50% reduction in model VRAM for CogVideoX using int8 dynamic quantization

        Below we’ll walk through some of the techniques available in torchao you can apply to your models for inference and training.


        Inference


        Our inference quantization algorithms work over arbitrary PyTorch models that contain nn.Linear layers. Weight only and dynamic activation quantization for various dtypes and sparse layouts can be chosen using our top level quantize_ api

        from torchao.quantization import (
            quantize_,
            int4_weight_only,
        )
        quantize_(model, int4_weight_only())

        Sometimes quantizing a layer can make it slower because of overhead, so if you’d rather have us pick how to quantize each layer in a model for you, you can instead run

        model = torchao.autoquant(torch.compile(model, mode='max-autotune'))

        quantize_ API has a few different options depending on whether your model is compute bound or memory bound.

        from torchao.quantization import (
            # Memory bound models
            int4_weight_only,
            int8_weight_only,

            # Compute bound models
            int8_dynamic_activation_int8_semi_sparse_weight,
            int8_dynamic_activation_int8_weight,

            # Device capability 8.9+
            float8_weight_only,
            float8_dynamic_activation_float8_weight,
        )

        We also have extensive benchmarks on diffusion models in collaboration with the HuggingFace diffusers team in diffusers-torchao where we demonstrated 53.88% speedup on Flux.1-Dev and 27.33% speedup on CogVideoX-5b

        + +

        + +

Our APIs are composable: for example, we’ve composed sparsity and quantization to bring a 5% speedup for ViT-H inference, as sketched below.
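As a rough illustration of that composition (a minimal sketch; `model` is a placeholder for your own nn.Module, and this uses the int8_dynamic_activation_int8_semi_sparse_weight config shown above):

from torchao.quantization import (
    quantize_,
    int8_dynamic_activation_int8_semi_sparse_weight,
)

# Compose 2:4 semi-structured sparsity with int8 dynamic activation quantization
quantize_(model, int8_dynamic_activation_int8_semi_sparse_weight())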

        + +

We can also do things like quantize weights to int4 and the KV cache to int8, which lets Llama 3.1 8B run at the full 128K context length in under 18.9GB of VRAM.
        +

        + +

        QAT

        + +

Post-training quantization, especially at less than 4 bits, can suffer from serious accuracy degradation. Using Quantization Aware Training (QAT) we’ve managed to recover up to 96% of the accuracy degradation on hellaswag. We’ve integrated this as an end-to-end recipe in torchtune with a minimal tutorial; a rough sketch of the flow is shown below.
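For orientation only, a hedged sketch of what a QAT flow can look like with torchao’s prototype QAT quantizer (the exact import path and class name may differ across torchao versions, and `model` plus the training loop are placeholders):

# Assumed import path; check your torchao version for the current location
from torchao.quantization.prototype.qat import Int8DynActInt4WeightQATQuantizer

qat_quantizer = Int8DynActInt4WeightQATQuantizer()

# Insert "fake quantize" ops so training sees quantization error
model = qat_quantizer.prepare(model)

# ... run your normal fine-tuning loop on `model` here ...

# Convert the fake-quantized modules into actually quantized ones
model = qat_quantizer.convert(model)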

        + +

        + +

        Training

        + +

        Low precision compute and communications

        + +

torchao provides easy-to-use e2e workflows for reducing the precision of training compute and distributed communications, starting with float8 for `torch.nn.Linear` layers. Here is a one-liner to convert the compute gemms of your training run to float8:

        + +
        from torchao.float8 import convert_to_float8_training  
        +convert_to_float8_training(model)
        +
        + +

        For an e2e example of how to speed up LLaMa 3 70B pretraining by up to 1.5x with float8, see our README, and torchtitan’s blog and float8 recipe.

        + +

        Performance and accuracy of float8 pretraining of LLaMa 3 70B, vs bfloat16

        + +

        +(source: https://dev-discuss.pytorch.org/t/enabling-float8-all-gather-in-fsdp2/2359)

        + +

        We are expanding our training workflows to more dtypes and layouts

        + +
1. NF4 QLoRA in torchtune
2. Prototype int8 training support
3. Accelerated sparse 2:4 training
        + +

        Low bit Optimizers

        + +

Inspired by Bits and Bytes, we’ve also added prototype support for 8- and 4-bit optimizers as a drop-in replacement for AdamW.

        + +
        from torchao.prototype.low_bit_optim import AdamW8bit, AdamW4bit  
        +optim = AdamW8bit(model.parameters())
        +
        + +

        + +

        Integrations

        + +

        We’ve been actively working on making sure torchao works well in some of the most important projects in open source.

        + +
1. HuggingFace transformers as an inference backend
2. In diffusers-torchao as a reference implementation for accelerating diffusion models
3. In HQQ for fast 4 bit inference
4. In torchtune for PyTorch native QLoRA and QAT recipes
5. In torchchat for post training quantization
6. In SGLang for int4 and int8 post training quantization
        + +

        Conclusion

        + +

        If you’re interested in making your models faster and smaller for training or inference, we hope you’ll find torchao useful and easy to integrate.

        + +

        pip install torchao

        + +

        There are a lot of things we’re excited about next ranging from going lower than 4 bit, performant kernels for high-throughput inference, expanding to more layers, scaling types or granularities, MX hardware support and supporting more hardware backends. If any of the above sounds exciting you can follow our progress at: https://github.com/pytorch/ao

        + +

        If you’re interested in working on torchao, we’ve created a contributors guide, and if you have any questions we hang out on the #torchao channel on discord.gg/gpumode

        + +

        Acknowledgements

        + +

        We are fortunate to stand on the shoulders of giants and collaborate with some of the best people in open source. Thank you!

        + +
1. Bits and Bytes for pioneering work in low bit optimizers and QLoRA
2. Answer.ai for their engineering work to get FSDP and QLoRA composing
3. Mobius Labs for the lovely back and forths on quantization algorithms and low bit kernels
4. HuggingFace transformers for their help in battle testing and integrating our work
5. HuggingFace diffusers for our collaboration on extensive benchmarks and best practices
6. torch.compile so we could write our algorithms in pure PyTorch
7. GPU MODE for most of our early contributors
        + +
        +
        +
        +
        + + +
        +
        +
        +
        +

        +
        +
        +
        + +
        + +
        + +
        +
        +
        +
        + + +
        +
        +
        + + +
        + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/pytorch-profiler-1.9-released/index.html b/blog/pytorch-profiler-1.9-released/index.html new file mode 100644 index 000000000000..f16695f2d6e0 --- /dev/null +++ b/blog/pytorch-profiler-1.9-released/index.html @@ -0,0 +1,854 @@ + + + + + + + + + + + + + What’s New in PyTorch Profiler 1.9? | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
        +
        +
        +
        + +
        +
        + + +
        + + + + + + + + +
        + +
        +
        + +
        +
        +

        August 03, 2021

        +

        + What’s New in PyTorch Profiler 1.9? +

        +
        +
        + +
        +
        +
        + +
        +

        + by + + Sabrina Smai, Program Manager on the AI Framework team at Microsoft + +

        +

        PyTorch Profiler v1.9 has been released! The goal of this new release (previous PyTorch Profiler release) is to provide you with new state-of-the-art tools to help diagnose and fix machine learning performance issues regardless of whether you are working on one or numerous machines. The objective is to target the execution steps that are the most costly in time and/or memory, and visualize the work load distribution between GPUs and CPUs.

        + +

        Here is a summary of the five major features being released:

        + +
1. Distributed Training View: This helps you understand how much time and memory is consumed in your distributed training job. Many issues occur when you take a training model and split the load into worker nodes to be run in parallel, as it can be a black box. The overall goal is to speed up model training. This distributed training view will help you diagnose and debug issues within individual nodes.
2. Memory View: This view allows you to understand your memory usage better. This tool will help you avoid the famously pesky Out of Memory error by showing active memory allocations at various points of your program run.
3. GPU Utilization Visualization: This tool helps you make sure that your GPU is being fully utilized.
4. Cloud Storage Support: The TensorBoard plugin can now read profiling data from Azure Blob Storage, Amazon S3, and Google Cloud Platform.
5. Jump to Source Code: This feature allows you to visualize stack tracing information and jump directly into the source code. This helps you quickly optimize and iterate on your code based on your profiling results.
        + +

        Getting Started with PyTorch Profiling Tool

        +

PyTorch includes a profiling functionality called “PyTorch Profiler”. The PyTorch Profiler tutorial can be found here.

        + +

        To instrument your PyTorch code for profiling, you must:

        + +

        $ pip install torch-tb-profiler

        + +
        import torch.profiler as profiler
+with profiler.profile(XXXX)
        +
        + +

        Comments:

        + +

        • For CUDA and CPU profiling, see below:

        +
        with torch.profiler.profile( 
        +activities=[ 
        +torch.profiler.ProfilerActivity.CPU, 
        +torch.profiler.ProfilerActivity.CUDA], 
        +
        + +

• with profiler.record_function("$NAME"): allows you to put a label (a tag associated with a name) around a block of code

        + +

• The profile_memory=True parameter of profiler.profile allows you to profile the CPU and GPU memory footprint; a combined sketch follows below

        + +
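Putting those options together, a minimal sketch (for illustration only; `model` and `inputs` are placeholders for your own module and data, and the trace directory is arbitrary):

import torch
import torch.profiler

with torch.profiler.profile(
    activities=[
        torch.profiler.ProfilerActivity.CPU,
        torch.profiler.ProfilerActivity.CUDA,
    ],
    profile_memory=True,
    on_trace_ready=torch.profiler.tensorboard_trace_handler("./log"),
) as prof:
    with torch.profiler.record_function("forward_pass"):
        output = model(inputs)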

        Visualizing PyTorch Model Performance using PyTorch Profiler

        + +

        Distributed Training

        + +

        Recent advances in deep learning argue for the value of large datasets and large models, which requires you to scale out model training to more computational resources. Distributed Data Parallel (DDP) and NVIDIA Collective Communications Library (NCCL) are the widely adopted paradigms in PyTorch for accelerating your deep learning training.

        + +

        In this release of PyTorch Profiler, DDP with NCCL backend is now supported.

        + +
        + +
        + +

        Computation/Communication Overview

        + +

In the Computation/Communication overview under the Distributed training view, you can observe the computation-to-communication ratio of each worker and the load balancing (https://en.wikipedia.org/wiki/Load_balancing_(computing)) across workers, as measured by granularity.

        + +

        Scenario 1:

        + +

If the computation and overlapping time of one worker is much larger than that of the others, this may suggest an issue in the workload balance, or that a worker is a straggler. Computation is the sum of kernel time on GPU minus the overlapping time. The overlapping time is the time saved by interleaving communications during computation. More overlapping time represents better parallelism between computation and communication. Ideally the computation and communication completely overlap with each other. Communication is the total communication time minus the overlapping time. The example image below displays how this scenario appears on TensorBoard.

        + +
        + +

        Figure: A straggler example

        +
        + +

        Scenario 2:

        + +

If there is a small batch size (i.e. less computation on each worker) or the data to be transferred is large, the computation-to-communication ratio may also be small, and will show up in the profiler as low GPU utilization and long waiting times. This computation/communication view allows you to diagnose your code and reduce communication by adopting gradient accumulation, or to decrease the communication proportion by increasing batch size. DDP communication time depends on model size, and batch size has no relationship with model size, so increasing batch size makes computation time longer and the computation-to-communication ratio bigger.

        + +

        Synchronizing/Communication Overview

        + +

In the Synchronizing/Communication view, you can observe the efficiency of communication. This is done by taking the step time minus computation and communication time. Synchronizing time is the part of the total communication time spent waiting for and synchronizing with other workers. The Synchronizing/Communication view includes initialization, data loader, CPU computation, and so on. Insights such as what ratio of the total communication time is really used for exchanging data, and how much is idle time waiting for data from other workers, can be drawn from this view.

        + +
        + +
        + +

        For example, if there is an inefficient workload balance or straggler issue, you’ll be able to identify it in this Synchronizing/Communication view. This view will show several workers’ waiting time being longer than others.

        + +
        + +
        + +

        This table view above allows you to see the detailed statistics of all communication ops in each node. This allows you to see what operation types are being called, how many times each op is called, what is the size of the data being transferred by each op, etc.

        + +

        Memory View:

        + +

        This memory view tool helps you understand the hardware resource consumption of the operators in your model. Understanding the time and memory consumption on the operator-level allows you to resolve performance bottlenecks and in turn, allow your model to execute faster. Given limited GPU memory size, optimizing the memory usage can:

        + +
1. Allow a bigger model, which can potentially generalize better on end-level tasks.
2. Allow a bigger batch size. Bigger batch sizes increase the training speed.
        + +

        The profiler records all the memory allocation during the profiler interval. Selecting the “Device” will allow you to see each operator’s memory usage on the GPU side or host side. You must enable profile_memory=True to generate the below memory data as shown here.

        + +
with torch.profiler.profile(
+    profile_memory=True  # this will take 1 – 2 minutes to complete.
        +)
        +
        + +

        Important Definitions:

        + +

• “Size Increase” displays the sum of all allocated bytes minus all the freed bytes.

        + +

        • “Allocation Size” shows the sum of all allocation bytes without considering the memory release.

        + +

• “Self” means the allocated memory is not from any child operators, but is allocated by the operator itself.

        + +
        + +
        + +

        GPU Metric on Timeline:

        + +

This feature will help you debug performance issues when one or more GPUs are underutilized. Ideally, your program should have high GPU utilization (aiming for 100% GPU utilization), minimal CPU-to-GPU communication, and no overhead.

        + +

Overview:
The overview page highlights the results of three important GPU usage metrics at different levels (i.e. GPU Utilization, Est. SM Efficiency, and Est. Achieved Occupancy). Essentially, each GPU has a number of SMs, each with a number of warps that can execute many threads concurrently; the exact counts depend on the GPU. At a high level, this GPU Metric on Timeline tool lets you see the whole stack, which is useful.

        + +

        If the GPU utilization result is low, this suggests a potential bottleneck is present in your model. Common reasons:

        + +

• Insufficient parallelism in kernels (i.e., low batch size)

        + +

• Small kernels called in a loop. This is to say the launch overheads are not amortized

        + +

• CPU or I/O bottlenecks lead to the GPU not receiving enough work to keep busy

        + +

Look at the performance recommendation section of the overview page, where you’ll find suggestions on how to increase GPU utilization. In this example, GPU utilization is low, so the performance recommendation was to increase batch size. Increasing the batch size from 4 to 32, as per the performance recommendation, increased GPU Utilization by 60.68%.

        + +

GPU Utilization: the portion of the step interval time in the profiler when a GPU engine was executing a workload. The higher the utilization %, the better. The drawback of using GPU utilization alone to diagnose performance bottlenecks is that it is too high-level and coarse. It won’t be able to tell you how many Streaming Multiprocessors are in use. Note that while this metric is useful for detecting periods of idleness, a high value does not indicate efficient use of the GPU, only that it is doing anything at all. For instance, a kernel with a single thread running continuously will get a GPU Utilization of 100%.

        + +

Estimated Stream Multiprocessor Efficiency (Est. SM Efficiency) is a finer-grained metric; it indicates what percentage of SMs are in use at any point in the trace. This metric reports the percentage of time where there is at least one active warp on an SM, including those that are stalled (NVIDIA doc). Est. SM Efficiency also has its limitations. For instance, a kernel with only one thread per block can’t fully use each SM. SM Efficiency does not tell us how busy each SM is, only that it is doing anything at all, which can include stalling while waiting on the result of a memory load. To keep an SM busy, it is necessary to have a sufficient number of ready warps that can be run whenever a stall occurs.

        + +

Estimated Achieved Occupancy (Est. Achieved Occupancy) is a layer deeper than Est. SM Efficiency and GPU Utilization for diagnosing performance issues. Estimated Achieved Occupancy indicates how many warps can be active at once per SM. Having a sufficient number of active warps is usually key to achieving good throughput. Unlike GPU Utilization and SM Efficiency, it is not a goal to make this value as high as possible. As a rule of thumb, good throughput gains can be had by improving this metric to 15% and above, but at some point you will hit diminishing returns: if the value is already at 30%, for example, further gains will be uncertain. This metric reports the average values of all warp schedulers for the kernel execution period (NVIDIA doc). The larger the Est. Achieved Occupancy value, the better.

        + +
        + +

        Overview details: Resnet50_batchsize4

        +
        + +
        + +

        Overview details: Resnet50_batchsize32

        +
        + +

Kernel View
The kernel view has “Blocks per SM” and “Est. Achieved Occupancy” columns, which are a great tool for comparing model runs.

        + +
        + +
        + +

        Mean Blocks per SM:
        +Blocks per SM = Blocks of this kernel / SM number of this GPU. If this number is less than 1, it indicates the GPU multiprocessors are not fully utilized. “Mean Blocks per SM” is weighted average of all runs of this kernel name, using each run’s duration as weight.

        + +

        Mean Est. Achieved Occupancy:
        +Est. Achieved Occupancy is defined as above in overview. “Mean Est. Achieved Occupancy” is weighted average of all runs of this kernel name, using each run’s duration as weight.

        + +

Trace View
The trace view displays a timeline that shows the duration of operators in your model and which system executed the operation. This view can help you identify whether the high consumption and long execution is because of input or model training. Currently, this trace view shows GPU Utilization and Est. SM Efficiency on a timeline.

        + +
        + +
        + +

GPU utilization is calculated independently and divided into multiple 10 millisecond buckets. The buckets’ GPU utilization values are drawn alongside the timeline between 0 – 100%. In the above example, the “ProfilerStep5” GPU utilization during thread 28022’s busy time is higher than that of the following “Optimizer.step” interval. You can zoom in there to investigate why that is.

        + +
        + +
        + +

From above, we can see the former’s kernels are longer than the latter’s kernels. The latter’s kernels are too short in execution, which results in lower GPU utilization.

        + +

Est. SM Efficiency: Each kernel has a calculated est. SM efficiency between 0 – 100%. For example, the below kernel has only 64 blocks, while this GPU has 80 SMs, so its “Est. SM Efficiency” is 64/80, which is 0.8.

        + +
        + +
        + +

        Cloud Storage Support

        + +

After running pip install tensorboard, to have data read from these cloud providers, you can now run:

        + +
pip install torch-tb-profiler[blob]
+pip install torch-tb-profiler[gs]
+pip install torch-tb-profiler[s3]
        +
        +

That is, pip install torch-tb-profiler[blob], pip install torch-tb-profiler[gs], or pip install torch-tb-profiler[s3], depending on your storage provider. For more information, please refer to this README.

        + +

        Jump to Source Code:

        + +

        One of the great benefits of having both TensorBoard and the PyTorch Profiler being integrated directly in Visual Studio Code (VS Code) is the ability to directly jump to the source code (file and line) from the profiler stack traces. VS Code Python Extension now supports TensorBoard Integration.

        + +

Jump to source is ONLY available when TensorBoard is launched within VS Code. Stack tracing will appear in the plugin UI if the profiling was run with with_stack=True (see the sketch below). When you click on a stack trace from the PyTorch Profiler, VS Code will automatically open the corresponding file side by side and jump directly to the line of code of interest for you to debug. This allows you to quickly make actionable optimizations and changes to your code based on the profiling results and suggestions.
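For reference, a minimal sketch of enabling stack traces when profiling (assuming `model` and `inputs` are your own module and data, and an arbitrary log directory):

with torch.profiler.profile(
    activities=[torch.profiler.ProfilerActivity.CPU,
                torch.profiler.ProfilerActivity.CUDA],
    with_stack=True,  # record source information used for jump-to-source
    on_trace_ready=torch.profiler.tensorboard_trace_handler("./log"),
) as prof:
    model(inputs)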

        + +
        + +

GIF: Jump to Source using the Visual Studio Code plug-in UI

        +
        + +

For how to optimize batch size performance, check out the step-by-step tutorial here. PyTorch Profiler is also integrated with PyTorch Lightning, and you can simply launch your Lightning training jobs with the --trainer.profiler=pytorch flag to generate the traces.

        + +

        What’s Next for the PyTorch Profiler?

        +

        You just saw how PyTorch Profiler can help optimize a model. You can now try the Profiler by pip install torch-tb-profiler to optimize your PyTorch model.

        + +

Look out for an advanced version of this tutorial in the future. We are also thrilled to continue to bring state-of-the-art tools to PyTorch users to improve ML performance. We’d love to hear from you. Feel free to open an issue here.

        + +

        For new and exciting features coming up with PyTorch Profiler, follow @PyTorch on Twitter and check us out on pytorch.org.

        + +

        Acknowledgements

        + +

        The author would like to thank the contributions of the following individuals to this piece. From the Facebook side: Geeta Chauhan, Gisle Dankel, Woo Kim, Sam Farahzad, and Mark Saroufim. On the Microsoft side: AI Framework engineers (Teng Gao, Mike Guo, and Yang Gu), Guoliang Hua, and Thuy Nguyen.

        + + +
        +
        +
        +
        + + +
        +
        +
        +
        +

        +
        +
        +
        + +
        + +
        + +
        +
        +
        +
        + + +
        +
        +
        + + +
        + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/pytorch-shanghai-notes/index.html b/blog/pytorch-shanghai-notes/index.html new file mode 100644 index 000000000000..090d9ed1b5ec --- /dev/null +++ b/blog/pytorch-shanghai-notes/index.html @@ -0,0 +1,683 @@ + + + + + + + + + + + + + PyTorch Shanghai Meetup Notes | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
        +
        +
        +
        + +
        +
        + + +
        + + + + + + + + +
        + +
        +
        + +
        +
        +

        September 08, 2024

        +

        + PyTorch Shanghai Meetup Notes +

        +
        +
        + +
        +
        +
        + +
        +

        + by + + Team PyTorch + +

        +

        Summary

        + +

        group photo

        + +

        We are honored to successfully host the PyTorch Shanghai Meetup on August 15, 2024. This Meetup has received great attention from the industry. We invited senior PyTorch developers from Intel and Huawei as guest speakers, who shared their valuable experience and the latest technical trends. In addition, this event also attracted PyTorch enthusiasts from many technology companies and well-known universities. A total of more than 40 participants gathered together to discuss and exchange the latest applications and technological advances of PyTorch.

        + +

        This Meetup not only strengthened the connection between PyTorch community members, but also provided a platform for local AI technology enthusiasts to learn, communicate and grow. We look forward to the next gathering to continue to promote the development of PyTorch technology in the local area.

        + +

        1. PyTorch Foundation Updates

        + +

        man instructing students

        + +

PyTorch Board member Fred Li shared the latest updates in the PyTorch community. He reviewed the development history of the PyTorch community, explained in detail the growth path of community developers, encouraged everyone to delve deeper into technology, and introduced matters related to the upcoming PyTorch Conference 2024.

        + +

2. Intel’s Journey with PyTorch: Democratizing AI with ubiquitous hardware and open software

        + +

PyTorch CPU module maintainer Jiong Gong shared six years of technical contributions from Intel to PyTorch and its ecosystem, and explored the remarkable advancements Intel has made in both software and hardware to democratize AI, ensuring accessibility and optimizing performance across a diverse range of Intel hardware platforms.

        + +

        man instructing students

        + +

        3. Exploring Multi-Backend Support in PyTorch Ecosystem: A Case Study of Ascend

        + +

        man instructing students

        + +

        Fengchun Hua, a PyTorch contributor from Huawei, took Huawei Ascend NPU as an example to demonstrate the latest achievements in multi-backend support for PyTorch applications. He introduced the hardware features of Huawei Ascend NPU and the infrastructure of CANN (Compute Architecture for Neural Networks), and explained the key achievements and innovations in native support work. He also shared the current challenges and the next work plan.

        + +

        Yuanhao Ji, another PyTorch contributor from Huawei, then introduced the Autoload Device Extension proposal, explained its implementation details and value in improving the scalability of PyTorch, and introduced the latest work progress of the PyTorch Chinese community.

        + +

        4. Intel XPU Backend for Inductor

        + +

        man instructing students

        + +

        Eikan is a PyTorch contributor from Intel. He focuses on torch.compile stack for both Intel CPU and GPU. In this session, Eikan presented Intel’s efforts on torch.compile for Intel GPUs. He provided updates on the current status of Intel GPUs within PyTorch, covering both functionality and performance aspects. Additionally, Eikan used Intel GPU as a case study to demonstrate how to integrate a new backend into the Inductor using Triton.

        + +

        5. PyTorch PrivateUse1 Evolution Approaches and Insights

        + +

        man instructing students

        + +

Jiawei Li, a PyTorch collaborator from Huawei, introduced PyTorch’s Dispatch mechanism and emphasized the limitations of DispatchKey. He took the Huawei Ascend NPU as an example to share best practices for the PyTorch PrivateUse1 mechanism. He mentioned that while using the PrivateUse1 mechanism, Huawei also submitted many improvements and bug fixes for the mechanism to the PyTorch community. He also noted that, due to the lack of upstream CI support for out-of-tree devices, changes in upstream code may affect their stability and quality, and this insight was recognized by everyone.

        + +
        +
        +
        +
        + + +
        +
        +
        +
        +

        +
        +
        +
        + +
        + +
        + +
        +
        +
        +
        + + +
        +
        +
        + + +
        + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/pytorch-xla-spmd/index.html b/blog/pytorch-xla-spmd/index.html new file mode 100644 index 000000000000..11a2b2c1af3e --- /dev/null +++ b/blog/pytorch-xla-spmd/index.html @@ -0,0 +1,814 @@ + + + + + + + + + + + + + PyTorch/XLA SPMD: Scale Up Model Training and Serving with Automatic Parallelization | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
        +
        +
        +
        + +
        +
        + + +
        + + + + + + + + +
        + +
        +
        + + + +
        +
        +
        + +
        +

+ by + + Yeounoh Chung, Jon Bolin, Milad Mohammadi, Jiewen Tan, Jack Cao, Joe Spisak, Alex Spiridonov, Shauheen Zahirazami, Steven Krawczyk, Wonjoo Lee, Mohit Khatwani, Wanchao Liang, Vaibhav Singh + +

        +

        Today, we are delighted to announce PyTorch/XLA SPMD: the integration of GSPMD into PyTorch with an easy to use API. PyTorch developers seeking superior performance and scale can train and serve the largest neural networks while maximizing utilization of AI accelerators, such as Google Cloud TPUs.

        + +

        Introduction

        + +

        GSPMD is an automatic parallelization system for ML workloads. The XLA compiler transforms the single device program into a partitioned one with proper collectives, based on the user provided sharding hints. This allows developers to write PyTorch programs as if they are on a single large device without any custom sharded computation and/or collective communication ops to scale models.

        + +

        PyTorch/XLA SPMD allows PyTorch users to parallelize their ML workloads with GSPMD with less effort and with better performance. Some of the key highlights are:

        + +
• Better developer experience. Everything happens with a few sharding annotations from the user, and PyTorch/XLA SPMD achieves comparable performance to the most efficient PyTorch sharding implementation (see the Examples and Results section below). PyTorch/XLA SPMD separates the task of programming an ML model from the challenge of parallelization. Its automated approach to model sharding frees the user from implementing the sharded version of ops with the proper collectives in place.
• A single API that enables a large variety of parallelism algorithms (including data parallelism, fully sharded data parallelism, spatial partitioning, tensor and pipeline parallelism, as well as combinations of these algorithms) for different ML workloads and model architectures.
• Industry-leading performance in large model training. PyTorch/XLA SPMD brings the powerful XLA GSPMD to PyTorch, enabling users to harness the full power of Google Cloud TPUs.
• Enabling PyTorch and JAX developers to take advantage of the same underlying XLA API to scale models.
        + +

        Key Concepts

        + +

        The key concepts behind the sharding annotation API are: 1) Mesh, 2) Partition Spec, and 3) mark_sharding API to express sharding intent using Mesh and Partition Spec. A more detailed design overview is available as a user guide here.

        + +

        Mesh

        + +

        For a given cluster of devices, a physical mesh is a representation of the interconnect topology.

        + +

        We derive a logical mesh based on this topology to create sub-groups of devices which can be used for partitioning different axes of tensors in a model. We apply sharding annotations to map the program across the logical mesh; this automatically inserts communication collectives in the program graph to support functional correctness (see the figure below).

        + +

        SPMD on PyTorch/XLA

        + +

        We abstract logical mesh with Mesh API. The axes of the logical Mesh can be named. Here is an example:

        + +
        import numpy as np
        +import torch_xla.runtime as xr
        +import torch_xla.experimental.xla_sharding as xs
        +from torch_xla.experimental.xla_sharding import Mesh
        +
        +# Enable XLA SPMD execution mode.
        +xr.use_spmd()
        +
        +# Assuming you are running on a TPU host that has 8 devices attached
        +num_devices = xr.global_runtime_device_count()
        +# mesh shape will be (4,2) in this example
        +mesh_shape = (num_devices // 2, 2)
        +device_ids = np.array(range(num_devices))
+# axis_names 'x' and 'y' are optional
        +mesh = Mesh(device_ids, mesh_shape, ('x', 'y'))
        +
        +mesh.get_logical_mesh()
        +>> array([[0, 1],
        +          [2, 3],
        +          [4, 5],
        +          [6, 7]])
        +mesh.shape()
        +>> OrderedDict([('x', 4), ('y', 2)])
        +
        + +

        Partition Spec

        + +

        partition_spec has the same rank as the input tensor. Each dimension describes how the corresponding input tensor dimension is sharded across the device mesh (logically defined by mesh_shape). partition_spec is a tuple of device_mesh dimension index, None, or a tuple of mesh dimension indices. The index can be an int or str if the corresponding mesh dimension is named. This specifies how each input rank is sharded (index to mesh_shape) or replicated (None).

        + +
        # Provide optional mesh axis names and use them in the partition spec
        +mesh = Mesh(device_ids, (4, 2), ('data', 'model'))
        +partition_spec = ('model', 'data')
        +xs.mark_sharding(input_tensor, mesh, partition_spec)
        +
        + +

        We support all three types of sharding described in the original GSPMD paper. For instance, one can specify partial replication like this:

        + +
        # Provide optional mesh axis names and use them in the partition spec
        +mesh = Mesh(device_ids, (2, 2, 2), ('x', 'y', 'z'))
        +
        +# evenly shard across x and z and replicate among y
        +partition_spec = ('x', 'z')  # equivalent to ('x', None, 'z')
        +xs.mark_sharding(input_tensor, mesh, partition_spec)
        +
        + +

        Simple Example With Sharding Annotation

        + +

        Users can annotate native PyTorch tensors using the mark_sharding API (src). This takes torch.Tensor as input and returns a XLAShardedTensor as output.

        + +
        def mark_sharding(t: Union[torch.Tensor, XLAShardedTensor], mesh: Mesh, partition_spec: Tuple[Union[int, None]]) -> XLAShardedTensor
        +
        + +

        Invoking mark_sharding API takes a user defined logical mesh and partition_spec and generates a sharding annotation for the XLA compiler. The sharding specification is attached to the XLATensor, as well as the original input tensor. Here is a simple usage example from the [RFC], to illustrate how the sharding annotation API works:

        + +
        import numpy as np
        +import torch
        +import torch_xla.core.xla_model as xm
        +import torch_xla.runtime as xr
        +import torch_xla.experimental.xla_sharding as xs
        +from torch_xla.experimental.xla_sharded_tensor import XLAShardedTensor
        +from torch_xla.experimental.xla_sharding import Mesh
        +
        +# Enable XLA SPMD execution mode.
        +xr.use_spmd()
        +
        +# Device mesh, this and partition spec as well as the input tensor shape define the individual shard shape.
        +num_devices = xr.global_runtime_device_count()
+mesh_shape = (2, num_devices // 2)  # 2x4 on v3-8, 2x2 on v4-8
        +device_ids = np.array(range(num_devices))
        +mesh = Mesh(device_ids, mesh_shape, ('x', 'y'))
        +
        +t = torch.randn(8, 4).to(xm.xla_device())
        +
        +# Mesh partitioning, each device holds 1/8-th of the input
        +partition_spec = (0, 1)
        +m1_sharded = xs.mark_sharding(t, mesh, partition_spec)
        +assert isinstance(m1_sharded, XLAShardedTensor) == True
        +# Note that the sharding annotation is also in-placed updated to t
        +
        + +

        We can annotate different tensors in the PyTorch program to enable different parallelism techniques, as described in the comment below:

        + +
        # Sharding annotate the linear layer weights. SimpleLinear() is a nn.Module.
        +model = SimpleLinear().to(xm.xla_device())
        +xs.mark_sharding(model.fc1.weight, mesh, partition_spec)
        +
        +# Training loop
        +model.train()
        +for step, (data, target) in enumerate(loader):
        +  # Assumes `loader` returns data, target on XLA device
        +  optimizer.zero_grad()
        +  # Sharding annotate input data, we can shard any input
        +  # dimensions. Sharding the batch dimension enables 
        +  # data parallelism, sharding the feature dimension enables
        +  # spatial partitioning.
        +  xs.mark_sharding(data, mesh, partition_spec)
+  output = model(data)
+  loss = loss_fn(output, target)
+  loss.backward()
+  optimizer.step()
        +  xm.mark_step()
        +
        + +

        More complete unit test cases and integration test examples are available in the PyTorch/XLA repo.

        + +

        Results

        + +

        Performance

        + +

        We measured the performance of PyTorch/XLA SPMD using a GPT-2 model (src) and compared it with user-mode FSDP.

        + +

        Here, SPMD applies the same sharding scheme as the FSDP plot (i.e. 1D sharding). Users are expected to achieve better MFU results by exploring more advanced SPMD sharding schemes.

        + +

        SPMD vs. FSDP

        + +

        We use Model FLOPS Utilization (MFU) as a metric for comparison. MFU is “the ratio of the observed throughput relative to the theoretical maximum throughput of a system operating at peak FLOPs” (PaLM paper).

        + +
        flops_per_step = 6 * global_batch_size * seq_len * num_params
        +model_flops_utilization = flops_per_step / step_time(s) / chip_count / flops_per_chip
        +
        + +

This estimation assumes that the input dimensionality is much larger than the input sequence length (d_model >> seq_len). If this assumption is violated, the self-attention FLOPs start to be significant enough that this expression will underestimate the true MFU.

        + +

        Scalability

        + +

        One of the core benefits of SPMD is the flexible partitioning which can be used to save accelerator memory (HBM) usage and improve scalability. For scalability analysis, we present two studies: 1) we examine the peak HBM across 4 model sizes using Hugging Face transformers (GPT-2) as the base implementation; 2) we examine the peak HBM usage with spatial partitioning.

        + +

        Peak HBM Utilization

        + +

The above figure illustrates that the unsharded 2B-parameter model’s peak memory footprint stands at 26GB (red dashed line). Sharding model weights (model parallelism) reduces the peak memory footprint, and thus enables larger model training with a given TPU pod slice. In these experiments, we achieved up to 39.75% MFU on a 4B-parameter model on Google Cloud TPU v4-16.

        + +

        We also ran an input batch scalability test using spatial partitioning and a simple ResNet50 example (src) on Cloud TPU v4-8. Input batch is commonly sharded across the batch dimension for data parallelism (DDP, FSDP), but PyTorch/XLA SPMD enables input sharding across input feature dimensions for spatial sharding. As shown in the below figure, one can push the per-device batch size to 512 with spatial partitioning which is not possible with other data parallelism techniques.

        + +

        Batch size scaling with spatial partitioning

        + +

        The Road Forward for PyTorch/XLA SPMD

        + +

        We are ecstatic about what’s ahead for PyTorch/XLA and invite the community to join us. SPMD is still experimental, and we continuously add new features to it. In future releases, we plan to address async dataloading, partially replicated sharding, and other improvements. We’d love to hear from you, answer your questions about PyTorch/XLA SPMD, and learn how you use SPMD.

        + +

        Cheers!

        + +

        The PyTorch/XLA Team at Google

        + +
        +
        +
        +
        + + +
        +
        +
        +
        +

        +
        +
        +
        + +
        + +
        + +
        +
        +
        +
        + + +
        +
        +
        + + +
        + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/pytorch2-2-lib-updates/index.html b/blog/pytorch2-2-lib-updates/index.html new file mode 100644 index 000000000000..a1c1efb7c405 --- /dev/null +++ b/blog/pytorch2-2-lib-updates/index.html @@ -0,0 +1,754 @@ + + + + + + + + + + + + + New Library Updates in PyTorch 2.2 | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
        +
        +
        +
        + +
        +
        + + +
        + + + + + + + + +
        + +
        +
        + +
        +
        +

        January 30, 2024

        +

        + New Library Updates in PyTorch 2.2 +

        +
        +
        + +
        +
        +
        + +
        +

        + by + + Team PyTorch + +

        +

        Summary

        + +

        We are bringing a number of improvements to the current PyTorch libraries, alongside the PyTorch 2.2 release. These updates demonstrate our focus on developing common and extensible APIs across all domains to make it easier for our community to build ecosystem projects on PyTorch.

        + + + + + + + + + + + + + + + + + + + + +
Latest Stable Library Versions (Full List)*
TorchArrow 0.1.0 | TorchRec 0.6.0 | TorchVision 0.17
TorchAudio 2.2.0 | TorchServe 0.9.0 | TorchX 0.7.0
TorchData 0.7.1 | TorchText 0.17.0 | PyTorch on XLA Devices 2.1
        + +

        *To see prior versions or (unstable) nightlies, click on versions in the top left menu above ‘Search Docs’.

        + +

        TorchRL

        + +

        Feature: TorchRL’s Offline RL Data Hub

        + +

        TorchRL now provides one of the largest dataset hubs for offline RL and imitation learning, and it all comes under a single data format (TED, for TorchRL Episode Data format). This makes it possible to easily swap from different sources in a single training loop. It is also now possible to easily combine datasets of different sources through the ReplayBufferEnsemble class. The data processing is fully customizable. Sources include simulated tasks (Minari, D4RL, VD4RL), robotic datasets (Roboset, OpenX Embodied dataset) and gaming (GenDGRL/ProcGen, Atari/DQN). Check these out in the documentation.

        + +

        Aside from these changes, our replay buffers can now be dumped on disk using the .dumps() method which will serialize the buffers on disk using the TensorDict API which is faster, safer and more efficient than using torch.save.

        + +

        Finally, replay buffers can now be read and written from separate processes on the same machine without any extra code needed from the user!

        + +

        TorchRL2Gym environment API

        + +

To facilitate TorchRL’s integration in existing code-bases and let users enjoy all the features of TorchRL’s environment API (execution on device, batched operations, transforms…), we provide a TorchRL-to-gym API that allows users to register any environment they want in gym or gymnasium. This can be used in turn to make TorchRL a universal lib-to-gym converter that works across stateful (e.g., dm_control) and stateless (Brax, Jumanji) environments. The feature is thoroughly detailed in the doc. The info_dict reading API has also been improved.

        + +

        Environment speedups

        + +

We added the option of executing environments on a different device than the one used to deliver data in ParallelEnv. We also sped up the GymLikeEnv class to a level that now makes it competitive with gym itself.

        + +

        Scaling objectives

        + +

        The most popular objectives for RLHF and training at scale (PPO and A2C) are now compatible with FSDP and DDP models!

        + +

        TensorDict

        + +

        Feature: MemoryMappedTensor to replace MemmapTensor

        + +

        We provide a much more efficient mmap backend for TensorDict; MemoryMappedTensor, which directly subclasses torch.Tensor. It comes with a bunch of utils to be constructed, such as from_tensor, empty and many more. MemoryMappedTensor is now much safer and faster than its counterpart. The library remains fully compatible with the previous class to facilitate transition.
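For illustration, a minimal sketch of creating a MemoryMappedTensor from an existing tensor (the file path is arbitrary, and exact constructor options may vary by tensordict version):

import torch
from tensordict import MemoryMappedTensor

t = torch.randn(1024, 1024)
# Persist the tensor to a memory-mapped file on disk (path is a placeholder)
mmap_t = MemoryMappedTensor.from_tensor(t, filename="/tmp/weights.memmap")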

        + +

        We also introduce a new set of multithreaded serialization methods that make tensordict serialization highly competitive with torch.save, with serialization and deserialization speeds for LLMs more than 3x faster than with torch.save.

        + +

        Feature: Non-tensor data within TensorDict

        + +

It is now possible to carry non-tensor data through the NonTensorData tensorclass. This makes it possible to build tensordicts with metadata. The memmap-API is fully compatible with these values, allowing users to seamlessly serialize and deserialize such objects. To store non-tensor data in a tensordict, simply assign it using the __setitem__ method, as in the sketch below.
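A minimal sketch of that assignment pattern (illustrative only; the keys and values are made up):

import torch
from tensordict import TensorDict

td = TensorDict({"obs": torch.zeros(3, 4)}, batch_size=[3])
# Plain __setitem__ assignment stores the string as non-tensor metadata
td["note"] = "collected with policy v2"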

        + +

        Efficiency improvements

        + +

The runtime of several methods has been improved, such as unbind, split, map, and even TensorDict instantiation. Check our benchmarks!

        + +

        TorchRec/fbgemm_gpu

        + +

        VBE

        + +

        TorchRec now natively supports VBE (variable batched embeddings) within the EmbeddingBagCollection module. This allows variable batch size per feature, unlocking sparse input data deduplication, which can greatly speed up embedding lookup and all-to-all time. To enable, simply initialize KeyedJaggedTensor with stride_per_key_per_rank and inverse_indices fields, which specify batch size per feature and inverse indices to reindex the embedding output respectively.

        + +

        In addition to the TorchRec library changes, fbgemm_gpu has added the support for variable batch size per feature in TBE. VBE is enabled on split TBE training for both weighted and unweighted cases. To use VBE, please make sure to use the latest fbgemm_gpu version.

        + +

        Embedding offloading

        + +

        This technique refers to using CUDA UVM to cache ‘hot’ embeddings (i.e. store embedding tables on host memory with cache on HBM memory), and prefetching the cache. Embedding offloading allows running a larger model with fewer GPUs, while maintaining competitive performance. Use the prefetching pipeline (PrefetchTrainPipelineSparseDist) and pass in per-table cache load factor and the prefetch_pipeline flag through constraints in the planner to use this feature.

        + +

        Fbgemm_gpu has introduced UVM cache pipeline prefetching in v0.5.0 for TBE performance speedup. This allows cache-insert to be executed in parallel with TBE forward/backward. To enable this feature, please be sure to use the latest fbgemm_gpu version.

        + +

        Trec.shard/shard_modules

        + +

These APIs replace embedding submodules with their sharded variants. The shard API applies to an individual embedding module while the shard_modules API replaces all embedding modules and won’t touch other non-embedding submodules.

        + +

        Embedding sharding follows similar behavior to the prior TorchRec DistributedModuleParallel behavior, except the ShardedModules have been made composable, meaning the modules are backed by TableBatchedEmbeddingSlices which are views into the underlying TBE (including .grad). This means that fused parameters are now returned with named_parameters(), including in DistributedModuleParallel.

        + +

        TorchVision

        + +

        The V2 transforms are now stable!

        + +

        The torchvision.transforms.v2 namespace was still in BETA stage until now. It is now stable! Whether you’re new to Torchvision transforms, or you’re already experienced with them, we encourage you to start with Getting started with transforms v2 in order to learn more about what can be done with the new v2 transforms.
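For example, a minimal sketch of a v2 pipeline (the specific transforms chosen here are illustrative, not prescriptive):

import torch
from torchvision.transforms import v2

transforms = v2.Compose([
    v2.RandomResizedCrop(size=(224, 224), antialias=True),
    v2.RandomHorizontalFlip(p=0.5),
    v2.ToDtype(torch.float32, scale=True),
])

# A fake uint8 image tensor, just to exercise the pipeline
img = torch.randint(0, 256, (3, 256, 256), dtype=torch.uint8)
out = transforms(img)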

        + +

        Browse our main docs for general information and performance tips. The available transforms and functionals are listed in the API reference. Additional information and tutorials can also be found in our example gallery, e.g. Transforms v2: End-to-end object detection/segmentation example or How to write your own v2 transforms.

        + +

        Towards torch.compile() support

        + +

        We are progressively adding support for torch.compile() to torchvision interfaces, reducing graph breaks and allowing dynamic shape.

        + +

        The torchvision ops (nms, [ps_]roi_align, [ps_]roi_pool and deform_conv_2d) are now compatible with torch.compile and dynamic shapes.

        + +

        On the transforms side, the majority of low-level kernels (like resize_image() or crop_image()) should compile properly without graph breaks and with dynamic shapes. We are still addressing the remaining edge-cases, moving up towards full functional support and classes, and you should expect more progress on that front with the next release.

        + +
        +
        +
        +
        + + +
        +
        +
        +
        +

        +
        +
        +
        + +
        + +
        + +
        +
        +
        +
        + + +
        +
        +
        + + +
        + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/pytorch2-2/index.html b/blog/pytorch2-2/index.html new file mode 100644 index 000000000000..3b2f83a86622 --- /dev/null +++ b/blog/pytorch2-2/index.html @@ -0,0 +1,768 @@ + + + + + + + + + + + + + PyTorch 2.2: FlashAttention-v2 integration, AOTInductor | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
        +
        +
        +
        + +
        +
        + + +
        + + + + + + + + +
        + +
        +
        + + + +
        +
        +
        + +
        +

        + by + + Team PyTorch + +

        +

        We are excited to announce the release of PyTorch® 2.2 (release note)! PyTorch 2.2 offers ~2x performance improvements to scaled_dot_product_attention via FlashAttention-v2 integration, as well as AOTInductor, a new ahead-of-time compilation and deployment tool built for non-python server-side deployments.

        + +

        This release also includes improved torch.compile support for Optimizers, a number of new inductor optimizations, and a new logging mechanism called TORCH_LOGS.

        + +

        Please note that we are deprecating macOS x86 support, and PyTorch 2.2.x will be the last version that supports macOS x64.

        + +

        Along with 2.2, we are also releasing a series of updates to the PyTorch domain libraries. More details can be found in the library updates blog.

        + +

        This release is composed of 3,628 commits and 521 contributors since PyTorch 2.1. We want to sincerely thank our dedicated community for your contributions. As always, we encourage you to try these out and report any issues as we improve 2.2. More information about how to get started with the PyTorch 2-series can be found at our Getting Started page.

        + +

        Summary:

        + +
• scaled_dot_product_attention (SDPA) now supports FlashAttention-2, yielding around 2x speedups compared to previous versions.
• PyTorch 2.2 introduces a new ahead-of-time extension of TorchInductor called AOTInductor, designed to compile and deploy PyTorch programs for non-python server-side.
• torch.distributed supports a new abstraction for initializing and representing ProcessGroups called device_mesh.
• PyTorch 2.2 ships a standardized, configurable logging mechanism called TORCH_LOGS.
• A number of torch.compile improvements are included in PyTorch 2.2, including improved support for compiling Optimizers and improved TorchInductor fusion and layout optimizations.
• Please note that we are deprecating macOS x86 support, and PyTorch 2.2.x will be the last version that supports macOS x64.
        + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Stable | Beta                         | Performance Improvements
       | FlashAttention-2 Integration | Inductor optimizations
       | AOTInductor                  | aarch64 optimizations
       | TORCH_LOGS                   |
       | device_mesh                  |
       | Optimizer compilation        |
        + +

        *To see a full list of public feature submissions click here.

        + +

        Beta Features

        + +

        [Beta] FlashAttention-2 support in torch.nn.functional.scaled_dot_product_attention

        + +

        torch.nn.functional.scaled_dot_product_attention (SDPA) now supports FlashAttention-2, yielding around 2x speedups (compared to the previous version) and reaching ~50-73% of theoretical maximum FLOPs/s on A100 GPUs.
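As a quick illustration (shapes and dtypes here are arbitrary; with eligible half-precision CUDA inputs, the FlashAttention-2 kernel is selected automatically):

import torch
import torch.nn.functional as F

# (batch, heads, seq_len, head_dim) query/key/value tensors
q = torch.randn(1, 8, 1024, 64, device="cuda", dtype=torch.float16)
k = torch.randn(1, 8, 1024, 64, device="cuda", dtype=torch.float16)
v = torch.randn(1, 8, 1024, 64, device="cuda", dtype=torch.float16)

out = F.scaled_dot_product_attention(q, k, v, is_causal=True)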

        + +

        More information is available on FlashAttention-2 in this paper.

        + +

        For a tutorial on how to use SDPA please see this tutorial.

        + +

        [Beta] AOTInductor: ahead-of-time compilation and deployment for torch.export-ed programs

        + +

        AOTInductor is an extension of TorchInductor, designed to process exported PyTorch models, optimize them, and produce shared libraries as well as other relevant artifacts. These compiled artifacts can be deployed in non-Python environments, which are frequently employed for inference on the server-side. Note that AOTInductor supports the same backends as Inductor, including CUDA, ROCm, and CPU.

        + +

        For more information please see the AOTInductor tutorial.

        + +

        [Beta] Fine-grained configurable logging via TORCH_LOGS

        + +

        PyTorch now ships a standardized, configurable logging mechanism that can be used to analyze the status of various subsystems such as compilation and distributed operations.

        + +

Logs can be enabled via the TORCH_LOGS environment variable. For example, to set the log level of TorchDynamo to logging.ERROR and the log level of TorchInductor to logging.DEBUG, pass TORCH_LOGS="-dynamo,+inductor" to PyTorch.

        + +

        For more information, please see the logging documentation and tutorial.

        + +

        [Beta] torch.distributed.device_mesh

        + +

        PyTorch 2.2 introduces a new abstraction for representing the ProcessGroups involved in distributed parallelisms called torch.distributed.device_mesh. This abstraction allows users to represent inter-node and intra-node process groups via an N-dimensional array where, for example, one dimension can data parallelism in FSDP while another could represent tensor parallelism within FSDP.

        + +

        For more information, see the device_mesh tutorial.

        + +

        [Beta] Improvements to torch.compile-ing Optimizers

        + +

        A number of improvements have been made to torch.compile-ing Optimizers including less overhead and support for cuda graphs.

        + +

        More technical details of the improvements are available on dev-discuss, and a recipe for torch.compile-ing optimizers is available here.

        + +

        Performance Improvements

        + +

        Inductor Performance Optimizations

        + +

        A number of performance optimizations have been added to TorchInductor including horizontal fusion support for torch.concat, improved convolution layout optimizations, and improved scaled_dot_product_attention pattern matching.

        + +

        For a complete list of inductor optimizations, please see the Release Notes.

        + +

        aarch64 Performance Optimizations

        + +

        PyTorch 2.2 includes a number of performance enhancements for aarch64 including support for mkldnn weight pre-packing, improved ideep primitive caching, and improved inference speed via fixed format kernel improvements to OneDNN.

        + +

        For a complete list of aarch64 optimizations, please see the Release Notes.

        + +
        +
        +
        +
        + + +
        +
        +
        +
        +

        Docs

        +

        Access comprehensive developer documentation for PyTorch

        + View Docs +
        + +
        +

        Tutorials

        +

        Get in-depth tutorials for beginners and advanced developers

        + View Tutorials +
        + +
        +

        Resources

        +

        Find development resources and get your questions answered

        + View Resources +
        +
        +
        +
        + +
        + +
        + +
        +
        +
        +
        + + +
        +
        +
        + + +
        + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/pytorch2-3/index.html b/blog/pytorch2-3/index.html new file mode 100644 index 000000000000..755ff68e46ed --- /dev/null +++ b/blog/pytorch2-3/index.html @@ -0,0 +1,734 @@ + + + + + + + + + + + + + PyTorch 2.3 Release Blog | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
        +
        +
        + Join us at PyTorch Conference in San Francisco, October 22-23. CFP open now! Learn more. +
        +
        + +
        +
        + + +
        + + + + + + + + +
        + +
        +
        + +
        +
        +

        April 24, 2024

        +

        + PyTorch 2.3 Release Blog +

        +
        +
        + +
        +
        +
        + +
        +

        + by + + Team PyTorch + +

        +

        We are excited to announce the release of PyTorch® 2.3 (release note)! PyTorch 2.3 offers support for user-defined Triton kernels in torch.compile, allowing for users to migrate their own Triton kernels from eager without experiencing performance regressions or graph breaks. Tensor Parallelism improves the experience for training Large Language Models using native PyTorch functions, which has been validated on training runs for 100B parameter models. As well, semi-structured sparsity implements semi-structured sparsity as a Tensor subclass, with observed speedups of up to 1.6 over dense matrix multiplication.

        + +

        This release is composed of 3393 commits and 426 contributors since PyTorch 2.2. We want to sincerely thank our dedicated community for your contributions. As always, we encourage you to try these out and report any issues as we improve 2.3. More information about how to get started with the PyTorch 2-series can be found at our Getting Started page.

        + + + + + + + + + + + + + + + + + + + + + + +
        Beta + Prototype + Performance Improvements +
        User-defined Triton kernels in torch.compile + torch.export adds new API to specify dynamic_shapes + Weight-Only-Quantization introduced into Inductor CPU backend +
        Tensor parallelism within PyTorch Distributed + Asynchronous checkpoint generation + +
        Support for semi-structured sparsity + + +
        + +

        *To see a full list of public feature submissions click here.

        + +

        Beta Features

        + +

        [Beta] Support for User-defined Triton kernels in torch.compile

        + +

        Allows for PyTorch code that contains triton kernels to be executed natively using torch.compile. This enables users to migrate code containing triton kernels from eager PyTorch to torch.compile without running into performance regressions or graph breaks. Native support also creates an opportunity for Torch Inductor to precompile the user-defined Triton kernel as well as better organize code around the Triton kernel allowing for further optimizations.

        + +

        You can find more information about how to utilize user defined Triton kernels in torch.compile within this tutorial.

        + +

        [Beta] Tensor Parallelism introduces more efficient ways to train LLMs

        + +

        The Tensor Parallel API facilitates various tensor manipulations across GPUs/hosts and integrates with FSDP for 2D Parallelism (Tensor parallelism across devices + Data Parallelism across hosts). It also offers a low-level API for constructing higher-level Tensor parallel APIs. This API has been validated to support the training of transformer models with over 100 billion parameters.

        + +

        You can find more information on how to utilize this within your workflows within this tutorial.

        + +

        [Beta] Semi-structured sparsity provides users with a way to take advantage of accelerated sparse inference and memory savings

        + +

        torch.sparse.SparseSemiStructuredTensor implements semi-structured sparsity as a Tensor subclass, which have observed speedups of up to 1.6 over dense matrix multiplication.

        + +

        In particular it adds:

        + +
          +
        • Additional support for quantization composability (mixed dtype, dequant fusion)
        • +
        • Updated cuSPARSELt and CUTLASS kernels
        • +
        • torch.compile support
        • +
        + +

        You can find more information on how to take advantage of semi-structured sparsity here.

        + +

        Prototype Features

        + +

        [PROTOTYPE] torch.export adds new API to specify dynamic_shapes

        + +

        You can now use torch.export.Dim to better represent dynamic shapes by enabling developers to specify ranges (min and max values) that can be reused across different input dimensions that are constrained to be equal.

        + +

        To learn more about torch.export.Dim as well as how it can be used to express more interesting relationships (such as linear arithmetic expressions) check out the tutorial here.

        + +

        [PROTOTYPE] Asynchronous checkpoint generation

        + +

        Asynchronous checkpoint generation allows users to continue their training loops while checkpoints are being generated, essentially offloading much of the checkpointing cost.

        + +

        You can find out how to utilize this within your own workflows with this example.

        + +

        Performance Improvements

        + +

        [PROTOTYPE] Weight-Only-Quantization introduced into Inductor CPU backend

        + +

        PyTorch 2.3 enhances LLM inference performance on torch inductor CPU backend. The project gpt-fast offers a simple and efficient PyTorch native acceleration for transformer text generation with torch.compile. Prior to 2.3 only CUDA devices were supported and this feature enables the CPU counterpart by providing highly optimized kernels for the int4 and int8 weight only quantization Linear.

        + +

        For more information / how to utilize this feature please refer to the gpt-fast README.

        + +
        +
        +
        +
        + + +
        +
        +
        +
        +

        Docs

        +

        Access comprehensive developer documentation for PyTorch

        + View Docs +
        + +
        +

        Tutorials

        +

        Get in-depth tutorials for beginners and advanced developers

        + View Tutorials +
        + +
        +

        Resources

        +

        Find development resources and get your questions answered

        + View Resources +
        +
        +
        +
        + +
        + +
        + +
        +
        +
        +
        + + +
        +
        +
        + + +
        + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/pytorch2-4/index.html b/blog/pytorch2-4/index.html new file mode 100644 index 000000000000..85e9cee1df67 --- /dev/null +++ b/blog/pytorch2-4/index.html @@ -0,0 +1,791 @@ + + + + + + + + + + + + + PyTorch 2.4 Release Blog | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
        +
        +
        + Join us at PyTorch Conference in San Francisco, October 22-23. CFP open now! Learn more. +
        +
        + +
        +
        + + +
        + + + + + + + + +
        + +
        +
        + +
        +
        +

        July 24, 2024

        +

        + PyTorch 2.4 Release Blog +

        +
        +
        + +
        +
        +
        + +
        +

        + by + + Team PyTorch + +

        +

        We are excited to announce the release of PyTorch® 2.4 (release note)! PyTorch 2.4 adds support for the latest version of Python (3.12) for torch.compile. AOTInductor freezing gives developers running AOTInductor more performance-based optimizations by allowing the serialization of MKLDNN weights. As well, a new default TCPStore server backend utilizing libuv has been introduced which should significantly reduce initialization times for users running large-scale jobs. Finally, a new Python Custom Operator API makes it easier than before to integrate custom kernels into PyTorch, especially for torch.compile.

        + +

        This release is composed of 3661 commits and 475 contributors since PyTorch 2.3. We want to sincerely thank our dedicated community for your contributions. As always, we encourage you to try these out and report any issues as we improve 2.4. More information about how to get started with the PyTorch 2-series can be found at our Getting Started page.

        + + + + + + + + + + + + + + + + + + + + + + + + + + + +
        Beta + Prototype + Performance Improvements +
        Python 3.12 support for torch.compile + FSDP2: DTensor-based per-parameter-sharding FSDP + torch.compile optimizations for AWS Graviton (aarch64-linux) processors +
        AOTInductor Freezing for CPU + torch.distributed.pipelining, simplified pipeline parallelism + BF16 symbolic shape optimization in TorchInductor +
        New Higher-level Python Custom Operator API + Intel GPU is available through source build + Performance optimizations for GenAI projects utilizing CPU devices +
        Switching TCPStore’s default server backend to libuv + + +
        + +

        *To see a full list of public feature submissions click here.

        + +

        Beta Features

        + +

        [Beta] Python 3.12 support for torch.compile

        + +

        torch.compile() previously only supported Python 3.8-3.11. Users can now optimize models with torch.compile() with Python 3.12.

        + +

        [Beta] AOTInductor Freezing for CPU

        + +

        This feature enables users to turn on the freezing flag when using AOTInductor on CPU. With this feature, AOTInductor can cover the same set of op scenarios and reach on-par performance as Inductor CPP backend. Before this support, when models contain MKLDNN operators (when computation-intensive operators are involved, such as Convolution, Linear, ConvTranspose, and so on) and freezing is on, those models will fail to run since AOTInductor didn’t support serializing the MKLDNN weights which have an opaque format.

        + +

        The workflow is as explained in the AOTInductor tutorial, in addition to that users could now add the freezing flag to get better performance:

        +
        export TORCHINDUCTOR_FREEZING=1
        +
        + +

        [Beta] New Higher-level Python Custom Operator API

        + +

        We’ve added a new higher-level Python Custom Operator API that makes it easier than before to extend PyTorch with custom operators that behave like PyTorch’s built-in operators. Operators registered using the new high-level torch.library APIs are guaranteed to be compatible with torch.compile and other PyTorch subsystems; authoring a custom operator in Python using the previous low-level torch.library APIs required deep understanding of PyTorch internals and has many footguns.

        + +

        Please see the tutorial for more information.

        + +

        [Beta] Switching TCPStore’s default server backend to libuv

        + +

        Introduced a new default server backend for TCPStore built with libuv which should introduce significantly lower initialization times and better scalability. This should ideally benefit users with a much shorter startup time when accounting for large-scale jobs.

        + +

        For more information on the motivation + fallback instructions please refer to this tutorial.

        + +

        Prototype Features

        + +

        [PROTOTYPE] FSDP2: DTensor-based per-parameter-sharding FSDP

        + +

        FSDP2 is a new fully sharded data parallelism implementation that uses dim-0 per-parameter sharding to resolve fundamental composability challenges with FSDP1’s flat-parameter sharding.

        + +

        For more information regarding the motivation / design for FSDP2 please refer to the RFC on Github.

        + +

        [PROTOTYPE] torch.distributed.pipelining, simplified pipeline parallelism

        + +

        Pipeline Parallelism is one of the primitive parallelism techniques for deep learning. It allows the execution of a model to be partitioned such that multiple micro-batches can execute different parts of the model code concurrently.

        + +

        torch.distributed.pipelining provides a toolkit that allows for easy implementation of pipeline parallelism on general models while also offering composability with other common PyTorch distributed features like DDP, FSDP, or tensor parallel.

        + +

        For more information on this please refer to our documentation and tutorial.

        + +

        [PROTOTYPE] Intel GPU is available through source build

        + +

        Intel GPU in PyTorch on Linux systems offers fundamental functionalities on Intel® Data Center GPU Max Series: eager mode and torch.compile.

        + +

        For eager mode, the commonly used Aten operators are implemented by using SYCL programming language. The most performance-critical graphs and operators are highly optimized by using oneAPI Deep Neural Network (oneDNN). For torch.compile mode, Intel GPU backend is integrated to Inductor on top of Triton.

        + +

        For more information for Intel GPU source build please refer to our blog post and documentation.

        + +

        Performance Improvements

        + +

        torch.compile optimizations for AWS Graviton (aarch64-linux) processors

        + +

        AWS optimized the PyTorch torch.compile feature for AWS Graviton3 processors. This optimization results in up to 2x better performance for Hugging Face model inference (based on geomean of performance improvement for 33 models) and up to 1.35x better performance for TorchBench model inference (geomean of performance improvement for 45 models) compared to the default eager mode inference across several natural language processing (NLP), computer vision (CV), and recommendation models on AWS Graviton3-based Amazon EC2 instances.

        + +

        For more information regarding specific technical details please refer to the blog post.

        + +

        BF16 symbolic shape optimization in TorchInductor

        + +

        Pytorch users can now experience improved quality and performance gains with the beta BF16 symbolic shape support. While static shape may afford additional optimization opportunities compared to symbolic shape, it is insufficient for scenarios such as inference services with varying batch size and sequence length, or detection models with data-dependent output shape.

        + +

        Verification using TorchBench, Huggingface, and timms_model shows a similar pass rate and comparable speedup with the BF16 static shape scenario. Combining the benefits of symbolic shape with BF16 AMX instructions hardware acceleration provided by Intel CPUs and general Inductor CPU backend optimizations applicable to both static and symbolic shape in PyTorch 2.4, the performance for BF16 symbolic shape has significantly improved compared to PyTorch 2.3.

        + +

        The API to use this feature:

        + +
        model = .
        +model.eval()
        +with torch.autocast(device_type=cpu, dtype=torch.bfloat16), torch.no_grad():
        +   compiled_model = torch.compile(model, dynamic=True)
        +
        + +

        Performance optimizations for GenAI projects utilizing CPU devices

        + +

        Highlighting the enhanced performance of PyTorch on CPU, as demonstrated through the optimizations made for the “Segment Anything Fast” and “Diffusion Fast” project. However, only CUDA devices are supported in the model. We have incorporated CPU support into the projects, enabling users to leverage the increased power of CPU for running the project’s experiments. Meanwhile, we have employed a block-wise attention mask for SDPA as well, which can significantly reduce peak memory usage and improve performance. We have also optimized a series of layout propagation rules in Inductor CPU to improve performance.

        + +

        To facilitate this, we have updated the README file. The API to use this feature is given below, simply providing --device cpu in the command lines:

        + +
          +
        • +

          For Segment Anything Fast:

          + +
          export SEGMENT_ANYTHING_FAST_USE_FLASH_4=0
          +python run_experiments.py 16 vit_b <pytorch_github> <segment-anything_github>
          +<path_to_experiments_data> --run-experiments --num-workers 32 --device cpu
          +
          +
        • +
        • +

          For Diffusion Fast:

          + +
          python run_benchmark.py --compile_unet --compile_vae --enable_fused_projections --device=cpu
          +
          +
        • +
        + +

        Users can follow the guidelines to run the experiments and observe the performance improvements firsthand, as well as explore the performance improvement trends across FP32 and BF16 data types.

        + +

        Additionally, users can achieve good performance using torch.compile and SDPA. By observing the performance trends across these different factors, users can gain a deeper understanding of how various optimizations enhance PyTorch’s performance on CPU.

        + +
        +
        +
        +
        + + +
        +
        +
        +
        +

        Docs

        +

        Access comprehensive developer documentation for PyTorch

        + View Docs +
        + +
        +

        Tutorials

        +

        Get in-depth tutorials for beginners and advanced developers

        + View Tutorials +
        + +
        +

        Resources

        +

        Find development resources and get your questions answered

        + View Resources +
        +
        +
        +
        + +
        + +
        + +
        +
        +
        +
        + + +
        +
        +
        + + +
        + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/pytorch2-5/index.html b/blog/pytorch2-5/index.html new file mode 100644 index 000000000000..e384dbba86eb --- /dev/null +++ b/blog/pytorch2-5/index.html @@ -0,0 +1,780 @@ + + + + + + + + + + + + + PyTorch 2.5 Release Blog | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
        +
        +
        + Join us at PyTorch Conference in San Francisco, October 22-23. CFP open now! Learn more. +
        +
        + +
        +
        + + +
        + + + + + + + + +
        + +
        +
        + +
        +
        +

        October 17, 2024

        +

        + PyTorch 2.5 Release Blog +

        +
        +
        + +
        +
        +
        + +
        +

        + by + + Team PyTorch + +

        +

        We are excited to announce the release of PyTorch® 2.5 (release note)! This release features a new cuDNN backend for SDPA, enabling speedups by default for users of SDPA on H100s or newer GPUs. As well, regional compilation of torch.compile offers a way to reduce the cold start up time for torch.compile by allowing users to compile a repeated nn.Module (e.g. a transformer layer in LLM) without recompilations. Finally, TorchInductor CPP backend offers solid performance speedup with numerous enhancements like FP16 support, CPP wrapper, AOT-Inductor mode, and max-autotune mode.

        + +

        This release is composed of 4095 commits from 504 contributors since PyTorch 2.4. We want to sincerely thank our dedicated community for your contributions. As always, we encourage you to try these out and report any issues as we improve 2.5. More information about how to get started with the PyTorch 2-series can be found at our Getting Started page.

        + +

        As well, please check out our new ecosystem projects releases with TorchRec and TorchFix.

        + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
        Beta + Prototype +
        cuDNN backend for SDPA + FlexAttention +
        torch.compile regional compilation without recompilations + Compiled Autograd +
        TorchDynamo added support for exception handling & MutableMapping types + Flight Recorder +
        TorchInductor CPU backend optimization + Max-autotune Support on CPU with GEMM Template +
        + TorchInductor on Windows +
        + FP16 support on CPU path for both eager mode and TorchInductor CPP backend +
        + Autoload Device Extension +
        + Enhanced Intel GPU support +
        + +

        *To see a full list of public feature submissions click here.

        + +

        BETA FEATURES

        + +

        [Beta] cuDNN backend for SDPA

        + +

        The cuDNN “Fused Flash Attention” backend was landed for torch.nn.functional.scaled_dot_product_attention. On NVIDIA H100 GPUs this can provide up to 75% speed-up over FlashAttentionV2. This speedup is enabled by default for all users of SDPA on H100 or newer GPUs.

        + +

        [Beta] torch.compile regional compilation without recompilations

        + +

        Regional compilation without recompilations, via torch._dynamo.config.inline_inbuilt_nn_modules which default to True in 2.5+. This option allows users to compile a repeated nn.Module (e.g. a transformer layer in LLM) without recompilations. Compared to compiling the full model, this option can result in smaller compilation latencies with 1%-5% performance degradation compared to full model compilation.

        + +

        See the tutorial for more information.

        + +

        [Beta] TorchInductor CPU backend optimization

        + +

        This feature advances Inductor’s CPU backend optimization, including CPP backend code generation and FX fusions with customized CPU kernels. The Inductor CPU backend supports vectorization of common data types and all Inductor IR operations, along with the static and symbolic shapes. It is compatible with both Linux and Windows OS and supports the default Python wrapper, the CPP wrapper, and AOT-Inductor mode.

        + +

        Additionally, it extends the max-autotune mode of the GEMM template (prototyped in 2.5), offering further performance gains. The backend supports various FX fusions, lowering to customized kernels such as oneDNN for Linear/Conv operations and SDPA. The Inductor CPU backend consistently achieves performance speedups across three benchmark suites—TorchBench, Hugging Face, and timms—outperforming eager mode in 97.5% of the 193 models tested.

        + +

        PROTOTYPE FEATURES

        + +

        [Prototype] FlexAttention

        + +

        We’ve introduced a flexible API that enables implementing various attention mechanisms such as Sliding Window, Causal Mask, and PrefixLM with just a few lines of idiomatic PyTorch code. This API leverages torch.compile to generate a fused FlashAttention kernel, which eliminates extra memory allocation and achieves performance comparable to handwritten implementations. Additionally, we automatically generate the backwards pass using PyTorch’s autograd machinery. Furthermore, our API can take advantage of sparsity in the attention mask, resulting in significant improvements over standard attention implementations.

        + +

        For more information and examples, please refer to the official blog post and Attention Gym.

        + +

        [Prototype] Compiled Autograd

        + +

        Compiled Autograd is an extension to the PT2 stack allowing the capture of the entire backward pass. Unlike the backward graph traced by AOT dispatcher, Compiled Autograd tracing is deferred until backward execution time, which makes it impervious to forward pass graph breaks, and allows it to record backward hooks into the graph.

        + +

        Please refer to the tutorial for more information.

        + +

        [Prototype] Flight Recorder

        + +

        Flight recorder is a new debugging tool that helps debug stuck jobs. The tool works by continuously capturing information about collectives as they run. Upon detecting a stuck job, the information can be used to quickly identify misbehaving ranks/machines along with code stack traces.

        + +

        For more information please refer to the following tutorial.

        + +

        [Prototype] Max-autotune Support on CPU with GEMM Template

        + +

        Max-autotune mode for the Inductor CPU backend in torch.compile profiles multiple implementations of operations at compile time and selects the best-performing one. This is particularly beneficial for GEMM-related operations, using a C++ template-based GEMM implementation as an alternative to the ATen-based approach with oneDNN and MKL libraries. We support FP32, BF16, FP16, and INT8 with epilogue fusions for x86 CPUs. We’ve seen up to 7% geomean speedup on the dynamo benchmark suites and up to 20% boost in next-token latency for LLM inference.

        + +

        For more information please refer to the tutorial.

        + +

        [Prototype] TorchInductor CPU on Windows

        + +

        Inductor CPU backend in torch.compile now works on Windows. We support MSVC (cl), clang (clang-cl) and Intel compiler (icx-cl) for Windows inductor currently.

        + +

        See the tutorial for more details.

        + +

        [Prototype] FP16 support on CPU path for both eager mode and TorchInductor CPP backend

        + +

        Float16 is a commonly used reduced floating point type for performance improvement in neural network inference/training. Since this release, float16 for both eager and TorchInductor is supported on the CPU path.

        + +

        [Prototype] Autoload Device Extension

        + +

        PyTorch now supports autoloading for out-of-tree device extensions, streamlining integration by eliminating the need for manual imports. This feature, enabled through the torch.backends entrypoint, simplifies usage by ensuring seamless extension loading, while allowing users to disable it via an environment variable if needed.

        + +

        See the tutorial for more information.

        + +

        [Prototype] Enhanced Intel GPU support

        + +

        Intel GPUs support enhancement is now available for both Intel® Data Center GPU Max Series and Intel® Client GPUs (Intel® Core™ Ultra processors with built-in Intel® Arc™ graphics and Intel® Arc™ Graphics for dGPU parts), which is to make it easier to accelerate your Machine Learning workflows on Intel GPUs in PyTorch 2.5 release. We also enabled the initial support of PyTorch on Windows for Intel® Client GPUs in this release.

        + +
          +
        • Expanded PyTorch hardware backend support matrix to include both Intel Data Center and Client GPUs.  
        • +
        • The implementation of SYCL* kernels to enhance coverage and execution of Aten operators on Intel GPUs to boost performance in PyTorch eager mode.
        • +
        • Enhanced Intel GPU backend of torch.compile to improve inference and training performance for a wide range of deep learning workloads.
        • +
        + +

        These features are available through PyTorch preview and nightly binary PIP wheels. For more information regarding Intel GPU support, please refer to documentation.

        + +
        +
        +
        +
        + + +
        +
        +
        +
        +

        Docs

        +

        Access comprehensive developer documentation for PyTorch

        + View Docs +
        + +
        +

        Tutorials

        +

        Get in-depth tutorials for beginners and advanced developers

        + View Tutorials +
        + +
        +

        Resources

        +

        Find development resources and get your questions answered

        + View Resources +
        +
        +
        +
        + +
        + +
        + +
        +
        +
        +
        + + +
        +
        +
        + + +
        + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/pytorch2-6/index.html b/blog/pytorch2-6/index.html new file mode 100644 index 000000000000..0b5898e63773 --- /dev/null +++ b/blog/pytorch2-6/index.html @@ -0,0 +1,780 @@ + + + + + + + + + + + + + PyTorch 2.6 Release Blog | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
        +
        +
        + Join us at PyTorch Conference in San Francisco, October 22-23. CFP open now! Learn more. +
        +
        + +
        +
        + + +
        + + + + + + + + +
        + +
        +
        + +
        +
        +

        January 29, 2025

        +

        + PyTorch 2.6 Release Blog +

        +
        +
        + +
        +
        +
        + +
        +

        + by + + Team PyTorch + +

        +

        We are excited to announce the release of PyTorch® 2.6 (release notes)! This release features multiple improvements for PT2: torch.compile can now be used with Python 3.13; new performance-related knob torch.compiler.set_stance; several AOTInductor enhancements. Besides the PT2 improvements, another highlight is FP16 support on X86 CPUs.

        + +

        NOTE: Starting with this release we are not going to publish on Conda, please see [Announcement] Deprecating PyTorch’s official Anaconda channel for the details.

        + +

        For this release the experimental Linux binaries shipped with CUDA 12.6.3 (as well as Linux Aarch64, Linux ROCm 6.2.4, and Linux XPU binaries) are built with CXX11_ABI=1 and are using the Manylinux 2.28 build platform. If you build PyTorch extensions with custom C++ or CUDA extensions, please update these builds to use CXX_ABI=1 as well and report any issues you are seeing. For the next PyTorch 2.7 release we plan to switch all Linux builds to Manylinux 2.28 and CXX11_ABI=1, please see [RFC] PyTorch next wheel build platform: manylinux-2.28 for the details and discussion.

        + +

        Also in this release as an important security improvement measure we have changed the default value for weights_only parameter of torch.load. This is a backward compatibility-breaking change, please see this forum post for more details.

        + +

        This release is composed of 3892 commits from 520 contributors since PyTorch 2.5. We want to sincerely thank our dedicated community for your contributions. As always, we encourage you to try these out and report any issues as we improve PyTorch. More information about how to get started with the PyTorch 2-series can be found at our Getting Started page.

        + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
        Beta + Prototype +
        torch.compiler.set_stance + Improved PyTorch user experience on Intel GPUs +
        torch.library.triton_op + FlexAttention support on X86 CPU for LLMs +
        torch.compile support for Python 3.13 + Dim.AUTO +
        New packaging APIs for AOTInductor + CUTLASS and CK GEMM/CONV Backends for AOTInductor +
        AOTInductor: minifier + +
        AOTInductor: ABI-compatible mode code generation + +
        FP16 support for X86 CPUs + +
        + +

        *To see a full list of public feature submissions click here.

        + +

        BETA FEATURES

        + +

        [Beta] torch.compiler.set_stance

        + +

        This feature enables the user to specify different behaviors (“stances”) that torch.compile can take between different invocations of compiled functions. One of the stances, for example, is

        + +

        “eager_on_recompile”, that instructs PyTorch to code eagerly when a recompile is necessary, reusing cached compiled code when possible.

        + +

        For more information please refer to the set_stance documentation and the Dynamic Compilation Control with torch.compiler.set_stance tutorial.

        + +

        [Beta] torch.library.triton_op

        + +

        torch.library.triton_op offers a standard way of creating custom operators that are backed by user-defined triton kernels.

        + +

        When users turn user-defined triton kernels into custom operators, torch.library.triton_op allows torch.compile to peek into the implementation, enabling torch.compile to optimize the triton kernel inside it.

        + +

        For more information please refer to the triton_op documentation and the Using User-Defined Triton Kernels with torch.compile tutorial.

        + +

        [Beta] torch.compile support for Python 3.13

        + +

        torch.compile previously only supported Python up to version 3.12. Users can now optimize models with torch.compile in Python 3.13.

        + +

        [Beta] New packaging APIs for AOTInductor

        + +

        A new package format, “PT2 archive”, has been introduced. This essentially contains a zipfile of all the files that need to be used by AOTInductor, and allows users to send everything needed to other environments. There is also functionality to package multiple models into one artifact, and to store additional metadata inside of the package.

        + +

        For more details please see the updated torch.export AOTInductor Tutorial for Python runtime.

        + +

        [Beta] AOTInductor: minifier

        + +

        If a user encounters an error while using AOTInductor APIs, AOTInductor Minifier allows creation of a minimal nn.Module that reproduces the error.

        + +

        For more information please see the AOTInductor Minifier documentation.

        + +

        [Beta] AOTInductor: ABI-compatible mode code generation

        + +

        AOTInductor-generated model code has dependency on Pytorch cpp libraries. As Pytorch evolves quickly, it’s important to make sure previously AOTInductor compiled models can continue to run on newer Pytorch versions, i.e. AOTInductor is backward compatible.

        + +

        In order to guarantee application binary interface (ABI) backward compatibility, we have carefully defined a set of stable C interfaces in libtorch and make sure AOTInductor generates code that only refers to the specific set of APIs and nothing else in libtorch. We will keep the set of C APIs stable across Pytorch versions and thus provide backward compatibility guarantees for AOTInductor-compiled models.

        + +

        [Beta] FP16 support for X86 CPUs (both eager and Inductor modes)

        + +

        Float16 datatype is commonly used for reduced memory usage and faster computation in AI inference and training. CPUs like the recently launched Intel® Xeon® 6 with P-Cores support Float16 datatype with native accelerator AMX. Float16 support on X86 CPUs was introduced in PyTorch 2.5 as a prototype feature, and now it has been further improved for both eager mode and Torch.compile + Inductor mode, making it Beta level feature with both functionality and performance verified with a broad scope of workloads.

        + +

        PROTOTYPE FEATURES

        + +

        [Prototype] Improved PyTorch user experience on Intel GPUs

        + +

        PyTorch user experience on Intel GPUs is further improved with simplified installation steps, Windows release binary distribution and expanded coverage of supported GPU models including the latest Intel® Arc™ B-Series discrete graphics. Application developers and researchers seeking to fine-tune, inference and develop with PyTorch models on Intel® Core™ Ultra AI PCs and Intel® Arc™ discrete graphics will now be able to directly install PyTorch with binary releases for Windows, Linux and Windows Subsystem for Linux 2.

        + +
          +
        • Simplified Intel GPU software stack setup to enable one-click installation of the torch-xpu PIP wheels to run deep learning workloads in an out of the box fashion, eliminating the complexity of installing and activating Intel GPU development software bundles.
        • +
        • Windows binary releases for torch core, torchvision and torchaudio have been made available for Intel GPUs, and the supported GPU models have been expanded from Intel® Core™ Ultra Processors with Intel® Arc™ Graphics, Intel® Core™ Ultra Series 2 with Intel® Arc™ Graphics and Intel® Arc™ A-Series Graphics to the latest GPU hardware Intel® Arc™ B-Series graphics.
        • +
        • Further enhanced coverage of Aten operators on Intel GPUs with SYCL* kernels for smooth eager mode execution, as well as bug fixes and performance optimizations for torch.compile on Intel GPUs.
        • +
        + +

        For more information regarding Intel GPU support, please refer to Getting Started Guide.

        + +

        [Prototype] FlexAttention support on X86 CPU for LLMs

        + +

        FlexAttention was initially introduced in PyTorch 2.5 to provide optimized implementations for Attention variants with a flexible API. In PyTorch 2.6, X86 CPU support for FlexAttention was added through TorchInductor CPP backend. This new feature leverages and extends current CPP template abilities to support broad attention variants (e.x.: PageAttention, which is critical for LLMs inference) based on the existing FlexAttention API, and brings optimized performance on x86 CPUs. With this feature, it’s easy to use FlexAttention API to compose Attention solutions on CPU platforms and achieve good performance.

        + +

        [Prototype] Dim.AUTO

        + +

        Dim.AUTO allows usage of automatic dynamic shapes with torch.export. Users can export with Dim.AUTO and “discover” the dynamic behavior of their models, with min/max ranges, relations between dimensions, and static/dynamic behavior being automatically inferred.

        + +

        This is a more user-friendly experience compared to the existing named-Dims approach for specifying dynamic shapes, which requires the user to fully understand the dynamic behavior of their models at export time. Dim.AUTO allows users to write generic code that isn’t model-dependent, increasing ease-of-use for exporting with dynamic shapes.

        + +

        Please see torch.export tutorial for more information.

        + +

        [Prototype] CUTLASS and CK GEMM/CONV Backends for AOTInductor

        + +

        The CUTLASS and CK backend adds kernel choices for GEMM autotuning in Inductor. This is now also available in AOTInductor which can run in C++ runtime environments. A major improvement to the two backends is improved compile-time speed by eliminating redundant kernel binary compilations and dynamic shapes support.

        + +
        +
        +
        +
        + + +
        +
        +
        +
        +

        Docs

        +

        Access comprehensive developer documentation for PyTorch

        + View Docs +
        + +
        +

        Tutorials

        +

        Get in-depth tutorials for beginners and advanced developers

        + View Tutorials +
        + +
        +

        Resources

        +

        Find development resources and get your questions answered

        + View Resources +
        +
        +
        +
        + +
        + +
        + +
        +
        +
        +
        + + +
        +
        +
        + + +
        + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/pytorchs-tracing-based-selective-build/index.html b/blog/pytorchs-tracing-based-selective-build/index.html new file mode 100644 index 000000000000..5c2425fd7d33 --- /dev/null +++ b/blog/pytorchs-tracing-based-selective-build/index.html @@ -0,0 +1,890 @@ + + + + + + + + + + + + + PyTorch’s Tracing Based Selective Build | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
        +
        +
        + Join us at PyTorch Conference in San Francisco, October 22-23. CFP open now! Learn more. +
        +
        + +
        +
        + + +
        + + + + + + + + +
        + +
        +
        + +
        +
        +

        October 17, 2022

        +

        + PyTorch’s Tracing Based Selective Build +

        +
        +
        + +
        +
        +
        + +
        +

        + by + + Dhruv Matani, Suraj Subramanian + +

        +

        Introduction

        + +

        TL;DR: It can be challenging to run PyTorch on mobile devices, SBCs (Single Board Computers), and IOT devices. When compiled, the PyTorch library is huge and includes dependencies that might not be needed for the on-device use case.

        + +

        To run a specific set of models on-device, we actually require only a small subset of the features in the PyTorch library. We found that using a PyTorch runtime generated using selective build can achieve up to 90% reduction in binary size (for the CPU and QuantizedCPU backends on an x86-64 build on Linux). In this blog, we share our experience of generating model-specific minimal runtimes using Selective Build and show you how to do the same.

        + +

        Why is this important for app developers?

        + +

        Using a PyTorch runtime generated by selective build can reduce the size of AI-powered apps by 30+ MB - a significant reduction for a typical mobile app! Making mobile applications more lightweight has many benefits - they are runnable on a wider variety of devices, consume less cellular data, and can be downloaded and updated faster on user’s devices.

        + +

        What does the Developer Experience look like?

        + +

        This method can work seamlessly with any existing PyTorch Mobile deployment workflows. All you need to do is replace the general PyTorch runtime library with a runtime customized for the specific models you wish to use in your application. The general steps in this process are:

        + +
          +
        1. Build the PyTorch Runtime in instrumentation mode (this is called an instrumentation build of PyTorch). This will record the used operators, kernels and features.
        2. +
        3. Run your models through this instrumentation build by using the provided model_tracer binary. This will generate a single YAML file that stores all the features used by your model. These features will be preserved in the minimal runtime.
        4. +
        5. Build PyTorch using this YAML file as input. This is the selective build technique, and it greatly reduces the size of the final PyTorch binary.
        6. +
        7. Use this selectively-built PyTorch library to reduce the size of your mobile application!
        8. +
        + +

        Building the PyTorch Runtime in a special “instrumentation” mode ( by passing the TRACING_BASED=1 build option) generates an instrumentation build runtime of PyTorch, along with a model_tracer binary. Running a model with this build allows us to trace the parts of PyTorch used by the model.

        + +

        + +

        + +

        + Figure 1: Instrumentation build of PyTorch +

        + +
        # Clone the PyTorch repo
        +git clone https://github.com/pytorch/pytorch.git
        +cd pytorch
        +
        +# Build the model_tracer
        +USE_NUMPY=0 USE_DISTRIBUTED=0 USE_CUDA=0 TRACING_BASED=1 \
        +  python setup.py develop
        +
        + +

        Now this instrumentation build is used to run a model inference with representative inputs. The model_tracer binary observes parts of the instrumentation build that were activated during the inference run, and dumps it to a YAML file.

        + +

        + +

        + +

        + Figure 2: YAML file generated by running model(s) on an instrumentation build +

        + +
        # Generate YAML file
        +./build/bin/model_tracer \
        +  --model_input_path /tmp/path_to_model.ptl \
        +  --build_yaml_path /tmp/selected_ops.yaml
        +
        + +

        Now we build the PyTorch Runtime again, but this time using the YAML file generated by the tracer. The runtime now only includes those parts that are needed for this model. This is called “Selectively built PyTorch runtime” in the diagram below.

        + +
        # Clean out cached configuration
        +make clean
        +
        +# Build PyTorch using Selected Operators (from the YAML file)
        +# using the host toolchain, and use this generated library
        +BUILD_PYTORCH_MOBILE_WITH_HOST_TOOLCHAIN=1 \
        +USE_LIGHTWEIGHT_DISPATCH=0 \
        +BUILD_LITE_INTERPRETER=1 \
        +SELECTED_OP_LIST=/tmp/selected_ops.yaml \
        +TRACING_BASED=1 \
        +  ./scripts/build_mobile.sh
        +
        + +

        + +

        + +

        + Figure 3: Selective Build of PyTorch and model execution on a selectively built PyTorch runtime +

        + +

        Show me the code!

        + +

        We’ve put together a notebook to illustrate what the process above looks like in code using a simple PyTorch model.

        + +

        For a more hands-on tutorial to deploy this on Android/iOS this tutorial should be helpful.

        + +

        Technical FAQs

        + +

        Why is Tracing needed for a Selective Build of PyTorch?

        + +

        In PyTorch, CPU kernels can call other operators via the PyTorch Dispatcher. Simply including the set of root operators called directly by the model is not sufficient as there might be many more being called under-the-hood transitively. Running the model on representative inputs and observing the actual list of operators called (aka “tracing”) is the most accurate way of determining what parts of PyTorch are used.

        + +

        Additionally, factors such as which dtypes a kernel should handle are also runtime features that depend on actual input provided to the model. Hence, the tracing mechanism is extremely suitable for this purpose.

        + +

        Which features can be selected (in or out) by using Tracing Based Selective Build?

        + +

        The following features can be selected for the PyTorch runtime during the tracing based selective build process:

        + +
          +
        1. CPU/QuantizedCPU kernels for PyTorch’s ATen Operators: If a PyTorch Operator is not needed by a model targeted at a selectively built runtime, then the registration of that CPU kernel is omitted in the runtime. This is controlled via Torchgen code-gen.
        2. +
        3. Primary Operators: This is controlled by a macro named TORCH_SELECTIVE_SCHEMA (via templated selective build) that either selects a primary operator or de-selects it based on information in a generated header file.
        4. +
        5. Code that handles specific dtypes in CPU kernels: This is performed by generating exception throws in specific case statements in the switch case generated by the macro AT_PRIVATE_CHECK_SELECTIVE_BUILD.
        6. +
        7. Registration of Custom C++ Classes that extend PyTorch: This is controlled by the macro TORCH_SELECTIVE_CLASS, which can be used when registering Custom C++ Classes. The torch::selective_class_<> helper is to be used in conjunction with the macro TORCH_SELECTIVE_CLASS.
        8. +
        + +

        What is the structure of the YAML file used during the build?

        + +

        The YAML file generated after tracing looks like the example below. It encodes all the elements of the “selectable” build feature as specified above.

        + +
        include_all_non_op_selectives: false
        +build_features: []
        +operators:
        +    aten::add.Tensor:
        +        is_used_for_training: false
        +        is_root_operator: true
        +        include_all_overloads: false
        +    aten::len.t:
        +        is_used_for_training: false
        +        is_root_operator: true
        +        include_all_overloads: false
        +kernel_metadata:
        +    _local_scalar_dense_cpu:
        +    - Float
        +    add_stub:
        +    - Float
        +    copy_:
        +    - Bool
        +    - Byte
        +    mul_cpu:
        +    - Float
        +custom_classes: []
        +
        + +

        How exactly is code eliminated from the generated binary?

        + +

        Depending on the specific scenario, there are 2 main techniques that are used to hint the compiler and linker about unused and unreachable code. This code is then cleaned up by the compiler or linker as unreachable code.

        + +

        [1] Unreferenced functions removed by the Linker

        + +

        When a function that isn’t transitively referenced from any visible function is present in the compiled object files that are being linked together, the linker will remove it (if the right build flags are provided). This is leveraged in 2 scenarios by the selective build system.

        + +
        Kernel Registration in the Dispatcher
        + +

        If an operator’s kernel isn’t needed, then it isn’t registered with the dispatcher. An unregistered kernel means that the function is unreachable, and it will be removed by the linker.

        + +
        Templated Selective Build
        + +

        The general idea here is that a class template specialization is used to select a class that either captures a reference to a function or not (depending on whether it’s used) and the linker can come along and clean out the unreferenced function.

        + +

        For example, in the code below, there’s no reference to the function “fn2”, so it will be cleaned up by the linker since it’s not referenced anywhere.

        + +
        #include <vector>
        +#include <cstdio>
        +
        +template <typename T, bool>
        +struct FunctionSelector {
        +    T fn_;
        +    FunctionSelector(T fn): fn_(fn) {}
        +    T get() { return this->fn_; }
        +};
        +
        +// The "false" specialization of this class does NOT retain the argument passed
        +// to the class constructor, which means that the function pointer passed in
        +// is considered to be unreferenced in the program (unless it is referenced
        +// elsewhere).
        +template <typename T>
        +struct FunctionSelector<T, false> {
        +    FunctionSelector(T) {}
        +};
        +
        +template <typename T>
        +FunctionSelector<T, true> make_function_selector_true(T fn) {
        +    return FunctionSelector<T, true>(fn);
        +}
        +
        +template <typename T>
        +FunctionSelector<T, false> make_function_selector_false(T fn) {
        +    return FunctionSelector<T, false>(fn);
        +}
        +
        +typedef void(*fn_ptr_type)();
        +
        +std::vector<fn_ptr_type> fns;
        +
        +template <typename T>
        +void add_fn(FunctionSelector<T, true> fs) {
        +    fns.push_back(fs.get());
        +}
        +
        +template <typename T>
        +void add_fn(FunctionSelector<T, false>) {
        +    // Do nothing.
        +}
        +
        +// fn1 will be kept by the linker since it is added to the vector "fns" at
        +// runtime.
        +void fn1() {
        +    printf("fn1\n");
        +}
        +
        +// fn2 will be removed by the linker since it isn't referenced at all.
        +void fn2() {
        +    printf("fn2\n");
        +}
        +
        +int main() {
        +    add_fn(make_function_selector_true(fn1));
        +    add_fn(make_function_selector_false(fn2));
        +}
        +
        + +

        [2] Dead Code Eliminated by the Compiler

        + +

        C++ Compilers can detect dead (unreachable) code by analyzing the code’s control flow statically. For example, if there’s a code-path that comes after an unconditional exception throw, then all the code after it will be marked as dead code and not converted to object code by the compiler. Typically, compilers require the use of the -fdce flag to eliminate dead code.

        + +

        In the example below, you can see that the C++ code on the left (in the red boxes) doesn’t have any corresponding generated object code on the right.

        + +

        + +

        + +

        + Figure 4: Dead Code Elimination by C++ Compilers +

        + +

        This property is leveraged in the bodies of PyTorch kernel implementations that have a lot of repeated code to handle multiple dtypes of a Tensor. A dtype is the underlying data-type that the Tensor stores elements of. This can be one of float, double, int64, bool, int8, etc…

        + +

        Almost every PyTorch CPU kernel uses a macro of the form AT_DISPATCH_ALL_TYPES* that is used to substitute some code specialized for every dtype that the kernel needs to handle. For example:

        + +
        AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(
        +    kBool, kHalf, kBFloat16, dtype, "copy_kernel", [&] {
        +  cpu_kernel_vec(
        +      iter,
        +      [=](scalar_t a) -> scalar_t { return a; },
        +      [=](Vectorized<scalar_t> a) -> Vectorized<scalar_t> { return a; });
        +});
        +
        + +

        The macro AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3 internally has a switch-case statement that looks like the code in Figure-4 above. The tracing process records the dtypes triggered for the kernel tag “copy_kernel” and the build process processes these tags and inserts throw statements in every case statement that is handling the dtype that isn’t required for this kernel tag.

        + +

        This is how dtype selectivity is implemented in PyTorch’s Tracing Based Selective Build.

        + +

        Conclusion

        + +

        Tracing Based Selective Build is a practical and scalable approach to selecting only the used parts of an application to retain code that static analysis can not detect. This code is usually extremely data/input dependent in nature.

        + +

        This article provides detailed insights into how Tracing Based Selective Build works under the hood, and the technical details related to its implementation. These techniques can also be applied to other applications and situations that can benefit from reduced binary size.

        + +
        diff --git a/blog/quantization-aware-training/index.html b/blog/quantization-aware-training/index.html new file mode 100644 index 000000000000..e7b79da37ea2 --- /dev/null +++ b/blog/quantization-aware-training/index.html @@ -0,0 +1,901 @@
        Quantization-Aware Training for Large Language Models with PyTorch | PyTorch

        + by + + Andrew Or, Jerry Zhang, Evan Smothers, Kartikay Khandelwal, Supriya Rao + +

        +

        In this blog, we present an end-to-end Quantization-Aware Training (QAT) flow for large language models in PyTorch. We demonstrate how QAT in PyTorch can recover up to 96% of the accuracy degradation on hellaswag and 68% of the perplexity degradation on wikitext for Llama3 compared to post-training quantization (PTQ). We present the QAT APIs in torchao and showcase how users can leverage them for fine-tuning in torchtune.

        + +

        Llama3-8B fine-tuned on the C4 dataset (en subset) with and without QAT using int8 per token dynamic activations + int4 grouped per channel weights, evaluated on hellaswag and wikitext on an A100 GPU. Note the log scale for wikitext (lower is better).

        + +

        Figure 1: Llama3-8B fine-tuned on the C4 dataset (en subset) with and without QAT using int8 per token dynamic activations + int4 grouped per channel weights, evaluated on hellaswag and wikitext on an A100 GPU. Note the log scale for wikitext (lower is better).

        + +

        To demonstrate the effectiveness of QAT in an end-to-end flow, we further lowered the quantized model to XNNPACK, a highly optimized neural network library for backends including iOS and Android, through executorch. After lowering to XNNPACK, the QAT model saw 16.8% lower perplexity than the PTQ model, while maintaining the same model size and on-device inference and generation speeds.

        | Lowered model metric | PTQ | QAT |
        | Wikitext word perplexity (↓) | 23.316 | 19.403 |
        | Wikitext byte perplexity (↓) | 1.850 | 1.785 |
        | Wikitext bits per byte (↓) | 0.887 | 0.836 |
        | Model size | 3.881 GB | 3.881 GB |
        | On-device inference speed | 5.065 tok/s | 5.265 tok/s |
        | On-device generation speed | 8.369 tok/s | 8.701 tok/s |
        + +

        Table 1: QAT achieved 16.8% lower perplexity and unchanged model sizes and on-device inference and generation speeds on the Llama3-8B model lowered to XNNPACK. Linear layers are quantized using int8 per token dynamic activations + int4 grouped per channel weights, and embeddings are additionally quantized to int4 using a group size of 32 (QAT is only applied to linear layers). Wikitext evaluation is performed using 5 samples and a max sequence length of 127 on server CPU, since evaluation is not available on device (lower is better for all wikitext results). On-device inference and generation is benchmarked on the Samsung Galaxy S22 smartphone.

        + +

        QAT APIs

        + +

        We are excited for users to try our QAT API in torchao, which can be leveraged for both training and fine-tuning. This API involves two steps, prepare and convert: prepare applies a transformation on the linear layers in the model to simulate the numerics of quantization during training, and convert actually quantizes these layers into lower bit-widths after training. The converted model can then be used in the exact same way as the PTQ model:

        + +
        import torch
        +from torchtune.models.llama3 import llama3
        +from torchao.quantization.prototype.qat import Int8DynActInt4WeightQATQuantizer
        +
        +# Smaller version of llama3 to fit in a single GPU
        +model = llama3(
        +    vocab_size=4096,
        +    num_layers=16,
        +    num_heads=16,
        +    num_kv_heads=4,
        +    embed_dim=2048,
        +    max_seq_len=2048,
        +).cuda()
        +
        +# Quantizer for int8 dynamic per token activations +
        +# int4 grouped per channel weights, only for linear layers
        +qat_quantizer = Int8DynActInt4WeightQATQuantizer()
        +
        +# Insert "fake quantize" operations into linear layers.
        +# These operations simulate quantization numerics during
        +# training without performing any dtype casting
        +model = qat_quantizer.prepare(model)
        +
        +# Standard training loop
        +optimizer = torch.optim.SGD(model.parameters(), lr=0.001, momentum=0.9, weight_decay=1e-5)
        +loss_fn = torch.nn.CrossEntropyLoss()
        +for i in range(10):
        +    example = torch.randint(0, 4096, (2, 16)).cuda()
        +    target = torch.randn((2, 16, 4096)).cuda()
        +    output = model(example)
        +    loss = loss_fn(output, target)
        +    loss.backward()
        +    optimizer.step()
        +    optimizer.zero_grad()
        +
        +# Convert fake quantize to actual quantize operations
        +# The quantized model has the exact same structure as the
        +# quantized model produced in the corresponding PTQ flow
        +# through `Int8DynActInt4WeightQuantizer`
        +model = qat_quantizer.convert(model)
        +
        +# inference or generate
        +
        + +

        Fine-tuning with torchtune

        + +

        We also integrated this QAT flow into torchtune and provided recipes to run this in a distributed setting, similar to the existing full fine-tune distributed recipe. Users can additionally apply QAT during LLM fine-tuning by running the following command. See this README for more details.

        + +
        tune run --nproc_per_node 8 qat_distributed --config llama3/8B_qat_full
        +
        + +

        What is Quantization-Aware Training?

        + +

        Quantization-Aware Training (QAT) is a common quantization technique for mitigating model accuracy/perplexity degradation that arises from quantization. This is achieved by simulating quantization numerics during training while keeping the weights and/or activations in the original data type, typically float, effectively “fake quantizing” the values instead of actually casting them to lower bit-widths:

        + +
        # PTQ: x_q is quantized and cast to int8
        +# scale and zero point (zp) refer to parameters used to quantize x_float
        +# qmin and qmax refer to the range of quantized values
        +x_q = (x_float / scale + zp).round().clamp(qmin, qmax).cast(int8)
        +
        +# QAT: x_fq is still in float
        +# Fake quantize simulates the numerics of quantize + dequantize
        +x_fq = (x_float / scale + zp).round().clamp(qmin, qmax)
        +x_fq = (x_fq - zp) * scale
        +
        + +

        Since quantization involves non-differentiable operations like rounding, the QAT backward pass typically uses straight-through estimators (STE), a mechanism to estimate the gradients flowing through non-smooth functions, to ensure the gradients passed to the original weights are still meaningful. In this manner, the gradients are computed with the knowledge that the weights will ultimately be quantized after training, effectively allowing the model to adjust for quantization noise during the training process. Note that an alternative to QAT is quantized training, which actually casts the values to lower bit dtypes during training, but prior efforts have only seen success up to 8-bits, whereas QAT is effective even at lower bit-widths.
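        To make the straight-through estimator concrete, here is a minimal sketch (not the torchao implementation) of a custom autograd function that rounds in the forward pass but passes gradients through unchanged in the backward pass:

        import torch

        class RoundSTE(torch.autograd.Function):
            # Round in the forward pass; treat round() as the identity in the backward pass.
            @staticmethod
            def forward(ctx, x):
                return torch.round(x)

            @staticmethod
            def backward(ctx, grad_output):
                # Straight-through: the gradient flows as if no rounding had happened
                return grad_output

        x = torch.randn(4, requires_grad=True)
        y = RoundSTE.apply(x)
        y.sum().backward()
        print(x.grad)  # tensor([1., 1., 1., 1.]) - gradients pass straight through the rounding op

        The same idea is applied to the rounding inside the fake quantize operations above, so the weights keep receiving useful gradients even though their forward values are snapped to a quantized grid.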

        + +

        QAT in PyTorch

        + +

        We added an initial QAT flow in torchao under prototype here. Currently we support int8 dynamic per-token activations + int4 grouped per-channel weights (abbreviated 8da4w) for linear layers. These settings are motivated by a combination of kernel availability on edge backends and prior research on LLM quantization, which found that per-token activation and per-group weight quantization achieves the best model quality for LLMs compared to other quantization schemes.

        + +

        torchao QAT flow. This flow involves two steps: (1) prepare, which inserts the fake quantization ops into the model’s linear layers, and (2) convert, which replaces these fake quantization ops with actual quantize and dequantize ops after training.

        + +

        Figure 2: torchao QAT flow. This flow involves two steps: (1) prepare, which inserts the fake quantization ops into the model’s linear layers, and (2) convert, which replaces these fake quantization ops with actual quantize and dequantize ops after training.

        + +

        This flow produces the exact same quantized model as the PTQ flow using the same quantization settings (through Int8DynActInt4WeightQuantizer), but with quantized weights that achieve superior accuracies and perplexities. Thus, we can use the model converted from the QAT flow as a drop-in replacement for the PTQ model and reuse all the backend delegation logic and underlying kernels.

        + +

        Experimental Results

        + +

        All experiments in this blog post are performed using the torchtune QAT integration described above. We use 6-8 A100 GPUs with 80 GBs each to fine-tune Llama2-7B and Llama3-8B on the C4 dataset (en subset) for 5000 steps. For all experiments, we use batch size = 2, learning rate = 2e-5, max sequence length = 4096 for Llama2 and 8192 for Llama3, Fully Sharded Data Parallel (FSDP) as our distribution strategy, and activation checkpointing to reduce memory footprint. For 8da4w experiments, we use a group size of 256 for weights.

        + +

        Since the pre-training dataset is not easily accessible, we perform QAT during the fine-tuning process. Empirically, we found that disabling fake quantization for the first N steps led to better results, presumably because doing so allows the weights to stabilize before we start introducing quantization noise to the fine-tuning process. We disable fake quantization for the first 1000 steps for all our experiments.
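        A rough sketch of what this delayed fake quantization could look like in a training loop is shown below. The torchtune recipe exposes this as a config option; the enable/disable helper names here are assumptions based on the torchao prototype at the time of writing, so check the current torchao QAT API for the exact names:

        # Sketch only: the helper names below are assumptions, not a confirmed API.
        from torchao.quantization.prototype.qat import (
            disable_8da4w_fake_quant,  # assumed helper
            enable_8da4w_fake_quant,   # assumed helper
        )

        FAKE_QUANT_START_STEP = 1000                   # let the weights stabilize first

        model.apply(disable_8da4w_fake_quant)          # start with fake quant turned off
        for step in range(total_steps):                # `model` / `total_steps` come from your training setup
            if step == FAKE_QUANT_START_STEP:
                model.apply(enable_8da4w_fake_quant)   # start injecting quantization noise
            # ... usual forward / backward / optimizer step ...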

        + +

        We evaluate our quantized models using the lm-evaluation-harness integration in torchtune. We report evaluation results from a variety of tasks commonly used to evaluate LLMs, including hellaswag, a commonsense sentence completion task, wikitext, a next token/byte prediction task, and a few question-answering tasks such as arc, openbookqa, and piqa. For wikitext, perplexity refers to the inverse of how well the model can predict the next word or byte (lower is better), and bits_per_byte refers to how many bits are needed to predict the next byte (lower is also better here). For all other tasks, acc_norm refers to the accuracy normalized by the byte-length of the target string.
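        For readers less familiar with these metrics, word perplexity is simply the exponential of the average per-word negative log-likelihood, and the bits-per-X metrics express the same quantity in base 2. A small sketch of the relationship (not the lm-evaluation-harness implementation):

        import torch

        # Toy numbers: average negative log-likelihood (in nats) per word over an eval set
        nll_per_word = torch.tensor([2.1, 2.4, 1.9, 2.2])
        word_perplexity = torch.exp(nll_per_word.mean())                     # lower is better
        bits_per_word = nll_per_word.mean() / torch.log(torch.tensor(2.0))   # nats -> bits
        print(word_perplexity.item(), bits_per_word.item())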

        + +

        Int8 Dynamic Activations + Int4 Weight Quantization (8da4w)

        + +

        Starting with Llama2 8da4w quantization, we saw that QAT was able to recover 62% of the normalized accuracy degradation on hellaswag compared to PTQ, and 58% and 57% of the word and byte perplexity degradation (respectively) on wikitext. We see similar improvements for most of the other tasks.

        + +

        Llama2-7B 8da4w quantization with and without QAT

        + +

        Figure 3a: Llama2-7B 8da4w quantization with and without QAT

        + +

        Llama2-7B 8da4w quantization with and without QAT, evaluated on wikitext (lower is better)

        + +

        Figure 3b: Llama2-7B 8da4w quantization with and without QAT, evaluated on wikitext (lower is better)

        + +

        Llama3 8da4w quantization saw even more pronounced improvements with QAT. On the hellaswag evaluation task, we were able to recover 96% of the normalized accuracy degradation compared to PTQ, with minimal overall degradation (<1%) compared to the non-quantized accuracy. On the wikitext evaluation task, QAT recovered 68% and 65% of the word and byte perplexity degradation (respectively). Even on arc_challenge, which was difficult for Llama2 QAT, we were able to recover 51% of the normalized accuracy degradation.

        + +

        Llama3-8B 8da4w quantization with and without QAT

        + +

        Figure 4a: Llama3-8B 8da4w quantization with and without QAT

        + +

        Llama3-8B 8da4w quantization with and without QAT, evaluated on wikitext (lower is better)

        + +

        Figure 4b: Llama3-8B 8da4w quantization with and without QAT, evaluated on wikitext (lower is better)

        + +

        Lower Bit Weight Only Quantization

        + +

        We further extended the torchao QAT flow to 2-bit and 3-bit weight only quantization and repeated the same experiments for Llama3-8B. Quantization degradation is more severe at lower bit-widths, so we use a group size of 32 for all experiments for finer-grained quantization.

        + +

        However, this is still not enough for 2-bit PTQ, which saw wikitext perplexity explode. To mitigate this problem, we leverage knowledge from prior sensitivity analysis that the first 3 and last 2 layers of the Llama3 model are the most sensitive, and skip quantizing these layers in exchange for a moderate increase in quantized model size (1.78 GB for 2-bit and 1.65 GB for 3-bit). This brought the wikitext word perplexity down from 603336 to 6766, which is a significant improvement but still far from acceptable. To further improve the quantized model, we turn to QAT.

        + +

        Llama3-8B 2-bit weight only quantization with and without QAT, evaluated on wikitext (lower is better). Bars with “skip” refer to skipping quantization for the first 3 and last 2 layers of the model, which are more sensitive to quantization. Note the log scale.

        + +

        Figure 5a: Llama3-8B 2-bit weight only quantization with and without QAT, evaluated on wikitext (lower is better). Bars with “skip” refer to skipping quantization for the first 3 and last 2 layers of the model, which are more sensitive to quantization. Note the log scale.

        + +

        We observe that applying QAT while skipping quantization for the first 3 and last 2 layers further brought the word perplexity down to a much more reasonable value of 30 (from 6766). More generally, QAT was able to recover 53% of the normalized accuracy degradation on hellaswag compared to PTQ, and 99% and 89% of the word and byte perplexity degradation (respectively) on wikitext. Without skipping the sensitive layers, however, QAT was far less effective at mitigating degradation in quantized model quality.

        + +

        Llama3-8B 2-bit weight only quantization with and without QAT. Bars with “skip” refer to skipping quantization for the first 3 and last 2 layers of the model, which are more sensitive to quantization.

        + +

        Figure 5b: Llama3-8B 2-bit weight only quantization with and without QAT. Bars with “skip” refer to skipping quantization for the first 3 and last 2 layers of the model, which are more sensitive to quantization.

        + +

        For 3-bit weight only quantization, QAT was effective even without skipping the first 3 and last 2 layers, though skipping these layers still led to better results for both PTQ and QAT. In the skip case, QAT was able to recover 63% of the normalized accuracy degradation on hellaswag compared to PTQ, and 72% and 65% of the word and byte perplexity degradation (respectively) on wikitext.

        + +

        Llama3-8B 3-bit weight only quantization with and without QAT. Bars with “skip” refer to skipping quantization for the first 3 and last 2 layers of the model, which are more sensitive to quantization.

        + +

        Figure 6a: Llama3-8B 3-bit weight only quantization with and without QAT. Bars with “skip” refer to skipping quantization for the first 3 and last 2 layers of the model, which are more sensitive to quantization.

        + +

        Llama3-8B 3-bit weight only quantization with and without QAT, evaluated on wikitext (lower is better). Bars with “skip” refer to skipping quantization for the first 3 and last 2 layers of the model, which are more sensitive to quantization. Note the log scale.

        + +

        Figure 6b: Llama3-8B 3-bit weight only quantization with and without QAT, evaluated on wikitext (lower is better). Bars with “skip” refer to skipping quantization for the first 3 and last 2 layers of the model, which are more sensitive to quantization. Note the log scale.

        + +

        QAT Overhead

        + +

        QAT inserts many fake quantize operations throughout the model, adding considerable overhead to both the fine-tuning speed and the memory usage. For a model like Llama3-8B, for example, we have (32 * 7) + 1 = 225 linear layers, each of which has at least 1 fake quantize for the weights and potentially 1 fake quantize for the input activations. The memory footprint increase is also significant, since we cannot mutate the weights in-place and so need to clone them before applying fake quantization, though this overhead can be mostly mitigated by enabling activation checkpointing.
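        As a quick sanity check on the layer count above, you can count the linear modules directly. A sketch, assuming model is the full Llama3-8B module built with torchtune (the smaller example model earlier in this post has fewer layers):

        import torch.nn as nn

        num_linear = sum(1 for m in model.modules() if isinstance(m, nn.Linear))
        print(num_linear)  # 7 linear layers per transformer block, plus the final output projection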

        + +

        In our microbenchmarks, we found that 8da4w QAT fine-tuning is ~34% slower than regular full fine-tuning. With activation checkpointing, the memory increase per GPU is around 2.35 GB. Most of these overheads are fundamental to how QAT works, though we may be able to speed up computation with torch.compile in the future.

        | Per GPU statistics | Full fine-tuning | QAT fine-tuning |
        | Median tokens per second | 546.314 tok/s | 359.637 tok/s |
        | Median peak memory | 67.501 GB | 69.850 GB |
        + +

        Table 2: Llama3 QAT fine-tuning overhead for int8 per token dynamic activations + int4 grouped per channel weights on 6 A100 GPUs (each with 80GB memory).

        + +

        Looking Ahead

        + +

        In this blog, we presented a QAT flow for LLMs through torchao, integrated this flow with the fine-tuning APIs in torchtune, and demonstrated its potential to recover most of the quantization degradation compared to PTQ and match non-quantized performance on certain tasks. There are many directions for future explorations:

        + +
          +
        • Hyperparameter tuning. It is likely that extensive hyperparameter tuning can further improve the results of finetuning and QAT. In addition to the general hyperparameters like the learning rate, batch size, dataset size, and number of fine-tuning steps, we should also tune QAT-specific ones, such as when to start/stop fake quantization, how many steps to fake quantize, and regularization parameters for fake quantized values.
        • +
        • Outlier reduction techniques. In our experiments, we found that both PTQ and QAT were susceptible to outliers. In addition to simple clamping and regularization during fine-tuning, we can explore techniques that allow the network to learn how to control these outliers (e.g. learned quantization ranges, clipped softmax, and gated attention), or possibly even borrow outlier suppression techniques from post-training settings (e.g. SpinQuant, SmoothQuant) and apply them sparingly throughout the fine-tuning process.
        • +
        • Mixed-precision and more complex dtypes. Especially in the lower bit regime, we saw that skipping quantization for certain sensitive layers was effective for both PTQ and QAT. Did we need to skip quantizing these layers altogether, or can we still quantize them, just to lower bit-widths? It will be interesting to explore mixed-precision quantization in the context of QAT. Training with newer dtypes such as MX4 is another promising direction, especially given that the upcoming Blackwell GPUs will no longer support int4 tensor cores.
        • +
        • Composability with LoRA and QLoRA. Our QAT integration in torchtune currently only supports the full fine-tuning workflow. However, many users wish to fine-tune their models using low-ranked adaptors to substantially reduce their memory footprint. Composing QAT with techniques like LoRA / QLoRA will enable users to reap the memory and performance benefits of these approaches while producing a model that will ultimately be quantized with minimal model quality degradation.
        • +
        • Composability with torch.compile. This is another potential way to significantly speed up fake quantization computations in QAT while reducing memory footprint. torch.compile is currently not compatible with the distribution strategy used in full distributed fine-tuning recipes in torchtune (with or without QAT), but support will be added in the near future.
        • +
        • Quantizing other layers. In this work, we only explored quantizing the linear layers. However, in the context of long sequence lengths, the KV cache often becomes the throughput bottleneck and can reach tens of GBs, hence LLM-QAT explored quantizing the KV cache alongside activations and weights. Prior work has also had success with quantizing the embedding layer down to 2-bits in other transformer-based models.
        • +
        • End-to-end evaluation on performant cuda kernels. A natural extension of this work is to provide an end-to-end QAT flow evaluated on performant cuda kernels, similar to the existing 8da4w QAT flow lowered to XNNPACK kernels through executorch. For int4 weight only quantization, we can leverage the efficient int4 weight mm kernel with bitpacking for quantization, and there is ongoing work to add QAT support for this kernel: https://github.com/pytorch/ao/pull/383. For 8da4w quantization, mixed 4-bit/8-bit GEMM is also being added in cutlass. This will be needed to build an efficient 8da4w cuda kernel.
        • +
        + +

        The QAT code can be found here. Please refer to this torchtune tutorial to get started. If you have any further questions, please feel free to open an issue on the torchao github or reach out to andrewor@meta.com. We welcome your feedback and contributions!

        + +
        diff --git a/blog/quantization-in-practice/index.html b/blog/quantization-in-practice/index.html new file mode 100644 index 000000000000..0ef96c37f1d4 --- /dev/null +++ b/blog/quantization-in-practice/index.html @@ -0,0 +1,1119 @@
        Practical Quantization in PyTorch | PyTorch

        February 08, 2022

        +

        + Practical Quantization in PyTorch +

        +
        +
        + +
        +
        +
        + +
        +

        + by + + Suraj Subramanian, Mark Saroufim, Jerry Zhang + +

        +

        Quantization is a cheap and easy way to make your DNN run faster and with lower memory requirements. PyTorch offers a few different approaches to quantize your model. In this blog post, we’ll lay a (quick) foundation of quantization in deep learning, and then take a look at what each technique looks like in practice. Finally, we’ll end with recommendations from the literature for using quantization in your workflows.

        + +

        + +
        + Fig 1. PyTorch <3 Quantization +

        + +


        Fundamentals of Quantization

        + +
        +

        If someone asks you what time it is, you don’t respond “10:14:34:430705”, but you might say “a quarter past 10”.

        +
        + +

        Quantization has roots in information compression; in deep networks it refers to reducing the numerical precision of its weights and/or activations.

        + +

        Overparameterized DNNs have more degrees of freedom and this makes them good candidates for information compression [1]. When you quantize a model, two things generally happen - the model gets smaller and runs with better efficiency. Hardware vendors explicitly allow for faster processing of 8-bit data (than 32-bit data) resulting in higher throughput. A smaller model has lower memory footprint and power consumption [2], crucial for deployment at the edge.

        + +

        Mapping function

        +

        The mapping function is what you might guess - a function that maps values from floating-point to integer space. A commonly used mapping function is a linear transformation given by Q(r) = round(r/S + Z), where r is the input and S, Z are quantization parameters.

        + +

        To reconvert to floating point space, the inverse function is given by r' = (Q(r) - Z) * S.

        + +

        In general r' ≠ r, and their difference constitutes the quantization error.

        + +

        Quantization Parameters

        +

        The mapping function is parameterized by the scaling factor S and the zero-point Z.

        + +

        S is simply the ratio of the input range to the output range: S = (β - α) / (β_q - α_q)

        + +

        where [α, β] is the clipping range of the input, i.e. the boundaries of permissible inputs, and [α_q, β_q] is the range in quantized output space that it is mapped to. For 8-bit quantization, the output range spans at most 2^8 = 256 levels (e.g. [0, 255] or [-128, 127]).

        + +

        The zero-point Z acts as a bias to ensure that a 0 in the input space maps perfectly to a 0 in the quantized space.
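        Putting the pieces together, here is a small sketch of these formulas in plain PyTorch, using an unsigned 8-bit output range and computing S and Z from the observed min/max:

        import torch

        r = torch.randn(5) * 3                          # input tensor
        alpha, beta = r.min(), r.max()                  # clipping range [alpha, beta]
        alpha_q, beta_q = 0, 255                        # 8-bit unsigned output range

        S = (beta - alpha) / (beta_q - alpha_q)         # scale
        Z = round((alpha_q - alpha / S).item())         # zero-point, so that alpha maps to alpha_q

        q = torch.clamp(torch.round(r / S + Z), alpha_q, beta_q)   # quantize
        r_hat = (q - Z) * S                              # dequantize
        print((r - r_hat).abs().max())                   # quantization error, roughly bounded by S/2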

        + +

        Calibration

        +

        The process of choosing the input clipping range is known as calibration. The simplest technique (also the default in PyTorch) is to record the running minimum and maximum values and assign them to α and β. TensorRT also uses entropy minimization (KL divergence), mean-square-error minimization, or percentiles of the input range.

        + +

        In PyTorch, Observer modules (code) collect statistics on the input values and calculate the qparams S and Z. Different calibration schemes result in different quantized outputs, and it’s best to empirically verify which scheme works best for your application and architecture (more on that later).

        + +
        import torch
        from torch.quantization.observer import MinMaxObserver, MovingAverageMinMaxObserver, HistogramObserver
        +C, L = 3, 4
        +normal = torch.distributions.normal.Normal(0,1)
        +inputs = [normal.sample((C, L)), normal.sample((C, L))]
        +print(inputs)
        +
        +# >>>>>
        +# [tensor([[-0.0590,  1.1674,  0.7119, -1.1270],
        +#          [-1.3974,  0.5077, -0.5601,  0.0683],
        +#          [-0.0929,  0.9473,  0.7159, -0.4574]]]),
        +
        +# tensor([[-0.0236, -0.7599,  1.0290,  0.8914],
        +#          [-1.1727, -1.2556, -0.2271,  0.9568],
        +#          [-0.2500,  1.4579,  1.4707,  0.4043]])]
        +
        +observers = [MinMaxObserver(), MovingAverageMinMaxObserver(), HistogramObserver()]
        +for obs in observers:
        +  for x in inputs: obs(x) 
        +  print(obs.__class__.__name__, obs.calculate_qparams())
        +
        +# >>>>>
        +# MinMaxObserver (tensor([0.0112]), tensor([124], dtype=torch.int32))
        +# MovingAverageMinMaxObserver (tensor([0.0101]), tensor([139], dtype=torch.int32))
        +# HistogramObserver (tensor([0.0100]), tensor([106], dtype=torch.int32))
        +
        + +

        Affine and Symmetric Quantization Schemes

        +

        Affine or asymmetric quantization schemes assign the input range to the min and max observed values. Affine schemes generally offer tighter clipping ranges and are useful for quantizing non-negative activations (you don’t need the input range to contain negative values if your input tensors are never negative). The range is calculated as α = min(r), β = max(r). Affine quantization leads to more computationally expensive inference when used for weight tensors [3].

        + +

        Symmetric quantization schemes center the input range around 0, eliminating the need to calculate a zero-point offset. The range is calculated as -α = β = max(|min(r)|, |max(r)|). For skewed signals (like non-negative activations) this can result in bad quantization resolution because the clipping range includes values that never show up in the input (see the pyplot below).

        + +
        import torch
        import numpy as np
        import matplotlib.pyplot as plt

        act = torch.distributions.pareto.Pareto(1, 10).sample((1,1024))
        +weights = torch.distributions.normal.Normal(0, 0.12).sample((3, 64, 7, 7)).flatten()
        +
        +def get_symmetric_range(x):
        +  beta = torch.max(x.max(), x.min().abs())
        +  return -beta.item(), beta.item()
        +
        +def get_affine_range(x):
        +  return x.min().item(), x.max().item()
        +
        +def plot(plt, data, scheme):
        +  boundaries = get_affine_range(data) if scheme == 'affine' else get_symmetric_range(data)
        +  a, _, _ = plt.hist(data, density=True, bins=100)
        +  ymin, ymax = np.quantile(a[a>0], [0.25, 0.95])
        +  plt.vlines(x=boundaries, ls='--', colors='purple', ymin=ymin, ymax=ymax)
        +
        +fig, axs = plt.subplots(2,2)
        +plot(axs[0, 0], act, 'affine')
        +axs[0, 0].set_title("Activation, Affine-Quantized")
        +
        +plot(axs[0, 1], act, 'symmetric')
        +axs[0, 1].set_title("Activation, Symmetric-Quantized")
        +
        +plot(axs[1, 0], weights, 'affine')
        +axs[1, 0].set_title("Weights, Affine-Quantized")
        +
        +plot(axs[1, 1], weights, 'symmetric')
        +axs[1, 1].set_title("Weights, Symmetric-Quantized")
        +plt.show()
        +
        + +

        + +
        Fig 2. Clipping ranges (in purple) for affine and symmetric schemes +

        + +

        In PyTorch, you can specify affine or symmetric schemes while initializing the Observer. Note that not all observers support both schemes.

        + +
        for qscheme in [torch.per_tensor_affine, torch.per_tensor_symmetric]:
        +  obs = MovingAverageMinMaxObserver(qscheme=qscheme)
        +  for x in inputs: obs(x)
        +  print(f"Qscheme: {qscheme} | {obs.calculate_qparams()}")
        +
        +# >>>>>
        +# Qscheme: torch.per_tensor_affine | (tensor([0.0101]), tensor([139], dtype=torch.int32))
        +# Qscheme: torch.per_tensor_symmetric | (tensor([0.0109]), tensor([128]))
        +
        + +

        Per-Tensor and Per-Channel Quantization Schemes

        +

        Quantization parameters can be calculated for the layer’s entire weight tensor as a whole, or separately for each channel. In per-tensor quantization, the same clipping range is applied to all the channels in a layer.

        + +

        + +
        Fig 3. Per-Channel uses one set of qparams for each channel. Per-tensor uses the same qparams for the entire tensor. +

        + +

        For weight quantization, symmetric-per-channel quantization provides better accuracies; per-tensor quantization performs poorly, possibly due to high variance in conv weights across channels from batchnorm folding [3].

        + +
        from torch.quantization.observer import MovingAveragePerChannelMinMaxObserver
        +obs = MovingAveragePerChannelMinMaxObserver(ch_axis=0)  # calculate qparams for all `C` channels separately
        +for x in inputs: obs(x)
        +print(obs.calculate_qparams())
        +
        +# >>>>>
        +# (tensor([0.0090, 0.0075, 0.0055]), tensor([125, 187,  82], dtype=torch.int32))
        +
        + +

        Backend Engine

        +

        Currently, quantized operators run on x86 machines via the FBGEMM backend, or use QNNPACK primitives on ARM machines. Backend support for server GPUs (via TensorRT and cuDNN) is coming soon. Learn more about extending quantization to custom backends: RFC-0019.

        + +
        backend = 'fbgemm' if x86 else 'qnnpack'  # `x86` is a boolean you define: True on x86 CPUs, False on ARM
        +qconfig = torch.quantization.get_default_qconfig(backend)  
        +torch.backends.quantized.engine = backend
        +
        + +

        QConfig

        + +

        The QConfig (code) NamedTuple stores the Observers and the quantization schemes used to quantize activations and weights.

        + +

        Be sure to pass the Observer class (not the instance), or a callable that can return Observer instances. Use with_args() to override the default arguments.

        + +
        my_qconfig = torch.quantization.QConfig(
        +  activation=MovingAverageMinMaxObserver.with_args(qscheme=torch.per_tensor_affine),
        +  weight=MovingAveragePerChannelMinMaxObserver.with_args(qscheme=torch.qint8)
        +)
        +# >>>>>
        +# QConfig(activation=functools.partial(<class 'torch.ao.quantization.observer.MovingAverageMinMaxObserver'>, qscheme=torch.per_tensor_affine){}, weight=functools.partial(<class 'torch.ao.quantization.observer.MovingAveragePerChannelMinMaxObserver'>, qscheme=torch.qint8){})
        +
        + +

        In PyTorch

        + +

        PyTorch offers a few different ways to quantize your model, depending on

        +
          +
        • if you prefer a flexible but manual, or a restricted automagic process (Eager Mode v/s FX Graph Mode)
        • +
        • if qparams for quantizing activations (layer outputs) are precomputed for all inputs, or calculated afresh with each input (static v/s dynamic),
        • +
        • if qparams are computed with or without retraining (quantization-aware training v/s post-training quantization)
        • +
        + +

        FX Graph Mode automatically fuses eligible modules, inserts Quant/DeQuant stubs, calibrates the model and returns a quantized module - all in two method calls - but only for networks that are symbolically traceable. The examples below contain the calls using Eager Mode and FX Graph Mode for comparison.

        + +

        In DNNs, eligible candidates for quantization are the FP32 weights (layer parameters) and activations (layer outputs). Quantizing weights reduces the model size. Quantized activations typically result in faster inference.

        + +

        As an example, the 50-layer ResNet network has ~26 million weight parameters and computes ~16 million activations in the forward pass.

        + +

        Post-Training Dynamic/Weight-only Quantization

        +

        Here the model’s weights are pre-quantized; the activations are quantized on-the-fly (“dynamic”) during inference. The simplest of all approaches, it has a one line API call in torch.quantization.quantize_dynamic. Currently only Linear and Recurrent (LSTM, GRU, RNN) layers are supported for dynamic quantization.

        + +

        (+) Can result in higher accuracies since the clipping range is exactly calibrated for each input [1].

        + +

        (+) Dynamic quantization is preferred for models like LSTMs and Transformers where writing/retrieving the model’s weights from memory dominates the bandwidth [4].

        + +

        (-) Calibrating and quantizing the activations at each layer during runtime can add to the compute overhead.

        + +
        import torch
        +from torch import nn
        +
        +# toy model
        +m = nn.Sequential(
        +  nn.Conv2d(2, 64, (8,)),
        +  nn.ReLU(),
        +  nn.Linear(16,10),
        +  nn.LSTM(10, 10))
        +
        +m.eval()
        +
        +## EAGER MODE
        +from torch.quantization import quantize_dynamic
        +model_quantized = quantize_dynamic(
        +    model=m, qconfig_spec={nn.LSTM, nn.Linear}, dtype=torch.qint8, inplace=False
        +)
        +
        +## FX MODE
        +from torch.quantization import quantize_fx
        +qconfig_dict = {"": torch.quantization.default_dynamic_qconfig}  # An empty key denotes the default applied to all modules
        +model_prepared = quantize_fx.prepare_fx(m, qconfig_dict)
        +model_quantized = quantize_fx.convert_fx(model_prepared)
        +
        + +
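        A quick way to see the weight-size reduction from the dynamic quantization example above is to compare the serialized state dicts (a sketch; the exact numbers depend on the model):

        import os
        import torch

        torch.save(m.state_dict(), "fp32.pt")                 # original toy model from above
        torch.save(model_quantized.state_dict(), "int8.pt")   # dynamically quantized version
        print(f"{os.path.getsize('fp32.pt') / 1e3:.1f} KB -> {os.path.getsize('int8.pt') / 1e3:.1f} KB")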

        Post-Training Static Quantization (PTQ)

        +

        PTQ also pre-quantizes model weights but instead of calibrating activations on-the-fly, the clipping range is pre-calibrated and fixed (“static”) using validation data. Activations stay in quantized precision between operations during inference. About 100 mini-batches of representative data are sufficient to calibrate the observers [2]. The examples below use random data in calibration for convenience - using that in your application will result in bad qparams.

        + +

        + PTQ flowchart +
        + Fig 4. Steps in Post-Training Static Quantization +

        + +

        Module fusion combines multiple sequential modules (eg: [Conv2d, BatchNorm, ReLU]) into one. Fusing modules means the compiler needs to only run one kernel instead of many; this speeds things up and improves accuracy by reducing quantization error.

        + +

        (+) Static quantization has faster inference than dynamic quantization because it eliminates the float<->int conversion costs between layers.

        + +

        (-) Static quantized models may need regular re-calibration to stay robust against distribution-drift.

        + +
        # Static quantization of a model consists of the following steps:
        +
        +#     Fuse modules
        +#     Insert Quant/DeQuant Stubs
        +#     Prepare the fused module (insert observers before and after layers)
        +#     Calibrate the prepared module (pass it representative data)
        +#     Convert the calibrated module (replace with quantized version)
        +
        +import torch
        +from torch import nn
        +import copy
        +
        +backend = "fbgemm"  # running on a x86 CPU. Use "qnnpack" if running on ARM.
        +
        +model = nn.Sequential(
        +     nn.Conv2d(2,64,3),
        +     nn.ReLU(),
        +     nn.Conv2d(64, 128, 3),
        +     nn.ReLU()
        +)
        +
        +## EAGER MODE
        +m = copy.deepcopy(model)
        +m.eval()
        +"""Fuse
        +- Inplace fusion replaces the first module in the sequence with the fused module, and the rest with identity modules
        +"""
        +torch.quantization.fuse_modules(m, ['0','1'], inplace=True) # fuse first Conv-ReLU pair
        +torch.quantization.fuse_modules(m, ['2','3'], inplace=True) # fuse second Conv-ReLU pair
        +
        +"""Insert stubs"""
        +m = nn.Sequential(torch.quantization.QuantStub(), 
        +                  *m, 
        +                  torch.quantization.DeQuantStub())
        +
        +"""Prepare"""
        +m.qconfig = torch.quantization.get_default_qconfig(backend)
        +torch.quantization.prepare(m, inplace=True)
        +
        +"""Calibrate
        +- This example uses random data for convenience. Use representative (validation) data instead.
        +"""
        +with torch.inference_mode():
        +  for _ in range(10):
        +    x = torch.rand(1,2, 28, 28)
        +    m(x)
        +    
        +"""Convert"""
        +torch.quantization.convert(m, inplace=True)
        +
        +"""Check"""
        +print(m[1].weight().element_size()) # 1 byte instead of 4 bytes for FP32
        +
        +
        +## FX GRAPH
        +from torch.quantization import quantize_fx
        +m = copy.deepcopy(model)
        +m.eval()
        +qconfig_dict = {"": torch.quantization.get_default_qconfig(backend)}
        +# Prepare
        +model_prepared = quantize_fx.prepare_fx(m, qconfig_dict)
        +# Calibrate - Use representative (validation) data.
        +with torch.inference_mode():
        +  for _ in range(10):
        +    x = torch.rand(1,2,28, 28)
        +    model_prepared(x)
        +# quantize
        +model_quantized = quantize_fx.convert_fx(model_prepared)
        +
        + +

        Quantization-aware Training (QAT)

        +

        + QAT flowchart +
        + Fig 5. Steps in Quantization-Aware Training +

        + +

        The PTQ approach is great for large models, but accuracy suffers in smaller models [6]. This is of course due to the loss in numerical precision when adapting a model from FP32 to the INT8 realm (Figure 6(a)). QAT tackles this by including this quantization error in the training loss, thereby training an INT8-first model.

        + +

        + Fig. 6: Comparison of PTQ and QAT +
        + Fig 6. Comparison of PTQ and QAT convergence [3] +

        + +

        All weights and biases are stored in FP32, and backpropagation happens as usual. However in the forward pass, quantization is internally simulated via FakeQuantize modules. They are called fake because they quantize and immediately dequantize the data, adding quantization noise similar to what might be encountered during quantized inference. The final loss thus accounts for any expected quantization errors. Optimizing on this allows the model to identify a wider region in the loss function (Figure 6(b)), and identify FP32 parameters such that quantizing them to INT8 does not significantly affect accuracy.
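        You can see the effect of fake quantization directly with PyTorch’s built-in op - the output stays in float32, but the values are snapped to the grid defined by the scale and zero-point:

        import torch

        x = torch.randn(5)
        # scale=0.1, zero_point=0, quant_min=-128, quant_max=127 (int8-like range)
        x_fq = torch.fake_quantize_per_tensor_affine(x, 0.1, 0, -128, 127)
        print(x)
        print(x_fq)          # still float32, but only multiples of the scale survive
        print(x_fq.dtype)    # torch.float32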

        + +

        + Fake Quantization in the forward and backward pass +
        Fig 7. Fake Quantization in the forward and backward pass +
        Image source: https://developer.nvidia.com/blog/achieving-fp32-accuracy-for-int8-inference-using-quantization-aware-training-with-tensorrt +

        + +

        (+) QAT yields higher accuracies than PTQ.

        + +

        (+) Qparams can be learned during model training for more fine-grained accuracy (see LearnableFakeQuantize)

        + +

        (-) Computational cost of retraining a model in QAT can be several hundred epochs [1]

        + +
        # QAT follows the same steps as PTQ, with the exception of the training loop before you actually convert the model to its quantized version
        +
        +import torch
        +from torch import nn
        +
        +backend = "fbgemm"  # running on a x86 CPU. Use "qnnpack" if running on ARM.
        +
        +m = nn.Sequential(
        +     nn.Conv2d(2,64,8),
        +     nn.ReLU(),
        +     nn.Conv2d(64, 128, 8),
        +     nn.ReLU()
        +)
        +
        +"""Fuse"""
        +torch.quantization.fuse_modules(m, ['0','1'], inplace=True) # fuse first Conv-ReLU pair
        +torch.quantization.fuse_modules(m, ['2','3'], inplace=True) # fuse second Conv-ReLU pair
        +
        +"""Insert stubs"""
        +m = nn.Sequential(torch.quantization.QuantStub(), 
        +                  *m, 
        +                  torch.quantization.DeQuantStub())
        +
        +"""Prepare"""
        +m.train()
        +m.qconfig = torch.quantization.get_default_qconfig(backend)
        +torch.quantization.prepare_qat(m, inplace=True)
        +
        +"""Training Loop"""
        +n_epochs = 10
        +opt = torch.optim.SGD(m.parameters(), lr=0.1)
        +loss_fn = lambda out, tgt: torch.pow(tgt-out, 2).mean()
        +for epoch in range(n_epochs):
        +  x = torch.rand(10,2,24,24)
        +  out = m(x)
        +  loss = loss_fn(out, torch.rand_like(out))
        +  opt.zero_grad()
        +  loss.backward()
        +  opt.step()
        +
        +"""Convert"""
        +m.eval()
        +torch.quantization.convert(m, inplace=True)
        +
        + +

        Sensitivity Analysis

        +

        Not all layers respond to quantization equally; some are more sensitive to precision drops than others. Identifying the optimal combination of layers that minimizes accuracy drop is time-consuming, so [3] suggest a one-at-a-time sensitivity analysis to identify which layers are most sensitive, and retaining FP32 precision on those. In their experiments, skipping just 2 conv layers (out of a total of 28 in MobileNet v1) gives them near-FP32 accuracy. Using FX Graph Mode, we can create custom qconfigs to do this easily:

        + +
        # ONE-AT-A-TIME SENSITIVITY ANALYSIS 
        +
        +for quantized_layer, _ in model.named_modules():
        +  print("Only quantizing layer: ", quantized_layer)
        +
        +  # The module_name key allows module-specific qconfigs. 
        +  qconfig_dict = {"": None, 
        +  "module_name":[(quantized_layer, torch.quantization.get_default_qconfig(backend))]}
        +
        +  model_prepared = quantize_fx.prepare_fx(model, qconfig_dict)
        +  # calibrate
        +  model_quantized = quantize_fx.convert_fx(model_prepared)
        +  # evaluate(model)
        +
        + +

        Another approach is to compare statistics of the FP32 and INT8 layers; commonly used metrics for these are SQNR (Signal to Quantized Noise Ratio) and Mean-Square-Error. Such a comparative analysis may also help in guiding further optimizations.

        + +

        + Fig 8. Comparing model weights and activations +
        + Fig 8. Comparing model weights and activations +

        + +

        PyTorch provides tools to help with this analysis under the Numeric Suite. Learn more about using Numeric Suite from the full tutorial.

        + +
        # extract from https://pytorch.org/tutorials/prototype/numeric_suite_tutorial.html
        +import torch.quantization._numeric_suite as ns
        +
        +def SQNR(x, y):
        +    # Higher is better
        +    Ps = torch.norm(x)
        +    Pn = torch.norm(x-y)
        +    return 20*torch.log10(Ps/Pn)
        +
        +wt_compare_dict = ns.compare_weights(fp32_model.state_dict(), int8_model.state_dict())
        +for key in wt_compare_dict:
+    print(key, SQNR(wt_compare_dict[key]['float'], wt_compare_dict[key]['quantized'].dequantize()))
        +
        +act_compare_dict = ns.compare_model_outputs(fp32_model, int8_model, input_data)
        +for key in act_compare_dict:
+    print(key, SQNR(act_compare_dict[key]['float'][0], act_compare_dict[key]['quantized'][0].dequantize()))
        +
        +
        + +

        Recommendations for your workflow

        +

        + Suggested quantization workflow +
        + Fig 9. Suggested quantization workflow +

        +


        + +

        Points to note

        +
          +
        • Large (10M+ parameters) models are more robust to quantization error. [2]
        • +
        • Quantizing a model from a FP32 checkpoint provides better accuracy than training an INT8 model from scratch.[2]
        • +
        • Profiling the model runtime is optional but it can help identify layers that bottleneck inference.
        • +
        • Dynamic Quantization is an easy first step, especially if your model has many Linear or Recurrent layers.
        • +
        • Use symmetric-per-channel quantization with MinMax observers for quantizing weights. Use affine-per-tensor quantization with MovingAverageMinMax observers for quantizing activations[2, 3]
        • +
        • Use metrics like SQNR to identify which layers are most susceptible to quantization error. Turn off quantization on these layers.
        • +
        • Use QAT to fine-tune for around 10% of the original training schedule with an annealing learning rate schedule starting at 1% of the initial training learning rate. [3]
        • +
        • If the above workflow didn’t work for you, we want to know more. Post a thread with details of your code (model architecture, accuracy metric, techniques tried). Feel free to cc me @suraj.pt.
        • +
        + +

        That was a lot to digest, congratulations on sticking with it! Next, we’ll take a look at quantizing a “real-world” model that uses dynamic control structures (if-else, loops). These elements prevent symbolically tracing the model, which makes it a bit tricky to quantize directly out of the box. In the next post of this series, we’ll get our hands dirty on a model that is chock full of loops and if-else blocks, and even uses third-party libraries in the forward call.

        + +

        We’ll also cover a cool new feature in PyTorch Quantization called Define-by-Run, which tries to ease this constraint by requiring only subsets of the model’s computational graph to be free of dynamic flow. Check out the Define-by-Run poster at PTDD’21 for a preview.

        + +

        References

        +

        [1] Gholami, A., Kim, S., Dong, Z., Yao, Z., Mahoney, M. W., & Keutzer, K. (2021). A survey of quantization methods for efficient neural network inference. arXiv preprint arXiv:2103.13630.

        + +

        [2] Krishnamoorthi, R. (2018). Quantizing deep convolutional networks for efficient inference: A whitepaper. arXiv preprint arXiv:1806.08342.

        + +

        [3] Wu, H., Judd, P., Zhang, X., Isaev, M., & Micikevicius, P. (2020). Integer quantization for deep learning inference: Principles and empirical evaluation. arXiv preprint arXiv:2004.09602.

        + +

        [4] PyTorch Quantization Docs

        + + +
        diff --git a/blog/real-time-speech-rec/index.html b/blog/real-time-speech-rec/index.html new file mode 100644 index 000000000000..e0733c598a51 --- /dev/null +++ b/blog/real-time-speech-rec/index.html @@ -0,0 +1,877 @@
        Real-time Audio-visual Speech Recognition | PyTorch

        October 10, 2023

        +

        + Real-time Audio-visual Speech Recognition +

        +
        +
        + +
        +
        +
        + +
        +

        + by + + Team PyTorch + +

        +

        Audio-Visual Speech Recognition (AV-ASR, or AVSR) is the task of transcribing text from audio and visual streams, which has recently attracted a lot of research attention due to its robustness to noise. The vast majority of work to date has focused on developing AV-ASR models for non-streaming recognition; studies on streaming AV-ASR are very limited.

        + +

        We have developed a compact real-time speech recognition system based on TorchAudio, a library for audio and signal processing with PyTorch. It can run locally on a laptop with high accuracy without accessing the cloud. Today, we are releasing the real-time AV-ASR recipe under a permissive open license (BSD-2-Clause license), enabling a broad set of applications and fostering further research on audio-visual models for speech recognition.

        + +

        This work is part of our approach to AV-ASR research. A promising aspect of this approach is its ability to automatically annotate large-scale audio-visual datasets, which enables the training of more accurate and robust speech recognition systems. Furthermore, this technology has the potential to run on smart devices since it achieves the latency and memory efficiency that such devices require for inference.

        + +

        In the future, speech recognition systems are expected to power applications in numerous domains. One of the primary applications of AV-ASR is to enhance the performance of ASR in noisy environments. Since visual streams are not affected by acoustic noise, integrating them into an audio-visual speech recognition model can compensate for the performance drop of ASR models. Our AV-ASR system has the potential to serve multiple purposes beyond speech recognition, such as text summarization, translation and even text-to-speech conversion. Moreover, the exclusive use of VSR can be useful in certain scenarios, e.g. where speaking is not allowed, in meetings, and where privacy in public conversations is desired.

        + +

        AV-ASR

        + +

        Fig. 1 The pipeline for audio-visual speech recognition system

        + +

        Fig. 1: The pipeline for audio-visual speech recognition system

        + +

        Our real-time AV-ASR system is presented in Fig. 1. It consists of three components: a data collection module, a pre-processing module and an end-to-end model. The data collection module comprises hardware devices, such as a microphone and camera. Its role is to collect information from the real world. Once the information is collected, the pre-processing module locates and crops out the face. Next, we feed the raw audio stream and the pre-processed video stream into our end-to-end model for inference.

        + +

        Data collection

        + +

        We use torchaudio.io.StreamReader to capture audio/video from streaming device input, e.g. the microphone and camera on a laptop. Once the raw video and audio streams are collected, the pre-processing module locates and crops faces. It should be noted that data is immediately deleted during the streaming process.
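        A minimal sketch of this capture step with torchaudio.io.StreamReader is shown below. The device source strings ("/dev/video0" with v4l2 for the camera, "default" with alsa for the microphone) are Linux-specific assumptions and differ on other operating systems; a real-time system would also typically read the two devices from separate threads rather than a simple zip:

        from torchaudio.io import StreamReader

        video = StreamReader(src="/dev/video0", format="v4l2")       # webcam (Linux example)
        video.add_basic_video_stream(frames_per_chunk=30, format="rgb24")

        audio = StreamReader(src="default", format="alsa")           # microphone (Linux example)
        audio.add_basic_audio_stream(frames_per_chunk=16000, sample_rate=16000)

        for (video_chunk,), (audio_chunk,) in zip(video.stream(), audio.stream()):
            # video_chunk: (frames, channels, height, width); audio_chunk: (frames, channels)
            pass  # hand the chunks to the pre-processing module and the model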

        + +

        Pre-processing

        + +

        Before feeding the raw stream into our model, each video sequence has to undergo a specific pre-processing procedure. This involves three critical steps. The first step is to perform face detection. Following that, each individual frame is aligned to a referenced frame, commonly known as the mean face, in order to normalize rotation and size differences across frames. The final step in the pre-processing module is to crop the face region from the aligned face image. We would like to clearly note that our model is fed with raw audio waveforms and pixels of the face, without any further preprocessing like face parsing or landmark detection. An example of the pre-processing procedure is illustrated in Table 1.

        | Original image | Detected image | Transformed image | Cropped image |
        | 0. Original | 1. Detection | 2. Alignment | 3. Crop |
        + +

        Table 1: Preprocessing pipeline.
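        A hypothetical sketch of the three stages in Table 1 is shown below. The helpers here are trivial placeholders (a fixed bounding box, a no-op alignment, a center crop), not the face detector or alignment used in the released recipe; they only illustrate how the stages fit together per frame:

        import torch

        def detect_face(frame):
            return (0, 0, frame.shape[-2], frame.shape[-1])    # placeholder bounding box

        def align_to_mean_face(frame, box):
            return frame                                       # placeholder: no-op alignment

        def crop_face(frame, size=96):
            c, h, w = frame.shape
            top, left = (h - size) // 2, (w - size) // 2
            return frame[:, top:top + size, left:left + size]  # placeholder: center crop

        def preprocess(video_chunk):                           # (frames, channels, H, W)
            out = []
            for frame in video_chunk:
                box = detect_face(frame)                       # 1. detection
                aligned = align_to_mean_face(frame, box)       # 2. alignment to the mean face
                out.append(crop_face(aligned))                 # 3. crop the face region
            return torch.stack(out)

        print(preprocess(torch.rand(4, 3, 240, 320)).shape)    # torch.Size([4, 3, 96, 96])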

        + +

        Model

        + +

        Fig. 2 The architecture for the audio-visual speech recognition system.

        + +

        Fig. 2: The architecture for the audio-visual speech recognition system

        + +

        We consider two configurations: Small with 12 Emformer blocks and Large with 28, with 34.9M and 383.3M parameters, respectively. Each AV-ASR model is composed of front-end encoders, a fusion module, an Emformer encoder, and a transducer model. To be specific, we use convolutional frontends to extract features from raw audio waveforms and facial images. The features are concatenated to form 1024-d features, which are then passed through a two-layer multi-layer perceptron and an Emformer transducer model. The entire network is trained using RNN-T loss. The architecture of the proposed AV-ASR model is illustrated in Fig. 2.
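        A rough sketch of the fusion step described above - concatenating per-frame audio and video features into a 1024-d vector and passing it through a two-layer MLP. The 1024-d fused size comes from the text; all other dimensions are illustrative assumptions:

        import torch
        import torch.nn as nn

        class FusionMLP(nn.Module):
            def __init__(self, audio_dim=512, video_dim=512, hidden_dim=1024, out_dim=768):
                super().__init__()
                self.mlp = nn.Sequential(
                    nn.Linear(audio_dim + video_dim, hidden_dim),
                    nn.ReLU(),
                    nn.Linear(hidden_dim, out_dim),
                )

            def forward(self, audio_feats, video_feats):
                fused = torch.cat([audio_feats, video_feats], dim=-1)   # (batch, time, 1024)
                return self.mlp(fused)

        fusion = FusionMLP()
        print(fusion(torch.randn(2, 10, 512), torch.randn(2, 10, 512)).shape)  # torch.Size([2, 10, 768])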

        + +

        Analysis

        + +

        Datasets. We follow Auto-AVSR: Audio-Visual Speech Recognition with Automatic Labels to use publicly available audio-visual datasets including LRS3, VoxCeleb2 and AVSpeech for training. We do not use mouth ROIs or facial landmarks or attributes during both training and testing stages.

        + +

        Comparisons with the state-of-the-art. Non-streaming evaluation results on LRS3 are presented in Table 2. Our audio-visual model with an algorithmic latency of 800 ms (160 ms + 1280 ms × 0.5) yields a WER of 1.3%, which is on par with those achieved by state-of-the-art offline models such as AV-HuBERT, RAVEn, and Auto-AVSR.

Method     | Total Hours | WER (%)
ViT3D-CM   | 90,000      | 1.6
AV-HuBERT  | 1,759       | 1.4
RAVEn      | 1,759       | 1.4
Auto-AVSR  | 3,448       | 0.9
Ours       | 3,068       | 1.3

        Table 2: Non-streaming evaluation results for audio-visual models on the LRS3 dataset.

        + +

Noisy experiments. During training, 16 different noise types are randomly injected into the audio waveforms: 13 types from the DEMAND database (‘DLIVING’, ‘DKITCHEN’, ‘OMEETING’, ‘OOFFICE’, ‘PCAFETER’, ‘PRESTO’, ‘PSTATION’, ‘STRAFFIC’, ‘SPSQUARE’, ‘SCAFE’, ‘TMETRO’, ‘TBUS’ and ‘TCAR’), two types from the Speech Commands database (white and pink noise), and one type from the NOISEX-92 database (babble noise). SNR levels are sampled uniformly from [clean, 7.5dB, 2.5dB, -2.5dB, -7.5dB]. Results of the ASR and AV-ASR models, when tested with babble noise, are shown in Table 3. As the noise level increases, the performance advantage of our audio-visual model over our audio-only model grows, indicating that incorporating visual data improves noise robustness.
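A generic sketch of how a noise waveform can be mixed into speech at a target SNR (this is standard practice, not the blog's exact augmentation code):

import torch

def add_noise_at_snr(speech: torch.Tensor, noise: torch.Tensor, snr_db: float) -> torch.Tensor:
    noise = noise[..., : speech.shape[-1]]                  # trim noise to the speech length
    speech_power = speech.pow(2).mean()
    noise_power = noise.pow(2).mean().clamp_min(1e-10)
    # Scale the noise so that 10 * log10(speech_power / scaled_noise_power) == snr_db
    scale = torch.sqrt(speech_power / (noise_power * 10 ** (snr_db / 10)))
    return speech + scale * noise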

Type | Clean | 10 dB | 5 dB | 0 dB | -5 dB | -10 dB
A    | 1.6   | 1.8   | 3.2  | 10.9 | 27.9  | 55.5
A+V  | 1.6   | 1.7   | 2.1  | 6.2  | 11.7  | 27.6

        Table 3: Streaming evaluation WER (%) results at various signal-to-noise ratios for our audio-only (A) and audio-visual (A+V) models on the LRS3 dataset under 0.80-second latency constraints.

        + +

Real-time factor. The real-time factor (RTF) is an important measure of a system’s ability to process real-time tasks efficiently. An RTF value of less than 1 indicates that the system meets real-time requirements. We measure RTF using a laptop with an Intel® Core™ i7-12700 CPU running at 2.70 GHz and an NVIDIA GeForce RTX 3070 Ti GPU. To the best of our knowledge, this is the first AV-ASR model that reports RTFs on the LRS3 benchmark. The Small model achieves a WER of 2.6% and an RTF of 0.87 on CPU (Table 4), demonstrating its potential for real-time on-device inference applications.
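For reference, RTF is simply the ratio of wall-clock processing time to the duration of the processed audio; a minimal measurement sketch (the names are illustrative, not the blog's benchmarking code):

import time

def real_time_factor(infer_fn, waveform, sample_rate: int) -> float:
    start = time.perf_counter()
    infer_fn(waveform)                       # run AV-ASR inference on the captured chunk(s)
    elapsed = time.perf_counter() - start
    audio_seconds = waveform.shape[-1] / sample_rate
    return elapsed / audio_seconds           # values below 1.0 meet the real-time requirement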

Model | Device | Streaming WER [%] | RTF
Large | GPU    | 1.6               | 0.35
Small | GPU    | 2.6               | 0.33
Small | CPU    | 2.6               | 0.87
        + +

        Table 4: Impact of AV-ASR model size and device on WER and RTF. Note that the RTF calculation includes the pre-processing step wherein the Ultra-Lightweight Face Detection Slim 320 model is used to generate face bounding boxes.

        + +

        Learn more about the system from the published works below:

        + +
• Shi, Yangyang, Yongqiang Wang, Chunyang Wu, Ching-Feng Yeh, Julian Chan, Frank Zhang, Duc Le, and Mike Seltzer. “Emformer: Efficient memory transformer based acoustic model for low latency streaming speech recognition.” In ICASSP 2021-2021 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 6783-6787. IEEE, 2021.
• Ma, Pingchuan, Alexandros Haliassos, Adriana Fernandez-Lopez, Honglie Chen, Stavros Petridis, and Maja Pantic. “Auto-AVSR: Audio-Visual Speech Recognition with Automatic Labels.” In ICASSP 2023-2023 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 1-5. IEEE, 2023.
diff --git a/blog/rebellions/index.html b/blog/rebellions/index.html
new file mode 100644
index 000000000000..c543c69f03b2
--- /dev/null
+++ b/blog/rebellions/index.html
@@ -0,0 +1,669 @@

Rebellions Joins the PyTorch Foundation as a General Member | PyTorch
by Team PyTorch

        +

        Rebellions logo

        + +

        The PyTorch Foundation, a neutral home for the deep learning community to collaborate on the open source PyTorch framework and ecosystem, is announcing today that Rebellions has joined as a general member.

        + +

        Rebellions is a South Korea-based semiconductor company specializing in the design and development of AI chips for data centers and edge devices. Their innovative hardware and software solutions aim to accelerate generative AI and machine learning workloads, focusing on high energy efficiency and performance. The company successfully launched and deployed its AI chip ‘ATOM’ targeting data centers in 2023 and is developing its next-generation AI accelerator ‘REBEL’.

        + +

        “We’re thrilled to welcome Rebellions as a new general member of the PyTorch Foundation,” said Matt White, Executive Director of the PyTorch Foundation. “Rebellions brings a unique perspective to the PyTorch ecosystem with their focus on advancing the integration of NPU architectures for AI acceleration with PyTorch. Their expertise will play a vital role in ensuring PyTorch continues to evolve as a versatile framework, accommodating the diverse needs of modern AI workloads. We look forward to collaborating with Rebellions to drive innovation and strengthen the PyTorch ecosystem for developers worldwide.”

        + +

Rebellions has introduced native support for PyTorch 2.0 in their RBLN SDK. This integration includes compatibility with torch.compile, a pivotal feature of PyTorch 2.0 that enhances model performance. Through this development, Rebellions has empowered developers to seamlessly harness the full potential of their AI accelerator lineup within the PyTorch environment.

        + +

Rebellions is also deeply committed to advancing the PyTorch ecosystem through collaborative innovation starting in Korea. The company has established a Special Interest Group (SIG) focusing on PyTorch Core within the PyTorch Korea community and is actively working with volunteers recruited through MODULABS, an open research institute, to integrate native support for the deep learning framework into their Neural Processing Unit (NPU).

        + +

In addition, Rebellions is collaborating with academic institutions, such as Yonsei University, Hanyang University and the University of Science & Technology (UST), and national agencies, such as the Electronics and Telecommunications Research Institute (ETRI), to offer undergraduate and graduate courses on PyTorch and enable them to leverage PyTorch as their research platform.

        + +

        These initiatives highlight Rebellions’ dedication to optimizing the PyTorch experience for developers and researchers alike, while also fostering education and innovation in the field.

        + +

“By integrating our hardware innovations with PyTorch, we’re building native NPU support to accelerate diverse AI workloads,” said Hong-seok Kim, the Chief Software Architect at Rebellions. “We’re excited to contribute to the PyTorch community through community-driven initiatives and partnerships, advancing NPU architecture support for next-generation AI solutions. Together with the PyTorch community, we aim to pioneer new possibilities in AI acceleration and empower developers worldwide with efficient computing solutions.”

        + +

        To learn more about how your organization can be a part of the PyTorch Foundation, visit our website.

        + +

        About Rebellions

        + +

        Rebellions is a South Korea-based semiconductor company specializing in the design and development of AI chips for data centers and edge devices. Their innovative hardware and software solutions aim to accelerate generative AI and machine learning workloads, focusing on high energy efficiency and performance. The company successfully launched and deployed its AI chip ‘ATOM’ targeting data centers in 2023 and is developing its next-generation AI accelerator ‘REBEL’ incorporating a scalable chiplet architecture and high-bandwidth memory.

        + +

        About PyTorch Foundation

        + +

        The PyTorch Foundation is a neutral home for the deep learning community to collaborate on the open source PyTorch framework and ecosystem. The PyTorch Foundation is supported by its members and leading contributors to the PyTorch open source project. The Foundation leverages resources provided by members and contributors to enable community discussions and collaboration.

        + +

        About The Linux Foundation

        + +

        The Linux Foundation is the world’s leading home for collaboration on open source software, hardware, standards, and data. Linux Foundation projects are critical to the world’s infrastructure including Linux, Kubernetes, Node.js, ONAP, PyTorch, RISC-V, SPDX, OpenChain, and more. The Linux Foundation focuses on leveraging best practices and addressing the needs of contributors, users, and solution providers to create sustainable models for open collaboration. For more information, please visit us at linuxfoundation.org.

        + +
diff --git a/blog/reducing-checkpointing-times/index.html b/blog/reducing-checkpointing-times/index.html
new file mode 100644
index 000000000000..df48557e36c1
--- /dev/null
+++ b/blog/reducing-checkpointing-times/index.html
@@ -0,0 +1,708 @@

Reducing Model Checkpointing Times by Over 10x with PyTorch Distributed Asynchronous Checkpointing | PyTorch
by Meta: Lucas Pasqualin, Less Wright, Iris Zhang (PyTorch), Chien-Chin Huang; IBM Research: Swaminathan Sundararaman, Saransh Gupta, Raghu Ganti

        +

Summary: With PyTorch Distributed’s new asynchronous checkpointing feature, developed with feedback from IBM, we show how the IBM Research team was able to implement it and reduce effective checkpointing time by a factor of 10-20x. Example: the ‘down time’ for a 7B model checkpoint goes from an average of 148.8 seconds to 6.3 seconds, or 23.62x faster.

        + +

        This directly translates into either more net training progress for every given 24 hour period while continuing to robustly checkpoint or more frequent checkpoints to shorten recovery window/time.

        + +

        In this note, we showcase the usage code and architecture that makes asynchronous checkpointing possible, along with timing results verified by IBM’s Research team.

        + +

        Async Checkpointing vs Standard Checkpointing

        + +

Model checkpointing is a vital part of large model training, but checkpointing is an expensive process, as each checkpoint involves blocking training progress in order to save out the latest model weights. However, not checkpointing, or reducing the checkpointing frequency, can result in a significant loss of training progress. For example, failures such as deadlocks, stragglers, and GPU errors require the training process to be restarted. In order to restart from a failure, all (training) workers must stop their training process and be restarted from the last saved checkpoint.

        + +

Thus, the inherent tension between robustness to failures and training progress plays out as a tradeoff, but now, with asynchronous checkpointing, PyTorch Distributed is able to significantly reduce this tension and enable frequent checkpointing with minimal impact on overall training time.

        + +

        For background, it was almost exactly a year ago that we showcased how distributed checkpointing had massively sped up checkpointing times from the original torch.save() functionality. As IBM Research had noted, torch.save could take up to 30 minutes to checkpoint a single 11B model (PyTorch 1.13).

        + +

        With advancements in distributed checkpointing, checkpoints could be done in under 4 minutes for up to 30B model sizes.

        + +

        With asynchronous checkpointing, the training time lost due to checkpointing now moves to under 30 seconds, and often as short as 6 seconds.

        + +

        To be clear, asynchronous checkpointing does not compress the actual serialization checkpointing time as the previous update showcased. Rather it moves the final checkpointing process off the critical path (to cpu threads) to allow GPU training to continue while finalizing the checkpoint under separate threads.

        + +

        However, to the user, the effect is nearly the same in that down time for training due to checkpointing is substantially reduced, in many cases by 10x or even 20x.

        + +

        Async Dist Checkpointing

        + +

        As the above speedup chart shows, asynchronous checkpointing produces a 10x to 23x further improvement over the previous large improvements from a year ago.

        + +

        How does Asynchronous Checkpointing work?

        + +

        Asynchronous checkpointing modularizes the checkpointing process into two parts rather than one monolithic process. The first phase copies the data from each gpu/rank from GPU to CPU. This is the visible downtime to the user and can take from 6 - 14 seconds for 7B-13B model sizes. The second phase asynchronously copies the data from CPU memory to disk to persist the checkpoint.

        + +

        Once data is copied to CPU in the first phase, the GPU is free to immediately resume training. Hence with asynchronous checkpointing the downtime for checkpointing is simply the time needed to copy over the latest model states to CPU.

        + +

        At the same time that training resumes, non-blocking CPU threads work with the freshly arrived data in memory to complete the full checkpointing/serialization process to disk (i.e. persistent save).

        + +

        flow diagram

        + +

        Note that PyTorch’s Distributed Checkpointer relies on collective communication calls for per-rank metadata necessary to optimize saves, as well as a final synchronization which marks checkpointing as complete and makes the action atomic. This can interfere with distributed training (as distributed training also relies upon similar calls to synchronize training across multiple GPUs) if the Checkpointing thread utilizes the same process group used for training.

        + +

Specifically, a race condition between the calls could potentially cause the training and async checkpointing save threads to wait on collective calls at the same time, resulting in a true collective hang.

        + +

        We avoided this scenario by initializing a separate process group for async checkpointing. This separates the checkpointing collectives into their own logical process group, which thus ensures it will not interfere with collective calls in the main training threads.

        + +

        How do I use Asynchronous Checkpointing in my training?

        + +

Usage of asynchronous checkpointing is relatively straightforward. Using the latest nightly version of PyTorch, you will want to initialize your process group with both nccl and gloo backends; gloo is required for the CPU threads portion.

        + +

From there, create a duplicate process group which the asynchronous checkpointing will utilize. Then train as usual, but at the point when you want to checkpoint, use the asynchronous save API, passing in the states to save, the checkpoint id and the checkpoint process group.
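The blog's own snippet is published only as an image, so the following is a hedged sketch of the usage described above, using torch.distributed.checkpoint.async_save from recent PyTorch builds; the checkpoint path and the model/optimizer objects are assumptions standing in for your training script's own state.

import torch.distributed as dist
import torch.distributed.checkpoint as dcp

# Initialize the default process group with both backends; gloo covers the CPU threads.
dist.init_process_group(backend="cpu:gloo,cuda:nccl")

# A separate process group dedicated to asynchronous checkpointing.
checkpoint_pg = dist.new_group(backend="gloo")

# At a checkpoint step: kick off a non-blocking save and keep training.
state_dict = {"model": model.state_dict(), "optim": optimizer.state_dict()}  # model/optimizer assumed defined
ckpt_future = dcp.async_save(
    state_dict,
    checkpoint_id="checkpoints/step_1000",   # illustrative path
    process_group=checkpoint_pg,
)
# ckpt_future.result() can later be used to confirm the checkpoint was persisted.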

        + +

        Code snippet

        + +

Asynchronous checkpointing is also fully implemented in torchtitan, where it is used for pre-training your own Llama 2 or Llama 3 model. Using it is as simple as updating the toml config file:

        + +

        Code snippet

        + +

        Future work

        + +

Checkpointing has made huge strides over the past year, moving from almost half-an-hour checkpoints to under 5 minutes with distributed checkpointing, and now to under 30 seconds with asynchronous checkpointing.

        + +

The last frontier is zero-overhead checkpointing, where even the < 30 seconds is eliminated by streaming the updated weights during the backward pass, such that the checkpoint data is already on CPU at the point where asynchronous checkpointing would kick in.

        + +

This would effectively move large model training to a point where checkpointing causes no disruption or downtime, enabling both more robustness (as checkpoints could be taken more frequently) and faster training progress, due to no downtime for checkpointing.

        + +

        Source code link: https://github.com/pytorch/pytorch/blob/main/torch/distributed/checkpoint/state_dict_saver.py

        + +
diff --git a/blog/running-pytorch-models-on-jetson-nano/index.html b/blog/running-pytorch-models-on-jetson-nano/index.html
new file mode 100644
index 000000000000..603c1d740706
--- /dev/null
+++ b/blog/running-pytorch-models-on-jetson-nano/index.html
@@ -0,0 +1,910 @@

Running PyTorch Models on Jetson Nano | PyTorch

        March 16, 2022

Running PyTorch Models on Jetson Nano

        +
        +
        + +
        +
        +
        + +
        +

by Jeff Tang, Hamid Shojanazeri, Geeta Chauhan

        +

        Overview

        +

NVIDIA Jetson Nano, part of the Jetson family of products or Jetson modules, is a small yet powerful Linux (Ubuntu) based embedded computer with 2/4GB of GPU memory. With it, you can run many PyTorch models efficiently. This document summarizes our experience of running different deep learning models using 3 different mechanisms on Jetson Nano:

1. Jetson Inference, the higher-level NVIDIA API that has built-in support for running most common computer vision models, which can be transfer-learned with PyTorch on the Jetson platform.

2. TensorRT, an SDK for high-performance inference from NVIDIA that requires the conversion of a PyTorch model to ONNX, and then to the TensorRT engine file that the TensorRT runtime can run.

3. PyTorch, with the direct PyTorch API torch.nn for inference.

        Setting up Jetson Nano

        +

        After purchasing a Jetson Nano here, simply follow the clear step-by-step instructions to download and write the Jetson Nano Developer Kit SD Card Image to a microSD card, and complete the setup. After the setup is done and the Nano is booted, you’ll see the standard Linux prompt along with the username and the Nano name used in the setup.

        + +

        To check the GPU status on Nano, run the following commands:

        + +
        sudo pip3 install jetson-stats
        +sudo jtop
        +
        + +

        You’ll see information, including:

        + +
        + +
        + +

        You can also see the installed CUDA version:

        + +
        $ ls -lt /usr/local
        +lrwxrwxrwx  1 root root   22 Aug  2 01:47 cuda -> /etc/alternatives/cuda
        +lrwxrwxrwx  1 root root   25 Aug  2 01:47 cuda-10 -> /etc/alternatives/cuda-10
        +drwxr-xr-x 12 root root 4096 Aug  2 01:47 cuda-10.2
        +
        + +

        To use a camera on Jetson Nano, for example, Arducam 8MP IMX219, follow the instructions here or run the commands below after installing a camera module:

        + +
        cd ~
        +wget https://github.com/ArduCAM/MIPI_Camera/releases/download/v0.0.3/install_full.sh
        +chmod +x install_full.sh
        +./install_full.sh -m arducam
        +
        + +

        Another way to do this is to use the original Jetson Nano camera driver:

        + +
        sudo dpkg -r arducam-nvidia-l4t-kernel
        +sudo shutdown -r now
        +
        + +

        Then, use ls /dev/video0 to confirm the camera is found:

        + +
        $ ls /dev/video0
        +/dev/video0
        +
        + +

And finally, run the following command to see the camera in action:

        + +
        nvgstcapture-1.0 --orientation=2
        +
        + +

        Using Jetson Inference

        +

        NVIDIA Jetson Inference API offers the easiest way to run image recognition, object detection, semantic segmentation, and pose estimation models on Jetson Nano. Jetson Inference has TensorRT built-in, so it’s very fast.

        + +

        To test run Jetson Inference, first clone the repo and download the models:

        + +
        git clone --recursive https://github.com/dusty-nv/jetson-inference
        +cd jetson-inference
        +
        + +

        Then use the pre-built Docker Container that already has PyTorch installed to test run the models:

        + +
        docker/run.sh --volume ~/jetson_inference:/jetson_inference
        +
        + +

        To run image recognition, object detection, semantic segmentation, and pose estimation models on test images, use the following:

        + +
        cd build/aarch64/bin
        +./imagenet.py images/jellyfish.jpg /jetson_inference/jellyfish.jpg
        +./segnet.py images/dog.jpg /jetson_inference/dog.jpeg
        +./detectnet.py images/peds_0.jpg /jetson_inference/peds_0.jpg
        +./posenet.py images/humans_0.jpg /jetson_inference/pose_humans_0.jpg
        +
        + +

        Four result images from running the four different models will be generated. Exit the docker image to see them:

        + +
        $ ls -lt ~/jetson_inference/
        +-rw-r--r-- 1 root root  68834 Oct 15 21:30 pose_humans_0.jpg
        +-rw-r--r-- 1 root root 914058 Oct 15 21:30 peds_0.jpg
        +-rw-r--r-- 1 root root 666239 Oct 15 21:30 dog.jpeg
        +-rw-r--r-- 1 root root 179760 Oct 15 21:29 jellyfish.jpg
        +
        + +
[Using Jetson Inference: example output images 1-4]
        + +

        You can also use the docker image to run PyTorch models because the image has PyTorch, torchvision and torchaudio installed:

        + +
        # pip list|grep torch
        +torch (1.9.0)
        +torchaudio (0.9.0a0+33b2469)
        +torchvision (0.10.0a0+300a8a4)
        +
        + +

        Although Jetson Inference includes models already converted to the TensorRT engine file format, you can fine-tune the models by following the steps in Transfer Learning with PyTorch (for Jetson Inference) here.

        + +

        Using TensorRT

        +

        TensorRT is an SDK for high-performance inference from NVIDIA. Jetson Nano supports TensorRT via the Jetpack SDK, included in the SD Card image used to set up Jetson Nano. To confirm that TensorRT is already installed in Nano, run dpkg -l|grep -i tensorrt:

        + +
        + +
        + +

        Theoretically, TensorRT can be used to “take a trained PyTorch model and optimize it to run more efficiently during inference on an NVIDIA GPU.” Follow the instructions and code in the notebook to see how to use PyTorch with TensorRT through ONNX on a torchvision Resnet50 model:

        + +
1. How to convert the model from PyTorch to ONNX;

2. How to convert the ONNX model to a TensorRT engine file (a sketch of this step is shown below);

3. How to run the engine file with the TensorRT runtime for performance improvement: inference time improved from the original 31.5ms/19.4ms (FP32/FP16 precision) to 6.28ms (TensorRT).
        + +
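A hedged sketch of the ONNX-to-engine step referenced in item 2, assuming the TensorRT 8.x Python bindings that ship with recent JetPack releases (the linked notebook remains the authoritative version):

import tensorrt as trt

TRT_LOGGER = trt.Logger(trt.Logger.WARNING)

def build_engine(onnx_path: str, engine_path: str, fp16: bool = True) -> None:
    builder = trt.Builder(TRT_LOGGER)
    network = builder.create_network(1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))
    parser = trt.OnnxParser(network, TRT_LOGGER)
    with open(onnx_path, "rb") as f:
        if not parser.parse(f.read()):
            raise RuntimeError(parser.get_error(0))
    config = builder.create_builder_config()
    if fp16:
        config.set_flag(trt.BuilderFlag.FP16)           # FP16 roughly halves inference time on Nano
    serialized_engine = builder.build_serialized_network(network, config)
    with open(engine_path, "wb") as f:
        f.write(serialized_engine)

build_engine("resnet50_pytorch.onnx", "resnet50_pytorch.trt")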

You can replace the Resnet50 model in the notebook code with another PyTorch model, go through the conversion process above, and run the resulting TensorRT engine file with the TensorRT runtime to see the optimized performance. But be aware that due to the Nano GPU memory size, models larger than 100MB are likely to fail to run, with the following error information:

        + +

        Error Code 1: Cuda Runtime (all CUDA-capable devices are busy or unavailable)

        + +

        You may also see an error when converting a PyTorch model to ONNX model, which may be fixed by replacing:

        + +

        torch.onnx.export(resnet50, dummy_input, "resnet50_pytorch.onnx", verbose=False)

        + +

        with:

        + +

        torch.onnx.export(model, dummy_input, "deeplabv3_pytorch.onnx", opset_version=11, verbose=False)

        + +

        Using PyTorch

        +

        First, to download and install PyTorch 1.9 on Nano, run the following commands (see here for more information):

        + +
wget https://nvidia.box.com/shared/static/p57jwntv436lfrd78inwl7iml6p13fzh.whl -O torch-1.9.0-cp36-cp36m-linux_aarch64.whl
        +sudo apt-get install python3-pip libopenblas-base libopenmpi-dev 
        +pip3 install Cython
        +pip3 install numpy torch-1.9.0-cp36-cp36m-linux_aarch64.whl
        +
        + +

        To download and install torchvision 0.10 on Nano, run the commands below:

        + +
        https://drive.google.com/uc?id=1tU6YlPjrP605j4z8PMnqwCSoP6sSC91Z
        +pip3 install torchvision-0.10.0a0+300a8a4-cp36-cp36m-linux_aarch64.whl
        +
        + +

        After the steps above, run this to confirm:

        +
        $ pip3 list|grep torch
        +torch (1.9.0)
        +torchvision (0.10.0)
        +
        + +

        You can also use the docker image described in the section Using Jetson Inference (which also has PyTorch and torchvision installed), to skip the manual steps above.

        + +

        The official YOLOv5 repo is used to run the PyTorch YOLOv5 model on Jetson Nano. After logging in to Jetson Nano, follow the steps below:

        + +
• Get the repo and install what’s required:
        + +
        git clone https://github.com/ultralytics/yolov5
        +cd yolov5
        +pip install -r requirements.txt
        +
        + +
• Run python3 detect.py, which by default uses the PyTorch yolov5s.pt model. You should see something like:
        + +
        detect: weights=yolov5s.pt, source=data/images, imgsz=[640, 640], conf_thres=0.25, iou_thres=0.45, max_det=1000, device=, view_img=False, save_txt=False, save_conf=False, save_crop=False, nosave=False, classes=None, agnostic_nms=False, augment=False, visualize=False, update=False, project=runs/detect, name=exp, exist_ok=False, line_thickness=3, hide_labels=False, hide_conf=False, half=False
        +YOLOv5 🚀 v5.0-499-g48b00db torch 1.9.0 CUDA:0 (NVIDIA Tegra X1, 3956.1015625MB)
        +
        +Fusing layers... 
        +Model Summary: 224 layers, 7266973 parameters, 0 gradients
        +image 1/5 /home/jeff/repos/yolov5-new/yolov5/data/images/bus.jpg: 640x480 4 persons, 1 bus, 1 fire hydrant, Done. (0.142s)
        +...
        +
        + +

        The inference time on Jetson Nano GPU is about 140ms, more than twice as fast as the inference time on iOS or Android (about 330ms).

        + +

        If you get an error “ImportError: The _imagingft C module is not installed.” then you need to reinstall pillow:

        +
        sudo apt-get install libpng-dev
        +sudo apt-get install libfreetype6-dev
        +pip3 uninstall pillow
        +pip3 install --no-cache-dir pillow
        +
        + +

After successfully completing the python3 detect.py run, the object detection results for the test images located in data/images will be in the runs/detect/exp directory. To test the detection with a live webcam instead of local images, use the --source 0 parameter when running python3 detect.py:

        + +
        ~/repos/yolov5$ ls -lt runs/detect/exp10
        +total 1456
        +-rw-rw-r-- 1 jeff jeff 254895 Oct 15 16:12 zidane.jpg
        +-rw-rw-r-- 1 jeff jeff 202674 Oct 15 16:12 test3.png
        +-rw-rw-r-- 1 jeff jeff 217117 Oct 15 16:12 test2.jpg
        +-rw-rw-r-- 1 jeff jeff 305826 Oct 15 16:12 test1.png
        +-rw-rw-r-- 1 jeff jeff 495760 Oct 15 16:12 bus.jpg
        +
        + +

        Using the same test files used in the PyTorch iOS YOLOv5 demo app or Android YOLOv5 demo app, you can compare the results generated with running the YOLOv5 PyTorch model on mobile devices and Jetson Nano:

        + +
        + PyTorch YOLOv5 on Jetson Nano, example with a dog + PyTorch YOLOv5 on Jetson Nano, example with a horse and a rider +
        +

        Figure 1. PyTorch YOLOv5 on Jetson Nano.

        + +
        + PyTorch YOLOv5 on iOS, example with a dog + PyTorch YOLOv5 on iOS, example with a horse and a rider +
        +

        Figure 2. PyTorch YOLOv5 on iOS.

        + +
        + PyTorch YOLOv5 on Android, example with a dog + PyTorch YOLOv5 on Android, example with a horse and a rider +
        +

        Figure 3. PyTorch YOLOv5 on Android.

        + +

        Summary

        +

Based on our experience of running different PyTorch models for potential demo apps on Jetson Nano, we see that even Jetson Nano, a lower-end member of the Jetson family of products, provides a powerful GPU and embedded system that can directly and efficiently run some of the latest PyTorch models, pre-trained or transfer-learned.

        + +

        Building PyTorch demo apps on Jetson Nano can be similar to building PyTorch apps on Linux, but you can also choose to use TensorRT after converting the PyTorch models to the TensorRT engine file format.

        + +

        But if you just need to run some common computer vision models on Jetson Nano using NVIDIA’s Jetson Inference which supports image recognition, object detection, semantic segmentation, and pose estimation models, then this is the easiest way.

        + +

        References

        +

Torch-TensorRT, a compiler for PyTorch via TensorRT: https://github.com/NVIDIA/Torch-TensorRT/

        + +

Jetson Inference docker image details: https://github.com/dusty-nv/jetson-inference/blob/master/docs/aux-docker.md

        + +

A guide to using TensorRT on the NVIDIA Jetson Nano: https://docs.donkeycar.com/guide/robot_sbc/tensorrt_jetson_nano/, including:

1. Use Jetson as a portable GPU device to run an NN chess engine model: https://medium.com/@ezchess/jetson-lc0-running-leela-chess-zero-on-nvidia-jetson-a-portable-gpu-device-a213afc9c018

2. A MaskEraser app using PyTorch and torchvision, installed directly with pip: https://github.com/INTEC-ATI/MaskEraser#install-pytorch
        + +
diff --git a/blog/scaling-multimodal-foundation-models-in-torchmultimodal-with-pytorch-distributed/index.html b/blog/scaling-multimodal-foundation-models-in-torchmultimodal-with-pytorch-distributed/index.html
new file mode 100644
index 000000000000..d349f2b42cb2
--- /dev/null
+++ b/blog/scaling-multimodal-foundation-models-in-torchmultimodal-with-pytorch-distributed/index.html
@@ -0,0 +1,885 @@

Scaling Multimodal Foundation Models in TorchMultimodal with PyTorch Distributed | PyTorch

by Ankita De, Edward Wang (EcoF), Rohan Varma, Anjali Sridhar, Kartikay Khandelwal

        +

        Introduction

        + +

        In recent years, scaling model sizes has become a promising area of research. In the field of NLP, language models have gone from hundreds of millions of parameters (BERT) to hundreds of billions of parameters (GPT-3) demonstrating significant improvements on downstream tasks. The scaling laws for large scale language models have also been studied extensively in the industry. A similar trend can be observed in the vision field, with the community moving to transformer based models (like Vision Transformer, Masked Auto Encoders) as well. It is clear that individual modalities - text, image, video - have benefited massively from recent advancements in scale, and frameworks have quickly adapted to accommodate larger models.

        + +

        At the same time, multimodality is becoming increasingly important in research with tasks like image-text retrieval, visual question-answering, visual dialog and text to image generation gaining traction in real world applications. Training large scale multimodal models is the natural next step and we already see several efforts in this area like CLIP from OpenAI, Parti from Google and CM3 from Meta.

        + +

        In this blog, we present a case study demonstrating the scaling of FLAVA to 10B params using techniques from PyTorch Distributed. FLAVA is a vision and language foundation model, available in TorchMultimodal, which has shown competitive performance on both unimodal and multimodal benchmarks. We also give the relevant code pointers in this blog. The instructions for running an example script to scale FLAVA can be found here.

        + +

        Scaling FLAVA Overview

        + +

        FLAVA is a foundation multimodal model which consists of transformer based image and text encoders followed by a transformer-based multimodal fusion module. It is pretrained on both unimodal and multimodal data with a diverse set of losses. This includes masked language, image and multimodal modeling losses that require the model to reconstruct the original input from its context (self-supervised learning). It also uses image text matching loss over positive and negative examples of aligned image-text pairs as well as CLIP style contrastive loss. In addition to multimodal tasks (like image-text retrieval), FLAVA demonstrated competitive performance on unimodal benchmarks as well (GLUE tasks for NLP and image classification for vision).

        + +

        + +

        + +

        The original FLAVA model has ~350M parameters and uses ViT-B16 configurations (from the Vision Transformer paper) for image and text encoders. The multimodal fusion transformer follows the unimodal encoders but with half the number of layers. We explore increasing the size of each encoder to larger ViT variants.

        + +

        Another aspect of scaling is adding the ability to increase the batch size. FLAVA makes use of contrastive loss over in-batch negatives, which typically benefits from large batch size (as studied here). The largest training efficiency or throughput is also generally achieved when operating near maximum possible batch sizes as determined by the amount of GPU memory available (also see the experiments section).
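To make the connection between batch size and in-batch negatives concrete, below is a generic CLIP-style contrastive loss sketch (not FLAVA's exact implementation): every other sample in the batch serves as a negative, so a larger batch provides more negatives per positive pair.

import torch
import torch.nn.functional as F

def contrastive_loss(image_emb: torch.Tensor, text_emb: torch.Tensor, temperature: float = 0.07):
    image_emb = F.normalize(image_emb, dim=-1)
    text_emb = F.normalize(text_emb, dim=-1)
    logits = image_emb @ text_emb.t() / temperature               # (batch, batch) similarity matrix
    targets = torch.arange(logits.size(0), device=logits.device)  # positives lie on the diagonal
    return 0.5 * (F.cross_entropy(logits, targets) + F.cross_entropy(logits.t(), targets))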

        + +

        The following table displays the different model configurations we experimented with. We also determine the maximum batch size that was able to fit in memory for each configuration in the experiments section.

Approx Model params | Hidden size | MLP size | Heads | Unimodal layers | Multimodal layers | Model size (fp32)
350M (original)     | 768         | 3072     | 12    | 12              | 6                 | 1.33GB
900M                | 1024        | 4096     | 16    | 24              | 12                | 3.48GB
1.8B                | 1280        | 5120     | 16    | 32              | 16                | 6.66GB
2.7B                | 1408        | 6144     | 16    | 40              | 20                | 10.3GB
4.8B                | 1664        | 8192     | 16    | 48              | 24                | 18.1GB
10B                 | 2048        | 10240    | 16    | 64              | 40                | 38GB
        + +

        Optimization overview

        + +

        PyTorch offers several native techniques to efficiently scale models. In the following sections, we go over some of these techniques and show how they can be applied to scale up a FLAVA model to 10 billion parameters.

        + +

        Distributed Data Parallel

        + +

        A common starting point for distributed training is data parallelism. Data parallelism replicates the model across each worker (GPU), and partitions the dataset across the workers. Different workers process different data partitions in parallel and synchronize their gradients (via all reduce) before model weights are updated. The figure below showcases the flow (forward, backward, and weight update steps) for processing a single example for data parallelism:

        + +

        + +

        + +

        + Source: https://engineering.fb.com/2021/07/15/open-source/fsdp/ +

        + +

        PyTorch provides a native API, DistributedDataParallel (DDP) to enable data parallelism which can be used as a module wrapper as showcased below. Please see PyTorch Distributed documentation for more details.

        + +
from torchmultimodal.models.flava.model import flava_model_for_pretraining
import torch
import torch.distributed as dist

model = flava_model_for_pretraining().cuda()
# Initialize PyTorch Distributed process groups
# Please see https://pytorch.org/tutorials/intermediate/dist_tuto.html for details
dist.init_process_group(backend="nccl")
# Wrap model in DDP
model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[torch.cuda.current_device()])
        + +

        Fully Sharded Data Parallel

        + +

        GPU memory usage of a training application can roughly be broken down into model inputs, intermediate activations (needed for gradient computation), model parameters, gradients, and optimizer states. Scaling a model will typically increase each of these elements. Scaling a model with DDP can eventually result in out-of-memory issues when a single GPU’s memory becomes insufficient since it replicates the parameters, gradients, and optimizer states on all workers.

        + +

        To reduce this replication and save GPU memory, we can shard the model parameters, gradients, and optimizer states across all workers with each worker only managing a single shard. This technique was popularized by the ZeRO-3 approach developed by Microsoft. A PyTorch-native implementation of this approach is available as FullyShardedDataParallel (FSDP) API, released as a beta feature in PyTorch 1.12. During a module’s forward and backward passes, FSDP unshards the model parameters as needed for computation (using all-gather) and reshards them after computation. It synchronizes gradients using the reduce-scatter collective to ensure sharded gradients are globally averaged. The forward and backward pass flow of a model wrapped in FSDP are detailed below:

        + +

        + +

        + +

        + Source: https://engineering.fb.com/2021/07/15/open-source/fsdp/ +

        + +

        To use FSDP, the submodules of a model need to be wrapped with the API to control when specific submodules are sharded or unsharded. FSDP provides an auto-wrapping API (see the auto_wrap_policy argument) that can be used out of the box as well as several wrapping policies and the ability to write your own policy.

        + +

        The following example demonstrates wrapping the FLAVA model with FSDP. We specify the auto-wrapping policy as transformer_auto_wrap_policy. This will wrap individual transformer layers (TransformerEncoderLayer), the image transformer (ImageTransformer), text encoder (BERTTextEncoder) and multimodal encoder (FLAVATransformerWithoutEmbeddings) as individual FSDP units. This uses a recursive wrapping approach for efficient memory management. For example, after an individual transformer layer’s forward or backward pass is finished, its parameters are discarded, freeing up memory thereby reducing peak memory usage.

        + +

        FSDP also provides a number of configurable options to tune the performance of applications. For example, in our use case, we illustrate the use of the new limit_all_gathers flag, which prevents all-gathering model parameters too early thereby alleviating memory pressure on the application. We encourage users to experiment with this flag which can potentially improve the performance of applications with high active memory usage.

        + +
import torch
import torch.distributed as dist
from functools import partial
from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
from torch.distributed.fsdp.wrap import transformer_auto_wrap_policy
from torchmultimodal.models.flava.model import flava_model_for_pretraining
from torchmultimodal.models.flava.text_encoder import BertTextEncoder
from torchmultimodal.models.flava.image_encoder import ImageTransformer
from torchmultimodal.models.flava.transformer import FLAVATransformerWithoutEmbeddings
from torchmultimodal.modules.layers.transformer import TransformerEncoderLayer

dist.init_process_group(backend="nccl")
model = flava_model_for_pretraining().cuda()

# Wrap individual transformer layers and each encoder as separate FSDP units
model = FSDP(
    model,
    device_id=torch.cuda.current_device(),
    auto_wrap_policy=partial(
        transformer_auto_wrap_policy,
        transformer_layer_cls={
            TransformerEncoderLayer,
            ImageTransformer,
            BertTextEncoder,
            FLAVATransformerWithoutEmbeddings,
        },
    ),
    limit_all_gathers=True,
)
        + +

        Activation Checkpointing

        + +

        As discussed above, intermediate activations, model parameters, gradients, and optimizer states contribute to the overall GPU memory usage. FSDP can reduce memory consumption due to the latter three but does not reduce memory consumed by activations. Memory used by activations increases with increase in batch size or number of hidden layers. Activation checkpointing is a technique to decrease this memory usage by recomputing the activations during the backward pass instead of holding them in memory for a specific checkpointed module. For example, we observed ~4x reduction in the peak active memory after forward pass by applying activation checkpointing to the 2.7B parameter model.

        + +

        PyTorch offers a wrapper based activation checkpointing API. In particular, checkpoint_wrapper allows users to wrap an individual module with checkpointing, and apply_activation_checkpointing allows users to specify a policy with which to wrap modules within an overall module with checkpointing. Both these APIs can be applied to most models as they do not require any modifications to the model definition code. However, if more granular control over checkpointed segments, such as checkpointing specific functions within a module, is required, the functional torch.utils.checkpoint API can be leveraged, although this requires modification to the model code. The application of the activation checkpointing wrapper to individual FLAVA transformer layers (denoted by TransformerEncoderLayer) is shown below. For a thorough description of activation checkpointing, please see the description in the PyTorch documentation.

        + +
        from torchmultimodal.models.flava.model import flava_model_for_pretraining
        +from torch.distributed.algorithms._checkpoint.checkpoint_wrapper import apply_activation_checkpointing, checkpoint_wrapper, CheckpointImpl
        +from torchmultimodal.modules.layers.transformer import TransformerEncoderLayer
        +
        +model = flava_model_for_pretraining()
        +checkpoint_tformer_layers_policy = lambda submodule: isinstance(submodule, TransformerEncoderLayer)
        +
        +apply_activation_checkpointing(
        +               model,
        +               checkpoint_wrapper_fn=checkpoint_wrapper,
        +               check_fn=checkpoint_tformer_layers_policy,
        +           )
        +
        +

        Used together, wrapping FLAVA transformer layers with activation checkpointing and wrapping the overall model with FSDP as demonstrated above, we are able to scale FLAVA to 10B parameters.

        + +

        Experiments

        + +

        We conduct an empirical study about the impact of the different optimizations from the previous section on system performance. For all our experiments, we use a single node with 8 A100 40GB GPUs and run the pretraining for 1000 iterations. All runs also used PyTorch’s automatic mixed precision with the bfloat16 data type. TensorFloat32 format is also enabled to improve matmul performance on the A100. We define throughput as the average number of items (text or image) processed per second (we ignore the first 100 iterations while measuring throughput to account for warmup). We leave training to convergence and its impact on downstream task metrics as an area for future study.

        + +

        Figure 1 plots the throughput for each model configuration and optimization, both with a local batch size of 8 and then with the maximum batch size possible on 1 node. Absence of a data point for a model variant for an optimization indicates that the model could not be trained on a single node.

        + +

        Figure 2 plots the maximum possible batch size per worker for each optimization. We observe a few things:

        + +
1. Scaling model size: DDP is only able to fit the 350M and 900M model on a node. With FSDP, due to memory savings, we are able to train ~3x bigger models compared to DDP (i.e. the 1.8B and 2.7B variants). Combining activation checkpointing (AC) with FSDP enables training even bigger models, on the order of ~10x compared to DDP (i.e. 4.8B and 10B variants).

2. Throughput:

   • For smaller model sizes, at a constant batch size of 8, the throughput for DDP is slightly higher than or equal to FSDP, explainable by the additional communication required by FSDP. It is lowest for FSDP and AC combined together. This is because AC re-runs checkpointed forward passes during the backwards pass, trading off additional computation for memory savings. However, in the case of the 2.7B model, FSDP + AC actually has higher throughput compared to FSDP alone. This is because the 2.7B model with FSDP is operating close to the memory limit even at batch size 8, triggering CUDA malloc retries which tend to slow down training. AC helps with reducing the memory pressure and leads to no retries.

   • For DDP and FSDP + AC, the throughput increases with an increase in batch size for each model. For FSDP alone, this is true for smaller variants. However, with the 1.8B and 2.7B parameter models, we observe throughput degradation when increasing batch size. A potential reason for this, as noted above also, is that at the memory limit, PyTorch’s CUDA memory management may have to retry cudaMalloc calls and/or run expensive defragmentation steps to find free memory blocks to handle the workload’s memory requirements, which can result in training slowdown.

   • For larger models that can only be trained with FSDP (1.8B, 2.7B, 4.8B), the setting with highest throughput is FSDP + AC scaled to the maximum batch size. For 10B, we observe nearly equal throughput for the smaller and maximum batch size. This might be counterintuitive, as AC results in increased computation and maxing out batch size potentially leads to expensive defragmentation operations due to operating at the CUDA memory limit. However, for these large models, the increase in batch size is large enough to mask this overhead.
        + +

        + +

        + +

        + Figure 1: Training throughput for different configurations +

        + +
3. Batch size: FSDP alone enables slightly higher batch sizes compared to DDP. Using FSDP + AC enables ~3x batch size compared to DDP for the 350M param model and ~5.5x for the 900M param model. Even for 10B, a max batch size of ~20 is fairly decent. This essentially enables a larger global batch size using fewer GPUs, which is especially useful for contrastive learning tasks.
        + +

        + +

        + +

+ Figure 2: Max local batch size possible for different configurations +

        + +

        Conclusion

        + +

        As the world moves towards multimodal foundation models, scaling model parameters and efficient training is becoming an area of focus. The PyTorch ecosystem aims to accelerate innovation in this field by providing different tools to the research community, both for training and scaling multimodal models. With FLAVA, we laid out an example of scaling a model for multimodal understanding. In the future, we plan to add support for other kinds of models like the ones for multimodal generation and demonstrate their scaling factors. We also hope to automate many of these scaling and memory saving techniques (such as sharding and activation checkpointing) to reduce the amount of user experimentation needed to achieve the desired scale and maximum training throughput.

        + +

        References

diff --git a/blog/scaling-pytorch-fsdp-for-training-foundation-models-on-ibm-cloud/index.html b/blog/scaling-pytorch-fsdp-for-training-foundation-models-on-ibm-cloud/index.html
new file mode 100644
index 000000000000..9f0de4be9be8
--- /dev/null
+++ b/blog/scaling-pytorch-fsdp-for-training-foundation-models-on-ibm-cloud/index.html
@@ -0,0 +1,726 @@

Scaling PyTorch FSDP for Training Foundation Models on IBM Cloud | PyTorch

by Linsong Chu, Less Wright, Hamid Shojanazeri, Sophia Wen, Raghu Ganti, Geeta Chauhan

        +

Large model training using a cloud native approach is of growing interest for many enterprises given the emergence and success of foundation models. Some AI practitioners may assume that the only way to achieve high GPU utilization for distributed training jobs is to run them on HPC systems, such as those interconnected with InfiniBand, and may not consider Ethernet-connected systems. We demonstrate how the latest distributed training technique, Fully Sharded Data Parallel (FSDP) from PyTorch, successfully scales to models of 10B+ parameters using commodity Ethernet networking on IBM Cloud.

        + +

        PyTorch FSDP Scaling

        + +

        As models get larger, the standard techniques for data parallel training work only if the GPU can hold a full replica of the model, along with its training state (optimizer, activations, etc.). However, GPU memory increases have not kept up with the model size increases and new techniques for training such models have emerged (e.g., Fully Sharded Data Parallel, DeepSpeed), which allow us to efficiently distribute the model and data over multiple GPUs during training. In this blog post, we demonstrate a path to achieve remarkable scaling of model training to 64 nodes (512 GPUs) using PyTorch native FSDP APIs as we increase model sizes to 11B.

        + +

        What is Fully Sharded Data Parallel?

        + +

FSDP extends the distributed data parallel (DDP) training approach by sharding model parameters, gradients and optimizer states into K FSDP units, determined by a wrapping policy. FSDP achieves large model training efficiency in terms of resources and performance by significantly reducing the memory footprint on each GPU and overlapping computation and communication.

        + +

        Resource efficiency is achieved with memory footprint reduction by having all GPUs own a portion of each FSDP unit. To process a given FSDP unit, all GPUs share their locally owned portion via all_gather communication calls.

        + +

        Performance efficiency is accomplished by overlapping all_gather communication calls for upcoming FSDP units with computation of the current FSDP unit. Once the current FSDP unit has been processed, the non-locally owned parameters are dropped, freeing memory for the upcoming FSDP units. This process achieves training efficiency by the overlap of computation and communication, while also reducing the peak memory needed by each GPU.

        + +

        In what follows, we demonstrate how FSDP allows us to keep hundreds of GPUs highly utilized throughout a distributed training job, while running over standard Ethernet networking (system description towards the end of the blog). We chose the T5 architecture for our experiments and leveraged the code from the FSDP workshop. In each of our experiments, we start with a single node experiment to create a baseline and report the metric seconds/iteration normalized by the batch size as well as compute the teraflops based on the Megatron-LM paper (see Appendix for details of teraflop computation for T5). Our experiments aim to maximize the batch size (while avoiding cudaMalloc retries) to take full advantage of overlap in computation and communications, as discussed below. Scaling is defined as the ratio of the seconds/iteration normalized by batch size for N nodes versus a single node, representing how well we can utilize the additional GPUs as more nodes are added.

        + +

        Experimental Results

        + +

Our first set of experiments, using the T5-3B configuration (mixed precision with BF16, activation checkpointing, and a transformer wrapping policy), demonstrated a scaling efficiency of 95% as we increased the number of GPUs from 8 to 512 (1 to 64 nodes, respectively). We achieved these results without any modifications to the existing FSDP APIs. We observed that, at this scale and over an Ethernet-based network, there is sufficient bandwidth to enable continuous overlap of communication and computation.
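For readers who want to see what this configuration looks like in code, the following is a minimal sketch (not the exact experiment code) of PyTorch-native FSDP with BF16 mixed precision, a transformer auto-wrap policy over T5 blocks, and activation checkpointing. It assumes torch.distributed has already been initialized (e.g., via torchrun) and a CUDA device is available; the small default T5Config stands in for the real T5-3B configuration, and the activation-checkpointing helper name may differ slightly across PyTorch versions.

import functools
import torch
from torch.distributed.fsdp import FullyShardedDataParallel as FSDP, MixedPrecision
from torch.distributed.fsdp.wrap import transformer_auto_wrap_policy
from torch.distributed.algorithms._checkpoint.checkpoint_wrapper import (
    checkpoint_wrapper,
    apply_activation_checkpointing,
)
from transformers import T5Config, T5ForConditionalGeneration
from transformers.models.t5.modeling_t5 import T5Block

model = T5ForConditionalGeneration(T5Config())  # small stand-in for the T5-3B config

bf16 = MixedPrecision(
    param_dtype=torch.bfloat16,
    reduce_dtype=torch.bfloat16,
    buffer_dtype=torch.bfloat16,
)
wrap_policy = functools.partial(
    transformer_auto_wrap_policy, transformer_layer_cls={T5Block}
)
model = FSDP(
    model,
    auto_wrap_policy=wrap_policy,   # each T5Block becomes its own FSDP unit
    mixed_precision=bf16,
    device_id=torch.cuda.current_device(),
)
# Checkpoint each T5 block so its activations are recomputed in the backward pass.
apply_activation_checkpointing(
    model,
    checkpoint_wrapper_fn=checkpoint_wrapper,
    check_fn=lambda module: isinstance(module, T5Block),
)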

        + +

However, when we increased the T5 model size to 11B, the scaling efficiency declined substantially to 20%. The PyTorch profiler showed that the overlap of communication and computation was very limited. Further investigation into the network bandwidth usage revealed that the poor overlap was caused by latency in the communication of individual packets rather than by the bandwidth required (in fact, our peak bandwidth utilization was a quarter of that available). This led us to hypothesize that if we could increase the compute time by increasing the batch size, we could better overlap communication and computation. However, given that we were already at maximum GPU memory allocation, we had to identify opportunities to rebalance the memory allocation to allow for an increase in batch size. We found that the model state was being allocated much more memory than needed. The primary function of these reservations is to have pre-reserved memory ready to aggressively send/receive tensors during the communication periods; too few buffers result in increased wait times, whereas too many buffers result in smaller batch sizes.

        + +

To achieve better efficiency, the PyTorch distributed team introduced a new control knob, the rate_limiter, which controls how much memory is allocated for the send/receive of tensors, alleviating the memory pressure and providing room for higher batch sizes. In our case, the rate_limiter allowed the batch size to increase from 20 to 50, increasing compute time by 2.5x and allowing for much greater overlap of communication and computation. With this fix, we increased the scaling efficiency to >75% (at 32 nodes)!
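In later PyTorch releases, a knob with this role is exposed on the FSDP constructor as limit_all_gathers; we read it as corresponding to the rate limiter described above, but treat that mapping as our interpretation rather than an exact equivalence. Extending the sketch above:

# Cap how far ahead all-gathers may run, trading prefetch memory for room to grow the batch size.
model = FSDP(
    model,
    auto_wrap_policy=wrap_policy,
    mixed_precision=bf16,
    limit_all_gathers=True,  # rate-limit prefetching of upcoming FSDP units
)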

        + +

        Continued investigation into the factors limiting scaling efficiency uncovered that the rate limiter was creating a recurring pipeline bubble of GPU idle time. This was due to the rate limiter using a block and flush approach for the allocation and release of each set of memory buffers. By waiting for the entire block to complete before initiating a new all_gather, the GPU was idling at the start of each block, while waiting for the new set of all_gather parameters to arrive. This bubble was alleviated by moving to a sliding window approach. Upon the completion of a single all_gather step and its computation (rather than a block of them), the memory is freed and the next all_gather is immediately issued in a much more uniform manner. This improvement eliminated the pipeline bubble and boosted the scaling efficiencies to >90% (at 32 nodes).

        + +

        + +

        + +

        +Figure 1: Scaling of T5-XL (3B) and T5-XXL (11B) from 1 node to 64 nodes +

        + +

        + +

        + +

        +Figure 2: TFLOPs/sec usage for T5-XL(3B) and T5-XXL (11B) as we increase number of nodes +

        + +

        IBM Cloud AI System and Middleware

        + +

The AI infrastructure used for this work is a large-scale AI system on IBM Cloud consisting of nearly 200 nodes, each with 8 NVIDIA A100 80GB cards, 96 vCPUs, and 1.2TB of CPU RAM. The GPU cards within a node are connected via NVLink with a card-to-card bandwidth of 600 GBps. Nodes are connected by 2 x 100 Gbps Ethernet links with an SR-IOV based TCP/IP stack, providing a usable bandwidth of 120 Gbps.

        + +

The IBM Cloud AI System has been production-ready since May 2022 and is configured with the OpenShift container platform to run AI workloads. We also built a software stack for production AI workloads that provides end-to-end tools for training workloads. The middleware leverages Ray for pre- and post-processing workloads and PyTorch for training models. We also integrate a Kubernetes native scheduler, MCAD, that manages multiple jobs with job queuing, gang scheduling, prioritization, and quota management. A multi-NIC CNI discovers all available network interfaces and handles them as a single NIC pool, enabling optimized use of the network interfaces in Kubernetes. Finally, CodeFlare CLI supports a single pane for observability of the full stack from a desktop CLI (e.g., GPU utilization and application metrics such as loss and gradient norm).

        + +

        + +

        + +

        +Figure 3: Foundation Model Middleware Stack +

        + +

        Conclusion and Future Work

        + +

        In conclusion, we demonstrated how we can achieve remarkable scaling of FSDP APIs over non-InfiniBand networks. We identified the bottleneck that had limited scaling to less than 20% efficiency for 11B parameter model training. After identifying the issue, we were able to correct this with a new rate limiter control to ensure a more optimal balance of reserved memory and communication overlap relative to compute time. With this improvement, we were able to achieve 90% scaling efficiency (a 4.5x improvement), at 256 GPUs and 80% at 512 GPUs for training of the 11B parameter model. In addition, the 3B parameter model scales extremely well with 95% efficiency even as we increase the number of GPUs to 512.

        + +

This is an industry first: achieving such scaling efficiencies for models of up to 11B parameters using Kubernetes with vanilla Ethernet and PyTorch native FSDP APIs. This improvement enables users to train huge models on a Hybrid Cloud platform in a cost-efficient and sustainable manner.

        + +

We plan to continue investigating scaling with decoder-only models and increasing the size of these models to 100B+ parameters. From a system design perspective, we are exploring capabilities such as RoCE and GDR that can improve the latency of communications over Ethernet networks.

        + +

        Acknowledgements

        + +

        This blog was possible because of contributions from both PyTorch Distributed and IBM Research teams.

        + +

        From the PyTorch Distributed team, we would like to thank Less Wright, Hamid Shojanazeri, Geeta Chauhan, Shen Li, Rohan Varma, Yanli Zhao, Andrew Gu, Anjali Sridhar, Chien-Chin Huang, and Bernard Nguyen.

        + +

        From the IBM Research team, we would like to thank Linsong Chu, Sophia Wen, Lixiang (Eric) Luo, Marquita Ellis, Davis Wertheimer, Supriyo Chakraborty, Raghu Ganti, Mudhakar Srivatsa, Seetharami Seelam, Carlos Costa, Abhishek Malvankar, Diana Arroyo, Alaa Youssef, Nick Mitchell.

        + +

        Appendix

        + +

        Teraflop computation

        + +

The T5-XXL (11B) architecture has two types of T5 blocks: an encoder block and a decoder block. We follow the approach of Megatron-LM, in which a matrix multiplication of an m×k matrix by a k×n matrix requires 2×m×k×n FLOPs. The encoder block consists of self-attention and feed-forward layers, whereas the decoder block consists of self-attention, cross-attention, and feed-forward layers.

        + +

The attention block (both self and cross) consists of a QKV projection, which requires 6Bsh² operations; an attention matrix computation requiring 2Bs²h operations; an attention over values, which needs 2Bs²h computations; and the post-attention linear projection, which requires 2Bsh² operations. Finally, the feed-forward layer requires 15Bsh² operations.

        + +

The total for an encoder block is 23Bsh² + 4Bs²h, whereas for a decoder block it comes to 31Bsh² + 8Bs²h. With a total of 24 encoder and 24 decoder blocks, 2 forward passes (as we discard the activations), and one backward pass (equivalent to two forward passes), the final FLOPs computation comes to 96×(54Bsh² + 12Bs²h) + 6BshV. Here, B is the batch size per GPU, s is the sequence length, h is the hidden state size, and V is the vocabulary size. We repeat a similar computation for the T5-XL (3B) architecture, which is slightly different.
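The formula above can be evaluated directly. In the small helper below, the batch size, sequence length, hidden size, and vocabulary size are illustrative assumptions rather than the exact experiment settings.

def t5_xxl_flops_per_iteration(B, s, h, V):
    # 96 x (54*B*s*h^2 + 12*B*s^2*h) + 6*B*s*h*V, per the derivation above
    return 96 * (54 * B * s * h**2 + 12 * B * s**2 * h) + 6 * B * s * h * V

flops = t5_xxl_flops_per_iteration(B=50, s=512, h=4096, V=32128)
print(f"{flops / 1e12:.1f} TFLOPs per iteration")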

        + +
diff --git a/blog/scaling-pytorch-models-on-cloud-tpus-with-fsdp/index.html b/blog/scaling-pytorch-models-on-cloud-tpus-with-fsdp/index.html
new file mode 100644 index 000000000000..2909cf5065f5 --- /dev/null +++ b/blog/scaling-pytorch-models-on-cloud-tpus-with-fsdp/index.html @@ -0,0 +1,761 @@
Scaling PyTorch models on Cloud TPUs with FSDP | PyTorch

        + by + + Ronghang Hu, Vaibhav Singh, Jack Cao, Milad Mohammadi, Yeounoh Chung, Shauheen Zahirazami, Ross Girshick + +

        +

        Introduction

        + +

        The research community has witnessed a lot of successes with large models across NLP, computer vision, and other domains in recent years. Many of these successes were enabled by Cloud TPUs – which are powerful hardware for distributed training. To support TPUs in PyTorch, the PyTorch/XLA library provides a backend for XLA devices (most notably TPUs) and lays the groundwork for scaling large PyTorch models on TPUs.

        + +

However, most existing model scaling tools in the PyTorch ecosystem assume GPU (or CPU) devices, often depend on specific features in CUDA, and do not work directly on TPUs. The lack of scaling tools makes it challenging to build large models that cannot fit into the memory of a single TPU chip.

        + +

        To support model scaling on TPUs, we implemented the widely-adopted Fully Sharded Data Parallel (FSDP) algorithm for XLA devices as part of the PyTorch/XLA 1.12 release. We provide an FSDP interface with a similar high-level design to the CUDA-based PyTorch FSDP class while also handling several restrictions in XLA (see Design Notes below for more details). This FSDP interface allowed us to easily build models with e.g. 10B+ parameters on TPUs and has enabled many research explorations.

        + +

        Using Fully Sharded Data Parallel (FSDP) in PyTorch/XLA

        + +

        We provide a wrapper class XlaFullyShardedDataParallel over a given PyTorch model to shard its parameters across data-parallel workers. An example usage is as follows:

        + +
        import torch
        +import torch_xla.core.xla_model as xm
        +from torch_xla.distributed.fsdp import XlaFullyShardedDataParallel as FSDP
        +
        +model = FSDP(my_module)
        +optim = torch.optim.Adam(model.parameters(), lr=0.0001)
        +output = model(x, y)
        +loss = output.sum()
        +loss.backward()
        +optim.step()
        +
        + +

        Wrapping an nn.Module instance with XlaFullyShardedDataParallel enables the ZeRO-2 algorithm on it, where its gradients and the optimizer states are sharded for the entire training process. During its forward and backward passes, the full parameters of the wrapped module are first reconstructed from their corresponding shards for computation.

        + +

Nested FSDP wrapping can be used to further save memory. For nested FSDP, one should first wrap individual submodules with an inner FSDP before wrapping the base model with an outer FSDP. This allows the model to store only the full parameters of one individual layer at any given time, and the outer wrapper handles any leftover parameters, corresponding to the ZeRO-3 algorithm. Nested FSDP wrapping can be applied at any depth of submodules, and there can be more than 2 layers of nesting.
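A minimal sketch of nested wrapping follows; the toy model here (a ModuleList of linear layers plus a head) is an illustrative placeholder, not a real workload.

import torch
from torch_xla.distributed.fsdp import XlaFullyShardedDataParallel as FSDP

class ToyModel(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.layers = torch.nn.ModuleList([torch.nn.Linear(128, 128) for _ in range(4)])
        self.head = torch.nn.Linear(128, 10)

model = ToyModel()
# Inner wraps: only one layer's full parameters need to live in memory at a time.
for i, layer in enumerate(model.layers):
    model.layers[i] = FSDP(layer)
# Outer wrap: handles the leftover parameters (here, the head), giving ZeRO-3 behavior.
model = FSDP(model)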

        + +

Model checkpoint saving and loading for models and optimizers can be done as before by saving and loading their .state_dict(). Meanwhile, each training process should save its own checkpoint file of the sharded model parameters and optimizer states, and load the checkpoint file for the corresponding rank when resuming (regardless of ZeRO-2 or ZeRO-3, i.e. nested wrapping or not). A command line tool and a Python interface are provided to consolidate the sharded model checkpoint files into a full/unsharded model checkpoint file.
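Continuing the sketch above, a per-rank checkpointing flow could look like the following; the file-naming scheme is an assumption chosen to resemble the consolidation tool's suffix pattern shown later.

import torch
import torch_xla.core.xla_model as xm

optim = torch.optim.Adam(model.parameters(), lr=1e-4)
rank, world = xm.get_ordinal(), xm.xrt_world_size()
path = f"/tmp/mnist-fsdp/final_ckpt_rank-{rank:08d}-of-{world:08d}.pth"

# Each process saves, and later reloads, only its own shard of the model/optimizer state.
torch.save({"model": model.state_dict(), "optim": optim.state_dict()}, path)

state = torch.load(path)
model.load_state_dict(state["model"])
optim.load_state_dict(state["optim"])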

        + +

        Gradient checkpointing (also referred to as “activation checkpointing” or “rematerialization”) is another common technique for model scaling and can be used in conjunction with FSDP. We provide checkpoint_module, a wrapper function over a given nn.Module instance for gradient checkpointing (based on torch_xla.utils.checkpoint.checkpoint).
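For example, a submodule can be gradient-checkpointed before it is wrapped with FSDP; in this brief sketch, block is a placeholder for one of the model's submodules.

from torch_xla.distributed.fsdp import XlaFullyShardedDataParallel as FSDP, checkpoint_module

# Recompute this block's activations in the backward pass, then shard its parameters.
block = FSDP(checkpoint_module(block))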

        + +

        The MNIST and ImageNet examples below provide illustrative usages of (plain or nested) FSDP, saving and consolidation of model checkpoints, as well as gradient checkpointing.

        + +

        Starting examples of FSDP in PyTorch/XLA

        + +

        Training MNIST and ImageNet with FSDP

        + +

        MNIST and ImageNet classification can often be used as starting points to build more complicated deep learning models. We provide the following FSDP examples on these two datasets:

• MNIST: test/test_train_mp_mnist_fsdp_with_ckpt.py in the PyTorch/XLA repo
• ImageNet: test/test_train_mp_imagenet_fsdp.py in the PyTorch/XLA repo

        A comparison of them with the vanilla data-parallel examples of MNIST and ImageNet illustrates how to adapt a training script to use FSDP. A major distinction to keep in mind is that when stepping the optimizer on an FSDP-wrapped model, one should directly call optimizer.step() instead of xm.optimizer_step(optimizer). The latter reduces the gradients across ranks, which is not what we need in FSDP, where the gradients are already reduced and sharded (from a reduce-scatter op in its backward pass).
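A sketch of a single training step under FSDP on XLA illustrates the distinction (data loading, device placement, and the surrounding loop are omitted):

output = model(inputs)
loss = loss_fn(output, targets)
loss.backward()
optimizer.step()  # not xm.optimizer_step(optimizer): FSDP gradients are already reduced and sharded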

        + +

        Installation

        + +

FSDP is available in PyTorch/XLA 1.12 and the newer nightly releases. Please refer to https://github.com/pytorch/xla#-available-images-and-wheels for a guide on installation as well as Cloud TPU allocation. Then clone the PyTorch/XLA repo on a TPU VM as follows:

        + +
        mkdir -p ~/pytorch && cd ~/pytorch
        +git clone --recursive https://github.com/pytorch/xla.git
        +cd ~/
        +
        + +

        Train MNIST on v3-8 TPU

        + +

It reaches around 98.9% accuracy after 2 epochs:

        + +
        python3 ~/pytorch/xla/test/test_train_mp_mnist_fsdp_with_ckpt.py \
        +  --batch_size 16 --drop_last --num_epochs 2 \
        +  --use_nested_fsdp
        +
        + +

        The script above automatically tests consolidation of the sharded model checkpoints at the end. You can also manually consolidate the sharded checkpoint files via

        + +
        python3 -m torch_xla.distributed.fsdp.consolidate_sharded_ckpts \
        +  --ckpt_prefix /tmp/mnist-fsdp/final_ckpt \
        +  --ckpt_suffix "_rank-*-of-*.pth"
        +
        + +

        Train ImageNet with ResNet-50 on v3-8 TPU

        + +

It reaches around 75.9% accuracy after 100 epochs, the same as one would get without using FSDP. First download and preprocess the ImageNet-1k dataset to /datasets/imagenet-1k, then run:

        + +
        python3 ~/pytorch/xla/test/test_train_mp_imagenet_fsdp.py \
        +  --datadir /datasets/imagenet-1k --drop_last \
        +  --model resnet50 --test_set_batch_size 64 --eval_interval 10 \
        +  --lr 0.4 --batch_size 128 --num_warmup_epochs 5 \
        +  --lr_scheduler_divide_every_n_epochs 30 --lr_scheduler_divisor 10 \
        +  --num_epochs 100 \
        +  --use_nested_fsdp
        +
        + +

        You can also explore other options in these two examples, such as --use_gradient_checkpointing to apply gradient checkpointing (i.e. activation checkpointing) on the ResNet blocks, or --compute_dtype bfloat16 to perform forward and backward passes in bfloat16 precision.

        + +

        Examples on large-scale models

        + +

When building large models on TPUs, we often need to be aware of the memory constraints (e.g. 16 GB per core in TPU v3 and 32 GB per chip in TPU v4). For large models that cannot fit into a single TPU memory or the host CPU memory, one should use nested FSDP to implement the ZeRO-3 algorithm and interleave submodule construction with inner FSDP wrapping, so that the full model never needs to be stored in memory during construction.

        + +

        We illustrate these cases in https://github.com/ronghanghu/ptxla_scaling_examples, which provides examples of training a Vision Transformer (ViT) model with 10B+ parameters on a TPU v3 pod (with 128 cores) as well as other cases.

        + +

        Design Notes

        + +

        One might wonder why we need to develop a separate FSDP class in PyTorch/XLA instead of directly reusing PyTorch’s FSDP class or extending it to the XLA backend. The main motivation behind a separate FSDP class in PyTorch/XLA is that the native PyTorch’s FSDP class heavily relies on CUDA features that are not supported by XLA devices, while XLA also has several unique characteristics that need special handling. These distinctions require a different implementation of FSDP that would be much easier to build in a separate class.

        + +

        Changes in API calls

        +

        One prominent distinction is that the native PyTorch FSDP is built upon separate CUDA streams for asynchronous execution in eager mode, while PyTorch/XLA runs in lazy mode and also does not support streams. In addition, TPU requires that all devices homogeneously run the same program. As a result, in the PyTorch/XLA FSDP implementation, CUDA calls and per-process heterogeneity need to be replaced by XLA APIs and alternative homogeneous implementations.

        + +

        Tensor Storage Handling

        + +

Another prominent distinction is how to free a tensor's storage, which is much harder in XLA than in CUDA. To implement ZeRO-3, one needs to free the storage of full parameters after a module's forward pass, so that the next module can reuse this memory buffer for subsequent computation. PyTorch's FSDP accomplishes this on CUDA by freeing the actual storage of a parameter p via p.data.storage().resize_(0). However, XLA tensors do not have this .storage() handle, given that the XLA HLO IRs are completely functional and do not provide any ops to deallocate a tensor or resize its storage. Below the PyTorch interface, only the XLA compiler can decide when to free the TPU device memory corresponding to an XLA tensor, and a prerequisite is that the memory can only be released when the tensor object gets deallocated in Python, which cannot happen in FSDP because these parameter tensors are referenced as module attributes and also saved by PyTorch autograd for the backward pass.

        + +

Our solution to this issue is to split a tensor's value properties from its autograd Variable properties, and to free an nn.Parameter tensor by setting its .data attribute to a dummy scalar of size 1. This way the actual data tensor for the full parameter gets dereferenced in Python, so XLA can recycle its memory for other computation, while autograd can still trace the base nn.Parameter as a weak reference to the parameter data. To get this to work, one also needs to handle views over the parameters, as views in PyTorch also hold references to the actual data (this required fixing a shape-related issue with views in PyTorch/XLA).
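The idea in miniature looks like the following; this is an illustration of the trick, not the library's actual internals.

import torch

def free_full_param(p: torch.nn.Parameter, placeholder: torch.Tensor) -> None:
    # Point p.data at a size-1 placeholder so XLA can reclaim the full tensor,
    # while autograd still tracks the nn.Parameter object itself.
    p.data = placeholder

def restore_full_param(p: torch.nn.Parameter, full_tensor: torch.Tensor) -> None:
    # Re-attach the all-gathered full tensor before the parameter's next use.
    p.data = full_tensor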

        + +

        Working with XLA compiler

        + +

The solution above should be enough to free full parameters if the XLA compiler faithfully preserves the operations and their execution order in our PyTorch program. But there is another problem: XLA attempts to optimize the program to speed up its execution by applying common subexpression elimination (CSE) to the HLO IRs. In a naive implementation of FSDP, the XLA compiler typically eliminates the second all-gather in the backward pass (issued to reconstruct the full parameters) when it sees that it repeats the computation from the forward pass, and instead directly holds and reuses the full parameters we want to free up after the forward pass. To guard against this undesired compiler behavior, we introduced the optimization barrier op into PyTorch/XLA and used it to stop the second all-gather from being eliminated. This optimization barrier is also applied to the analogous case in gradient checkpointing, to prevent CSE between the forward and backward passes from eliminating the rematerialization.

        + +

In the future, if the distinctions between CUDA and XLA become less prominent than described above, it could be worth considering merging the PyTorch/XLA FSDP with the native PyTorch FSDP to have a unified interface.

        + +

        Acknowledgments

        + +

        Thanks to Junmin Hao from AWS for reviewing the PyTorch/XLA FSDP pull request. Thanks to Brian Hirsh from the Meta PyTorch team for support on the PyTorch core issues. Thanks to Isaack Karanja, Will Cromar, and Blake Hechtman from Google for support on GCP, XLA, and TPU issues.

        + +

        Thanks to Piotr Dollar, Wan-Yen Lo, Alex Berg, Ryan Mark, Kaiming He, Xinlei Chen, Saining Xie, Shoubhik Debnath, Min Xu, and Vaibhav Aggarwal from Meta FAIR for various TPU-related discussions.

        + +
diff --git a/blog/scaling-recommendation-2d-sparse-parallelism/index.html b/blog/scaling-recommendation-2d-sparse-parallelism/index.html
new file mode 100644 index 000000000000..a9ca6beedd24 --- /dev/null +++ b/blog/scaling-recommendation-2d-sparse-parallelism/index.html @@ -0,0 +1,877 @@
Scaling Recommendation Systems Training to Thousands of GPUs with 2D Sparse Parallelism | PyTorch

        + by + + PyTorch Team at Meta: Chunzhi Yang, Rich Zhu, Zain Huda, Liangbei Xu, Xin Zhang, Jiyan Yang, Dennis van der Staay, Wang Zhou, Jin Fang, Jade Nie, Yuxi Hu + +

        +

        At Meta, recommendation systems are the cornerstone of delivering relevant and personalized ads to billions of users globally. Through technologies like PyTorch’s TorchRec, we’ve successfully developed solutions that enable model training across hundreds of GPUs. While these systems have served us well, recent research on scaling laws has revealed a compelling opportunity: we can achieve significantly better model performance by training dramatically larger neural networks.

        + +

        However, this insight presents us with a new challenge. Our current training infrastructure, though highly optimized for hundreds of GPUs, cannot efficiently scale to the thousands of GPUs needed to train these larger models. The leap from hundreds to thousands of GPUs introduces complex technical challenges, particularly around handling sparse operations in recommendation models. These challenges require fundamentally new approaches to distributed training, which we address with a novel parallelization strategy.

        + +

        To address these issues, we introduced 2D embedding parallel, a novel parallelism strategy that overcomes the sparse scaling challenges inherent in training large recommendation models across thousands of GPUs. This is available today in TorchRec through the DMPCollection API. This approach combines two complementary parallelization techniques: data parallelism for the sparse components of the model, and model parallelism for the embedding tables, leveraging TorchRec’s robust sharding capabilities. By strategically integrating these techniques, we’ve created a solution that scales to thousands of GPUs and now powers Meta’s largest recommendation model training runs.

        + +

        What are the sparse scaling challenges?

        + +

        We identified three key challenges that prevented us from naively scaling our model to thousands of GPUs:

        + +
          +
• Imbalance and stragglers: with more GPUs it is harder to achieve balanced sharding; some ranks can have a much heavier workload for embedding computations, which can slow down the entire training job.
• Communication across nodes: as training jobs utilize an increased number of GPUs, the all-to-all communication bandwidth can drop under certain network topologies, which can increase communication latency significantly.
• Memory overhead: the memory used by input features is often negligible; however, as we use thousands of GPUs, we can introduce larger input features and the memory requirements can become significant.
        + +

With 2D embedding parallelism, we can describe the new scheme as follows; in this example we have 2 model replicas (Replica 1: GPU1/GPU3, Replica 2: GPU2/GPU4).

        + +

        Flow diagram

        + +

        Figure 1: Layout illustration of 2D Sparse Parallelism

        + +

2D sparse parallelism addresses these challenges. Instead of sharding tables across all ranks, we first evenly divide all ranks into several parallel groups:

        + +
          +
1. Within each group, we use model parallelism for the embedding tables, such as column-wise/row-wise sharding. At scale, for our largest tables, we have also developed grid sharding, which shards embedding tables along both the row and column dimensions.
2. Across groups, we use data parallelism, such that each rank in a group has a corresponding replica rank in the other groups (replica ranks store the same embedding table shards). After each group has completed its own backward pass, we all-reduce the embedding table weights across the replicas to keep them synchronized.
        + +

        Our production solution

        + +

TorchRec is our library for building the sparse part of recommendation models in native PyTorch. The traditional entry point is DistributedModelParallel, which applies model parallelism to the embedding tables. We introduce a new API alongside it, known as DMPCollection, which serves as the main entry point for enabling 2D parallelism on TorchRec models. We designed it to be as easy a change as applying FSDP/DDP.
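As a rough illustration of the switch, the sketch below replaces DistributedModelParallel with DMPCollection. The import path and constructor arguments shown here (in particular sharding_group_size and world_size) are assumptions based on the description in this post, and the sparse_arch model and plan objects are hypothetical; consult the TorchRec documentation for the exact signature.

import torch
from torchrec.distributed.model_parallel import DistributedModelParallel, DMPCollection

# 1D (traditional) model parallelism over the global world size:
# model = DistributedModelParallel(module=sparse_arch, device=torch.device("cuda"), plan=plan)

# 2D parallelism: model parallel within each sharding group, data parallel across groups.
model = DMPCollection(
    module=sparse_arch,          # hypothetical TorchRec model containing EmbeddingBagCollections
    device=torch.device("cuda"),
    plan=plan,                   # sharding plan generated for the smaller sharding group
    sharding_group_size=256,     # ranks that model-parallel one replica (assumed argument name)
    world_size=1024,             # global world size (assumed argument name)
)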

        + +

        To understand what DMPCollection does, we have to understand what DistributedModelParallel (DMP) does first:

        + +
          +
1. Create embedding tables, known as EmbeddingBagCollections and EmbeddingCollections.
2. Generate a sharding plan with respect to GPU topology, embedding tables, memory available, input data, and more.
3. Wrap the model with DMP and the associated sharding plan passed in.
4. DMP initializes and shards the embedding tables in accordance with the sharding plan.
5. On a train step, DMP takes an input batch, communicates it to the appropriate GPUs containing the embedding table shards of interest, looks up the values, and returns them back to the GPU that requested them. This is all done on the global process group, with some exceptions for special sharding (such as table-row-wise sharding).
        + +

DistributedModelParallel was built for model parallelism, with many parts working under the assumption that tables are sharded across the global world size. We need to change these parts in a way that lets us introduce additional dimensions of parallelism without losing the optimizations and feature set of TorchRec.

        + +

DMPCollection changes a few key parts to enable 2D parallelism in an extensible way:

        + +
          +
• Generate sharding plans for the smaller sharding group once; once passed in, we communicate them to the appropriate ranks across the global group and remap the ranks to fit the new sharding group ranks.
• Create two new NCCL process groups, known as the sharding and replica process groups. The sharding process group is passed into the sharding and train step components of TorchRec. The replica process group is used for weight and optimizer state synchronization; the all-reduce call happens over this process group.
  • The sub NCCL process groups allow us to efficiently communicate only between the ranks that are relevant for a particular collective. Each rank has two associated process groups.
        + +

To the user, the change is very simple, and all the complexity of applying the parallelism strategy to the model is handled for them.

        + +

        How do we create these sharding and replication groups?

        + +

These process groups are one of the keys to DMPCollection's performant implementation. Our earlier diagram showed a simple 2x2 GPU setup; at scale, however, how do we decide which ranks are part of a given sharding group, and what are their replica ranks across the sharding groups?

        + +

Consider the following setup with 2 nodes, each with 4 GPUs. The sharding and replication groups under 2D parallelism are:

        + + + + + + +
Sharding Group | Sharding Ranks
0 | 0, 2, 4, 6
1 | 1, 3, 5, 7

Replication Group | Replication Ranks
0 | 0, 1
1 | 2, 3
2 | 4, 5
3 | 6, 7
        + +

We use the following formulation:

        + +
1. Divide all trainers into G sharding groups, each with L trainers.
   • The number of groups is G = T / L, where T is the total number of trainers.
2. For each group we assign non-contiguous trainer ranks based on the group index i, following [i, G+i, 2G+i, …, (L-1)G+i], where i = 0 to G-1.
3. From the groups we can create the replication groups, each of which is a set of G contiguous ranks (0 to G-1, G to 2G-1, …); each contiguous set stores the duplicate embedding table shards. A small worked example follows.
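Plugging the earlier 2-node, 8-GPU example into this formulation (T = 8 trainers, L = 4 trainers per sharding group) reproduces the tables above:

T, L = 8, 4                 # total trainers, trainers per sharding group
G = T // L                  # number of sharding groups (= number of model replicas)
sharding_groups = [[g + G * r for r in range(L)] for g in range(G)]
replication_groups = [list(range(g * G, (g + 1) * G)) for g in range(T // G)]
print(sharding_groups)      # [[0, 2, 4, 6], [1, 3, 5, 7]]
print(replication_groups)   # [[0, 1], [2, 3], [4, 5], [6, 7]]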

This means we have G sharding groups, each of size L, where L is the number of ranks we apply model parallelism across. This, in turn, gives us replication groups, each of size G, which are the ranks we apply data parallelism across.

        + +

In DMPCollection, we are able to create these process groups efficiently with DeviceMesh. We lay out the entire GPU topology as a 2D matrix, with each row representing a group of sharding ranks and each column representing the corresponding replica ranks:

        + +
# Build the peer matrix and carve the two process groups out of a DeviceMesh.
import torch
from torch.distributed.device_mesh import DeviceMesh

num_groups = global_world_size // sharding_group_size        # number of model replicas
peer_matrix = [
    [num_groups * rank + group_rank for rank in range(sharding_group_size)]
    for group_rank in range(num_groups)                      # row g holds sharding group g
]

# Two mesh dimensions: "replicate" (columns) and "shard" (rows).
mesh = DeviceMesh("cuda", torch.tensor(peer_matrix), mesh_dim_names=("replicate", "shard"))
sharding_pg = mesh.get_group("shard")       # ranks in the same row: model parallel
replica_pg = mesh.get_group("replicate")    # ranks in the same column: data parallel all-reduce
        + +

With our DeviceMesh approach, should we want to change the topology or provide further flexibility in the future, we can easily extend our creation logic to other topologies and even to further dimensions of parallelism if needed.

        + +

        Performance of 2D parallel

        + +

Our rank partitioning strategy optimizes communication patterns by strategically placing the model replica ranks for each shard within the same compute node. This architecture provides significant performance benefits for the weight synchronization operation. After the backward pass, we perform all-reduce operations to synchronize model weights, which is an expensive process given the large parameter counts we have to communicate and sync. By placing replicas on the same node, we leverage the high intra-node bandwidth instead of relying on the slower inter-node bandwidth.

        + +

        The effect of this design choice on the other communication collectives generally improves the latencies. The improvement stems from two factors.

        + +
          +
1. By sharding the embedding tables over a reduced number of ranks and conducting communications for the model within the smaller group, we achieve a lower all-to-all latency.
2. With the replication in 2D parallel, the embedding lookup latency on a rank is reduced: we can reduce the local batch size to 1/Nth of the equivalent global batch size, where N is the number of model replicas.
        + +

A production model trace exemplifies these two factors; here we run the 2D parallel job on 1024 GPUs, with a sharding group size of 256 GPUs.

        + +

        State diagram

        + +

        Figure 2: Comparing latencies between non 2D parallel and 2D parallel workloads

        + +

        There are two key levers users have to tune to maximize performance for their workloads:

        + +
          +
1. The size of the model sharding group relative to the global world size. The global world size divided by the sharding group size gives the number of model replicas we will have.
   • To maximize performance, users can look to scale up their model by up to 8x; this scaling factor maintains the intra-host all-reduce.
   • For further scaling, the all-reduce would have to happen inter-host. From our experiments, we did not see an obvious performance regression, and in fact we note advantages of an inter-host all-reduce: we can change our sharding and replica topology to an inter-host all-reduce, which can help us introduce fault-tolerance strategies should a particular host go down.
2. The frequency of all-reduce synchronization. DMPCollection comes with a sync() call, which can be tuned to be called every N training steps, performing a form of local SGD training. At scale, reducing the frequency of synchronization can bring significant performance gains.
        + +

        Future Work

        + +

        Readers should note that 2D sparse parallel training differs from non-parallelized training because we synchronize the embedding table weights rather than the gradients. This approach is made possible by TorchRec’s use of FBGEMM, which provides optimized kernels under the hood. One of FBGEMM’s key optimizations is the fusion of the optimizer in the backward pass. Instead of fully materializing the embedding table gradients—which would consume significant memory—they are passed directly to the optimizer update. Attempting to materialize and synchronize these gradients would create substantial overhead, making that approach impractical.

        + +

Our exploration revealed that to achieve training results comparable to the baseline, we synchronize optimizer states on a delayed schedule, with the timing dependent on the number of sharding/replica groups (i.e., for Adagrad we update the momentum one sync step behind). This approach also enables users to implement local SGD or semi-synchronous training strategies, which can achieve convergence and potentially produce better loss curves than the baseline.

        + +

        We thank you for reading our post! This is an exciting direction we have come across that we hope to develop further to maximize performance of recommendation systems and push the state of the art.

        + + + + +
diff --git a/blog/scaling-vision-model-training-platforms-with-pytorch/index.html b/blog/scaling-vision-model-training-platforms-with-pytorch/index.html
new file mode 100644 index 000000000000..679778b38ed4 --- /dev/null +++ b/blog/scaling-vision-model-training-platforms-with-pytorch/index.html @@ -0,0 +1,814 @@
Scaling Vision Model Training Platforms with PyTorch | PyTorch

        + by + + Vaibhav Aggarwal, Mannat Singh, Anjali Sridhar, Yanghao Li, Shoubhik Debnath, Ronghang Hu, Will Feng, Xinlei Chen, Tingting Markstrum, Diana Liskovich, Anupam Bhatnagar, Chay Ryali, Haoqi Fan, Tete Xiao, Min Xu, Rahul Iyer, Christoph Feichtenhofer, Ross Girshick, Piotr Dollar, Aaron Adcock, Wan-Yen Lo, CK Luk + +

        +

        TL;DR: We demonstrate the use of PyTorch with FairScale’s FullyShardedDataParallel (FSDP) API in writing large vision transformer models. We discuss our techniques for scaling and optimizing these models on a GPU cluster. The goal of this platform scaling effort is to enable research at scale. This blog does not discuss model accuracy, new model architectures, or new training recipes.

        + +

        1. Introduction

        + +

        Latest vision research [1, 2] demonstrates model scaling as a promising research direction. In this project, we aim to enable our platforms to train massive vision transformer (ViT) [3] models. We present our work on scaling the largest trainable ViT from 1B to 120B parameters in FAIR vision platforms. We wrote ViT in PyTorch and leveraged its support for large-scale, distributed training on a GPU cluster.

        + +

        In the rest of this blog, we will first discuss the main challenges, namely scalability, optimization, and numerical stability. Then we will discuss how we tackle them with techniques including data and model parallelism, automatic mixed precision, kernel fusion, and bfloat16. Finally, we present our results and conclude.

        + +

        2. Main Challenges

        + +

        2.1 Scalability

        + +

        The key scalability challenge is to efficiently shard a model’s operations and state across multiple GPUs. A 100B parameter model requires ~200GB of RAM just for parameters, assuming fp16 representation. So, it is impossible to fit the model on a single GPU (A100 has at most 80GB RAM). Therefore, we need some way to efficiently shard a model’s data (input, parameters, activations, and optimizer state) across multiple GPUs.

        + +

Another aspect of this problem is scaling without significantly changing the training recipe. For example, certain representation learning recipes use a global batch size of up to 4096, beyond which we start to see accuracy degradation. We cannot scale to more than 4096 GPUs without using some form of tensor or pipeline parallelism.

        + +

        2.2 Optimization

        + +

        The key optimization challenge is to maintain high GPU utilization even as we scale the number of model parameters and flops. When we scale models to teraflops and beyond, we start to hit major bottlenecks in our software stack that super-linearly increase training time and reduce accelerator utilization. We require hundreds or thousands of GPUs to run just a single experiment. Improvements in accelerator utilization can lead to significant reductions in cost and improve fleet utilization. It enables us to fund more projects and run more experiments in parallel.

        + +

        2.3 Numerical Stability

        + +

        The key stability challenge is to avoid numerical instability and divergence at large scale. We empirically observed in our experiments that the training instability gets severe and hard to deal with when we scale up model sizes, data, batch sizes, learning rate, etc. Vision Transformers particularly face training instability even at a lower parameter threshold. E.g., we find it challenging to train even ViT-H (with just 630M parameters) in mixed-precision mode without using strong data augmentation. We need to study the model properties and training recipes to make sure that the models train stably and converge.

        + +

        3. Our Solutions

        + +

        Figure 1 depicts our solutions to each of the challenges.

        + +

        + +

        + +

        3.1 Addressing scaling challenges with data parallelism and model parallelism

        + +

        We apply various forms of data and model parallelism to enable fitting very large models in GPU memory.

        + +

        We use FairScale’s FullyShardedDataParallel (FSDP) API [4], based on PyTorch, to shard parameters, gradients, and optimizer state across multiple GPUs, thereby reducing the memory footprint per GPU. This process consists of the following three steps:

        + +
          +
• Step 1: We wrapped the entire model in a single FSDP instance. This shards the model parameters at the end of a forward pass and gathers them at the beginning of a forward pass. It enabled us to scale ~3x from 1.5B to 4.5B parameters.
• Step 2: We experimented with wrapping individual model layers in separate FSDP instances. This nested wrapping further reduced the memory footprint by sharding and gathering the parameters of individual model layers instead of the entire model. In this mode, the peak memory is determined by an individually wrapped transformer block held in GPU memory rather than by the entire model.
• Step 3: We used activation checkpointing to reduce the memory consumed by activations. It saves the input tensors and discards the intermediate activation tensors during the forward pass; these are recomputed during the backward pass.
A minimal sketch of these three steps appears below.
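The sketch uses FairScale's public APIs as referenced in [4]; the model layout (a model with a blocks ModuleList) is a placeholder, and flag names may differ slightly across FairScale versions.

import torch
from fairscale.nn.data_parallel import FullyShardedDataParallel as FSDP
from fairscale.nn.checkpoint import checkpoint_wrapper

# model is assumed to expose its transformer layers as model.blocks (placeholder layout).
# Steps 2 and 3: activation-checkpoint each block, then wrap it in its own FSDP instance.
for i, block in enumerate(model.blocks):
    model.blocks[i] = FSDP(checkpoint_wrapper(block), flatten_parameters=True)

# Step 1: the outer wrap shards whatever parameters remain outside the per-block wraps.
model = FSDP(model)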

        In addition, we experimented with model-parallelism techniques such as pipeline parallelism [5], which allow us to scale to more GPUs without increasing the batch size.

        + +

        3.2 Addressing optimization challenges with advanced AMP and kernel fusion

        + +

        Advanced AMP

        + +

        Automatic Mixed Precision (AMP) [6] training refers to training models using a lower precision of bits than FP32 or the default but still maintaining accuracy. We experimented with three levels of AMP as described below:

        + +
          +
• AMP O1: Training in mixed precision where weights are in FP32 and some operations are in FP16. With AMP O1, the ops that might impact accuracy remain in FP32 and are not autocast to FP16.
• AMP O2: Training in mixed precision with more weights and ops in FP16 than in O1. Weights do not implicitly remain in FP32; they are cast to FP16. A copy of the master weights is maintained in FP32 precision and used by the optimizer. If we want the normalization layer weights in FP32, we need to explicitly use layer wrapping to ensure that.
• Full FP16: Training fully in FP16, where weights and operations are in FP16. FP16 is challenging to enable for training due to convergence issues.

        We found that AMP O2 with LayerNorm wrapping in FP32 leads to the best performance without sacrificing accuracy.

        + +

        Kernel Fusion

        + +
          +
• To reduce GPU kernel launch overhead and increase GPU work granularity, we experimented with kernel fusions, including fused dropout and fused layer norm, using the xformers library [7].
        + +

        3.3 Addressing stability challenges by studying ops numerical stability and training recipes

        + +

        BFloat16 in general but with LayerNorm in FP32

        + +

        The bfloat16 (BF16) [8] floating-point format provides the same dynamic range as FP32 with a memory footprint identical to FP16. We found that we could train models in the BF16 format using the same set of hyperparameters as in FP32, without special parameter tuning. Nevertheless, we found that we need to keep LayerNorm in FP32 mode in order for the training to converge.

        + +

        3.4 Final training recipe

        + +

        A summary of the final training recipe.

        + +
          +
1. Wrap the outer model in an FSDP instance. Enable parameter sharding after the forward pass.
2. Wrap individual ViT blocks with activation checkpointing, nested FSDP wrapping, and parameter flattening.
3. Enable mixed precision mode (AMP O2) with bfloat16 representation. Maintain the optimizer state in FP32 precision to enhance numerical stability.
4. Wrap normalization layers like LayerNorm in FP32 for better numerical stability.
5. Maximize Nvidia TensorCore utilization by keeping matrix dimensions multiples of 8. For more details, check the Nvidia Tensor Core Performance Guide.
A hedged sketch of items 2 through 4 follows.
        + +
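This sketch again uses FairScale's FSDP; keeping LayerNorm in FP32 by giving it its own full-precision wrap follows the layer-wrapping idea described in the AMP section, and the exact constructor flags may vary by FairScale version.

import torch
from fairscale.nn.data_parallel import FullyShardedDataParallel as FSDP
from fairscale.nn.checkpoint import checkpoint_wrapper

def wrap_block(block: torch.nn.Module) -> torch.nn.Module:
    # Give each LayerNorm its own FSDP wrap with no low-precision casting (stays FP32).
    for name, child in block.named_children():
        if isinstance(child, torch.nn.LayerNorm):
            setattr(block, name, FSDP(child))
    # The rest of the block: activation checkpointing, flattened params, BF16 compute.
    return FSDP(
        checkpoint_wrapper(block),
        flatten_parameters=True,
        mixed_precision=True,
        compute_dtype=torch.bfloat16,
    )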

        4. Results

        + +

        In this section, we show the scaling results of ViT on three types of tasks: (1) image classification, (2) object detection (3) video understanding. Our key result is that we are able to train massive ViT backbones across these vision tasks after applying the discussed scaling and optimization techniques. This enables vision research at a much larger scale. We trained the models to convergence to verify that we maintain the current baselines even with all the optimizations. A common trend in Figures 2, 3, 4 is that we are able to train up to 25B-param models with an epoch time of less than 4 hours on 128 A100 GPUs. The 60B and 120B models are relatively slower to train.

        + +

        Figure 2 shows the image-classification scaling result. It plots the epoch time for training ViTs on ImageNet using 128 A100-80GB GPUs with different model sizes.

        + +

        + +

        + +

        +Figure 2: Image-classification scaling result. +

        + +

        Figure 3 shows the object-detection scaling result. It plots the epoch time for training ViTDet [9] with different ViT backbones on COCO using 128 A100-80GB GPUs.

        + +

        + +

        + +

        +Figure 3: Object-detection scaling result. +

        + +

        Figure 4 shows the video-understanding scaling result. It plots the epoch time for training MViTv2 [10] models on Kinetics 400 [11] using 128 V100 (32 GB) GPUs in FP32.

        + +

        + +

        + +

        +Figure 4: Video-understanding scaling result. +

        + +

Figure 5 shows the optimization result with the ViT-H model from Figure 2 on 8 A100-40GB GPUs. Three versions are used: (1) the baseline, which uses PyTorch's DDP [12] with AMP O1; (2) FSDP + AMP O2 + other optimizations; and (3) FSDP + FP16 + other optimizations. These optimizations altogether speed up training by up to 2.2x.

        + +

        + +

        + +

        +Figure 5: Training speedups from various optimizations. +

        + +

        5. Concluding Remarks

        + +

        We have demonstrated the use of PyTorch with FairScale’s FullyShardedDataParallel (FSDP) API in writing large vision transformer models. We discuss our techniques for scaling and optimizing these models on a GPU cluster. We hope that this article can motivate others to develop large-scale ML models with PyTorch and its ecosystem.

        + +

        References

        + +

        [1] Masked Autoencoders Are Scalable Vision Learners

        + +

        [2] Revisiting Weakly Supervised Pre-Training of Visual Perception Models

        + +

        [3] An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale

        + +

        [4] fairscale.nn.FullyShardedDataParallel

        + +

        [5] Pipeline parallelism in PyTorch

        + +

        [6] Automatic Mixed Precision (AMP) in PyTorch

        + +

        [7] xformers

        + +

        [8] The bfloat16 numerical format

        + +

        [9] Exploring Plain Vision Transformer Backbones for Object Detection

        + +

        [10] MViTv2: Improved Multiscale Vision Transformers for Classification and Detection

        + +

        [11] https://www.deepmind.com/open-source/kinetics

        + +

        [12] Getting Started with Distributed Data Parallel (DDP)

        + +
diff --git a/blog/sglang-joins-pytorch/index.html b/blog/sglang-joins-pytorch/index.html
new file mode 100644 index 000000000000..5ef1322bcb42 --- /dev/null +++ b/blog/sglang-joins-pytorch/index.html @@ -0,0 +1,733 @@
SGLang Joins PyTorch Ecosystem: Efficient LLM Serving Engine | PyTorch

        + by + + SGLang Team + +

        +

        sglang logo

        + +

        We’re thrilled to announce that the SGLang project has been integrated into the PyTorch ecosystem! This integration ensures that SGLang aligns with PyTorch’s standards and practices, providing developers with a reliable and community-supported framework for fast and flexible serving of LLMs.

        + +

        To view the PyTorch Ecosystem, see the PyTorch Landscape and learn more about how projects can join the PyTorch Ecosystem.

        + +

        About SGLang

        + +

        SGLang is a fast-serving engine for large language models and vision language models. It makes the interaction with models faster and more controllable by co-designing the backend runtime and frontend language.

        + +

        The core features include:

        + +
          +
• Fast Backend Runtime: Provides efficient serving with RadixAttention for prefix caching, zero-overhead CPU scheduler, continuous batching, token attention (paged attention), speculative decoding, tensor parallelism, chunked prefill, structured outputs, and quantization (FP8/INT4/AWQ/GPTQ).
• Flexible Frontend Language: Offers an intuitive interface for programming LLM applications, including chained generation calls, advanced prompting, control flow, multi-modal inputs, parallelism, and external interactions.
• Extensive Model Support: Supports a wide range of generative models (Llama, Gemma, Mistral, Qwen, DeepSeek, LLaVA, etc.), embedding models (e5-mistral, gte, mcdse) and reward models (Skywork), with easy extensibility for integrating new models.
• Active Community: SGLang is open source and backed by an active community with industry adoption.
        + +

        SGLang is famous for its fast speed. It can often significantly outperform other state-of-the-art frameworks in terms of serving throughput and latency. You can learn more about the underlying techniques from the past release blog posts: v0.2 blog, v0.3 blog, v0.4 blog.

        + +

        SGLang has been widely adopted by leading industry companies and frontier research labs. For example, xAI uses SGLang to serve its flagship model, Grok 3, which is currently the best model according to the Chatbot Arena leaderboard. Microsoft Azure uses SGLang to serve DeepSeek R1 on AMD GPUs, which is currently the best open source model.

        + +

        Serving DeepSeek Models

        + +

        You can easily launch a Docker container to serve a DeepSeek model with the following command:

        + +
        # Pull the latest image
        +docker pull lmsysorg/sglang:latest
        +
        +# Launch a server
        +docker run --gpus all --shm-size 32g -p 30000:30000 -v ~/.cache/huggingface:/root/.cache/huggingface --ipc=host --network=host --privileged lmsysorg/sglang:latest \
        +    python3 -m sglang.launch_server --model deepseek-ai/DeepSeek-V3 --tp 8 --trust-remote-code --port 30000
        +
        + +

Then you can query the server with the OpenAI-compatible API:

        + +
import openai

# Point the OpenAI client at the local SGLang server.
client = openai.Client(base_url="http://127.0.0.1:30000/v1", api_key="None")

response = client.chat.completions.create(
    model="deepseek-ai/DeepSeek-V3",
    messages=[
        {"role": "user", "content": "List 3 countries and their capitals."},
    ],
    temperature=0,
    max_tokens=64,
)
print(response.choices[0].message.content)
        +
        + +

        The server launch command above works for 8xH200. You can find detailed instructions for other hardware (MI300X, H100, A100, H20, L40S) at https://docs.sglang.ai/references/deepseek.html.

        + +

SGLang integrates DeepSeek-specific optimizations, such as MLA throughput optimizations, MLA-optimized kernels, data-parallel attention, multi-token prediction, and DeepGemm, making it a top choice for serving DeepSeek models at dozens of companies, including AMD, NVIDIA, and many cloud providers. The team is actively working on integrating more optimizations following the 2025 H1 roadmap below.

        + +

        Serving Llama Models

        + +

        Similarly, you can launch the server for a Llama 3.1 text model with:

        + +
        python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-8B-Instruct
        +
        + +

        Or a Llama 3.2 multimodal model with:

        + +
        python3 -m sglang.launch_server --model-path meta-llama/Llama-3.2-11B-Vision-Instruct  --chat-template=llama_3_vision
        +
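For reference, querying the multimodal server can look roughly like the sketch below. It assumes the server launched above is listening on the default port 30000 and that the OpenAI-compatible endpoint accepts image_url content parts; the image URL is only a placeholder.

import openai

# Assumes the Llama 3.2 vision server above is running locally on the default port 30000.
client = openai.Client(base_url="http://127.0.0.1:30000/v1", api_key="None")

response = client.chat.completions.create(
    model="meta-llama/Llama-3.2-11B-Vision-Instruct",
    messages=[
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "Describe this image in one sentence."},
                # Placeholder image URL; replace with your own.
                {"type": "image_url", "image_url": {"url": "https://example.com/image.png"}},
            ],
        }
    ],
    temperature=0,
    max_tokens=64,
)
print(response.choices[0].message.content)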
        + +

        Roadmap

        + +

This year, the SGLang team will continue to push the boundaries of system efficiency. You can find the 2025 H1 roadmap here. The focus areas are:

        + +
• Throughput-oriented large-scale deployment similar to the DeepSeek inference system
• Long context optimizations
• Low latency speculative decoding
• Reinforcement learning training framework integration
• Kernel optimizations
        + +

        Community

        + +

        SGLang has been deployed to large-scale production, generating trillions of tokens every day. It has an active community with over three hundred contributors on GitHub. It is supported by the following institutions: AMD, Atlas Cloud, Baseten, Cursor, DataCrunch, Etched, Hyperbolic, iFlytek, Jam & Tea Studios, LinkedIn, LMSYS, Meituan, Nebius, Novita AI, NVIDIA, RunPod, Stanford, UC Berkeley, UCLA, xAI, and 01.AI.

        + +

        logos

        + +

        Conclusion

        + +

        We’re excited to welcome SGLang to the PyTorch ecosystem. SGLang accelerates the serving of large language and vision language models. It’s widely adopted by industry, powering the large-scale online serving of frontier models like Grok and DeepSeek.

        + +

        We invite you to explore the SGLang GitHub repo, join the community on Slack, and reach out to contact@sglang.ai for inquiries or collaboration opportunities. Together, we can make powerful AI models accessible to everyone.

        + +
        +
        +
        +
        + + +
        +
        +
        +
        +

        +
        +
        +
        + +
        + +
        + +
        +
        +
        +
        + + +
        +
        +
        + + +
diff --git a/blog/snowflake-joins-pytorch/index.html b/blog/snowflake-joins-pytorch/index.html
new file mode 100644
index 000000000000..b9711df75269
--- /dev/null
+++ b/blog/snowflake-joins-pytorch/index.html
@@ -0,0 +1,673 @@

Snowflake Joins the PyTorch Foundation as a General Member | PyTorch
        +
        +
        +
        + +
        +
        + + +
        + + + + + + + + +
        + +
        +
        + + + +
        +
        +
        + +
        +

        + by + + Team PyTorch + +

        +

        Snowflake logo

        + +

        The PyTorch Foundation, a neutral home for the deep learning community to collaborate on the open source PyTorch framework and ecosystem, is announcing today that Snowflake has joined as a general member.

        + +

        Snowflake enables thousands of organizations to unite siloed data, discover and securely share data, power data applications, and execute diverse AI/ML and analytic workloads across multiple clouds and geographies.

        + +

        “By joining the PyTorch community, we know that Snowflake will help accelerate data warehousing solutions and cutting-edge AI frameworks. This showcases the commitment to advancing innovation for data and artificial intelligence,” said Ibrahim Haddad, Executive Director, PyTorch Foundation. “We are thrilled to have Snowflake join the PyTorch Foundation, marking a significant stride in the convergence of data management and deep learning technologies.”

        + +

Snowflake's scalability and SQL support enable it to work alongside AI technologies, handling the storage and analysis of the large datasets generated by machine learning and AI applications.

        + +

With the integrated repository of Python libraries from Anaconda in Snowpark, Snowflake users have long had a streamlined way to deploy pre-trained PyTorch models in Snowflake and to easily and securely make them part of applications. Now, with the addition of GPU instances in Snowpark Container Services (in private preview), training and other computationally intensive processing using PyTorch will also be streamlined, providing teams with an end-to-end solution for AI development and deployment.

        + +

        “Most if not all of our customers incorporate open source software as part of their data stacks, so it is critical for us to work with open source ecosystems like the PyTorch Foundation, alongside incorporating open source to meet the needs of our customers,” said Adrien Treuille, Co-Founder of Streamlit, Director of Product Management at Snowflake. “As AI developers continue to integrate their models as part of applications, the power of Snowflake and PyTorch — coupled with Streamlit as the powerful front-end — creates near-limitless innovation for developers looking to build next-generation apps and unlock even more use cases.”

        + +

        To learn more about the power of Snowflake and PyTorch, tune into Snowflake’s developer conference for AI and apps, BUILD.

        + +

        To learn more about how you can be a part of the PyTorch Foundation, visit our website.

        + +

        About Snowflake

        + +

        Snowflake enables every organization to mobilize their data with Snowflake’s Data Cloud. Customers use the Data Cloud to unite siloed data, discover and securely share data, power data applications, and execute diverse AI/ML and analytic workloads. Wherever data or users live, Snowflake delivers a single data experience that spans multiple clouds and geographies. Thousands of customers across many industries, including 639 of the 2023 Forbes Global 2000 (G2K) as of July 31, 2023, use Snowflake Data Cloud to power their businesses. Learn more at snowflake.com.

        + +

        About PyTorch Foundation

        + +

        The PyTorch Foundation is a neutral home for the deep learning community to collaborate on the open source PyTorch framework and ecosystem. The PyTorch Foundation is supported by its members and leading contributors to the PyTorch open source project. The Foundation leverages resources provided by members and contributors to enable community discussions and collaboration.

        + +

        About The Linux Foundation

        + +

        The Linux Foundation is the world’s leading home for collaboration on open source software, hardware, standards, and data. Linux Foundation projects are critical to the world’s infrastructure including Linux, Kubernetes, Node.js, ONAP, PyTorch, RISC-V, SPDX, OpenChain, and more. The Linux Foundation focuses on leveraging best practices and addressing the needs of contributors, users, and solution providers to create sustainable models for open collaboration. For more information, please visit us at linuxfoundation.org. The Linux Foundation has registered trademarks and uses trademarks. For a list of trademarks of The Linux Foundation, please see its trademark usage page. Linux is a registered trademark of Linus Torvalds.

        + +
        +
        +
        +
        + + +
        +
        +
        +
        +

        +
        +
        +
        + +
        + +
        + +
        +
        +
        +
        + + +
        +
        +
        + + +
diff --git a/blog/speeding-up-vits/index.html b/blog/speeding-up-vits/index.html
new file mode 100644
index 000000000000..6f6c1d4b21f9
--- /dev/null
+++ b/blog/speeding-up-vits/index.html
@@ -0,0 +1,752 @@

Speeding up ViTs using Block Sparsity | PyTorch
        +
        +
        +
        + +
        +
        + + +
        + + + + + + + + +
        + +
        +
        + +
        +
        +

        May 14, 2024

        +

        + Speeding up ViTs using Block Sparsity +

        +
        +
        + +
        +
        +
        + +
        +

        + by + + FAIR at Meta: Mostafa Elhoushi, Sensors and Systems at Meta Reality Labs Research: Syed Shakib Sarwar, Aaryan Kothapalli, Mia Kasperek, Barbara De Salvo, PyTorch at Meta: Christian Puhrsch, Jesse Cai, Joe Isaacson, Quantsight: Andrew James, Pearu Peterson, Nikita Vedeneev + +

        +

        TLDR: We show promising results of up to a 1.46x speedup with <2% drop in accuracy on float32 Vision Transformers on A100 GPUs by applying block sparsity on MLP module’s weights. This approach can potentially be applied to other types of transformers including large language models. Our implementation and benchmarks to reproduce our results are available at https://github.com/pytorch-labs/superblock.

        + +

        Introduction

        + +

PyTorch has landed a number of improvements to the CUDA kernels that implement block sparse matrix multiplications. Recent updates to PyTorch can lead to up to a 4.8x speedup over dense baselines on large matrix multiplication shapes with high sparsity levels.

        + +

        In this blog, we show the promising results of applying block sparsity on weights of linear layers of MLP (multi-layer perceptron) layers in vision transformers (ViTs) and show end-to-end model speedups on A100 Nvidia GPUs.

        + +

        As a recap, block sparsity sparsifies weights in tiles of blocks of predetermined size, rather than sparsifying individual elements. This particular sparsity pattern is interesting because it is amenable to GPU acceleration via fast sparse kernels. For more information about the differences between different sparsity patterns, or about sparsity as a whole, please check out torchao.

        + +


        + +

        Illustrations of different types of sparsity.

        + +

        Approach

        + +

        Our approach can be broken down into two distinct steps:

        + +
1. Training the model from scratch using block-sparse trainable masks (supermasks) on subnetworks.
2. Folding these masks into our weights to accelerate them for inference.
        + +

We explain our training and inference steps below.

        + +

        Training

        + +

Starting with an uninitialized Vision Transformer, we apply random trainable masks with a specified block size and sparsity level on the weights of the output projection linear layer of the attention blocks, the weights of the two linear layers inside the MLP, a.k.a. FFN (feed-forward network), as well as the final linear classification layer. The forward pass during training follows the supermask approach: each mask is converted to a binary map using a threshold tuned to the sparsity requirement, e.g., if we want 80% sparsity, the threshold is automatically tuned to keep the top 20% of weights. The masks are made of square blocks of <block size> x <block size> elements, where <block size> is a hyperparameter. The priority of each block of weights depends on its mask value (score), which is trained. We multiply the binary masks of each layer with the weights to sparsify the model.
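As a rough, hypothetical sketch of this idea (not the Superblock implementation itself), a block-wise supermask layer can be written as follows: a trainable score per block is thresholded so that only the top (1 - sparsity) fraction of blocks is kept, a straight-through estimator lets gradients reach the scores, and the expanded binary mask is multiplied with the weight. All names and defaults here are illustrative.

import torch
import torch.nn as nn
import torch.nn.functional as F

class BlockSupermaskLinear(nn.Linear):
    """Linear layer whose weight is masked by trainable per-block scores (illustrative sketch)."""

    def __init__(self, in_features, out_features, block_size=32, sparsity=0.8, **kwargs):
        super().__init__(in_features, out_features, **kwargs)
        assert in_features % block_size == 0 and out_features % block_size == 0
        self.block_size, self.sparsity = block_size, sparsity
        # One trainable score per block_size x block_size tile of the weight.
        self.scores = nn.Parameter(torch.randn(out_features // block_size, in_features // block_size))

    def forward(self, x):
        # Keep the top (1 - sparsity) fraction of blocks by score.
        k = max(1, int((1.0 - self.sparsity) * self.scores.numel()))
        threshold = torch.topk(self.scores.flatten(), k).values.min()
        hard = (self.scores >= threshold).float()
        # Straight-through estimator: binary mask in the forward pass, identity gradient to the scores.
        block_mask = hard + self.scores - self.scores.detach()
        # Expand the block mask to element level and apply it to the weight.
        mask = block_mask.repeat_interleave(self.block_size, 0).repeat_interleave(self.block_size, 1)
        return F.linear(x, self.weight * mask, self.bias)

In training, such a layer would simply stand in for nn.Linear in the MLP blocks; at sparsity=0.8, roughly 80% of the weight blocks are zeroed out in every forward pass.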

        + +


        + +

        Illustration of the Supermask sparsification approach.

        + +

        Inference

        + +

After training, the dense weights can be turned into sparse weights by multiplying with the mask and stored for inference. At this stage, although the weights have a high percentage of zero values, they are still stored in dense format. We use PyTorch's to_sparse_bsr() API to convert the weights to the Block Sparse Representation (BSR) format, which stores only the non-zero values and the indices of their blocks. This step only needs to be done once, and the results can be cached for runtime.
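A minimal sketch of this one-time conversion is shown below. The shapes, sparsity level, and mask construction are placeholders standing in for a trained, mask-multiplied MLP weight, and it assumes a recent PyTorch build on an A100/H100, where the linear op dispatches to the block-sparse kernels for BSR weights as described in this post.

import torch
import torch.nn.functional as F

block_size = 64
# Placeholder: a trained, mask-multiplied MLP weight from a ViT block (3072 x 768),
# where ~90% of the 64x64 blocks are exactly zero.
weight = torch.randn(3072, 768, device="cuda")
keep = torch.rand(3072 // block_size, 768 // block_size, device="cuda") > 0.9
mask = keep.repeat_interleave(block_size, 0).repeat_interleave(block_size, 1)
weight = weight * mask

# One-time conversion: store only the non-zero blocks and their block indices.
bsr_weight = weight.to_sparse_bsr(blocksize=(block_size, block_size))

# At runtime the linear op is called with the BSR weight instead of the dense one.
x = torch.randn(256, 768, device="cuda")
y = F.linear(x, bsr_weight)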

        + +

During runtime, no changes in code are required. We just pass any input tensor to the model, and when the forward() function of a sparsified linear layer is invoked, PyTorch takes care of invoking the optimized matrix multiplication for block sparse weights. This should work on A100 as well as H100 NVIDIA GPUs.

        + +

        Results: Microbenchmarks

        + +

        To validate the viability of block sparsity from a performance standpoint, we first ran a series of microbenchmarks using this simple script. Using the linear shapes from ViT-b, we compared the speedup of our block sparse kernels across a single linear layer as we varied the sparsity level and block size of the weight matrix.

        + +

We ran with the PyTorch 2.3.0.dev20240305+cu121 nightly on NVIDIA A100s and report the speedup of each sparsity configuration compared to the dense baseline. We observed positive speedups when block size >= 32 or sparsity level >= 0.8 for float32, while for bfloat16 we observed smaller speedups, usually only for block size 64 and higher sparsity levels. Hence, for end-to-end speedups on the model, we focus in this blog on float32 and leave bfloat16 for future work.

        + +

        Micro benchmarking results on linear layers of ViT-b-16.

        + +


        + +

        Results: Vision Transformers

        + +

        Once we confirmed that we were able to show speedups over the linear layers, we focused on showing end-to-end speedups on ViT_B_16.

        + +

We trained this model from scratch on the ImageNet dataset using the standard ViT_B_16 recipe. We show speedups for sparsifying the MLP modules and leave sparsifying the weights of the input and output projections of attention for future work.

        + +

        We looked at wall-clock inference speedup, focusing on batch size 256. We found that:

        + +
• For 90% sparsity we can get 1.24x, 1.37x, and 1.65x speedups for block sizes 16, 32, and 64 respectively.
• To obtain a speedup, the minimum sparsities for block sizes 16, 32, and 64 are 0.86, 0.82, and 0.7 respectively. Hence, as expected, the larger the block size, the smaller the sparsity needed to obtain a speedup.
        + +

We note a limitation of the to_sparse_bsr() API: layer dimensions need to be multiples of the block size. Since the dimensions of the last FC classification layer in ViT were not a multiple of the block size, it was not converted to the BSR representation in our experiments.

        + +


        + +

        Speedup on ViT-b-16 with batch size 256 on MLP modules across different batch sparsities and block sizes.

        + +

We also explored the speedup at different batch sizes for 90% sparsity. We observed a speedup over the baseline for batch sizes of 16 and above. While bigger block sizes give bigger speedups at the largest batch sizes, the smallest batch size needed to obtain a >1x speedup is lower for smaller block sizes.

        + +

We believe on-device hardware can obtain speedups at batch size 1 because, unlike server GPUs, it can be fully utilized at such small batch sizes.

        + +


        + +

        Speedup on ViT-b-16 with 90% sparsity on MLP modules across different batch sizes and block sizes.

        + +

Looking at the Top-1 accuracy of the sparsified models on the ImageNet-blurred test set for different block sizes and sparsities, we see a few expected results:

        + +
• Low levels of sparsity (<=70%) show no meaningful regression in accuracy.
• Mid levels of sparsity (>=80% to <90%) show limited regression in accuracy.
• High levels of sparsity (>=90%) remove so many weights that accuracy is significantly impacted.
        + +

More research could be done to improve accuracy at higher sparsities and larger block sizes. We hope that the block sparsity support in PyTorch and the speedups illustrated in this blog will encourage researchers to explore more accurate sparsification approaches.

        + +


        + +

        Accuracies on training ViT-b-16 on ImageNet-blurred using the SuperMask approach.

        + +

        Next Steps

        + +

We have shown promising speedups from block-sparsifying the MLP modules of ViT in float32 precision. There is still more work to be done to observe speedups with bfloat16, and we hope to make progress on that soon. Possible next steps to further optimize block sparsity on vision transformers, and transformers in general, include:

        + +
• Perform block sparsity on attention input and output projections.
• Perform block sparsity during finetuning rather than training from scratch.
• Perform further optimizations on the matmul kernels for ViT's linear operator specific shapes (especially for 80% and lower sparsity).
• Combine with other optimizations such as int8 and torch.compile().
• Explore other weight sparsification algorithms, e.g., Spartan, to improve accuracy.
• Explore selecting which weights to sparsify (e.g., specific transformer layers).
        + +

        Please reach out to melhoushi@meta.com if you have questions or are interested in contributing to block sparsification!

        + +

        Additionally if you’re broadly interested in sparsity please feel free to reach out to @jcaip / jessecai@meta.com and please come check out torchao, a community we’re building for architecture optimization techniques like quantization and sparsity.

        + +
        +
        +
        +
        + + +
        +
        +
        +
        +

        +
        +
        +
        + +
        + +
        + +
        +
        +
        +
        + + +
        +
        +
        + + +
diff --git a/blog/stochastic-weight-averaging-in-pytorch/index.html b/blog/stochastic-weight-averaging-in-pytorch/index.html
new file mode 100644
index 000000000000..dbb239364df6
--- /dev/null
+++ b/blog/stochastic-weight-averaging-in-pytorch/index.html
@@ -0,0 +1,878 @@

Stochastic Weight Averaging in PyTorch | PyTorch
        +
        +
        +
        + +
        +
        + + +
        + + + + + + + + +
        + +
        +
        + +
        +
        +

        April 29, 2019

        +

        + Stochastic Weight Averaging in PyTorch +

        +
        +
        + +
        +
        +
        + +
        +

        + by + + Pavel Izmailov and Andrew Gordon Wilson + +

        +

        In this blogpost we describe the recently proposed Stochastic Weight Averaging (SWA) technique [1, 2], and its new implementation in torchcontrib. SWA is a simple procedure that improves generalization in deep learning over Stochastic Gradient Descent (SGD) at no additional cost, and can be used as a drop-in replacement for any other optimizer in PyTorch. SWA has a wide range of applications and features:

        + +
1. SWA has been shown to significantly improve generalization in computer vision tasks, including VGG, ResNets, Wide ResNets and DenseNets on ImageNet and CIFAR benchmarks [1, 2].
2. SWA provides state-of-the-art performance on key benchmarks in semi-supervised learning and domain adaptation [2].
3. SWA is shown to improve the stability of training as well as the final average rewards of policy-gradient methods in deep reinforcement learning [3].
4. An extension of SWA can obtain efficient Bayesian model averaging, as well as high quality uncertainty estimates and calibration in deep learning [4].
5. SWA for low precision training, SWALP, can match the performance of full-precision SGD even with all numbers quantized down to 8 bits, including gradient accumulators [5].
        + +

        In short, SWA performs an equal average of the weights traversed by SGD with a modified learning rate schedule (see the left panel of Figure 1.). SWA solutions end up in the center of a wide flat region of loss, while SGD tends to converge to the boundary of the low-loss region, making it susceptible to the shift between train and test error surfaces (see the middle and right panels of Figure 1).

        + +
        + +
        + +

        Figure 1. Illustrations of SWA and SGD with a Preactivation ResNet-164 on CIFAR-100 [1]. Left: test error surface for three FGE samples and the corresponding SWA solution (averaging in weight space). Middle and Right: test error and train loss surfaces showing the weights proposed by SGD (at convergence) and SWA, starting from the same initialization of SGD after 125 training epochs. Please see [1] for details on how these figures were constructed.

        + +

        With our new implementation in torchcontrib using SWA is as easy as using any other optimizer in PyTorch:

        + +
import torch
from torchcontrib.optim import SWA

...
...

# training loop
base_opt = torch.optim.SGD(model.parameters(), lr=0.1)
opt = SWA(base_opt, swa_start=10, swa_freq=5, swa_lr=0.05)
for _ in range(100):
    opt.zero_grad()
    loss_fn(model(input), target).backward()
    opt.step()
opt.swap_swa_sgd()
        +
        + +

        You can wrap any optimizer from torch.optim using the SWA class, and then train your model as usual. When training is complete you simply call swap_swa_sgd() to set the weights of your model to their SWA averages. Below we explain the SWA procedure and the parameters of the SWA class in detail. We emphasize that SWA can be combined with any optimization procedure, such as Adam, in the same way that it can be combined with SGD.
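For instance, a hedged sketch of wrapping Adam instead of SGD, assuming the same model and training loop as the snippet above and reusing the same auto-mode hyperparameters purely for illustration:

import torch
import torchcontrib

# Any torch.optim optimizer can be wrapped; here Adam is used instead of SGD.
base_opt = torch.optim.Adam(model.parameters(), lr=1e-3)
opt = torchcontrib.optim.SWA(base_opt, swa_start=10, swa_freq=5, swa_lr=0.05)

# ... train exactly as before, then:
opt.swap_swa_sgd()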

        + +

        Is this just Averaged SGD?

        + +

At a high level, averaging SGD iterates dates back several decades in convex optimization [6, 7], where it is sometimes referred to as Polyak-Ruppert averaging, or averaged SGD. But the details matter. Averaged SGD is often employed in conjunction with a decaying learning rate and an exponential moving average, typically for convex optimization. In convex optimization, the focus has been on improved rates of convergence. In deep learning, this form of averaged SGD smooths the trajectory of SGD iterates but does not perform very differently from conventional SGD.

        + +

        By contrast, SWA is focused on an equal average of SGD iterates with a modified cyclical or high constant learning rate, and exploits the flatness of training objectives [8] specific to deep learning for improved generalization.

        + +

        Stochastic Weight Averaging

        + +

        There are two important ingredients that make SWA work. First, SWA uses a modified learning rate schedule so that SGD continues to explore the set of high-performing networks instead of simply converging to a single solution. For example, we can use the standard decaying learning rate strategy for the first 75% of training time, and then set the learning rate to a reasonably high constant value for the remaining 25% of the time (see the Figure 2 below). The second ingredient is to average the weights of the networks traversed by SGD. For example, we can maintain a running average of the weights obtained in the end of every epoch within the last 25% of training time (see Figure 2).

        +
        + +
        + +

        Figure 2. Illustration of the learning rate schedule adopted by SWA. Standard decaying schedule is used for the first 75% of the training and then a high constant value is used for the remaining 25%. The SWA averages are formed during the last 25% of training.

        + +

        In our implementation the auto mode of the SWA optimizer allows us to run the procedure described above. To run SWA in auto mode you just need to wrap your optimizer base_opt of choice (can be SGD, Adam, or any other torch.optim.Optimizer) with SWA(base_opt, swa_start, swa_freq, swa_lr). After swa_start optimization steps the learning rate will be switched to a constant value swa_lr, and in the end of every swa_freq optimization steps a snapshot of the weights will be added to the SWA running average. Once you run opt.swap_swa_sgd(), the weights of your model are replaced with their SWA running averages.

        + +

        Batch Normalization

        + +

        One important detail to keep in mind is batch normalization. Batch normalization layers compute running statistics of activations during training. Note that the SWA averages of the weights are never used to make predictions during training, and so the batch normalization layers do not have the activation statistics computed after you reset the weights of your model with opt.swap_swa_sgd(). To compute the activation statistics you can just make a forward pass on your training data using the SWA model once the training is finished. In the SWA class we provide a helper function opt.bn_update(train_loader, model). It updates the activation statistics for every batch normalization layer in the model by making a forward pass on the train_loader data loader. You only need to call this function once in the end of training.
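In code, assuming the auto-mode opt from the earlier snippet and a train_loader DataLoader, the end of training looks roughly like:

# After the training loop finishes:
opt.swap_swa_sgd()                    # replace the model weights with their SWA averages
opt.bn_update(train_loader, model)    # recompute BatchNorm statistics with a pass over the training data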

        + +

        Advanced Learning-Rate Schedules

        + +

        SWA can be used with any learning rate schedule that encourages exploration of the flat region of solutions. For example, you can use cyclical learning rates in the last 25% of the training time instead of a constant value, and average the weights of the networks corresponding to the lowest values of the learning rate within each cycle (see Figure 3).

        + +
        + +
        + +

        Figure 3. Illustration of SWA with an alternative learning rate schedule. Cyclical learning rates are adopted in the last 25% of training, and models for averaging are collected in the end of each cycle.

        + +

        In our implementation you can implement custom learning rate and weight averaging strategies by using SWA in the manual mode. The following code is equivalent to the auto mode code presented in the beginning of this blogpost.

        + +
        opt = torchcontrib.optim.SWA(base_opt)
        +for i in range(100):
        +    opt.zero_grad()
        +    loss_fn(model(input), target).backward()
        +    opt.step()
        +    if i > 10 and i % 5 == 0:
        +        opt.update_swa()
        +opt.swap_swa_sgd()
        +
        + +

        In manual mode you don’t specify swa_start, swa_lr and swa_freq, and just call opt.update_swa() whenever you want to update the SWA running averages (for example in the end of each learning rate cycle). In manual mode SWA doesn’t change the learning rate, so you can use any schedule you want as you would normally do with any other torch.optim.Optimizer.

        + +

        Why does it work?

        + +

        SGD converges to a solution within a wide flat region of loss. The weight space is extremely high-dimensional, and most of the volume of the flat region is concentrated near the boundary, so SGD solutions will always be found near the boundary of the flat region of the loss. SWA on the other hand averages multiple SGD solutions, which allows it to move towards the center of the flat region.

        + +

        We expect solutions that are centered in the flat region of the loss to generalize better than those near the boundary. Indeed, train and test error surfaces are not perfectly aligned in the weight space. Solutions that are centered in the flat region are not as susceptible to the shifts between train and test error surfaces as those near the boundary. In Figure 4 below we show the train loss and test error surfaces along the direction connecting the SWA and SGD solutions. As you can see, while SWA solution has a higher train loss compared to the SGD solution, it is centered in the region of low loss, and has a substantially better test error.

        + +
        + +
        + +

        Figure 4. Train loss and test error along the line connecting the SWA solution (circle) and SGD solution (square). SWA solution is centered in a wide region of low train loss while the SGD solution lies near the boundary. Because of the shift between train loss and test error surfaces, SWA solution leads to much better generalization.

        + +

        Examples and Results

        + +

        We released a GitHub repo here with examples of using the torchcontrib implementation of SWA for training DNNs. For example, these examples can be used to achieve the following results on CIFAR-100:

DNN (Budget)          | SGD          | SWA 1 Budget | SWA 1.25 Budgets | SWA 1.5 Budgets
VGG16 (200)           | 72.55 ± 0.10 | 73.91 ± 0.12 | 74.17 ± 0.15     | 74.27 ± 0.25
PreResNet110 (150)    | 76.77 ± 0.38 | 78.75 ± 0.16 | 78.91 ± 0.29     | 79.10 ± 0.21
PreResNet164 (150)    | 78.49 ± 0.36 | 79.77 ± 0.17 | 80.18 ± 0.23     | 80.35 ± 0.16
WideResNet28x10 (200) | 80.82 ± 0.23 | 81.46 ± 0.23 | 81.91 ± 0.27     | 82.15 ± 0.27
        + +

        Semi-Supervised Learning

        + +

        In a follow-up paper SWA was applied to semi-supervised learning, where it illustrated improvements beyond the best reported results in multiple settings. For example, with SWA you can get 95% accuracy on CIFAR-10 if you only have the training labels for 4k training data points (the previous best reported result on this problem was 93.7%). This paper also explores averaging multiple times within epochs, which can accelerate convergence and find still flatter solutions in a given time.

        +
        + +
        + +

        Figure 5. Performance of fast-SWA on semi-supervised learning with CIFAR-10. fast-SWA achieves record results in every setting considered.

        + +

        Calibration and Uncertainty Estimates

        +

        SWA-Gaussian (SWAG) is a simple, scalable and convenient approach to uncertainty estimation and calibration in Bayesian deep learning. Similarly to SWA, which maintains a running average of SGD iterates, SWAG estimates the first and second moments of the iterates to construct a Gaussian distribution over weights. SWAG distribution approximates the shape of the true posterior: Figure 6 below shows the SWAG distribution on top of the posterior log-density for PreResNet-164 on CIFAR-100.

        +
        + +
        +

        Figure 6. SWAG distribution on top of posterior log-density for PreResNet-164 on CIFAR-100. The shape of SWAG distribution is aligned with the posterior.

        + +

        Empirically, SWAG performs on par or better than popular alternatives including MC dropout, KFAC Laplace, and temperature scaling on uncertainty quantification, out-of-distribution detection, calibration and transfer learning in computer vision tasks. Code for SWAG is available here.

        + +

        Reinforcement Learning

        + +

        In another follow-up paper SWA was shown to improve the performance of policy gradient methods A2C and DDPG on several Atari games and MuJoCo environments.

Environment   | A2C            | A2C + SWA
Breakout      | 522 ± 34       | 703 ± 60
Qbert         | 18777 ± 778    | 21272 ± 655
SpaceInvaders | 7727 ± 1121    | 21676 ± 8897
Seaquest      | 1779 ± 4       | 1795 ± 4
CrazyClimber  | 147030 ± 10239 | 139752 ± 11618
BeamRider     | 9999 ± 402     | 11321 ± 1065
        + +

        Low Precision Training

        +

        We can filter through quantization noise by combining weights that have been rounded down with weights that have been rounded up. Moreover, by averaging weights to find a flat region of the loss surface, large perturbations of the weights will not affect the quality of the solution (Figures 7 and 8). Recent work shows that by adapting SWA to the low precision setting, in a method called SWALP, one can match the performance of full-precision SGD even with all training in 8 bits [5]. This is quite a practically important result, given that (1) SGD training in 8 bits performs notably worse than full precision SGD, and (2) low precision training is significantly harder than predictions in low precision after training (the usual setting). For example, a ResNet-164 trained on CIFAR-100 with float (16-bit) SGD achieves 22.2% error, while 8-bit SGD achieves 24.0% error. By contrast, SWALP with 8 bit training achieves 21.8% error.

        +
        + +
        + +

        Figure 7. Quantizing in a flat region can still provide solutions with low loss.

        + +
        + +
        + +

        Figure 8. Low precision SGD training (with a modified learning rate schedule) and SWALP.

        + +

        Conclusion

        + +

        One of the greatest open questions in deep learning is why SGD manages to find good solutions, given that the training objectives are highly multimodal, and there are in principle many settings of parameters that achieve no training loss but poor generalization. By understanding geometric features such as flatness, which relate to generalization, we can begin to resolve these questions and build optimizers that provide even better generalization, and many other useful features, such as uncertainty representation. We have presented SWA, a simple drop-in replacement for standard SGD, which can in principle benefit anyone training a deep neural network. SWA has been demonstrated to have strong performance in a number of areas, including computer vision, semi-supervised learning, reinforcement learning, uncertainty representation, calibration, Bayesian model averaging, and low precision training.

        + +

We encourage you to try out SWA! Using SWA is now as easy as using any other optimizer in PyTorch. And even if you have already trained your model with SGD (or any other optimizer), it's very easy to realize the benefits of SWA by running SWA for a small number of epochs starting from the pre-trained model.

        + +
• [1] Averaging Weights Leads to Wider Optima and Better Generalization; Pavel Izmailov, Dmitry Podoprikhin, Timur Garipov, Dmitry Vetrov, Andrew Gordon Wilson; Uncertainty in Artificial Intelligence (UAI), 2018.
• [2] There Are Many Consistent Explanations of Unlabeled Data: Why You Should Average; Ben Athiwaratkun, Marc Finzi, Pavel Izmailov, Andrew Gordon Wilson; International Conference on Learning Representations (ICLR), 2019.
• [3] Improving Stability in Deep Reinforcement Learning with Weight Averaging; Evgenii Nikishin, Pavel Izmailov, Ben Athiwaratkun, Dmitrii Podoprikhin, Timur Garipov, Pavel Shvechikov, Dmitry Vetrov, Andrew Gordon Wilson; UAI 2018 Workshop: Uncertainty in Deep Learning, 2018.
• [4] A Simple Baseline for Bayesian Uncertainty in Deep Learning; Wesley Maddox, Timur Garipov, Pavel Izmailov, Andrew Gordon Wilson; arXiv preprint, 2019: https://arxiv.org/abs/1902.02476.
• [5] SWALP: Stochastic Weight Averaging in Low Precision Training; Guandao Yang, Tianyi Zhang, Polina Kirichenko, Junwen Bai, Andrew Gordon Wilson, Christopher De Sa; International Conference on Machine Learning (ICML), 2019.
• [6] Efficient Estimations from a Slowly Convergent Robbins-Monro Process; David Ruppert; Technical report, Cornell University Operations Research and Industrial Engineering, 1988.
• [7] Acceleration of Stochastic Approximation by Averaging; Boris T. Polyak and Anatoli B. Juditsky; SIAM Journal on Control and Optimization, 30(4):838–855, 1992.
• [8] Loss Surfaces, Mode Connectivity, and Fast Ensembling of DNNs; Timur Garipov, Pavel Izmailov, Dmitrii Podoprikhin, Dmitry Vetrov, Andrew Gordon Wilson; Neural Information Processing Systems (NeurIPS), 2018.
        + +
        +
        +
        +
        + + +
        +
        +
        +
        +

        +
        +
        +
        + +
        + +
        + +
        +
        +
        +
        + + +
        +
        +
        + + +
diff --git a/blog/straggler-mitigation/index.html b/blog/straggler-mitigation/index.html
new file mode 100644
index 000000000000..c83a299b44c5
--- /dev/null
+++ b/blog/straggler-mitigation/index.html
@@ -0,0 +1,908 @@

Straggler Mitigation On PyTorch DDP By Hierarchical SGD | PyTorch
        +
        +
        +
        + +
        +
        + + +
        + + + + + + + + +
        + +
        +
        + + + +
        +
        +
        + +
        +

        + by + + Yi Wang (Cruise AI), Rohan Varma (Meta AI) + +

        +

PyTorch DDP has been widely adopted across the industry for distributed training, and by default it runs synchronous SGD to synchronize gradients across model replicas at every step. The performance of this technique is critical for fast iteration during model development and exploration as well as for resource and cost savings. To resolve a ubiquitous performance bottleneck introduced by slow nodes in large-scale training, Cruise and Meta co-developed a solution based on the Hierarchical SGD algorithm to significantly accelerate training in the presence of these stragglers.

        + +

        The Need For Straggler Mitigation

        + +

In a DDP setup, a straggler problem can occur when one or more processes run much slower (“stragglers”) than the other processes. When this happens, all the processes have to wait for the stragglers before synchronizing gradients and completing the communication, which essentially bottlenecks distributed performance to the slowest worker. As a result, even when training relatively small models, the communication cost can still be a major performance bottleneck.

        + +

        Potential Causes of Stragglers

        + +

        Severe straggler issues are usually caused by workload imbalance before synchronization, and many factors can contribute to this imbalance. For instance, some data loader workers in the distributed environment can become stragglers, because some input examples can be outliers in terms of the data size, or the data transfer of some examples can be drastically slowed down due to unstable network I/O, or the on-the-fly data transformation costs can have a high variance.

        + +

        Besides data loading, other phases before gradient synchronization can also cause stragglers, such as unbalanced workloads of embedding table lookup during the forward pass in recommendation systems.

        + +

        The Appearance of Stragglers

        + +

        If we profile DDP training jobs that have stragglers, we can find that some processes may have much higher gradient synchronization costs (a.k.a., allreducing gradients) than other processes at a certain step. As a result, the distributed performance can be dominated by the communication cost even if the model size is very small. In this case, some processes run faster than the straggler(s) at a step, and hence they have to wait for the stragglers and spend a much longer time on allreduce.

        + +

Below are screenshots of two trace files output by the PyTorch profiler for one use case. Each screenshot profiles 3 steps.

        +
• The first screenshot shows that a process has a very high allreduce cost in both the first and the third steps, because this process reaches the synchronization phase earlier than the straggler(s), and it spends more time on waiting. On the other hand, the allreduce cost is relatively small in the second step; this suggests that 1) there is no straggler at this step, or 2) this process is the straggler among all the processes, so it does not need to wait for any other process.
        + +

        chart showing allreduce cost

        + +

        Both the 1st and the 3rd Steps Are Slowed Down by Stragglers

        + +
• The second screenshot shows a normal case without stragglers. In this case, all the gradient synchronizations are relatively short.
        + +

        chart showing normal case without stragglers

        + +

        Normal Case Without Stragglers

        + +

        Hierarchical SGD in PyTorch

        + +

Recently, hierarchical SGD has been proposed to optimize communication costs, mainly by reducing the total amount of data transferred in large-scale distributed training, and multiple convergence analyses have been provided (example). As a main novelty of this post, at Cruise we leveraged hierarchical SGD to mitigate stragglers, which can also occur when training relatively small models. Our implementation was upstreamed by Cruise to PyTorch in early 2022.

        + +

        How Does Hierarchical SGD Work?

        + +

        As the name implies, hierarchical SGD organizes all the processes into groups at different levels as a hierarchy, and runs synchronization by following the rules below:

        + +
• All the groups at the same level have the same number of processes, and the processes in these groups synchronize at the same frequency concurrently, where the synchronization period is pre-defined by the user.
• The higher the level of a group, the larger the synchronization period used, as the synchronization becomes more expensive.
• When multiple overlapping groups are supposed to synchronize according to their periods, to reduce redundant synchronization and avoid data races across groups, only the highest-level group runs synchronization.
        + +

        The following figure illustrates an example of 4-level hierarchy SGD among 16 processes on 8 machines, each of which has 2 GPUs:

        + +
1. Level 1: Each process runs mini-batch SGD locally;
2. Level 2: Each 4-process group across 2 machines runs synchronization every 2 steps;
3. Level 3: Each 8-process group across 4 machines runs synchronization every 4 steps;
4. Level 4: The global process group of all 16 processes over 8 machines runs synchronization every 8 steps.
        + +

In particular, when the step number is divisible by 8, only the global synchronization at level 4 is executed, and when the step number is divisible by 4 but not by 8, only the level 3 synchronization is executed.

        + +

        An example of 4-level hierarchy SGD among 16 processes on 8 machines, each of which has 2 GPUs

        + +

Intuitively, hierarchical SGD can be viewed as an extension of local SGD, which only has a two-level hierarchy: every process runs mini-batch SGD locally and then synchronizes globally at a certain frequency. This also helps explain why, just like local SGD, hierarchical SGD synchronizes model parameters instead of gradients; otherwise, gradient descent would be mathematically incorrect when the synchronization period is greater than 1.

        + +

        Why Can Hierarchical SGD Mitigate Stragglers?

        + +

        The key insight here is that, when there is a random straggler, it only directly slows down a relatively small group of processes instead of all the processes. Next time another random straggler is very likely to slow down a different small group, and hence a hierarchy can help smooth out the straggler effect.

        + +

The example below assumes that there is one random straggler among a total of 8 processes at every step. After 4 steps, vanilla DDP that runs synchronous SGD will have been slowed down by a straggler 4 times, because it runs global synchronization at every step. In contrast, hierarchical SGD runs synchronization within groups of 4 processes after the first two steps, and then a global synchronization after another two steps. We can see that both the first two and the last two stragglers have a large overlap, and hence the performance loss can be mitigated.

        + +

        flow diagram

        + +

Essentially, the mitigation effect of this hierarchical SGD example lies between that of local SGD with a synchronization period of 2 steps and that of local SGD with a period of 4 steps. The main advantage of hierarchical SGD over local SGD is better convergence efficiency at the same global synchronization frequency, because hierarchical SGD allows more low-level synchronization. Moreover, hierarchical SGD can achieve model parity with a global synchronization frequency lower than local SGD requires, leading to higher training performance, especially in large-scale distributed training.

        + +

        Ease of Use

        + +

Straggler mitigation is not a novel problem in distributed training. Multiple approaches have been proposed, such as gossip SGD, data encoding, and gradient coding, as well as some designed particularly for the parameter-server architecture, including backup workers and stale synchronous parallel. However, to the best of our knowledge, before this effort we had not found a good open-source PyTorch implementation of straggler mitigation that could work like a plugin to our training system at Cruise. In contrast, our implementation requires only minimal changes: no need to modify the existing code or tune any existing hyperparameters. This is a very appealing advantage for industry users.

        + +

As the code example below shows, only a few lines need to be added to the setup of the DDP model, and the training loop code can remain untouched. As explained previously, hierarchical SGD is an extended form of local SGD, so its enablement is quite similar to local SGD (see the PyTorch docs of PostLocalSGDOptimizer):

        + +
1. Register a post-local SGD communication hook to run a warmup stage of fully synchronous SGD and defer hierarchical SGD.
2. Create a post-local SGD optimizer that wraps an existing local optimizer and a hierarchical SGD configuration.
        + +
        import torch.distributed.algorithms.model_averaging.hierarchical_model_averager as hierarchicalSGD
        +from torch.distributed.algorithms.ddp_comm_hooks.post_localSGD_hook import (
        +    PostLocalSGDState,
        +    post_localSGD_hook,
        +)
        +from torch.distributed.optim import PostLocalSGDOptimizer
        +
        +ddp_model = nn.parallel.DistributedDataParallel(
        +    module=model,
        +    device_ids=[rank],
        +)
        +
        +# Register a post-local SGD communication hook for the warmup.
        +subgroup, _ = torch.distributed.new_subgroups()
        +state = PostLocalSGDState(subgroup=subgroup, start_localSGD_iter=1_000)
        +ddp_model.register_comm_hook(state, post_localSGD_hook)
        +
        +# Wraps the existing (local) optimizer to run hierarchical model averaging.
        +optim = PostLocalSGDOptimizer(
        +  optim=optim,
        +  averager=hierarchicalSGD.HierarchicalModelAverager(
        +    # The config runs a 4-level hierarchy SGD among 128 processes:
        +    # 1) Each process runs mini-batch SGD locally;
        +    # 2) Each 8-process group synchronize every 2 steps;
        +    # 3) Each 32-process group synchronize every 4 steps;
        +    # 4) All 128 processes synchronize every 8 steps.
        +    period_group_size_dict=OrderedDict([(2, 8), (4, 32), (8, 128)]),
        +    # Do not run hierarchical SGD until 1K steps for model parity.
        +    warmup_steps=1_000)
        +)
        +
        + +

        Algorithm Hyperparameters

        + +

        Hierarchical SGD has two major hyperparameters: period_group_size_dict and warmup_steps.

        + +
• period_group_size_dict is an ordered dictionary mapping from synchronization period to process group size, used for initializing process groups of different sizes in a hierarchy to synchronize parameters concurrently. A larger group is expected to use a larger synchronization period (see the example after this list).
• warmup_steps specifies a number of steps as the warmup stage to run synchronous SGD before hierarchical SGD. Similar to the post-local SGD algorithm, a warmup stage is usually recommended to achieve a higher accuracy. The value should be the same as the start_localSGD_iter arg used in PostLocalSGDState when post_localSGD_hook is registered. Typically the warmup stage should at least cover the beginning of training when the loss decreases drastically.
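For example, the 4-level hierarchy from the earlier figure (16 processes on 8 machines, synchronizing every 2, 4, and 8 steps at group sizes 4, 8, and 16) would be expressed roughly as:

from collections import OrderedDict

# Level 1 (local mini-batch SGD) is implicit; the dict lists the higher levels:
# 4-process groups sync every 2 steps, 8-process groups every 4 steps,
# and the global 16-process group every 8 steps.
period_group_size_dict = OrderedDict([(2, 4), (4, 8), (8, 16)])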
        + +

A subtle difference between the PyTorch implementation and the initial design proposed by the relevant papers is that, after the warmup stage, by default the processes within each host still run intra-host gradient synchronization at every step. This is because:

        + +
1. The intra-host communication is relatively cheap, and it can usually significantly accelerate the convergence;
2. The intra-host group (of size 4 or 8 for most industry users) can usually be a good choice of the smallest group of processes that synchronize most frequently in hierarchical SGD. If the synchronization period is 1, then gradient synchronization is faster than model parameter synchronization (a.k.a., model averaging), because DDP automatically overlaps gradient synchronization and the backward pass.
        + +

Such intra-host gradient synchronization can be disabled by unsetting the post_local_gradient_allreduce arg in PostLocalSGDState, as sketched below.
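Relative to the earlier snippet (same imports, subgroup, and ddp_model), that amounts to something like the following sketch:

# Disable the per-step intra-host gradient allreduce after the warmup stage.
state = PostLocalSGDState(
    subgroup=subgroup,
    start_localSGD_iter=1_000,
    post_local_gradient_allreduce=False,
)
ddp_model.register_comm_hook(state, post_localSGD_hook)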

        + +

        Demonstration

        + +

        Now we demonstrate that hierarchical SGD can accelerate distributed training by mitigating stragglers.

        + +

        Experimental Setup

        + +

        We compared the performance of hierarchical SGD against local SGD and synchronous SGD on ResNet18 (model size: 45MB). Since the model is so small, the training is not bottlenecked by data transfer cost during synchronization. To avoid the noises incurred by data loading from remote storage, the input data was randomly simulated from memory. We varied the number of GPUs used by training from 64 to 256. The batch size per worker is 32, and the number of iterations of training is 1,000. Since we don’t evaluate convergence efficiency in this set of experiments, warmup is not enabled.

        + +

We also emulated stragglers at a rate of 1% on 128 and 256 GPUs, and 2% on 64 GPUs, to make sure there is at least one straggler at every step on average. These stragglers randomly appear on different CUDA devices. Each straggler stalls for 1 second in addition to the normal per-step training time (~55ms in our setup). This can be perceived as a practical scenario where 1% or 2% of input examples are outliers in terms of the data pre-processing cost (I/O and/or data transformation on the fly) during training, and such cost is 20X+ larger than the average.

        + +

        The code snippet below shows how a straggler can be emulated in the training loop. We applied it to a ResNet model, and it can be easily applied to the other models as well.

        + +
             loss = loss_fn(y_pred, y)
        +     # Emulate a straggler that lags for 1 second at a rate of 1%.
        +     if random.randint(1, 100) == 1:
        +         time.sleep(1)
        +     loss.backward()
        +     optimizer.step()
        +
        + +

        The experiments are conducted on us-central1 GCP cluster. Each machine has 4 NVIDIA Tesla T4 GPUs with 16 GB memory per GPU, connected through a 32 Gbit/s ethernet network. Each instance also features 96 vCPUs, 360 GB RAM.

Architecture       | ResNet18 (45MB)
Workers            | 64, 128, 256
Backend            | NCCL
GPU                | Tesla T4, 16 GB memory
Batch size         | 32 x # of workers
Straggler Duration | 1 sec
Straggler Rate     | 1% on 128 and 256 GPUs, 2% on 64 GPUs
        + +

        We used multiple configurations for both local SGD and hierarchical SGD. Local SGD runs global synchronization every 2, 4, and 8 steps, respectively.

        + +

        We ran hierarchical SGD with the following configurations:

        + +
        1. On 64 GPUs:
           a. Each 8-process group, 32-process group, and the global 64-process group synchronizes every 2, 4, and 8 steps, respectively. Denoted as “HSGD 2-8,4-32,8-64”.
           b. Each 32-process group and the global 64-process group synchronizes every 4 and 8 steps, respectively. Denoted as “HSGD 4-32,8-64”.
        2. On 128 GPUs:
           a. Each 8-process group, 32-process group, and the global 128-process group synchronizes every 2, 4, and 8 steps, respectively. Denoted as “HSGD 2-8,4-32,8-128”.
           b. Each 32-process group and the global 128-process group synchronizes every 4 and 8 steps, respectively. Denoted as “HSGD 4-32,8-128”.
        3. On 256 GPUs:
           a. Each 4-process group, 16-process group, 64-process group, and the global 256-process group synchronizes every 1, 2, 4, and 8 steps, respectively. Denoted as “HSGD 1-4,2-16,4-64,8-256”.
           b. Each 8-process group, 64-process group, and the global 256-process group synchronizes every 2, 4, and 8 steps, respectively. Denoted as “HSGD 2-8,4-64,8-256”.
           c. Each 16-process group and the global 256-process group synchronizes every 4 and 8 steps, respectively. Denoted as “HSGD 4-16,8-256”.
        + +
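        To make the notation concrete, here is a sketch (not code taken from the experiments) of how a configuration like “HSGD 2-8,4-32,8-64” might be expressed with PyTorch’s HierarchicalModelAverager; the module path and arguments are assumed to match the torch.distributed.algorithms.model_averaging API available in your PyTorch version.

        from collections import OrderedDict

        import torch.distributed.algorithms.model_averaging.hierarchical_model_averager as hma

        # Synchronize every 2 steps within 8-process groups, every 4 steps within
        # 32-process groups, and every 8 steps across the global 64-process group.
        averager = hma.HierarchicalModelAverager(
            period_group_size_dict=OrderedDict([(2, 8), (4, 32), (8, 64)]),
            warmup_steps=0,  # warmup disabled, as in the experiments above
        )

        # In the training loop, after optimizer.step():
        #     averager.average_parameters(model.parameters())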

        Experimental Results

        + +

        The figures below show the speedups of different communication schemes against the baseline of synchronous SGD, with the emulated stragglers. We can make the following observations:

        + +
        1. As expected, both hierarchical SGD and local SGD achieve a higher speedup with a lower synchronization frequency.
        2. The speedups of the hierarchical SGD schemes are 2.08X-2.45X on 64 GPUs, 2.57X-2.68X on 128 GPUs, and 2.63X-3.25X on 256 GPUs, respectively. This shows that hierarchical SGD can significantly mitigate stragglers, and such mitigation can be more effective at a larger scale.
        3. The performance of local SGD with synchronization periods of 2 steps and 8 steps can be perceived as the lower bound and upper bound, respectively, of the experimented hierarchical SGD schemes. This is because the hierarchical SGD schemes synchronize less frequently than every 2 steps globally, while their low-level synchronizations within small groups add extra overhead compared with global synchronization every 8 steps.
        + +

        Overall, hierarchical SGD can provide a finer-grained trade-off between communication cost and model quality than local SGD. Therefore, when local SGD at a relatively large synchronization period like 8 or 4 cannot give satisfactory convergence efficiency, hierarchical SGD has a much better chance of achieving both a good speedup and model parity.

        + +

        Since only simulated data is used in the experiments, we did not demonstrate model parity here. In practice, model parity can be achieved in two ways:

        1. Tuning the hyperparameters, including both the hierarchy and the warmup steps;
        2. For some cases, hierarchical SGD could lead to a slightly lower quality than the original model for the same number of training steps (i.e., a lower convergence rate), but with a speedup like 2X+ per training step, it is still possible to achieve model parity with more steps and still less total training time.

        Speedups on 64 GPUs

        + +

        Speedups on 128 GPUs

        + +

        Speedups on 256 GPUs

        + +

        Limitations

        + +

        Before applying hierarchical SGD to straggler mitigation, the user should be aware of a few limitations of this approach:

        + +
        1. This approach can only mitigate non-persistent stragglers, which occur to different workers at different times. For persistent stragglers, which can be caused by hardware degradation or a network issue on a specific host, the same low-level subgroup is slowed down every time, leading to nearly no straggler mitigation.
        2. This approach can only mitigate low-frequency stragglers. E.g., if 30% of workers can randomly become stragglers at every step, then most low-level synchronizations will still be slowed down by stragglers. As a result, hierarchical SGD may not show an obvious performance advantage over synchronous SGD.
        3. Since hierarchical SGD applies model averaging, which does not overlap with the backward pass the way gradient averaging in vanilla DDP does, its performance gain from straggler mitigation must outweigh the performance loss of not overlapping communication with the backward pass. Therefore, if stragglers only slow down training by less than 10%, hierarchical SGD may not be able to bring much speedup. This limitation can be addressed by overlapping the optimizer step and the backward pass in the future.
        4. Since hierarchical SGD is less well-studied than local SGD, there is no guarantee that hierarchical SGD with a finer-grained synchronization granularity can converge faster than certain advanced forms of local SGD, such as SlowMo, which can improve convergence efficiency with slow momentum. However, to the best of our knowledge, these advanced algorithms cannot yet be natively supported as a PyTorch DDP plugin the way hierarchical SGD can.
        + +

        Acknowledgements

        + +

        We would like to thank Cruise teammates Bo Tian, Sergei Vorobev, Eugene Selivonchyk, Tsugn-Hsien Lee, Dan Ring, Ian Ackerman, Lei Chen, Maegan Chew, Viet Anh To, Xiaohui Long, Zeyu Chen, Alexander Sidorov, Igor Tsvetkov, Xin Hu, Manav Kataria, Marina Rubtsova, and Mohamed Fawzy, as well as Meta teammates Shen Li, Yanli Zhao, Suraj Subramanian, Hamid Shojanzeri, Anjali Sridhar and Bernard Nguyen for the support.

        diff --git a/blog/submit-to-speak/index.html b/blog/submit-to-speak/index.html new file mode 100644 index 000000000000..9cf088b5e57f --- /dev/null +++ b/blog/submit-to-speak/index.html @@ -0,0 +1,716 @@
        📣 Submit to Speak at PyTorch Conference + Save on Registration | PyTorch

        by Team PyTorch

        Step into the Future of AI at PyTorch Conference 2025.

        + +

        banner ad for conference

        + +

        The Call for Proposals for PyTorch Conference 2025 is officially open!

        + +

        Join us in San Francisco from October 22–23, 2025, to showcase your expertise and innovations with PyTorch—the industry-leading, open-source machine learning framework powering innovations from bare-metal infrastructure to sophisticated application and agent layers. This is your opportunity to share insights, breakthroughs, and case studies with a global audience of AI and Generative AI practitioners, researchers, and developers.

        + +

        people watching presentation at conference

        + +

        Submit your proposals and prepare to engage, learn, and network alongside some of the brightest minds in the AI/ML community. We’re seeking sessions, Birds of a Feather discussions, lightning talks, and poster sessions on the following topics:

        + +
        • Core PyTorch Framework
        • PyTorch on Accelerator Hardware
        • PyTorch Ecosystem and Tools
        • AI Applications and Use Cases
        • AI in Research and Academia
        • AI in Industry and Enterprise Applications
        • AI Infrastructure and Scalability
        • Ethical AI, Governance, and Regulation
        • Training, Fine-Tuning, and Alignment
        • Inference, Deployment, and Serving
        • Performance Measurement and Benchmarking
        • Data Engineering and Management for AI
        • Generative AI and Large Language Models (LLMs)
        • Model Optimization and Efficiency
        • Open Source Collaboration, Education and Community Building
        • Edge AI and On-Device
        • DL Compilers and Kernel Authoring
        + +
        +

        Learn more and submit your talk by Sunday, June 1, at 11:59 PDT!

        + + SUBMIT TO SPEAK + +
        + +
        + +

        people arriving at conference

        + +

        Save up to USD$500 with Super Early Bird Pricing!

        + +
        • Reserve your pass by 11:59 PM PDT on March 21 and score Super Early Bird pricing for just USD$499. That’s a savings of up to USD$500!
        • Student or faculty? Learn more about our discounted academic rate.
        • Need help covering travel costs? We offer discretionary travel funding for those community members who would otherwise not be able to attend. Learn more.
        + + + +
        + +

        Become a Sponsor at PyTorch Conference 2025!

        + +

        Seize your opportunity to influence the future of Generative AI and Machine Learning by sponsoring PyTorch Conference 2025. PyTorch is at the forefront of innovation—empowering rapid experimentation, flexible model development, and efficient deployment into production environments with its powerful, versatile ecosystem of tools and thriving community of dedicated users.

        + +

        As a sponsor, you’ll gain more than visibility; you’ll strategically position your organization at the heart of a vibrant, global AI/ML ecosystem. Connect directly with 3,000+ expert attendees, researchers, engineers, and decision-makers, and actively shape the conversations driving the next generation of AI advancements.

        + + + +

        For more details on CFP submissions, registration, and sponsorship, visit the PyTorch Conference Website.

        diff --git a/blog/tac-elects-new-leadership/index.html b/blog/tac-elects-new-leadership/index.html new file mode 100644 index 000000000000..a414ee5ca8e5 --- /dev/null +++ b/blog/tac-elects-new-leadership/index.html @@ -0,0 +1,742 @@
        PyTorch Foundation Technical Advisory Council Elects New Leadership | PyTorch

        by Team PyTorch

        We are pleased to announce the first-ever Chair and Vice Chair of the PyTorch Foundation’s Technical Advisory Council (TAC): Luca Antiga as the Chair and Jiong Gong as Vice Chair. Both leaders bring extensive experience and deep commitment to the PyTorch community, and they are set to guide the TAC in its mission to foster an open, diverse, and innovative PyTorch technical community.

        + +

        Meet the New Leadership

        + +

        Luca Antiga

        + +

        Luca Antiga has been the CTO at Lightning AI since 2022. He is an early contributor to PyTorch core and co-authored “Deep Learning with PyTorch” (published by Manning). He started his journey as a researcher in Bioengineering, and later co-founded Orobix, a company focused on building and deploying AI in production settings.

        + +

        “I am looking forward to taking on the role of the chair of the PyTorch TAC,” says Luca. “As the TAC chair, I will ensure effective, timely topic selection and enhance visibility of technical needs from the board members and from the ecosystem at large. I will strive for directional, cohesive messaging throughout the transition of PyTorch from Meta to the Linux Foundation.”

        + +

        Jiong Gong

        + +

        Jiong Gong is a Principal Engineer and SW Architect for PyTorch Optimization from Intel. He serves as one of the PyTorch CPU module maintainers and is an active contributor to the TorchInductor CPU backend.

        + +

        “I plan to further strengthen the collaboration between PyTorch developers and hardware vendors, promoting innovation and performance optimization across various hardware platforms, enhancing PyTorch ecosystem and streamlining the decision-making process,” says Jiong. “I am honored to serve as the vice chair of the TAC.”

        + +

        What Does the TAC Do?

        + +

        The PyTorch Foundation’s TAC provides a forum for technical communication, leadership, and collaboration for the PyTorch Foundation. The committee members are members of the PyTorch Foundation. The committee holds open meetings once a month that anyone in the community can attend. The committee provides thought leadership on technical topics, knowledge sharing, and a forum to discuss issues with other technical experts in the community.

        + +

        New TAC Webpage

        + +

        Stay connected with the PyTorch Foundation’s Technical Advisory Council (TAC) by visiting our new TAC webpage. Here you can find the TAC members, where to view upcoming meeting agendas, access presentations, attend public meetings, watch meeting recordings and participate in discussions on key technical topics.

        + +

        Plus stay tuned on our blog for regular updates from the PyTorch Foundation TAC leadership.

        diff --git a/blog/tensor-comprehensions/index.html b/blog/tensor-comprehensions/index.html new file mode 100644 index 000000000000..63825ddd02eb --- /dev/null +++ b/blog/tensor-comprehensions/index.html @@ -0,0 +1,834 @@
        Tensor Comprehensions in PyTorch | PyTorch
        March 05, 2018

        Tensor Comprehensions in PyTorch

        by Priya Goyal (FAIR), Nicolas Vasilache (FAIR), Oleksandr Zinenko (Inria & DI ENS), Theodoros Theodoridis (ETH Zürich), Zachary DeVito (FAIR), William S. Moses (MIT CSAIL), Sven Verdoolaege (FAIR), Andrew Adams (FAIR), Albert Cohen (Inria & DI ENS & FAIR)

        Tensor Comprehensions (TC) is a tool that lowers the barrier for writing high-performance code. It generates GPU code from a simple high-level language and autotunes the code for specific input sizes.

        + +

        We highly recommend reading the Tensor Comprehensions blogpost first.

        + +

        If you ran into any of the following scenarios, TC is a useful tool for you.

        + +
        • Your PyTorch layer is large and slow, and you contemplated writing dedicated C++ or CUDA code for it, but you don’t know how to program in CUDA or write low-level code.

        • You wrote a CUDA layer, but it took a week to write, debug, and optimize for speed. You wished you could do this in an hour.

        • You want to fuse multiple layers like Conv-ReLU-BatchNorm or Linear-ReLU-Linear-ReLU in your network for speed, but doing so was quite difficult to get right.

        • Your research involves weird Tensor shapes that CuDNN and MKL are not optimized for. For example, you do convolutions of 13 x 24 with an input image of 143 x 55. You tried running it with CuDNN and it was slower than you wished.

        • Your code is slowed down by transposing Tensors constantly to fit a particular memory layout. You wish it was easy to write custom code that operates efficiently on your input layout.
        + +

        Tensor Comprehensions are seamless to use in PyTorch, interoperating with PyTorch Tensors and nn Variables.

        + +

        Let us run through using TC with PyTorch.

        + +

        1. Install the package

        + +
        conda install -c pytorch -c tensorcomp tensor_comprehensions
        +
        + +

        At this time we only provide Linux-64 binaries which have been tested on Ubuntu 16.04 and CentOS7.

        + +

        TC depends on heavyweight C++ projects such as Halide, Tapir-LLVM and ISL. Hence, we rely on Anaconda to distribute these dependencies reliably. For the same reason, TC is not available via PyPI.

        + +

        2. Import the python package

        + +
        import tensor_comprehensions as tc
        +
        + +

        3. Define the TC expression and create a python function

        + +
        lang = """
        +def fcrelu(float(B,M) I, float(N,M) W1, float(N) B1) -> (O1) {
        +    O1(b, n) +=! I(b, m) * W1(n, m)
        +    O1(b, n) = O1(b, n) + B1(n)
        +    O1(b, n) = fmax(O1(b, n), 0)
        +}
        +"""
        +fcrelu = tc.define(lang, name="fcrelu")
        +
        + +

        This fcrelu function takes PyTorch Tensors as input and returns a PyTorch Tensor. It takes input I, weight W1, bias B1 and returns output O1.

        + +

        4. Let’s create some dummy input tensors

        + +
        B, M, N = 100, 128, 100
        +I, W1, B1 = torch.randn(B, M).cuda(), torch.randn(N, M).cuda(), torch.randn(N).cuda()
        +
        + +

        5. Now autotune the function for your input sizes

        + +
        fcrelu.autotune(I, W1, B1, cache="fcrelu_100_128_100.tc")
        +
        + +

        The autotuner is your biggest friend. You generally do not want to use a tc function without autotuning it first.

        + +

        When the autotuning is running, the current best performance is displayed. If you are satisfied with the current result or you are out of time, stop the tuning procedure by pressing Ctrl+C.

        + +

        cache saves the results of the autotuned kernel search to the file fcrelu_100_128_100.tc. The next time you call the same line of code, it loads the results of the autotuning without recomputing them.

        + +

        The autotuner has a few hyperparameters (just like your ConvNet has learning rate, number of layers, etc.). We pick reasonable defaults, but you can read about using advanced options here.

        + +

        6. Call the function with the inputs, to get your result

        + +
        out = fcrelu(I, W1, B1)
        +
        + +

        Now, let’s look at how to write TC expressions.

        + +

        A quick primer on the TC language

        + +

        The TC notation focuses on the mathematical nature of the layer, leaving performance considerations to its backend code, which uses Halide and polyhedral compilation techniques that accumulate decades of cutting-edge Loop Nest Optimization (LNO) research.

        + +

        TC is close to np.einsum. We shall quickly learn TC by example.

        + +
        lang = """
        +def matmul(float(M,N) A, float(N,K) B) -> (output) {
        +  output(i, j) +=! A(i, kk) * B(kk, j)
        +}
        +"""
        +
        + +

        In this example, we define a function matmul which takes two inputs A and B of shapes M x N and N x K and returns a single output. The shape of output is automatically inferred by the TC language (discussed below).

        + +

        Let’s look at this line:

        + +
        output(i, j) +=! A(i, kk) * B(kk, j)
        +
        + +

        It says:

        + +
        • output(i, j) means output is 2D.
        • for each location output(i, j), we add (+=) A(i, kk) * B(kk, j).
        • i is well-defined as all locations in A dim=0, i.e. i in range(0, M)
        • j is well-defined as all locations in B dim=1, i.e. j in range(0, K)
        • kk is inferred as all locations from 0 to N
        + +

        The shape of output is inferred from the maximum values i and j can take, which are M and K, so output is of size M x K.

        + +

        The ! symbol initializes output with 0.0. It is equivalent to:

        + +
        output(i, j) = 0
        +output(i, j) += A(i, kk) * B(kk, j)
        +
        + +
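        Following the same pattern as the fcrelu example above, this definition can be turned into a callable and applied to CUDA tensors; the shapes below are arbitrary illustration values, not ones prescribed by the library.

        import torch
        import tensor_comprehensions as tc

        matmul = tc.define(lang, name="matmul")
        A, B = torch.randn(32, 64).cuda(), torch.randn(64, 128).cuda()
        # For best performance, autotune first, e.g.: matmul.autotune(A, B)
        out = matmul(A, B)  # output shape is inferred as 32 x 128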

        Scalar inputs and range constraints: implement AvgPool2d

        + +
        """
        +
        +def avgpool(float(B, C, H, W) input) -> (output) {{
        +  output(b, c, h, w) += input(b, c, h * {sH} + kh, w * {sW} + kw) where kh in 0:{kH}, kw in 0:{kW}
        +}}
        +
        +"""
        +avgpool = tc.define(LANG, name="avgpool", constants={"sH":1, "sW":1, "kH":2, "kW":2})
        +
        + +

        Here the where keyword can take ranges of values to operate on. 0:{kH} is equivalent to range(kH) in Python.
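        As with fcrelu earlier, the defined avgpool can then be called directly on a CUDA tensor; the input shape here is just an illustrative choice.

        inp = torch.ones(32, 3, 10, 10).cuda()
        out = avgpool(inp)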

        + +

        Note: the syntax for passing in scalars is subject to change in the next release.

        + +

        torch.nn layers

        + +

        We added some sugar-coating around the basic PyTorch integration of TC to make it easy to integrate TC into larger torch.nn models by defining the forward and backward TC expressions and taking Variable inputs / outputs.

        + +

        Some essentials that you will miss (we’re working on them)

        + +

        Autotuning for variable-length sequences

        + +

        The TC auto-tuner requires all input sizes to be specified beforehand. For example, if you have an input I1 which is an image batch, the autotuner wants to know the exact shape of I1 to generate an optimized kernel. You cannot specify: image with height between 200 and 300. This is more essential for sequence data such as NLP, where each sentence can have a different length.

        + +

        The reason the autotuner is non-parametric is that auto-tuning parametric constraints is much harder; this is active research. Hence, for the first release, we made a conscious decision to give you the tool in a form where we know it works well.

        + +

        As a work-around, if you know that you have a few specific shapes of interest, you can run the autotuner with these multiple shapes.

        + +
        relu = tc.define(LANG, name="relu")
        +batch, channels = 16, 3
        +tc.autotune((batch, channels, 32, 32)) # image of size 32 x 32
        +tc.autotune((batch, channels, 48, 48)) # image of size 48 x 48
        +tc.autotune((batch, channels, 64, 64)) # image of size 64 x 64
        +
        + +

        Now the autotuner is tuned for these three specific image sizes 32x32, 48x48 and 64x64.

        + +

        Lack of loops

        + +

        If you want to write an RNN, it’s easy to see it as a for loop over time. However, the TC language does not have loops yet. If you really want to write RNNs, you can write unrolled loops.

        + +

        Strided-Tensors

        + +

        The TC backend does not support non-contiguous Tensors yet. If the inputs you give are not contiguous, they are made contiguous before passing to the TC backend.

        + +

        Reshaping Tensors within a TC expression

        + +

        You cannot write this operation in TC: torch.matmul(...).view(...).mean(...). Whenever there is a need for a view to change the shape of an input, you have to get the output and view it at the PyTorch level.

        + +

        Getting Started

        + +
        • Walk through the Tutorial to quickly get started with understanding and using the Tensor Comprehensions PyTorch package.
        • Over 20 examples of various ML layers with TC, including avgpool, maxpool, matmul, matmul - give output buffers and batch-matmul, convolution, strided-convolution, batchnorm, copy, cosine similarity, Linear, Linear + ReLU, group-convolutions, strided group-convolutions, indexing, Embedding (lookup table), small-mobilenet, softmax, tensordot, transpose.
        • Detailed docs on Tensor Comprehensions and integration with PyTorch.
        + +

        Communication

        + +
        • Slack: For discussion around framework integration, build support, collaboration, etc., join our Slack channel.
        • Email: tensorcomp@fb.com
        • GitHub: bug reports, feature requests, install issues, RFCs, thoughts, etc.
        + +

        Acknowledgements

        + +

        We would like to thank Soumith Chintala, Edward Yang and Sam Gross for their immense guidance and help in making the integration API nice and smooth. We would also like to thank the rest of the PyTorch team and our pre-release users for their helpful feedback that guided us in making the integration better.

        diff --git a/blog/tensor-memory-format-matters/index.html b/blog/tensor-memory-format-matters/index.html new file mode 100644 index 000000000000..c8cd4b352a6a --- /dev/null +++ b/blog/tensor-memory-format-matters/index.html @@ -0,0 +1,978 @@
        Efficient PyTorch: Tensor Memory Format Matters | PyTorch
        by Dhruv Matani, Suraj Subramanian

        Ensuring the right memory format for your inputs can significantly impact the running time of your PyTorch vision models. When in doubt, choose a Channels Last memory format.

        + +

        When dealing with vision models in PyTorch that accept multimedia (for example image Tensors) as input, the Tensor’s memory format can significantly impact the inference execution speed of your model on mobile platforms when using the CPU backend along with XNNPACK. This holds true for training and inference on server platforms as well, but latency is particularly critical for mobile devices and users.

        + + + +

        Outline of this article

        +
        1. Deep Dive into matrix storage/memory representation in C++. Introduction to Row and Column major order.
        2. Impact of looping over a matrix in the same or different order as the storage representation, along with an example.
        3. Introduction to Cachegrind, a tool to inspect the cache friendliness of your code.
        4. Memory formats supported by PyTorch Operators.
        5. Best practices example to ensure efficient model execution with XNNPACK optimizations.
        + +

        Matrix Storage Representation in C++

        + +

        Images are fed into PyTorch ML models as multi-dimensional Tensors. These Tensors have specific memory formats. To understand this concept better, let’s take a look at how a 2-d matrix may be stored in memory.

        + +

        Broadly speaking, there are 2 main ways of efficiently storing multi-dimensional data in memory.

        +
        1. Row Major Order: In this format, the matrix is stored in row order, with each row stored before the next row in memory. I.e. row N comes before row N+1.
        2. Column Major Order: In this format, the matrix is stored in column order, with each column stored before the next column in memory. I.e. column N comes before column N+1.
        + +

        You can see the differences graphically below.

        + +

        C++ stores multi-dimensional data in row-major format.

        + +

        Efficiently accessing elements of a 2d matrix

        + +

        Similar to the storage format, there are 2 ways to access data in a 2d matrix.

        + +
        1. Loop Over Rows first: All elements of a row are processed before any element of the next row.
        2. Loop Over Columns first: All elements of a column are processed before any element of the next column.
        + +

        For maximum efficiency, one should always access data in the same format in which it is stored. I.e. if the data is stored in row-major order, then one should try to access it in that order.

        + +

        The code below (main.cpp) shows 2 ways of accessing all the elements of a 2d 4000x4000 matrix.

        + +
        #include <iostream>
        +#include <chrono>
        +
        +// loop1 accesses data in matrix 'a' in row major order,
        +// since i is the outer loop variable, and j is the
        +// inner loop variable.
        +int loop1(int a[4000][4000]) {
        + int s = 0;
        + for (int i = 0; i < 4000; ++i) {
        +   for (int j = 0; j < 4000; ++j) {
        +     s += a[i][j];
        +   }
        + }
        + return s;
        +}
        +
        +// loop2 accesses data in matrix 'a' in column major order
        +// since j is the outer loop variable, and i is the
        +// inner loop variable.
        +int loop2(int a[4000][4000]) {
        + int s = 0;
        + for (int j = 0; j < 4000; ++j) {
        +   for (int i = 0; i < 4000; ++i) {
        +     s += a[i][j];
        +   }
        + }
        + return s;
        +}
        +
        +int main() {
        + static int a[4000][4000] = {0};
        + for (int i = 0; i < 100; ++i) {
        +   int x = rand() % 4000;
        +   int y = rand() % 4000;
        +   a[x][y] = rand() % 1000;
        + }
        +
        + auto start = std::chrono::high_resolution_clock::now();
        + auto end = start;
        + int s = 0;
        +
        +#if defined RUN_LOOP1
        + start = std::chrono::high_resolution_clock::now();
        +
        + s = 0;
        + for (int i = 0; i < 10; ++i) {
        +   s += loop1(a);
        +   s = s % 100;
        + }
        + end = std::chrono::high_resolution_clock::now();
        +
        + std::cout << "s = " << s << std::endl;
        + std::cout << "Time for loop1: "
        +   << std::chrono::duration<double, std::milli>(end - start).count()
        +   << "ms" << std::endl;
        +#endif
        +
        +#if defined RUN_LOOP2
        + start = std::chrono::high_resolution_clock::now();
        + s = 0;
        + for (int i = 0; i < 10; ++i) {
        +   s += loop2(a);
        +   s = s % 100;
        + }
        + end = std::chrono::high_resolution_clock::now();
        +
        + std::cout << "s = " << s << std::endl;
        + std::cout << "Time for loop2: "
        +   << std::chrono::duration<double, std::milli>(end - start).count()
        +   << "ms" << std::endl;
        +#endif
        +}
        +
        +
        +Let's build and run this program and see what it prints.
        +
        +g++ -O2 main.cpp -DRUN_LOOP1 -DRUN_LOOP2
        +./a.out
        +
        +
        +Prints the following:
        +
        +s = 70
        +Time for loop1: 77.0687ms
        +s = 70
        +Time for loop2: 1219.49ms
        +
        + +

        loop1() is 15x faster than loop2(). Why is that? Let’s find out below!

        + +

        Measure cache misses using Cachegrind

        + +

        Cachegrind is a cache profiling tool used to see how many I1 (first level instruction), D1 (first level data), and LL (last level) cache misses your program caused.

        + +

        Let’s build our program with just loop1() and just loop2() to see how cache friendly each of these functions is.

        + +

        Build and run/profile just loop1()

        + +
        g++ -O2 main.cpp -DRUN_LOOP1
        +valgrind --tool=cachegrind ./a.out
        +
        + +

        Prints:

        + +
        ==3299700==
        +==3299700== I   refs:      643,156,721
        +==3299700== I1  misses:          2,077
        +==3299700== LLi misses:          2,021
        +==3299700== I1  miss rate:        0.00%
        +==3299700== LLi miss rate:        0.00%
        +==3299700==
        +==3299700== D   refs:      160,952,192  (160,695,444 rd   + 256,748 wr)
        +==3299700== D1  misses:     10,021,300  ( 10,018,723 rd   +   2,577 wr)
        +==3299700== LLd misses:     10,010,916  ( 10,009,147 rd   +   1,769 wr)
        +==3299700== D1  miss rate:         6.2% (        6.2%     +     1.0%  )
        +==3299700== LLd miss rate:         6.2% (        6.2%     +     0.7%  )
        +==3299700==
        +==3299700== LL refs:        10,023,377  ( 10,020,800 rd   +   2,577 wr)
        +==3299700== LL misses:      10,012,937  ( 10,011,168 rd   +   1,769 wr)
        +==3299700== LL miss rate:          1.2% (        1.2%     +     0.7%  )
        +
        + +

        Build and run/profile just loop2()

        + +
        g++ -O2 main.cpp -DRUN_LOOP2
        +valgrind --tool=cachegrind ./a.out
        +
        + +

        Prints:

        + +
        ==3300389==
        +==3300389== I   refs:      643,156,726
        +==3300389== I1  misses:          2,075
        +==3300389== LLi misses:          2,018
        +==3300389== I1  miss rate:        0.00%
        +==3300389== LLi miss rate:        0.00%
        +==3300389==
        +==3300389== D   refs:      160,952,196  (160,695,447 rd   + 256,749 wr)
        +==3300389== D1  misses:    160,021,290  (160,018,713 rd   +   2,577 wr)
        +==3300389== LLd misses:     10,014,907  ( 10,013,138 rd   +   1,769 wr)
        +==3300389== D1  miss rate:        99.4% (       99.6%     +     1.0%  )
        +==3300389== LLd miss rate:         6.2% (        6.2%     +     0.7%  )
        +==3300389==
        +==3300389== LL refs:       160,023,365  (160,020,788 rd   +   2,577 wr)
        +==3300389== LL misses:      10,016,925  ( 10,015,156 rd   +   1,769 wr)
        +==3300389== LL miss rate:          1.2% (        1.2%     +     0.7%  )
        +
        + +

        The main differences between the 2 runs are:

        +
        1. D1 misses: 10M v/s 160M
        2. D1 miss rate: 6.2% v/s 99.4%
        + +

        As you can see, loop2() causes many more (about 16x more) L1 data cache misses than loop1(). This is why loop1() is ~15x faster than loop2().

        + +

        Memory Formats supported by PyTorch Operators

        + +

        While PyTorch operators expect all tensors to be in the Channels First (NCHW) dimension format, they support 3 output memory formats (a short example of inspecting these formats follows the list below).

        + +
        1. Contiguous: Tensor memory is in the same order as the tensor’s dimensions.
        2. ChannelsLast: Irrespective of the dimension order, the 2d (image) tensor is laid out as an HWC or NHWC (N: batch, H: height, W: width, C: channels) tensor in memory. The dimensions could be permuted in any order.
        3. ChannelsLast3d: For 3d tensors (video tensors), the memory is laid out in THWC (Time, Height, Width, Channels) or NTHWC (N: batch, T: time, H: height, W: width, C: channels) format. The dimensions could be permuted in any order.
        + +
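        For instance, a small sketch of how these formats look from Python; the tensor sizes are arbitrary.

        import torch

        x = torch.rand(1, 3, 4, 4)                    # NCHW, contiguous memory format
        y = x.to(memory_format=torch.channels_last)   # same shape, NHWC layout in memory

        print(x.is_contiguous())                                    # True
        print(y.is_contiguous(memory_format=torch.channels_last))  # True
        print(x.stride(), y.stride())                               # (48, 16, 4, 1) vs (48, 1, 12, 3)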

        The reason that ChannelsLast is preferred for vision models is because XNNPACK (kernel acceleration library) used by PyTorch expects all inputs to be in Channels Last format, so if the input to the model isn’t channels last, then it must first be converted to channels last, which is an additional operation.

        + +

        Additionally, most PyTorch operators preserve the input tensor’s memory format, so if the input is Channels First, then the operator needs to first convert to Channels Last, then perform the operation, and then convert back to Channels First.

        + +

        Combine this with the fact that accelerated operators work better with a channels-last memory format, and you’ll see that having an operator return a channels-last memory format is better for subsequent operator calls; otherwise, every operator would have to convert to channels-last itself (should that be more efficient for the specific operator).

        + +

        From the XNNPACK home page:

        + +
        +

        “All operators in XNNPACK support NHWC layout, but additionally allow custom stride along the Channel dimension”.

        +
        + +

        PyTorch Best Practice

        + +

        The best way to get the most performance from your PyTorch vision models is to ensure that your input tensor is in a Channels Last memory format before it is fed into the model.

        + +

        You can get even more speedups by optimizing your model to use the XNNPACK backend (by simply calling optimize_for_mobile() on your torchscripted model). Note that XNNPACK models will run slower if the inputs are contiguous, so definitely make sure it is in Channels-Last format.

        + +

        Working example showing speedup

        + +

        Run this example on Google Colab - note that runtimes on colab CPUs might not reflect accurate performance; it is recommended to run this code on your local machine.

        + +
        import torch
        +from torch.utils.mobile_optimizer import optimize_for_mobile
        +import torch.backends.xnnpack
        +import time
        +
        +print("XNNPACK is enabled: ", torch.backends.xnnpack.enabled, "\n")
        +
        +N, C, H, W = 1, 3, 200, 200
        +x = torch.rand(N, C, H, W)
        +print("Contiguous shape: ", x.shape)
        +print("Contiguous stride: ", x.stride())
        +print()
        +
        +xcl = x.to(memory_format=torch.channels_last)
        +print("Channels-Last shape: ", xcl.shape)
        +print("Channels-Last stride: ", xcl.stride())
        +
        +## Outputs:
        + 
        +# XNNPACK is enabled:  True
        + 
        +# Contiguous shape:  torch.Size([1, 3, 200, 200])
        +# Contiguous stride:  (120000, 40000, 200, 1)
        + 
        +# Channels-Last shape:  torch.Size([1, 3, 200, 200])
        +# Channels-Last stride:  (120000, 1, 600, 3)
        +
        +
        + +

        The input shape stays the same for contiguous and channels-last formats. Internally however, the tensor’s layout has changed, as you can see in the strides. Now, the number of jumps required to go across channels is only 1 (instead of 40000 in the contiguous tensor). This better data locality means convolution layers can access all the channels for a given pixel much faster. Let’s now see how the memory format affects runtime:

        + +
        from torchvision.models import resnet34, resnet50, resnet101
        +
        +m = resnet34(pretrained=False)
        +# m = resnet50(pretrained=False)
        +# m = resnet101(pretrained=False)
        +
        +def get_optimized_model(mm):
        +  mm = mm.eval()
        +  scripted = torch.jit.script(mm)
        +  optimized = optimize_for_mobile(scripted)  # explicitly call the xnnpack rewrite 
        +  return scripted, optimized
        +
        +
        +def compare_contiguous_CL(mm):
        +  # inference on contiguous
        +  start = time.perf_counter()
        +  for i in range(20):
        +    mm(x)
        +  end = time.perf_counter()
        +  print("Contiguous: ", end-start)
        +
        +  # inference on channels-last
        +  start = time.perf_counter()
        +  for i in range(20):
        +    mm(xcl)
        +  end = time.perf_counter()
        +  print("Channels-Last: ", end-start)
        +
        +with torch.inference_mode():
        +  scripted, optimized = get_optimized_model(m)
        +
        +  print("Runtimes for torchscripted model: ")
        +  compare_contiguous_CL(scripted.eval())
        +  print()
        +  print("Runtimes for mobile-optimized model: ")
        +  compare_contiguous_CL(optimized.eval())
        +
        +   
        +## Outputs (on an Intel Core i9 CPU):
        + 
        +# Runtimes for torchscripted model:
        +# Contiguous:  1.6711160129999598
        +# Channels-Last:  1.6678222839999535
        + 
        +# Runtimes for mobile-optimized model:
        +# Contiguous:  0.5712863490000473
        +# Channels-Last:  0.46113000699995155
        +
        +
        + +

        Conclusion

        + +

        The Memory Layout of an input tensor can significantly impact a model’s running time. For Vision Models, prefer a Channels Last memory format to get the most out of your PyTorch models.

        + +

        diff --git a/blog/the-road-to-1_0/index.html b/blog/the-road-to-1_0/index.html new file mode 100644 index 000000000000..6b31df131340 --- /dev/null +++ b/blog/the-road-to-1_0/index.html @@ -0,0 +1,751 @@
        The road to 1.0: production ready PyTorch | PyTorch

        by The PyTorch Team

        We would like to give you a preview of the roadmap for PyTorch 1.0, the next release of PyTorch. Over the last year, we’ve had 0.2, 0.3 and 0.4 transform PyTorch from a [Torch+Chainer]-like interface into something cleaner, adding double-backwards, numpy-like functions, advanced indexing and removing Variable boilerplate. At this time, we’re confident that the API is in a reasonable and stable state to release a 1.0.

        + +

        However, 1.0 isn’t just about stability of the interface.

        + +

        One of PyTorch’s biggest strengths is its first-class Python integration, imperative style, simplicity of the API and options. These are aspects that make PyTorch good for research and hackability.

        + +

        One of its biggest downsides has been production-support. What we mean by production-support is the countless things one has to do to models to run them efficiently at massive scale:

        + +
        • exporting to C++-only runtimes for use in larger projects
        • optimizing mobile systems on iPhone, Android, Qualcomm and other systems
        • using more efficient data layouts and performing kernel fusion to do faster inference (saving 10% of speed or memory at scale is a big win)
        • quantized inference (such as 8-bit inference)
        + +

        Startups, large companies and anyone who wants to build a product around PyTorch have asked for production support. At Facebook (the largest stakeholder for PyTorch) we have Caffe2, which has been the production-ready platform, running in our datacenters and shipping to more than 1 billion phones spanning eight generations of iPhones and six generations of Android CPU architectures. It has server-optimized inference on Intel / ARM, TensorRT support, and all the necessary bits for production. Considering all this value locked-in to a platform that the PyTorch team works quite closely with, we decided to marry PyTorch and Caffe2 which gives the production-level readiness for PyTorch.

        + +

        Supporting production features without adding usability issues for our researchers and end-users needs creative solutions.

        + +

        Production != Pain for researchers

        + +

        Adding production capabilities involves increasing the API complexity and number of configurable options for models. One configures memory-layouts (NCHW vs NHWC vs N,C/32,H,W,32, each providing different performance characteristics), quantization (8-bit? 3-bit?), fusion of low-level kernels (you used a Conv + BatchNorm + ReLU, let’s fuse them into a single kernel), separate backend options (MKLDNN backend for a few layers and NNPACK backend for other layers), etc.

        + +

        PyTorch’s central goal is to provide a great platform for research and hackability. So, while we add all these optimizations, we’ve been working with a hard design constraint to never trade these off against usability.

        + +

        To pull this off, we are introducing torch.jit, a just-in-time (JIT) compiler that at runtime takes your PyTorch models and rewrites them to run at production-efficiency. The JIT compiler can also export your model to run in a C++-only runtime based on Caffe2 bits.

        + +
        +

        In 1.0, your code continues to work as-is, we’re not making any big changes to the existing API.

        +
        + +

        Making your model production-ready is an opt-in annotation, which uses the torch.jit compiler to export your model to a Python-less environment and improve its performance. Let’s walk through the JIT compiler in detail.

        + +

        torch.jit: A JIT-compiler for your models

        + +

        We strongly believe that it’s hard to match the productivity you get from specifying your models directly as idiomatic Python code. This is what makes PyTorch so flexible, but it also means that PyTorch pretty much never knows the operation you’ll run next. This however is a big blocker for export/productionization and heavyweight automatic performance optimizations because they need full upfront knowledge of how the computation will look before it even gets executed.

        + +

        We provide two opt-in ways of recovering this information from your code, one based on tracing native python code and one based on compiling a subset of the python language annotated into a python-free intermediate representation. After thorough discussions we concluded that they’re both going to be useful in different contexts, and as such you will be able to mix and match them freely.

        + +

        Tracing Mode

        + +

        The PyTorch tracer, torch.jit.trace, is a function that records all the native PyTorch operations performed in a code region, along with the data dependencies between them. In fact, PyTorch has had a tracer since 0.3, which has been used for exporting models through ONNX. What changes now, is that you no longer necessarily need to take the trace and run it elsewhere - PyTorch can re-execute it for you, using a carefully designed high-performance C++ runtime. As we develop PyTorch 1.0 this runtime will integrate all the optimizations and hardware integrations that Caffe2 provides.

        + +

        The biggest benefit of this approach is that it doesn’t really care how your Python code is structured — you can trace through generators or coroutines, modules or pure functions. Since we only record native PyTorch operators, these details have no effect on the trace recorded. This behavior, however, is a double-edged sword. For example, if you have a loop in your model, it will get unrolled in the trace, inserting a copy of the loop body for as many times as the loop ran. This opens up opportunities for zero-cost abstraction (e.g. you can loop over modules, and the actual trace will be loop-overhead free!), but on the other hand this will also affect data dependent loops (think of e.g. processing sequences of varying lengths), effectively hard-coding a single length into the trace.

        + +

        For networks that do not contain loops and if statements, tracing is non-invasive and is robust enough to handle a wide variety of coding styles. This code example illustrates what tracing looks like:

        + +
        # This will run your nn.Module or regular Python function with the example
        +# input that you provided. The returned callable can be used to re-execute
        +# all operations that happened during the example run, but it will no longer
        +# use the Python interpreter.
        +from torch.jit import trace
        +traced_model = trace(model, example_input=input)
        +traced_fn = trace(fn, example_input=input)
        +
        +# The training loop doesn't change. Traced model behaves exactly like an
        +# nn.Module, except that you can't edit what it does or change its attributes.
        +# Think of it as a "frozen module".
        +for input, target in data_loader:
        +    loss = loss_fn(traced_model(input), target)
        +
        + +

        Script Mode

        + +

        Tracing mode is a great way to minimize the impact on your code, but we’re also very excited about the models that fundamentally make use of control flow such as RNNs. Our solution to this is a scripting mode.

        + +

        In this case you write out a regular Python function, except that you can no longer use certain more complicated language features. Once you isolated the desired functionality, you let us know that you’d like the function to get compiled by decorating it with an @script decorator. This annotation will transform your python function directly into our high-performance C++ runtime. This lets us recover all the PyTorch operations along with loops and conditionals. They will be embedded into our internal representation of this function, and will be accounted for every time this function is run.

        + +
        from torch.jit import script
        +
        +@script
        +def rnn_loop(x):
        +    hidden = None
        +    for x_t in x.split(1):
        +        x, hidden = model(x, hidden)
        +    return x
        +
        + +

        Optimization and Export

        + +

        Regardless of whether you use tracing or @script, the result is a python-free representation of your model, which can be used to optimize the model or to export the model from python for use in production environments.

        + +

        Extracting bigger segments of the model into an intermediate representation makes it possible to do sophisticated whole-program optimizations and to offload computation to specialized AI accelerators which operate on graphs of computation. We have already been developing the beginnings of these optimizations, including passes that fuse GPU operations together to improve the performance of smaller RNN models.

        + +

        It also allows us to use existing high-performance backends available in Caffe2 today to run the model efficiently. Additionally, @script functions (and modules!) can be fully exported to ONNX in a way that retains their dynamic nature, such that you can easily run them in a Python-free environment using the model executors from Caffe2 or by transferring the model to any other framework supporting ONNX.

        + +

        Usability

        + +

        We care deeply about maintaining our current level of usability and we know that execution of the code not directly in Python leads to harder debugging, but this is something that we think about a lot, and we’re making sure that you’re not getting locked in to a completely different programming language.

        + +

        First, we follow the principle of pay for what you use — if you don’t need to optimize or export your model, you do not have to use these new features and won’t see any downsides. Furthermore, use of traced or @script modules/functions can be done incrementally. For instance, all of these behaviors are allowed: You can trace part of your model and use the trace in a larger non-traced model. You can use tracing for 90% of your model, and use @script for the one sub-module that actually has some control flow in it. You can write a function using @script and have it call a native python function. If something appears incorrect in an @script function, you can remove the annotation and the code will execute in native python where it is easy to debug using your favorite tools and methods. Think of tracing and @script like type annotations using MyPy or TypeScript — each additional annotation can be tested incrementally, and none are required until you want to optimize or productionize.

        + +

        Most importantly, these modes will be built into the core of PyTorch so that mixing and matching them with your existing code can happen seamlessly.

        + +

        Note: The name JIT for these components is a bit of a misnomer and comes from historical reasons. The tracing/function execution in PyTorch started out as an optimizing JIT compiler that generated fused CUDA kernels but then grew to encompass optimization, @script, and export. When it is ready for release we will likely rename this functionality to the hybrid frontend, but we wanted to present it here as it is named in the code so that you can follow along as we develop it.

        + +

        Other changes and improvements

        + +

Production support is the big feature for 1.0, but we will continue optimizing and fixing other parts of PyTorch as part of the standard release process.

        + +

        On the backend side of things, PyTorch will see some changes, which might affect user-written C and C++ extensions. We are replacing (or refactoring) the backend ATen library to incorporate features and optimizations from Caffe2.

        + +

        Last Words

        + +

We aim to release 1.0 some time during the summer. You can follow along with our progress on the Pull Requests page.

        + +

        You can read this from the perspective of the Caffe2 project at: https://caffe2.ai/blog/2018/05/02/Caffe2_PyTorch_1_0.html


The torch.fft module: Accelerated Fast Fourier Transforms with Autograd in PyTorch

by Mike Ruberry, Peter Bell, and Joe Spisak

        The Fast Fourier Transform (FFT) calculates the Discrete Fourier Transform in O(n log n) time. It is foundational to a wide variety of numerical algorithms and signal processing techniques since it makes working in signals’ “frequency domains” as tractable as working in their spatial or temporal domains.

        + +

        As part of PyTorch’s goal to support hardware-accelerated deep learning and scientific computing, we have invested in improving our FFT support, and with PyTorch 1.8, we are releasing the torch.fft module. This module implements the same functions as NumPy’s np.fft module, but with support for accelerators, like GPUs, and autograd.

        + +

        Getting started

        + +

        Getting started with the new torch.fft module is easy whether you are familiar with NumPy’s np.fft module or not. While complete documentation for each function in the module can be found here, a breakdown of what it offers is:

        + +
• fft, which computes a complex FFT over a single dimension, and ifft, its inverse
• the more general fftn and ifftn, which support multiple dimensions
• The “real” FFT functions, rfft, irfft, rfftn, irfftn, designed to work with signals that are real-valued in their time domains
• The “Hermitian” FFT functions, hfft and ihfft, designed to work with signals that are real-valued in their frequency domains
• Helper functions, like fftfreq, rfftfreq, fftshift, ifftshift, that make it easier to manipulate signals
        + +

        We think these functions provide a straightforward interface for FFT functionality, as vetted by the NumPy community, although we are always interested in feedback and suggestions!

        + +

        To better illustrate how easy it is to move from NumPy’s np.fft module to PyTorch’s torch.fft module, let’s look at a NumPy implementation of a simple low-pass filter that removes high-frequency variance from a 2-dimensional image, a form of noise reduction or blurring:

        + +
import numpy as np
import numpy.fft as fft

def lowpass_np(input, limit):
    pass1 = np.abs(fft.rfftfreq(input.shape[-1])) < limit
    pass2 = np.abs(fft.fftfreq(input.shape[-2])) < limit
    kernel = np.outer(pass2, pass1)

    fft_input = fft.rfft2(input)
    return fft.irfft2(fft_input * kernel, s=input.shape[-2:])
        + +

        Now let’s see the same filter implemented in PyTorch:

        + +
import torch
import torch.fft as fft

def lowpass_torch(input, limit):
    pass1 = torch.abs(fft.rfftfreq(input.shape[-1])) < limit
    pass2 = torch.abs(fft.fftfreq(input.shape[-2])) < limit
    kernel = torch.outer(pass2, pass1)

    fft_input = fft.rfft2(input)
    return fft.irfft2(fft_input * kernel, s=input.shape[-2:])
        + +

Not only do current uses of NumPy's np.fft module translate directly to torch.fft, the torch.fft operations also support tensors on accelerators, like GPUs, as well as autograd. This makes it possible to (among other things) develop new neural network modules using the FFT.
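For example, here is a minimal sketch (not from the original post; the input shape is arbitrary) showing that gradients flow through the lowpass_torch filter defined above:

import torch

# Hypothetical 64x64 input image; requires_grad enables autograd through the FFTs.
image = torch.randn(64, 64, requires_grad=True)
filtered = lowpass_torch(image, limit=0.1)

# Backpropagate through rfft2/irfft2 and the frequency-domain masking.
filtered.sum().backward()
print(image.grad.shape)  # torch.Size([64, 64])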

        + +

        Performance

        + +

The torch.fft module is not only easy to use — it is also fast! PyTorch natively supports Intel's MKL-FFT library on Intel CPUs, and NVIDIA's cuFFT library on CUDA devices, and we have carefully optimized how we use those libraries to maximize performance. While your own results will depend on your CPU and CUDA hardware, computing Fast Fourier Transforms on CUDA devices can be many times faster than computing them on the CPU, especially for larger signals.

        + +

        In the future, we may add support for additional math libraries to support more hardware. See below for where you can request additional hardware support.

        + +

        Updating from older PyTorch versions

        + +

        Some PyTorch users might know that older versions of PyTorch also offered FFT functionality with the torch.fft() function. Unfortunately, this function had to be removed because its name conflicted with the new module’s name, and we think the new functionality is the best way to use the Fast Fourier Transform in PyTorch. In particular, torch.fft() was developed before PyTorch supported complex tensors, while the torch.fft module was designed to work with them.

        + +

        PyTorch also has a “Short Time Fourier Transform”, torch.stft, and its inverse torch.istft. These functions are being kept but updated to support complex tensors.
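As a brief, hedged illustration of the complex-tensor path (using the return_complex flag as the assumed current API; the signal here is arbitrary):

import torch

signal = torch.randn(4000)

# Complex spectrogram out of stft, real signal back out of istft.
spec = torch.stft(signal, n_fft=256, return_complex=True)
recon = torch.istft(spec, n_fft=256, length=signal.numel())

print(spec.shape, spec.dtype)        # e.g. a [129, n_frames] complex tensor
print((signal - recon).abs().max())  # reconstruction error of the round trip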

        + +

        Future

        + +

        As mentioned, PyTorch 1.8 offers the torch.fft module, which makes it easy to use the Fast Fourier Transform (FFT) on accelerators and with support for autograd. We encourage you to try it out!

        + +

        While this module has been modeled after NumPy’s np.fft module so far, we are not stopping there. We are eager to hear from you, our community, on what FFT-related functionality you need, and we encourage you to create posts on our forums at https://discuss.pytorch.org/, or file issues on our Github with your feedback and requests. Early adopters have already started asking about Discrete Cosine Transforms and support for more hardware platforms, for example, and we are investigating those features now.

        + +

        We look forward to hearing from you and seeing what the community does with PyTorch’s new FFT functionality!

        + +

The torch.linalg module: Accelerated Linear Algebra with Autograd in PyTorch

by Mike Ruberry, Ivan Yashchuk, Xiao Wang, Mario Lezcano and Natalia Gimelshein

Linear algebra is essential to deep learning and scientific computing, and it's always been a core part of PyTorch. PyTorch 1.9 extends PyTorch's support for linear algebra operations with the torch.linalg module. This module, documented here, has 26 operators, including faster and easier-to-use versions of older PyTorch operators, every function from NumPy's linear algebra module extended with accelerator and autograd support, and a few operators that are completely new. This makes the torch.linalg module immediately familiar to NumPy users and an exciting update to PyTorch's linear algebra support.

        + +

        NumPy-like linear algebra in PyTorch

        + +

If you're familiar with NumPy's linear algebra module then it'll be easy to start using torch.linalg. In most cases it's a drop-in replacement. As a motivating example, let's look at drawing samples from a multivariate normal distribution using the Cholesky decomposition:

        + +
import numpy as np

# Creates inputs
np.random.seed(0)
mu_np = np.random.rand(4)
L = np.random.rand(4, 4)
# Covariance matrix sigma is positive-definite
sigma_np = L @ L.T + np.eye(4)
normal_noise_np = np.random.standard_normal(mu_np.size)

def multivariate_normal_sample_np(mu, sigma, normal_noise):
    return mu + np.linalg.cholesky(sigma) @ normal_noise

print("Random sample: ",
      multivariate_normal_sample_np(mu_np, sigma_np, normal_noise_np))
: Random sample: [2.9502426 1.78518077 1.83168697 0.90798228]
        + +

        Now let’s see the same sampler implemented in PyTorch:

        + +
import torch

def multivariate_normal_sample_torch(mu, sigma, normal_noise):
    return mu + torch.linalg.cholesky(sigma) @ normal_noise
        + +

        The two functions are identical, and we can validate their behavior by calling the function with the same arguments wrapped as PyTorch tensors:

        + +
# NumPy arrays are wrapped as tensors and share their memory
mu_torch = torch.from_numpy(mu_np)
sigma_torch = torch.from_numpy(sigma_np)
normal_noise_torch = torch.from_numpy(normal_noise_np)

multivariate_normal_sample_torch(mu_torch, sigma_torch, normal_noise_torch)
: tensor([2.9502, 1.7852, 1.8317, 0.9080], dtype=torch.float64)
        + +

        The only difference is in how PyTorch prints tensors by default.

        + +

The Cholesky decomposition can also help us quickly compute the probability density function of the non-degenerate multivariate normal distribution. One of the expensive terms in that computation is the square root of the determinant of the covariance matrix. However, using properties of the determinant and the Cholesky decomposition we can calculate the same result faster than the naive computation: since sigma = LLᵀ for the Cholesky factor L, det(sigma) = det(L)², so the square root of det(sigma) is simply the product of L's diagonal entries. Here's the NumPy program that demonstrates this:

        + +
sqrt_sigma_det_np = np.sqrt(np.linalg.det(sigma_np))
sqrt_L_det_np = np.prod(np.diag(np.linalg.cholesky(sigma_np)))

print("|sigma|^0.5 = ", sqrt_sigma_det_np)
: |sigma|^0.5 = 4.237127491242027

print("|L| = ", sqrt_L_det_np)
: |L| = 4.237127491242028
        + +

        And here’s the same validation in PyTorch:

        + +
sqrt_sigma_det_torch = torch.sqrt(torch.linalg.det(sigma_torch))
sqrt_L_det_torch = torch.prod(torch.diag(torch.linalg.cholesky(sigma_torch)))

print("|sigma|^0.5 = ", sqrt_sigma_det_torch)
: |sigma|^0.5 = tensor(4.2371, dtype=torch.float64)

print("|L| = ", sqrt_L_det_torch)
: |L| = tensor(4.2371, dtype=torch.float64)
        + +

        We can measure the difference in run time using PyTorch’s built-in benchmark utility:

        + +
import torch.utils.benchmark as benchmark

t0 = benchmark.Timer(
    stmt='torch.sqrt(torch.linalg.det(sigma))',
    globals={'sigma': sigma_torch})

t1 = benchmark.Timer(
    stmt='torch.prod(torch.diag(torch.linalg.cholesky(sigma)))',
    globals={'sigma': sigma_torch})

print(t0.timeit(100))
: torch.sqrt(torch.linalg.det(sigma))
  80.80 us
  1 measurement, 100 runs, 1 thread

print(t1.timeit(100))
: torch.prod(torch.diag(torch.linalg.cholesky(sigma)))
  11.56 us
  1 measurement, 100 runs, 1 thread
        + +

This demonstrates that the approach using the Cholesky decomposition can be significantly faster. Behind the scenes, PyTorch's linear algebra module uses OpenBLAS or MKL implementations of the LAPACK standard to maximize its CPU performance.

        + +

        Autograd Support

        + +

        PyTorch’s linear algebra module doesn’t just implement the same functions as NumPy’s linear algebra module (and a few more), it also extends them with autograd and CUDA support.

        + +

        Let’s look at a very simple program that just computes an inverse and the gradient of that operation to show how autograd works:

        + +
t = torch.tensor(((1, 2), (3, 4)), dtype=torch.float32, requires_grad=True)

inv = torch.linalg.inv(t)
inv.backward(torch.ones_like(inv))

print(t.grad)
: tensor([[-0.5000,  0.5000],
          [ 0.5000, -0.5000]])
        + +

        We can mimic the same computation in NumPy by defining the autograd formula ourselves:

        + +
a = np.array(((1, 2), (3, 4)), dtype=np.float32)

inv_np = np.linalg.inv(a)

def inv_backward(result, grad):
    # Gradient of the inverse: -A^{-T} @ grad @ A^{-T}.
    # np.swapaxes is used here because ndarray.transpose(-2, -1) does not
    # transpose a 2-D array (it is an axes permutation, not a swap).
    result_t = np.swapaxes(result, -2, -1)
    return -(result_t @ (grad @ result_t))

grad_np = inv_backward(inv_np, np.ones_like(inv_np))

print(grad_np)
: [[-0.5  0.5]
   [ 0.5 -0.5]]
        + +

        Of course, as programs become more complicated it’s convenient to have builtin autograd support, and PyTorch’s linear algebra module supports both real and complex autograd.

        + +

        CUDA Support

        + +

        Support for autograd and accelerators, like CUDA devices, is a core part of PyTorch. The torch.linalg module was developed with NVIDIA’s PyTorch and cuSOLVER teams, who helped optimize its performance on CUDA devices with the cuSOLVER, cuBLAS, and MAGMA libraries. These improvements make PyTorch’s CUDA linear algebra operations faster than ever. For example, let’s look at the performance of PyTorch 1.9’s torch.linalg.cholesky vs. PyTorch 1.8’s (now deprecated) torch.cholesky:

[Charts: torch.linalg.cholesky (PyTorch 1.9) vs. torch.cholesky (PyTorch 1.8), CUDA performance]

        (The above charts were created using an Ampere A100 GPU with CUDA 11.3, cuSOLVER 11.1.1.58, and MAGMA 2.5.2. Matrices are in double precision.)

        + +

        These charts show that performance has increased significantly on larger matrices, and that batched performance is better across the board. Other linear algebra operations, including torch.linalg.qr and torch.linalg.lstsq, have also had their CUDA performance improved.

        + +

        Beyond NumPy

        + +

        In addition to offering all the functions in NumPy’s linear algebra module with support for autograd and accelerators, torch.linalg has a few new functions of its own. NumPy’s linalg.norm does not allow users to compute vector norms over arbitrary subsets of dimensions, so to enable this functionality we added torch.linalg.vector_norm. We’ve also started modernizing other linear algebra functionality in PyTorch, so we created torch.linalg.householder_product to replace the older torch.orgqr, and we plan to continue adding more linear algebra functionality in the future, too.
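As a small, hedged sketch of the kind of flexibility this adds (the shapes below are arbitrary):

import torch

t = torch.randn(2, 3, 4)

# L2 vector norm computed over the last two dimensions of each slice,
# something NumPy's linalg.norm cannot do directly.
norms = torch.linalg.vector_norm(t, ord=2, dim=(-2, -1))
print(norms.shape)  # torch.Size([2])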

        + +

        The Future of Linear Algebra in PyTorch

        + +

        The torch.linalg module is fast and familiar with great support for autograd and accelerators. It’s already being used in libraries like botorch, too. But we’re not stopping here. We plan to continue updating more of PyTorch’s existing linear algebra functionality (like torch.lobpcg) and offering more support for low rank and sparse linear algebra. We also want to hear your feedback on how we can improve, so start a conversation on the forum or file an issue on our Github and share your thoughts.

        + +

        We look forward to hearing from you and seeing what the community does with PyTorch’s new linear algebra functionality!

        + +

Introducing torchchat: Accelerating Local LLM Inference on Laptop, Desktop and Mobile

by Team PyTorch

        Today, we’re releasing torchchat, a library showcasing how to seamlessly and performantly run Llama 3, 3.1, and other large language models across laptop, desktop, and mobile.

        + +

In our previous blog posts, we showed how to use native PyTorch 2 to run LLMs with great performance using CUDA. Torchchat expands on this with more target environments, models, and execution modes. Additionally, it provides important functions such as export, quantization, and eval in a way that's easy to understand, providing an end-to-end story for those who want to build a local inference solution.

        + +

        You will find the project organized into three areas:

        + +
• Python: Torchchat provides a REST API that is called via a Python CLI or can be accessed via the browser
• C++: Torchchat produces a desktop-friendly binary using PyTorch's AOTInductor backend
• Mobile devices: Torchchat uses ExecuTorch to export a .pte binary file for on-device inference
        + +

        torchchat schema

        + +

        Performance

        + +

The following table tracks the performance of torchchat for Llama 3 for a variety of configurations. Numbers for Llama 3.1 are coming soon.

        + +

        Llama 3 8B Instruct on Apple MacBook Pro M1 Max 64GB Laptop

Mode          DType      Llama 3 8B Tokens/Sec
Arm Compile   float16     5.84
              int8        1.63
              int4        3.99
Arm AOTI      float16     4.05
              int8        1.05
              int4        3.28
MPS Eager     float16    12.63
              int8       16.9
              int4       17.15
        + +

Llama 3 8B Instruct on Linux x86 and CUDA
Intel(R) Xeon(R) Platinum 8339HC CPU @ 1.80GHz with 180GB RAM + A100 (80GB)

Mode          DType       Llama 3 8B Tokens/Sec
x86 Compile   bfloat16      2.76
              int8          3.15
              int4          5.33
CUDA Compile  bfloat16     83.23
              int8        118.17
              int4        135.16
        + +

Llama 3 8B Instruct on Mobile
Torchchat achieves > 8 tokens/sec on the Samsung Galaxy S23 and iPhone using 4-bit GPTQ via ExecuTorch.

        + +

        Conclusion

        + +

We encourage you to clone the torchchat repo and give it a spin, explore its capabilities, and share your feedback as we continue to empower the PyTorch community to run LLMs locally and on constrained devices. Together, let's unlock the full potential of generative AI and LLMs on any device. Please submit issues as you see them, since we are still iterating quickly. We're also inviting community contributions across a broad range of areas, from additional models and target hardware support to new quantization schemes and performance improvements. Happy experimenting!

        + +

torchcodec: Easy and Efficient Video Decoding for PyTorch

by Team PyTorch

        We are pleased to officially announce torchcodec, a library for decoding videos into PyTorch tensors. It is fast, accurate, and easy to use. When running PyTorch models on videos, torchcodec is our recommended way to turn those videos into data your model can use.

        + +

        Highlights of torchcodec include:

        + +
• An intuitive decoding API that treats a video file as a Python sequence of frames. We support both index-based and presentation-time-based frame retrieval.
• An emphasis on accuracy: we ensure you get the frames you requested, even if your video has variable frame rates.
• A rich sampling API that makes it easy and efficient to retrieve batches of frames.
• Best-in-class CPU decoding performance.
• CUDA accelerated decoding that enables high throughput when decoding many videos at once.
• Support for all codecs available in your installed version of FFmpeg.
• Simple binary installs for Linux and Mac.
        + +

        Easy to Use

        + +

        A simple, intuitive API was one of our main design principles. We start with simple decoding and extracting specific frames of a video:

        + +
from torchcodec.decoders import VideoDecoder
from torch import Tensor

decoder = VideoDecoder("my_video.mp4")

# Index based frame retrieval.
first_ten_frames: Tensor = decoder[:10]
last_ten_frames: Tensor = decoder[-10:]

# Multi-frame retrieval, index and time based.
frames = decoder.get_frames_at(indices=[10, 0, 15])
frames = decoder.get_frames_played_at(seconds=[0.2, 3, 4.5])
        + +

        All decoded frames are already PyTorch tensors, ready to be fed into models for training.

        + +

        Of course, more common in ML training pipelines is sampling multiple clips from videos. A clip is just a sequence of frames in presentation order—but the frames are often not consecutive. Our sampling API makes this easy:

        + +
from torchcodec.samplers import clips_at_regular_timestamps

clips = clips_at_regular_timestamps(
    decoder,
    seconds_between_clip_starts=10,
    num_frames_per_clip=5,
    seconds_between_frames=0.2,
)
        + +

        The above call yields a batch of clips where each clip starts 10 seconds apart, each clip has 5 frames, and those frames are 0.2 seconds apart. See our tutorials on decoding and sampling for more!

        + +

        Fast Performance

        + +

        Performance was our other main design principle. Decoding videos for ML training has different performance requirements than decoding videos for playback. A typical ML video training pipeline will process many different videos (sometimes in the millions!), but only sample a small number of frames (dozens to hundreds) from each video.

        + +

        For this reason, we’ve paid particular attention to our decoder’s performance when seeking multiple times in a video, decoding a small number of frames after each seek. We present experiments with the following four scenarios:

        + +
1. Decoding and transforming frames from multiple videos at once, inspired by what we have seen in data loading for large-scale training pipelines:
   a. Ten threads decode batches of 50 videos in parallel.
   b. For each video, decode 10 frames at evenly spaced times.
   c. For each frame, resize it to a 256x256 resolution.
2. Decoding 10 frames at random locations in a single video.
3. Decoding 10 frames at evenly spaced times of a single video.
4. Decoding the first 100 frames of a single video.
        + +

        We compare the following video decoders:

        + +
• Torchaudio, CPU decoding only.
• Torchvision, using the video_reader backend which is CPU decoding only.
• Torchcodec, GPU decoding with CUDA.
• Torchcodec, CPU decoding only.
        + +

        Using the following three videos:

        + +
1. A synthetically generated video using FFmpeg's mandelbrot generation pattern. The video is 10 seconds long, 60 frames per second and 1920x1080.
2. Same as above, except the video is 120 seconds long.
3. A promotional video from NASA that is 206 seconds long, 29.7 frames per second and 960x540.
        + +

        The experimental script is in our repo. Our experiments run on a Linux system with an Intel processor that has 22 available cores and an NVIDIA GPU. For CPU decoding, all libraries were instructed to automatically determine the best number of threads to use.

        + +

        Benchmark chart

        + +

        From our experiments, we draw several conclusions:

        + +
• Torchcodec is consistently the best-performing library for the primary use case we designed it for: decoding many videos at once as a part of a training data loading pipeline. In particular, high-resolution videos see great gains with CUDA where decoding and transforms both happen on the GPU.
• Torchcodec is competitive on the CPU with seek-heavy use cases such as random and uniform sampling. Currently, torchcodec's performance is better with shorter videos that have a smaller file size. This performance is due to torchcodec's emphasis on seek-accuracy, which involves an initial linear scan.
• Torchcodec is not as competitive when there is no seeking; that is, opening a video file and decoding from the beginning. This is again due to our emphasis on seek-accuracy and the initial linear scan.
        + +

        Implementing an approximate seeking mode in torchcodec should resolve these performance gaps, and it’s our highest priority feature for video decoding.

        + +

        What’s Next?

        + +

        As the name implies, the long-term future for torchcodec is more than just video decoding. Our next big feature is audio support—both decoding audio streams from video, and from audio-only media. In the long term, we want torchcodec to be the media decoding library for PyTorch. That means as we implement functionality in torchcodec, we will deprecate and eventually remove complementary features from torchaudio and torchvision.

        + +

        We also have video decoding improvements lined up, such as the previously mentioned approximate seeking mode for those who are willing to sacrifice accuracy for performance.

        + +

        Most importantly, we’re looking for feedback from the community! We’re most interested in working on features that the community finds valuable. Come share your needs and influence our future direction!

        + +

PyTorch framework for cryptographically secure random number generation, torchcsprng, now available

by Team PyTorch

        One of the key components of modern cryptography is the pseudorandom number generator. Katz and Lindell stated, “The use of badly designed or inappropriate random number generators can often leave a good cryptosystem vulnerable to attack. Particular care must be taken to use a random number generator that is designed for cryptographic use, rather than a ‘general-purpose’ random number generator which may be fine for some applications but not ones that are required to be cryptographically secure.”[1] Additionally, most pseudorandom number generators scale poorly to massively parallel high-performance computation because of their sequential nature. Others don’t satisfy cryptographically secure properties.

        + +

        torchcsprng is a PyTorch C++/CUDA extension that provides cryptographically secure pseudorandom number generators for PyTorch.

        + +

        torchcsprng overview

        + +

Historically, PyTorch had only two pseudorandom number generator implementations: Mersenne Twister for CPU and Nvidia's cuRAND Philox for CUDA. Despite good performance properties, neither of them is suitable for cryptographic applications. Over the course of the past several months, the PyTorch team developed the torchcsprng extension API. Based on the PyTorch dispatch mechanism and operator registration, it allows users to extend c10::GeneratorImpl and implement their own custom pseudorandom number generators.

        + +

        torchcsprng generates a random 128-bit key on the CPU using one of its generators and then runs AES128 in CTR mode either on CPU or GPU using CUDA. This then generates a random 128-bit state and applies a transformation function to map it to target tensor values. This approach is based on Parallel Random Numbers: As Easy as 1, 2, 3 (John K. Salmon, Mark A. Moraes, Ron O. Dror, and David E. Shaw, D. E. Shaw Research). It makes torchcsprng both crypto-secure and parallel on both CPU and CUDA.

        + +
        + +
        + +

        Since torchcsprng is a PyTorch extension, it is available on the platforms where PyTorch is available (support for Windows-CUDA will be available in the coming months).

        + +

        Using torchcsprng

        + +

        The torchcsprng API is very simple to use and is fully compatible with the PyTorch random infrastructure:

        + +

        Step 1: Install via binary distribution

        + +

        Anaconda:

        + +
conda install torchcsprng -c pytorch
        + +

        pip:

        + +
pip install torchcsprng
        + +

        Step 2: import packages as usual but add csprng

        + +
import torch
import torchcsprng as csprng
        + +

        Step 3: Create a cryptographically secure pseudorandom number generator from /dev/urandom:

        + +
urandom_gen = csprng.create_random_device_generator('/dev/urandom')
        + +

        and simply use it with the existing PyTorch methods:

        + +
torch.randn(10, device='cpu', generator=urandom_gen)
        + +

Step 4: Test with CUDA

        + +

        One of the advantages of torchcsprng generators is that they can be used with both CPU and CUDA tensors:

        + +
torch.randn(10, device='cuda', generator=urandom_gen)
        + +

        Another advantage of torchcsprng generators is that they are parallel on CPU unlike the default PyTorch CPU generator.

        + +

        Getting Started

        + +

        The easiest way to get started with torchcsprng is by visiting the GitHub page where you can find installation and build instructions, and more how-to examples.

        + +

        Cheers,

        + +

        The PyTorch Team

        + +

        [1] Introduction to Modern Cryptography: Principles and Protocols (Chapman & Hall/CRC Cryptography and Network Security Series) by Jonathan Katz and Yehuda Lindell


TorchRec and FBGEMM 1.0 Stable Release

        October 23, 2024


by Paul Zhang, Zain Huda, Sarunya Pumma, Shintaro Iwasaki, Supadchaya Puangpontip, Benson Ma

        We are happy to announce the stable release, 1.0, for TorchRec and FBGEMM. TorchRec is the PyTorch native recommendation systems library, powered by FBGEMM’s (Facebook GEneral Matrix Multiplication) efficient, low-level kernels.

        + +

        TorchRec

        + +

        Initially open sourced in 2022, TorchRec provides common primitives for creating state-of-the-art personalization models:

        + +
• Simple, optimized APIs for distributed training across hundreds of GPUs
• Advanced sharding techniques for embeddings
• Modules common in authoring recommendation systems
• Frictionless path to distributed inference with APIs for quantization and sharding of TorchRec models
        + +

        Since then, TorchRec has matured significantly, with wide internal adoption across many Meta production recommendation models for training and inference, alongside new features such as: variable batched embeddings, embedding offloading, zero collision hashing, etc. Furthermore, TorchRec has a presence outside of Meta, such as in recommendation models at Databricks and in the Twitter algorithm. As a result, standard TorchRec features have been marked as stable, with PyTorch style BC guarantees, and can be seen on the revamped TorchRec documentation.

        + +

        FBGEMM

        + +

        FBGEMM is a library that provides high-performance kernels for CPUs and GPUs. Since 2018, FBGEMM has supported the efficient execution of Meta-internal and external AI/ML workloads by expanding its scope from performance-critical kernels for inference on CPUs to more complex sparse operators for both training and inference – and recently for Generative AI – on CPUs and GPUs.

        + +

        FBGEMM has been empowering TorchRec through its backend high-performance kernel implementations for recommendation workloads, ranging from embedding bag kernels to jagged tensor operations. Together with TorchRec, we released FBGEMM 1.0, which guarantees the functionality and backward-compatibility of several stable APIs serving its core features with enhanced documentation.

        + +

        Performance

        + +

        DLRM (Deep Learning Recommendation Model) is the standard neural network architecture for powering recommendations at Meta, with categorical features being processed through embeddings, while continuous (dense) features are processed with a bottom multilayer perceptron. The following diagram depicts the basic architecture of DLRM, with a second order interaction layer between the dense and sparse features and a top MLP for generating the prediction.

        + +

        flow diagram

        + +

        TorchRec provides standardized modules with significant optimizations in fusing embedding lookups. EBC is a traditional PyTorch embedding module implementation, containing a collection of torch.nn.EmbeddingBags. FusedEBC, powered by FBGEMM for high performance operations on embedding tables with a fused optimizer and UVM caching/management for alleviating memory constraints, is the optimized version present in sharded TorchRec modules for distributed training and inference. The below benchmark demonstrates the vast performance improvements of FusedEBC in comparison to a traditional PyTorch embedding module implementation (EBC) and the ability for FusedEBC to handle much larger embeddings than what is available on GPU memory with UVM caching.

        + +

        performance chart

        + +

        TorchRec Data Types

        + +

        TorchRec provides standard data types and modules for easy handling of distributed embeddings. Here is a simple example setting up a collection of embedding tables through TorchRec:

        + +
import torch
import torchrec
from torchrec import EmbeddingBagCollection, JaggedTensor, KeyedJaggedTensor

ebc = torchrec.EmbeddingBagCollection(
    device="cpu",
    tables=[
        torchrec.EmbeddingBagConfig(
            name="product_table",
            embedding_dim=64,
            num_embeddings=4096,
            feature_names=["product"],
            pooling=torchrec.PoolingType.SUM,
        ),
        torchrec.EmbeddingBagConfig(
            name="user_table",
            embedding_dim=64,
            num_embeddings=4096,
            feature_names=["user"],
            pooling=torchrec.PoolingType.SUM,
        )
    ]
)

product_jt = JaggedTensor(
    values=torch.tensor([1, 2, 1, 5]), lengths=torch.tensor([3, 1])
)
user_jt = JaggedTensor(values=torch.tensor([2, 3, 4, 1]), lengths=torch.tensor([2, 2]))

kjt = KeyedJaggedTensor.from_jt_dict({"product": product_jt, "user": user_jt})

print("Call EmbeddingBagCollection Forward: ", ebc(kjt))
        + +

        Sharding

        + +

        TorchRec provides a planner class that automatically generates an optimized sharding plan across many GPUs. Here we demonstrate generating a sharding plan across two GPUs:

        + +
from torchrec.distributed.planner import EmbeddingShardingPlanner, Topology

planner = EmbeddingShardingPlanner(
    topology=Topology(
        world_size=2,
        compute_device="cuda",
    )
)

# `sharder` is a TorchRec module sharder and `pg` is the process group,
# both set up elsewhere in a distributed training script.
plan = planner.collective_plan(ebc, [sharder], pg)

print(f"Sharding Plan generated: {plan}")
        + +

        Model Parallel

        + +

TorchRec's main distributed training API is DistributedModelParallel, which calls the planner to generate a sharding plan (demonstrated above) and shards TorchRec modules according to that plan. We demonstrate applying DistributedModelParallel to our EmbeddingBagCollection to shard embeddings for distributed training:

        + +
model = torchrec.distributed.DistributedModelParallel(ebc, device=torch.device("cuda"))
        + +

        Inference

        + +

        TorchRec provides simple APIs for quantizing and sharding embeddings for a model for distributed inference. The usage is demonstrated below:

        + +
from torchrec.inference.modules import (
    quantize_inference_model,
    shard_quant_model,
)

# `device` is the target inference device, e.g. torch.device("cuda").
quant_model = quantize_inference_model(ebc)
sharded_model, _ = shard_quant_model(
    quant_model, compute_device=device, sharding_device=device
)
        + +

        Conclusion

        + +

        TorchRec and FBGEMM are now stable, with optimized features for large scale recommendation systems.

        + +

For setting up TorchRec and FBGEMM, check out the getting started guide. We also recommend the comprehensive, end-to-end tutorial for introducing the features in TorchRec and FBGEMM.

        + +

Torchserve Performance Tuning, Animated Drawings Case-Study

by Hamid Shojanazeri, Geeta Chauhan, Mark Saroufim, Jesse Smith

In this post we discuss performance tuning of Torchserve for serving your models in production. One of the biggest challenges in the life cycle of an ML project is deploying models in production. This requires a reliable serving solution along with solutions that address MLOps needs. A robust serving solution needs to provide support for multi-model serving, model versioning, metric logging, monitoring, and scaling to serve peak traffic. In this post, we give an overview of Torchserve and how to tune its performance for production use cases. We discuss the Animated Drawings app from Meta, which can turn your human figure sketches into animations, and how it serves peak traffic with Torchserve. The Animated Drawings workflow is shown below.

        + +

        + +

        + +

        https://ai.facebook.com/blog/using-ai-to-bring-childrens-drawings-to-life/

        + +

Many AI systems and tools are designed to handle realistic images of humans, but children's drawings add a level of complexity and unpredictability as they are often constructed in abstract, fanciful ways. These types of morphological and stylistic variations can confuse even state-of-the-art AI systems that excel at spotting objects in photorealistic images and drawings. Meta AI researchers are working to overcome this challenge so that AI systems will be better able to recognize drawings of human figures in the wildly varied ways that children create them. This great blog post provides more details about Animated Drawings and the approach taken.

        + +

        Torchserve

        + +

        + +

Fig 1. Overall flow of Torchserve performance tuning

        + +

Once you have trained your model, it needs to be integrated into a larger system to form a full-fledged application; we use the term "model serving" to refer to this integration. Basically, model serving means making your trained model available to run inferences and for subsequent use.

        + +

Torchserve is the PyTorch-preferred solution for serving models in production. It is a performant and scalable tool that wraps your model in an HTTP or HTTPS API. It has a frontend implemented in Java that handles multiple tasks, from assigning workers for serving models to handling the connection between client and server. Torchserve has a Python backend that is responsible for handling the inference service.

        + +

Torchserve supports multi-model serving, model versioning for A/B testing, dynamic batching, logging, and metrics. It exposes four APIs: inference, explanations, management, and metrics.

        + +

The Inference API listens on port 8080 and is accessible through localhost by default; this can be changed in the Torchserve configuration. It enables getting predictions from the model.
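For instance, a hedged sketch of calling the Inference API (assuming a model registered under the hypothetical name my_model and a local test image):

import requests

# POST an image to the default Inference API endpoint for the registered model.
with open("test_sketch.jpg", "rb") as f:
    response = requests.post("http://localhost:8080/predictions/my_model", data=f)

print(response.status_code)
print(response.json())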

        + +

The Explanation API uses Captum under the hood to provide explanations of the model being served, and it listens on port 8080 as well.

        + +

The Management API allows you to register, unregister, and describe a model. It also enables users to scale the number of workers that serve the model up or down.

        + +

The Metrics API listens on port 8082 by default and enables us to monitor the model that is being served.

        + +

Torchserve lets you scale your model serving and handle peak traffic by supporting batch inference and multiple workers that serve your model. Scaling can be done through the Management API and settings in a configuration file. The Metrics API also helps you monitor your model serving through default and customizable metrics.
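As a hedged sketch (again assuming a model registered as my_model and the default management address), workers can be scaled through the Management API like this:

import requests

# Ask Torchserve to keep between 2 and 4 workers for the model.
response = requests.put(
    "http://localhost:8081/models/my_model",
    params={"min_worker": 2, "max_worker": 4},
)
print(response.status_code, response.text)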

        + +

Other advanced settings, such as the length of the queue for received requests, the maximum wait time for a batch of inputs, and many other properties, are configurable through a config file that can be passed to Torchserve when it is started.
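A minimal sketch of such a config file, assuming the default addresses and purely illustrative values (batching parameters themselves are set per model, for example at registration time as shown later in this post):

# config.properties (illustrative values, not from the original post)
inference_address=http://0.0.0.0:8080
management_address=http://0.0.0.0:8081
metrics_address=http://0.0.0.0:8082
# Number of inference requests that can wait in the queue per model
job_queue_size=100
# Workers created for each model unless overridden at registration
default_workers_per_model=2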

        + +

        Steps to serve your model with Torchserve

        + +
1. Install Torchserve, model archiver and its requirements.
2. Choose a default handler that fits your task (e.g. image classification) or author a custom handler.
3. Package your model artifacts (trained model checkpoint and all other necessary files for loading and running your model) and the handler into a ".mar" file using Torcharchive and place it in the model store.
4. Start serving your model.
5. Run inference.

We will discuss model handlers and metrics in more detail here.
        + +

        Model handlers

        + +

Torchserve uses a handler in the backend to load the models, preprocess the received data, run inference, and post-process the response. A handler in Torchserve is a Python script that contains all the model initialization, preprocessing, inference, and post-processing logic.

        + +

        Torchserve provides an out of the box handler for a number of applications like image classification, segmentation, object detection and text classification. It also supports custom handlers, in case your use case is not supported in default handlers.

        + +

Custom handlers provide great flexibility, which potentially makes Torchserve a multi-framework serving tool. Custom handlers let you define custom logic to initialize a model, which can also be used to load models from other frameworks such as ONNX.

        + +

A Torchserve handler is made of four main functions, initialize, preprocess, inference, and postprocess, each of which returns a list. The code snippet below shows an example of a custom handler. Custom handlers inherit from BaseHandler in Torchserve and can overwrite any of the main functions. Here is an example of the handler used for loading the Detectron2 model for figure detection; this model has been exported to Torchscript and uses model.half() to run the inference with FP16. Details are explained in another section of this post.

        + +
        
# Imports assumed by this handler (not shown in the original snippet).
import io
import json
import os

import cv2
import numpy as np
import torch
from ts.torch_handler.base_handler import BaseHandler


class MyModelHandler(BaseHandler):
    def initialize(self, context):
        self.manifest = context.manifest
        properties = context.system_properties
        model_dir = properties.get("model_dir")
        serialized_file = self.manifest["model"]["serializedFile"]
        model_pt_path = os.path.join(model_dir, serialized_file)

        self.device = torch.device(
            "cuda:" + str(properties.get("gpu_id"))
            if torch.cuda.is_available() and properties.get("gpu_id") is not None
            else "cpu"
        )
        self.model = torch.jit.load(model_pt_path, map_location=self.device)

        # Run inference in FP16.
        self.model = self.model.half()

    def preprocess(self, data):
        inputs = []
        for request in data:
            request_body = request.get("body")

            input_ = io.BytesIO(request_body)
            image = cv2.imdecode(np.frombuffer(input_.read(), np.uint8), 1)
            input = torch.Tensor(image).permute(2, 0, 1)
            input = input.to(self.device)
            input = input.half()
            inputs.append({"image": input})

        return inputs

    def inference(self, inputs):
        # The Torchscripted Detectron2 model is assumed to take a list of
        # per-image input dicts and return a list of prediction dicts.
        predictions = self.model(inputs)
        return predictions

    def postprocess(self, output):
        responses = []
        for inference_output in output:
            responses_json = {
                'classes': inference_output['pred_classes'].tolist(),
                'scores': inference_output['scores'].tolist(),
                "boxes": inference_output['pred_boxes'].tolist()
            }
            responses.append(json.dumps(responses_json))

        return responses
        + +

        Metrics

        + +

        An essential component in serving models in production is the ability to monitor them. Torchserve collects system level metrics regularly and allows adding custom metrics as well.

        + +

        System level metrics consist of CPU utilization, available and used disk space and memory on the host machine along with number of requests with different response codes (e.g 200-300, 400-500 and above 500). Custom metrics can be added to the metrics as explained here. TorchServe logs these two sets of metrics to different log files. Metrics are collected by default at:

        + +
• System metrics - log_directory/ts_metrics.log
• Custom metrics - log_directory/model_metrics.log
        + +

As mentioned before, Torchserve also exposes the Metrics API, which by default listens on port 8082 and enables users to query and monitor the collected metrics. The default metrics endpoint returns Prometheus-formatted metrics. You can query metrics using curl requests or point a Prometheus server to the endpoint and use Grafana for dashboards.

        + +

        While serving a model you can query metrics using curl request as follows:

        + +
curl http://127.0.0.1:8082/metrics
        + +

        In case you are looking into exporting the logged metrics, please refer to this example that uses mtail to export metrics to Prometheus. Tracking these metrics in a dashboard allows you to monitor performance regressions that may have been sporadic or hard to spot during an offline benchmark run.

        + +

        What to consider for tuning performance of a model in production

        + +

The workflow suggested in Fig 1 is the general idea of how to approach model deployment in production with TorchServe.

        + +

In many cases, serving models in production is optimized based on throughput or latency service level agreements (SLAs). Usually real-time applications are more concerned about latency, whereas offline applications may care more about higher throughput.

        + +

There are a number of main factors contributing to the performance of a model served in production. In particular, we focus on serving PyTorch models with TorchServe here; however, most of these factors generalize to models from other frameworks as well.

        + +
• Model optimizations: this is a pre-step for deploying models into production. This is a very broad discussion that we will get into in a series of future blogs. It includes techniques like quantization, pruning to decrease the size of the model, using intermediate representations (IR graphs) such as TorchScript in PyTorch, fusing kernels, and many others. Currently torchprep provides many of these techniques as a CLI tool.
• Batch inference: this refers to feeding multiple inputs into a model. While it is essential during training, it can be very helpful for managing cost at inference time as well. Hardware accelerators are optimized for parallelism, and batching helps saturate the compute capacity, which often leads to higher throughput. The main difference at inference time is that you can’t wait too long to get a batch filled from clients, something we call dynamic batching.
• Number of workers: TorchServe uses workers to serve models. TorchServe workers are Python processes that hold a copy of the model weights for running inference. Too few workers means you’re not benefiting from enough parallelism, but too many can cause worker contention and degrade end-to-end performance.
• Hardware: choosing the appropriate hardware based on the model, application, and latency/throughput budget. This could be one of the hardware options supported by TorchServe: CPU, GPU, or AWS Inferentia. Some hardware configurations are intended for best-in-class performance and others are better suited for cost-effective inference. From our experiments we’ve found that GPUs shine best at larger batch sizes, whereas the right CPUs and AWS Inferentia can be far more cost effective for lower batch sizes and low latency.

        Best Practices for Performance tuning on Torchserve

        + +

To get the best performance out of your model while serving it with TorchServe, we share some best practices here. TorchServe provides a benchmark suite that offers helpful insights for making informed decisions on the different choices detailed below.

        + +
• Optimize your model as the first step; see the PyTorch model optimization tutorials. Model optimization choices are also closely tied to the hardware of choice. We will discuss this in more detail in another blog post.
• Deciding the hardware for model deployment is closely related to the latency and throughput budget and the cost per inference. Depending on the size of the model and the application it can vary; for some models, like computer vision models, it has historically not been affordable to run in production on CPU. However, with optimizations such as IPEX recently added to TorchServe, this has become much more affordable and cost beneficial, and you can learn more in this investigative case study.
• Workers in TorchServe are Python processes that provide parallelism, so the number of workers should be set carefully. By default TorchServe launches a number of workers equal to the VCPUs or available GPUs on the host, which can add a considerable amount of time to TorchServe startup.

  TorchServe exposes a config property to set the number of workers. To provide efficient parallelism through multiple workers while avoiding contention over resources, as a baseline we recommend the following settings on CPU and GPU:

  CPU: In the handler, call torch.set_num_threads(1), then set the number of workers to num physical cores / 2. The best threading configurations can be achieved by leveraging the Intel CPU launcher script.

  GPU: The number of available GPUs can be set through number_gpus in config.properties. TorchServe uses round robin to assign workers to GPUs. We recommend setting the number of workers as follows: Number of workers = (Number of available GPUs) / (Number of unique models). Note that pre-Ampere GPUs do not provide any resource isolation with Multi-Instance GPU.

• Batch size can directly affect latency and throughput. To better utilize the compute resources, the batch size needs to be increased; however, there is a tradeoff between latency and throughput. Larger batch sizes can increase throughput but result in higher latency as well. Batch size can be set in TorchServe in two ways: either through the model config in config.properties or while registering the model using the Management API (see the sketch after this list).
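To make the last point concrete, here is a minimal sketch, assuming a locally running TorchServe with the default Management API port (8081), of registering a model with an explicit batch size and worker count; the archive name and the numeric values are illustrative, not the ones used in this case study:

```python
import requests

# Register an archived model with an explicit batch size, batch delay and worker count.
# "maskrcnn.mar" and the numbers below are placeholders, not the values used in this post.
response = requests.post(
    "http://127.0.0.1:8081/models",
    params={
        "url": "maskrcnn.mar",    # .mar file placed in the model store
        "batch_size": 8,          # max number of requests aggregated into one batch
        "max_batch_delay": 100,   # ms to wait for the batch to fill before running inference
        "initial_workers": 1,     # number of worker processes to start for this model
    },
)
print(response.status_code, response.text)
```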

In the next section, we are going to use the TorchServe benchmark suite to decide the best combination of model optimization, hardware, workers, and batch size.

        + +

        Animated Drawings Performance Tuning

        + +

To use the TorchServe benchmark suite, first we need to have an archived “.mar” file, as discussed above, that contains the model, handler, and all other artifacts needed to load and run inference. Animated Drawings uses Detectron2’s implementation of Mask-RCNN for an object detection model.

        + +

        How to run benchmark suite

        + +

The automated benchmark suite in TorchServe lets you benchmark multiple models with different settings, including batch size and number of workers, and finally generates a report for you. To get started:

        + +
git clone https://github.com/pytorch/serve.git
+cd serve/benchmarks
+pip install -r requirements-ab.txt
+apt-get install apache2-utils
        + +

        Model level settings can be configured in a yaml file similar to

        + +
        
        +Model_name:
        +    eager_mode:
        +        benchmark_engine: "ab"
        +        url: "Path to .mar file"
        +        workers:
        +            - 1
        +            - 4
        +        batch_delay: 100
        +        batch_size:
        +            - 1
        +            - 2
        +            - 4
        +            - 8
        +        requests: 10000
        +        concurrency: 10
        +        input: "Path to model input"
        +        backend_profiling: False
        +        exec_env: "local"
        +        processors:
        +            - "cpu"
        +            - "gpus": "all"
        +
        +
        + +

This yaml file is referenced in the benchmark_config_template.yaml file, which includes other settings for generating reports; this can optionally work with AWS CloudWatch for logs as well.

        + +
        python benchmarks/auto_benchmark.py --input benchmark_config_template.yaml
        +
        + +

After running the benchmarks, the results are written to a CSV file that can be found at “/tmp/benchmark/ab_report.csv” and a full report at “/tmp/ts_benchmark/report.md”. They include items such as TorchServe average latency, model P99 latency, throughput, concurrency, number of requests, handler time, and some other metrics. Here we focus on some of the important ones that we track to tune the performance: concurrency, model P99 latency, and throughput. We look at these numbers specifically in combination with batch size, the device used, the number of workers, and whether any model optimization has been done.

        + +

The latency SLA for this model has been set to 100 ms. This is a real-time application, so as we discussed earlier, latency is the main concern, and throughput should ideally be as high as possible as long as it does not violate the latency SLA.
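As a hedged illustration of how such an SLA can be applied to the generated report, the following sketch filters the benchmark results for configurations that meet the 100 ms P99 budget; the column names are assumptions for illustration, so check them against the header of your own ab_report.csv:

```python
import pandas as pd

# Load the generated report and keep only the runs that meet a 100 ms P99 latency SLA.
# The column names below are illustrative; adjust them to match your ab_report.csv header.
df = pd.read_csv("/tmp/benchmark/ab_report.csv")
within_sla = df[df["model_p99_latency"] <= 100.0]

# Among the runs that meet the SLA, prefer the highest throughput.
print(within_sla.sort_values("throughput", ascending=False).head())
```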

        + +

Searching the space over different batch sizes (1-32), numbers of workers (1-16), and devices (CPU, GPU), we ran a set of experiments; the best results are summarized in the table below.

| Device | Concurrency | # Requests | # Workers | Batch size | Payload/image | Optimization | Throughput | Latency P99 |
|---|---|---|---|---|---|---|---|---|
| CPU | 10 | 1000 | 1 | 1 | small | N/A | 3.45 | 305.3 ms |
| CPU | 1 | 1000 | 1 | 1 | small | N/A | 3.45 | 291.8 ms |
| GPU | 10 | 1000 | 1 | 1 | small | N/A | 41.05 | 25.48 ms |
| GPU | 1 | 1000 | 1 | 1 | small | N/A | 42.21 | 23.6 ms |
| GPU | 10 | 1000 | 1 | 4 | small | N/A | 54.78 | 73.62 ms |
| GPU | 10 | 1000 | 1 | 4 | small | model.half() | 78.62 | 50.69 ms |
| GPU | 10 | 1000 | 1 | 8 | small | model.half() | 85.29 | 94.4 ms |

The latency of this model on CPU, with all of the settings we tried in terms of batch size, concurrency, and number of workers, did not meet the SLA; in fact, it was ~13x higher.

        + +

Moving the model serving to GPU immediately improved the latency ~13x, from 305 ms down to 23.6 ms.

        + +

One of the simplest optimizations we could apply to the model was lowering its precision to fp16. It is a one-liner (model.half()) and reduced the model P99 latency by 32% while increasing the throughput by almost the same amount.

        + +

Other optimizations, such as TorchScripting the model and using optimize_for_inference, or other tricks including ONNX or TensorRT runtime optimizations which leverage aggressive fusions, are out of the scope of this post. We will discuss model optimizations in a separate post.

        + +

We found that on both CPU and GPU, setting number of workers = 1 worked best in this case.

        + +
• Moving the model to GPU, using number of workers = 1 and batch size = 1, increased the throughput ~12x and reduced the latency ~13x compared to CPU.
• Moving the model to GPU, using model.half(), number of workers = 1, and batch size = 8 yielded the best results in terms of throughput with a tolerable latency: throughput increased ~25x compared to CPU, with latency still meeting the SLA (94.4 ms).

Note: if you are running the benchmark suite, make sure you set a proper batch_delay and set the request concurrency to a number proportional to your batch size. Concurrency here means the number of concurrent requests being sent to the server.

        + +

        Conclusion

        + +

In this post, we have discussed the considerations and knobs that TorchServe exposes to tune performance in production. We have discussed the TorchServe benchmark suite as a means to tune performance and get insights on possible choices for model optimizations, hardware choice, and cost in general. We used the Animated Drawings app, which uses Detectron2’s Mask-RCNN model, as a case study to showcase performance tuning with the benchmark suite.

        + +

For more details on performance tuning in TorchServe please refer to our documentation here. Also feel free to open a ticket on the TorchServe repo for any further questions and feedback.

        + +

        Acknowledgement

        + +

        We would like to thank Somya Jain (Meta), Christopher Gustave (Meta) for their great support and guidance throughout many steps of this blog and providing insights to Sketch Animator workflow. Also, special thanks to Li Ning from AWS for the great efforts to make performance tuning much easier on Torchserve with automated benchmark suite.

diff --git a/blog/torchtune-fine-tune-llms/index.html b/blog/torchtune-fine-tune-llms/index.html
new file mode 100644
index 000000000000..65869f2476d1
--- /dev/null
+++ b/blog/torchtune-fine-tune-llms/index.html
@@ -0,0 +1,694 @@

torchtune: Easily fine-tune LLMs using PyTorch | PyTorch

by Team PyTorch

        +

        We’re pleased to announce the alpha release of torchtune, a PyTorch-native library for easily fine-tuning large language models.

        + +

        Staying true to PyTorch’s design principles, torchtune provides composable and modular building blocks along with easy-to-extend training recipes to fine-tune popular LLMs on a variety of consumer-grade and professional GPUs.

        + +

        torchtune supports the full fine-tuning workflow from start to finish, including

        + +
          +
        • Downloading and preparing datasets and model checkpoints.
        • +
        • Customizing the training with composable building blocks that support different model architectures, parameter-efficient fine-tuning (PEFT) techniques, and more.
        • +
        • Logging progress and metrics to gain insight into the training process.
        • +
        • Quantizing the model post-tuning.
        • +
        • Evaluating the fine-tuned model on popular benchmarks.
        • +
        • Running local inference for testing fine-tuned models.
        • +
        • Checkpoint compatibility with popular production inference systems.
        • +
        + +

        To get started, jump right into the code or walk through our many tutorials!

        + +

        Why torchtune?

        + +

        Over the past year there has been an explosion of interest in open LLMs. Fine-tuning these state of the art models has emerged as a critical technique for adapting them to specific use cases. This adaptation can require extensive customization from dataset and model selection all the way through to quantization, evaluation and inference. Moreover, the size of these models poses a significant challenge when trying to fine-tune them on consumer-level GPUs with limited memory.

        + +

        Existing solutions make it hard to add these customizations or optimizations by hiding the necessary pieces behind layers of abstractions. It’s unclear how different components interact with each other and which of these need to be updated to add new functionality. torchtune empowers developers to adapt LLMs to their specific needs and constraints with full control and visibility.

        + +

        torchtune’s Design

        + +

        torchtune was built with the following principles in mind

        + +
          +
        • Easy extensibility - New techniques emerge all the time and everyone’s fine-tuning use case is different. torchtune’s recipes are designed around easily composable components and hackable training loops, with minimal abstraction getting in the way of fine-tuning your fine-tuning. Each recipe is self-contained - no trainers or frameworks, and is designed to be easy to read - less than 600 lines of code!
        • +
        • Democratize fine-tuning - Users, regardless of their level of expertise, should be able to use torchtune. Clone and modify configs, or get your hands dirty with some code! You also don’t need beefy data center GPUs. Our memory efficient recipes have been tested on machines with a single 24GB gaming GPU.
        • +
        • Interoperability with the OSS LLM ecosystem - The open source LLM ecosystem is absolutely thriving, and torchtune takes advantage of this to provide interoperability with a wide range of offerings. This flexibility puts you firmly in control of how you train and use your fine-tuned models.
        • +
        + +

        Over the next year, open LLMs will become even more powerful, with support for more languages (multilingual), more modalities (multimodal) and more tasks. As the complexity of these models increases, we need to pay the same attention to “how” we design our libraries as we do to the features provided or performance of a training run. Flexibility will be key to ensuring the community can maintain the current pace of innovation, and many libraries/tools will need to play well with each other to power the full spectrum of use cases. torchtune is built from the ground up with this future in mind.

        + +

        In the true PyTorch spirit, torchtune makes it easy to get started by providing integrations with some of the most popular tools for working with LLMs.

        + +
          +
        • Hugging Face Hub - Hugging Face provides an expansive repository of open source models and datasets for fine-tuning. torchtune seamlessly integrates through the tune download CLI command so you can get started right away with fine-tuning your first model.
        • +
        • PyTorch FSDP - Scale your training using PyTorch FSDP. It is very common for people to invest in machines with multiple consumer level cards like the 3090/4090 by NVidia. torchtune allows you to take advantage of these setups by providing distributed recipes powered by FSDP.
        • +
        • Weights & Biases - torchtune uses the Weights & Biases AI platform to log metrics and model checkpoints during training. Track your configs, metrics and models from your fine-tuning runs all in one place!
        • +
        • EleutherAI’s LM Evaluation Harness - Evaluating fine-tuned models is critical to understanding whether fine-tuning is giving you the results you need. torchtune includes a simple evaluation recipe powered by EleutherAI’s LM Evaluation Harness to provide easy access to a comprehensive suite of standard LLM benchmarks. Given the importance of evaluation, we will be working with EleutherAI very closely in the next few months to build an even deeper and more “native” integration.
        • +
        • ExecuTorch - Models fine-tuned with torchtune can be easily exported to ExecuTorch, enabling efficient inference to be run on a wide variety of mobile and edge devices.
        • +
        • torchao - Easily and efficiently quantize your fine-tuned models into 4-bit or 8-bit using a simple post-training recipe powered by the quantization APIs from torchao.
        • +
        + +

        What’s Next?

        + +

        This is just the beginning and we’re really excited to put this alpha version in front of a vibrant and energetic community. In the coming weeks, we’ll continue to augment the library with more models, features and fine-tuning techniques. We’d love to hear any feedback, comments or feature requests in the form of GitHub issues on our repository, or on our Discord channel. As always, we’d love any contributions from this awesome community. Happy Tuning!

diff --git a/blog/torchvision-mobilenet-v3-implementation/index.html b/blog/torchvision-mobilenet-v3-implementation/index.html
new file mode 100644
index 000000000000..6c08e8cff4cc
--- /dev/null
+++ b/blog/torchvision-mobilenet-v3-implementation/index.html
@@ -0,0 +1,1092 @@

Everything you need to know about TorchVision’s MobileNetV3 implementation | PyTorch

by Vasilis Vryniotis and Francisco Massa

        +

        In TorchVision v0.9, we released a series of new mobile-friendly models that can be used for Classification, Object Detection and Semantic Segmentation. In this article, we will dig deep into the code of the models, share notable implementation details, explain how we configured and trained them, and highlight important tradeoffs we made during their tuning. Our goal is to disclose technical details that typically remain undocumented in the original papers and repos of the models.

        + +

        Network Architecture

        + +

        The implementation of the MobileNetV3 architecture follows closely the original paper. It is customizable and offers different configurations for building Classification, Object Detection and Semantic Segmentation backbones. It was designed to follow a similar structure to MobileNetV2 and the two share common building blocks.

        + +

        Off-the-shelf, we offer the two variants described on the paper: the Large and the Small. Both are constructed using the same code with the only difference being their configuration which describes the number of blocks, their sizes, their activation functions etc.

        + +

        Configuration parameters

        + +

        Even though one can write a custom InvertedResidual setting and pass it to the MobileNetV3 class directly, for the majority of applications we can adapt the existing configs by passing parameters to the model building methods. Some of the key configuration parameters are the following:

        + +
          +
        • +

The width_mult parameter is a multiplier that affects the number of channels of the model. The default value is 1, and by increasing or decreasing it one can change the number of filters of all convolutions, including the ones of the first and last layers. The implementation ensures that the number of filters is always a multiple of 8. This is a hardware optimization trick which allows for faster vectorization of operations (a sketch of this rounding rule follows this list).

          +
        • +
        • +

          The reduced_tail parameter halves the number of channels on the last blocks of the network. This version is used by some Object Detection and Semantic Segmentation models. It’s a speed optimization which is described on the MobileNetV3 paper and reportedly leads to a 15% latency reduction without a significant negative effect on accuracy.

          +
        • +
        • +

          The dilated parameter affects the last 3 InvertedResidual blocks of the model and turns their normal depthwise Convolutions to Atrous Convolutions. This is used to control the output stride of these blocks and has a significant positive effect on the accuracy of Semantic Segmentation models.

          +
        • +
        + +
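To make the multiple-of-8 rule mentioned in the width_mult bullet concrete, here is a minimal sketch of such a rounding helper; it is written for illustration and is not TorchVision’s exact internal code:

```python
def round_channels(channels: float, divisor: int = 8) -> int:
    """Round a channel count to the nearest multiple of `divisor`,
    without dropping more than 10% of the original value."""
    rounded = max(divisor, int(channels + divisor / 2) // divisor * divisor)
    if rounded < 0.9 * channels:
        rounded += divisor
    return rounded


# Example: scaling 112 channels by width_mult=0.75 gives 84, which rounds to 88.
print(round_channels(112 * 0.75))
```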

        Implementation details

        + +

        Below we provide additional information on some notable implementation details of the architecture. +The MobileNetV3 class is responsible for building a network out of the provided configuration. Here are some implementation details of the class:

        + +
          +
        • +

          The last convolution block expands the output of the last InvertedResidual block by a factor of 6. The implementation is aligned with the Large and Small configurations described on the paper and can adapt to different values of the multiplier parameter.

          +
        • +
        • +

          Similarly to other models such as MobileNetV2, a dropout layer is placed just before the final Linear layer of the classifier.

          +
        • +
        + +

        The InvertedResidual class is the main building block of the network. Here are some notable implementation details of the block along with its visualization which comes from Figure 4 of the paper:

        + +
          +
        • +

          There is no expansion step if the input channels and the expanded channels are the same. This happens on the first convolution block of the network.

          +
        • +
        • +

          There is always a projection step even when the expanded channels are the same as the output channels.

          +
        • +
        • +

          The activation method of the depthwise block is placed before the Squeeze-and-Excite layer as this improves marginally the accuracy.

          +
        • +
        + +
        + +
        + +

        Classification

        + +

        In this section we provide benchmarks of the pre-trained models and details on how they were configured, trained and quantized.

        + +

        Benchmarks

        + +

        Here is how to initialize the pre-trained models:

        +
        large = torchvision.models.mobilenet_v3_large(pretrained=True, width_mult=1.0,  reduced_tail=False, dilated=False)
        +small = torchvision.models.mobilenet_v3_small(pretrained=True)
        +quantized = torchvision.models.quantization.mobilenet_v3_large(pretrained=True)
        +
        + +

        Below we have the detailed benchmarks between new and selected previous models. As we can see MobileNetV3-Large is a viable replacement of ResNet50 for users who are willing to sacrifice a bit of accuracy for a roughly 6x speed-up:

| Model | Acc@1 | Acc@5 | Inference on CPU (sec) | # Params (M) |
|---|---|---|---|---|
| MobileNetV3-Large | 74.042 | 91.340 | 0.0411 | 5.48 |
| MobileNetV3-Small | 67.668 | 87.402 | 0.0165 | 2.54 |
| Quantized MobileNetV3-Large | 73.004 | 90.858 | 0.0162 | 2.96 |
| MobileNetV2 | 71.880 | 90.290 | 0.0608 | 3.50 |
| ResNet50 | 76.150 | 92.870 | 0.2545 | 25.56 |
| ResNet18 | 69.760 | 89.080 | 0.1032 | 11.69 |

        Note that the inference times are measured on CPU. They are not absolute benchmarks, but they allow for relative comparisons between models.

        + +

        Training process

        + +

        All pre-trained models are configured with a width multiplier of 1, have full tails, are non-dilated, and were fitted on ImageNet. Both the Large and Small variants were trained using the same hyper-parameters and scripts which can be found in our references folder. Below we provide details on the most notable aspects of the training process.

        + +

        Achieving fast and stable training

        + +

Configuring RMSProp correctly was crucial to achieve fast training with numerical stability. The authors of the paper used TensorFlow in their experiments, and in their runs they reported using a quite high rmsprop_epsilon compared to the default. Typically this hyper-parameter takes small values as it’s used to avoid zero denominators, but in this specific model choosing the right value seems important to avoid numerical instabilities in the loss.

        + +

        Another important detail is that though PyTorch’s and TensorFlow’s RMSProp implementations typically behave similarly, there are a few differences with the most notable in our setup being how the epsilon hyperparameter is handled. More specifically, PyTorch adds the epsilon outside of the square root calculation while TensorFlow adds it inside. The result of this implementation detail is that one needs to adjust the epsilon value while porting the hyper parameter of the paper. A reasonable approximation can be taken with the formula PyTorch_eps = sqrt(TF_eps).
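For illustration, here is a minimal sketch of applying that rule when setting up the optimizer; the TensorFlow epsilon and the other optimizer values below are hypothetical, not the published recipe:

```python
import math

import torch
import torchvision

model = torchvision.models.mobilenet_v3_large()

# Hypothetical example: if a TensorFlow recipe used rmsprop_epsilon = 1e-3,
# the roughly equivalent PyTorch value is sqrt(1e-3) ≈ 0.0316, because PyTorch
# adds eps outside the square root while TensorFlow adds it inside.
tf_eps = 1e-3
optimizer = torch.optim.RMSprop(
    model.parameters(),
    lr=0.064,              # illustrative learning rate
    momentum=0.9,
    weight_decay=1e-5,
    eps=math.sqrt(tf_eps),
)
```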

        + +

        Increasing our accuracy by tuning hyperparameters & improving our training recipe

        + +

After configuring the optimizer to achieve fast and stable training, we turned to optimizing the accuracy of the model. There are a few techniques that helped us achieve this. First of all, to avoid overfitting we augmented our data using the AutoAugment algorithm, followed by RandomErasing. Additionally we tuned parameters such as the weight decay using cross validation. We also found it beneficial to perform weight averaging across different epoch checkpoints after the end of the training. Finally, though not used in our published training recipe, we found that using Label Smoothing, Stochastic Depth and LR noise injection improves the overall accuracy by over 1.5 points.

        + +

The graph and table depict a simplified summary of the most important iterations for improving the accuracy of the MobileNetV3 Large variant. Note that the actual number of iterations done while training the model was significantly larger and that the progress in accuracy was not always monotonically increasing. Also note that the Y-axis of the graph starts from 70% instead of from 0% to make the difference between iterations more visible:

        + +
        + +
| Iteration | Acc@1 | Acc@5 |
|---|---|---|
| Baseline with “MobileNetV2-style” Hyperparams | 71.542 | 90.068 |
| + RMSProp with default eps | 70.684 | 89.38 |
| + RMSProp with adjusted eps & LR scheme | 71.764 | 90.178 |
| + Data Augmentation & Tuned Hyperparams | 73.86 | 91.292 |
| + Checkpoint Averaging | 74.028 | 91.382 |
| + Label Smoothing & Stochastic Depth & LR noise | 75.536 | 92.368 |

Note that once we achieved an acceptable accuracy, we verified the model performance on the hold-out test dataset which hasn’t been used before for training or hyper-parameter tuning. This process helps us detect overfitting and is always performed for all pre-trained models prior to their release.

        + +

        Quantization

        + +

We currently offer quantized weights for the QNNPACK backend of the MobileNetV3-Large variant which provides a speed-up of 2.5x. To quantize the model, Quantization Aware Training (QAT) was used. The hyper-parameters and the scripts used to train the model can be found in our references folder.
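As a minimal sketch (assuming a host where the QNNPACK quantized engine is available), the quantized weights can be loaded and used like this; the quantize flag and backend selection are shown for illustration:

```python
import torch
import torchvision

# Select the quantized engine the weights were prepared for and load the QAT weights.
torch.backends.quantized.engine = "qnnpack"
model = torchvision.models.quantization.mobilenet_v3_large(pretrained=True, quantize=True)
model.eval()

# Run a forward pass on a dummy batch; real usage would pass normalized ImageNet-style inputs.
with torch.no_grad():
    out = model(torch.rand(1, 3, 224, 224))
print(out.shape)
```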

        + +

Note that QAT allows us to model the effects of quantization and adjust the weights so that we can improve the model accuracy. This translates to an accuracy increase of 1.8 points compared to simple post-training quantization:

| Quantization Status | Acc@1 | Acc@5 |
|---|---|---|
| Non-quantized | 74.042 | 91.340 |
| Quantization Aware Training | 73.004 | 90.858 |
| Post-training Quantization | 71.160 | 89.834 |

        Object Detection

        + +

        In this section, we will first provide benchmarks of the released models, and then discuss how the MobileNetV3-Large backbone was used in a Feature Pyramid Network along with the FasterRCNN detector to perform Object Detection. We will also explain how the network was trained and tuned alongside with any tradeoffs we had to make. We will not cover details about how it was used with SSDlite as this will be discussed on a future article.

        + +

        Benchmarks

        + +

        Here is how the models are initialized:

        +
        high_res = torchvision.models.detection.fasterrcnn_mobilenet_v3_large_fpn(pretrained=True) 
        +low_res = torchvision.models.detection.fasterrcnn_mobilenet_v3_large_320_fpn(pretrained=True)
        +
        + +

Below are some benchmarks between new and selected previous models. As we can see, the high-resolution Faster R-CNN with MobileNetV3-Large FPN backbone seems a viable replacement of the equivalent ResNet50 model for those users who are willing to sacrifice a few accuracy points for a 5x speed-up:

| Model | mAP | Inference on CPU (sec) | # Params (M) |
|---|---|---|---|
| Faster R-CNN MobileNetV3-Large FPN (High-Res) | 32.8 | 0.8409 | 19.39 |
| Faster R-CNN MobileNetV3-Large 320 FPN (Low-Res) | 22.8 | 0.1679 | 19.39 |
| Faster R-CNN ResNet-50 FPN | 37.0 | 4.1514 | 41.76 |
| RetinaNet ResNet-50 FPN | 36.4 | 4.8825 | 34.01 |

        Implementation details

        + +

        The Detector uses a FPN-style backbone which extracts features from different convolutions of the MobileNetV3 model. By default the pre-trained model uses the output of the 13th InvertedResidual block and the output of the Convolution prior to the pooling layer but the implementation supports using the outputs of more stages.

        + +

        All feature maps extracted from the network have their output projected down to 256 channels by the FPN block as this greatly improves the speed of the network. These feature maps provided by the FPN backbone are used by the FasterRCNN detector to provide box and class predictions at different scales.

        + +

        Training & Tuning process

        + +

        We currently offer two pre-trained models capable of doing object detection at different resolutions. Both models were trained on the COCO dataset using the same hyper-parameters and scripts which can be found in our references folder.

        + +

The High Resolution detector was trained with images of 800-1333px, while the mobile-friendly Low Resolution detector was trained with images of 320-640px. The reason why we provide two separate sets of pre-trained weights is that training a detector directly on the smaller images leads to a 5 mAP increase in precision compared to passing small images to the pre-trained high-res model. Both backbones were initialized with weights fitted on ImageNet and the last 3 stages of their weights were fine-tuned during the training process.

        + +

        An additional speed optimization can be applied on the mobile-friendly model by tuning the RPN NMS thresholds. By sacrificing only 0.2 mAP of precision we were able to improve the CPU speed of the model by roughly 45%. The details of the optimization can be seen below:

| Tuning Status | mAP | Inference on CPU (sec) |
|---|---|---|
| Before | 23.0 | 0.2904 |
| After | 22.8 | 0.1679 |
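As a hedged illustration of this kind of RPN tuning, the sketch below builds the mobile-friendly detector while overriding a couple of RPN proposal knobs at construction time; the specific values are placeholders for experimentation, not the settings used for the released weights:

```python
import torch
import torchvision

# Override a couple of RPN knobs to trade a little precision for speed.
# The numbers below are placeholders for experimentation, not the released configuration.
model = torchvision.models.detection.fasterrcnn_mobilenet_v3_large_320_fpn(
    pretrained=True,
    rpn_pre_nms_top_n_test=150,
    rpn_post_nms_top_n_test=150,
)
model.eval()

with torch.no_grad():
    predictions = model([torch.rand(3, 320, 320)])
print(predictions[0]["boxes"].shape)
```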

        Below we provide some examples of visualizing the predictions of the Faster R-CNN MobileNetV3-Large FPN model:

        + +
        + +
        + +

        Semantic Segmentation

        + +

        In this section we will start by providing some benchmarks of the released pre-trained models. Then we will discuss how a MobileNetV3-Large backbone was combined with segmentation heads such as LR-ASPP, DeepLabV3 and the FCN to conduct Semantic Segmentation. We will also explain how the network was trained and propose a few optional optimization techniques for speed critical applications.

        + +

        Benchmarks

        + +

        This is how to initialize the pre-trained models:

        + +
        lraspp = torchvision.models.segmentation.lraspp_mobilenet_v3_large(pretrained=True) 
        +deeplabv3 = torchvision.models.segmentation.deeplabv3_mobilenet_v3_large(pretrained=True)
        +
        + +

Below are the detailed benchmarks between new and selected existing models. As we can see, the DeepLabV3 with a MobileNetV3-Large backbone is a viable replacement of FCN with ResNet50 for the majority of applications as it achieves similar accuracy with an 8.5x speed-up. We also observe that the LR-ASPP network supersedes the equivalent FCN in all metrics:

| Model | mIoU | Global Pixel Acc | Inference on CPU (sec) | # Params (M) |
|---|---|---|---|---|
| LR-ASPP MobileNetV3-Large | 57.9 | 91.2 | 0.3278 | 3.22 |
| DeepLabV3 MobileNetV3-Large | 60.3 | 91.2 | 0.5869 | 11.03 |
| FCN MobileNetV3-Large (not released) | 57.8 | 90.9 | 0.3702 | 5.05 |
| DeepLabV3 ResNet50 | 66.4 | 92.4 | 6.3531 | 39.64 |
| FCN ResNet50 | 60.5 | 91.4 | 5.0146 | 32.96 |

        Implementation details

        + +

        In this section we will discuss important implementation details of tested segmentation heads. Note that all models described in this section use a dilated MobileNetV3-Large backbone.

        + +

        LR-ASPP

        + +

        The LR-ASPP is the Lite variant of the Reduced Atrous Spatial Pyramid Pooling model proposed by the authors of the MobileNetV3 paper. Unlike the other segmentation models in TorchVision, it does not make use of an auxiliary loss. Instead it uses low and high-level features with output strides of 8 and 16 respectively.

        + +

        Unlike the paper where a 49x49 AveragePooling layer with variable strides is used, our implementation uses an AdaptiveAvgPool2d layer to process the global features. This is because the authors of the paper tailored the head to the Cityscapes dataset while our focus is to provide a general purpose implementation that can work on multiple datasets. Finally our implementation always has a bilinear interpolation before returning the output to ensure that the sizes of the input and output images match exactly.

        + +

        DeepLabV3 & FCN

        + +

        The combination of MobileNetV3 with DeepLabV3 and FCN follows closely the ones of other models and the stage estimation for these methods is identical to LR-ASPP. The only notable difference is that instead of using high and low level features, we attach the normal loss to the feature map with output stride 16 and an auxiliary loss on the feature map with output stride 8.

        + +

        Finally we should note that the FCN version of the model was not released because it was completely superseded by the LR-ASPP both in terms of speed and accuracy. The pre-trained weights are still available and can be used with minimal changes to the code.

        + +

        Training & Tuning process

        + +

        We currently offer two MobileNetV3 pre-trained models capable of doing semantic segmentation: the LR-ASPP and the DeepLabV3. The backbones of the models were initialized with ImageNet weights and trained end-to-end. Both architectures were trained on the COCO dataset using the same scripts with similar hyper-parameters. Their details can be found in our references folder.

        + +

        Normally, during inference the images are resized to 520 pixels. An optional speed optimization is to construct a Low Res configuration of the model by using the High-Res pre-trained weights and reducing the inference resizing to 320 pixels. This will improve the CPU execution times by roughly 60% while sacrificing a couple of mIoU points. The detailed numbers of this optimization can be found on the table below:

| Low-Res Configuration | mIoU Difference | Speed Improvement | mIoU | Global Pixel Acc | Inference on CPU (sec) |
|---|---|---|---|---|---|
| LR-ASPP MobileNetV3-Large | -2.1 | 65.26% | 55.8 | 90.3 | 0.1139 |
| DeepLabV3 MobileNetV3-Large | -3.8 | 63.86% | 56.5 | 90.3 | 0.2121 |
| FCN MobileNetV3-Large (not released) | -3.0 | 57.57% | 54.8 | 90.1 | 0.1571 |
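A minimal sketch of the Low-Res configuration described above, assuming the High-Res pre-trained weights and a hypothetical input image example.jpg; the normalization values are the usual ImageNet statistics and are shown for illustration:

```python
import torch
import torchvision
from PIL import Image
from torchvision import transforms

# Reuse the High-Res pre-trained weights but resize inputs to 320px instead of the default 520px.
model = torchvision.models.segmentation.lraspp_mobilenet_v3_large(pretrained=True).eval()

preprocess = transforms.Compose([
    transforms.Resize(320),  # low-res configuration: smaller side resized to 320px
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

image = Image.open("example.jpg").convert("RGB")  # hypothetical input image
batch = preprocess(image).unsqueeze(0)

with torch.no_grad():
    masks = model(batch)["out"].argmax(dim=1)
print(masks.shape)
```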

        Here are some examples of visualizing the predictions of the LR-ASPP MobileNetV3-Large model:

        + +
        + +
        + +

        We hope that you found this article interesting. We are looking forward to your feedback to see if this is the type of content you would like us to publish more often. If the community finds that such posts are useful, we will be happy to publish more articles that cover the implementation details of newly introduced Machine Learning models.

diff --git a/blog/torchvision-ssd-implementation/index.html b/blog/torchvision-ssd-implementation/index.html
new file mode 100644
index 000000000000..3d0913639eaf
--- /dev/null
+++ b/blog/torchvision-ssd-implementation/index.html
@@ -0,0 +1,802 @@

Everything You Need To Know About Torchvision’s SSD Implementation | PyTorch

by Vasilis Vryniotis

        +

        In TorchVision v0.10, we’ve released two new Object Detection models based on the SSD architecture. Our plan is to cover the key implementation details of the algorithms along with information on how they were trained in a two-part article.

        + +

In part 1 of the series, we will focus on the original implementation of the SSD algorithm as described in the Single Shot MultiBox Detector paper. We will briefly give a high-level description of how the algorithm works, then go through its main components, highlight key parts of its code, and finally discuss how we trained the released model. Our goal is to cover all the necessary details to reproduce the model, including those optimizations which are not covered in the paper but are part of the original implementation.

        + +

        How Does SSD Work?

        + +

        Reading the aforementioned paper is highly recommended but here is a quick oversimplified refresher. Our target is to detect the locations of objects in an image along with their categories. Here is the Figure 5 from the SSD paper with prediction examples of the model:

        + +
        + +
        + +

The SSD algorithm uses a CNN backbone, passes the input image through it and takes the convolutional outputs from different levels of the network. This list of outputs is called the feature maps. These feature maps are then passed through the Classification and Regression heads which are responsible for predicting the class and the location of the boxes.

        + +

        Since the feature maps of each image contain outputs from different levels of the network, their size varies and thus they can capture objects of different dimensions. On top of each, we tile several default boxes which can be thought as our rough prior guesses. For each default box, we predict whether there is an object (along with its class) and its offset (correction over the original location). During training time, we need to first match the ground truth to the default boxes and then we use those matches to estimate our loss. During inference, similar prediction boxes are combined to estimate the final predictions.

        + +

        The SSD Network Architecture

        + +

        In this section, we will discuss the key components of SSD. Our code follows closely the paper and makes use of many of the undocumented optimizations included in the official implementation.

        + +

        DefaultBoxGenerator

        + +

        The DefaultBoxGenerator class is responsible for generating the default boxes of SSD and operates similarly to the AnchorGenerator of FasterRCNN (for more info on their differences see pages 4-6 of the paper). It produces a set of predefined boxes of specific width and height which are tiled across the image and serve as the first rough prior guesses of where objects might be located. Here is Figure 1 from the SSD paper with a visualization of ground truths and default boxes:

        + +
        + +
        + +

        The class is parameterized by a set of hyperparameters that control their shape and tiling. The implementation will provide automatically good guesses with the default parameters for those who want to experiment with new backbones/datasets but one can also pass optimized custom values.

        + +

        SSDMatcher

        + +

        The SSDMatcher class extends the standard Matcher used by FasterRCNN and it is responsible for matching the default boxes to the ground truth. After estimating the IoUs of all combinations, we use the matcher to find for each default box the best candidate ground truth with overlap higher than the IoU threshold. The SSD version of the matcher has an extra step to ensure that each ground truth is matched with the default box that has the highest overlap. The results of the matcher are used in the loss estimation during the training process of the model.
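To illustrate the idea (this is a sketch, not TorchVision’s actual Matcher code), here is a minimal IoU-based matching routine with the extra SSD step described above:

```python
import torch
from torchvision.ops import box_iou

def match_default_boxes(gt_boxes, default_boxes, iou_threshold=0.5):
    # iou[i, j] is the overlap between ground truth i and default box j.
    iou = box_iou(gt_boxes, default_boxes)

    # For every default box, pick the ground truth with the highest overlap;
    # boxes below the threshold are marked as background (-1).
    best_gt_iou, best_gt_idx = iou.max(dim=0)
    matches = torch.where(
        best_gt_iou >= iou_threshold, best_gt_idx, torch.full_like(best_gt_idx, -1)
    )

    # Extra SSD step: every ground truth keeps the default box it overlaps the most.
    best_box_for_gt = iou.argmax(dim=1)
    matches[best_box_for_gt] = torch.arange(gt_boxes.shape[0])
    return matches
```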

        + +

        Classification and Regression Heads

        + +

        The SSDHead class is responsible for initializing the Classification and Regression parts of the network. Here are a few notable details about their code:

        + + + +

        Backbone Feature Extractor

        + +

        The feature extractor reconfigures and enhances a standard VGG backbone with extra layers as depicted on the Figure 2 of the SSD paper:

        + +
        + +
        + +

        The class supports all VGG models of TorchVision and one can create a similar extractor class for other types of CNNs (see this example for ResNet). Here are a few implementation details of the class:

        + +
          +
        • Patching the ceil_mode parameter of the 3rd Maxpool layer is necessary to get the same feature map sizes as the paper. This is due to small differences between PyTorch and the original Caffe implementation of the model.
        • +
• It adds a series of extra feature layers on top of VGG. If the highres parameter is True during its construction, it will append an extra convolution. This is useful for the SSD512 version of the model.
        • +
        • As discussed on section 3 of the paper, the fully connected layers of the original VGG are converted to convolutions with the first one using Atrous. Moreover maxpool5’s stride and kernel size is modified.
        • +
        • As described on section 3.1, L2 normalization is used on the output of conv4_3 and a set of learnable weights are introduced to control its scaling.
        • +
        + +

        SSD Algorithm

        + +

        The final key piece of the implementation is on the SSD class. Here are some notable details:

        + + + +

        Here are the two core methods of the implementation:

        + + + +

        The SSD300 VGG16 Model

        + +

        The SSD is a family of models because it can be configured with different backbones and different Head configurations. In this section, we will focus on the provided SSD pre-trained model. We will discuss the details of its configuration and the training process used to reproduce the reported results.
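For reference, here is a minimal sketch of creating the released model and running it on a dummy input (assuming TorchVision v0.10+); real usage would pass actual images as tensors scaled to [0, 1]:

```python
import torch
import torchvision

# Load the released SSD300 VGG16 detector.
model = torchvision.models.detection.ssd300_vgg16(pretrained=True)
model.eval()

# Detection models take a list of CHW tensors and return a list of dicts
# with "boxes", "labels" and "scores".
with torch.no_grad():
    predictions = model([torch.rand(3, 300, 300)])
print(predictions[0]["boxes"].shape, predictions[0]["scores"].shape)
```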

        + +

        Training process

        + +

        The model was trained using the COCO dataset and all of its hyper-parameters and scripts can be found in our references folder. Below we provide details on the most notable aspects of the training process.

        + +

        Paper Hyperparameters

        + +

        In order to achieve the best possible results on COCO, we adopted the hyperparameters described on the section 3 of the paper concerning the optimizer configuration, the weight regularization etc. Moreover we found it useful to adopt the optimizations that appear in the official implementation concerning the tiling configuration of the DefaultBox generator. This optimization was not described in the paper but it was crucial for improving the detection precision of smaller objects.

        + +

        Data Augmentation

        + +

Implementing the SSD Data Augmentation strategy as described on page 6 and page 12 of the paper was critical to reproducing the results. More specifically, the use of random “Zoom In” and “Zoom Out” transformations makes the model robust to various input sizes and improves its precision on small and medium objects. Finally, since the VGG16 has quite a few parameters, the photometric distortions included in the augmentations have a regularization effect and help avoid overfitting.

        + +

        Weight Initialization & Input Scaling

        + +

        Another aspect that we found beneficial was to follow the weight initialization scheme proposed by the paper. To do that, we had to adapt our input scaling method by undoing the 0-1 scaling performed by ToTensor() and use pre-trained ImageNet weights fitted with this scaling (shoutout to Max deGroot for providing them in his repo). All the weights of new convolutions were initialized using Xavier and their biases were set to zero. After initialization, the network was trained end-to-end.
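As an illustration of that initialization scheme (a sketch, not the exact reference script), the new layers could be initialized like this:

```python
import torch.nn as nn

def init_new_layers(module: nn.Module) -> None:
    # Xavier initialization for the newly added convolutions, with biases set to zero.
    # In the actual recipe this is applied only to the layers added on top of the VGG backbone.
    if isinstance(module, nn.Conv2d):
        nn.init.xavier_uniform_(module.weight)
        if module.bias is not None:
            nn.init.zeros_(module.bias)

# Usage (hypothetical): extra_layers.apply(init_new_layers)
```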

        + +

        LR Scheme

        + +

As reported in the paper, after applying aggressive data augmentations it’s necessary to train the models for longer. Our experiments confirm this and we had to tweak the learning rate, batch sizes and overall steps to achieve the best results. Our proposed learning scheme is configured to be rather on the safe side; it showed signs of plateauing between the steps, and thus one is likely to be able to train a similar model by doing only 66% of our epochs.

        + +

        Breakdown of Key Accuracy Improvements

        + +

It is important to note that implementing a model directly from a paper is an iterative process that circles between coding, training, bug fixing and adapting the configuration until we match the accuracies reported in the paper. Quite often it also involves simplifying the training recipe or enhancing it with more recent methodologies. It is definitely not a linear process where incremental accuracy improvements are achieved by improving a single direction at a time, but instead involves exploring different hypotheses, making incremental improvements in different aspects and doing a lot of backtracking.

        + +

        With that in mind, below we try to summarize the optimizations that affected our accuracy the most. We did this by grouping together the various experiments in 4 main groups and attributing the experiment improvements to the closest match. Note that the Y-axis of the graph starts from 18 instead from 0 to make the difference between optimizations more visible:

        + +
        + +
| Model Configuration | mAP delta | mAP |
|---|---|---|
| Baseline with “FasterRCNN-style” Hyperparams | - | 19.5 |
| + Paper Hyperparams | 1.6 | 21.1 |
| + Data Augmentation | 1.8 | 22.9 |
| + Weight Initialization & Input Scaling | 1.0 | 23.9 |
| + LR scheme | 1.2 | 25.1 |

        Our final model achieves an mAP of 25.1 and reproduces exactly the COCO results reported on the paper. Here is a detailed breakdown of the accuracy metrics.

        + +

We hope you found part 1 of the series interesting. In part 2, we will focus on the implementation of SSDlite and discuss its differences from SSD. Until then, we are looking forward to your feedback.

diff --git a/blog/torchvision-ssdlite-implementation/index.html b/blog/torchvision-ssdlite-implementation/index.html
new file mode 100644
index 000000000000..713e8833159d
--- /dev/null
+++ b/blog/torchvision-ssdlite-implementation/index.html
@@ -0,0 +1,803 @@

Everything You Need To Know About Torchvision’s SSDlite Implementation | PyTorch

by Vasilis Vryniotis

        +

        In the previous article, we’ve discussed how the SSD algorithm works, covered its implementation details and presented its training process. If you have not read the previous blog post, I encourage you to check it out before continuing.

        + +

        In this part 2 of the series, we will focus on the mobile-friendly variant of SSD called SSDlite. Our plan is to first go through the main components of the algorithm highlighting the parts that differ from the original SSD, then discuss how the released model was trained and finally provide detailed benchmarks for all the new Object Detection models that we explored.

        + +

        The SSDlite Network Architecture

        + +

        The SSDlite is an adaptation of SSD which was first briefly introduced on the MobileNetV2 paper and later reused on the MobileNetV3 paper. Because the main focus of the two papers was to introduce novel CNN architectures, most of the implementation details of SSDlite were not clarified. Our code follows all the details presented on the two papers and where necessary fills the gaps from the official implementation.

        + +

As noted before, the SSD is a family of models because one can configure it with different backbones (such as VGG, MobileNetV3 etc.) and different heads (such as using regular convolutions, separable convolutions etc.). Thus many of the SSD components remain the same in SSDlite. Below we discuss only those that are different.

        + +

        Classification and Regression Heads

        + +

        Following the Section 6.2 of the MobileNetV2 paper, SSDlite replaces the regular convolutions used on the original Heads with separable convolutions. Consequently, our implementation introduces new heads that use 3x3 Depthwise convolutions and 1x1 projections. Since all other components of the SSD method remain the same, to create an SSDlite model our implementation initializes the SSDlite head and passes it directly to the SSD constructor.
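A minimal sketch of such a separable prediction block (for illustration only, not TorchVision’s exact module) could look like this:

```python
import torch.nn as nn

def separable_conv3x3(in_channels: int, out_channels: int) -> nn.Sequential:
    # SSDlite-style prediction block: a 3x3 depthwise convolution followed by a 1x1 projection.
    return nn.Sequential(
        nn.Conv2d(in_channels, in_channels, kernel_size=3, padding=1,
                  groups=in_channels, bias=False),
        nn.BatchNorm2d(in_channels),
        nn.ReLU6(inplace=True),
        nn.Conv2d(in_channels, out_channels, kernel_size=1),
    )
```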

        + +

        Backbone Feature Extractor

        + +

        Our implementation introduces a new class for building MobileNet feature extractors. Following the Section 6.3 of the MobileNetV3 paper, the backbone returns the output of the expansion layer of the Inverted Bottleneck block which has an output stride of 16 and the output of the layer just before the pooling which has an output stride of 32. Moreover, all extra blocks of the backbone are replaced with lightweight equivalents which use a 1x1 compression, a separable 3x3 convolution with stride 2 and a 1x1 expansion. Finally to ensure that the heads have enough prediction power even when small width multipliers are used, the minimum depth size of all convolutions is controlled by the min_depth hyperparameter.

        + +

        The SSDlite320 MobileNetV3-Large model

        + +
        + +
        + +

        This section discusses the configuration of the provided SSDlite pre-trained model along with the training processes followed to replicate the paper results as closely as possible.

        + +

        Training process

        + +

        All of the hyperparameters and scripts used to train the model on the COCO dataset can be found in our references folder. Here we discuss the most notable details of the training process.

        + +

        Tuned Hyperparameters

        + +

Though the papers don’t provide any information on the hyperparameters used for training the models (such as regularization, learning rate and batch size), the parameters listed in the configuration files of the official repo were good starting points, and using cross validation we adjusted them to their optimal values. All of the above gave us a significant boost over the baseline SSD configuration.

        + +

        Data Augmentation

        + +

A key difference between SSDlite and SSD is that the backbone of the former has only a fraction of the weights of the latter. This is why in SSDlite the data augmentation focuses more on making the model robust to objects of variable sizes than on trying to avoid overfitting. Consequently, SSDlite uses only a subset of the SSD transformations and this way avoids over-regularizing the model.

        + +

        LR Scheme

        + +

Due to the reliance on data augmentation to make the model robust to small and medium sized objects, we found it particularly beneficial for the training recipe to use a large number of epochs. More specifically, by using roughly 3x more epochs than SSD we are able to increase our precision by 4.2 mAP points, and by using a 6x multiplier we improve by 4.9 mAP. Increasing the epochs further seems to yield diminishing returns and makes training too slow and impractical; nevertheless, based on the model configuration, it seems that the authors of the paper used an equivalent 16x multiplier.

        + +

        Weight Initialization & Input Scaling & ReLU6

        + +

A set of final optimizations that brought our implementation very close to the official one and helped us bridge the accuracy gap was training the backbone from scratch instead of initializing from ImageNet, adapting our weight initialization scheme, changing our input scaling and replacing all standard ReLUs added in the SSDlite heads with ReLU6. Note that since we trained the model from random weights, we additionally applied the speed optimization described in the paper of using a reduced tail on the backbone.

        + +

        Implementation Differences

        + +

Comparing the above implementation with the one in the official repo, we’ve identified a few differences. Most of them are minor and related to how we initialize the weights (for example Normal initialization vs Truncated Normal) or how we parameterize the LR scheduling (for example smaller vs larger warmup rate, shorter vs longer training). The biggest known difference lies in the way we compute the classification loss. More specifically, the implementation of SSDlite with a MobileNetV3 backbone in the official repo doesn’t use SSD’s Multibox loss but instead uses RetinaNet’s focal loss. This is a rather significant deviation from the paper and, since TorchVision already offers a full implementation of RetinaNet, we decided to implement SSDlite using the normal Multibox SSD loss.

        + +

        Break down of key accuracy improvements

        + +

As discussed in previous articles, reproducing research papers and porting them to code is not a journey of monotonically increasing accuracies, especially in cases where the full training and implementation details are not known. Typically the process involves lots of backtracking, as one needs to separate the implementation details and parameters that have a significant impact on the accuracy from those that don’t. Below we try to visualize the most important iterations that improved our accuracy over the baseline:

        + +
        + +
        + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Iteration | mAP
Baseline with “SSD-style” Hyperparams | 10.6
+ Tuned Hyperparams | 14.2
+ SSDlite Data Augmentation | 15.2
+ 3x LR Scheme | 19.4
+ 6x LR Scheme | 20.1
+ Weight Initialization & Input Scaling & ReLU6 | 21.3
        + +

The order of optimizations presented above is accurate, though a bit idealized in some cases. For example, though different schedulers were tested during the hyperparameter tuning phase, none of them provided significant improvements, and thus we maintained the MultiStepLR that was used in the baseline. Nevertheless, while later experimenting with different LR schemes, we found it beneficial to switch to CosineAnnealingLR, as it required less configuration. Consequently, we believe that the main takeaway from the above summary should be that even when starting with a correct implementation and a set of optimal hyperparameters from a model of the same family, there are always accuracy points to be found by optimizing the training recipe and tuning the implementation. Admittedly the above is a rather extreme case where the accuracy doubled, but in many cases there is still a large number of optimizations that can help us push the accuracy significantly.

        + +

        Benchmarks

        + +

        Here is how to initialize the two pre-trained models:

        + +
        ssdlite = torchvision.models.detection.ssdlite320_mobilenet_v3_large(pretrained=True)
        +ssd = torchvision.models.detection.ssd300_vgg16(pretrained=True)
        +
        + +
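As a quick usage sketch, the models created above can be run directly on a list of image tensors; the input size and random data below are just placeholders:

import torch

ssdlite.eval()
images = [torch.rand(3, 320, 320)]            # list of CHW tensors with values in [0, 1]
with torch.no_grad():
    predictions = ssdlite(images)             # one dict per image with boxes, labels and scores
print(predictions[0]["boxes"].shape)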

        Below are the benchmarks between the new and selected previous detection models:

        + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Model | mAP | Inference on CPU (sec) | # Params (M)
SSDlite320 MobileNetV3-Large | 21.3 | 0.0911 | 3.44
SSD300 VGG16 | 25.1 | 0.8303 | 35.64
SSD512 VGG16 (not released) | 28.8 | 2.2494 | 37.08
SSD512 ResNet50 (not released) | 30.2 | 1.1137 | 42.70
Faster R-CNN MobileNetV3-Large 320 FPN (Low-Res) | 22.8 | 0.1679 | 19.39
Faster R-CNN MobileNetV3-Large FPN (High-Res) | 32.8 | 0.8409 | 19.39
        + +

        As we can see, the SSDlite320 MobileNetV3-Large model is by far the fastest and smallest model and thus it’s an excellent candidate for real-world mobile applications. Though its accuracy is lower than the pre-trained low-resolution Faster R-CNN equivalent, the SSDlite framework is adaptable and one can boost its accuracy by introducing heavier heads with more convolutions.

        + +

On the other hand, the SSD300 VGG16 model is rather slow and less accurate. This is mainly because of its VGG16 backbone. Though extremely important and influential, the VGG architecture is nowadays quite outdated. Thus, though the specific model has historical and research value and hence is included in TorchVision, we recommend that users who want high-resolution detectors for real-world applications either combine SSD with alternative backbones (see this example on how to create one) or use one of the Faster R-CNN pre-trained models.

        + +

        We hope you enjoyed the 2nd and final part of the SSD series. We are looking forward to your feedback.

        + +
        +
        +
        +
        + + +
        +
        +
        +
        +

        +
        +
        +
        + +
        + +
        + +
        +
        +
        +
        + + +
        +
        +
        + + +
        + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/torchvision03/index.html b/blog/torchvision03/index.html new file mode 100644 index 000000000000..5b1635238562 --- /dev/null +++ b/blog/torchvision03/index.html @@ -0,0 +1,838 @@ + + + + + + + + + + + + + torchvision 0.3: segmentation, detection models, new datasets and more.. | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
        +
        +
        +
        + +
        +
        + + +
        + + + + + + + + +
        + +
        +
        + + + +
        +
        +
        + +
        +

        + by + + Francisco Massa + +

        +

        PyTorch domain libraries like torchvision provide convenient access to common datasets and models that can be used to quickly create a state-of-the-art baseline. Moreover, they also provide common abstractions to reduce boilerplate code that users might have to otherwise repeatedly write. The torchvision 0.3 release brings several new features including models for semantic segmentation, object detection, instance segmentation, and person keypoint detection, as well as custom C++ / CUDA ops specific to computer vision.

        + +
        + +
        + +

        New features include:

        + +

        Reference training / evaluation scripts: torchvision now provides, under the references/ folder, scripts for training and evaluation of the following tasks: classification, semantic segmentation, object detection, instance segmentation and person keypoint detection. These serve as a log of how to train a specific model and provide baseline training and evaluation scripts to quickly bootstrap research.

        + +

torchvision ops: torchvision now contains custom C++ / CUDA operators. Those operators are specific to computer vision, and make it easier to build object detection models. These operators currently do not support PyTorch script mode, but support for it is planned for the next release. Some of the ops supported include:

        + +
          +
        • roi_pool (and the module version RoIPool)
        • +
        • roi_align (and the module version RoIAlign)
        • +
        • nms, for non-maximum suppression of bounding boxes
        • +
        • box_iou, for computing the intersection over union metric between two sets of bounding boxes
        • +
        • box_area, for computing the area of a set of bounding boxes
        • +
        + +

        Here are a few examples on using torchvision ops:

        + +
        import torch
        +import torchvision
        +
        +# create 10 random boxes
        +boxes = torch.rand(10, 4) * 100
        +# they need to be in [x0, y0, x1, y1] format
        +boxes[:, 2:] += boxes[:, :2]
        +# create a random image
        +image = torch.rand(1, 3, 200, 200)
        +# extract regions in `image` defined in `boxes`, rescaling
        +# them to have a size of 3x3
        +pooled_regions = torchvision.ops.roi_align(image, [boxes], output_size=(3, 3))
        +# check the size
        +print(pooled_regions.shape)
        +# torch.Size([10, 3, 3, 3])
        +
        +# or compute the intersection over union between
        +# all pairs of boxes
        +print(torchvision.ops.box_iou(boxes, boxes).shape)
        +# torch.Size([10, 10])
        +
        + +
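Along the same lines, here is a short, hedged example of the nms and box_area ops mentioned above:

import torch
import torchvision

# 10 random boxes in [x0, y0, x1, y1] format with matching confidence scores
boxes = torch.rand(10, 4) * 100
boxes[:, 2:] += boxes[:, :2]
scores = torch.rand(10)

# indices of the boxes that survive non-maximum suppression at an IoU threshold of 0.5
keep = torchvision.ops.nms(boxes, scores, 0.5)
print(keep)

# area of each box
print(torchvision.ops.box_area(boxes).shape)
# torch.Size([10])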

        New models and datasets: torchvision now adds support for object detection, instance segmentation and person keypoint detection models. In addition, several popular datasets have been added. Note: The API is currently experimental and might change in future versions of torchvision. New models include:

        + +

        Segmentation Models

        + +

The 0.3 release also contains models for dense pixelwise prediction on images. It adds FCN and DeepLabV3 segmentation models, using ResNet50 and ResNet101 backbones. Pre-trained weights for the ResNet101 backbone are available, and have been trained on a subset of COCO train2017, which contains the same 20 categories as those from Pascal VOC.

        + +

The pre-trained models give the following results on the subset of COCO val2017 which contains the same 20 categories as those present in Pascal VOC:

        + + + + + + + + + + + + + + + + + + + + + +
Network | mean IoU | global pixelwise acc
FCN ResNet101 | 63.7 | 91.9
DeepLabV3 ResNet101 | 67.4 | 92.4
        + +
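For reference, here is a hedged sketch of loading one of these pre-trained segmentation models and running it on a dummy input; the input size is arbitrary, and the output is a dict whose "out" entry holds the per-class logits:

import torch
import torchvision

seg_model = torchvision.models.segmentation.deeplabv3_resnet101(pretrained=True)
seg_model.eval()

image = torch.rand(1, 3, 520, 520)
with torch.no_grad():
    out = seg_model(image)["out"]      # shape: [1, 21, 520, 520] — 21 Pascal VOC classes
print(out.argmax(1).shape)             # per-pixel class predictions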

        Detection Models

        + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Network | box AP | mask AP | keypoint AP
Faster R-CNN ResNet-50 FPN trained on COCO | 37.0 | - | -
Mask R-CNN ResNet-50 FPN trained on COCO | 37.9 | 34.6 | -
Keypoint R-CNN ResNet-50 FPN trained on COCO | 54.6 | - | 65.0
        + +

The implementations of the models for object detection, instance segmentation and keypoint detection are fast, especially during training.

        + +

        In the following table, we use 8 V100 GPUs, with CUDA 10.0 and CUDNN 7.4 to report the results. During training, we use a batch size of 2 per GPU, and during testing a batch size of 1 is used.

        + +

        For test time, we report the time for the model evaluation and post-processing (including mask pasting in image), but not the time for computing the precision-recall.

        + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Network | train time (s / it) | test time (s / it) | memory (GB)
Faster R-CNN ResNet-50 FPN | 0.2288 | 0.0590 | 5.2
Mask R-CNN ResNet-50 FPN | 0.2728 | 0.0903 | 5.4
Keypoint R-CNN ResNet-50 FPN | 0.3789 | 0.1242 | 6.8
        + +

        You can load and use pre-trained detection and segmentation models with a few lines of code

        + +
import torchvision
+import PIL.Image
        +
        +model = torchvision.models.detection.maskrcnn_resnet50_fpn(pretrained=True)
        +# set it to evaluation mode, as the model behaves differently
        +# during training and during evaluation
        +model.eval()
        +
        +image = PIL.Image.open('/path/to/an/image.jpg')
        +image_tensor = torchvision.transforms.functional.to_tensor(image)
        +
        +# pass a list of (potentially different sized) tensors
        +# to the model, in 0-1 range. The model will take care of
        +# batching them together and normalizing
        +output = model([image_tensor])
        +# output is a list of dict, containing the postprocessed predictions
        +
        + +

        Classification Models

        + +

        The following classification models were added:

        + +
          +
        • GoogLeNet (Inception v1)
        • +
        • MobileNet V2
        • +
        • ShuffleNet v2
        • +
        • ResNeXt-50 32x4d and ResNeXt-101 32x8d
        • +
        + +

        Datasets

        + +

        The following datasets were added:

        + +
          +
        • Caltech101, Caltech256, and CelebA
        • +
        • ImageNet dataset (improving on ImageFolder, provides class-strings)
        • +
        • Semantic Boundaries Dataset
        • +
        • VisionDataset as a base class for all datasets
        • +
        + +

        In addition, we’ve added more image transforms, general improvements and bug fixes, as well as improved documentation.

        + +

        See the full release notes here as well as this getting started tutorial on Google Colab here, which describes how to fine tune your own instance segmentation model on a custom dataset.

        + +

        Cheers!

        + +

        Team PyTorch

        + +
        +
        +
        +
        + + +
        +
        +
        +
        +

        +
        +
        +
        + +
        + +
        + +
        +
        +
        +
        + + +
        +
        +
        + + +
        + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/towards-reproducible-research-with-pytorch-hub/index.html b/blog/towards-reproducible-research-with-pytorch-hub/index.html new file mode 100644 index 000000000000..e01a68050e68 --- /dev/null +++ b/blog/towards-reproducible-research-with-pytorch-hub/index.html @@ -0,0 +1,843 @@ + + + + + + + + + + + + + Towards Reproducible Research with PyTorch Hub | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
        +
        +
        +
        + +
        +
        + + +
        + + + + + + + + +
        + +
        +
        + + + +
        +
        +
        + +
        +

        + by + + Team PyTorch + +

        +

Reproducibility is an essential requirement for many fields of research, including those based on machine learning techniques. However, many machine learning publications are either not reproducible or are difficult to reproduce. With the continued growth in the number of research publications, including tens of thousands of papers now hosted on arXiv and submissions to conferences at an all time high, research reproducibility is more important than ever. Many of these publications are accompanied by code as well as trained models, which is helpful but still leaves a number of steps for users to figure out for themselves.

        + +

        We are excited to announce the availability of PyTorch Hub, a simple API and workflow that provides the basic building blocks for improving machine learning research reproducibility. PyTorch Hub consists of a pre-trained model repository designed specifically to facilitate research reproducibility and enable new research. It also has built-in support for Colab, integration with Papers With Code and currently contains a broad set of models that include Classification and Segmentation, Generative, Transformers, etc.

        + +
        + +
        + +

        [Owner] Publishing models

        + +

        PyTorch Hub supports the publication of pre-trained models (model definitions and pre-trained weights) to a GitHub repository by adding a simple hubconf.py file. +This provides an enumeration of which models are to be supported and a list of dependencies needed to run the models. +Examples can be found in the torchvision, huggingface-bert and gan-model-zoo repositories.

        + +

        Let us look at the simplest case: torchvision’s hubconf.py:

        + +
        # Optional list of dependencies required by the package
        +dependencies = ['torch']
        +
        +from torchvision.models.alexnet import alexnet
        +from torchvision.models.densenet import densenet121, densenet169, densenet201, densenet161
        +from torchvision.models.inception import inception_v3
        +from torchvision.models.resnet import resnet18, resnet34, resnet50, resnet101, resnet152,\
        +resnext50_32x4d, resnext101_32x8d
        +from torchvision.models.squeezenet import squeezenet1_0, squeezenet1_1
        +from torchvision.models.vgg import vgg11, vgg13, vgg16, vgg19, vgg11_bn, vgg13_bn, vgg16_bn, vgg19_bn
        +from torchvision.models.segmentation import fcn_resnet101, deeplabv3_resnet101
        +from torchvision.models.googlenet import googlenet
        +from torchvision.models.shufflenetv2 import shufflenet_v2_x0_5, shufflenet_v2_x1_0
        +from torchvision.models.mobilenet import mobilenet_v2
        +
        + +

        In torchvision, the models have the following properties:

        +
          +
        • Each model file can function and be executed independently
        • +
• They don't require any package other than PyTorch (encoded in hubconf.py as dependencies['torch'])
        • +
• They don't need separate entry-points, because the models, when created, work seamlessly out of the box
        • +
        + +

        Minimizing package dependencies reduces the friction for users to load your model for immediate experimentation.

        + +

        A more involved example is HuggingFace’s BERT models. Here is their hubconf.py

        + +
        dependencies = ['torch', 'tqdm', 'boto3', 'requests', 'regex']
        +
        +from hubconfs.bert_hubconf import (
        +    bertTokenizer,
        +    bertModel,
        +    bertForNextSentencePrediction,
        +    bertForPreTraining,
        +    bertForMaskedLM,
        +    bertForSequenceClassification,
        +    bertForMultipleChoice,
        +    bertForQuestionAnswering,
        +    bertForTokenClassification
        +)
        +
        + +

        Each model then requires an entrypoint to be created. Here is a code snippet to specify an entrypoint of the bertForMaskedLM model, which returns the pre-trained model weights.

        + +
        def bertForMaskedLM(*args, **kwargs):
        +    """
        +    BertForMaskedLM includes the BertModel Transformer followed by the
        +    pre-trained masked language modeling head.
        +    Example:
        +      ...
        +    """
        +    model = BertForMaskedLM.from_pretrained(*args, **kwargs)
        +    return model
        +
        + +

        These entry-points can serve as wrappers around complex model factories. They can give a clean and consistent help docstring, have logic to support downloading of pretrained weights (for example via pretrained=True) or have additional hub-specific functionality such as visualization.

        + +
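As an illustration, a hypothetical hubconf.py entrypoint with such a pretrained flag might look like the sketch below; the entrypoint name, weights URL and wrapped model are made up for the example:

dependencies = ['torch', 'torchvision']

def my_resnet18(pretrained=False, **kwargs):
    """Hypothetical entrypoint: builds the model and optionally loads weights."""
    import torch
    from torchvision.models import resnet18

    model = resnet18(**kwargs)
    if pretrained:
        state_dict = torch.hub.load_state_dict_from_url(
            'https://example.com/my_resnet18_weights.pth',  # placeholder URL
            progress=True)
        model.load_state_dict(state_dict)
    return model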

With a hubconf.py in place, you can send a pull request based on the template here. Our goal is to curate high-quality, easily-reproducible, maximally-beneficial models for research reproducibility. Hence, we may work with you to refine your pull request and, in some cases, reject low-quality models from being published. Once we accept your pull request, your model will soon appear on the PyTorch Hub webpage for all users to explore.

        + +

        [User] Workflow

        + +

        As a user, PyTorch Hub allows you to follow a few simple steps and do things like: 1) explore available models; 2) load a model; and 3) understand what methods are available for any given model. Let’s walk through some examples of each.

        + +

        Explore available entrypoints.

        + +

        Users can list all available entrypoints in a repo using the torch.hub.list() API.

        + +
        >>> torch.hub.list('pytorch/vision')
        +>>>
        +['alexnet',
        +'deeplabv3_resnet101',
        +'densenet121',
        +...
        +'vgg16',
        +'vgg16_bn',
        +'vgg19',
        + 'vgg19_bn']
        +
        + +

Note that PyTorch Hub also allows auxiliary entrypoints (other than pretrained models), e.g. bertTokenizer for preprocessing in the BERT models, to make the user workflow smoother.

        + +

        Load a model

        + +

        Now that we know which models are available in the Hub, users can load a model entrypoint using the torch.hub.load() API. This only requires a single command without the need to install a wheel. In addition the torch.hub.help() API can provide useful information about how to instantiate the model.

        + +
        print(torch.hub.help('pytorch/vision', 'deeplabv3_resnet101'))
        +model = torch.hub.load('pytorch/vision', 'deeplabv3_resnet101', pretrained=True)
        +
        + +

        It is also common that repo owners will want to continually add bug fixes or performance improvements. PyTorch Hub makes it super simple for users to get the latest update by calling:

        + +
        model = torch.hub.load(..., force_reload=True)
        +
        + +

        We believe this will help to alleviate the burden of repetitive package releases by repo owners and instead allow them to focus more on their research. +It also ensures that, as a user, you are getting the freshest available models.

        + +

On the other hand, stability is important for users. Hence, some model owners serve their models from a specified branch or tag, rather than the master branch, to ensure stability of the code. For example, pytorch_GAN_zoo serves them from the hub branch:

        + +
        model = torch.hub.load('facebookresearch/pytorch_GAN_zoo:hub', 'DCGAN', pretrained=True, useGPU=False)
        +
        + +

        Note that the *args, **kwargs passed to hub.load() are used to instantiate a model. In the above example, pretrained=True and useGPU=False are given to the model’s entrypoint.

        + +

        Explore a loaded model

        + +

Once you have a model from PyTorch Hub loaded, you can use the following workflow to find out the available methods that are supported, as well as to better understand what arguments are required to run it.

        + +

        dir(model) to see all available methods of the model. Let’s take a look at bertForMaskedLM’s available methods.

        + +
        >>> dir(model)
        +>>>
        +['forward'
        +...
        +'to'
        +'state_dict',
        +]
        +
        + +

        help(model.forward) provides a view into what arguments are required to make your loaded model run

        + +
        >>> help(model.forward)
        +>>>
        +Help on method forward in module pytorch_pretrained_bert.modeling:
        +forward(input_ids, token_type_ids=None, attention_mask=None, masked_lm_labels=None)
        +...
        +
        + +

        Have a closer look at the BERT and DeepLabV3 pages, where you can see how these models can be used once loaded.

        + +

        Other ways to explore

        + +

Models available in PyTorch Hub also support Colab and are directly linked on Papers With Code, so you can get started with a single click. Here is a good example to get started with (shown below).

        + +
        + +
        + +

        Additional resources:

        + + + +

        A BIG thanks to the folks at HuggingFace, the PapersWithCode team, fast.ai and Nvidia as well as Morgane Riviere (FAIR Paris) and lots of others for helping bootstrap this effort!!

        + +

        Cheers!

        + +

        Team PyTorch

        + +

        FAQ:

        + +

        Q: If we would like to contribute a model that is already in the Hub but perhaps mine has better accuracy, should I still contribute?

        + +

        A: Yes!! A next step for Hub is to implement an upvote/downvote system to surface the best models.

        + +

        Q: Who hosts the model weights for PyTorch Hub?

        + +

A: You, as the contributor, are responsible for hosting the model weights. You can host your model in your favorite cloud storage or, if it fits within the limits, on GitHub. If it is not within your means to host the weights, check with us by opening an issue on the hub repository.

        + +

        Q: What if my model is trained on private data? Should I still contribute this model?

        + +

        A: No! PyTorch Hub is centered around open research and that extends to the usage of open datasets to train these models on. If a pull request for a proprietary model is submitted, we will kindly ask that you resubmit a model trained on something open and available.

        + +

        Q: Where are my downloaded models saved?

        + +

        A: We follow the XDG Base Directory Specification and adhere to common standards around cached files and directories.

        + +

The locations are used in the following order (a short hub.set_dir example is shown after the list):

        + +
          +
        • Calling hub.set_dir(<PATH_TO_HUB_DIR>)
        • +
        • $TORCH_HOME/hub, if environment variable TORCH_HOME is set.
        • +
        • $XDG_CACHE_HOME/torch/hub, if environment variable XDG_CACHE_HOME is set.
        • +
        • ~/.cache/torch/hub
        • +
        + +
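For example, the hub cache can be redirected before loading a model; the path below is just a placeholder:

import torch

torch.hub.set_dir('/data/my_hub_cache')   # placeholder path
model = torch.hub.load('pytorch/vision', 'resnet18', pretrained=True)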
        +
        +
        +
        + + +
        +
        +
        +
        +

        +
        +
        +
        + +
        + +
        + +
        +
        +
        +
        + + +
        +
        +
        + + +
        + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/trace-analysis-for-masses/index.html b/blog/trace-analysis-for-masses/index.html new file mode 100644 index 000000000000..21b1ce88de59 --- /dev/null +++ b/blog/trace-analysis-for-masses/index.html @@ -0,0 +1,789 @@ + + + + + + + + + + + + + PyTorch Trace Analysis for the Masses | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
        +
        +
        +
        + +
        +
        + + +
        + + + + + + + + +
        + +
        +
        + +
        +
        +

        January 09, 2023

        +

        + PyTorch Trace Analysis for the Masses +

        +
        +
        + +
        +
        +
        + +
        +

        + by + + Anupam Bhatnagar, Xizhou Feng, Brian Coutinho, Yifan Liu, Sung-Han Lin, Louis Feng, and Yuzhen Huang + +

        +

        We are excited to announce the public release of Holistic Trace Analysis (HTA), an open source performance analysis and visualization Python library for PyTorch users. HTA takes as input Kineto traces collected by the PyTorch profiler, which are complex and challenging to interpret, and up-levels the performance information contained in these traces. It was initially developed internally at Meta to understand and debug performance problems for large-scale distributed training jobs on GPUs. The multidisciplinary team has made a number of enhancements to HTA’s features and scaled them to support state-of-the-art ML workloads.

        + +

        ML researchers and systems engineers often struggle to computationally scale up their models because they are not aware of the performance bottlenecks in their workloads. The resources requested for a job (e.g. GPUs, memory) are often misaligned with the resources actually required due to lack of visibility “under the hood”. To achieve the best performance from the hardware stack, it is imperative to understand the resource utilization and bottlenecks for distributed training workloads.

        + +

        The initial HTA implementation was specifically targeted at Deep Learning Based Recommendation Models (DLRM). To make the features in HTA generic and applicable to use cases such as analyzing Vision and NLP models, we decided to refactor the HTA codebase and make the library available to the larger community. This new codebase has implemented several important ideas which lead to significant efficiency and performance improvements.

        + +

        In this blog, we present several features implemented in the open source version of HTA, which can be used as a Python script as well as interactively in a Jupyter notebook. HTA provides the following features:

        + +
          +
        1. Breakdown by Dimensions +
            +
          1. Temporal: Breakdown of GPU time in terms of time spent in computation, communication, memory events, and idle time on a single node and across all ranks.
          2. +
          3. Idle Time: Breakdown of GPU idle time into waiting for the host, waiting for another kernel or attributed to an unknown cause.
          4. +
          5. Kernel: Find kernels with the longest duration on each rank.
          6. +
          7. Communication Computation Overlap: Calculate the percentage of time when communication overlaps computation.
          8. +
          +
        2. +
        3. Statistical Analysis +
            +
          1. Kernel Duration Distribution: Distribution of average time taken by longest kernels across different ranks.
          2. +
          3. CUDA Kernel Launch: Distributions of GPU kernels with very small duration, large duration, and excessive launch time.
          4. +
          5. Augmented Counters (Memory bandwidth, Queue length): Augmented trace files which provide insights into memory copy bandwidth and number of outstanding operations on each CUDA stream.
          6. +
          +
        4. +
        5. Patterns +
            +
          1. Frequent CUDA Kernels: Find the CUDA kernels most frequently launched by any given PyTorch or user defined operator.
          2. +
          +
        6. +
        7. Trace Comparison +
            +
          1. Trace Diff: A trace comparison tool to identify and visualize the differences between traces.
          2. +
          +
        8. +
        + +

        HTA source code is available to users via Github. Users can request new features or build their own analysis using the core libraries and data structures provided in the codebase in addition to the features mentioned above.

        + +

        GPU Training Performance Debugging 101

        + +

        To understand the GPU performance in distributed training jobs, we consider how the model operators interact with the GPU devices and how such interactions are reflected in certain measurable metrics.

        + +

        At a high level, we can break down the GPU operations in a model execution into three broad categories, henceforth referred to as kernel types:

        +
          +
        1. Computation (COMP) - Compute kernels execute compiled routines for matrix multiplication and similar numeric calculations. They are responsible for all of the number-crunching necessary for model execution.
        2. +
        3. Communication (COMM) - Communication kernels are routines which are responsible for exchanging and synchronizing data between different GPU devices in a distributed training job. The NVIDIA Collective Communication Library (NCCL) is a widely used communication library and all its kernels have the prefix “nccl”. Example NCCL kernels include NCCL_AllGather, NCCL_ReduceScatter, NCCL_AllReduce, etc.
        4. +
        5. Memory (MEM) - Memory kernels manage the memory allocations/deallocations on the GPU devices and data movement between the memory space on the host and the GPUs. The memory kernels include Memcpy_H2D, Memcpy_D2H, Memcpy_D2D, Memset, etc. Here, H represents the Host and D represents the GPU Device. Thus, H2D, D2H, D2D stands for Host to Device, Device to Host and Device to Device respectively.
        6. +
        + +

        Because a modern GPU device like the NVIDIA A100 GPU is a massively parallel device which is capable of running multiple kernels simultaneously, it is possible to overlap the computation, communication, and memory kernels to reduce the model execution time. One common technique to achieve the overlap is to utilize multiple CUDA streams. A CUDA stream is a sequence of operations that execute on a GPU device in the order in which they are issued by the host code. Different CUDA streams can be interleaved and even run concurrently, thus achieving the effect of kernel overlap.

        + +

To help understand the above concepts, Figure 1 provides a timeline of the GPU kernels in a sample distributed training job on 8 GPUs for one iteration. In the figure below, each rank represents one GPU and the kernels on each GPU run on 6 CUDA streams. In the right column of the figure, you can see the names of the GPU kernels used. In the middle of the figure, you can see the overlap between compute and communication kernels. This figure is created using the plot_timeline example notebook available in HTA.

        + +

        Figure 1. An example of the execution timeline of GPU Kernels across multiple ranks

        + +


        + +

The performance of multiple GPU training jobs is affected by multiple factors. Among these factors, how a model execution creates and orchestrates the GPU kernels plays a critical role. HTA provides insights on how the model execution interacts with the GPU devices and highlights the opportunities for performance improvement.

        + +

        With the features we built in HTA, we aim to provide users insights into “what is happening under the hood in a distributed GPU training?” We briefly describe these features in the next few paragraphs.

        + +

        Features in Holistic Trace Analysis

        + +

        For most users, understanding the performance of GPU training jobs is nontrivial. Thus, we built this library to simplify the task of trace analysis and provide the user useful insights by examining the model execution traces. As the first step, we developed features which are important and generic enough so that most users can benefit from this library.

        + +

        Temporal Breakdown: We begin by asking whether the GPU is spending time on computation, communication, memory events, or is it idle? To answer this question, the temporal breakdown feature presents a breakdown in terms of these categories. To achieve high training efficiency the code should maximize time used by computation kernels and minimize idle time and non-compute time (time used by communication or memory kernels). This is accomplished by implementing concurrent execution of computation kernels with communication or memory kernels. Note that, during concurrent execution of computation kernels with communication/memory kernels the time spent by communication/memory kernels is accounted for under compute time.

        + +

        Figure 2: Temporal Breakdown across 8 GPUs

        + +


        + +

        Kernel Breakdown: It is natural to ask which kernels are taking the most amount of time. The next feature breaks down the time spent within each kernel type (COMM, COMP, MEM) and sorts them by duration. We present this information for each kernel type and for each rank as a pie chart. See figure 3 below.

        + +

        Figure 3: Pie chart of top computation and communication kernels

        + +


        + +

        Kernel Duration Distribution: Subsequently, one can also ask - for any given kernel, what is the distribution of the time spent across the ranks? To answer this, HTA generates bar graphs for the average duration of a given kernel across all ranks. Additionally, the error bars in the bar graphs show the minimum and maximum amount of time taken by a given kernel on a given rank. Figure 4 below shows a discrepancy between average duration on rank 0 as compared to other ranks. This anomalous behavior on rank 0 guides the user on where to look for possible bugs.

        + +

        Figure 4: Average duration of NCCL AllReduce Kernel across 8 ranks

        + +


        + +

        Communication Computation Overlap: In distributed training, a significant amount of time is spent in communication and synchronization events among multiple GPU devices. To achieve high GPU efficiency (i.e. TFLOPS/GPU) it is vital to keep the GPU doing actual computation work. In other words, a GPU should not be blocked because of waiting for data from other GPUs. One way to measure the extent to which computation is blocked by data dependencies is to calculate the computation-communication overlap. Higher GPU efficiency is observed if communication events overlap computation events. Lack of communication and computation overlap will lead to the GPU being idle, thus the efficiency would be low. Thus, the communication computation overlap feature calculates the percentage of time communication and computation overlap in a job for each rank and generates a bar graph representation. See figure below. More precisely, we measure the following ratio

        + +

        (time spent in computation while communicating) / (time spent in communication)

        + +

        Figure 5: Communication computation overlap

        + +


        + +

Augmented Counters (Queue length, Memory bandwidth): To aid in debugging, HTA calculates the memory bandwidth statistics for D2H, H2D and D2D memory copy (memcpy) and memory set (memset) events. Additionally, HTA also computes the number of outstanding CUDA operations on each CUDA stream. We refer to this as queue length. When the queue length on a stream is 1024 or larger, new events cannot be scheduled on that stream and the CPU will stall until the GPU events have been processed. Additionally, HTA generates a new trace file containing tracks with the memory bandwidth and queue length time series. See Figure 6 below.

        + +

        Figure 6: Memory Bandwidth and Queue Length

        + +


        + +

        These primary features give us a peek into the system performance and help answer “what is happening in the system?”. As HTA evolves, we hope to address “why is X happening?” and also suggest possible solutions to overcome the bottlenecks.

        + +

        Installation and Usage

        + +

        Installation

        + +

To install HTA, please refer to the README. In brief, the user is required to clone the repo and install the necessary Python packages via pip.

        + +

        Usage

        + +

        This version of Holistic Trace Analysis is currently in beta and we recommend using HTA in a Jupyter notebook. A demo notebook is provided for your convenience. To get started, import the hta package in a Jupyter notebook, create a TraceAnalysis object and off we go in exactly two lines of code.

        + +
        from hta.trace_analysis import TraceAnalysis
+analyzer = TraceAnalysis(trace_dir="/trace/folder/path")
        +
        + +
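From there, each analysis is exposed as a method on the analyzer object. The sketch below lists a few of them; the method names follow the HTA documentation at the time of writing, so please check the docs for the exact API:

# each call returns a pandas DataFrame and can optionally render a figure
temporal_df = analyzer.get_temporal_breakdown()
kernel_df = analyzer.get_gpu_kernel_breakdown()
overlap_df = analyzer.get_comm_comp_overlap()
queue_df = analyzer.get_queue_length_summary()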

        Requirements

        + +
          +
        • All trace files for a training or inference job must be stored in a unique folder.
        • +
        • Trace files are in json or gzipped json format.
        • +
        + +

        FAQ

        + +

        Q. How can I install HTA?

        + +

        Please see the README in the root directory of the repository.

        + +

        Q. Is there any documentation on the features and API in HTA?

        + +

        The documentation and detailed API is available here.

        + +

        Q. Can you implement feature X?

        + +

        Depending on how widely the feature is needed and the level of effort required to implement it we would consider developing the feature. Please open a Github Issue and tag it with the feature-request label.

        + +

        Q. Can I modify the code?

        + +

        Please do and send a PR along the way, if you think it would be useful for others.

        + +

        Q. How can I collect traces in PyTorch?

        + +

        Please refer to this tutorial here.

        + +

        Q. Can HTA be used at production scale?

        + +

        Yes, please see a use case study here.

        + +
        +
        +
        +
        + + +
        +
        +
        +
        +

        +
        +
        +
        + +
        + +
        + +
        +
        +
        +
        + + +
        +
        +
        + + +
        + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/blog/training-moes/index.html b/blog/training-moes/index.html new file mode 100644 index 000000000000..83a26c6624dd --- /dev/null +++ b/blog/training-moes/index.html @@ -0,0 +1,713 @@ + + + + + + + + + + + + + Training MoEs at Scale with PyTorch | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
        +
        +
        +
        + +
        +
        + + +
        + + + + + + + + +
        + +
        +
        + +
        +
        +

        June 23, 2024

        +

        + Training MoEs at Scale with PyTorch +

        +
        +
        + +
        +
        +
        + +
        +

        + by + + Brian Chu, Mihir Patel, Less Wright, Vitaliy Chiley, Evan Racah, Wanchao Liang, Iris Zhang, Andrew Gu + +

        +

        Over the past year, Mixture of Experts (MoE) models have surged in popularity, fueled by powerful open-source models like DBRX, Mixtral, DeepSeek, and many more. At Databricks, we’ve worked closely with the PyTorch team to scale training of MoE models. In this blog post, we’ll talk about how we scale to over three thousand GPUs using PyTorch Distributed and MegaBlocks, an efficient open-source MoE implementation in PyTorch.

        + +

        What is a MoE?

        + +

        A MoE model is a model architecture that uses multiple expert networks to make predictions. A gating network is used to route and combine the outputs of experts, ensuring each expert is trained on a different, specialized distribution of tokens. The architecture of a transformer-based large language model typically consists of an embedding layer that leads into multiple transformer blocks (Figure 1, Subfigure A). Each transformer block contains an attention block and a dense feed forward network (Figure 1, Subfigure B). These transformer blocks are stacked such that the output of one transformer block leads to the input of the next block. The final output goes through a fully connected layer and softmax to obtain probabilities for the next token to output.

        + +

        When using a MoE in LLMs, the dense feed forward layer is replaced by a MoE layer which consists of a gating network and a number of experts (Figure 1, Subfigure D). The gating network, typically a linear feed forward network, takes in each token and produces a set of weights that determine which tokens are routed to which experts. The experts themselves are typically implemented as a feed forward network as well. During training, the gating network adapts to assign inputs to the experts, enabling the model to specialize and improve its performance. The router outputs are then used to weigh expert outputs to give the final output of the MoE layer.

        + +

        Figure 1: Using Mixture of Experts in a transformer block

        + +


        + +
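To make the routing concrete, below is a deliberately simple, loop-based top-k MoE layer. It is only an illustration of the idea, not the MegaBlocks implementation (which relies on block-sparse matrix multiplication), and all sizes are placeholders:

import torch
import torch.nn.functional as F
from torch import nn

class SimpleMoELayer(nn.Module):
    def __init__(self, d_model, d_ff, num_experts, top_k=2):
        super().__init__()
        self.top_k = top_k
        self.gate = nn.Linear(d_model, num_experts)          # gating network
        self.experts = nn.ModuleList(
            nn.Sequential(nn.Linear(d_model, d_ff), nn.GELU(), nn.Linear(d_ff, d_model))
            for _ in range(num_experts)
        )

    def forward(self, x):                                    # x: [num_tokens, d_model]
        scores = F.softmax(self.gate(x), dim=-1)             # routing probabilities
        topk_scores, topk_idx = scores.topk(self.top_k, dim=-1)
        out = torch.zeros_like(x)
        for k in range(self.top_k):
            for e, expert in enumerate(self.experts):
                mask = topk_idx[:, k] == e                   # tokens routed to expert e at slot k
                if mask.any():
                    out[mask] += topk_scores[mask, k].unsqueeze(-1) * expert(x[mask])
        return out

moe = SimpleMoELayer(d_model=256, d_ff=1024, num_experts=8)
print(moe(torch.rand(16, 256)).shape)                        # torch.Size([16, 256])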

        Compared to dense models, MoEs provide more efficient training for a given compute budget. This is because the gating network only sends tokens to a subset of experts, reducing the computational load. As a result, the capacity of a model (its total number of parameters) can be increased without proportionally increasing the computational requirements. During inference, only some of the experts are used, so a MoE is able to perform faster inference than a dense model. However, the entire model needs to be loaded in memory, not just the experts being used.

        + +

        The sparsity in MoEs that allows for greater computational efficiency comes from the fact that a particular token will only be routed to a subset of experts. The number of experts and how experts are chosen depends on the implementation of the gating network, but a common method is top k. The gating network first predicts a probability value for each expert, then routes the token to the top k experts to obtain the output. However, if all tokens always go to the same subset of experts, training becomes inefficient and the other experts end up undertrained. To alleviate this problem, a load balancing loss is introduced that encourages even routing to all experts.

        + +

        The number of experts and choosing the top k experts is an important factor in designing MoEs. A higher number of experts allows scaling up to larger models without increasing computational cost. This means that the model has a higher capacity for learning, however, past a certain point the performance gains tend to diminish. The number of experts chosen needs to be balanced with the inference costs of serving the model since the entire model needs to be loaded in memory. Similarly, when choosing top k, a lower top k during training results in smaller matrix multiplications, leaving free computation on the table if communication costs are large enough. During inference, however, a higher top k generally leads to slower inference speed.

        + +

        MegaBlocks

        + +

        MegaBlocks is an efficient MoE implementation that uses sparse matrix multiplication to compute expert outputs in parallel despite uneven token assignment. MegaBlocks implements a dropless MoE that avoids dropping tokens while using GPU kernels that maintain efficient training. Prior to MegaBlocks, dynamic routing formulations forced a tradeoff between model quality and hardware efficiency. Previously, users had to either drop tokens from computation or waste computation and memory on padding. Experts can receive a variable number of tokens and the expert computation can be performed efficiently using block sparse matrix multiplication. We’ve integrated MegaBlocks into LLM Foundry to enable scaling MoE training to thousands of GPUs.

        + +

        Figure 2: Matrix multiplication for expert computations

        + +


        + +

        Expert Parallelism

        + +

        As models scale to larger sizes and fail to fit on a single GPU, we require more advanced forms of parallelism. Expert parallelism is a form of model parallelism where we place different experts on different GPUs for better performance. Instead of expert weights being communicated across all GPUs, tokens are sent to the device that contains the expert. By moving data instead of weights, we can aggregate data across multiple machines for a single expert. The router determines which tokens from the input sequence should be sent to which experts. This is typically done by computing a gating score for each token-expert pair, and then routing each token to the top-scoring experts. Once the token-to-expert assignments are determined, an all-to-all communication step is performed to dispatch the tokens to the devices hosting the relevant experts. This involves each device sending the tokens assigned to experts on other devices, while receiving tokens assigned to its local experts.

        + +

The key advantage of expert parallelism is processing a few, larger matrix multiplications instead of several small matrix multiplications. As each GPU only has a subset of experts, it only has to do computation for those experts. Correspondingly, as we aggregate tokens across multiple GPUs, the size of each matrix is proportionally larger. As GPUs are optimized for large-scale parallel computations, larger operations can better exploit their capabilities, leading to higher utilization and efficiency. A more in depth explanation of the benefits of larger matrix multiplications can be found here. Once the computation is complete, another all-to-all communication step is performed to send the expert outputs back to their original devices.

        + +

        Figure 3: Token routing in expert parallelism

        + +


        + +

        We leverage PyTorch’s DTensor, a low-level abstraction for describing how tensors are sharded and replicated, to effectively implement expert parallelism. We first manually place experts on different GPUs, typically sharding across a node to ensure we can leverage NVLink for fast GPU communication when we route tokens. We can then build a device mesh on top of this layout, which lets us succinctly describe the parallelism across the entire cluster. We can use this device mesh to easily checkpoint or rearrange experts when we need alternate forms of parallelism.

        + +
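As a rough sketch of what this looks like in code (the mesh sizes and dimension names are illustrative, not our production configuration, and it assumes the job was launched with torchrun so the process group can be set up):

from torch.distributed.device_mesh import init_device_mesh

# e.g. 64 GPUs arranged as 8 replica groups x 8-way expert sharding within each node
mesh = init_device_mesh("cuda", (8, 8), mesh_dim_names=("replicate", "expert_shard"))
expert_mesh = mesh["expert_shard"]   # sub-mesh used to place and shard the experts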

        Scaling ZeRO-3 with PyTorch FSDP

        + +

        In conjunction with expert parallelism, we use data parallelism for all other layers, where each GPU stores a copy of the model and optimizer and processes a different chunk of data. After each GPU has completed a forward and backward pass, gradients are accumulated across GPUs for a global model update.

        + +

        ZeRO-3 is a form of data parallelism where weights and optimizers are sharded across each GPU instead of being replicated. Each GPU now only stores a subset of the full model, dramatically reducing memory pressure. When a part of the model is needed for computation, it is gathered across all the GPUs, and after the computation is complete, the gathered weights are discarded. We use PyTorch’s implementation of ZeRO-3, called Fully Sharded Data Parallel (FSDP).

        + +

        As we scale to thousands of GPUs, the cost of communication across devices increases, slowing down training. Communication increases due to the need to synchronize and share model parameters, gradients, and optimizer states across all GPUs which involves all-gather and reduce-scatter operations. To mitigate this issue while keeping the benefits of FSDP, we utilize Hybrid Sharded Data Parallel (HSDP) to shard the model and optimizer across a set number of GPUs and replicate this multiple times to fully utilize the cluster. With HSDP, an additional all reduce operation is needed in the backward pass to sync gradients across replicas. This approach allows us to balance memory efficiency and communication cost during large scale distributed training. To use HSDP we can extend our previous device mesh from expert parallelism and let PyTorch do the heavy lifting of actually sharding and gathering when needed.

        + +

        Figure 4: FSDP and HSDP

        + +


        + +
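A minimal sketch of enabling this strategy with FSDP is shown below; the module and sizes are placeholders, and it assumes the process group has already been initialized (e.g. via torchrun):

import torch.nn as nn
from torch.distributed.fsdp import FullyShardedDataParallel as FSDP, ShardingStrategy

model = nn.Sequential(nn.Linear(1024, 4096), nn.GELU(), nn.Linear(4096, 1024)).cuda()
# shard parameters within each replica group, replicate across groups
model = FSDP(model, sharding_strategy=ShardingStrategy.HYBRID_SHARD)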

        With PyTorch, we can effectively combine these two types of parallelism, leveraging FSDP’s higher level API while using the lower-level DTensor abstraction when we want to implement something custom like expert parallelism. We now have a 3D device mesh with expert parallel shard dimension, ZeRO-3 shard dimension, and a replicate dimension for pure data parallelism. Together, these techniques deliver near linear scaling across very large clusters, allowing us to achieve MFU numbers over 40%.

        + +

        Elastic Checkpointing with Torch Distributed

        + +

        Fault tolerance is crucial for ensuring that LLMs can be trained reliably over extended periods, especially in distributed environments where node failures are common. To avoid losing progress when jobs inevitably encounter failures, we checkpoint the state of the model, which includes parameters, optimizer states, and other necessary metadata. When a failure occurs, the system can resume from the last saved state rather than starting over. To ensure robustness to failures, we need to checkpoint often and save and load checkpoints in the most performant way possible to minimize downtime. Additionally, if too many GPUs fail, our cluster size may change. Accordingly, we need the ability to elastically resume on a different number of GPUs.

        + +

        PyTorch supports elastic checkpointing through its distributed training framework, which includes utilities for both saving and loading checkpoints across different cluster configurations. PyTorch Distributed Checkpoint ensures the model’s state can be saved and restored accurately across all nodes in the training cluster in parallel, regardless of any changes in the cluster’s composition due to node failures or additions.

        + +

        Additionally, when training very large models, the size of checkpoints may be very large, leading to very slow checkpoint upload and download times. PyTorch Distributed Checkpoint supports sharded checkpoints, which enables each GPU to save and load only its portion of the model. When combining sharded checkpointing with elastic training, each GPU reads the metadata file to determine which shards to download on resumption. The metadata file contains information on what parts of each tensor are stored in each shard. The GPU can then download the shards for its part of the model and load that part of the checkpoint.
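
        A minimal sketch of this flow with PyTorch Distributed Checkpoint, on recent PyTorch releases (the path is a placeholder, and `model` and `optimizer` stand for the FSDP-wrapped objects from the earlier sketches; how their state dicts are produced depends on the FSDP configuration):

        import torch.distributed.checkpoint as dcp

        # Each rank saves only its own shards; a metadata file records the global layout.
        state = {"model": model.state_dict(), "optim": optimizer.state_dict()}
        dcp.save(state, checkpoint_id="/checkpoints/step_1000")

        # On resumption, possibly with a different world size, each rank consults the metadata
        # to download just the shards it needs and loads them in place.
        dcp.load(state, checkpoint_id="/checkpoints/step_1000")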

        + +

        Figure 5: Checkpoint saving and resumption, resharded on additional GPUs

        + +

        By parallelizing checkpointing across GPUs, we can spread out network load, improving robustness and speed. When training a model with 3000+ GPUs, network bandwidth quickly becomes a bottleneck. We take advantage of the replication in HSDP to first download checkpoints on one replica and then send the necessary shards to other replicas. With our integration in Composer, we can reliably upload checkpoints to cloud storage as frequently as every 30 minutes and automatically resume from the latest checkpoint in the event of a node failure in less than 5 minutes.

        + +

        Conclusion

        + +

        We’re very excited to see how PyTorch is enabling training state-of-the-art LLMs with great performance. In our post, we’ve shown how we implemented efficient MoE training through PyTorch Distributed and MegaBlocks on Foundry. Furthermore, PyTorch elastic checkpointing allowed us to quickly resume training on a different number of GPUs when node failures occurred. Using PyTorch HSDP has allowed us to scale training efficiently as well as improve checkpointing resumption times. We look forward to continuing to build on a strong and vibrant open-source community to help bring great AI models to everyone. Come join us in building great models at LLM Foundry and PyTorch.

diff --git a/blog/training-production-ai-models/index.html b/blog/training-production-ai-models/index.html
new file mode 100644
index 000000000000..5d8adc5ef676
--- /dev/null
+++ b/blog/training-production-ai-models/index.html
@@ -0,0 +1,773 @@

        Training Production AI Models with PyTorch 2.0 | PyTorch

        + by + + CK Luk, Daohang Shi, Yuzhen Huang, Jackie (Jiaqi) Xu, Jade Nie, Zhou Wang, Lu Fang, Flavio Sales Truzzi, Devashish Shankar, Dima Ivashchenko, Chunzhi Yang, Nicolas Macchioni, David Berard, Yu Guo, Xiaodong Wang, Bert Maher, Yanbo Liang, Edward Yang, Brian Hirsh, Michael Voznesensky, Animesh Jain, Michael Anderson + +

        +

        1. Introduction

        + +

        PyTorch 2.0 (abbreviated as PT2) can significantly improve the training and inference performance of an AI model using a compiler called torch.compile while being 100% backward compatible with PyTorch 1.x. There have been reports on how PT2 improves the performance of common benchmarks (e.g., Hugging Face’s diffusers). In this blog, we discuss our experiences in applying PT2 to production AI models at Meta.
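
        The basic usage pattern is a one-line change (the toy model below is a stand-in, not one of the production models discussed later):

        import torch

        model = torch.nn.Sequential(torch.nn.Linear(1024, 1024), torch.nn.ReLU()).cuda()
        compiled_model = torch.compile(model)      # default backend is "inductor"

        x = torch.randn(32, 1024, device="cuda")
        out = compiled_model(x)                    # the first call triggers JIT compilation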

        + +

        2. Background

        + +

        2.1 Why is automatic performance optimization important for production?

        + +

        Performance is particularly important for production: even a 5% reduction in the training time of a heavily used model can translate to substantial savings in GPU cost and data-center power. Another important metric is development efficiency, which measures how many engineer-months are required to bring a model to production. Typically, a significant part of this bring-up effort is spent on manual performance tuning such as rewriting GPU kernels to improve the training speed. By providing automatic performance optimization, PT2 can improve both cost and development efficiency.

        + +

        2.2 How PT2 improves performance

        + +

        As a compiler, PT2 can view multiple operations in the training graph captured from a model (unlike in PT1.x, where only one operation is executed at a time). Consequently, PT2 can exploit a number of performance optimization opportunities, including:

        + +
          +
        • Fusing multiple operations into a single GPU kernel: +
            +
          • A typical type of performance overhead in running a GPU program is the CPU overhead of launching small GPU kernels. By fusing multiple operations into a single GPU kernel, PT2 can significantly reduce the kernel-launching overhead on the CPU. For instance, consider the PyTorch program in Figure 1(a). When it is executed on GPU with PT1, it has three GPU kernels (two for the two sin() ops and one for the addition op). With PT2, there is only one kernel generated, which fuses all three ops.
          • +
          • After fusing some operations, certain operations in the graph may become dead and hence can be optimized away. This can save both compute and memory bandwidth on the GPU. For instance, in Figure 1(b), one of the duplicated sin() ops can be optimized away.
          • +
          • In addition, fusion can also reduce GPU device memory reads/writes (by composing pointwise kernels) and help improve hardware utilization.
          • +
          +
        • +
        + +

        Fig. 1: How PT2 improves performance with fusion and dead-code elimination.
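
        A toy program in the spirit of Figure 1 (not the exact code from the figure) shows the effect:

        import torch

        def f(x):
            a = torch.sin(x)
            b = torch.sin(x)   # duplicate of the line above; eligible for dead-code elimination
            return a + b

        # Eager PT1.x launches separate kernels for the sin() calls and the add; with
        # torch.compile the ops can be fused into a single generated Triton kernel.
        compiled_f = torch.compile(f)
        y = compiled_f(torch.randn(1_000_000, device="cuda"))
        # Running with TORCH_LOGS="output_code" prints the generated kernel (recent PyTorch versions).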

        + +
          +
        • Reducing the type conversion overhead for using lower-precision data types: +
            +
          • PyTorch 1.x supports Automatic Mixed Precision (AMP). While AMP can reduce the compute time of an op, it introduces type conversion overhead before and after the op. PT2 can increase AMP performance by optimizing away unnecessary type conversion code, significantly reducing its overhead. As an example, Figure 2(a) converts three 32-bit input tensors (a32, b32, c32) to bf16 before doing the matrix multiplications. Nevertheless, in this example, a32 and c32 are actually the same tensor (a_float32). So, there is no need to convert a_float32 twice, as shown in the code generated by torch.compile in Figure 2(b). Note that while both this example and the previous one optimize away redundant computations, they are different in the sense that the type conversion code in this example is implicit via torch.autocast, unlike in the previous example where the torch.sin(x).cuda() is explicit in user code.
          • +
          +
        • +
        + +

        Fig. 2: How PT2 reduces type conversion overhead when using AMP.
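
        A sketch in the spirit of Figure 2 (the shapes and tensor names are made up): under torch.autocast, the inputs are cast to bf16 before the matrix multiplications, and torch.compile can drop the duplicated cast when the same fp32 tensor feeds both of them.

        import torch

        def g(a_float32, b_float32):
            with torch.autocast(device_type="cuda", dtype=torch.bfloat16):
                x = torch.mm(a_float32, b_float32)
                y = torch.mm(a_float32, b_float32.t())   # a_float32 is reused: one bf16 cast suffices
            return x, y

        g_compiled = torch.compile(g)
        out = g_compiled(torch.randn(512, 512, device="cuda"), torch.randn(512, 512, device="cuda"))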

        + +
          +
        • Reusing buffers on the GPU: +
            +
          • With a global view, the scheduler in torch.compile can reuse buffers on the GPU, thereby reducing both memory allocation time and memory consumption. Figure 3 shows the driver program that calls the Triton kernels generated for the program in Figure 2(a). We can see that buf1 is reused as buf4.
          • +
          +
        • +
        + +

        Fig. 3: Reuse of buffers.

        + +
          +
        • Autotuning: +
            +
          • PT2 has options to enable autotuning (via Triton) on matrix-multiply ops, pointwise ops, and reduction ops. Tunable parameters include block size, number of stages, and number of warps. With autotuning, the most performant implementation of an op can be found empirically.
          • +
          +
        • +
        + +
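
        In open-source PyTorch, this autotuning is typically requested through the compile mode (a sketch; the finer-grained knobs live in torch._inductor.config and vary by version, and the toy model is a placeholder):

        import torch

        model = torch.nn.Linear(1024, 1024).cuda()
        compiled_model = torch.compile(model, mode="max-autotune")   # autotune matmul and Triton kernel configs
        out = compiled_model(torch.randn(8, 1024, device="cuda"))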

        3. Production environment considerations

        + +

        In this section, we describe a number of important considerations in applying PT2 to production.

        + +

        3.1 Ensuring no model quality degradation with torch.compile

        + +

        Applying torch.compile to a model will cause numerical changes because of (1) reordering of floating-point ops during various optimizations such as fusion and (2) use of lower precision data types like bf16 if AMP is enabled. Therefore 100% bitwise compatibility with PT 1.x is not expected. Nevertheless, we still need to make sure that the model quality (measured in some form of numeric scores) is preserved after applying torch.compile. Typically, each production model will have its own range of acceptable scores (e.g., percentage change must be within 0.01%).

        + +

        In the case of a model-quality drop caused by torch.compile, we need to do a deep dive to debug it.

        + +

        One useful technique for debugging a torch.compile-related numeric issue is to apply torch.compile with different backends, in particular “eager” and “aot_eager”, in addition to “inductor”:

        + +
          +
        • If the numeric issue happens with the “eager” backend, then the forward graph constructed by torch.compile is likely incorrect;
        • +
        • If the numeric issue doesn’t happen with “eager” but happens with “aot_eager”, then the backward graph constructed by torch.compile is likely incorrect;
        • +
        • If the numeric issue doesn’t happen with either “eager” or “aot_eager” but happens with “inductor”, then the code generation inside the inductor is likely incorrect.
        • +
        + +
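
        A sketch of this bisection recipe (the toy model, inputs, and tolerance are placeholders):

        import torch

        model = torch.nn.Sequential(torch.nn.Linear(256, 256), torch.nn.ReLU()).cuda()
        x = torch.randn(16, 256, device="cuda")
        expected = model(x)                                # eager, uncompiled reference

        for backend in ("eager", "aot_eager", "inductor"):
            torch._dynamo.reset()                          # discard previously compiled graphs
            out = torch.compile(model, backend=backend)(x)
            print(backend, torch.allclose(out, expected, atol=1e-3))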

        3.2 Autotuning in production

        + +

        By default, the autotuning in torch.inductor is done online while the model is executed. For some production models, we find that the autotuning time can take several hours, which is not acceptable for production. Therefore, we add offline autotuning which works as depicted in Figure 4. The very first time that a model is run, the details (e.g., input tensor shape, data type etc) on all ops that require tuning will be logged to a database. Then, a tuning process for these ops is run overnight to search for the most performant implementation of each op; the search result is updated to a persistent cache (implemented as a source file of torch.inductor). Next time when the model is run again, the tuned implementation of each op will be found in the cache and chosen for execution.

        + +

        Fig. 4: The offline autotuning used in production.

        + +

        3.3 Profiling support for torch.compile

        + +

        As we previously discussed in this blog, a profiler is essential for debugging the performance of production models. We have enhanced the profiler to display torch.compile related events on the timeline. The most useful ones are marking which parts of the model are running compiled code so that we can quickly validate if the parts of the model that are supposed to be compiled are actually compiled by torch.compile. For example, the trace in Figure 5 has two compiled regions (with the label “CompiledFunction”). Other useful events are time spent on the compilation and that spent on accessing the compiler’s code-cache.
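
        A minimal way to collect such a trace for a compiled model (a sketch; the model is a placeholder):

        import torch
        from torch.profiler import ProfilerActivity, profile

        model = torch.nn.Linear(1024, 1024).cuda()
        compiled_model = torch.compile(model)
        x = torch.randn(8, 1024, device="cuda")
        compiled_model(x)                                   # warm-up call that triggers compilation

        with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA]) as prof:
            compiled_model(x)
        prof.export_chrome_trace("compiled_trace.json")     # look for "CompiledFunction" events in the trace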

        + +

        Fig. 5: A trace with two compiled regions.

        + +

        3.4 Controlling just-in-time compilation time

        + +

        torch.compile uses just-in-time compilation. The compilation happens when the first batch of data is trained. In our production setting, there is an upper limit on how much time is allowed for a training job to reach its first batch, aka Time-To-First-Batch (TTFB). We need to make sure that enabling torch.compile will not increase TTFB to over the limit. This could be challenging because production models are large and torch.compile can take substantial compilation time. We enable parallel compilation to keep the compile time under control (this is controlled by the global variable compile_threads inside torch/_inductor/config.py, which is already set to the CPU count on OSS Linux). A model is decomposed into one or more computational graphs; each graph is decomposed into multiple Triton kernels. If parallel compilation is enabled, all the Triton kernels in the same graph can be compiled simultaneously (nevertheless, kernels from different graphs are still compiled in serial). Figure 6 illustrates how parallel compilation helps.
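
        The knob referenced above can also be set explicitly (shown only for illustration; on OSS Linux it already defaults to the CPU count):

        import torch._inductor.config as inductor_config

        inductor_config.compile_threads = 32   # Triton kernels within one graph are compiled in parallel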

        + +

        Fig. 6: Using parallel compilation in production.

        + +

        4. Results

        + +

        In this section, we use three production models to evaluate PT2. First we show the training time speedups with PT2, using different optimization configs. Second, we show the importance of parallel compilation on the compilation time.

        + +

        4.1 Training-time speedup with torch.compile

        + +

        Figure 7 reports the training-time speedup with PT2. For each model, we show four cases: (i) no-compile with bf16, (ii) compile with fp32, (iii) compile with bf16, (iv) compile with bf16 and autotuning. The y-axis is the speedup over the baseline, which is no-compile with fp32. Note that no-compile with bf16 is actually slower than no-compile with fp32, due to the type conversion overhead. In contrast, compiling with bf16 achieves much larger speedups by reducing much of this overhead. Overall, given that these models are already heavily optimized by hand, we are excited to see that torch.compile can still provide 1.14-1.24x speedup.

        + +

        Fig. 7: Training-time speedup with torch.compile (note: the baseline, no-compile/fp32, is omitted in this figure).

        + +

        4.2 Compilation-time reduction with parallel compilation

        + +

        Figure 8 shows the compilation time with and without parallel compilation. While there is still room for improvement on the serial compilation time, parallel compilation has reduced the compilation overhead on TTFB to an acceptable level. Models B and C benefit more from parallel compilation than Model A does because they have more distinct Triton kernels per graph.

        + +

        Fig. 8: PT2 compilation time.

        + +

        5. Concluding Remarks

        + +

        In this blog, we demonstrate that PT2 can significantly accelerate the training of large and complex production AI models with reasonable compilation time. In our next blog, we will discuss how PT2 can do general graph transformations.

        + +

        6. Acknowledgements

        + +

        Many thanks to Mark Saroufim, Adnan Aziz, and Gregory Chanan for their detailed and insightful reviews.

        + +
diff --git a/blog/training-using-float8-fsdp2/index.html b/blog/training-using-float8-fsdp2/index.html
new file mode 100644
index 000000000000..a22b48d16dc2
--- /dev/null
+++ b/blog/training-using-float8-fsdp2/index.html
@@ -0,0 +1,859 @@

        Supercharging Training using float8 and FSDP2 | PyTorch

        November 25, 2024

        +

        + Supercharging Training using float8 and FSDP2 +

        +
        +
        + +
        +
        +
        + +
        +

        + by + + IBM and Meta + +

        +

        IBM: Tuan Hoang Trong, Alexei Karve, Yan Koyfman, Linsong Chu, Divya Kumari, Shweta Salaria, Robert Walkup, Praneet Adusumilli, Nirmit Desai, Raghu Ganti, Seetharami Seelam
        +Meta: Less Wright, Wei Feng, Vasiliy Kuznetsov, Driss Guesseous

        + +

        In this blog, we will demonstrate how we achieve up to 50% throughput speedup over FSDP1 bf16 training while achieving loss and evaluation benchmark parity. We achieve this speedup by leveraging FSDP2, DTensor, and torch.compile with torchao’s float8 via linear layer updates (compute), and float8 all_gathers for weight communication. We showcase these improvements across a spectrum of Meta LLaMa model architecture sizes, ranging from small 1.8B model size all the way to 405B model size, making training faster than ever.

        + +

        We demonstrate these improvements using the Meta Llama3 architecture, and then perform model quality studies at two scales: 100B tokens at 8B model size, and 50B tokens at 70B model size, which provide an exact comparison of float8 and bf16 training loss curves. We demonstrate that the loss curves result in identical loss convergence across these model training runs compared to the bf16 counterpart. Further, we train a 3B model to 1T tokens using the FineWeb-edu dataset and run standard evaluation benchmarks to ensure that the model quality is intact and comparable to a bf16 run.

        + +

        At IBM Research, we plan to adopt these capabilities for our data ablations to improve the number of experiments we can perform in a given GPU budget. Longer term, we will follow up with a larger scale model run to demonstrate the end-to-end feasibility of float8 training.

        + +

        What is Float8?

        + +

        The float8 format for training models was introduced by NVIDIA, ARM, and Intel in a 2022 paper which demonstrated the feasibility of training using lower precision float8, without sacrificing model quality. With the introduction of newer GPUs like the NVIDIA Hopper series, FP8 training became feasible with the potential of more than 2x improvement in training throughput due to native float8 tensor core support. There are a few challenges to realize this promise:
        +(i) Enable the core model operations like matmul and attention in float8,
        +(ii) Enable float8 training in a distributed framework, and
        +(iii) Enable weight communication between GPUs in float8.
        +While the float8 matmul was enabled by NVIDIA libraries, the latter two were provided in recent updates to FSDP2 and torchao.

        + +

        In this blog, we are using torchtitan as the entry point for training, IBM’s deterministic data loader, the float8 linear layer implementation from torchao, and the float8 all gather from the latest PyTorch nightlies in conjunction with FSDP2. For this training, we are using the float8 per tensor (tensorwise) scaling granularity rather than rowwise. We leverage torch.compile to ensure that we get maximum performance gains. We are computing attention in bf16 using SDPA and are currently working on moving this to float8 as well.
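
        Putting the pieces together looks roughly like the sketch below; the torchao module and flag names are assumptions based on recent torchao releases, so consult the torchao documentation for the exact API, and the toy model is a placeholder.

        import torch
        from torchao.float8 import Float8LinearConfig, convert_to_float8_training

        model = torch.nn.Sequential(torch.nn.Linear(4096, 4096), torch.nn.Linear(4096, 4096)).cuda()

        # enable_fsdp_float8_all_gather turns on the float8 weight all-gather path used with FSDP2
        config = Float8LinearConfig(enable_fsdp_float8_all_gather=True)
        convert_to_float8_training(model, config=config)   # swaps nn.Linear modules for float8 linear layers

        model = torch.compile(model)                        # compile for maximum gains, as described above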

        + +

        Experiments

        + +

        We perform various experiments to demonstrate the benefits of float8 training. The first is to ensure that model quality is not sacrificed. To verify this, we train an 8B model and 70B model for a few thousand steps and compare the loss curves between both the float8 and bf16 training run. Our experiments are performed on three different H100 clusters with 128, 256, and 512 H100 GPU configurations in very different environments to demonstrate reproducibility. The first cluster is customized on Grand Teton in Meta with 400Gbps custom interconnect, the second is an IBM research cluster with 3.2Tbps Infiniband interconnect, and the third is an IBM Cloud cluster with 3.2Tbps RoCE interconnect for GPU-to-GPU communication.

        + +

        First, we plot the loss curve comparisons for both these models in the below figures to demonstrate loss parity for a few thousand steps.

        + +

        Figure 1: (a) 8B model loss parity for 2k steps, (b) 70B loss parity for 1k steps

        + +

        We observe that across these different models and in different environments, we obtain loss parity for the small scale of tokens. Next, we characterize the throughput gains for four different model sizes ranging from 1.8B to 405B. We explored the best batch size and activation checkpointing schemes for both the float8 and bf16 training runs to determine the tokens/sec/GPU (wps) metric and report the performance gain. For the 405B model, we leveraged DTensor for tensor parallel training with FSDP2. We use a sequence length of 8K for all our measurements.

        Model size   | wps (bf16) | wps (float8) | Percent gain
        1.8B         | 29K        | 35K          | 18%
        8B           | 8K         | 10K          | 28%
        70B          | 956        | 1430         | 50%
        405B (TP4)   | 149        | 227          | 52%

        Table 1: Performance gains over bf16 (both bf16 and float8 use torch.compile)

        + +

        We observe from Table 1 that the gains for larger models (70B and 405B) reach up to 50%, while the smaller models see gains of roughly 20 to 30%. In further experiments, we observed that the addition of float8 all_gather enables a boost of ~5% beyond the compute itself in float8, which is in line with the observations in this blog.

        + +

        Second, to demonstrate the effectiveness of an FP8 model, we trained a 3B model following the Llama3 architecture for 1T tokens using the FineWeb-edu dataset from Hugging Face. We performed evaluations using the lm-eval-harness framework and present a small portion of these results in the below table. We observe that the bf16 performance is marginally better than the float8 scores (about one percent). While some scores are significantly better with bf16 (e.g., MMLU is 3 pts higher), we expect these gaps to vanish when the right hyperparameters are chosen and across larger scale training runs (e.g., the bf16 run had half the batch size, and it is well known that smaller batch size runs can improve evaluation scores).

        Benchmark       | Score (float8) | Score (bf16)
        MMLU (5-shot)   | 0.26           | 0.29
        ARC-e           | 0.73           | 0.73
        ARC-c           | 0.43           | 0.46
        Hellaswag       | 0.65           | 0.67
        sciq            | 0.89           | 0.88
        OpenBook QA     | 0.43           | 0.43
        PIQA            | 0.76           | 0.76
        Winogrande      | 0.60           | 0.65
        Average         | 0.59           | 0.60

        Table 2: Benchmark scores for float8 trained model running in FP16 for eval (at 1T tokens of FineWeb pre-training).

        + +

        Finally, we scale our experiments to 512 H100 GPUs on the IBM Cloud cluster. We were able to recreate the results and speedups that we observed even at 512 GPU scale. We summarize these results only for the large models in the below table (70B and 405B).

        Model size   | wps (bf16) | wps (float8) | Percent gain
        70B          | 960        | 1448         | 51%
        405B (TP4)   | 152        | 217          | 43%

        Table 3: Performance gains over bf16 (both bf16 and float8 use torch.compile) for 512 GPU scale

        + +

        Future work

        + +

        We are also working on evaluating other forms of parallelism such as Context Parallelism. We plan to evaluate all of these features to demonstrate the composability and ability to make choices for training large scale models.

        + +

        Acknowledgements

        + +

        We thank Davis Wertheimer from IBM Research for enabling the data loader for torchtitan runs, which allows us to replay data in the same order across multiple runs. We also thank IBM Cloud for providing us with early test access to the H100 cluster.

        + +
diff --git a/blog/triton-kernel-compilation-stages/index.html b/blog/triton-kernel-compilation-stages/index.html
new file mode 100644
index 000000000000..32f7cbc39b0d
--- /dev/null
+++ b/blog/triton-kernel-compilation-stages/index.html
@@ -0,0 +1,820 @@

        Triton Kernel Compilation Stages | PyTorch

        October 30, 2024

        +

        + Triton Kernel Compilation Stages +

        +
        +
        + +
        +
        +
        + +
        +

        + by + + Sara Kokkila-Schumacher*, Brian Vaughan*, Raghu Ganti*, and Less Wright+ (*IBM Research, +Meta) + +

        +

        The Triton open-source programming language and compiler offers a high-level, Python-based approach to create efficient GPU code. In this blog, we highlight the underlying details of how a Triton program is compiled and its intermediate representations. For an introduction to Triton, we refer readers to this blog.

        + +

        Triton Language and Compilation

        + +

        The Triton programming language supports different types of modern GPUs and follows a blocked programming approach. As an example, we will follow the Triton vector add tutorial with minor modifications. The vector addition kernel and helper function are defined as:

        + +
        import torch
        +import triton
        +import triton.language as tl
        +
        +@triton.jit
        +def add_kernel(x_ptr,  # *Pointer* to first input vector.
        +               y_ptr,  # *Pointer* to second input vector.
        +               output_ptr,  # *Pointer* to output vector.
        +               n_elements, 
        +               BLOCK_SIZE: tl.constexpr, 
        +               ):
        +  
        +    pid = tl.program_id(axis=0) 
        +    block_start = pid * BLOCK_SIZE
        +    offsets = block_start + tl.arange(0, BLOCK_SIZE)
        + 
        +    mask = offsets < n_elements
        +
        +    x = tl.load(x_ptr + offsets, mask=mask)
        +    y = tl.load(y_ptr + offsets, mask=mask)
        +    output = x + y
        +    tl.store(output_ptr + offsets, output, mask=mask)
        + 
        +def add(x: torch.Tensor, y: torch.Tensor):
        +    output = torch.empty_like(x)
        +    assert x.is_cuda and y.is_cuda and output.is_cuda
        +    n_elements = output.numel()
        +
        +    grid = lambda meta: (triton.cdiv(n_elements, meta['BLOCK_SIZE']), )
        +    triton_kernel=add_kernel[grid](x, y, output, n_elements, BLOCK_SIZE=1024)
        +    torch.cuda.synchronize()
        +
        +    # Save compilation stages - some of the stages identified here are specific to NVIDIA devices:
        +    with open('triton_IR.txt', 'w') as f:
        +        print(triton_kernel.asm['ttir'], file=f)
        +    with open('triton_TTGIR.txt', 'w') as f:
        +        print(triton_kernel.asm['ttgir'], file=f)
        +    with open('triton_LLVMIR.txt', 'w') as f:
        +        print(triton_kernel.asm['llir'], file=f)
        +    with open('triton_PTX.ptx', 'w') as f:
        +        print(triton_kernel.asm['ptx'], file=f)
        +    with open('triton_cubin.txt', 'w') as f:
        +        print(triton_kernel.asm['cubin'], file=f)
        +
        +    return output
        +
        +torch.manual_seed(0)
        +size = 98432
        +x = torch.rand(size, device='cuda')
        +y = torch.rand(size, device='cuda')
        +output_torch = x + y
        +output_triton = add(x, y)
        +print(output_torch)
        +print(output_triton)
        +print(f'The maximum difference between torch and triton is '
        +      f'{torch.max(torch.abs(output_torch - output_triton))}')    
        +
        + +

        The Triton vector add kernel includes the @triton.jit decorator. The Triton compiler will compile functions marked by @triton.jit, which lowers the function through multiple compilation stages. The helper function add allocates the output tensor, computes the appropriate GPU grid size, and additionally saves the intermediate compilation stages.

        + +

        Focusing on the compilation process, the Triton kernel is lowered to device specific assembly through a series of stages outlined in the following figure.

        + +

        compilation process

        + +

        The kernel is compiled by first walking the abstract syntax tree (AST) of the decorated python function to create the Triton Intermediate Representation (Triton-IR). The Triton-IR is an unoptimized, machine independent intermediate representation. It introduces tile-level programming requirements and is based on the open-source LLVM compiler project. Next the Triton compiler optimizes and converts the Triton-IR into the stages Triton-GPU IR (Triton-TTGIR) and then LLVM-IR. Both the Triton-IR and Triton-GPUIR representations are written as MLIR dialects, where MLIR is a subproject of LLVM that aims to improve compilation for heterogeneous hardware.

        + +

        For the Triton vector add tutorial kernel, the example Triton IR snippet is:

        + +
        module {
        +  tt.func public @add_kernel(%arg0: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("/u/saraks/triton_blog/01-vector-add.py":28:0), %arg1: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("/u/saraks/triton_blog/01-vector-add.py":28:0), %arg2: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("/u/saraks/triton_blog/01-vector-add.py":28:0), %arg3: i32 {tt.divisibility = 16 : i32} loc("/u/saraks/triton_blog/01-vector-add.py":28:0)) attributes {noinline = false} {
        +    %c1024_i32 = arith.constant 1024 : i32 loc(#loc1)
        +    %0 = tt.get_program_id x : i32 loc(#loc2)
        +    %1 = arith.muli %0, %c1024_i32 : i32 loc(#loc3)
        +    %2 = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32> loc(#loc4)
        +    %3 = tt.splat %1 : i32 -> tensor<1024xi32> loc(#loc5)
        +    %4 = arith.addi %3, %2 : tensor<1024xi32> loc(#loc5)
        +    %5 = tt.splat %arg3 : i32 -> tensor<1024xi32> loc(#loc6)
        +    %6 = arith.cmpi slt, %4, %5 : tensor<1024xi32> loc(#loc6)
        +    %7 = tt.splat %arg0 : !tt.ptr<f32> -> tensor<1024x!tt.ptr<f32>> loc(#loc7)
        +    %8 = tt.addptr %7, %4 : tensor<1024x!tt.ptr<f32>>, tensor<1024xi32> loc(#loc7)
        +    %9 = tt.load %8, %6 : tensor<1024x!tt.ptr<f32>> loc(#loc8)
        +    %10 = tt.splat %arg1 : !tt.ptr<f32> -> tensor<1024x!tt.ptr<f32>> loc(#loc9)
        +    %11 = tt.addptr %10, %4 : tensor<1024x!tt.ptr<f32>>, tensor<1024xi32> loc(#loc9)
        +    %12 = tt.load %11, %6 : tensor<1024x!tt.ptr<f32>> loc(#loc10)
        +    %13 = arith.addf %9, %12 : tensor<1024xf32> loc(#loc11)
        +    %14 = tt.splat %arg2 : !tt.ptr<f32> -> tensor<1024x!tt.ptr<f32>> loc(#loc12)
        +    %15 = tt.addptr %14, %4 : tensor<1024x!tt.ptr<f32>>, tensor<1024xi32> loc(#loc12)
        +    tt.store %15, %13, %6 : tensor<1024x!tt.ptr<f32>> loc(#loc13)
        +    tt.return loc(#loc14)
        +  } loc(#loc)
        +} loc(#loc)
        +
        + +

        Notice that the main functions in the Triton kernel are now represented as:

        Triton kernel                                      | Triton IR
        x = tl.load(x_ptr + offsets, mask=mask)            | %9 = tt.load %8, %6 : tensor<1024x!tt.ptr<f32>> loc(#loc8)
        y = tl.load(y_ptr + offsets, mask=mask)            | %12 = tt.load %11, %6 : tensor<1024x!tt.ptr<f32>> loc(#loc10)
        output = x + y                                     | %13 = arith.addf %9, %12 : tensor<1024xf32> loc(#loc11)
        tl.store(output_ptr + offsets, output, mask=mask)  | tt.store %15, %13, %6 : tensor<1024x!tt.ptr<f32>> loc(#loc13)

        At the Triton IR stage, the %arg0: !tt.ptr<f32> and the following tensor references show that the intermediate representation is already specialized by the data type.

        + +

        We ran this example on a Tesla V100-SXM2-32GB GPU with CUDA Version 12.2, Python version 3.11.9, and PyTorch 2.4.1 with the default version of Triton that is installed with PyTorch. On this device, the simple vector addition has the following Triton GPU IR snippet with lines omitted for clarity:

        + +
        #blocked = #triton_gpu.blocked<{sizePerThread = [4], threadsPerWarp = [32], warpsPerCTA = [4], order = [0]}>
        +module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, triton_gpu.target = "cuda:70", "triton_gpu.threads-per-warp" = 32 : i32} {
        +  tt.func public @add_kernel(%arg0: !tt.ptr<f32> {tt.divisibility = 16 : i32}
        +    ⋮
        +    %9 = tt.load %8, %6 : tensor<1024x!tt.ptr<f32>, #blocked> loc(#loc8)
        +    ⋮
        +    %12 = tt.load %11, %6 : tensor<1024x!tt.ptr<f32>, #blocked> loc(#loc10)
        +    %13 = arith.addf %9, %12 : tensor<1024xf32, #blocked> loc(#loc11)
        +    ⋮
        +    tt.store %15, %13, %6 : tensor<1024x!tt.ptr<f32>, #blocked> loc(#loc13)
        +    ⋮
        +  } loc(#loc)
        +} loc(#loc)
        +
        + +

        At this stage, some of the hardware specific information is included. For example, the compute capability is included along with details on how the tensors are distributed to cores and warps or for AMD GPUs on wavefronts. In this example, the tensors are represented as a #blocked layout. In this encoding, each warp owns a contiguous portion of the tensor. Currently, other possible memory optimizations include layouts such as slice (restructures and distributes a tensor along a dimension), dot_op(optimized layout for block matrix product), shared(indicates GPU shared memory), nvidia_mma (produced by NVIDIA tensor cores), amd_mfma (produced by AMD MFMA matrix core), and amd_wmma (produced by AMD WMMA matrix core). As announced at the recent Triton conference, this layout representation will transition to a new linear layout to unify layouts within and across backends. The stage from Triton-GPUIR to LLVM-IR converts the Triton-GPUIR to LLVM’s representation. At this time, Triton has third-party backend support for NVIDIA and AMD devices, but other device support is under active development by the open-source community.

        + +

        A small subset of the LLVM-IR vector add arguments shown below for illustration:

        + +
          %19 = extractvalue { i32, i32, i32, i32 } %18, 0, !dbg !16
        +  %39 = extractvalue { i32, i32, i32, i32 } %38, 0, !dbg !18
        +  %23 = bitcast i32 %19 to float, !dbg !16
        +  %43 = bitcast i32 %39 to float, !dbg !18
        +  %56 = fadd float %23, %43, !dbg !19
        +
        + +

        After some pointer arithmetic and an inline assembly call to retrieve the data from global memory, the vector elements are extracted and cast to the correct type. Finally they are added together and later written to global memory through an inline assembly expression.

        + +

        The final stages of the Triton compilation process lower the LLVM-IR to a device specific binary. For the example vector add, on an NVIDIA GPU, the next intermediate is PTX (Parallel Thread Execution). The low-level PTX syntax specifies the execution at the thread level of NVIDIA devices, starting with the CUDA 1.0 release. For an in-depth guide on PTX, see NVIDIA’s documentation. In the vector add, the kernel parameters are passed from the host to the kernel, addresses are assigned and mov instructions facilitate the thread-level data access, ultimately representing the element addition calls with add.f32 such as the example below:

        + +
        	add.f32 	%f17, %f1, %f9   // add type float32, output register, input register for x, input register for y
        +
        + +

        The Triton compiler orchestrates the final stage with different hardware backends managing how the assembly code is compiled into binary. The Triton kernel is now ready for use.

        + +

        Summary

        + +

        Triton provides a high-level abstraction to program and compile kernels for different types of hardware. In this post, we highlight the different stages of the Triton code representations and Triton compiler. For details on including custom Triton kernels or accelerating different workloads with Triton kernels, check out the PyTorch Triton tutorial, the blog posts on Triton GPTQ kernels, Llama3 FP8 Inference with Triton, and CUDA-Free Inference for LLMs, or the PyTorch 2.2 Section on Triton code generation.

        + +
diff --git a/blog/understanding-gpu-memory-1/index.html b/blog/understanding-gpu-memory-1/index.html
new file mode 100644
index 000000000000..4d341f15c3fc
--- /dev/null
+++ b/blog/understanding-gpu-memory-1/index.html
@@ -0,0 +1,989 @@

        Understanding GPU Memory 1: Visualizing All Allocations over Time | PyTorch

        + by + + Aaron Shi, Zachary DeVito + +

        +

        During your time with PyTorch on GPUs, you may be familiar with this common error message:

        + +
        torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 512.00 MiB. GPU 0 has a total capacity of 79.32 GiB of which 401.56 MiB is free.
        +
        + +

        In this series, we show how to use memory tooling, including the Memory Snapshot, the Memory Profiler, and the Reference Cycle Detector to debug out of memory errors and improve memory usage.

        + +

        Memory Timeline

        + +

        The Memory Snapshot tool provides a fine-grained GPU memory visualization for debugging GPU OOMs. Captured memory snapshots will show memory events including allocations, frees and OOMs, along with their stack traces.

        + +

        In a snapshot, each tensor’s memory allocation is color coded separately. The x axis is over time, and the y axis is the amount of GPU memory in MB. The snapshot is interactive, so we can observe the stack trace for any allocation by mousing over. Try it yourself at https://github.com/pytorch/pytorch.github.io/blob/site/assets/images/understanding-gpu-memory-1/snapshot.html.

        + +

        In this snapshot, there are 3 peaks showing the memory allocations over 3 training iterations (this is configurable). When looking at the peaks, it is easy to see the rise of memory in the forward pass and the fall during the backward pass as the gradients are computed. It is also possible to see that the program has the same pattern of memory use from iteration to iteration. One thing that stands out is the many tiny spikes in memory; by mousing over them, we see that they are buffers used temporarily by convolution operators.

        + +

        Capturing Memory Snapshots

        + +

        The API to capture memory snapshots is fairly simple and available in torch.cuda.memory:

        + +
          +
        • Start: torch.cuda.memory._record_memory_history(max_entries=100000)
        • +
        • Save: torch.cuda.memory._dump_snapshot(file_name)
        • +
        • Stop: torch.cuda.memory._record_memory_history(enabled=None)
        • +
        + +

        Code Snippet (for full code sample, see Appendix A):

        + +
           # Start recording memory snapshot history, initialized with a buffer
        +   # capacity of 100,000 memory events, via the `max_entries` field.
        +   torch.cuda.memory._record_memory_history(
        +       max_entries=MAX_NUM_OF_MEM_EVENTS_PER_SNAPSHOT
        +   )
        +
        +   # Run your PyTorch Model.
        +   # At any point in time, save a snapshot to file for later.
        +   for _ in range(5):
        +       pred = model(inputs)
        +       loss_fn(pred, labels).backward()
        +       optimizer.step()
        +       optimizer.zero_grad(set_to_none=True)
        +
        +   # In this sample, we save the snapshot after running 5 iterations.
        +   #   - Save as many snapshots as you'd like.
        +   #   - Snapshots will save last `max_entries` number of memory events
        +   #     (100,000 in this example).
        +   try:
        +       torch.cuda.memory._dump_snapshot(f"{file_prefix}.pickle")
        +   except Exception as e:
        +       logger.error(f"Failed to capture memory snapshot {e}")
        +
        +   # Stop recording memory snapshot history.
        +   torch.cuda.memory._record_memory_history(enabled=None)
        +
        + +

        To visualize the snapshot file, we have a tool hosted at https://pytorch.org/memory_viz. There, you can drag and drop your saved snapshot file and it will plot each allocation over time. Privacy Note: The tool will not save your snapshot.

        + +

        Memory Timeline

        + +

        Alternatively, you can generate an HTML from a .pickle by using the script at pytorch/torch/cuda/_memory_viz.py, here is an example:

        + +
        python torch/cuda/_memory_viz.py trace_plot snapshot.pickle -o snapshot.html
        +
        + +

        Debugging CUDA OOMs

        + +

        Let’s look at how we can use the memory snapshot tool to answer:

        + +
          +
        1. Why did a CUDA OOM happen?
        2. +
        3. Where is the GPU Memory being used?
        4. +
        + +

        ResNet50 with a bug

        + +

        We’ve taken a look at a properly working model in the first snapshot. Now, let’s take a look at a training example with a bug, see snapshot:

        + +

        Memory Timeline

        + +

        Notice how the second iteration uses far more memory than the first iteration. If this model were much larger, it could have CUDA OOM’d in the second iteration without much more insight into why.

        + +

        Memory Timeline

        + +

        When examining this snapshot further, we can clearly see that several tensors are staying alive from the first iteration to the second and later iterations. If we mouse over one of these tensors, it would show a stack trace suggesting that these were gradient tensors.

        + +

        And indeed if we go to the code, we can see that it doesn’t clear the gradient tensors, when it could have cleared them before the forward.

        + +

        Before:

        +
                for _ in range(num_iters):
        +          pred = model(inputs)
        +          loss_fn(pred, labels).backward()
        +          optimizer.step()
        +
        + +

        After:

        +
                for _ in range(num_iters):
        +          pred = model(inputs)
        +          loss_fn(pred, labels).backward()
        +          optimizer.step()
        +          # Add this line to clear grad tensors
        +          optimizer.zero_grad(set_to_none=True)
        +
        + +

        We can simply add an optimizer.zero_grad(set_to_none=True) instruction to clear the gradient tensors from iteration to iteration (more details about why we need to zero the gradients here: https://pytorch.org/tutorials/recipes/recipes/zeroing_out_gradients.html).

        + +

        This is a simplification of a bug we’ve found in more complicated programs using this tool. We encourage you to try out the Memory Snapshot on your GPU memory problems and let us know how it goes.

        + +

        ResNet50 after bug fix

        + +

        After applying the fix, the snapshot seems to be clearing the gradients now.

        + +

        Memory Timeline

        + +

        We now have the snapshot of a properly working ResNet50 model. Try out the code yourself (see code sample in Appendix A).

        + +

        But you may be wondering, why is there still an increase in memory after the first iteration? To answer this, let’s visit the Memory Profiler in the next section.

        + +

        Categorized Memory Usage

        + +

        The Memory Profiler is an added feature of the PyTorch Profiler that categorizes memory usage over time. We still rely on the Memory Snapshot for stack traces for deep dives into memory allocations.

        + +

        To generate a memory timeline, here is a code snippet (full code sample in Appendix B):

        + +
           # Initialize the profiler context with record_shapes, profile_memory,
        +   # and with_stack set to True.
        +   with torch.profiler.profile(
        +       activities=[
        +           torch.profiler.ProfilerActivity.CPU,
        +           torch.profiler.ProfilerActivity.CUDA,
        +       ],
        +       schedule=torch.profiler.schedule(wait=0, warmup=0, active=6, repeat=1),
        +       record_shapes=True,
        +       profile_memory=True,
        +       with_stack=True,
        +       on_trace_ready=trace_handler,
        +   ) as prof:
        +       # Run the PyTorch Model inside the profile context.
        +       for _ in range(5):
        +           prof.step()
        +           with record_function("## forward ##"):
        +               pred = model(inputs)
        +
        +           with record_function("## backward ##"):
        +               loss_fn(pred, labels).backward()
        +
        +           with record_function("## optimizer ##"):
        +               optimizer.step()
        +               optimizer.zero_grad(set_to_none=True)
        +
        +   # Construct the memory timeline HTML plot.
        +   prof.export_memory_timeline(f"{file_prefix}.html", device="cuda:0")
        +
        + +

        For further reference, see https://pytorch.org/docs/main/profiler.html.

        + +

        The Memory Profiler automatically generates categories based on the graph of tensor operations recorded during profiling.

        + +

        Memory Timeline

        + +

        In this Memory Timeline collected using the Memory Profiler, we have the same training example as before. We can observe the gradients in blue are now being cleared from iteration to iteration. We can also notice that the optimizer state in yellow is allocated after the first iteration, and is kept constant for the rest of the job.

        + +

        This optimizer state is the reason behind the increase of GPU memory from the first iteration to the second. Try out the code yourself (see code sample in Appendix B). The Memory Profiler helps to improve training memory understanding so that model authors can figure out which categories are using the most GPU memory.

        + +

        Where can I find these tools?

        + +

        We hope that these tools will greatly improve your ability to debug CUDA OOMs and to understand your memory usage by category.

        + +

        The Memory Snapshot and the Memory Profiler are available in the v2.1 release of PyTorch as experimental features.

        + + + +

        Feedback

        + +

        We look forward to hearing from you about any enhancements, bugs or memory stories that our tools helped to solve! As always, please feel free to open new issues on PyTorch’s Github page.

        + +

        We are also open to contributions from the OSS community, feel free to tag Aaron Shi and Zachary DeVito in any Github PRs for reviews.

        + +

        Acknowledgements

        + +

        Really appreciate the content reviewers, Mark Saroufim and Gregory Chanan, for reviewing this post and improving its readability.

        + +

        Really appreciate the code reviews and feedback from Adnan Aziz and Lei Tian.

        + +

        Appendix

        + +

        Appendix A - ResNet50 Memory Snapshot Code Example

        + +
        # (c) Meta Platforms, Inc. and affiliates. 
        +import logging
        +import socket
        +from datetime import datetime, timedelta
        +
        +import torch
        +
        +from torchvision import models
        +
        +logging.basicConfig(
        +   format="%(levelname)s:%(asctime)s %(message)s",
        +   level=logging.INFO,
        +   datefmt="%Y-%m-%d %H:%M:%S",
        +)
        +logger: logging.Logger = logging.getLogger(__name__)
        +logger.setLevel(level=logging.INFO)
        +
        +TIME_FORMAT_STR: str = "%b_%d_%H_%M_%S"
        +
        +# Keep a max of 100,000 alloc/free events in the recorded history
        +# leading up to the snapshot.
        +MAX_NUM_OF_MEM_EVENTS_PER_SNAPSHOT: int = 100000
        +
        +def start_record_memory_history() -> None:
        +   if not torch.cuda.is_available():
        +       logger.info("CUDA unavailable. Not recording memory history")
        +       return
        +
        +   logger.info("Starting snapshot record_memory_history")
        +   torch.cuda.memory._record_memory_history(
        +       max_entries=MAX_NUM_OF_MEM_EVENTS_PER_SNAPSHOT
        +   )
        +
        +def stop_record_memory_history() -> None:
        +   if not torch.cuda.is_available():
        +       logger.info("CUDA unavailable. Not recording memory history")
        +       return
        +
        +   logger.info("Stopping snapshot record_memory_history")
        +   torch.cuda.memory._record_memory_history(enabled=None)
        +
        +def export_memory_snapshot() -> None:
        +   if not torch.cuda.is_available():
        +       logger.info("CUDA unavailable. Not exporting memory snapshot")
        +       return
        +
        +   # Prefix for file names.
        +   host_name = socket.gethostname()
        +   timestamp = datetime.now().strftime(TIME_FORMAT_STR)
        +   file_prefix = f"{host_name}_{timestamp}"
        +
        +   try:
        +       logger.info(f"Saving snapshot to local file: {file_prefix}.pickle")
        +       torch.cuda.memory._dump_snapshot(f"{file_prefix}.pickle")
        +   except Exception as e:
        +       logger.error(f"Failed to capture memory snapshot {e}")
        +       return
        +
        +# Simple Resnet50 example to demonstrate how to capture memory visuals.
        +def run_resnet50(num_iters=5, device="cuda:0"):
        +   model = models.resnet50().to(device=device)
        +   inputs = torch.randn(1, 3, 224, 224, device=device)
        +   labels = torch.rand_like(model(inputs))
        +   optimizer = torch.optim.SGD(model.parameters(), lr=1e-2, momentum=0.9)
        +   loss_fn = torch.nn.CrossEntropyLoss()
        +
        +   # Start recording memory snapshot history
        +   start_record_memory_history()
        +
        +   for _ in range(num_iters):
        +       pred = model(inputs)
        +       loss_fn(pred, labels).backward()
        +       optimizer.step()
        +       optimizer.zero_grad(set_to_none=True)
        +
        +   # Create the memory snapshot file
        +   export_memory_snapshot()
        +
        +   # Stop recording memory snapshot history
        +   stop_record_memory_history()
        +
        +if __name__ == "__main__":
        +    # Run the resnet50 model
        +    run_resnet50()
        +
        + +

        Appendix B - ResNet50 Memory Profiler Code Example

        + +
        # (c) Meta Platforms, Inc. and affiliates. 
        +import logging
        +import socket
        +from datetime import datetime, timedelta
        +
        +import torch
        +
        +from torch.autograd.profiler import record_function
        +from torchvision import models
        +
        +logging.basicConfig(
        +   format="%(levelname)s:%(asctime)s %(message)s",
        +   level=logging.INFO,
        +   datefmt="%Y-%m-%d %H:%M:%S",
        +)
        +logger: logging.Logger = logging.getLogger(__name__)
        +logger.setLevel(level=logging.INFO)
        +
        +TIME_FORMAT_STR: str = "%b_%d_%H_%M_%S"
        +
        +def trace_handler(prof: torch.profiler.profile):
        +   # Prefix for file names.
        +   host_name = socket.gethostname()
        +   timestamp = datetime.now().strftime(TIME_FORMAT_STR)
        +   file_prefix = f"{host_name}_{timestamp}"
        +
        +   # Construct the trace file.
        +   prof.export_chrome_trace(f"{file_prefix}.json.gz")
        +
        +   # Construct the memory timeline file.
        +   prof.export_memory_timeline(f"{file_prefix}.html", device="cuda:0")
        +
        +def run_resnet50(num_iters=5, device="cuda:0"):
        +   model = models.resnet50().to(device=device)
        +   inputs = torch.randn(1, 3, 224, 224, device=device)
        +   labels = torch.rand_like(model(inputs))
        +   optimizer = torch.optim.SGD(model.parameters(), lr=1e-2, momentum=0.9)
        +   loss_fn = torch.nn.CrossEntropyLoss()
        +
        +   with torch.profiler.profile(
        +       activities=[
        +           torch.profiler.ProfilerActivity.CPU,
        +           torch.profiler.ProfilerActivity.CUDA,
        +       ],
        +       schedule=torch.profiler.schedule(wait=0, warmup=0, active=6, repeat=1),
        +       record_shapes=True,
        +       profile_memory=True,
        +       with_stack=True,
        +       on_trace_ready=trace_handler,
        +   ) as prof:
        +       for _ in range(num_iters):
        +           prof.step()
        +           with record_function("## forward ##"):
        +               pred = model(inputs)
        +
        +           with record_function("## backward ##"):
        +               loss_fn(pred, labels).backward()
        +
        +           with record_function("## optimizer ##"):
        +               optimizer.step()
        +               optimizer.zero_grad(set_to_none=True)
        +
        +if __name__ == "__main__":
        +    # Warm up
        +    run_resnet50()
        +    # Run the resnet50 model
        +    run_resnet50()
        +
        + +
diff --git a/blog/understanding-gpu-memory-2/index.html b/blog/understanding-gpu-memory-2/index.html new file mode 100644 index 000000000000..7b303408c05b --- /dev/null +++ b/blog/understanding-gpu-memory-2/index.html @@ -0,0 +1,985 @@

Understanding GPU Memory 2: Finding and Removing Reference Cycles | PyTorch

by Aaron Shi, Zachary DeVito

        +

        This is part 2 of the Understanding GPU Memory blog series. Our first post Understanding GPU Memory 1: Visualizing All Allocations over Time shows how to use the memory snapshot tool. In this part, we will use the Memory Snapshot to visualize a GPU memory leak caused by reference cycles, and then locate and remove them in our code using the Reference Cycle Detector.

        + +

        Sometimes when we were using the Memory Snapshot, we saw plots of GPU memory that looked similar to this.

        + +

        GPU memory

        + +

        In this snapshot, each peak shows GPU tensors building up over time and then several tensors getting released at once. In addition, a CUDA OOM happens on the right side causing all the tensors to be released. Seeing the tensors accumulate like this is a clear indication of a problem, but it doesn’t immediately suggest why.

        + +

        Tensors in Reference Cycles

        + +

During early debugging, we dug in further and found that this pattern happens a lot when your Python code has objects with reference cycles. Python will clean up non-cyclic objects immediately using reference counting. However, objects in reference cycles are only cleaned up later by the cycle collector. If these cycles refer to a GPU tensor, the GPU tensor will stay alive until that cycle collector runs and removes the reference cycle. Let’s take a look at a simplified example.

        + +

        Simple reference cycle

        + +

        Code Snippet behind the snapshot (full code in Appendix A):

        + +
            def leak(tensor_size, num_iter=100000, device="cuda:0"):
        +      class Node:
        +        def __init__(self, T):
        +          self.tensor = T
        +          self.link = None
        +
        +      for _ in range(num_iter):
        +        A = torch.zeros(tensor_size, device=device)
        +        B = torch.zeros(tensor_size, device=device)
        +        a, b = Node(A), Node(B)
        +
        +        # A reference cycle will force refcounts to be non-zero.
        +        a.link, b.link = b, a
        +        # Python will eventually garbage collect a & b, but will
        +        # OOM on the GPU before that happens (since python
        +        # runtime doesn't know about CUDA memory usage).
        +
        + +

In this code example, the tensors A and B are created, where A has a link to B and vice versa. This forces a non-zero reference count when A and B go out of scope. When we run this for 100,000 iterations, we expect the automatic garbage collection to free the reference cycles as they go out of scope. However, this will actually result in a CUDA OOM.

        + +

        Why doesn’t automatic garbage collection work?

        + +

Automatic garbage collection works well when there is plenty of extra memory, as is common on CPUs, because it amortizes the expensive collection work by using generational garbage collection. But to amortize the collection work, it defers some memory cleanup, making the maximum memory usage higher, which is less suited to memory constrained environments. The Python runtime also has no insight into CUDA memory usage, so it cannot be triggered on high memory pressure either. It’s even more challenging because GPU training is almost always memory constrained, since we will often raise the batch size to use any additional free memory.

        + +

CPython’s garbage collector frees unreachable objects held in reference cycles via a mark-and-sweep pass. The garbage collection runs automatically when the number of objects exceeds certain thresholds. There are 3 generations of thresholds to help amortize the expensive cost of running garbage collection on every object, and the later generations run less frequently. This explains why automatic collections only clear several tensors on each peak; however, there are still tensors that leak, resulting in the CUDA OOM. Those tensors were held by reference cycles in later generations.
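As a concrete illustration of these generations, the standard gc module exposes the thresholds and per-generation object counts directly. The following minimal sketch uses only the Python standard library (the printed values are typical defaults, not guarantees):

import gc

# The three numbers are the collection thresholds for generations 0, 1, and 2.
# Generation 0 is collected most often, generation 2 least often.
print(gc.get_threshold())   # typically (700, 10, 10)

# Current number of tracked allocations per generation since the last collection.
print(gc.get_count())

# Force a full collection of all generations; returns the number of
# unreachable objects found (i.e. objects held only by reference cycles).
unreachable = gc.collect(generation=2)
print(f"Collected {unreachable} unreachable objects")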

        + +

        Explicitly calling gc.collect()

        + +

One way to fix this is by explicitly calling the garbage collector frequently. Here we can see that the GPU memory for tensors that are out of scope gets cleaned up when we explicitly call the garbage collector every 100 iterations. This also caps the peak GPU memory held by leaking tensors.
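The pattern is simply a periodic gc.collect() inside the training loop. A minimal sketch is shown below, where the interval of 100 matches the plot above and train_one_iteration is a hypothetical stand-in for your own forward/backward/optimizer step:

import gc

def train_one_iteration():
    # Stand-in for your own forward/backward/optimizer step.
    pass

num_iters = 1000
GC_COLLECT_INTERVAL = 100

for i in range(num_iters):
    train_one_iteration()
    if i % GC_COLLECT_INTERVAL == 0:
        gc.collect()  # reclaim tensors that are only kept alive by reference cycles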

        + +

        memory leak

        + +

Although this works and fixes the CUDA OOM issue, calling gc.collect() too frequently can cause other issues, including QPS regressions. Therefore we cannot simply increase the frequency of garbage collection on every training job. It’s best to avoid creating reference cycles in the first place. More on this in the Reference Cycle Detector section.

        + +

        Sneaky Memory Leak in Callback

        + +

        Real examples are more complicated, so let’s look at a more realistic example that has a similar behavior. In this snapshot, we can observe the same behavior of tensors being accumulated and freed during automatic garbage collection, until we hit a CUDA OOM.

        + +

        memory leak

        + +

        Code Snippet behind this snapshot (full code sample in Appendix A):

        + +
            class AwaitableTensor:
        +      def __init__(self, tensor_size):
        +        self._tensor_size = tensor_size
        +        self._tensor = None
        +
        +      def wait(self):
        +        self._tensor = torch.zeros(self._tensor_size, device="cuda:0")
        +        return self._tensor
        +
        +    class AwaitableTensorWithViewCallback:
        +      def __init__(self, tensor_awaitable, view_dim):
        +        self._tensor_awaitable = tensor_awaitable
        +        self._view_dim = view_dim
        +        # Add a view filter callback to the tensor.
        +        self._callback = lambda ret: ret.view(-1, self._view_dim)
        +
        +      def wait(self):
        +        return self._callback(self._tensor_awaitable.wait())
        +
        +    async def awaitable_leak(
        +      tensor_size=2**27, num_iter=100000,
        +    ):
        +      for _ in range(num_iter):
        +        A = AwaitableTensor(tensor_size)
+        AwaitableTensorWithViewCallback(A, 4).wait()
        +
        + +

In this code, we define two classes. The class AwaitableTensor will create a tensor when waited upon. Another class, AwaitableTensorWithViewCallback, will apply a view filter on the AwaitableTensor via a callback lambda.

        + +

        When running awaitable_leak, which creates tensor A (512 MB) and applies a view filter for 100,000 iterations, we expect that A should be reclaimed each time it goes out of scope because the reference count should reach 0. However, this will actually OOM!

        + +

        While we know there is a reference cycle here, it isn’t clear from the code where the cycle is created. To help with these situations, we have created a tool to locate and report these cycles.

        + +

        Reference Cycle Detector

        + +

        Introducing the Reference Cycle Detector, which helps us find reference cycles keeping GPU tensors alive. The API is fairly simple:

        + +
• During model initialization:
  • Import: from torch.utils.viz._cycles import warn_tensor_cycles
  • Start: warn_tensor_cycles()
        + +

The Reference Cycle Detector will issue warnings every time the cycle collector runs and finds a CUDA tensor that gets freed. The warning provides an object graph showing how the reference cycle refers to the GPU tensor.
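Putting the two steps above together, a minimal sketch of enabling the detector looks like the following; the tiny model and loop are placeholders, and a CUDA device is assumed to be available:

import torch
from torch.utils.viz._cycles import warn_tensor_cycles

# Turn the detector on once, during model initialization.
warn_tensor_cycles()

# ... then build the model and train as usual; any cycle that keeps a
# CUDA tensor alive will be reported when the cycle collector frees it.
model = torch.nn.Linear(8, 8, device="cuda")
for _ in range(10):
    out = model(torch.randn(4, 8, device="cuda"))
    out.sum().backward()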

        + +

        object graph

        + +

For instance, in this object graph we can easily observe that there is a circular dependency on the outer circle of the graph, and the GPU tensor kept alive by the cycle is highlighted in red.

        + +

        Most cycles are pretty easy to fix once they are discovered. For instance here we can remove the reference to self created by self._view_dim in the callback.

        + +

        code snippet

        + +

        We’ve spent some time fixing cycles in existing models using these tools. For example in TorchRec, we’ve found and removed a reference cycle in PR#1226.

        + +

        code snippet

        + +

        Once we’ve removed the reference cycles, the code will no longer issue a CUDA OOM nor show any memory leaks in their snapshots.

        + +

        What are the other benefits of using the Reference Cycle Detector?

        + +

        Removing these cycles will also directly lower the maximum GPU memory usage as well as make it less likely for memory to fragment because the allocator returns to the same state after each iteration.

        + +

        Where can I find these tools?

        + +

We hope that the Reference Cycle Detector will greatly improve your ability to find and remove memory leaks caused by reference cycles. The Reference Cycle Detector is available in the v2.1 release of PyTorch as an experimental feature. More information about the Reference Cycle Detector can be found in the PyTorch Memory docs here.

        + +

        Feedback

        + +

        We look forward to hearing from you about any enhancements, bugs or memory stories that our tools helped to solve! As always, please feel free to open new issues on PyTorch’s Github page.

        + +

We are also open to contributions from the OSS community; feel free to tag Aaron Shi and Zachary DeVito in any Github PRs for reviews.

        + +

        Acknowledgements

        + +

We really appreciate the content reviewers, Mark Saroufim, Gregory Chanan, and Adnan Aziz, for reviewing this post and improving its readability.

        + +

        Appendix

        + +

        Appendix A - Code Sample

        + +

        This code snippet was used to generate the plots and examples shown. Here are the arguments to reproduce the sections:

        + +
• Introduction: python sample.py
• Explicitly calling gc.collect(): python sample.py --gc_collect_interval=100
• Sneaky Memory Leak in Callback: python sample.py --workload=awaitable
• Ref Cycle Detector: python sample.py --workload=awaitable --warn_tensor_cycles
        + +

        sample.py:

        + +
        # (c) Meta Platforms, Inc. and affiliates. 
        +import argparse
        +import asyncio
        +import gc
        +import logging
        +import socket
        +from datetime import datetime, timedelta
        +
        +import torch
        +
        +logging.basicConfig(
        +   format="%(levelname)s:%(asctime)s %(message)s",
        +   level=logging.INFO,
        +   datefmt="%Y-%m-%d %H:%M:%S",
        +)
        +logger: logging.Logger = logging.getLogger(__name__)
        +logger.setLevel(level=logging.INFO)
        +
        +TIME_FORMAT_STR: str = "%b_%d_%H_%M_%S"
        +
        +# Keep a max of 100,000 alloc/free events in the recorded history
        +# leading up to the snapshot.
        +MAX_NUM_OF_MEM_EVENTS_PER_SNAPSHOT: int = 100000
        +
        +def start_record_memory_history() -> None:
        +   if not torch.cuda.is_available():
        +       logger.info("CUDA unavailable. Not recording memory history")
        +       return
        +
        +   logger.info("Starting snapshot record_memory_history")
        +   torch.cuda.memory._record_memory_history(
        +       max_entries=MAX_NUM_OF_MEM_EVENTS_PER_SNAPSHOT
        +   )
        +
        +def stop_record_memory_history() -> None:
        +   if not torch.cuda.is_available():
        +       logger.info("CUDA unavailable. Not recording memory history")
        +       return
        +
        +   logger.info("Stopping snapshot record_memory_history")
        +   torch.cuda.memory._record_memory_history(enabled=None)
        +
        +def export_memory_snapshot() -> None:
        +   if not torch.cuda.is_available():
        +       logger.info("CUDA unavailable. Not exporting memory snapshot")
        +       return
        +
        +   # Prefix for file names.
        +   host_name = socket.gethostname()
        +   timestamp = datetime.now().strftime(TIME_FORMAT_STR)
        +   file_prefix = f"{host_name}_{timestamp}"
        +
        +   try:
        +       logger.info(f"Saving snapshot to local file: {file_prefix}.pickle")
        +       torch.cuda.memory._dump_snapshot(f"{file_prefix}.pickle")
        +   except Exception as e:
        +       logger.error(f"Failed to capture memory snapshot {e}")
        +       return
        +
        +# This function will leak tensors due to the reference cycles.
        +def simple_leak(tensor_size, gc_interval=None, num_iter=30000, device="cuda:0"):
        +    class Node:
        +        def __init__(self, T):
        +            self.tensor = T
        +            self.link = None
        +
        +    for i in range(num_iter):
        +        A = torch.zeros(tensor_size, device=device)
        +        B = torch.zeros(tensor_size, device=device)
        +        a, b = Node(A), Node(B)
        +        # A reference cycle will force refcounts to be non-zero, when
        +        # a and b go out of scope.
        +        a.link, b.link = b, a
        +        # Python will eventually gc a and b, but may OOM on the CUDA
        +        # device before that happens (since python runtime doesn't
        +        # know about CUDA memory usage).
        +
        +        # Since implicit gc is not called frequently enough due to
        +        # generational gc, adding an explicit gc is necessary as Python
        +        # runtime does not know about CUDA memory pressure.
        +        # https://en.wikipedia.org/wiki/Tracing_garbage_collection#Generational_GC_(ephemeral_GC)
        +        if gc_interval and i % int(gc_interval) == 0:
        +            gc.collect()
        +
        +async def awaitable_leak(
        +    tensor_size, gc_interval=None, num_iter=100000, device="cuda:0"
        +):
        +    class AwaitableTensor:
        +        def __init__(self, tensor_size, device) -> None:
        +            self._tensor_size = tensor_size
        +            self._device = device
        +            self._tensor = None
        +
        +        def wait(self) -> torch.Tensor:
        +            self._tensor = torch.zeros(self._tensor_size, device=self._device)
        +            return self._tensor
        +
        +    class AwaitableTensorWithViewCallBack:
        +        def __init__(
        +            self,
        +            tensor_awaitable: AwaitableTensor,
        +            view_dim: int,
        +        ) -> None:
        +            self._tensor_awaitable = tensor_awaitable
        +            self._view_dim = view_dim
        +            # Add a view filter callback to the tensor.
        +            self._callback = lambda ret: ret.view(-1, self._view_dim)
        +
        +        def wait(self) -> torch.Tensor:
        +            return self._callback(self._tensor_awaitable.wait())
        +
        +    for i in range(num_iter):
        +        # Create an awaitable tensor
        +        a_tensor = AwaitableTensor(tensor_size, device)
        +
        +        # Apply a view filter callback on the awaitable tensor.
        +        AwaitableTensorWithViewCallBack(a_tensor, 4).wait()
        +
        +        # a_tensor will go out of scope.
        +
        +        if gc_interval and i % int(gc_interval) == 0:
        +            gc.collect()
        +
        +if __name__ == "__main__":
        +    parser = argparse.ArgumentParser(description="A memory_leak binary instance")
        +    parser.add_argument(
        +        "--gc_collect_interval",
        +        default=None,
        +        help="Explicitly call GC every given interval. Default is off.",
        +    )
        +    parser.add_argument(
        +        "--workload",
        +        default="simple",
        +        help="Toggle which memory leak workload to run. Options are simple, awaitable.",
        +    )
        +    parser.add_argument(
        +        "--warn_tensor_cycles",
        +        action="store_true",
        +        default=False,
        +        help="Toggle whether to enable reference cycle detector.",
        +    )
        +    args = parser.parse_args()
        +
        +    if args.warn_tensor_cycles:
        +        from tempfile import NamedTemporaryFile
        +
        +        from torch.utils.viz._cycles import observe_tensor_cycles
        +
        +        logger.info("Enabling warning for Python reference cycles for CUDA Tensors.")
        +
        +        def write_and_log(html):
        +            with NamedTemporaryFile("w", suffix=".html", delete=False) as f:
        +                f.write(html)
        +                logger.warning(
        +                    "Reference cycle includes a CUDA Tensor see visualization of cycle %s",
        +                    f.name,
        +                )
        +
        +        observe_tensor_cycles(write_and_log)
        +    else:
        +        # Start recording memory snapshot history
        +        start_record_memory_history()
        +
        +    # Run the workload with a larger tensor size.
        +    # For smaller sizes, we will not CUDA OOM as gc will kick in often enough
        +    # to reclaim reference cycles before an OOM occurs.
        +    size = 2**26  # 256 MB
        +    try:
        +        if args.workload == "awaitable":
        +            size *= 2
        +            logger.info(f"Running tensor_size: {size*4/1024/1024} MB")
        +            asyncio.run(
        +                awaitable_leak(tensor_size=size, gc_interval=args.gc_collect_interval)
        +            )
        +        elif args.workload == "simple":
        +            logger.info(f"Running tensor_size: {size*4/1024/1024} MB")
        +            simple_leak(tensor_size=size, gc_interval=args.gc_collect_interval)
        +        else:
        +            raise Exception("Unknown workload.")
        +    except Exception:
        +        logger.exception(f"Failed to allocate {size*4/1024/1024} MB")
        +
        +    # Create the memory snapshot file
        +    export_memory_snapshot()
        +
        +    # Stop recording memory snapshot history
        +    stop_record_memory_history()
        +
        + +
diff --git a/blog/understanding-lazytensor-system-performance-with-pytorch-xla-on-cloud-tpu/index.html b/blog/understanding-lazytensor-system-performance-with-pytorch-xla-on-cloud-tpu/index.html new file mode 100644 index 000000000000..5fcb321944ba --- /dev/null +++ b/blog/understanding-lazytensor-system-performance-with-pytorch-xla-on-cloud-tpu/index.html @@ -0,0 +1,823 @@

Understanding LazyTensor System Performance with PyTorch/XLA on Cloud TPU | PyTorch

by Vaibhav Singh

        +

        Introduction

        + +

Ease of use, expressivity, and debuggability are among the core principles of PyTorch. One of the key drivers for the ease of use is that PyTorch execution is by default “eager”, i.e. op by op execution preserves the imperative nature of the program. However, eager execution does not offer compiler-based optimizations, for example, the optimizations that become possible when the computation can be expressed as a graph.

        + +

        LazyTensor [1], first introduced with PyTorch/XLA, helps combine these seemingly disparate approaches. While PyTorch eager execution is widely used, intuitive, and well understood, lazy execution is not as prevalent yet.

        + +

        In this post we will explore some of the basic concepts of the LazyTensor System with the goal of applying these concepts to understand and debug performance of LazyTensor based implementations in PyTorch. Although we will use PyTorch/XLA on Cloud TPU as the vehicle for exploring these concepts, we hope that these ideas will be useful to understand other system(s) built on LazyTensors.

        + +

        LazyTensor

        + +

        Any operation performed on a PyTorch tensor is by default dispatched as a kernel or a composition of kernels to the underlying hardware. These kernels are executed asynchronously on the underlying hardware. The program execution is not blocked until the value of a tensor is fetched. This approach scales extremely well with massively parallel programmed hardware such as GPUs.

        + +

        The starting point of a LazyTensor system is a custom tensor type. In PyTorch/XLA, this type is called XLA tensor. In contrast to PyTorch’s native tensor type, operations performed on XLA tensors are recorded into an IR graph. Let’s examine an example that sums the product of two tensors:

        + +
        import torch
        +import torch_xla
        +import torch_xla.core.xla_model as xm
        +
        +dev = xm.xla_device()
        +
        +x1 = torch.rand((3, 3)).to(dev)
        +x2 = torch.rand((3, 8)).to(dev)
        +
        +y1 = torch.einsum('bs,st->bt', x1, x2)
        +print(torch_xla._XLAC._get_xla_tensors_text([y1]))
        +
        + +

        You can execute this colab notebook to examine the resulting graph for y1. Notice that no computation has been performed yet.

        + +
        y1 = y1 + x2
        +print(torch_xla._XLAC._get_xla_tensors_text([y1]))
        +
        + +

Operation recording will continue until PyTorch/XLA encounters a barrier. This barrier can either be a mark_step() API call or any other event which forces the execution of the graph recorded so far.

        + +
        xm.mark_step()
        +print(torch_xla._XLAC._get_xla_tensors_text([y1]))
        +
        + +

Once mark_step() is called, the graph is compiled and then executed on the TPU, i.e. the tensors have been materialized. Therefore, the graph text is now reduced to a single line for the y1 tensor, which holds the result of the computation.

        + +

        Compile Once, Execute Often

        + +

XLA compilation passes offer optimizations (e.g. op-fusion, which reduces HBM pressure by using scratch-pad memory for multiple ops, ref) and leverage lower-level XLA infrastructure to optimally use the underlying hardware. However, there is one caveat: compilation passes are expensive, i.e. they can add to the training step time. Therefore, this approach scales well if and only if we can compile once and execute often (the compilation cache helps, such that the same graph is not compiled more than once).

        + +

        In the following example, we create a small computation graph and time the execution:

        + +
x3 = torch.rand((8, 8)).to(dev)  # second operand, introduced here so the einsum below is well defined
+y1 = torch.rand((3, 8)).to(dev)
+def dummy_step(y, x):
+  y = torch.einsum('bs,st->bt', y, x)
+  xm.mark_step()
+  return y
+
+ +
%timeit dummy_step(y1, x3)
        +
        + +
        The slowest run took 29.74 times longer than the fastest. This could mean that an intermediate result is being cached.
        +10000000 loops, best of 5: 34.2 ns per loop
        +
        + +

Notice that the slowest run is significantly longer than the fastest. This is because of the graph compilation overhead, which is incurred only once for a given graph shape, input shape, and output shape. Subsequent steps are faster because no graph compilation is necessary.

        + +

        This also implies that we expect to see performance cliffs when the “compile once and execute often” assumption breaks. Understanding when this assumption breaks is the key to understanding and optimizing the performance of a LazyTensor system. Let’s examine what triggers the compilation.

        + +

        Graph Compilation and Execution and LazyTensor Barrier

        + +

We saw that the computation graph is compiled and executed when a LazyTensor barrier is encountered. There are three scenarios in which the LazyTensor barrier is automatically or manually introduced. The first is an explicit call to the mark_step() API, as shown in the preceding example. mark_step() is also called implicitly at every step when you wrap your dataloader with MpDeviceLoader (highly recommended to overlap compute and data upload to the TPU device). The optimizer_step method of xla_model can also implicitly call mark_step (when you set barrier=True).
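To make the latter two cases concrete, here is a minimal sketch of a training loop that relies on MpDeviceLoader and xm.optimizer_step for its barriers; the model, optimizer, and in-memory dataset are placeholders, and torch_xla with an attached TPU device is assumed:

import torch
import torch_xla.core.xla_model as xm
import torch_xla.distributed.parallel_loader as pl

dev = xm.xla_device()
model = torch.nn.Linear(8, 8).to(dev)                  # placeholder model
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)

dataset = [(torch.randn(4, 8), torch.randn(4, 8)) for _ in range(16)]
loader = torch.utils.data.DataLoader(dataset, batch_size=None)

# MpDeviceLoader uploads each batch to the device and inserts mark_step() for you.
device_loader = pl.MpDeviceLoader(loader, dev)

for data, target in device_loader:
    optimizer.zero_grad()
    loss = ((model(data) - target) ** 2).mean()
    loss.backward()
    # barrier=True makes the optimizer step act as the LazyTensor barrier.
    xm.optimizer_step(optimizer, barrier=True)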

        + +

        The second scenario where a barrier is introduced is when PyTorch/XLA finds an op with no mapping (lowering) to equivalent XLA HLO ops. PyTorch has 2000+ operations. Although most of these operations are composite (i.e. can be expressed in terms of other fundamental operations), some of these operations do not have corresponding lowering in XLA.

        + +

        + +

        + +

What happens when an op with no XLA lowering is used? PyTorch/XLA stops the operation recording and cuts the graph(s) leading to the input(s) of the unlowered op. This cut graph is then compiled and dispatched for execution. The results (materialized tensors) of the execution are sent back from device to host, the unlowered op is then executed on the host (CPU), and downstream LazyTensor operations are recorded into new graph(s) until a barrier is encountered again.

        + +

The third and final scenario which results in a LazyTensor barrier is when there is a control structure/statement or another method which requires the value of a tensor. At a minimum, this causes the execution of the computation graph leading to that tensor (if the graph has already been seen), or triggers both compilation and execution.

        + +

Other examples of such methods include .item() and isEqual(). In general, any operation that maps Tensor -> Scalar will cause this behavior.
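For example, the following sketch (reusing dev and the imports from the earlier snippets) shows a control-flow statement whose .item() call forces the value of a tensor and therefore acts as a barrier:

z = torch.rand((3, 3)).to(dev)
loss = (z * z).sum()

# .item() maps Tensor -> Scalar, so the recorded graph leading to `loss`
# must be compiled (if unseen) and executed before the comparison can run.
if loss.item() > 1.0:
    z = z * 0.5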

        + +

        Dynamic Graph

        + +

As illustrated in the preceding section, the graph compilation cost is amortized if the same shape of graph is executed many times. This is because the compiled graph is cached with a hash derived from the graph shape, input shape, and output shape. If these shapes change, compilation is triggered again, and too-frequent compilation will result in training time degradation.

        + +

        Let’s consider the following example:

        + +
        def dummy_step(x, y, loss, acc=False):
        +  z = torch.einsum('bs,st->bt', y, x)
        +  step_loss = z.sum().view(1,)
        +  if acc:
        +    loss = torch.cat((loss, step_loss))
        +  else:
        +    loss = step_loss
        +  xm.mark_step()
        +  return loss
        +
        +
        +import time
        +def measure_time(acc=False):
        +  exec_times = []
        +  iter_count = 100
        +  x = torch.rand((512, 8)).to(dev)
        +  y = torch.rand((512, 512)).to(dev)
        +  loss = torch.zeros(1).to(dev)
        +  for i in range(iter_count):
        +    tic = time.time()
        +    loss = dummy_step(x, y, loss, acc=acc)
        +    toc = time.time()
        +    exec_times.append(toc - tic)
        +  return exec_times
        +
        +dyn = measure_time(acc=True) # acc= True Results in dynamic graph
        +st = measure_time(acc=False) # Static graph, computation shape, inputs and output shapes don't change
        +
        +import matplotlib.pyplot as plt
        +plt.plot(st, label = 'static graph')
        +plt.plot(dyn, label = 'dynamic graph')
        +plt.legend()
        +plt.title('Execution time in seconds')
        +
        + +

        + +

        + +

Note that the static and dynamic cases have the same computation, but the dynamic graph compiles every time, leading to higher overall run time. In practice, a training step with recompilation can sometimes be an order of magnitude slower or more. In the next section we discuss some of the PyTorch/XLA tools to debug training degradation.

        + +

        Profiling Training Performance with PyTorch/XLA

        + +

PyTorch/XLA profiling consists of two major components. The first is client-side profiling. This feature is turned on by simply setting the environment variable PT_XLA_DEBUG to 1. Client-side profiling points to unlowered ops or device-to-host transfers in your source code. Client-side profiling also reports if compilations are happening too frequently during training. You can explore some metrics and counters provided by PyTorch/XLA in conjunction with the profiler in this notebook.
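A minimal sketch of turning the flag on from Python is shown below (the script layout is hypothetical; setting PT_XLA_DEBUG=1 in the shell before launching the training script works just as well):

import os

# Set before importing torch_xla so the debug facilities see the flag.
os.environ["PT_XLA_DEBUG"] = "1"

import torch_xla.core.xla_model as xm  # noqa: E402  (import after setting the env var)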

        + +

        The second component offered by PyTorch/XLA profiler is the inline trace annotation. For example:

        + +
        import torch_xla.debug.profiler as xp
        +
        +def train_imagenet():
        +  print('==> Preparing data..')
        +  img_dim = get_model_property('img_dim')
        +  ....
        +  server = xp.start_server(3294)
        +  def train_loop_fn(loader, epoch):
        +    ....
        +    model.train()
        +    for step, (data, target) in enumerate(loader):
        +      with xp.StepTrace('Train_Step', step_num=step):
        +        ....
        +        if FLAGS.amp:
        +        ....
        +        else:
        +          with xp.Trace('build_graph'):
        +            output = model(data)
        +            loss = loss_fn(output, target)
        +            loss.backward()
        +          xm.optimizer_step(optimizer)
        +
        + +

Notice the start_server API call. The port number that you have used here is the same port number you will use with the TensorBoard profiler in order to view the op trace, similar to:

        + +

        + +

        + +

The op trace, along with the client-side debugging function, is a powerful set of tools to debug and optimize your training performance with PyTorch/XLA. For more detailed instructions on profiler usage, the reader is encouraged to explore part-1, part-2, and part-3 of the blog series on PyTorch/XLA performance debugging.

        + +

        Summary

        + +

        In this article we have reviewed the fundamentals of the LazyTensor system. We built on those fundamentals with PyTorch/XLA to understand the potential causes of training performance degradation. We discussed why “compile once and execute often” helps to get the best performance on LazyTensor systems, and why training slows down when this assumption breaks.

        + +

We hope that PyTorch users will find these insights helpful for their own work with LazyTensor systems.

        + +

        Acknowledgements

        + +

A big thank you to my outstanding colleagues Jack Cao, Milad Mohammedi, Karl Weinmeister, Rajesh Thallam, Jordan Tottan (Google) and Geeta Chauhan (Meta) for their meticulous reviews and feedback. Thanks also to the extended PyTorch/XLA development team from Google, Meta, and the open source community for making PyTorch possible on TPUs. And finally, thanks to the authors of the LazyTensor paper not only for developing LazyTensor but also for writing such an accessible paper.

        + +

References

        + +

        [1] LazyTensor: combining eager execution with domain-specific compilers

diff --git a/blog/unleashing-ai-mobile/index.html b/blog/unleashing-ai-mobile/index.html new file mode 100644 index 000000000000..f868f4a19cbd --- /dev/null +++ b/blog/unleashing-ai-mobile/index.html @@ -0,0 +1,754 @@

Unleashing the Power of AI on Mobile: LLM Inference for Llama 3.2 Quantized Models with ExecuTorch and KleidiAI | PyTorch

by Gian Marco Iodice, Arm and Digant Desai, Meta

        +

        Introduction

        + +

        At the recent PyTorch Conference, Arm highlighted the widespread impact of its technology, spanning from cloud to edge, emphasizing its commitment to delivering its advanced AI computing capabilities seamlessly to millions of developers worldwide.

        + +

        key stats

        + +

        During the presentation, it was emphasized that Arm bears the immense responsibility of equipping 20+ million developers and billions of users with advanced AI computing features without friction. Achieving this requires crucial software collaborations across a vast ecosystem of software and hardware partners.

        + +

        Just a few months ago, Arm launched Arm Kleidi, developer enablement technologies and resources to drive technical collaboration and innovation across the ML stack. This includes the KleidiAI software library providing optimized software routines, which when integrated into key frameworks such as XNNPACK enable automatic AI acceleration for developers on Arm Cortex-A CPUs.

        + +

        Today, we’re excited to announce a new milestone for the AI open-source community that brings Arm even closer to realizing this vision: the integration of KleidiAI into ExecuTorch via XNNPACK, boosting AI workload performance on Arm mobile CPUs!

        + +

        Thanks to the collaborative efforts of the engineering teams at Arm and Meta, AI developers can now deploy quantized Llama models which run up to 20% faster on Arm Cortex-A v9 CPUs with the i8mm ISA extension.

        + +

        And there’s more exciting news - the ExecuTorch team has officially launched the Beta release!

        + +

This marks an important milestone in our partnership. In this blog, we are eager to share more details about ExecuTorch capabilities, the new Meta Llama 3.2 models, the integer 4-bit with per-block quantization, and the impressive performance recorded on certain Arm CPUs. Notably, we have achieved speeds of over 350 tokens per second on the prefill stage with the quantized Llama 3.2 1B model on a Samsung S24+ device, as shown in the following screenshots.

        + +

        mobile app screenshots

        + +

        Now, let’s dive into the key components that enabled the demo creation presented in the preceding images. First up: new Llama 3.2 models!

        + +

        Meta Llama 3.2

        + +

        Meta recently announced the first lightweight quantized Llama models, which are designed to run on popular mobile devices. Meta used two techniques for quantizing Llama 3.2 1B and 3B models: Quantization-Aware Training (QAT) with LoRA adaptors (QLoRA), and SpinQuant, a state-of-the-art post-training quantization method. The quantized models were evaluated using PyTorch’s ExecuTorch framework as the inference engine, with the Arm CPU as a backend.

        + +

        These instruction-tuned models retain the quality and safety of the original 1B and 3B models while achieving a 2-4x speedup and reducing model size by 56% on average and memory footprint by 41% on average compared to the original BF16 format.

        + +

        In this blog post, we will demonstrate the performance improvements we observed in our experiments.

        + +

        ExecuTorch

        + +

        ExecuTorch is a PyTorch-native framework specifically designed for deploying AI models on-device, enhancing privacy and reducing latency. It supports the deployment of cutting-edge open-source AI models, including the Llama family of models and vision and speech models like Segment Anything and Seamless.

        + +

        This unlocks new possibilities for edge devices such as mobile phones, smart glasses, VR headsets, and smart home cameras. Traditionally, deploying PyTorch-trained AI models to resource-limited edge devices has been challenging and time-consuming, often requiring conversion to other formats which could lead to errors and suboptimal performance. The varied toolchains across the hardware and edge ecosystem have also degraded the developer experience, making a universal solution impractical.

        + +

ExecuTorch addresses these issues by providing composable components that include a core runtime, an operator library, and a delegation interface that allows for portability as well as extensibility. Models can be exported using torch.export(), producing a graph that is natively compatible with the ExecuTorch runtime, capable of running on most edge devices with CPUs, and extendable to specialized hardware like GPUs and NPUs for enhanced performance.
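For illustration, a minimal sketch of the export step is shown below; the toy module is hypothetical, and lowering the resulting ExportedProgram to an ExecuTorch program file is a separate step performed with the ExecuTorch tooling:

import torch
from torch.export import export

class TinyModel(torch.nn.Module):
    def forward(self, x):
        return torch.nn.functional.relu(x + 1)

# export() captures a whole-graph representation of the model that
# downstream runtimes such as ExecuTorch can consume.
exported_program = export(TinyModel(), (torch.randn(2, 3),))
print(exported_program.graph)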

        + +

        Working with Arm, ExecuTorch now leverages the optimized low-bit matrix multiplication kernels from the Arm KleidiAI library to improve on-device Large Language Model (LLM) inference performance via XNNPACK. We also thank the XNNPACK team at Google for supporting this effort.

        + +

In this post, we will focus on this integration, which is available in ExecuTorch.

        + +

        Evolving the architecture for AI workloads

        + +

        At Arm, we have been deeply committed to investing in open-source projects and advancing new technologies in our processors since the early days of the deep learning wave, focusing on making AI workloads high-performing and more power-efficient.

        + +

        For instance, Arm introduced the SDOT instruction, starting with the Armv8.2-A architecture, to accelerate dot product arithmetic between 8-bit integer vectors. This feature, now widely available in mobile devices, significantly speeds up the computation of quantized 8-bit models. After the SDOT instruction, Arm introduced the BF16 data type and the MMLA instruction to further enhance the floating-point and integer matrix multiplication performance on CPUs and, most recently, announced the Scalable Matrix Extension (SME), marking a significant leap forward in machine learning capabilities.

        + +

        The following image shows a few examples of Arm CPU’s continuous innovations in the AI space over the last decade:

        + +

        line chart

        + +

        Given the widespread use of Arm CPUs, AI frameworks need to take full advantage of these technologies in key operators to maximize performance. Recognizing this, we saw the need for an open-source library to share these optimized software routines. However, we were mindful of the challenges in integrating a new library into AI frameworks, such as concerns about library size, dependencies, and documentation and the need to avoid adding extra burdens for developers. So, we took extra steps to gather feedback from our partners and ensure a smooth integration process that does not require additional dependencies for AI developers. This effort led to KleidiAI, an open-source library that provides optimized performance-critical routines for artificial intelligence (AI) workloads tailored for Arm CPUs. You can learn more about KleidiAI here.

        + +

Working with the ExecuTorch team at Meta, Arm provided the software optimizations for their novel 4-bit per-block quantization scheme, which is used to accelerate the matrix multiplication kernel in the Transformer layer’s torch.nn.linear operator for Llama 3.2 quantized models. This flexible 4-bit quantization scheme from ExecuTorch strikes a balance between model accuracy and low-bit matrix multiplication performance targeting on-device LLMs.

        + +

        The integer 4-bit with per-block quantization

        + +

In KleidiAI, we introduced micro-kernels optimized for this new 4-bit integer quantization scheme (matmul_clamp_f32_qai8dxp_qsi4c32p).

        + +

        As shown in the following image, this 4-bit quantization uses a per-block strategy for weight (RHS matrix) quantization and an 8-bit per-row quantization for activations (LHS matrix):

        + +

        arch diagram

        + +

        As you can see in the preceding image, each output feature map (OFM) in the weight matrix is divided into equally sized blocks (group size), with each block having a scale factor stored in BF16 format. BF16 is advantageous because it maintains the dynamic range of 32-bit floating-point (FP32) format with half the bit size, and it’s easy to convert to and from FP32 using a simple shift operation. This makes BF16 ideal for saving model space, preserving accuracy, and ensuring backward compatibility with devices that lack BF16 hardware acceleration. You can learn more about the BF16 format in this Arm Community blog post.
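To make the shift-based conversion concrete, here is a small illustrative snippet (not part of KleidiAI) that reinterprets an FP32 tensor’s bits and truncates them to the BF16 bit pattern; note that PyTorch’s own cast rounds to nearest-even, whereas the shift simply truncates:

import torch

x = torch.tensor([3.14159265], dtype=torch.float32)

# BF16 keeps the top 16 bits of FP32 (sign, exponent, upper mantissa),
# so a right shift of the raw bits is a truncating FP32 -> BF16 conversion.
bits = x.view(torch.int32)
bf16_bits = bits >> 16

# Shifting back left restores an FP32 value whose lower mantissa bits are zero.
restored = (bf16_bits << 16).view(torch.float32)

print(x.item(), restored.item(), x.to(torch.bfloat16).item())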

        + +

For completeness, this 4-bit quantization scheme and our implementation in KleidiAI allow users to configure the group size for the linear weights (RHS), letting them trade off between model size, model accuracy, and model performance if the model is quantized by the user.

        + +

        At this point, we are ready to unveil the incredible performance recorded on Arm CPUs with ExecuTorch when running Llama 3.2 1B and Llama 3.2 3B. Let’s first go over metrics we will use to evaluate the performance of LLM inference.

        + +

        Metrics for LLM Inference

        + +

        Typically, performance metrics used to evaluate LLM performance during inference include:

        + +
• Time To First Token (TTFT): This measures the time it takes to produce the first output token after a prompt is provided by the user. This latency or response time is important for a good user experience, especially on a phone. TTFT is also a function of the length of the prompt or prompt tokens. To make this metric independent of the prompt length, we use Prefill tokens/second as a proxy here. The relationship between these is inverse: lower TTFT corresponds to higher Prefill tokens/second.
• Decode Performance: This is the average number of output tokens generated per second, thus reported in Tokens/Second. It is independent of the total number of tokens generated. For on-device inference, it is important to keep this higher than a user’s average reading speed.
• Peak Runtime Memory: This metric reflects the amount of RAM, typically reported in MegaBytes (MiB), needed to run the model with expected performance measured using the metrics above. Given the limited amount of RAM available on Android and iOS devices, this is one of the key metrics for on-device LLM deployment. It dictates the type of models that can be deployed on a device.
        + +

        Results

        + +

        The quantized Llama 3.2 1B models, both SpinQuant and QLoRA, are designed to run efficiently on a wide range of phones with limited RAM. In this section, we demonstrate that the quantized Llama 3.2 1B models can achieve over 350 tokens per second in the prefill phase and over 40 tokens per second in the decode stage. This level of performance is sufficient to enable on-device text summarization with a reasonable user experience using only Arm CPUs. To put this into perspective, on average, 50 unread messages contain about 600 tokens. With this performance, the response time (the time it takes for the first generated word to appear on the screen) is approximately two seconds.

        + +

We present measurements from a Samsung S24+ running vanilla Android. We used Llama 3.2 1B parameter models for these experiments. Although we only demonstrate using 1B models, similar performance gains can be expected for the 3B parameter models. The experiment setup involves a single warmup run, a sequence length of 128, a prompt length of 64, using 6 of the 8 available CPUs, and measuring results over adb.

        + +

        Using the ExecuTorch main branch from GitHub, we first generated the ExecuTorch PTE binary files for each model using the published checkpoints. Then, using the same repository, we generated the ExecuTorch runtime binary for Armv8. In the rest of the section, we will compare the performance of different quantized 1B models against the BF16 model using the binary built with KleidiAI. We will also compare the performance gains for quantized models between the binary with KleidiAI and the one without KleidiAI to distill the impact from KleidiAI.

        + +

        Quantized Model Performance

        + +

Llama 3.2 quantized models, both SpinQuant and QLoRA, perform significantly better on prompt prefill and text generation (decode) compared to the baseline BF16. We observed a >2x improvement in decode and a >5x improvement in prefill performance.

        + +

        Furthermore, the quantized model size, PTE file size in bytes, is less than half that of the BF16 model, 2.3 GiB vs. 1.1 GiB. Although the size of int4 is a quarter of BF16, some layers in the model are quantized with int8, making the PTE file size ratio larger. We observed runtime peak memory footprint reduction of almost 40% from 3.1 GiB for the BF16 model to 1.9 GiB for the SpinQuant model, measured in Resident Set Size (RSS) for a maximum sequence length of 2048.

        + +

        With all-around improvements, the new quantized Llama 3.2 models are ideal for on-device deployment targeting Arm CPUs. For more information on accuracy, check out the Meta Llama 3.2 blog.

        + +

        bar graph

        + +

        KleidiAI Impact

        + +

        ExecuTorch relies on the Arm KleidiAI library to provide low-bit performant matrix multiplication kernels for the latest Arm CPUs with advanced Armv8/9 ISA features. These kernels are utilized for on-device quantized Llama 3.2 model inference in ExecuTorch. As depicted in the graph below, ExecuTorch achieves an average of >20% better prefill performance on S24+ with KleidiAI compared to non-KleidiAI kernels, while maintaining the same accuracy. This performance advantage is not limited to specific models or devices, and is expected to benefit all ExecuTorch models using low-bit quantized matrix multiplication on Arm CPUs.

        + +

        To assess the impact of Kleidi, we generated two ExecuTorch runtime binaries targeting Arm Cortex-A CPUs and compared their performance.

        + +
1. The first ExecuTorch runtime binary was built with the Arm KleidiAI library through the XNNPACK library.
2. The second binary was built without the Arm KleidiAI repository, using native kernels from the XNNPACK library.
        + +

        bar chart

        + +

        Try it yourself!

        + +

Ready to experience the performance improvements firsthand? Here’s how you can try out ExecuTorch with the optimizations provided by KleidiAI in your projects: follow the learning path from Arm to start developing your own LLM application using ExecuTorch and KleidiAI.

        + +

        We look forward to hearing your feedback!

        + +
diff --git a/blog/unlocking-pt-2-6-intel/index.html b/blog/unlocking-pt-2-6-intel/index.html new file mode 100644 index 000000000000..b727a7fbb6fa --- /dev/null +++ b/blog/unlocking-pt-2-6-intel/index.html @@ -0,0 +1,717 @@

Unlocking the Latest Features in PyTorch 2.6 for Intel Platforms | PyTorch

by the Intel PyTorch Team

        +

        PyTorch* 2.6 has just been released with a set of exciting new features including torch.compile compatibility with Python 3.13, new security and performance enhancements, and a change in the default parameter for torch.load. PyTorch also announced the deprecation of its official Anaconda channel.
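The torch.load change referred to here is the weights_only parameter now defaulting to True. A quick sketch of what that means in practice (the checkpoint path is just an example):

import torch

torch.save({"weights": torch.randn(2, 2)}, "checkpoint.pt")

# In PyTorch 2.6, torch.load defaults to weights_only=True, which restricts
# unpickling to tensors and other allow-listed types for better security.
state = torch.load("checkpoint.pt")

# Loading arbitrary pickled objects now requires opting out explicitly,
# and should only be done for checkpoints from trusted sources.
state_full = torch.load("checkpoint.pt", weights_only=False)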

        + +

        Among the performance features are three that enhance developer productivity on Intel platforms:

        + +
1. Improved Intel GPU availability
2. FlexAttention optimization on x86 CPU for LLM
3. FP16 on x86 CPU support for eager and Inductor modes
        + +

        Improved Intel GPU Availability

        + +

        To provide developers working in artificial intelligence (AI) with better support for Intel GPUs, the PyTorch user experience on these GPUs has been enhanced. This improvement includes simplified installation steps, a Windows* release binary distribution, and expanded coverage of supported GPU models, including the latest Intel® Arc™ B-Series discrete graphics.

        + +

        These new features help promote accelerated machine learning workflows within the PyTorch ecosystem, providing a consistent developer experience and support. Application developers and researchers seeking to fine-tune, perform inference, and develop with PyTorch models on Intel® Core™ Ultra AI PCs  and Intel® Arc™ discrete graphics will now be able to install PyTorch directly with binary releases for Windows, Linux*, and Windows Subsystem for Linux 2.

        + +

        The new features include:

        + +
• Simplified Intel GPU software stack setup to enable one-click installation of the torch-xpu PIP wheels to run deep learning workloads in a ready-to-use fashion, thus eliminating the complexity of installing and activating Intel GPU development software bundles.
• Windows binary releases for torch core, torchvision and torchaudio have been made available for Intel GPUs, expanding from Intel® Core™ Ultra Series 2 with Intel® Arc™ Graphics and Intel® Arc™ A-Series graphics to the latest GPU hardware Intel® Arc™ B-Series graphics support.
• Further enhanced coverage of Aten operators on Intel GPUs with SYCL* kernels for smooth eager mode execution, as well as bug fixes and performance optimizations for torch.compile on Intel GPUs.
        + +

        Get a tour of new environment setup, PIP wheels installation, and examples on Intel® Client GPUs and Intel® Data Center GPU Max Series in the Getting Started Guide.

        + +

        FlexAttention Optimization on X86 CPU for LLM

        + +

FlexAttention was first introduced in PyTorch 2.5 to address the need to support various attention variants, or even combinations of them. This PyTorch API leverages torch.compile to generate a fused FlashAttention kernel, which eliminates extra memory allocation and achieves performance comparable to handwritten implementations.

        + +

Previously, FlexAttention was implemented for CUDA* devices based on the Triton backend. Since PyTorch 2.6, x86 CPU support for FlexAttention has been added through the TorchInductor CPP backend. This new feature leverages and extends the current CPP template abilities to support broad attention variants (e.g., PagedAttention, which is critical for LLM inference) based on the existing FlexAttention API, and brings optimized performance on x86 CPUs. With this feature, users can easily use the FlexAttention API to compose their attention solutions on CPU platforms and achieve good performance.
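As a minimal sketch of the FlexAttention API on CPU (the causal score_mod below is just one example of an attention variant, and the tensor shapes are arbitrary):

import torch
from torch.nn.attention.flex_attention import flex_attention

def causal(score, b, h, q_idx, kv_idx):
    # Keep scores on or below the diagonal; mask out future positions.
    return torch.where(q_idx >= kv_idx, score, -float("inf"))

q = torch.randn(1, 8, 128, 64)   # (batch, heads, seq_len, head_dim)
k = torch.randn(1, 8, 128, 64)
v = torch.randn(1, 8, 128, 64)

# torch.compile fuses the attention variant into a single kernel; on x86 CPUs
# this goes through the TorchInductor CPP backend described above.
fused_attention = torch.compile(flex_attention)
out = fused_attention(q, k, v, score_mod=causal)
print(out.shape)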

        + +

Typically, FlexAttention is utilized by popular LLM ecosystem projects, such as Hugging Face transformers and vLLM, in their LLM-related modeling (e.g., PagedAttention) to achieve better out-of-the-box performance. Before the official adoption happens, this enabling PR in Hugging Face can help demonstrate the performance benefits that FlexAttention can bring on x86 CPU platforms.

        + +

The graph below shows the performance comparison of PyTorch 2.6 (with this feature) and PyTorch 2.5 (without this feature) on typical Llama models. For real-time mode (Batch Size = 1), there is about a 1.13x-1.42x performance improvement for next-token latency across different input token lengths. As for best throughput under a typical SLA (P99 token latency <= 50 ms), PyTorch 2.6 achieves more than 7.83x the performance of PyTorch 2.5, because PyTorch 2.6 can run 8 inputs (Batch Size = 8) together and still keep the SLA while PyTorch 2.5 can only run 1 input; the FlexAttention-based PagedAttention in PyTorch 2.6 provides more efficiency in multiple-batch-size scenarios.

        + +

        Figure 1. Performance comparison of PyTorch 2.6 and PyTorch 2.5 on Typical Llama Models


        + +

        FP16 on X86 CPU Support for Eager and Inductor Modes

        + +

        Float16 is a commonly used reduced-precision floating point type that improves performance in neural network inference and training. CPUs such as the recently launched Intel® Xeon® 6 with P-cores support the Float16 datatype with the native AMX accelerator, which greatly improves Float16 performance. Float16 support on x86 CPUs was first introduced in PyTorch 2.5 as a prototype feature. It has now been further improved for both eager mode and torch.compile + Inductor mode, and has been promoted to Beta level for broader adoption. This helps deployment on the CPU side without the need to modify model weights when a model is pre-trained with mixed Float16/Float32 precision. On platforms that support AMX Float16 (i.e., Intel® Xeon® 6 processors with P-cores), Float16 has the same pass rate as BFloat16 across the typical PyTorch benchmark suites: TorchBench, Hugging Face, and TIMM. It also shows performance comparable to the other 16-bit datatype, BFloat16.
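        A minimal sketch of CPU Float16 inference, assuming an FP16-capable x86 CPU (the model and shapes are illustrative and not from the release notes; Float16 CPU autocast availability depends on the PyTorch build):

        import torch

        model = torch.nn.Sequential(
            torch.nn.Linear(1024, 1024),
            torch.nn.ReLU(),
            torch.nn.Linear(1024, 10),
        ).eval()

        # Inductor path; plain eager execution works as well.
        compiled_model = torch.compile(model)

        x = torch.randn(32, 1024)
        with torch.no_grad(), torch.autocast(device_type="cpu", dtype=torch.float16):
            out = compiled_model(x)

        print(out.dtype)  # torch.float16 when autocast selects FP16 kernels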

        + +

        Summary

        + +

        In this blog, we discussed three features to enhance developer productivity on Intel platforms in PyTorch 2.6. These three features are designed to improve Intel GPU availability, optimize FlexAttention for x86 CPUs tailored for large language models (LLMs), and support FP16 on x86 CPUs in both eager and Inductor modes. Get PyTorch 2.6 and try them for yourself or you can access PyTorch 2.6 on the Intel® Tiber™ AI Cloud to take advantage of hosted notebooks that are optimized for Intel hardware and software.

        + +

        Acknowledgements

        + +

        The release of PyTorch 2.6 is an exciting milestone for Intel platforms, and it would not have been possible without the deep collaboration and contributions from the community. We extend our heartfelt thanks to Alban, Andrey, Bin, Jason, Jerry and Nikita for sharing their invaluable ideas, meticulously reviewing PRs, and providing insightful feedback on RFCs. Their dedication has driven continuous improvements and pushed the ecosystem forward for Intel platforms.

        + +


        Product and Performance Information

        + +

        Measurement on AWS EC2 m7i.metal-48xl using: 2x Intel® Xeon® Platinum 8488C, HT On, Turbo On, NUMA 2, Integrated Accelerators Available [used]: DLB [8], DSA [8], IAA[8], QAT[on CPU, 8], Total Memory 512GB (16x32GB DDR5 4800 MT/s [4400 MT/s]), BIOS Amazon EC2 1.0, microcode 0x2b000603, 1x Elastic Network Adapter (ENA) 1x Amazon Elastic Block Store 800G, Ubuntu 24.04.1 LTS 6.8.0-1018-aws Test by Intel on Jan 15th 2025.

        + +

        Notices and Disclaimers

        + +

        Performance varies by use, configuration and other factors. Learn more on the Performance Index site. Performance results are based on testing as of dates shown in configurations and may not reflect all publicly available updates.  See backup for configuration details.  No product or component can be absolutely secure. Your costs and results may vary. Intel technologies may require enabled hardware, software or service activation.

        + +

        Intel Corporation. Intel, the Intel logo, and other Intel marks are trademarks of Intel Corporation or its subsidiaries. Other names and brands may be claimed as the property of others.

        + +

        AI disclaimer:

        + +

        AI features may require software purchase, subscription or enablement by a software or platform provider, or may have specific configuration or compatibility requirements. Details at www.intel.com/AIPC. Results may vary.

        + +
        diff --git a/blog/updates-improvements-to-pytorch-tutorials/index.html b/blog/updates-improvements-to-pytorch-tutorials/index.html
        new file mode 100644
        index 000000000000..785d2c22f44c
        --- /dev/null
        +++ b/blog/updates-improvements-to-pytorch-tutorials/index.html
        @@ -0,0 +1,722 @@

        Updates & Improvements to PyTorch Tutorials | PyTorch

        by Team PyTorch

        +

        PyTorch.org provides researchers and developers with documentation, installation instructions, latest news, community projects, tutorials, and more. Today, we are introducing usability and content improvements including tutorials in additional categories, a new recipe format for quickly referencing common topics, sorting using tags, and an updated homepage.

        + +

        Let’s take a look at them in detail.

        + +

        TUTORIALS HOME PAGE UPDATE

        +

        The tutorials home page now provides clear actions that developers can take. For new PyTorch users, there is an easy-to-discover button to take them directly to “A 60 Minute Blitz”. Right next to it, there is a button to view all recipes which are designed to teach specific features quickly with examples.

        + +
        + +
        + +

        In addition to the existing left navigation bar, tutorials can now be quickly filtered by multi-select tags. Let’s say you want to view all tutorials related to “Production” and “Quantization”. You can select the “Production” and “Quantization” filters as shown in the image below:

        + +
        + +
        + +

        The following additional resources can also be found at the bottom of the Tutorials homepage:

        + + +

        PYTORCH RECIPES

        +

        Recipes are new bite-sized, actionable examples designed to teach researchers and developers how to use specific PyTorch features. Some notable new recipes include:

        + + +

        View the full recipes here.

        + +

        LEARNING PYTORCH

        +

        This section includes tutorials designed for users new to PyTorch. Based on community feedback, we have made updates to the current Deep Learning with PyTorch: A 60 Minute Blitz tutorial, one of our most popular tutorials for beginners. Upon completion, one can understand what PyTorch and neural networks are, and be able to build and train a simple image classification network. Updates include adding explanations to clarify output meanings and linking back to where users can read more in the docs, cleaning up confusing syntax errors, and reconstructing and explaining new concepts for easier readability.

        + +

        DEPLOYING MODELS IN PRODUCTION

        +

        This section includes tutorials for developers looking to take their PyTorch models to production. The tutorials include:

        + + +

        FRONTEND APIS

        +

        PyTorch provides a number of frontend API features that can help developers to code, debug, and validate their models more efficiently. This section includes tutorials that teach what these features are and how to use them. Some tutorials to highlight:

        + + +

        MODEL OPTIMIZATION

        +

        Deep learning models often consume large amounts of memory, power, and compute due to their complexity. This section provides tutorials for model optimization:

        + + +

        PARALLEL AND DISTRIBUTED TRAINING

        +

        PyTorch provides features that can accelerate performance in research and production such as native support for asynchronous execution of collective operations and peer-to-peer communication that is accessible from Python and C++. This section includes tutorials on parallel and distributed training:

        + + +

        Making these improvements is just the first step of improving PyTorch.org for the community. Please submit your suggestions here.

        + +

        Cheers,

        + +

        Team PyTorch

        + +
        diff --git a/blog/vllm-joins-pytorch/index.html b/blog/vllm-joins-pytorch/index.html
        new file mode 100644
        index 000000000000..7c1324ff4398
        --- /dev/null
        +++ b/blog/vllm-joins-pytorch/index.html
        @@ -0,0 +1,705 @@

        vLLM Joins PyTorch Ecosystem: Easy, Fast, and Cheap LLM Serving for Everyone | PyTorch

        by vLLM Team

        +

        vllm logo

        + +

        We’re thrilled to announce that the vLLM project has become a PyTorch ecosystem project, and joined the PyTorch ecosystem family!

        + +

        For more information on what it means to be a PyTorch ecosystem project, see the PyTorch Ecosystem Tools page.

        + +

        Running large language models (LLMs) is both resource-intensive and complex, especially as these models scale to hundreds of billions of parameters. That’s where vLLM comes in — a high-throughput, memory-efficient inference and serving engine designed for LLMs.

        + +

        Originally built around the innovative PagedAttention algorithm, vLLM has grown into a comprehensive, state-of-the-art inference engine. A thriving community is also continuously adding new features and optimizations to vLLM, including pipeline parallelism, chunked prefill, speculative decoding, and disaggregated serving.
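        Several of these capabilities are exposed as engine arguments when constructing an LLM. The sketch below is an assumption-laden illustration (parameter names such as tensor_parallel_size and enable_chunked_prefill reflect recent vLLM versions and may change), not an excerpt from the vLLM documentation:

        from vllm import LLM

        llm = LLM(
            model="meta-llama/Llama-3.1-8B",
            tensor_parallel_size=2,          # shard the model across 2 GPUs
            enable_chunked_prefill=True,     # split long prefills so they can be batched with decodes
        )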

        + +

        Since its release, vLLM has garnered significant attention, achieving over 31,000 GitHub stars—a testament to its popularity and thriving community. This milestone marks an exciting chapter for vLLM as we continue to empower developers and researchers with cutting-edge tools for efficient and scalable AI deployment. Welcome to the next era of LLM inference!

        + +

        vLLM has always had a strong connection with the PyTorch project. It is deeply integrated into PyTorch, leveraging it as a unified interface to support a wide array of hardware backends. These include NVIDIA GPUs, AMD GPUs, Google Cloud TPUs, Intel GPUs, Intel CPUs, Intel Gaudi HPUs, and AWS Neuron, among others. This tight coupling with PyTorch ensures seamless compatibility and performance optimization across diverse hardware platforms.

        + +

        Did you know you can experience the power of vLLM right from your phone? During this year’s Amazon Prime Day, vLLM played a crucial role in delivering lightning-fast responses to millions of users. Across three regions, over 80,000 Trainium and Inferentia chips powered an average of 3 million tokens per minute, all while maintaining a P99 latency of less than 1 second for the first response. That means when customers opened the Amazon app and chatted with Rufus, they were seamlessly interacting with vLLM in action!

        + +

        vLLM also collaborates tightly with leading model vendors to ensure support for popular models. This includes tight integration with Meta LLAMA, Mistral, QWen, and DeepSeek models, plus many others. One particularly memorable milestone was the release of LLAMA 3.1 (405B). As the launching partner, vLLM was the first to enable running this very large model, showcasing vLLM’s capability to handle the most complex and resource-intensive language models.

        + +

        To install vLLM, simply run:

        + +
        pip install vllm
        +
        + +

        vLLM is designed for both researchers and production-grade serving.

        + +

        To run vLLM as an OpenAI API compatible server, just use the Huggingface model ID:

        + +
        vllm serve meta-llama/Llama-3.1-8B
        +
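        Once the server is up, it can be queried with any OpenAI-compatible client. The snippet below is a hedged sketch: the port and route follow vLLM's defaults, and the api_key value is a placeholder.

        from openai import OpenAI

        client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")
        response = client.completions.create(
            model="meta-llama/Llama-3.1-8B",
            prompt="The capital of France is",
            max_tokens=16,
        )
        print(response.choices[0].text)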
        + +

        To run vLLM as a simple function:

        + +
        from vllm import LLM, SamplingParams

        # Sample prompts.
        prompts = [
            "Hello, my name is",
            "The president of the United States is",
            "The capital of France is",
            "The future of AI is",
        ]
        # Create a sampling params object.
        sampling_params = SamplingParams(temperature=0.8, top_p=0.95)

        # Create an LLM.
        llm = LLM(model="meta-llama/Llama-3.1-8B")
        # Generate texts from the prompts. The output is a list of RequestOutput objects
        # that contain the prompt, generated text, and other information.
        outputs = llm.generate(prompts, sampling_params)
        # Print the outputs.
        for output in outputs:
            prompt = output.prompt
            generated_text = output.outputs[0].text
            print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
        + +

        Open-source innovation is part of vLLM’s DNA. Born out of a Berkeley academic project, it follows the legacy of other pioneering open-source initiatives such as BSD, which revolutionized operating systems in the 1980s. Other innovations from the same organization include Apache Spark and Ray, now standards for big data and AI systems. In the Gen AI era, vLLM serves as a platform dedicated to democratizing AI inference.

        + +

        The vLLM team remains steadfast in its mission to keep the project “of the community, by the community, and for the community.” Collaboration and inclusivity lie at the heart of everything we do.

        + +

        If you have collaboration requests or inquiries, feel free to reach out at vllm-questions@lists.berkeley.edu. To join the active and growing vLLM community, explore our GitHub repository or connect with us on the vLLM Slack. Together, we can push the boundaries of AI innovation and make it accessible to all.

        + +
        diff --git a/blog/warp-specialization/index.html b/blog/warp-specialization/index.html
        new file mode 100644
        index 000000000000..412bb7e7a3c8
        --- /dev/null
        +++ b/blog/warp-specialization/index.html
        @@ -0,0 +1,741 @@

        Enabling advanced GPU features in PyTorch - Warp Specialization | PyTorch

        by Meta and NVIDIA

        +

        Meta: Hongtao Yu, Manman Ren, Bert Maher, Shane Nay
        NVIDIA: Gustav Zhu, Shuhao Jiang

        + +

        Over the past few months, we have been working on enabling advanced GPU features for PyTorch and Triton users through the Triton compiler. One of our key goals has been to introduce warp specialization support on NVIDIA Hopper GPUs. Today, we are thrilled to announce that our efforts have resulted in the rollout of fully automated Triton warp specialization, now available to users in the upcoming release of Triton 3.2, which will ship with PyTorch 2.6. PyTorch users can leverage this feature by implementing user-defined Triton kernels. This work leveraged an initial implementation of warp specialization in Triton by NVIDIA and we look forward to further development with the community in the future.

        + +

        Warp specialization (WS) is a GPU programming technique where warps (a group of 32 threads on NVIDIA GPUs) within a threadblock are assigned distinct roles or tasks. This approach optimizes performance by enabling efficient execution of workloads that require task differentiation or cooperative processing. It enhances kernel performance by leveraging an asynchronous execution model, where different parts of the kernel are managed by separate hardware units. Data communication between these units, facilitated via shared memory on the NVIDIA H100, is highly efficient. Compared to a uniform warp approach, warp specialization allows the hardware multitasking warp scheduler to operate more effectively, maximizing resource utilization and overall performance.

        + +

        Using GEMM as an example, a typical uniform warp approach on the H100 GPU involves 8 warps per thread block collectively computing a tile of the output tensor. These 8 warps are divided into two warp groups (WG), with each group cooperatively computing half of the tile using efficient warp-group-level MMA (WGMMA) instructions, as illustrated in Figure 1.

        + +

        Figure 1. GEMM K-loop Body with Uniform Warps


        + +

        The implementation is clean, easy to understand, and generally performs well, thanks to an elegant software pipeliner. The pipeliner’s purpose is to enhance instruction-level parallelism by executing non-dependent operations on different hardware units. For instance, load operations from the next loop iteration can be executed simultaneously with WGMMA operations in the current iteration. However, this approach relies heavily on the compiler to craft an instruction sequence that ensures load and WGMMA instructions are issued at precisely the right time. While this is relatively straightforward for GEMM, which involves a limited number of operations, it becomes significantly more challenging for more complex kernels, such as flash attention.

        + +

        On the other hand, warp specialization addresses programming challenges by separating operations intended to run simultaneously on different hardware units into distinct warps, synchronizing them efficiently using low-cost barriers in shared memory. This allows each warp to have its own instruction sequence, enabling instructions to be issued and executed continuously without being interrupted by other operations, thanks to the multi-way warp scheduler. An illustration of a warp-specialized GEMM can be seen in Figure 2.

        + +

        Figure 2. GEMM K-loop Body with Specialized Warps


        + +

        How to enable WS

        + +

        To enable warp specialization, users simply need to specify two autotune flags: num_consumer_groups and num_buffers_warp_spec. For example, a warp-specialized GEMM implementation might look as shown below. Users can enable warp specialization by setting a non-zero value for num_consumer_groups, which defines the number of consumer warp groups. There is no corresponding flag to set the number of producer warp groups, as currently only one producer is supported. The num_buffers_warp_spec flag specifies the number of buffers the producer warp group will use to communicate with the consumer warp groups. A working example of a warp-specialized kernel is provided in the persistent GEMM tutorial.

        + +
        @triton.autotune(
            configs=[
                triton.Config(
                    {
                        "BLOCK_SIZE_M": 128,
                        "BLOCK_SIZE_N": 256,
                        "BLOCK_SIZE_K": 64,
                        "GROUP_SIZE_M": 8,
                    },
                    num_stages=2,
                    num_warps=4,
                    num_consumer_groups=2,
                    num_buffers_warp_spec=3,
                ),
            ],
            key=["M", "N", "K"],
        )
        @triton.jit
        def matmul_persistent_ws_kernel(
            a_ptr, b_ptr, c_ptr, M, N, K,
            stride_am, stride_ak, stride_bk, stride_bn, stride_cm, stride_cn,
            # constexpr names must match the keys of the autotune config above
            BLOCK_SIZE_M: tl.constexpr, BLOCK_SIZE_N: tl.constexpr, BLOCK_SIZE_K: tl.constexpr,
            GROUP_SIZE_M: tl.constexpr,  # accepted from the config; unused in this simplified kernel
        ):
            pid = tl.program_id(axis=0)
            num_pid_m = tl.cdiv(M, BLOCK_SIZE_M)
            num_pid_n = tl.cdiv(N, BLOCK_SIZE_N)
            # Row-major mapping of the 1D program id onto the (num_pid_m x num_pid_n) tile grid.
            pid_m = pid // num_pid_n
            pid_n = pid % num_pid_n
            offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
            offs_n = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
            offs_k = tl.arange(0, BLOCK_SIZE_K)
            a_ptrs = a_ptr + (offs_m[:, None] * stride_am + offs_k[None, :] * stride_ak)
            b_ptrs = b_ptr + (offs_k[:, None] * stride_bk + offs_n[None, :] * stride_bn)
            acc = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
            for k in range(0, tl.cdiv(K, BLOCK_SIZE_K)):
                a = tl.load(a_ptrs)
                b = tl.load(b_ptrs)
                acc += tl.dot(a, b)
                a_ptrs += BLOCK_SIZE_K * stride_ak
                b_ptrs += BLOCK_SIZE_K * stride_bk
            c = acc.to(tl.float16)
            c_ptrs = c_ptr + stride_cm * offs_m[:, None] + stride_cn * offs_n[None, :]
            tl.store(c_ptrs, c)
        + +

        Under the Hood

        + +

        Warp specialization uses a set of hierarchical compiler transformations and IR changes to translate a user’s non-warp-specialized kernel into warp-specialized machine code. These include:

        + +
        • Task Partitioning: The entire kernel is automatically divided into asynchronous tasks based on predefined heuristics. The compiler determines how to utilize one producer warp group and a user-specified number of consumer warp groups to execute the kernel. It assigns task IDs to specific anchor operations, which then influence the task assignments for remaining operations through asynchronous task ID propagation and dependency analysis. Since shared memory is the most efficient method for data transfer between warp groups across all supported platforms, the compiler optimizes task partitions to minimize register spills to shared memory, ensuring efficient execution.

        • Data Partitioning for Multiple Consumer Groups: Efficiently partitioning data among multiple consumer groups is key to optimizing workload distribution. On the H100 GPU, the compiler, by default, attempts to partition the input tensor A along the M dimension, allowing each consumer group to compute half of the output tensor independently. This strategy, known as cooperative partitioning, maximizes efficiency under most conditions. However, if this split leads to inefficiencies—such as producing a workload smaller than the native WGMMA instruction size—the compiler dynamically adjusts and partitions along the N dimension instead.

        • Dataflow Pipelining: The compiler creates cyclic shared memory buffers to pipeline dataflows across multiple-dimensional loops. Warp-specialized pipelining supports complex control flow. For example, our warp-specialized persistent GEMM kernel uses a doubly-nested loop, allowing the producer to begin fetching data for the next output tile while the consumer is finishing the compute for the prior tile.

        • Communication Operations: We introduced four high-level Triton GPU IR (TTGIR) communication operations—ProducerAcquireOp, ProducerCommitOp, ConsumerWaitOp, and ConsumerReleaseOp—to manage pipelined dataflows. These support both TMA and non-TMA memory operations.

        • Code Partitioning: Each async task is outlined into its own standalone code region, guarded by warp group ID checks. Control dependencies are duplicated as needed.

        • TTGIR to LLVM/PTX Materialization: TTGIR communication operations are materialized into corresponding LLVM/PTX barrier operations.
        + +

        Performance

        + +

        The warp specialization release introduces a range of Triton compiler transformations that collectively convert user code into warp-specialized kernels. This feature has been applied to several key kernels, including Flash Attention and FP8 row-wise GEMM, resulting in significant performance gains of 10% to 15%. Below, we highlight the latest performance metrics for these high-impact kernels.

        + +

        [Bar charts: performance gains from warp specialization for the Flash Attention and FP8 row-wise GEMM kernels]

        + +

        Future Work

        + +

        Looking ahead, we plan to further enhance Triton’s warp specialization support by introducing new features such as Ping-Pong scheduling, expanded buffer sharing support, improved transparent handling for TMA, and refined partitioning heuristics for upcoming NVIDIA hardware.

        + +
        diff --git a/blog/what-every-user-should-know-about-mixed-precision-training-in-pytorch/index.html b/blog/what-every-user-should-know-about-mixed-precision-training-in-pytorch/index.html
        new file mode 100644
        index 000000000000..ff39048f2482
        --- /dev/null
        +++ b/blog/what-every-user-should-know-about-mixed-precision-training-in-pytorch/index.html
        @@ -0,0 +1,757 @@

        What Every User Should Know About Mixed Precision Training in PyTorch | PyTorch

        by Syed Ahmed, Christian Sarofeen, Mike Ruberry, Eddie Yan, Natalia Gimelshein, Michael Carilli, Szymon Migacz, Piotr Bialecki, Paulius Micikevicius, Dusan Stosic, Dong Yang, and Naoya Maruyama

        +

        Efficient training of modern neural networks often relies on using lower precision data types. Peak float16 matrix multiplication and convolution performance is 16x faster than peak float32 performance on A100 GPUs. And since the float16 and bfloat16 data types are only half the size of float32 they can double the performance of bandwidth-bound kernels and reduce the memory required to train a network, allowing for larger models, larger batches, or larger inputs. Using a module like torch.amp (short for “Automated Mixed Precision”) makes it easy to get the speed and memory usage benefits of lower precision data types while preserving convergence behavior.

        + +

        Going faster and using less memory is always advantageous – deep learning practitioners can test more model architectures and hyperparameters, and larger, more powerful models can be trained. Training very large models like those described in Narayanan et al. and Brown et al. (which take thousands of GPUs months to train even with expert handwritten optimizations) is infeasible without using mixed precision.

        + +

        We’ve talked about mixed precision techniques before (here, here, and here), and this blog post is a summary of those techniques and an introduction if you’re new to mixed precision.

        + +

        Mixed Precision Training in Practice

        + +

        Mixed precision training techniques – the use of the lower precision float16 or bfloat16 data types alongside the float32 data type – are broadly applicable and effective. See Figure 1 for a sampling of models successfully trained with mixed precision, and Figures 2 and 3 for example speedups using torch.amp.

        + +

        + +

        + +

        Figure 1: Sampling of DL Workloads Successfully Trained with float16 (Source).

        + +

        + +

        + +

        Figure 2: Performance of mixed precision training using torch.amp on NVIDIA 8xV100 vs. float32 training on 8xV100 GPU. Bars represent the speedup factor of torch.amp over float32. (Higher is better.) (Source).

        + +

        + +

        + +

        Figure 3. Performance of mixed precision training using torch.amp on NVIDIA 8xA100 vs. 8xV100 GPU. Bars represent the speedup factor of A100 over V100. (Higher is Better.) (Source).

        + +

        See the NVIDIA Deep Learning Examples repository for more sample mixed precision workloads.

        + +

        Similar performance charts can be seen in 3D medical image analysis, gaze estimation, video synthesis, conditional GANs, and convolutional LSTMs. Huang et al. showed that mixed precision training is 1.5x to 5.5x faster over float32 on V100 GPUs, and an additional 1.3x to 2.5x faster on A100 GPUs on a variety of networks. On very large networks the need for mixed precision is even more evident. Narayanan et al. reports that it would take 34 days to train GPT-3 175B on 1024 A100 GPUs (with a batch size of 1536), but it’s estimated it would take over a year using float32!

        + +

        Getting Started With Mixed Precision Using torch.amp

        + +

        torch.amp, introduced in PyTorch 1.6, makes it easy to leverage mixed precision training using the float16 or bfloat16 dtypes. See this blog post, tutorial, and documentation for more details. Figure 4 shows an example of applying AMP with grad scaling to a network.

        + +
        import torch
        # Creates the GradScaler once at the beginning of training
        scaler = torch.cuda.amp.GradScaler()

        for data, label in data_iter:
            optimizer.zero_grad()
            # Casts operations to mixed precision
            with torch.amp.autocast(device_type="cuda", dtype=torch.float16):
                loss = model(data)

            # Scales the loss, and calls backward()
            # to create scaled gradients
            scaler.scale(loss).backward()

            # Unscales gradients and calls
            # or skips optimizer.step()
            scaler.step(optimizer)

            # Updates the scale for next iteration
            scaler.update()
        + +

        Figure 4: AMP recipe

        + +

        Picking The Right Approach

        + +

        Out-of-the-box mixed precision training with either float16 or bfloat16 is effective at speeding up the convergence of many deep learning models, but some models may require more careful numerical accuracy management. Here are some options:

        + +
        • Full float32 precision. Floating point tensors and modules are created in float32 precision by default in PyTorch, but this is a historic artifact not representative of training most modern deep learning networks. It’s rare that networks need this much numerical accuracy.

        • Enabling TensorFloat32 (TF32) mode. On Ampere and later CUDA devices matrix multiplications and convolutions can use the TensorFloat32 (TF32) mode for faster but slightly less accurate computations. See the Accelerating AI Training with NVIDIA TF32 Tensor Cores blog post for more details. By default PyTorch enables TF32 mode for convolutions but not matrix multiplications, and unless a network requires full float32 precision we recommend enabling this setting for matrix multiplications, too (see the documentation here for how to do so, and the short sketch after this list). It can significantly speed up computations with typically negligible loss of numerical accuracy.

        • Using torch.amp with bfloat16 or float16. Both these low precision floating point data types are usually comparably fast, but some networks may only converge with one vs the other. If a network requires more precision it may need to use float16, and if a network requires more dynamic range it may need to use bfloat16, whose dynamic range is equal to that of float32. If overflows are observed, for example, then we suggest trying bfloat16.
        + +
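        A minimal sketch of enabling TF32 for matrix multiplications, assuming an Ampere-or-newer GPU (convolutions already default to TF32):

        import torch

        # Allow TF32 tensor cores for float32 matrix multiplications.
        torch.backends.cuda.matmul.allow_tf32 = True
        # cuDNN convolutions already allow TF32 by default; shown here for completeness.
        torch.backends.cudnn.allow_tf32 = True

        # Equivalent higher-level switch for matmul precision ("highest" disables TF32).
        torch.set_float32_matmul_precision("high")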

        There are even more advanced options than those presented here, like using torch.amp’s autocasting for only parts of a model, or managing mixed precision directly. These topics are largely beyond the scope of this blog post, but see the “Best Practices” section below.

        + +

        Best Practices

        + +

        We strongly recommend using mixed precision with torch.amp or the TF32 mode (on Ampere and later CUDA devices) whenever possible when training a network. If one of those approaches doesn’t work, however, we recommend the following:

        + +
        • High Performance Computing (HPC) applications, regression tasks, and generative networks may simply require full float32 IEEE precision to converge as expected.

        • Try selectively applying torch.amp (see the sketch after this list). In particular we recommend first disabling it on regions performing operations from the torch.linalg module or when doing pre- or post-processing. These operations are often especially sensitive. Note that TF32 mode is a global switch and can’t be used selectively on regions of a network. Enable TF32 first to check if a network’s operators are sensitive to the mode, otherwise disable it.

        • If you encounter type mismatches while using torch.amp we don’t suggest inserting manual casts to start. This error is indicative of something being off with the network, and it’s usually worth investigating first.

        • Figure out by experimentation if your network is sensitive to range and/or precision of a format. For example, fine-tuning bfloat16-pretrained models in float16 can easily run into range issues in float16 because of the potentially large range from training in bfloat16, so users should stick with bfloat16 fine-tuning if the model was trained in bfloat16.

        • The performance gain of mixed precision training can depend on multiple factors (e.g. compute-bound vs memory-bound problems) and users should use the tuning guide to remove other bottlenecks in their training scripts. Although they have similar theoretical performance benefits, BF16 and FP16 can have different speeds in practice. It’s recommended to try both formats and use the one with the best speed while maintaining the desired numeric behavior.
        + +
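        A minimal sketch of selective autocasting, assuming a CUDA device (the tensors and the eigvalsh call are illustrative): most of the computation runs under autocast while a precision-sensitive torch.linalg call is kept in float32 by locally disabling autocast.

        import torch

        x = torch.randn(64, 256, device="cuda")

        with torch.autocast(device_type="cuda", dtype=torch.float16):
            y = x @ x.t()  # runs in float16 under autocast

            with torch.autocast(device_type="cuda", enabled=False):
                # Precision-sensitive linear algebra: exit autocast and compute in float32.
                eigvals = torch.linalg.eigvalsh(y.float())

            z = y.float() + eigvals.mean()  # continue the rest of the computation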

        For more details, refer to the AMP Tutorial, Training Neural Networks with Tensor Cores, and see the post “More In-Depth Details of Floating Point Precision” on PyTorch Dev Discussion.

        + +

        Conclusion

        + +

        Mixed precision training is an essential tool for training deep learning models on modern hardware, and it will become even more important in the future as the performance gap between lower precision operations and float32 continues to grow on newer hardware, as reflected in Figure 5.

        + +

        + +

        + +

        Figure 5: Relative peak throughput of float16 (FP16) vs float32 matrix multiplications on Volta and Ampere GPUs. On Ampere relative peak throughput for the TensorFloat32 (TF32) mode and bfloat16 matrix multiplications are shown, too. The relative peak throughput of low precision data types like float16 and bfloat16 vs. float32 matrix multiplications is expected to grow as new hardware is released.

        + +

        PyTorch’s torch.amp module makes it easy to get started with mixed precision, and we highly recommend using it to train faster and reduce memory usage. torch.amp supports both float16 and bfloat16 mixed precision.

        + +

        There are still some networks that are tricky to train with mixed precision, and for these networks we recommend trying TF32 accelerated matrix multiplications on Ampere and later CUDA hardware. Networks are rarely so precision sensitive that they require full float32 precision for every operation.

        + +

        If you have questions or suggestions for torch.amp or mixed precision support in PyTorch then let us know by posting to the mixed precision category on the PyTorch Forums or filing an issue on the PyTorch GitHub page.

        + +
        diff --git a/blog/zeus/index.html b/blog/zeus/index.html
        new file mode 100644
        index 000000000000..98ea60e63b08
        --- /dev/null
        +++ b/blog/zeus/index.html
        @@ -0,0 +1,809 @@

        Deep Learning Energy Measurement and Optimization | PyTorch

        by Jae-Won Chung

        +

        Zeus logo

        + +

        This post is authored by Jae-Won Chung, a PhD student at the University of Michigan and the lead of the ML.ENERGY Initiative.

        + +

        Deep learning consumes quite a bit of energy. For instance, training a single 200B LLM on AWS p4d instances consumed around 11.9 GWh (source: CIDR 2024 keynote), which is an amount that can single-handedly power more than a thousand average US households for a year.

        + +

        Zeus is an open-source toolbox for measuring and optimizing the energy consumption of deep learning workloads. Our goal is to make energy optimization based on accurate measurements as easy as possible for diverse deep learning workloads and setups by offering composable tools with minimal assumptions.

        + +

        Zeus largely provides two types of tools:

        + +
        1. Programmatic and command line GPU energy measurement tools
        2. Several energy optimization tools that find the best ML and/or GPU configurations
        + +

        Zeus can benefit those who would like to

        + +
        • measure and optimize their electricity cost
        • reduce heat dissipation from their GPUs (by lowering power draw)
        • report energy usage from research and development
        • reduce carbon footprint from electricity usage
        + +

        Part 1: Measuring Energy

        + +

        Just like performance optimization, accurate measurement is the basis of effective energy optimization. Popular proxies for estimating power consumption like the maximum power draw of the hardware can sometimes be vastly off compared to actual measurement.
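        A quick illustration with made-up numbers of how far a nameplate-power estimate can drift from a measurement-based one (energy = average power × time):

        # Hypothetical numbers for illustration only.
        tdp_watts = 700           # nameplate maximum power draw of the GPU
        measured_watts = 420      # actual average draw during the workload (assumed)
        window_seconds = 100      # length of the measurement window

        print("TDP-based estimate:", tdp_watts * window_seconds, "J")        # 70000 J
        print("Measurement-based: ", measured_watts * window_seconds, "J")   # 42000 J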

        + +

        To make energy measurement as easy and transparent as possible, the core utility Zeus offers is the ZeusMonitor class. Let’s take a look at the actual snippet:

        + +
        from zeus.monitor import ZeusMonitor

        # All four GPUs are measured simultaneously.
        monitor = ZeusMonitor(gpu_indices=[0,1,2,3])

        # Measure total time and energy within the window.
        monitor.begin_window("training")
        for e in range(100):

            # Measurement windows can arbitrarily be overlapped.
            monitor.begin_window("epoch")
            for x, y in train_dataloader:
                y_hat = model(x)
                loss = criterion(y, y_hat)
                loss.backward()
                optim.step()
            measurement = monitor.end_window("epoch")
            print(f"Epoch {e}: {measurement.time} s, {measurement.total_energy} J")

        measurement = monitor.end_window("training")
        print(f"Entire training: {measurement.time} s, {measurement.total_energy} J")
        + +

        What you see above is a typical PyTorch training loop which uses four GPUs for data parallel training. Inside, we created an instance of ZeusMonitor and passed in a list of GPU indices to monitor. Then, using the monitor, we can measure the time and energy consumption of arbitrary execution windows within the training script by pairing calls to begin_window and end_window. Multiple windows can overlap and nest in arbitrary ways without affecting the measurement of each, as long as their names are different.

        + +

        ZeusMonitor adds very little overhead – typically single digit milliseconds – around the window. This allows ZeusMonitor to be used in various applications. For instance:

        + +
        • The ML.ENERGY Leaderboard: The first open-source benchmark on how much energy LLM text generation consumes.
        • The ML.ENERGY Colosseum: An online service that lets users compare LLM responses side-by-side based on response quality and energy consumption.
        + +

        See our blog post for a deeper technical dive into accurate GPU energy measurement.

        + +

        Part 2: Optimizing Energy

        + +

        Let me introduce you to two of the energy optimizers provided by Zeus.

        + +

        GlobalPowerLimitOptimizer

        + +

        GPUs allow users to configure their maximum power draw, called the power limit. Typically, as you lower a GPU’s power limit from the default maximum, computation may get slightly slower, but you’ll save disproportionately more energy. The GlobalPowerLimitOptimizer in Zeus automatically finds the optimal GPU power limit globally across all GPUs.
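        For context (this is plain NVML via the pynvml bindings, not part of Zeus), the current power limit and the range a GPU will accept can be inspected like this:

        import pynvml

        pynvml.nvmlInit()
        handle = pynvml.nvmlDeviceGetHandleByIndex(0)

        current_mw = pynvml.nvmlDeviceGetPowerManagementLimit(handle)               # milliwatts
        min_mw, max_mw = pynvml.nvmlDeviceGetPowerManagementLimitConstraints(handle)
        print(f"current limit: {current_mw / 1000:.0f} W, "
              f"allowed range: {min_mw / 1000:.0f}-{max_mw / 1000:.0f} W")

        pynvml.nvmlShutdown()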

        + +
        from zeus.monitor import ZeusMonitor
        from zeus.optimizer.power_limit import GlobalPowerLimitOptimizer

        # The optimizer measures time and energy through the ZeusMonitor.
        monitor = ZeusMonitor(gpu_indices=[0,1,2,3])
        plo = GlobalPowerLimitOptimizer(monitor)

        for e in range(100):
            plo.on_epoch_begin()
            for x, y in train_dataloader:
                plo.on_step_begin()

                y_hat = model(x)
                loss = criterion(y, y_hat)
                loss.backward()
                optim.step()

                plo.on_step_end()
            plo.on_epoch_end()
        + +

        In our familiar PyTorch training loop, we have instantiated GlobalPowerLimitOptimizer and passed it an instance of the ZeusMonitor, through which the optimizer sees the GPUs. Then, we just need to let the optimizer know about training progress (step and epoch boundaries), and the optimizer will transparently do all the necessary profiling and converge to the optimal power limit.

        + +

        If you’re using the HuggingFace Trainer or SFTTrainer, integration is even easier:

        + +
        from zeus.monitor import ZeusMonitor
        from zeus.optimizer.power_limit import HFGlobalPowerLimitOptimizer

        # ZeusMonitor actually auto-detects CUDA_VISIBLE_DEVICES.
        monitor = ZeusMonitor()
        pl_optimizer = HFGlobalPowerLimitOptimizer(monitor)

        # Pass in the optimizer as a Trainer callback. Also works for SFTTrainer.
        trainer = Trainer(
            model=model,
            train_dataset=train_dataset,
            ...,
            callbacks=[pl_optimizer],
        )
        + +

        The HFGlobalPowerLimitOptimizer wraps GlobalPowerLimitOptimizer so that it automatically detects step and epoch boundaries. We have example integrations here, including running Gemma 7B supervised fine-tuning with QLoRA.

        + +

        Now, we know how to integrate the optimizer, but what is the optimal power limit? We know different users can have different preferences regarding trading off time and energy, so we allow users to specify an OptimumSelector (basically the Strategy Pattern) to express their needs.

        + +
        # Built-in strategies for selecting the optimal power limit.
        from zeus.optimizer.power_limit import (
            GlobalPowerLimitOptimizer,
            Time,
            Energy,
            MaxSlowdownConstraint,
        )

        # Minimize energy while tolerating at most 10% slowdown.
        plo = GlobalPowerLimitOptimizer(
            monitor,
            MaxSlowdownConstraint(factor=1.1),
        )
        + +

        Some of the built-in strategies include “Minimize time” (Time, this might still reduce the power limit from the default since some workloads exhibit almost no slowdown even on lower power limits), “Minimize energy” (Energy), “Somewhere in between” (ZeusCost), and “Minimize energy given maximum slowdown” (MaxSlowdownConstraint). Users can also create their own optimum selectors as needed.

        + +

        PipelineFrequencyOptimizer

        + +

        The pipeline frequency optimizer, based on our research paper Perseus, is our latest work on energy optimization for large model training, like GPT-3. Perseus can reduce the energy consumption of large model training with no or negligible training throughput degradation. We’ll briefly talk about how.

        + +

        one iteration of training with four stage pipeline parallelism

        + +

        The above is a visualization of one iteration of training with four stage pipeline parallelism running with the 1F1B schedule. Each box is either a forward or a backward computation, and is colored with its power consumption.

        + +

        The key observation here is that when models are partitioned into pipeline stages, it’s very difficult to slice them into perfectly equal sizes. This leads to forward/backward boxes of varying widths and therefore computation idle time between boxes. Notice that the smaller boxes can run slightly slower than the wider boxes without changing the overall critical path (blue line) at all.

        + +

        one iteration of training with four stage pipeline parallelism

        + +

        That’s what Perseus automatically does. Based on profiling, it identifies computation boxes that are not on the critical path and figures out the precise amount of slowdown for each box that minimizes energy consumption. When done correctly, computations we slowed down will consume less power & energy, but the overall iteration time of the pipeline does not change.

        + +

        See our guide to get started with Perseus!

        + +

        Final Words

        + +

        For users who run their own on-premise compute, energy consumption and the resulting electricity bill is not something that can be easily overlooked. On a larger scale, energy consumption is not just about electricity bills, but also about data center power delivery. With thousands of GPUs running in clusters, finding stable, affordable, and sustainable electricity sources to power data centers is becoming increasingly challenging. Finding ways to reduce energy disproportionately more than slowdown leads to lower average power consumption, which can help with the power delivery challenge.

        + +

        With Zeus, we hope to take the first step towards deep learning energy measurement and optimization.

        + +

        Wondering where to go from here? Here are a couple helpful links:

        diff --git a/board_info/advanced-micro-devices.html b/board_info/advanced-micro-devices.html
        new file mode 100644
        diff --git a/board_info/arm.html b/board_info/arm.html
        new file mode 100644
        diff --git a/board_info/aws.html b/board_info/aws.html
        new file mode 100644
        diff --git a/board_info/google-cloud.html b/board_info/google-cloud.html
        new file mode 100644
        diff --git a/board_info/huawei.html b/board_info/huawei.html
        new file mode 100644
        diff --git a/board_info/hugging-face.html b/board_info/hugging-face.html
        new file mode 100644
        diff --git a/board_info/ibm.html b/board_info/ibm.html
        new file mode 100644
        diff --git a/board_info/intel.html b/board_info/intel.html
        new file mode 100644
        diff --git a/board_info/lightning.html b/board_info/lightning.html
        new file mode 100644
        diff --git a/board_info/meta.html b/board_info/meta.html
        new file mode 100644
        diff --git a/board_info/microsoft-corporation.html b/board_info/microsoft-corporation.html
        new file mode 100644
        diff --git a/board_info/nvidia-corporation.html b/board_info/nvidia-corporation.html
        new file mode 100644
        diff --git a/case_studies/amazon-ads.html b/case_studies/amazon-ads.html
        new file mode 100644
        index 000000000000..588ea386377c
        --- /dev/null
        +++ b/case_studies/amazon-ads.html
        @@ -0,0 +1,642 @@

        Amazon Ads | PyTorch

        June 04, 2025

        Amazon Ads

        by Team PyTorch

        Reduce inference costs by 71% and drive scale out using PyTorch, TorchServe, and AWS Inferentia.

        + +
        diff --git a/case_studies/salesforce.html b/case_studies/salesforce.html
        new file mode 100644
        index 000000000000..c21adc40c3fe
        --- /dev/null
        +++ b/case_studies/salesforce.html
        @@ -0,0 +1,642 @@

        Salesforce | PyTorch

        June 04, 2025

        Salesforce

        by Team PyTorch

        Pushing the state of the art in NLP and Multi-task learning.

        + +
        diff --git a/case_studies/stanford-university.html b/case_studies/stanford-university.html
        new file mode 100644
        index 000000000000..230a34ef5bc5
        --- /dev/null
        +++ b/case_studies/stanford-university.html
        @@ -0,0 +1,642 @@

        Stanford University | PyTorch

        June 04, 2025

        +

        + Stanford University +

        +
        +
        + +
        +
        +
        + +
        +

        + by + + Team PyTorch + +

        +

        Using PyTorch’s flexibility to efficiently research new algorithmic approaches.

        + + +
        + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/code-of-conduct.html b/code-of-conduct.html index 419ba4a38970..149dd98924a3 100644 --- a/code-of-conduct.html +++ b/code-of-conduct.html @@ -1,11 +1,310 @@ ---- -layout: default -title: PyTorch Foundation Code of Conduct -body-class: announcement -background-class: announcement-background -permalink: /code-of-conduct ---- -{% assign cards = site.board_info %} + + + + + + + + + + + + + PyTorch Foundation Code of Conduct | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
        +
        +
        +
        +
        + + +
        + + + + + + + + +
        + +
        +
        + + +
        + +
        @@ -222,3 +521,306 @@

Acknowledgements

        + + +
        + + + + + + + + + + + + + + + + + + + + + + + diff --git a/community-blog.html b/community-blog.html index 61339dbf6542..2f8fccf5b337 100644 --- a/community-blog.html +++ b/community-blog.html @@ -1,12 +1,310 @@ ---- -layout: default -title: Community Blog -permalink: /community-blog -body-class: blog -background-class: features-background ---- - -
        + + + + + + + + + + + + + Community Blog | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
        +
        +
        +
        +
        + + +
        + + + + + + + + +
        + +
        +
        + + +
        + +

        Community Blog

        Stories from the PyTorch Ecosystem

        @@ -21,23 +319,670 @@

        Stories from the PyTorch Ecosystem

        - {% assign community_blog = site.community_blog | sort_natural: "date" | reverse %} - {% for post in community_blog %} + + +
        +
        +

        March 19, 2025

        +

        + SGLang Joins PyTorch Ecosystem: Efficient LLM Serving Engine +

        +

        We’re thrilled to announce that the SGLang project has been integrated into the PyTorch ecosystem! This integration ensures that SGLang aligns with PyTorch’s standards and practices, providing developers with a reliable and community-supported framework for fast and flexible serving of LLMs. +

        +
        + + Read More + +
        + +
        +
        +

        March 16, 2025

        +

        + PyTorch at GTC 2025 +

        +

        GTC is coming back to San Jose on March 17–21, 2025. Join PyTorch Foundation members Arm, AWS, Google Cloud, IBM, Lightning AI, Meta, Microsoft Azure, Snowflake, and thousands of developers as we celebrate PyTorch. Together learn how AI & accelerated computing are helping humanity solve our most complex challenges. +

        +
        + + Read More + +
        + +
        +
        +

        March 07, 2025

        +

        + Powering AI with PyTorch, Fedora, and Open Source Communities +

        +

        At DevConf.IN 2025 in Pune, I had the opportunity to host a PyTorch Meetup on February 28th. The session, titled “Powering AI with PyTorch, Fedora, and Open Source Communities” was aimed at introducing PyTorch to students and professionals, explaining why PyTorch+Fedora form an ideal AI development platform. The other key aspect I covered was collaboration between open source communities. +

        +
        + + Read More + +
        + +
        +
        +

        February 19, 2025

        +

        + Optimize LLMs for Efficiency & Sustainability +

        +

        The rapid growth of large language model (LLM) applications is linked to rapid growth in energy demand. According to the International Energy Agency (IEA), data center electricity consumption is projected to roughly double by 2026 primarily driven by AI. This is due to the energy-intensive training requirements for massive LLMs – however, the increase in AI Inferencing workloads also plays a role. For example, compared with traditional search queries, a single AI inference can consume about 1...

        +
        + + Read More + +
        + +
        +
        +

        February 12, 2025

        +

+ Solve Real-World AI Challenges with PyTorch at Datathon 2025: DataOrbit +

        +

We’re excited to have PyTorch sponsor Datathon 2025: DataOrbit, a place where students can collaborate with a team to solve problems using real-world datasets! This event, hosted by Data Science UCSB in collaboration with Gaucho Sports Analytics and ACM@UCSB, will take place on February 22–23rd, 2025 at UC Santa Barbara, with the incredible opportunity to present your project to a panel of corporate and faculty judges – including the executive director of PyTorch! – for a chance to win prizes...

        +
        + + Read More + +
        + +
        +
        +

        January 22, 2025

        +

        + Bringing the PyTorch Community Together +

        +

        As we step into a new year, it’s a great moment to reflect on the incredible community events that made 2024 a memorable year for the PyTorch Foundation. Global meetups, events, and conferences brought the community together to learn, connect, and grow. Here’s a quick recap of the year’s highlights and what to expect in 2025. +

        +
        + + Read More + +
        + +
        +
        +

        January 15, 2025

        +

        + MLOps Workflow Simplified for PyTorch with Arm and GitHub Collaboration +

        +

        PyTorch is one of the most widely used and most powerful deep learning frameworks for training and deploying complex neural networks. It has never been easier to train and deploy AI applications, and low-cost, high-performance, energy-efficient hardware, tools, and technology for creating optimized workflows are more accessible than ever. But data science, machine learning, and devops can be deep topics unto themselves, and it can be overwhelming for developers with one specialty to see how ...

        +
        + + Read More + +
        + +
        +
        +

        December 18, 2024

        +

        + docTR joins PyTorch Ecosystem: From Pixels to Data, Building a Recognition Pipeline with PyTorch and docTR +

        +

        We’re thrilled to announce that the docTR project has been integrated into the PyTorch ecosystem! This integration ensures that docTR aligns with PyTorch’s standards and practices, giving developers a reliable, community-backed solution for powerful OCR workflows. +

        +
        + + Read More + +
        + +
        +
        +

        December 09, 2024

        +

        + vLLM Joins PyTorch Ecosystem: Easy, Fast, and Cheap LLM Serving for Everyone +

        +

        We’re thrilled to announce that the vLLM project has become a PyTorch ecosystem project, and joined the PyTorch ecosystem family! + +

        +
        + + Read More + +
        + +
        +
        +

        September 08, 2024

        +

        + PyTorch Shanghai Meetup Notes +

        +

        We are honored to successfully host the PyTorch Shanghai Meetup on August 15, 2024. This Meetup has received great attention from the industry. We invited senior PyTorch developers from Intel and Huawei as guest speakers, who shared their valuable experience and the latest technical trends. In addition, this event also attracted PyTorch enthusiasts from many technology companies and well-known universities. A total of more than 40 participants gathered together to discuss and exchange the lat...

        +
        + + Read More + +
        + +
        +
        +

        May 12, 2024

        +

        + Enhancing Deep Learning Workflows: PyTorch Ecosystem Tools +

        +

        Welcome to the thriving PyTorch ecosystem, where a wealth of tools and libraries await, purpose-built to elevate your experience in deep learning as a developer or researcher. The Ecosystem Tools pages host many projects from experts spanning academia, industry, application development, and machine learning. +

        +
        + + Read More + +
        + +
        +
        +

        May 11, 2024

        +

        + Deep Learning Energy Measurement and Optimization +

        +

        Zeus is an open-source toolbox for measuring and optimizing the energy consumption of deep learning workloads. Our goal is to make energy optimization based on accurate measurements as easy as possible for diverse deep learning workloads and setups by offering composable tools with minimal assumptions. +

        +
        + + Read More + +
        + +
        +
        +

        May 11, 2024

        +

        + Introducing depyf: mastering torch.compile with ease +

        +

        We are thrilled to introduce depyf, a new project to the PyTorch ecosystem designed to help users understand, learn, and adapt to torch.compile! +

        +
        + + Read More + +
        + +
        +
        +

        February 15, 2024

        +

        + Exploring scientific machine learning pipelines through the SimulAI toolkit +

        +

        SciML, short for Scientific Machine Learning, encompasses work that merges quantitative sciences with machine learning. It has gained significant traction over the past decade, driven by the widespread availability of specialized hardware (such as GPUs and TPUs) and datasets. Additionally, it has been propelled by the overarching influence of the machine learning wave, now ingrained in the zeitgeist of our times. In this context, we’d like to introduce SimulAI, an open-source toolkit under th...

        +
        + + Read More + +
        + +
        +
        +

        January 29, 2024

        +

        + Colossal-LLaMA-2: Low Cost and High-quality Domain-specific LLM Solution Using LLaMA and Colossal-AI +

        +

        The most prominent distinction between LLaMA-1 and LLaMA-2 lies in the incorporation of higher-quality corpora, a pivotal factor contributing to significant performance enhancements in LLaMA-2. This, coupled with its commercial availability, extends the potential for creative applications of large models within the open-source community. +

        +
        + + Read More + +
        + +
        +
        +

        January 25, 2024

        +

        + 3D rotations and spatial transformations made easy with RoMa +

        +

Struggling with quaternions, rotation vectors, right-hand rules and all this stuff? Try RoMa: an easy-to-use, stable and efficient library to deal with rotations and spatial transformations in PyTorch. +

        +
        + + Read More + +
        + +
        +
        +

        January 04, 2024

        +

        + torchdistill — a modular, configuration-driven framework for reproducible deep learning and knowledge distillation experiments +

        +

        This article summarizes key features and concepts of torchdistill (v1.0.0). Refer to the official documentation for its APIs and research projects. +

        +
        + + Read More + +
        + +
        +
        +

        December 06, 2023

        +

        + PyPose: A Library for Robot Learning with Physics-based Optimization +

        +

        We are excited to share our new open-source library PyPose. It is a PyTorch-based robotics-oriented library that provides a set of tools and algorithms for connecting deep learning with physics-based optimization. +

        +
        + + Read More + +
        + +
        +
        +

        November 09, 2023

        +

        + How Activation Checkpointing enables scaling up training deep learning models +

        +

        Activation checkpointing is a technique used for reducing the memory footprint at the cost of more compute. It utilizes the simple observation that we can avoid saving intermediate tensors necessary for backward computation if we just recompute them on demand instead. +

        +
        + + Read More + +
        + +
        +
        +
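The activation checkpointing excerpt above describes trading extra compute for memory by recomputing activations during the backward pass instead of storing them. A minimal illustrative sketch using PyTorch's built-in torch.utils.checkpoint utility (written for this listing, not taken from the linked post) could look like this:

```python
import torch
from torch.utils.checkpoint import checkpoint

# Two blocks whose intermediate activations we choose not to keep in memory;
# they are recomputed on demand when backward() runs.
block1 = torch.nn.Sequential(torch.nn.Linear(1024, 1024), torch.nn.ReLU())
block2 = torch.nn.Sequential(torch.nn.Linear(1024, 1024), torch.nn.ReLU())

x = torch.randn(32, 1024, requires_grad=True)

# use_reentrant=False selects the non-reentrant checkpoint implementation.
h = checkpoint(block1, x, use_reentrant=False)
out = checkpoint(block2, h, use_reentrant=False)
out.sum().backward()  # block1/block2 forwards are re-run here to obtain gradients
```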

        October 26, 2023

        +

        + torch.compile, explained +

        +

        Have you ever felt overwhelmed by the complexities of torch.compile? Diving into its workings can feel like black magic, with bytecode and Python internal details that many users fail to understand, hindering them from understanding and debugging torch.compile. +

        +
        + + Read More + +
        + +
        +
        +
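The excerpt above concerns torch.compile; purely as a minimal illustration (not code from the linked post), the entry point is a one-line wrapper around an ordinary function or module:

```python
import torch

def mlp(x, w1, w2):
    # Ordinary eager-mode PyTorch; torch.compile captures and optimizes it.
    return torch.nn.functional.gelu(x @ w1) @ w2

compiled_mlp = torch.compile(mlp)  # compilation happens lazily on the first call

x = torch.randn(64, 256)
w1, w2 = torch.randn(256, 512), torch.randn(512, 256)
print(compiled_mlp(x, w1, w2).shape)  # later calls reuse the compiled graph
```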

        July 06, 2023

        +

        + Unveiling the Power of Semi-Supervised Learning: The Unified Semi-Supervised Learning Benchmark +

        +

        Machine Learning models thrive on high-quality, fully-annotated data. The traditional supervised learning approach typically requires data on the scale of millions, or even billions, to train large foundational models. However, obtaining such a vast amount of labeled data is often tedious and labor-intensive. As an alternative, semi-supervised learning (SSL) aims to enhance model generalization with only a fraction of labeled data, complemented by a considerable amount of unlabeled data. This...

        +
        + + Read More + +
        + +
        +
        +

        June 29, 2023

        +

        + Introducing TorchOpt: A High-Performance Differentiable Optimization Library for PyTorch +

        +

Explore TorchOpt, a PyTorch-based library that revolutionizes differentiable optimization with its unified programming abstraction, high-performance distributed execution runtime, and support for various differentiation modes. +

        +
        + + Read More + +
        + +
        +
        +

        April 04, 2023

        +

        + Profiling PyTorch language models with octoml-profile +

        +

        The recent launch of PyTorch 2.0 makes it clear that the community is heavily investing in a compiler-powered future for machine learning. The new OctoML Profiler can help any user realize the full potential of these shifts in the ML landscape. +

        +
        + + Read More + +
        + +
        +
        +

        February 10, 2023

        +

        + How FASHABLE achieves SoA realistic AI generated images using PyTorch and Azure Machine Learning +

        +

Fashable is a company born at XNFY Lab (a joint initiative with Microsoft). The company’s main goal is to revolutionize the world of fashion with ethical Artificial Intelligence (AI) technologies built on the PyTorch framework. Fashable is focused on developing AI models that generate synthetic content for the global fashion industry. The fashion industry has been criticized in recent years because it generates a lot of waste and is responsible for up to 10% of global carbon dioxide output. Fas...

        +
        + + Read More + +
        +
        -

        {{ post.date | date: '%B %d, %Y' }}

        +

        January 31, 2023

        - {{ post.title }} + Latest Colossal-AI boasts novel automatic parallelism and offers savings up to 46x for Stable Diffusion 2

        -

        {{ post.excerpt | strip_html | truncate: 500}}

        +

        As a new PyTorch Ecosystem Partner, we at HPC-AI Tech look forward to working with the PyTorch community to advance AI technologies through our open source project, Colossal-AI. We are excited to join forces with the PyTorch community in this effort. +

        - + Read More
        - {% endfor %} + +
        +
        +

        January 06, 2023

        +

        + Distributed training with PyTorch and Azure ML +

        +

        Suppose you have a very large PyTorch model, and you’ve already tried many common tricks to speed up training: you optimized your code, you moved training to the cloud and selected a fast GPU VM, you installed software packages that improve training performance (for example, by using the ACPT curated environment on Azure ML). And yet, you still wish your model could train faster. Maybe it’s time to give distributed training a try! Continue reading to learn the simplest way to do distributed t...

        +
        + + Read More + +
        +
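The distributed-training excerpt above is a pointer to a longer tutorial; as a rough, self-contained sketch of the core PyTorch pieces it refers to (the gloo backend, localhost address, and toy model below are placeholder assumptions, and real jobs would launch one process per GPU with torchrun):

```python
import os
import torch
import torch.distributed as dist
from torch.nn.parallel import DistributedDataParallel as DDP

def main():
    # Single-process demo; torchrun normally sets RANK/WORLD_SIZE for each worker.
    os.environ.setdefault("MASTER_ADDR", "127.0.0.1")
    os.environ.setdefault("MASTER_PORT", "29500")
    dist.init_process_group("gloo", rank=0, world_size=1)

    model = torch.nn.Linear(16, 4)
    ddp_model = DDP(model)  # gradients are all-reduced across ranks during backward()

    opt = torch.optim.SGD(ddp_model.parameters(), lr=0.1)
    x, y = torch.randn(8, 16), torch.randn(8, 4)
    loss = torch.nn.functional.mse_loss(ddp_model(x), y)
    loss.backward()
    opt.step()

    dist.destroy_process_group()

if __name__ == "__main__":
    main()
```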
        + + +
        + + + + + + + + + + + + + + + + + + + + + + + diff --git a/community-stories.html b/community-stories.html index 84f0e2395229..c156e8bc15c7 100644 --- a/community-stories.html +++ b/community-stories.html @@ -1,12 +1,310 @@ ---- -layout: default -title: Community Stories -permalink: /community-stories -body-class: blog -background-class: comm-stories-background ---- - -
        + + + + + + + + + + + + + Community Stories | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
        +
        +
        +
        +
        + + +
        + + + + + + + + +
        + +
        +
        + + +
        + +

        Community Stories

        Read case studies on how our community solves real, everyday machine learning problems with PyTorch

        @@ -18,31 +316,1524 @@

        Community Stories

        - {% assign community_stories = site.community_stories | sort_natural: "date" | reverse %} - {% for post in community_stories %} + +
        -

        {{ post.date | date: '%B %d, %Y' }}

        +

        May 01, 2025

        - {{ post.title }} + How IBM Research Uses PyTorch and TerraTorch to Make Geospatial Computer Vision Accessible for Everyone

        -

        {{ post.excerpt | strip_html | truncate: 500}}

        +

Geospatial computer vision is essential for understanding our planet — from monitoring deforestation to tracking urban development and analyzing the impacts of climate change. However, the coding and deep learning skills for applying AI models to satellite imagery and earth observation data have traditionally been a major barrier for many practitioners. +

        - + Read More
        - {% endfor %} - -
        -
        -
        -
        + +
        +
        +

        January 24, 2025

        +

        + How Intel Uses PyTorch to Empower Generative AI through Intel Arc GPUs +

        + +

        Intel has long been at the forefront of technological innovation, and its recent venture into Generative AI (GenAI) solutions is no exception. With the rise of AI-powered gaming experiences, Intel sought to deliver an accessible and intuitive GenAI inferencing solution tailored for AI PCs powered by Intel’s latest GPUs. By leveraging PyTorch as the backbone for development efforts, Intel successfully launched AI Playground, an open source application that showcases advanced GenAI workloads. +

        +
        + + Read More + +
        + +
        +
        +

        September 27, 2024

        +

        + Using PyTorch for Monocular Depth Estimation Webinar +

        + +

        In this webinar, Bob Chesebrough of Intel guides you through the steps he took to create a clipped image with background clutter removed from the image. He accomplished this using monocular depth estimation with PyTorch. This could potentially be used to automate structure from motion and other image-related tasks where you want to highlight or focus on a single portion of an image, particularly for identifying parts of the image that were closest to the camera. Specifically, he used depth es...

        +
        + + Read More + +
        + +
        +
        +

        May 25, 2024

        +

        + AI Helps Duolingo Personalize Language Learning +

        + +

        Learning a foreign language was probably one of your goals last year. And the year before, and the year before that. Like gym memberships, our best intentions often don’t survive very long. Aside from the time required to achieve proficiency with a new language, most people struggle with traditional approaches to learning. Even many web-based language tools can be monotonous and cumbersome. +

        +
        + + Read More + +
        + +
        +
        +

        October 11, 2023

        +

        + ML Model Server Resource Saving - Transition From High-Cost GPUs to Intel CPUs and oneAPI powered Software with performance +

        + +

Here, we will be sharing our experience in moving AI workloads from our GPU servers to our Intel CPU servers without any performance or quality degradation, and saving annual costs of approximately 340 thousand U.S. dollars (refer to the Conclusion) in the process. +

        +
        + + Read More + +
        + +
        +
        +

        March 09, 2023

        +

        + Axon offers technology boost for public safety with in-car Automated License Plate Recognition on Azure +

        + +

        Axon, a technology leader in public safety, developed AI technology to add cutting-edge license plate recognition capabilities to its in-car camera products, which now can identify plates for vehicles of interest and provide law enforcement with proactive notifications and alerts. Axon AI scientists and engineers chose Microsoft Azure infrastructure as a scalable, cost-efficient, and feature-rich environment where they can develop and test AI models. With Azure compute, storage, and PyTorch a...

        +
        + + Read More + +
        + +
        +
        +

        February 21, 2023

        +

        + HippoScreen Improves AI Performance by 2.4x with oneAPI Tools +

        + +

The Taiwan-based neurotechnology startup used tools and frameworks in the Intel® oneAPI Base and AI Analytics Toolkits to improve the efficiency and build times of deep-learning models used in its Brain Waves AI system. As a result, HippoScreen is able to broaden the system’s applications to a wider range of psychiatric conditions and diseases. +

        +
        + + Read More + +
        + +
        +
        +

        February 02, 2023

        +

        + NASA and IBM to Speed AI Creation with New Foundation Models +

        + +

        NASA and IBM are working together to create foundation models based on NASA’s data sets — including geospatial data — with the goal of accelerating the creation of AI models. + +

        +
        + + Read More + +
        + +
        +
        +

        January 23, 2023

        +

        + Search Model Serving Using PyTorch and TorchServe +

        + +

        Walmart Search has embarked on the journey of adopting Deep Learning in the search ecosystem to improve search relevance. For our pilot use case, we served the computationally intensive Bert Base model at runtime with an objective to achieve low latency and high throughput. +

        +
        + + Read More + +
        + +
        +
        +

        December 30, 2022

        +

        + Extracting value from siloed healthcare data using federated learning with Azure Machine Learning +

        + +

        Sensitive information such as healthcare data is often siloed within health organization boundaries. This has posed a challenge to machine learning models used by the health and life sciences industry that require data for training purposes. To improve patient care and accelerate health industry progression, the Microsoft Health & Life Sciences AI group used a federated learning setup to train their biomedical natural language processing service, Text Analytics for Health, while preservin...

        +
        + + Read More + +
        + +
        +
        +

        December 02, 2022

        +

        + How PyTorch is bringing the power of AI to computers and smartphones +

        + +

        Many of the experiences people enjoy on Facebook and Instagram are powered by artificial intelligence (AI). A number of them, like Assistant, Avatars, and AR effects, cannot be powered by server-side AI due to latency, network bandwidth, and other constraints. Running AI on-device —that is, directly on a phone, tablet, or even a pair of smart glasses — offers huge advantages over constantly sending data back to a server. It’s faster, and it creates a privacy-enhancing experience for people wh...

        +
        + + Read More + +
        + +
        +
        +

        November 17, 2022

        +

        + IBM Research: Bringing massive AI models to any cloud +

        + +

        The field of AI is in the middle of a revolution. In recent years, AI models have made images, songs, or even websites out of simple text prompts. These types of models with billions of parameters, called foundation models, can with little fine-tuning be repurposed from one task to another, removing countless hours of training and labelling, and refitting a model to take on a new task. +

        +
        + + Read More + +
        + +
        +
        +

        October 25, 2022

        +

        + Run inference at scale for OpenFold, a PyTorch-based protein folding ML model, using Amazon EKS +

        + +

        In drug discovery, understanding the 3D structure of proteins is key to assessing the ability of a drug to bind to it, directly impacting its efficacy. Predicting the 3D protein form, however, is very complex, challenging, expensive, and time consuming, and can take years when using traditional methods such as X-ray diffraction. Applying machine learning (ML) to predict these structures can significantly accelerate the time to predict protein structures—from years to hours. Several high-profi...

        +
        + + Read More + +
        + +
        +
        +

        October 04, 2022

        +

        + Optimize Protein Folding Costs with OpenFold on AWS Batch +

        + +

        Knowing the physical structure of proteins is an important part of the drug discovery process. Machine learning (ML) algorithms like AlphaFold v2.0 significantly reduce the cost and time needed to generate usable protein structures. These projects have also inspired development of AI-driven workflows for de novo protein design and protein-ligand interaction analysis. +

        +
        + + Read More + +
        + +
        +
        +

        June 28, 2022

        +

        + Crayon boosts speed, accuracy of healthcare auditing process using Azure Machine Learning and PyTorch +

        + +

        Healthcare providers need to be able to verify that they’re maintaining the highest operating safety and efficacy standards. Those standards are set by a national accreditation organization whose surveyors, often healthcare professionals themselves, regularly visit facilities and document situations that might need to be corrected or brought back in line with the latest rules and policies. That assessment and accreditation process generates a huge amount of data, and even the most experienced...

        +
        + + Read More + +
        + +
        +
        +

        May 25, 2022

        +

        + Wayve’s AV2.0 builds a brighter future with Azure Machine Learning and PyTorch +

        + +

        Wayve wants to accelerate and scale autonomous vehicle (AV) development by using vision-based machine learning for rapid prototyping and quick iteration. So, it developed a platform that uses the open-source machine learning framework PyTorch with Microsoft Azure Machine Learning to gather, manage, and process millions of hours of driving data per year—petabytes of data—consisting of images, GPS data, and data from other sensors. Wayve now has the scalable capacity to build and iterate drivin...

        +
        + + Read More + +
        + +
        +
        +

        May 12, 2022

        +

        + Ambient Clinical Intelligence: Generating Medical Reports with PyTorch +

        + +

        Complete and accurate clinical documentation is an essential tool for tracking patient care. It allows for treatment plans to be shared among care teams to aid in continuity of care and ensures a transparent and effective process for reimbursement. +

        +
        + + Read More + +
        + +
        +
        +

        March 16, 2022

        +

        + Bentley Systems creates breakthrough framework, drastically speeds up AI development with Azure Machine Learning +

        + +

        Software innovator Bentley Systems offers a broad portfolio of solutions to help the organizations that design, build, and operate the world’s infrastructure assets. The company uses machine learning in its flagship product to read disparate paper-based asset data and transform it into consolidated digital data. To speed up and formalize this process, Bentley created a machine learning operations framework using Microsoft Azure Machine Learning and PyTorch. Developers’ speed and job satisfact...

        +
        + + Read More + +
        + +
        +
        +

        March 14, 2022

        +

        + Solliance makes headlines with cryptocurrency news analysis platform powered by Azure Machine Learning, PyTorch +

        + +

        Solliance delivers cutting-edge solutions that fill gaps across a wide variety of industries. Through its recent collaboration with Baseline, Solliance revolutionizes the cryptocurrency trading experience, extracting news insights from more than 150,000 global sources in near real time. To manage Baseline workloads, Solliance brought Microsoft Azure Machine Learning and PyTorch together for maximum processing power and deep learning capabilities. The result: investors can get under the headli...

        +
        + + Read More + +
        + +
        +
        +

        March 02, 2022

        +

        + Create a Wine Recommender Using NLP on AWS +

        + +

        In this tutorial, we’ll build a simple machine learning pipeline using a BERT word embedding model and the Nearest Neighbor algorithm to recommend wines based on user inputted preferences. To create and power this recommendation engine, we’ll leverage AWS’s SageMaker platform, which provides a fully managed way for us to train and deploy our service. +

        +
        + + Read More + +
        + +
        +
        +
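The wine-recommender excerpt above pairs BERT embeddings with a nearest-neighbour lookup; a toy sketch of that idea (random vectors stand in for real BERT embeddings, and the wine names are made up, not taken from the linked tutorial):

```python
import numpy as np
from sklearn.neighbors import NearestNeighbors

rng = np.random.default_rng(0)
wine_names = ["Pinot Noir A", "Riesling B", "Malbec C", "Chablis D"]
# Stand-ins for 768-dimensional BERT sentence embeddings of each wine's description.
wine_embeddings = rng.normal(size=(len(wine_names), 768))

index = NearestNeighbors(n_neighbors=2, metric="cosine").fit(wine_embeddings)

# Embedding of the user's free-text preference (also a random stand-in here).
user_pref = rng.normal(size=(1, 768))
_, neighbour_ids = index.kneighbors(user_pref)
print([wine_names[i] for i in neighbour_ids[0]])
```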

        February 24, 2022

        +

        + Amazon Ads Uses PyTorch and AWS Inferentia to Scale Models for Ads Processing +

        + +

        Amazon Ads uses PyTorch, TorchServe, and AWS Inferentia to reduce inference costs by 71% and drive scale out. Amazon Ads helps companies build their brand and connect with shoppers through ads shown both within and beyond Amazon’s store, including websites, apps, and streaming TV content in more than 15 countries. Businesses and brands of all sizes, including registered sellers, vendors, book vendors, Kindle Direct Publishing (KDP) authors, app developers, and agencies can upload their own ad...

        +
        + + Read More + +
        + +
        +
        +

        February 10, 2022

        +

        + ChemicalX: A Deep Learning Library for Drug Pair Scoring +

        + +

In this paper, we introduce ChemicalX, a PyTorch-based deep learning library designed for providing a range of state-of-the-art models to solve the drug pair scoring task. The primary objective of the library is to make deep drug pair scoring models accessible to machine learning researchers and practitioners in a streamlined fashion. The design of ChemicalX reuses existing high-level model training utilities, geometric deep learning, and deep chemistry layers from the PyTorch ecosystem. Our...

        +
        + + Read More + +
        + +
        +
        +

        January 04, 2022

        +

        + The Why and How of Scaling Large Language Models +

        + +

        Anthropic is an AI safety and research company that’s working to build reliable, interpretable, and steerable AI systems. Over the past decade, the amount of compute used for the largest training runs has increased at an exponential pace. We’ve also seen in many domains that larger models are able to attain better performance following precise scaling laws. The compute needed to train these models can only be attained using many coordinated machines that are communicating data between them. I...

        +
        + + Read More + +
        + +
        +
        +

        November 21, 2021

        +

        + Running BERT model inference on AWS Inf1: From model compilation to speed comparison +

        + +

        In this tech blog, we will compare the speed and cost of Inferentia, GPU, and CPU for a BERT sequence labeling example. We also provide a helpful tutorial on the steps for model compilation and inference on Inf1 instances. +

        +
        + + Read More + +
        + +
        +
        +

        November 09, 2021

        +

        + SearchSage: Learning Search Query Representations at Pinterest +

        + +

        Pinterest surfaces billions of ideas to people every day, and the neural modeling of embeddings for content, users, and search queries are key in the constant improvement of these machine learning-powered recommendations. Good embeddings — representations of discrete entities as vectors of numbers — enable fast candidate generation and are strong signals to models that classify, retrieve and rank relevant content. +

        +
        + + Read More + +
        + +
        +
        +

        October 18, 2021

        +

        + How We Built: An Early-Stage Recommender System +

        + +

        Personalization is ubiquitous on most platforms today. Supercharged by connectivity, and scaled by machine learning, most experiences on the internet are tailored to our personal tastes. Peloton classes offer a diversity of instructors, languages, fitness disciplines, durations and intensity. Each Member has specific fitness goals, schedule, fitness equipment, and level of skill or strength. This diversity of content and individuality of Member needs at massive scale creates the opportunity f...

        +
        + + Read More + +
        + +
        +
        +

        September 07, 2021

        +

        + Using a Grapheme to Phoneme Model in Cisco’s Webex Assistant +

        + +

        Grapheme to Phoneme (G2P) is a function that generates pronunciations (phonemes) for words based on their written form (graphemes). It has an important role in automatic speech recognition systems, natural language processing, and text-to-speech engines. In Cisco’s Webex Assistant, we use G2P modelling to assist in resolving person names from voice. See here for further details of various techniques we use to build robust voice assistants. +

        +
        + + Read More + +
        + +
        +
        +

        September 07, 2021

        +

        + How AI is Helping Vets to Help our Pets +

        + +

        1 in 4 dogs, and 1 in 5 cats, will develop cancer at some point in their lives. Pets today have a better chance of being successfully treated than ever, thanks to advances in early recognition, diagnosis and treatment. +

        +
        + + Read More + +
        + +
        +
        +

        August 10, 2021

        +

        + University of Pécs enables text and speech processing in Hungarian, builds the BERT-large model with just 1,000 euro with Azure +

        + +

        Everyone prefers to use their mother tongue when communicating with chat agents and other automated services. However, for languages like Hungarian—spoken by only 15 million people—the market size will often be viewed as too small for large companies to create software, tools or applications that can process Hungarian text as input. Recognizing this need, the Applied Data Science and Artificial Intelligence team from University of Pécs decided to step up. Using Microsoft AI Solutions and ONNX...

        +
        + + Read More + +
        + +
        +
        +

        June 17, 2021

        +

        + How 3DFY.ai Built a Multi-Cloud, Distributed Training Platform Over Spot Instances with TorchElastic and Kubernetes +

        + +

        Deep Learning development is becoming more and more about minimizing the time from idea to trained model. To shorten this lead time, researchers need access to a training environment that supports running multiple experiments concurrently, each utilizing several GPUs. +

        +
        + + Read More + +
        + +
        +
        +

        June 07, 2021

        +

        + AI21 Labs Trains 178-Billion-Parameter Language Model Using Amazon EC2 P4d Instances, PyTorch +

        + +

        AI21 Labs uses machine learning to develop language models focused on understanding meaning, and in 2021 it set a goal to train the recently released Jurassic-1 Jumbo, an autoregressive language model with 178 billion parameters. Developers who register for beta testing will get access to Jurassic-1 Jumbo and can immediately start to customize the model for their use case. The software startup wanted to train the model efficiently, so it looked to Amazon Web Services (AWS) and built a solutio...

        +
        + + Read More + +
        + +
        +
        +

        June 02, 2021

        +

        + PyTorch Community Voices +

        + +

Join us for an interview with star PyTorch community members Alexander O’Connor and Binghui Ouyang from Autodesk as we learn how they used PyTorch and AWS Inferentia to deploy production-scale models in chatbot intent classification. +

        +
        + + Read More + +
        + +
        +
        +

        May 14, 2021

        +

        + How Outreach Productionizes PyTorch-based Hugging Face Transformers for NLP +

        + +

        At Outreach, a leading sales engagement platform, our data science team is a driving force behind our innovative product portfolio largely driven by deep learning and AI. We recently announced enhancements to the Outreach Insights feature, which is powered by the proprietary Buyer Sentiment deep learning model developed by the Outreach Data Science team. This model allows sales teams to deepen their understanding of customer sentiment through the analysis of email reply content, moving from j...

        +
        + + Read More + +
        + +
        +
        +

        April 29, 2021

        +

        + Automated Background Removal in E-commerce Fashion Image Processing Using PyTorch on Databricks +

        + +

        Wehkamp is one of the biggest e-commerce companies in the Netherlands, with more than 500,000 daily visitors on their website. A wide variety of products offered on the Wehkamp site aims to meet its customers’ many needs. An important aspect of any customer visit on an e-commerce website is a qualitative and accurate visual experience of the products. At a large scale, this is no easy task, with thousands of product photos processed in a local photo studio. +

        +
        + + Read More + +
        + +
        +
        +

        April 27, 2021

        +

        + Disney's Creative Genome by Miquel Farré +

        + +

        Miquel Farré is a senior technology manager at Disney, taking the lead on projects at the intersection of video technology, machine learning and web applications. Metadata that drives content searchability is most often indexed at the title level, with limited governance and high ambiguity; at best, keyword metadata has been added to a title as a layer of enrichment. +

        +
        + + Read More + +
        + +
        +
        +

        April 07, 2021

        +

        + How We Used AWS Inferentia to Boost PyTorch NLP Model Performance by 4.9x for the Autodesk Ava Chatbot +

        + +

        Autodesk is a multinational software company with world-renowned products in areas such as Architecture, Engineering, & Construction, Manufacturing, and Media & Entertainment. Amongst Autodesk’s best-known products are AutoCAD, Revit, Maya, and Fusion 360. The company has millions of customers around the world, and many of them have need for support to make best use of their products. +

        +
        + + Read More + +
        + +
        +
        +

        February 25, 2021

        +

        + Machine Learning at Tubi: Powering Free Movies, TV and News for All +

        + +

        In this blog series, our aim is to highlight the nuances of Machine Learning in Tubi’s Ad-based Video on Demand (AVOD) space as practiced at Tubi. Machine Learning helps solve myriad problems involving recommendations, content understanding and ads. We extensively use PyTorch for several of these use cases as it provides us the flexibility, computational speed and ease of implementation to train large scale deep neural networks using GPUs. +

        +
        + + Read More + +
        + +
        +
        +

        January 27, 2021

        +

        + Deepset achieves a 3.9x speedup and 12.8x cost reduction for training NLP models by working with AWS and NVIDIA +

        + +

        At deepset, we’re building the next-level search engine for business documents. Our core product, Haystack, is an open-source framework that enables developers to utilize the latest NLP models for semantic search and question answering at scale. Our software as a service (SaaS) platform, Haystack Hub, is used by developers from various industries, including finance, legal, and automotive, to find answers in all kinds of text documents. You can use these answers to improve the search experienc...

        +
        + + Read More + +
        + +
        +
        +

        December 17, 2020

        +

        + Using PyTorch to streamline machine-learning projects +

        + +

        For many surgeons, the possibility of going back into the operating room to review the actions they carried out on a patient could provide invaluable medical insights. +

        +
        + + Read More + +
        + +
        +
        +

        December 17, 2020

        +

        + How theator Built a Continuous Training Framework To Scale up Its Surgical Intelligence Platform +

        + +

        Performing surgery is largely about decision making. As Dr. Frank Spencer put it in 1978, “A skillfully performed operation is about 75% decision making and 25% dexterity”. Five decades later, and the surgical field is finally — albeit gradually — implementing advances in data science and AI to enhance surgeons’ ability to make the best decisions in the operating room. That’s where theator comes in: the company is re-imagining surgery with a Surgical Intelligence platform that leverages highl...

        +
        + + Read More + +
        + +
        +
        +

        December 02, 2020

        +

        + Graph Convolutional Operators in the PyTorch JIT +

        + +

        In this talk, scientist Lindsey Gray and Ph.D. student Matthias Fey co-examine how the challenges of High Energy Particle Physics are driving the need for more efficient research and development pipelines in neural network development. In particular, they look at the additions made to PyTorch Geometric, which allow Graph Neural Network models to be compiled by the PyTorch JIT, significantly easing the process of deploying such networks at scale. +

        +
        + + Read More + +
        + +
        +
        +

        October 22, 2020

        +

        + How Wadhwani AI Uses PyTorch To Empower Cotton Farmers +

        + +

Cotton is a major fibre crop across the world, cultivated in over 80 countries, with nearly 100 million families relying on cotton farming for their livelihood. With such importance placed on many farmers’ crops, cotton’s particular vulnerability to pest infestations has been troubling to many. However, pest infestation is also simultaneously one of the most significant and preventable problems that farmers face, with 55% of all pesticide usage in India being devoted to cotton far...

        +
        + + Read More + +
        + +
        +
        +

        October 07, 2020

        +

        + How Lyft Uses PyTorch to Power Machine Learning for Their Self-Driving Cars +

        + +

        Lyft’s mission is to improve people’s lives with the world’s best transportation. We believe in a future where self-driving cars make transportation safer and more accessible for everyone. That’s why Level 5, Lyft’s self-driving division, is developing a complete autonomous system for the Lyft network to provide riders’ access to the benefits of this technology. However, this is an incredibly complex task. +

        +
        + + Read More + +
        + +
        +
        +

        September 30, 2020

        +

        + Speeding up drug discovery with advanced machine learning +

        + +

        Whatever our job title happens to be at AstraZeneca, we’re seekers. I’m part of the Biological Insights Knowledge Graph (BIKG) team. We help scientists comb through massive amounts of data in our quest to find the information we need to help us deliver life-changing medicines. +

        +
        + + Read More + +
        + +
        +
        +

        September 30, 2020

        +

        + AstraZeneca is using PyTorch-powered algorithms to discover new drugs +

        + +

        Since it launched in 2017, Facebook’s machine-learning framework PyTorch has been put to good use, with applications ranging from powering Elon Musk’s autonomous cars to driving robot-farming projects. Now pharmaceutical firm AstraZeneca has revealed how its in-house team of engineers are tapping PyTorch too, and for equally as important endeavors: to simplify and speed up drug discovery. +

        +
        + + Read More + +
        + +
        +
        +

        August 06, 2020

        +

        + AI for AG: Production machine learning for agriculture +

        + +

        How did farming affect your day today? If you live in a city, you might feel disconnected from the farms and fields that produce your food. Agriculture is a core piece of our lives, but we often take it for granted. +

        +
        + + Read More + +
        + +
        +
        +

        July 17, 2020

        +

        + How Pixar uses AI and GANs to create high-resolution content +

        + +

        As digital animators continue to push the boundaries of technology and creativity, the technical teams that support them are turning to artificial intelligence and machine learning to deliver the tools they need. That’s the case at Pixar, where the company has made new machine learning breakthroughs it hopes will both improve quality and reduce costs. +

        +
        + + Read More + +
        + +
        +
        +

        July 16, 2020

        +

        + How Disney uses PyTorch for animated character recognition +

        + +

        The long and incremental evolution of the media industry, from a traditional broadcast and home video model, to a more mixed model with increasingly digitally-accessible content, has accelerated the use of machine learning and artificial intelligence (AI). Advancing the implementation of these technologies is critical for a company like Disney that has produced nearly a century of content, as it allows for new consumer experiences and enables new applications for illustrators and writers to c...

        +
        + + Read More + +
        + +
        +
        +

        June 16, 2020

        +

        + How Trigo built a scalable AI development & deployment pipeline for Frictionless Retail +

        + +

        Trigo is a provider of AI & computer vision based checkout-free systems for the retail market, enabling frictionless checkout and a range of other in-store operational and marketing solutions such as predictive inventory management, security and fraud prevention, pricing optimization and event-driven marketing. +

        +
        + + Read More + +
        + +
        +
        +

        June 09, 2020

        +

        + How Datarock is using PyTorch for more intelligent mining decision making +

        + +

        The mining industry is currently going through a digital revolution as it looks for new and innovative ways to explore and extract mineral resources. This has largely been driven by a need to reduce costs in a competitive global industry that’s experiencing declining ore grades and fewer new discoveries. +

        +
        + + Read More + +
        + +
        +
        +

        April 25, 2020

        +

        + Deploying huggingface‘s BERT to production with pytorch/serve +

        + +

        TL;DR: pytorch/serve is a new awesome framework to serve torch models in production. This story teaches you how to use it for huggingface/transformers models like BERT. +

        +
        + + Read More + +
        + +
        +
        +

        November 14, 2019

        +

        + Using deep learning and PyTorch to power next gen aircraft at Caltech +

        + +

        Learn how Caltech’s Center for Autonomous Systems and Technologies (CAST) uses PyTorch to build deep learning systems that can understand the aerodynamics of how aircrafts interact with the ground to enable much smoother and safer landings. +

        +
        + + Read More + +
        + +
        +
        +

        November 06, 2019

        +

        + PyTorch at Dolby Labs +

        + +

        Hear how Dolby Labs is using PyTorch to develop deep learning for audio, and learn about the challenges that audio AI presents and the breakthroughs and applications they’ve built at Dolby to push the field forward. +

        +
        + + Read More + +
        + +
        +
        +

        August 20, 2019

        +

        + Dialogue Assistance for Customer Service at Airbnb +

        + +

        Businesses are using PyTorch, an open source machine learning framework, to seamlessly build, train, and deploy AI models in production across their products and services. Hear how industry leaders leverage PyTorch to help power everything from ubiquitous productivity software used across the world to enabling advances in medicine for fighting cancer. +

        +
        + + Read More + +
        + +
        +
        +

        July 23, 2019

        +

        + Mapillary Research: Seamless Scene Segmentation and In-Place Activated BatchNorm +

        + +

        With roads in developed countries like the US changing up to 15% annually, Mapillary addresses a growing demand for keeping maps updated by combining images from any camera into a 3D visualization of the world. Mapillary’s independent and collaborative approach enables anyone to collect, share, and use street-level images for improving maps, developing cities, and advancing the automotive industry. +

        +
        + + Read More + +
        + + +
        + + +
        + + + + + + + + + + + + + + + + + + + + + + + diff --git a/community-stories/1/index.html b/community-stories/1/index.html new file mode 100644 index 000000000000..cfe97e93ab4c --- /dev/null +++ b/community-stories/1/index.html @@ -0,0 +1 @@ +

        At Outreach, a leading sales engagement platform, our data science team is a driving force behind our innovative product portfolio largely driven by deep learning and AI. We recently announced enhancements to the Outreach Insights feature, which is powered by the proprietary Buyer Sentiment deep learning model developed by the Outreach Data Science team. This model allows sales teams to deepen their understanding of customer sentiment through the analysis of email reply content, moving from just counting the reply rate to classification of the replier’s intent.

        diff --git a/community-stories/10/index.html b/community-stories/10/index.html new file mode 100644 index 000000000000..a9c0fc3c1270 --- /dev/null +++ b/community-stories/10/index.html @@ -0,0 +1 @@ +

        Solliance delivers cutting-edge solutions that fill gaps across a wide variety of industries. Through its recent collaboration with Baseline, Solliance revolutionizes the cryptocurrency trading experience, extracting news insights from more than 150,000 global sources in near real time. To manage Baseline workloads, Solliance brought Microsoft Azure Machine Learning and PyTorch together for maximum processing power and deep learning capabilities. The result: investors can get under the headlines and see which specific news metrics are moving the volatile crypto market to make more informed trading decisions, while Baseline can release new features in weeks instead of months.

        diff --git a/community-stories/11/index.html b/community-stories/11/index.html new file mode 100644 index 000000000000..728a64d17b29 --- /dev/null +++ b/community-stories/11/index.html @@ -0,0 +1 @@ +

        In this tutorial, we’ll build a simple machine learning pipeline using a BERT word embedding model and the Nearest Neighbor algorithm to recommend wines based on user inputted preferences. To create and power this recommendation engine, we’ll leverage AWS’s SageMaker platform, which provides a fully managed way for us to train and deploy our service.

        diff --git a/community-stories/12/index.html b/community-stories/12/index.html new file mode 100644 index 000000000000..20e17bac23cf --- /dev/null +++ b/community-stories/12/index.html @@ -0,0 +1 @@ +

        Healthcare providers need to be able to verify that they’re maintaining the highest operating safety and efficacy standards. Those standards are set by a national accreditation organization whose surveyors, often healthcare professionals themselves, regularly visit facilities and document situations that might need to be corrected or brought back in line with the latest rules and policies. That assessment and accreditation process generates a huge amount of data, and even the most experienced surveyors struggle to keep ahead of the ongoing development of thousands of policy rules that might be relevant in any particular scenario. Vaagan and his team took on the task of fixing the issue by building a machine learning solution that could ingest text from those reports and return a top ten list of the latest associated rules with unprecedented accuracy. They used Azure technology, development tools, and services to bring that solution to fruition. Crayon customers report clear time savings with the new healthcare solution. Just as important, the solution provides consistent responses that aren’t subject to the vagaries of individual interpretation or potentially out-of-date data.

        diff --git a/community-stories/13/index.html b/community-stories/13/index.html new file mode 100644 index 000000000000..19261c8f2a75 --- /dev/null +++ b/community-stories/13/index.html @@ -0,0 +1 @@ +

        Sensitive information such as healthcare data is often siloed within health organization boundaries. This has posed a challenge to machine learning models used by the health and life sciences industry that require data for training purposes. To improve patient care and accelerate health industry progression, the Microsoft Health & Life Sciences AI group used a federated learning setup to train their biomedical natural language processing service, Text Analytics for Health, while preserving the trust boundaries of siloed data. The federated learning framework was built using Microsoft Azure Machine Learning and open-source technologies to help organizations analyze siloed data and build new applications without compromising data privacy.

        diff --git a/community-stories/14/index.html b/community-stories/14/index.html new file mode 100644 index 000000000000..88bca8c17c55 --- /dev/null +++ b/community-stories/14/index.html @@ -0,0 +1 @@ +

The Taiwan-based neurotechnology startup used tools and frameworks in the Intel® oneAPI Base and AI Analytics Toolkits to improve the efficiency and build times of the deep-learning models used in its Brain Waves AI system. As a result, HippoScreen is able to broaden the system’s applications to a wider range of psychiatric conditions and diseases.

        diff --git a/community-stories/16/index.html b/community-stories/16/index.html new file mode 100644 index 000000000000..1430e12feb24 --- /dev/null +++ b/community-stories/16/index.html @@ -0,0 +1 @@ +

        Miquel Farré is a senior technology manager at Disney, taking the lead on projects at the intersection of video technology, machine learning and web applications. Metadata that drives content searchability is most often indexed at the title level, with limited governance and high ambiguity; at best, keyword metadata has been added to a title as a layer of enrichment.

        diff --git a/community-stories/17/index.html b/community-stories/17/index.html new file mode 100644 index 000000000000..57e31ab4d9f3 --- /dev/null +++ b/community-stories/17/index.html @@ -0,0 +1 @@ +

        The long and incremental evolution of the media industry, from a traditional broadcast and home video model, to a more mixed model with increasingly digitally-accessible content, has accelerated the use of machine learning and artificial intelligence (AI). Advancing the implementation of these technologies is critical for a company like Disney that has produced nearly a century of content, as it allows for new consumer experiences and enables new applications for illustrators and writers to create the highest-quality content.

        diff --git a/community-stories/18/index.html b/community-stories/18/index.html new file mode 100644 index 000000000000..892a30dd97e3 --- /dev/null +++ b/community-stories/18/index.html @@ -0,0 +1 @@ +

        In this blog series, our aim is to highlight the nuances of Machine Learning in Tubi’s Ad-based Video on Demand (AVOD) space as practiced at Tubi. Machine Learning helps solve myriad problems involving recommendations, content understanding and ads. We extensively use PyTorch for several of these use cases as it provides us the flexibility, computational speed and ease of implementation to train large scale deep neural networks using GPUs.

        diff --git a/community-stories/19/index.html b/community-stories/19/index.html new file mode 100644 index 000000000000..fee7d95008c0 --- /dev/null +++ b/community-stories/19/index.html @@ -0,0 +1 @@ +

        As digital animators continue to push the boundaries of technology and creativity, the technical teams that support them are turning to artificial intelligence and machine learning to deliver the tools they need. That’s the case at Pixar, where the company has made new machine learning breakthroughs it hopes will both improve quality and reduce costs.

        diff --git a/community-stories/2/index.html b/community-stories/2/index.html new file mode 100644 index 000000000000..27b9b1224b6f --- /dev/null +++ b/community-stories/2/index.html @@ -0,0 +1 @@ +

        Amazon Ads uses PyTorch, TorchServe, and AWS Inferentia to reduce inference costs by 71% and drive scale out. Amazon Ads helps companies build their brand and connect with shoppers through ads shown both within and beyond Amazon’s store, including websites, apps, and streaming TV content in more than 15 countries. Businesses and brands of all sizes, including registered sellers, vendors, book vendors, Kindle Direct Publishing (KDP) authors, app developers, and agencies can upload their own ad creatives, which can include images, video, audio, and, of course, products sold on Amazon.

        diff --git a/community-stories/20/index.html b/community-stories/20/index.html new file mode 100644 index 000000000000..88ad90797b78 --- /dev/null +++ b/community-stories/20/index.html @@ -0,0 +1 @@ +

        In this tech blog, we will compare the speed and cost of Inferentia, GPU, and CPU for a BERT sequence labeling example. We also provide a helpful tutorial on the steps for model compilation and inference on Inf1 instances.
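For orientation, here is a hedged sketch of the compile-then-infer flow such a tutorial covers, using the AWS Neuron SDK's torch-neuron tracing API as best recalled; the checkpoint, sequence length, and exact API details are assumptions to verify against the Neuron documentation.

```python
# Hedged sketch of compiling a BERT-style model for Inf1 with torch-neuron and
# running inference on the saved TorchScript artifact. Checkpoint, sequence
# length, and API usage are assumptions; consult the AWS Neuron SDK docs.
import torch
import torch_neuron  # registers the torch.neuron namespace
from transformers import AutoModelForTokenClassification, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")          # placeholder checkpoint
model = AutoModelForTokenClassification.from_pretrained("bert-base-cased", torchscript=True)
model.eval()

example = tokenizer("Compile me for Inferentia", padding="max_length",
                    max_length=128, return_tensors="pt")
inputs = (example["input_ids"], example["attention_mask"])

# Compilation can run on a build machine; the saved module runs on an Inf1 instance.
neuron_model = torch.neuron.trace(model, example_inputs=inputs)
neuron_model.save("bert_neuron.pt")

loaded = torch.jit.load("bert_neuron.pt")
logits = loaded(*inputs)[0]
print(logits.shape)
```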

        diff --git a/community-stories/21/index.html b/community-stories/21/index.html new file mode 100644 index 000000000000..7d51b399621a --- /dev/null +++ b/community-stories/21/index.html @@ -0,0 +1 @@ +

        Complete and accurate clinical documentation is an essential tool for tracking patient care. It allows for treatment plans to be shared among care teams to aid in continuity of care and ensures a transparent and effective process for reimbursement.

        diff --git a/community-stories/22/index.html b/community-stories/22/index.html new file mode 100644 index 000000000000..28e02017dd25 --- /dev/null +++ b/community-stories/22/index.html @@ -0,0 +1 @@ +

Since it launched in 2017, Facebook’s machine-learning framework PyTorch has been put to good use, with applications ranging from powering Elon Musk’s autonomous cars to driving robot-farming projects. Now pharmaceutical firm AstraZeneca has revealed how its in-house team of engineers is tapping PyTorch too, for equally important endeavors: to simplify and speed up drug discovery.

        diff --git a/community-stories/23/index.html b/community-stories/23/index.html new file mode 100644 index 000000000000..c5a733be0486 --- /dev/null +++ b/community-stories/23/index.html @@ -0,0 +1 @@ +

        TL;DR: pytorch/serve is a new awesome framework to serve torch models in production. This story teaches you how to use it for huggingface/transformers models like BERT.
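A minimal sketch of what such a TorchServe setup can look like, assuming a Hugging Face sequence-classification checkpoint packaged with torch-model-archiver and started with `torchserve --start`; the handler below is illustrative, not the story's actual code.

```python
# Sketch of a TorchServe custom handler for a Hugging Face sequence classifier.
# Paths and the label handling are placeholders.
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from ts.torch_handler.base_handler import BaseHandler


class TransformersClassifierHandler(BaseHandler):
    def initialize(self, context):
        # model_dir contains the files packaged by torch-model-archiver
        model_dir = context.system_properties.get("model_dir")
        self.tokenizer = AutoTokenizer.from_pretrained(model_dir)
        self.model = AutoModelForSequenceClassification.from_pretrained(model_dir)
        self.model.eval()
        self.initialized = True

    def preprocess(self, requests):
        texts = [r.get("data") or r.get("body") for r in requests]
        texts = [t.decode("utf-8") if isinstance(t, (bytes, bytearray)) else t for t in texts]
        return self.tokenizer(texts, padding=True, truncation=True, return_tensors="pt")

    def inference(self, inputs):
        with torch.no_grad():
            return self.model(**inputs).logits

    def postprocess(self, logits):
        # One predicted class index per request in the batch
        return logits.argmax(dim=-1).tolist()
```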

        diff --git a/community-stories/24/index.html b/community-stories/24/index.html new file mode 100644 index 000000000000..93c138da5885 --- /dev/null +++ b/community-stories/24/index.html @@ -0,0 +1 @@ +

        1 in 4 dogs, and 1 in 5 cats, will develop cancer at some point in their lives. Pets today have a better chance of being successfully treated than ever, thanks to advances in early recognition, diagnosis and treatment.

        diff --git a/community-stories/25/index.html b/community-stories/25/index.html new file mode 100644 index 000000000000..0e6e8f14e830 --- /dev/null +++ b/community-stories/25/index.html @@ -0,0 +1 @@ +

        Performing surgery is largely about decision making. As Dr. Frank Spencer put it in 1978, “A skillfully performed operation is about 75% decision making and 25% dexterity”. Five decades later, and the surgical field is finally — albeit gradually — implementing advances in data science and AI to enhance surgeons’ ability to make the best decisions in the operating room. That’s where theator comes in: the company is re-imagining surgery with a Surgical Intelligence platform that leverages highly advanced AI, specifically machine learning and computer vision technology, to analyze every step, event, milestone, and critical junction of surgical procedures — significantly boosting surgeons’ overall performance.

        diff --git a/community-stories/26/index.html b/community-stories/26/index.html new file mode 100644 index 000000000000..db073271a4d6 --- /dev/null +++ b/community-stories/26/index.html @@ -0,0 +1 @@ +

        Whatever our job title happens to be at AstraZeneca, we’re seekers. I’m part of the Biological Insights Knowledge Graph (BIKG) team. We help scientists comb through massive amounts of data in our quest to find the information we need to help us deliver life-changing medicines.

        diff --git a/community-stories/27/index.html b/community-stories/27/index.html new file mode 100644 index 000000000000..1a498999036d --- /dev/null +++ b/community-stories/27/index.html @@ -0,0 +1 @@ +

        For many surgeons, the possibility of going back into the operating room to review the actions they carried out on a patient could provide invaluable medical insights.

        diff --git a/community-stories/28/index.html b/community-stories/28/index.html new file mode 100644 index 000000000000..0a0738494ff2 --- /dev/null +++ b/community-stories/28/index.html @@ -0,0 +1 @@ +

        In drug discovery, understanding the 3D structure of proteins is key to assessing the ability of a drug to bind to it, directly impacting its efficacy. Predicting the 3D protein form, however, is very complex, challenging, expensive, and time consuming, and can take years when using traditional methods such as X-ray diffraction. Applying machine learning (ML) to predict these structures can significantly accelerate the time to predict protein structures—from years to hours. Several high-profile research teams have released algorithms such as AlphaFold2 (AF2), RoseTTAFold, and others. These algorithms were recognized by Science magazine as the 2021 Breakthrough of the Year.

        diff --git a/community-stories/29/index.html b/community-stories/29/index.html new file mode 100644 index 000000000000..d6abcbf4e966 --- /dev/null +++ b/community-stories/29/index.html @@ -0,0 +1 @@ +

        Knowing the physical structure of proteins is an important part of the drug discovery process. Machine learning (ML) algorithms like AlphaFold v2.0 significantly reduce the cost and time needed to generate usable protein structures. These projects have also inspired development of AI-driven workflows for de novo protein design and protein-ligand interaction analysis.

        diff --git a/community-stories/3/index.html b/community-stories/3/index.html new file mode 100644 index 000000000000..8b03718d850f --- /dev/null +++ b/community-stories/3/index.html @@ -0,0 +1,3 @@ +

        NASA and IBM are working together to create foundation models based on NASA’s data sets — including geospatial data — with the goal of accelerating the creation of AI models.


        Foundation models are trained on large, broad data sets, then used to train other AI models by using targeted and smaller datasets. Foundation models can be used for different tasks and can apply information about one situation to another. One real-world example of a foundation model at work is ChatGPT3, which was built with the foundation model, GPT3.

        diff --git a/community-stories/30/index.html b/community-stories/30/index.html new file mode 100644 index 000000000000..54bc97d7e118 --- /dev/null +++ b/community-stories/30/index.html @@ -0,0 +1 @@ +

        The mining industry is currently going through a digital revolution as it looks for new and innovative ways to explore and extract mineral resources. This has largely been driven by a need to reduce costs in a competitive global industry that’s experiencing declining ore grades and fewer new discoveries.

        diff --git a/community-stories/32/index.html b/community-stories/32/index.html new file mode 100644 index 000000000000..8f2902f2a44a --- /dev/null +++ b/community-stories/32/index.html @@ -0,0 +1 @@ +

        Trigo is a provider of AI & computer vision based checkout-free systems for the retail market, enabling frictionless checkout and a range of other in-store operational and marketing solutions such as predictive inventory management, security and fraud prevention, pricing optimization and event-driven marketing.

        diff --git a/community-stories/33/index.html b/community-stories/33/index.html new file mode 100644 index 000000000000..189445f7c4b1 --- /dev/null +++ b/community-stories/33/index.html @@ -0,0 +1 @@ +

        Personalization is ubiquitous on most platforms today. Supercharged by connectivity, and scaled by machine learning, most experiences on the internet are tailored to our personal tastes. Peloton classes offer a diversity of instructors, languages, fitness disciplines, durations and intensity. Each Member has specific fitness goals, schedule, fitness equipment, and level of skill or strength. This diversity of content and individuality of Member needs at massive scale creates the opportunity for a recommender system to create a personalized experience on the Peloton platform.

        diff --git a/community-stories/34/index.html b/community-stories/34/index.html new file mode 100644 index 000000000000..8b4c0b6ee44b --- /dev/null +++ b/community-stories/34/index.html @@ -0,0 +1 @@ +

        Wehkamp is one of the biggest e-commerce companies in the Netherlands, with more than 500,000 daily visitors on their website. A wide variety of products offered on the Wehkamp site aims to meet its customers’ many needs. An important aspect of any customer visit on an e-commerce website is a qualitative and accurate visual experience of the products. At a large scale, this is no easy task, with thousands of product photos processed in a local photo studio.

        diff --git a/community-stories/35/index.html b/community-stories/35/index.html new file mode 100644 index 000000000000..c9c9dc75f45e --- /dev/null +++ b/community-stories/35/index.html @@ -0,0 +1,2 @@ +

Walmart Search has embarked on the journey of adopting deep learning in the search ecosystem to improve search relevance. For our pilot use case, we served the computationally intensive BERT Base model at runtime, with the objective of achieving low latency and high throughput.

        + diff --git a/community-stories/36/index.html b/community-stories/36/index.html new file mode 100644 index 000000000000..e00333fdcdb5 --- /dev/null +++ b/community-stories/36/index.html @@ -0,0 +1,2 @@ +

        Autodesk is a multinational software company with world-renowned products in areas such as Architecture, Engineering, & Construction, Manufacturing, and Media & Entertainment. Amongst Autodesk’s best-known products are AutoCAD, Revit, Maya, and Fusion 360. The company has millions of customers around the world, and many of them have need for support to make best use of their products.

        + diff --git a/community-stories/37/index.html b/community-stories/37/index.html new file mode 100644 index 000000000000..dd9bfb7823e9 --- /dev/null +++ b/community-stories/37/index.html @@ -0,0 +1 @@ +

        Software innovator Bentley Systems offers a broad portfolio of solutions to help the organizations that design, build, and operate the world’s infrastructure assets. The company uses machine learning in its flagship product to read disparate paper-based asset data and transform it into consolidated digital data. To speed up and formalize this process, Bentley created a machine learning operations framework using Microsoft Azure Machine Learning and PyTorch. Developers’ speed and job satisfaction have shot up since they began using this stable, reproducible framework, which easily gets their code into the cloud, accelerating delivery by three to five times and significantly increasing efficiency.

        diff --git a/community-stories/38/index.html b/community-stories/38/index.html new file mode 100644 index 000000000000..3ff272c45bf4 --- /dev/null +++ b/community-stories/38/index.html @@ -0,0 +1 @@ +

Join us for an interview with star PyTorch community members Alexander O’Connor and Binghui Ouyang from Autodesk as we learn how they used PyTorch and AWS Inferentia to deploy production-scale models for chatbot intent classification.

        diff --git a/community-stories/39/index.html b/community-stories/39/index.html new file mode 100644 index 000000000000..7422af52d2c6 --- /dev/null +++ b/community-stories/39/index.html @@ -0,0 +1 @@ +

        Many of the experiences people enjoy on Facebook and Instagram are powered by artificial intelligence (AI). A number of them, like Assistant, Avatars, and AR effects, cannot be powered by server-side AI due to latency, network bandwidth, and other constraints. Running AI on-device —that is, directly on a phone, tablet, or even a pair of smart glasses — offers huge advantages over constantly sending data back to a server. It’s faster, and it creates a privacy-enhancing experience for people who use our platforms. However, on-device AI presents new challenges, since it requires coping with devices that have a small battery, far less powerful processors, and less memory than a server in a data center.

        diff --git a/community-stories/4/index.html b/community-stories/4/index.html new file mode 100644 index 000000000000..c3601e6d5dec --- /dev/null +++ b/community-stories/4/index.html @@ -0,0 +1,2 @@ +

        How did farming affect your day today? If you live in a city, you might feel disconnected from the farms and fields that produce your food. Agriculture is a core piece of our lives, but we often take it for granted.

        + diff --git a/community-stories/40/index.html b/community-stories/40/index.html new file mode 100644 index 000000000000..509a7cad16f2 --- /dev/null +++ b/community-stories/40/index.html @@ -0,0 +1 @@ +

        Axon, a technology leader in public safety, developed AI technology to add cutting-edge license plate recognition capabilities to its in-car camera products, which now can identify plates for vehicles of interest and provide law enforcement with proactive notifications and alerts. Axon AI scientists and engineers chose Microsoft Azure infrastructure as a scalable, cost-efficient, and feature-rich environment where they can develop and test AI models. With Azure compute, storage, and PyTorch and machine learning resources, Axon can easily take advantage of the latest software and hardware technology to develop best-in-class AI solutions for its customers.

        diff --git a/community-stories/41/index.html b/community-stories/41/index.html new file mode 100644 index 000000000000..f886e1e59c98 --- /dev/null +++ b/community-stories/41/index.html @@ -0,0 +1 @@ +

Here, we share our experience moving AI workloads from our GPU servers to our Intel CPU servers without any performance or quality degradation, saving annual costs of approximately 340,000 U.S. dollars (see the Conclusion) in the process.

        diff --git a/community-stories/42/index.html b/community-stories/42/index.html new file mode 100644 index 000000000000..f8b8c8804732 --- /dev/null +++ b/community-stories/42/index.html @@ -0,0 +1 @@ +

        Businesses are using PyTorch, an open source machine learning framework, to seamlessly build, train, and deploy AI models in production across their products and services. Hear how industry leaders leverage PyTorch to help power everything from ubiquitous productivity software used across the world to enabling advances in medicine for fighting cancer.

        diff --git a/community-stories/43/index.html b/community-stories/43/index.html new file mode 100644 index 000000000000..8b1873748546 --- /dev/null +++ b/community-stories/43/index.html @@ -0,0 +1 @@ +

Learn how Caltech’s Center for Autonomous Systems and Technologies (CAST) uses PyTorch to build deep learning systems that can understand the aerodynamics of how aircraft interact with the ground to enable much smoother and safer landings.

        diff --git a/community-stories/44/index.html b/community-stories/44/index.html new file mode 100644 index 000000000000..2f5c7fedcdb4 --- /dev/null +++ b/community-stories/44/index.html @@ -0,0 +1 @@ +

        At deepset, we’re building the next-level search engine for business documents. Our core product, Haystack, is an open-source framework that enables developers to utilize the latest NLP models for semantic search and question answering at scale. Our software as a service (SaaS) platform, Haystack Hub, is used by developers from various industries, including finance, legal, and automotive, to find answers in all kinds of text documents. You can use these answers to improve the search experience, cover the long-tail of chat bot queries, extract structured data from documents, or automate invoicing processes.

        diff --git a/community-stories/45/index.html b/community-stories/45/index.html new file mode 100644 index 000000000000..c6edca44f515 --- /dev/null +++ b/community-stories/45/index.html @@ -0,0 +1 @@ +

        Hear how Dolby Labs is using PyTorch to develop deep learning for audio, and learn about the challenges that audio AI presents and the breakthroughs and applications they’ve built at Dolby to push the field forward.

        diff --git a/community-stories/46/index.html b/community-stories/46/index.html new file mode 100644 index 000000000000..9047ef314c73 --- /dev/null +++ b/community-stories/46/index.html @@ -0,0 +1 @@ +

        Grapheme to Phoneme (G2P) is a function that generates pronunciations (phonemes) for words based on their written form (graphemes). It has an important role in automatic speech recognition systems, natural language processing, and text-to-speech engines. In Cisco’s Webex Assistant, we use G2P modelling to assist in resolving person names from voice. See here for further details of various techniques we use to build robust voice assistants.

        diff --git a/community-stories/47/index.html b/community-stories/47/index.html new file mode 100644 index 000000000000..e86fbad663b2 --- /dev/null +++ b/community-stories/47/index.html @@ -0,0 +1 @@ +

        AI21 Labs uses machine learning to develop language models focused on understanding meaning, and in 2021 it set a goal to train the recently released Jurassic-1 Jumbo, an autoregressive language model with 178 billion parameters. Developers who register for beta testing will get access to Jurassic-1 Jumbo and can immediately start to customize the model for their use case. The software startup wanted to train the model efficiently, so it looked to Amazon Web Services (AWS) and built a solution using Amazon Elastic Compute Cloud (Amazon EC2), a web service that provides secure, resizable compute capacity in the cloud. Choosing Amazon EC2 gave the company control over the training process, including node allocation.

        diff --git a/community-stories/48/index.html b/community-stories/48/index.html new file mode 100644 index 000000000000..352973490e86 --- /dev/null +++ b/community-stories/48/index.html @@ -0,0 +1 @@ +

        Anthropic is an AI safety and research company that’s working to build reliable, interpretable, and steerable AI systems. Over the past decade, the amount of compute used for the largest training runs has increased at an exponential pace. We’ve also seen in many domains that larger models are able to attain better performance following precise scaling laws. The compute needed to train these models can only be attained using many coordinated machines that are communicating data between them. In this talk, Nicholas Joseph (Technical Staff, Anthropic) goes through why and how they can scale up training runs to use these machines efficiently.

        diff --git a/community-stories/49/index.html b/community-stories/49/index.html new file mode 100644 index 000000000000..fea2e24a5a05 --- /dev/null +++ b/community-stories/49/index.html @@ -0,0 +1 @@ +

Everyone prefers to use their mother tongue when communicating with chat agents and other automated services. However, for languages like Hungarian—spoken by only 15 million people—the market is often viewed as too small for large companies to create software, tools, or applications that can process Hungarian text as input. Recognizing this need, the Applied Data Science and Artificial Intelligence team from the University of Pécs decided to step up. Using Microsoft AI and ONNX Runtime solutions, it built and trained its own BERT-large model in native Hungarian in under 200 hours and at a total build cost of 1,000 euros.

        diff --git a/community-stories/5/index.html b/community-stories/5/index.html new file mode 100644 index 000000000000..0d0f5135f201 --- /dev/null +++ b/community-stories/5/index.html @@ -0,0 +1 @@ +

        In this webinar, Bob Chesebrough of Intel guides you through the steps he took to create a clipped image with background clutter removed from the image. He accomplished this using monocular depth estimation with PyTorch. This could potentially be used to automate structure from motion and other image-related tasks where you want to highlight or focus on a single portion of an image, particularly for identifying parts of the image that were closest to the camera. Specifically, he used depth estimation on a couple of images that he took at a natural history museum to capture just the dinosaur in the foreground, eliminating the background murals, lights, and building structure. The cool thing about this algorithm is that it creates a depth estimate from a single image!
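A rough sketch of the idea under stated assumptions: MiDaS loaded through torch.hub stands in for the depth model used in the webinar, and the file path and percentile threshold are placeholders.

```python
# Estimate per-pixel depth for a single image, then keep only the pixels
# closest to the camera. MiDaS via torch.hub is an assumed example model.
import torch
import numpy as np
from PIL import Image

midas = torch.hub.load("intel-isl/MiDaS", "MiDaS_small")
transform = torch.hub.load("intel-isl/MiDaS", "transforms").small_transform
midas.eval()

img = np.array(Image.open("dinosaur.jpg").convert("RGB"))   # placeholder path
batch = transform(img)

with torch.no_grad():
    depth = midas(batch).squeeze().cpu().numpy()            # MiDaS: higher value = closer

# Resize the depth map back to the image resolution and keep the nearest ~30% of pixels
depth = np.array(Image.fromarray(depth).resize(img.shape[1::-1]))
mask = depth > np.percentile(depth, 70)
clipped = (img * mask[..., None]).astype(np.uint8)
Image.fromarray(clipped).save("dinosaur_clipped.png")
```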

        diff --git a/community-stories/50/index.html b/community-stories/50/index.html new file mode 100644 index 000000000000..57fd76786bb4 --- /dev/null +++ b/community-stories/50/index.html @@ -0,0 +1 @@ +

        With roads in developed countries like the US changing up to 15% annually, Mapillary addresses a growing demand for keeping maps updated by combining images from any camera into a 3D visualization of the world. Mapillary’s independent and collaborative approach enables anyone to collect, share, and use street-level images for improving maps, developing cities, and advancing the automotive industry.

        diff --git a/community-stories/51/index.html b/community-stories/51/index.html new file mode 100644 index 000000000000..dfd47f994995 --- /dev/null +++ b/community-stories/51/index.html @@ -0,0 +1 @@ +

        Deep Learning development is becoming more and more about minimizing the time from idea to trained model. To shorten this lead time, researchers need access to a training environment that supports running multiple experiments concurrently, each utilizing several GPUs.

        diff --git a/community-stories/52/index.html b/community-stories/52/index.html new file mode 100644 index 000000000000..25a9447da531 --- /dev/null +++ b/community-stories/52/index.html @@ -0,0 +1 @@ +

        Pinterest surfaces billions of ideas to people every day, and the neural modeling of embeddings for content, users, and search queries are key in the constant improvement of these machine learning-powered recommendations. Good embeddings — representations of discrete entities as vectors of numbers — enable fast candidate generation and are strong signals to models that classify, retrieve and rank relevant content.
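To make the idea concrete, here is a toy, entirely illustrative sketch (not Pinterest's models): discrete items are mapped to learned vectors, and candidates are generated by a nearest-neighbor lookup in embedding space.

```python
# Toy illustration of embeddings as candidate-generation signals.
import torch
import torch.nn.functional as F

num_pins, dim = 10_000, 64
pin_embeddings = torch.nn.Embedding(num_pins, dim)   # in practice trained by an upstream objective

with torch.no_grad():
    query = pin_embeddings(torch.tensor([42]))                    # a pin the user engaged with
    scores = F.cosine_similarity(query, pin_embeddings.weight)    # similarity to every pin
    candidates = scores.topk(10).indices                          # fast candidate generation

print(candidates.tolist())
```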

        diff --git a/community-stories/53/index.html b/community-stories/53/index.html new file mode 100644 index 000000000000..243c15d06967 --- /dev/null +++ b/community-stories/53/index.html @@ -0,0 +1 @@ +

        The field of AI is in the middle of a revolution. In recent years, AI models have made images, songs, or even websites out of simple text prompts. These types of models with billions of parameters, called foundation models, can with little fine-tuning be repurposed from one task to another, removing countless hours of training and labelling, and refitting a model to take on a new task.

        diff --git a/community-stories/54/index.html b/community-stories/54/index.html new file mode 100644 index 000000000000..632fee728703 --- /dev/null +++ b/community-stories/54/index.html @@ -0,0 +1 @@ +

In this paper, we introduce ChemicalX, a PyTorch-based deep learning library designed to provide a range of state-of-the-art models for solving the drug pair scoring task. The primary objective of the library is to make deep drug pair scoring models accessible to machine learning researchers and practitioners in a streamlined framework. The design of ChemicalX reuses existing high-level model training utilities, geometric deep learning, and deep chemistry layers from the PyTorch ecosystem. Our system provides neural network layers, custom pair scoring architectures, data loaders, and batch iterators for end users. We showcase these features with example code snippets and case studies to highlight the characteristics of ChemicalX. A range of experiments on real-world drug-drug interaction, polypharmacy side effect, and combination synergy prediction tasks demonstrate that the models available in ChemicalX are effective at solving the pair scoring task. Finally, we show that ChemicalX can be used to train and score machine learning models on large drug pair datasets with hundreds of thousands of compounds on commodity hardware.

        diff --git a/community-stories/55/index.html b/community-stories/55/index.html new file mode 100644 index 000000000000..caa6849eb657 --- /dev/null +++ b/community-stories/55/index.html @@ -0,0 +1 @@ +

        In this talk, scientist Lindsey Gray and Ph.D. student Matthias Fey co-examine how the challenges of High Energy Particle Physics are driving the need for more efficient research and development pipelines in neural network development. In particular, they look at the additions made to PyTorch Geometric, which allow Graph Neural Network models to be compiled by the PyTorch JIT, significantly easing the process of deploying such networks at scale.

        diff --git a/community-stories/56/index.html b/community-stories/56/index.html new file mode 100644 index 000000000000..26bae87a28eb --- /dev/null +++ b/community-stories/56/index.html @@ -0,0 +1,2 @@ +

        Intel has long been at the forefront of technological innovation, and its recent venture into Generative AI (GenAI) solutions is no exception. With the rise of AI-powered gaming experiences, Intel sought to deliver an accessible and intuitive GenAI inferencing solution tailored for AI PCs powered by Intel’s latest GPUs. By leveraging PyTorch as the backbone for development efforts, Intel successfully launched AI Playground, an open source application that showcases advanced GenAI workloads.

        + diff --git a/community-stories/57/index.html b/community-stories/57/index.html new file mode 100644 index 000000000000..4940f1df3660 --- /dev/null +++ b/community-stories/57/index.html @@ -0,0 +1 @@ +

Geospatial computer vision is essential for understanding our planet — from monitoring deforestation to tracking urban development and analyzing the impacts of climate change. However, the coding and deep learning skills needed to apply AI models to satellite imagery and earth observation data have traditionally been a major barrier for many practitioners.

        diff --git a/community-stories/6/index.html b/community-stories/6/index.html new file mode 100644 index 000000000000..b0658a98dffb --- /dev/null +++ b/community-stories/6/index.html @@ -0,0 +1 @@ +

Cotton is a major fibre crop across the world, cultivated in over 80 countries, with nearly 100 million families worldwide relying on cotton farming for their livelihood. With so much riding on these crops, cotton’s particular vulnerability to pest infestations has been troubling for many farmers. Yet pest infestation is also one of the most significant and preventable problems they face, with 55% of all pesticide usage in India devoted to cotton farming.

        diff --git a/community-stories/7/index.html b/community-stories/7/index.html new file mode 100644 index 000000000000..7c23debd8c62 --- /dev/null +++ b/community-stories/7/index.html @@ -0,0 +1 @@ +

        Lyft’s mission is to improve people’s lives with the world’s best transportation. We believe in a future where self-driving cars make transportation safer and more accessible for everyone. That’s why Level 5, Lyft’s self-driving division, is developing a complete autonomous system for the Lyft network to provide riders’ access to the benefits of this technology. However, this is an incredibly complex task.

        diff --git a/community-stories/8/index.html b/community-stories/8/index.html new file mode 100644 index 000000000000..425e24461aa0 --- /dev/null +++ b/community-stories/8/index.html @@ -0,0 +1 @@ +

        Wayve wants to accelerate and scale autonomous vehicle (AV) development by using vision-based machine learning for rapid prototyping and quick iteration. So, it developed a platform that uses the open-source machine learning framework PyTorch with Microsoft Azure Machine Learning to gather, manage, and process millions of hours of driving data per year—petabytes of data—consisting of images, GPS data, and data from other sensors. Wayve now has the scalable capacity to build and iterate driving models for complex urban environments, adjust models more nimbly, and adapt to new environments more readily.

        diff --git a/community-stories/9/index.html b/community-stories/9/index.html new file mode 100644 index 000000000000..0bed358bf0f5 --- /dev/null +++ b/community-stories/9/index.html @@ -0,0 +1,2 @@ +

        Learning a foreign language was probably one of your goals last year. And the year before, and the year before that. Like gym memberships, our best intentions often don’t survive very long. Aside from the time required to achieve proficiency with a new language, most people struggle with traditional approaches to learning. Even many web-based language tools can be monotonous and cumbersome.

        + diff --git a/community_blog/3d-rotations-and-spatial-transformations-made-easy-with-roma-356a495a20c4.html b/community_blog/3d-rotations-and-spatial-transformations-made-easy-with-roma-356a495a20c4.html new file mode 100644 index 000000000000..86fb96e996e9 --- /dev/null +++ b/community_blog/3d-rotations-and-spatial-transformations-made-easy-with-roma-356a495a20c4.html @@ -0,0 +1 @@ +

Struggling with quaternions, rotation vectors, right-hand rules and all that? Try RoMa: an easy-to-use, stable and efficient library for dealing with rotations and spatial transformations in PyTorch.

        diff --git a/community_blog/bringing-the-pytorch-community-together.html b/community_blog/bringing-the-pytorch-community-together.html new file mode 100644 index 000000000000..3d8829afc1b3 --- /dev/null +++ b/community_blog/bringing-the-pytorch-community-together.html @@ -0,0 +1 @@ +

        As we step into a new year, it’s a great moment to reflect on the incredible community events that made 2024 a memorable year for the PyTorch Foundation. Global meetups, events, and conferences brought the community together to learn, connect, and grow. Here’s a quick recap of the year’s highlights and what to expect in 2025.

        diff --git a/community_blog/colossal-llama-2-low-cost-and-high-quality-domain-specific-llm-solution-using-llama-and-26d2e4b9fd92.html b/community_blog/colossal-llama-2-low-cost-and-high-quality-domain-specific-llm-solution-using-llama-and-26d2e4b9fd92.html new file mode 100644 index 000000000000..9f7e844ec19f --- /dev/null +++ b/community_blog/colossal-llama-2-low-cost-and-high-quality-domain-specific-llm-solution-using-llama-and-26d2e4b9fd92.html @@ -0,0 +1 @@ +

        The most prominent distinction between LLaMA-1 and LLaMA-2 lies in the incorporation of higher-quality corpora, a pivotal factor contributing to significant performance enhancements in LLaMA-2. This, coupled with its commercial availability, extends the potential for creative applications of large models within the open-source community.

        diff --git a/community_blog/datathon-2025.html b/community_blog/datathon-2025.html new file mode 100644 index 000000000000..582d28a09270 --- /dev/null +++ b/community_blog/datathon-2025.html @@ -0,0 +1 @@ +

We’re excited to have PyTorch sponsor Datathon 2025: DataOrbit, a place where students can collaborate with a team to solve problems using real-world datasets! This event, hosted by Data Science UCSB in collaboration with Gaucho Sports Analytics and ACM@UCSB, will take place on February 22–23, 2025 at UC Santa Barbara, with the incredible opportunity to present your project to a panel of corporate and faculty judges – including the executive director of PyTorch! – for a chance to win prizes of up to $3,000.

        diff --git a/community_blog/distributed-training-with-pytorch-and-azure-ml-898429139098.html b/community_blog/distributed-training-with-pytorch-and-azure-ml-898429139098.html new file mode 100644 index 000000000000..e42bac8a407d --- /dev/null +++ b/community_blog/distributed-training-with-pytorch-and-azure-ml-898429139098.html @@ -0,0 +1 @@ +

        Suppose you have a very large PyTorch model, and you’ve already tried many common tricks to speed up training: you optimized your code, you moved training to the cloud and selected a fast GPU VM, you installed software packages that improve training performance (for example, by using the ACPT curated environment on Azure ML). And yet, you still wish your model could train faster. Maybe it’s time to give distributed training a try! Continue reading to learn the simplest way to do distributed training with PyTorch and Azure ML.
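A minimal DistributedDataParallel sketch of the kind such an Azure ML job might launch with `torchrun --nproc_per_node=<gpus> train.py`; the model, data, and hyperparameters are placeholders rather than the article's example.

```python
# Minimal multi-GPU DDP training loop; torchrun sets the rank/world-size env vars.
import torch
import torch.distributed as dist
from torch.nn.parallel import DistributedDataParallel as DDP


def main():
    dist.init_process_group("nccl")
    rank = dist.get_rank()
    device = rank % torch.cuda.device_count()

    model = DDP(torch.nn.Linear(20, 1).to(device), device_ids=[device])
    opt = torch.optim.SGD(model.parameters(), lr=1e-2)
    loss_fn = torch.nn.MSELoss()

    for step in range(100):
        x = torch.randn(32, 20, device=device)
        y = torch.randn(32, 1, device=device)
        opt.zero_grad()
        loss_fn(model(x), y).backward()   # DDP synchronizes gradients across workers
        opt.step()

    dist.destroy_process_group()


if __name__ == "__main__":
    main()
```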

        diff --git a/community_blog/doctr-joins-pytorch-ecosystem.html b/community_blog/doctr-joins-pytorch-ecosystem.html new file mode 100644 index 000000000000..25b5830a11a4 --- /dev/null +++ b/community_blog/doctr-joins-pytorch-ecosystem.html @@ -0,0 +1 @@ +

        We’re thrilled to announce that the docTR project has been integrated into the PyTorch ecosystem! This integration ensures that docTR aligns with PyTorch’s standards and practices, giving developers a reliable, community-backed solution for powerful OCR workflows.
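For readers new to docTR, a quickstart-style sketch is shown below; it assumes the default pretrained detection and recognition models, and the image path is a placeholder.

```python
# End-to-end OCR with docTR's predictor; the file name is a placeholder.
from doctr.io import DocumentFile
from doctr.models import ocr_predictor

model = ocr_predictor(pretrained=True)            # detection + recognition pipeline
doc = DocumentFile.from_images(["invoice.jpg"])   # placeholder image path
result = model(doc)

print(result.render())   # plain-text export of the recognized document
```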

        diff --git a/community_blog/enhancing-deep-learning.html b/community_blog/enhancing-deep-learning.html new file mode 100644 index 000000000000..5d90ad45ab17 --- /dev/null +++ b/community_blog/enhancing-deep-learning.html @@ -0,0 +1 @@ +

        Welcome to the thriving PyTorch ecosystem, where a wealth of tools and libraries await, purpose-built to elevate your experience in deep learning as a developer or researcher. The Ecosystem Tools pages host many projects from experts spanning academia, industry, application development, and machine learning.

        diff --git a/community_blog/exploring-scientific-machine-learning-pipelines-through-the-simulai-toolkit-9fda42d6c6a0.html b/community_blog/exploring-scientific-machine-learning-pipelines-through-the-simulai-toolkit-9fda42d6c6a0.html new file mode 100644 index 000000000000..0afbab8d074e --- /dev/null +++ b/community_blog/exploring-scientific-machine-learning-pipelines-through-the-simulai-toolkit-9fda42d6c6a0.html @@ -0,0 +1 @@ +

        SciML, short for Scientific Machine Learning, encompasses work that merges quantitative sciences with machine learning. It has gained significant traction over the past decade, driven by the widespread availability of specialized hardware (such as GPUs and TPUs) and datasets. Additionally, it has been propelled by the overarching influence of the machine learning wave, now ingrained in the zeitgeist of our times. In this context, we’d like to introduce SimulAI, an open-source toolkit under the Apache 2.0 license. SimulAI is designed to be user-friendly, providing a high-level Python interface for managing scientific machine learning pipelines. This article aims to showcase its current workflow and utility in constructing scientific experiments. We encourage feedback and potential contributions from the interested community, with plans to delve into more advanced topics in future articles.

        diff --git a/community_blog/how-activation-checkpointing-enables-scaling-up-training-deep-learning-models-7a93ae01ff2d.html b/community_blog/how-activation-checkpointing-enables-scaling-up-training-deep-learning-models-7a93ae01ff2d.html new file mode 100644 index 000000000000..d06738e1ba72 --- /dev/null +++ b/community_blog/how-activation-checkpointing-enables-scaling-up-training-deep-learning-models-7a93ae01ff2d.html @@ -0,0 +1 @@ +

        Activation checkpointing is a technique used for reducing the memory footprint at the cost of more compute. It utilizes the simple observation that we can avoid saving intermediate tensors necessary for backward computation if we just recompute them on demand instead.
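A small sketch of that trade-off with torch.utils.checkpoint: the checkpointed block's intermediate activations are not stored during the forward pass and are recomputed during backward. The toy model below is illustrative only.

```python
import torch
from torch.utils.checkpoint import checkpoint

block = torch.nn.Sequential(
    torch.nn.Linear(1024, 4096), torch.nn.ReLU(),
    torch.nn.Linear(4096, 1024), torch.nn.ReLU(),
)
head = torch.nn.Linear(1024, 10)

x = torch.randn(64, 1024, requires_grad=True)
hidden = checkpoint(block, x, use_reentrant=False)  # saves memory; recomputes in backward
loss = head(hidden).sum()
loss.backward()                                     # the block's forward runs again here
```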

        diff --git a/community_blog/how-fashable-achieves-soa-realistic-ai-generated-images-using-pytorch-and-azure-machine-learning-2313c4cf5f44.html b/community_blog/how-fashable-achieves-soa-realistic-ai-generated-images-using-pytorch-and-azure-machine-learning-2313c4cf5f44.html new file mode 100644 index 000000000000..635264cf35fb --- /dev/null +++ b/community_blog/how-fashable-achieves-soa-realistic-ai-generated-images-using-pytorch-and-azure-machine-learning-2313c4cf5f44.html @@ -0,0 +1,2 @@ +

Fashable is a company born at XNFY Lab (a joint initiative with Microsoft). The company’s main goal is to revolutionize the world of fashion with ethical Artificial Intelligence (AI) technologies built on the PyTorch framework. Fashable is focused on developing AI models that generate synthetic content for the global fashion industry. The fashion industry has been criticized in recent years because it generates a lot of waste and is responsible for up to 10% of global carbon dioxide output. Fashable has stepped up to address this issue by introducing multiple AI solutions that generate realistic, personalized consumer garments without actually producing them. This helps fashion brands make informed decisions without investing in experimental products, and it reduces the industry’s carbon footprint globally. To solve these problems, Fashable’s IP models use modern approaches such as Generative Adversarial Networks (GANs), best-seller analysis, and custom dataset creation.

        + diff --git a/community_blog/introducing-depyf.html b/community_blog/introducing-depyf.html new file mode 100644 index 000000000000..8d56edceed89 --- /dev/null +++ b/community_blog/introducing-depyf.html @@ -0,0 +1 @@ +

        We are thrilled to introduce depyf, a new project to the PyTorch ecosystem designed to help users understand, learn, and adapt to torch.compile!

        diff --git a/community_blog/introducing-torchopt-a-high-performance-differentiable-optimization-library-for-pytorch-37c4c0ef6ae1.html b/community_blog/introducing-torchopt-a-high-performance-differentiable-optimization-library-for-pytorch-37c4c0ef6ae1.html new file mode 100644 index 000000000000..e239d2b0c617 --- /dev/null +++ b/community_blog/introducing-torchopt-a-high-performance-differentiable-optimization-library-for-pytorch-37c4c0ef6ae1.html @@ -0,0 +1 @@ +

Explore TorchOpt, a PyTorch-based library that revolutionizes differentiable optimization with its unified programming abstraction, high-performance distributed execution runtime, and support for various differentiation modes.

        diff --git a/community_blog/latest-colossal-ai-boasts-novel-automatic-parallelism-and-offers-savings-up-to-46x-for-stable-1453b48f3f02.html b/community_blog/latest-colossal-ai-boasts-novel-automatic-parallelism-and-offers-savings-up-to-46x-for-stable-1453b48f3f02.html new file mode 100644 index 000000000000..1d4f1a9b5175 --- /dev/null +++ b/community_blog/latest-colossal-ai-boasts-novel-automatic-parallelism-and-offers-savings-up-to-46x-for-stable-1453b48f3f02.html @@ -0,0 +1 @@ +

        As a new PyTorch Ecosystem Partner, we at HPC-AI Tech look forward to working with the PyTorch community to advance AI technologies through our open source project, Colossal-AI. We are excited to join forces with the PyTorch community in this effort.

        diff --git a/community_blog/mlops-workflow.html b/community_blog/mlops-workflow.html new file mode 100644 index 000000000000..1e9e964ef6a1 --- /dev/null +++ b/community_blog/mlops-workflow.html @@ -0,0 +1 @@ +

        PyTorch is one of the most widely used and most powerful deep learning frameworks for training and deploying complex neural networks. It has never been easier to train and deploy AI applications, and low-cost, high-performance, energy-efficient hardware, tools, and technology for creating optimized workflows are more accessible than ever. But data science, machine learning, and devops can be deep topics unto themselves, and it can be overwhelming for developers with one specialty to see how they all come together in the real world, or even to know where to get started.

        diff --git a/community_blog/optimize-llms.html b/community_blog/optimize-llms.html new file mode 100644 index 000000000000..b7eb36968f4d --- /dev/null +++ b/community_blog/optimize-llms.html @@ -0,0 +1 @@ +

        The rapid growth of large language model (LLM) applications is linked to rapid growth in energy demand. According to the International Energy Agency (IEA), data center electricity consumption is projected to roughly double by 2026 primarily driven by AI. This is due to the energy-intensive training requirements for massive LLMs – however, the increase in AI Inferencing workloads also plays a role. For example, compared with traditional search queries, a single AI inference can consume about 10x more energy.

        diff --git a/community_blog/profiling-pytorch-language-models-with-octoml-profile-eda7ece6b7bd.html b/community_blog/profiling-pytorch-language-models-with-octoml-profile-eda7ece6b7bd.html new file mode 100644 index 000000000000..d5a6630672d2 --- /dev/null +++ b/community_blog/profiling-pytorch-language-models-with-octoml-profile-eda7ece6b7bd.html @@ -0,0 +1 @@ +

        The recent launch of PyTorch 2.0 makes it clear that the community is heavily investing in a compiler-powered future for machine learning. The new OctoML Profiler can help any user realize the full potential of these shifts in the ML landscape.

        diff --git a/community_blog/pt-fedora-os-communities.html b/community_blog/pt-fedora-os-communities.html new file mode 100644 index 000000000000..de170ab8cc3b --- /dev/null +++ b/community_blog/pt-fedora-os-communities.html @@ -0,0 +1,2 @@ +

        At DevConf.IN 2025 in Pune, I had the opportunity to host a PyTorch Meetup on February 28th. The session, titled “Powering AI with PyTorch, Fedora, and Open Source Communities” was aimed at introducing PyTorch to students and professionals, explaining why PyTorch+Fedora form an ideal AI development platform. The other key aspect I covered was collaboration between open source communities.

        + diff --git a/community_blog/pypose-a-library-for-robot-learning-with-physics-based-optimization-861bc0bb92f1.html b/community_blog/pypose-a-library-for-robot-learning-with-physics-based-optimization-861bc0bb92f1.html new file mode 100644 index 000000000000..7c05468c0c4a --- /dev/null +++ b/community_blog/pypose-a-library-for-robot-learning-with-physics-based-optimization-861bc0bb92f1.html @@ -0,0 +1 @@ +

        We are excited to share our new open-source library PyPose. It is a PyTorch-based robotics-oriented library that provides a set of tools and algorithms for connecting deep learning with physics-based optimization.

        diff --git a/community_blog/pytorch-at-gtc.html b/community_blog/pytorch-at-gtc.html new file mode 100644 index 000000000000..3d1786d203a1 --- /dev/null +++ b/community_blog/pytorch-at-gtc.html @@ -0,0 +1 @@ +

        GTC is coming back to San Jose on March 17–21, 2025. Join PyTorch Foundation members Arm, AWS, Google Cloud, IBM, Lightning AI, Meta, Microsoft Azure, Snowflake, and thousands of developers as we celebrate PyTorch. Together learn how AI & accelerated computing are helping humanity solve our most complex challenges.

diff --git a/community_blog/pytorch-shanghai-notes.html b/community_blog/pytorch-shanghai-notes.html new file mode 100644 index 000000000000..a8e0b3ecef39 --- /dev/null +++ b/community_blog/pytorch-shanghai-notes.html @@ -0,0 +1,679 @@ + PyTorch Shanghai Meetup Notes | PyTorch

        September 08, 2024

PyTorch Shanghai Meetup Notes

by Team PyTorch


We were honored to host the PyTorch Shanghai Meetup on August 15, 2024. The Meetup received great attention from the industry: we invited senior PyTorch developers from Intel and Huawei as guest speakers, who shared their valuable experience and the latest technical trends, and the event also attracted PyTorch enthusiasts from many technology companies and well-known universities. In total, more than 40 participants gathered to discuss and exchange the latest applications and technological advances of PyTorch.


        This Meetup not only strengthened the connection between PyTorch community members, but also provided a platform for local AI technology enthusiasts to learn, communicate and grow. We look forward to the next gathering to continue to promote the development of PyTorch technology in the local area.


        1. PyTorch Foundation Updates


PyTorch Board member Fred Li shared the latest updates from the PyTorch community. He reviewed the community’s development history, explained in detail the growth path of community developers, encouraged everyone to delve deeper into technology, and introduced the upcoming PyTorch Conference 2024.


2. Intel’s Journey with PyTorch: Democratizing AI with Ubiquitous Hardware and Open Software


PyTorch CPU module maintainer Jiong Gong shared six years of technical contributions from Intel to PyTorch and its ecosystem, and explored the remarkable advancements Intel has made in both software and hardware to democratize AI, ensuring accessibility and optimizing performance across a diverse range of Intel hardware platforms.


        3. Exploring Multi-Backend Support in PyTorch Ecosystem: A Case Study of Ascend


        Fengchun Hua, a PyTorch contributor from Huawei, took Huawei Ascend NPU as an example to demonstrate the latest achievements in multi-backend support for PyTorch applications. He introduced the hardware features of Huawei Ascend NPU and the infrastructure of CANN (Compute Architecture for Neural Networks), and explained the key achievements and innovations in native support work. He also shared the current challenges and the next work plan.


        Yuanhao Ji, another PyTorch contributor from Huawei, then introduced the Autoload Device Extension proposal, explained its implementation details and value in improving the scalability of PyTorch, and introduced the latest work progress of the PyTorch Chinese community.


        4. Intel XPU Backend for Inductor


Eikan is a PyTorch contributor from Intel who focuses on the torch.compile stack for both Intel CPU and GPU. In this session, Eikan presented Intel’s efforts on torch.compile for Intel GPUs. He provided updates on the current status of Intel GPUs within PyTorch, covering both functionality and performance. Additionally, Eikan used the Intel GPU as a case study to demonstrate how to integrate a new backend into Inductor using Triton.


        5. PyTorch PrivateUse1 Evolution Approaches and Insights


Jiawei Li, a PyTorch collaborator from Huawei, introduced PyTorch’s Dispatch mechanism and emphasized the limitations of DispatchKey. Taking the Huawei Ascend NPU as an example, he shared best practices for the PyTorch PrivateUse1 mechanism, noting that while adopting it, Huawei has also submitted many improvements and bug fixes for the mechanism to the PyTorch community. He added that, due to the lack of upstream CI support for out-of-tree devices, changes in upstream code may affect their stability and quality, an insight that resonated with the audience.

diff --git a/community_blog/sglang-joins-pytorch.html b/community_blog/sglang-joins-pytorch.html new file mode 100644 index 000000000000..f12a8fe637e4 --- /dev/null +++ b/community_blog/sglang-joins-pytorch.html @@ -0,0 +1 @@ +

        We’re thrilled to announce that the SGLang project has been integrated into the PyTorch ecosystem! This integration ensures that SGLang aligns with PyTorch’s standards and practices, providing developers with a reliable and community-supported framework for fast and flexible serving of LLMs.

        diff --git a/community_blog/torch-compile-explained-ae0def293084.html b/community_blog/torch-compile-explained-ae0def293084.html new file mode 100644 index 000000000000..b3268ed0be8d --- /dev/null +++ b/community_blog/torch-compile-explained-ae0def293084.html @@ -0,0 +1 @@ +

        Have you ever felt overwhelmed by the complexities of torch.compile? Diving into its workings can feel like black magic, with bytecode and Python internal details that many users fail to understand, hindering them from understanding and debugging torch.compile.
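For reference, here is baseline usage of the API the post demystifies: torch.compile captures the function's Python bytecode, traces it into graphs, and hands them to a backend (Inductor by default) to generate faster kernels. The function below is an arbitrary example.

```python
import torch

def gelu_ish(x):
    return 0.5 * x * (1.0 + torch.tanh(0.79788456 * (x + 0.044715 * x ** 3)))

compiled = torch.compile(gelu_ish)          # first call triggers compilation
x = torch.randn(1024, 1024)
torch.testing.assert_close(compiled(x), gelu_ish(x))   # same results, faster steady state
```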

        diff --git a/community_blog/torchdistill-a-modular-configuration-driven-framework-for-reproducible-deep-learning-and-9e0ecabf2815.html b/community_blog/torchdistill-a-modular-configuration-driven-framework-for-reproducible-deep-learning-and-9e0ecabf2815.html new file mode 100644 index 000000000000..85cdacb8e4cf --- /dev/null +++ b/community_blog/torchdistill-a-modular-configuration-driven-framework-for-reproducible-deep-learning-and-9e0ecabf2815.html @@ -0,0 +1 @@ +

        This article summarizes key features and concepts of torchdistill (v1.0.0). Refer to the official documentation for its APIs and research projects.

        diff --git a/community_blog/unveiling-the-power-of-semi-supervised-learning-the-unified-semi-supervised-learning-benchmark-849f42bbc32a.html b/community_blog/unveiling-the-power-of-semi-supervised-learning-the-unified-semi-supervised-learning-benchmark-849f42bbc32a.html new file mode 100644 index 000000000000..0b061be998ea --- /dev/null +++ b/community_blog/unveiling-the-power-of-semi-supervised-learning-the-unified-semi-supervised-learning-benchmark-849f42bbc32a.html @@ -0,0 +1 @@ +

        Machine Learning models thrive on high-quality, fully-annotated data. The traditional supervised learning approach typically requires data on the scale of millions, or even billions, to train large foundational models. However, obtaining such a vast amount of labeled data is often tedious and labor-intensive. As an alternative, semi-supervised learning (SSL) aims to enhance model generalization with only a fraction of labeled data, complemented by a considerable amount of unlabeled data. This blog introduces USB — the Unified Semi-Supervised Learning Framework and Benchmark, covering multi-modalities and various SSL scenarios.

        diff --git a/community_blog/vllm-joins-pytorch.html b/community_blog/vllm-joins-pytorch.html new file mode 100644 index 000000000000..92b038ca93e1 --- /dev/null +++ b/community_blog/vllm-joins-pytorch.html @@ -0,0 +1,3 @@ +

        We’re thrilled to announce that the vLLM project has officially joined the PyTorch ecosystem family!


        Running large language models (LLMs) is both resource-intensive and complex, especially as these models scale to hundreds of billions of parameters. That’s where vLLM comes in — a high-throughput, memory-efficient inference and serving engine designed for LLMs.
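        As a rough illustration (a minimal sketch following vLLM's offline-inference quickstart; the model name is only an example), generating from a prompt looks like:

```python
from vllm import LLM, SamplingParams

llm = LLM(model="facebook/opt-125m")  # example model; any supported Hugging Face causal LM works
params = SamplingParams(temperature=0.8, max_tokens=64)

outputs = llm.generate(["PyTorch is"], params)
print(outputs[0].outputs[0].text)
```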

        diff --git a/community_blog/zeus.html b/community_blog/zeus.html new file mode 100644 index 000000000000..48ceabdbdea7 --- /dev/null +++ b/community_blog/zeus.html @@ -0,0 +1 @@ +

        Zeus is an open-source toolbox for measuring and optimizing the energy consumption of deep learning workloads. Our goal is to make energy optimization based on accurate measurements as easy as possible for diverse deep learning workloads and setups by offering composable tools with minimal assumptions.
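        To give a flavor of the workflow, here is a hedged sketch of Zeus's measurement-window API; the exact class, method, and attribute names below are assumptions from memory, so verify them against the Zeus documentation.

```python
from zeus.monitor import ZeusMonitor  # assumed import path; check the Zeus docs

monitor = ZeusMonitor(gpu_indices=[0])        # measure GPU 0
monitor.begin_window("training")
# ... run a training step or epoch here ...
measurement = monitor.end_window("training")
print(measurement.time, measurement.total_energy)  # seconds and joules (assumed attribute names)
```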

        diff --git a/contact-us.html b/contact-us.html index 6decac66c3eb..06925803eac4 100644 --- a/contact-us.html +++ b/contact-us.html @@ -1,12 +1,310 @@ ---- -layout: default -title: Contact Us -body-class: announcement -background-class: announcement-background -permalink: /contact-us ---- - -

        Contact Us

        @@ -36,3 +334,306 @@

        Get in Touch

        + + + + + + + + + + + + + + + + + + + + + + + diff --git a/credits-sponsor.html b/credits-sponsor.html deleted file mode 100644 index c79d50ac7d5d..000000000000 --- a/credits-sponsor.html +++ /dev/null @@ -1,36 +0,0 @@ ---- -layout: default -title: Sponsor Cloud Credits -body-class: announcement -background-class: announcement-background -permalink: /credits/sponsor ---- -{% assign cards = site.board_info %} - -
        PyTorch Cloud Credit Program

        Sponsor cloud credits and support PyTorch. Please fill in the form and we will be in touch.
        diff --git a/credits.html b/credits.html index d4e3dc24f111..a0fa4bf72d4b 100644 --- a/credits.html +++ b/credits.html @@ -1,11 +1,310 @@ ---- -layout: default -title: PyTorch Cloud Credit Program -body-class: announcement -background-class: announcement-background -permalink: /credits ---- -{% assign cards = site.board_info %} + + + + + + + + + + + + + PyTorch Cloud Credit Program | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
        @@ -203,3 +502,306 @@

        Projects benefit from your donation

        + + + + + + + + + + + + + + + + + + + + + + + diff --git a/credits/sponsor.html b/credits/sponsor.html new file mode 100644 index 000000000000..516c548caced --- /dev/null +++ b/credits/sponsor.html @@ -0,0 +1,638 @@ + + + + + + + + + + + + + Sponsor Cloud Credits | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
        PyTorch Cloud Credit Program

        Sponsor cloud credits and support PyTorch. Please fill in the form and we will be in touch.
        + + + + + + + + + + + + + + + + + + + + + + + diff --git a/ecosystem-github-stars.json b/ecosystem-github-stars.json index 719211d02384..ee842a222dd9 100644 --- a/ecosystem-github-stars.json +++ b/ecosystem-github-stars.json @@ -1,13 +1,525 @@ ---- ---- - { "data": [ - {% for item in site.ecosystem %} + { - "id": "{{ item.github-id }}" + "id": "pytorch/captum" } - {% if forloop.last != true %},{% endif %} - {% endfor %} + , + + { + "id": "flairNLP/flair" + } + , + + { + "id": "asyml/forte" + } + , + + { + "id": "pytorch/ignite" + } + , + + { + "id": "open-mmlab" + } + , + + { + "id": "huggingface/accelerate" + } + , + + { + "id": "petuum/adaptdl" + } + , + + { + "id": "BorealisAI/advertorch" + } + , + + { + "id": "albumentations-team/albumentations" + } + , + + { + "id": "allenai/allennlp" + } + , + + { + "id": "ContinualAI/avalanche" + } + , + + { + "id": "ElementAI/baal" + } + , + + { + "id": "pytorch/botorch" + } + , + + { + "id": "catalyst-team/catalyst" + } + , + + { + "id": "aramis-lab/AD-DL" + } + , + + { + "id": "hpcaitech/ColossalAI" + } + , + + { + "id": "hpcaitech/ColossalAI" + } + , + + { + "id": "mosaicml/composer" + } + , + + { + "id": "facebookresearch/CrypTen" + } + , + + { + "id": "microsoft/DeepSpeed" + } + , + + { + "id": "thuml/depyf" + } + , + + { + "id": "facebookresearch/detectron2" + } + , + + { + "id": "determined-ai/determined" + } + , + + { + "id": "dmlc/dgl" + } + , + + { + "id": "huggingface/diffusers" + } + , + + { + "id": "mindee/doctr" + } + , + + { + "id": "arogozhnikov/einops" + } + , + + { + "id": "TorchEnsemble-Community/Ensemble-Pytorch" + } + , + + { + "id": "facebookresearch/fairscale" + } + , + + { + "id": "fastai/fastai" + } + , + + { + "id": "adap/flower" + } + , + + { + "id": "BiomedSciAI/fuse-med-ml" + } + , + + { + "id": "mlcommons/GaNDLF" + } + , + + { + "id": "pytorch/glow" + } + , + + { + "id": "cornellius-gp/gpytorch" + } + , + + { + "id": "facebookresearch/higher" + } + , + + { + "id": "horovod/horovod" + } + , + + { + "id": "microsoft/hummingbird" + } + , + + { + "id": "facebookresearch/hydra" + } + , + + { + "id": "intel/neural-compressor" + } + , + + { + "id": "intel/intel-extension-for-pytorch" + } + , + + { + "id": "unifyai/ivy" + } + , + + { + "id": "joeynmt/joeynmt" + } + , + + { + "id": "kornia/kornia" + } + , + + { + "id": "lyft/l5kit" + } + , + + { + "id": "lightly-ai/lightly" + } + , + + { + "id": "ludwig-ai/ludwig" + } + , + + { + "id": "facebookresearch/mmf" + } + , + + { + "id": "Project-MONAI/MONAI" + } + , + + { + "id": "NVIDIA/NeMo" + } + , + + { + "id": "octoml/octoml-profile" + } + , + + { + "id": "microsoft/onnxruntime" + } + , + + { + "id": "pytorch/opacus" + } + , + + { + "id": "open-compass/opencompass" + } + , + + { + "id": "optuna/optuna" + } + , + + { + "id": "lf1-io/padl" + } + , + + { + "id": "facebookresearch/ParlAI" + } + , + + { + "id": "PennyLaneAI/pennylane" + } + , + + { + "id": "pfnet/pfrl" + } + , + + { + "id": "polyaxon/polyaxon" + } + , + + { + "id": "jmschrei/pomegranate" + } + , + + { + "id": "graphcore/poptorch" + } + , + + { + "id": "GRAAL-Research/poutyne" + } + , + + { + "id": "pykale/pykale" + } + , + + { + "id": "pypose/pypose" + } + , + + { + "id": "WenjieDu/PyPOTS" + } + , + + { + "id": "pyro-ppl/pyro" + } + , + + { + "id": "pystiche/pystiche" + } + , + + { + "id": "OpenMined/PySyft" + } + , + + { + "id": "pyg-team/pytorch_geometric" + } + , + + { + "id": "PyTorchLightning/pytorch-lightning" + } + , + + { + "id": "KevinMusgrave/pytorch-metric-learning" + } + , + + { + "id": 
"PetrochukM/PyTorch-NLP" + } + , + + { + "id": "facebookresearch/pytorch3d" + } + , + + { + "id": "benedekrozemberczki/pytorch_geometric_temporal" + } + , + + { + "id": "pytorchfi/pytorchfi" + } + , + + { + "id": "facebookresearch/pytorchvideo" + } + , + + { + "id": "azavea/raster-vision" + } + , + + { + "id": "ray-project/ray" + } + , + + { + "id": "awslabs/renate" + } + , + + { + "id": "" + } + , + + { + "id": "IBM/simulai" + } + , + + { + "id": "skorch-dev/skorch" + } + , + + { + "id": "DLR-RM/stable-baselines3" + } + , + + { + "id": "fidelity/stoke" + } + , + + { + "id": "substra" + } + , + + { + "id": "tensorly/tensorly" + } + , + + { + "id": "airaria/TextBrewer" + } + , + + { + "id": "TissueImageAnalytics/tiatoolbox" + } + , + + { + "id": "yoshitomo-matsubara/torchdistill" + } + , + + { + "id": "TorchDrift/TorchDrift" + } + , + + { + "id": "DeepGraphLearning/torchdrug" + } + , + + { + "id": "microsoft/torchgeo" + } + , + + { + "id": "fepegar/torchio" + } + , + + { + "id": "PyTorchLightning/metrics" + } + , + + { + "id": "metaopt/TorchOpt" + } + , + + { + "id": "nicolas-chaulet/torch-points3d" + } + , + + { + "id": "mit-han-lab/torchquantum" + } + , + + { + "id": "allegroai/clearml" + } + , + + { + "id": "huggingface/transformers" + } + , + + { + "id": "NVIDIA/Torch-TensorRT" + } + , + + { + "id": "microsoft/Semi-supervised-learning" + } + , + + { + "id": "facebookresearch/vissl" + } + , + + { + "id": "vllm-project/vllm" + } + + ] } diff --git a/ecosystem/Captum/index.html b/ecosystem/Captum/index.html new file mode 100644 index 000000000000..31040df11839 --- /dev/null +++ b/ecosystem/Captum/index.html @@ -0,0 +1,11 @@ + + + + Redirecting… + + + + +

        Redirecting…

        + Click here if you are not redirected. + diff --git a/ecosystem/Flair/index.html b/ecosystem/Flair/index.html new file mode 100644 index 000000000000..9f5fdc549820 --- /dev/null +++ b/ecosystem/Flair/index.html @@ -0,0 +1,11 @@ + + + + Redirecting… + + + + +

        Redirecting…

        + Click here if you are not redirected. + diff --git a/ecosystem/Forte/index.html b/ecosystem/Forte/index.html new file mode 100644 index 000000000000..c088f3ae3845 --- /dev/null +++ b/ecosystem/Forte/index.html @@ -0,0 +1,718 @@ + + + + + + + + + + + + + forte | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
        forte

        Forte is a toolkit for building NLP pipelines featuring composable components, convenient data interfaces, and cross-task interaction.
        + + + + + + + + + + + + + + + + + + + + + + + diff --git a/ecosystem/Ignite/index.html b/ecosystem/Ignite/index.html new file mode 100644 index 000000000000..8563e096fe40 --- /dev/null +++ b/ecosystem/Ignite/index.html @@ -0,0 +1,11 @@ + + + + Redirecting… + + + + +

        Redirecting…

        + Click here if you are not redirected. + diff --git a/ecosystem/OpenMMLab/index.html b/ecosystem/OpenMMLab/index.html new file mode 100644 index 000000000000..3a8d7297773b --- /dev/null +++ b/ecosystem/OpenMMLab/index.html @@ -0,0 +1,718 @@ + + + + + + + + + + + + + OpenMMLab | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
        OpenMMLab

        OpenMMLab covers a wide range of computer vision research topics including classification, detection, segmentation, and super-resolution.
        + + + + + + + + + + + + + + + + + + + + + + + diff --git a/ecosystem/accelerate/index.html b/ecosystem/accelerate/index.html new file mode 100644 index 000000000000..71d1b84229f8 --- /dev/null +++ b/ecosystem/accelerate/index.html @@ -0,0 +1,718 @@ + + + + + + + + + + + + + accelerate | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
        accelerate

        🚀 A simple way to train and use PyTorch models with multi-GPU, TPU, mixed-precision
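        A minimal sketch of the idea (assuming the standard Accelerator workflow from the accelerate docs): you prepare the model, optimizer, and dataloader once, and the same script then runs on CPU, a single GPU, multiple GPUs, or a TPU.

```python
import torch
from torch.utils.data import DataLoader, TensorDataset
from accelerate import Accelerator

accelerator = Accelerator()  # picks up device and precision configuration automatically

model = torch.nn.Linear(10, 2)
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)
loader = DataLoader(TensorDataset(torch.randn(64, 10), torch.randint(0, 2, (64,))), batch_size=8)

model, optimizer, loader = accelerator.prepare(model, optimizer, loader)
for x, y in loader:
    optimizer.zero_grad()
    loss = torch.nn.functional.cross_entropy(model(x), y)
    accelerator.backward(loss)  # replaces loss.backward()
    optimizer.step()
```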
        + + + + + + + + + + + + + + + + + + + + + + + diff --git a/ecosystem/adaptdl/index.html b/ecosystem/adaptdl/index.html new file mode 100644 index 000000000000..9aa514b41071 --- /dev/null +++ b/ecosystem/adaptdl/index.html @@ -0,0 +1,718 @@ + + + + + + + + + + + + + AdaptDL | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
        AdaptDL

        AdaptDL is a resource-adaptive deep learning training and scheduling framework.
        + + + + + + + + + + + + + + + + + + + + + + + diff --git a/ecosystem/advertorch/index.html b/ecosystem/advertorch/index.html new file mode 100644 index 000000000000..8a53f2c06731 --- /dev/null +++ b/ecosystem/advertorch/index.html @@ -0,0 +1,11 @@ + + + + Redirecting… + + + + +

        Redirecting…

        + Click here if you are not redirected. + diff --git a/ecosystem/albumentations/index.html b/ecosystem/albumentations/index.html new file mode 100644 index 000000000000..1f4c0a495aaa --- /dev/null +++ b/ecosystem/albumentations/index.html @@ -0,0 +1,718 @@ + + + + + + + + + + + + + Albumentations | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
        Albumentations

        Fast and extensible image augmentation library for different CV tasks like classification, segmentation, object detection and pose estimation.
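        For illustration, a minimal augmentation pipeline (the probabilities are arbitrary and the random array stands in for a real image):

```python
import numpy as np
import albumentations as A

transform = A.Compose([
    A.HorizontalFlip(p=0.5),
    A.RandomBrightnessContrast(p=0.2),
])

image = np.random.randint(0, 256, (224, 224, 3), dtype=np.uint8)  # placeholder image
augmented = transform(image=image)["image"]
print(augmented.shape)
```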
        + + + + + + + + + + + + + + + + + + + + + + + diff --git a/ecosystem/allennlp/index.html b/ecosystem/allennlp/index.html new file mode 100644 index 000000000000..6c6822d4f35f --- /dev/null +++ b/ecosystem/allennlp/index.html @@ -0,0 +1,11 @@ + + + + Redirecting… + + + + +

        Redirecting…

        + Click here if you are not redirected. + diff --git a/ecosystem/avalanche/index.html b/ecosystem/avalanche/index.html new file mode 100644 index 000000000000..4e328006482d --- /dev/null +++ b/ecosystem/avalanche/index.html @@ -0,0 +1,718 @@ + + + + + + + + + + + + + avalanche | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
        avalanche

        Avalanche: an End-to-End Library for Continual Learning
        + + + + + + + + + + + + + + + + + + + + + + + diff --git a/ecosystem/baal/index.html b/ecosystem/baal/index.html new file mode 100644 index 000000000000..2fc2af7b51b0 --- /dev/null +++ b/ecosystem/baal/index.html @@ -0,0 +1,718 @@ + + + + + + + + + + + + + baal | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
        baal

        baal (Bayesian active learning) aims to implement active learning using metrics of uncertainty derived from approximations of Bayesian posteriors in neural networks.
        + + + + + + + + + + + + + + + + + + + + + + + diff --git a/ecosystem/botorch/index.html b/ecosystem/botorch/index.html new file mode 100644 index 000000000000..c527581fae6a --- /dev/null +++ b/ecosystem/botorch/index.html @@ -0,0 +1,11 @@ + + + + Redirecting… + + + + +

        Redirecting…

        + Click here if you are not redirected. + diff --git a/ecosystem/catalyst/index.html b/ecosystem/catalyst/index.html new file mode 100644 index 000000000000..579d4d9c96fd --- /dev/null +++ b/ecosystem/catalyst/index.html @@ -0,0 +1,718 @@ + + + + + + + + + + + + + Catalyst | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
        Catalyst

        Catalyst helps you write compact, but full-featured deep learning and reinforcement learning pipelines with a few lines of code.
        + + + + + + + + + + + + + + + + + + + + + + + diff --git a/ecosystem/clinicadl/index.html b/ecosystem/clinicadl/index.html new file mode 100644 index 000000000000..80d071bfd11c --- /dev/null +++ b/ecosystem/clinicadl/index.html @@ -0,0 +1,718 @@ + + + + + + + + + + + + + ClinicaDL | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
        ClinicaDL

        Framework for reproducible classification of Alzheimer's Disease
        + + + + + + + + + + + + + + + + + + + + + + + diff --git a/ecosystem/colossal-llama-2/index.html b/ecosystem/colossal-llama-2/index.html new file mode 100644 index 000000000000..a6e4d6aa327c --- /dev/null +++ b/ecosystem/colossal-llama-2/index.html @@ -0,0 +1,718 @@ + + + + + + + + + + + + + Colossal-LLaMA-2 | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
        Colossal-LLaMA-2

        A complete, open-source solution for injecting domain-specific knowledge into pre-trained LLMs.
        + + + + + + + + + + + + + + + + + + + + + + + diff --git a/ecosystem/colossal/index.html b/ecosystem/colossal/index.html new file mode 100644 index 000000000000..22bf0d86b87e --- /dev/null +++ b/ecosystem/colossal/index.html @@ -0,0 +1,718 @@ + + + + + + + + + + + + + ColossalAI | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
        ColossalAI

        Colossal-AI is a unified deep learning system for the big model era.
        + + + + + + + + + + + + + + + + + + + + + + + diff --git a/ecosystem/composer/index.html b/ecosystem/composer/index.html new file mode 100644 index 000000000000..d007d9cb899f --- /dev/null +++ b/ecosystem/composer/index.html @@ -0,0 +1,718 @@ + + + + + + + + + + + + + composer | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
        composer

        A library of algorithms to speed up neural network training.
        + + + + + + + + + + + + + + + + + + + + + + + diff --git a/ecosystem/contributor-awards-2023.html b/ecosystem/contributor-awards-2023.html index c08a831ef719..c7c141272ef5 100644 --- a/ecosystem/contributor-awards-2023.html +++ b/ecosystem/contributor-awards-2023.html @@ -1,12 +1,310 @@ ---- -layout: default -title: Announcing the 2023 PyTorch Contributor Awards -permalink: ecosystem/contributor-awards-2023 -background-class: ecosystem-background -body-class: ecosystem ---- - -

        Announcing the 2023 PyTorch Contributor Awards

        @@ -143,3 +441,306 @@

        PyTorch 2023 Nominees

        + + + + + + + + + + + + + + + + + + + + + + + diff --git a/ecosystem/contributor-awards-2024.html b/ecosystem/contributor-awards-2024.html index 7360f40ef845..e745ef2859af 100644 --- a/ecosystem/contributor-awards-2024.html +++ b/ecosystem/contributor-awards-2024.html @@ -1,12 +1,310 @@ ---- -layout: default -title: Announcing the 2024 PyTorch Contributor Awards -permalink: ecosystem/contributor-awards-2024 -background-class: ecosystem-background -body-class: ecosystem ---- - -

        Announcing the 2024 PyTorch Contributor Awards

        @@ -128,3 +426,306 @@

        PyTorch 2024 Nominees

        + + + + + + + + + + + + + + + + + + + + + + + diff --git a/ecosystem/contributors.html b/ecosystem/contributors.html deleted file mode 100644 index 13024213ed53..000000000000 --- a/ecosystem/contributors.html +++ /dev/null @@ -1,98 +0,0 @@ ---- -layout: default -title: Contributors -permalink: /resources/contributors/ -body-class: ecosystem -background-class: ecosystem-join-background -redirect_to: "/newsletter" ---- - -
        PyTorch Contributors

        The central place for PyTorch contributors to stay up-to-date with the codebase and discover notable RFCs, PRs and more.

        {% include past_issues.html %}

        Newsletter Sign Up

        Follow the contributors newsletter for curated news from across the PyTorch developer community

        View Issues

        Join the conversation

        Join the contributor's discussion forum to learn and collaborate on the latest development across PyTorch

        Contributor Forums
        - - diff --git a/ecosystem/crypten/index.html b/ecosystem/crypten/index.html new file mode 100644 index 000000000000..b5d204449807 --- /dev/null +++ b/ecosystem/crypten/index.html @@ -0,0 +1,11 @@ + + + + Redirecting… + + + + +

        Redirecting…

        + Click here if you are not redirected. + diff --git a/ecosystem/deepspeed/index.html b/ecosystem/deepspeed/index.html new file mode 100644 index 000000000000..ef9cb7edd8d9 --- /dev/null +++ b/ecosystem/deepspeed/index.html @@ -0,0 +1,718 @@ + + + + + + + + + + + + + DeepSpeed | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
        DeepSpeed

        DeepSpeed is a deep learning optimization library that makes distributed training easy, efficient, and effective.
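        A hedged sketch of the core API follows; the config keys shown are a tiny illustrative subset, and DeepSpeed normally runs under its own launcher, so treat this as a shape of the workflow rather than a complete recipe.

```python
import torch
import deepspeed

model = torch.nn.Linear(10, 2)
ds_config = {  # illustrative config only; see the DeepSpeed docs for real configurations
    "train_micro_batch_size_per_gpu": 8,
    "optimizer": {"type": "Adam", "params": {"lr": 1e-3}},
}

engine, optimizer, _, _ = deepspeed.initialize(
    model=model, model_parameters=model.parameters(), config=ds_config
)

x = torch.randn(8, 10).to(engine.device)
y = torch.randint(0, 2, (8,)).to(engine.device)
loss = torch.nn.functional.cross_entropy(engine(x), y)
engine.backward(loss)  # the engine owns backward and the optimizer step
engine.step()
```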
        + + + + + + + + + + + + + + + + + + + + + + + diff --git a/ecosystem/depyf/index.html b/ecosystem/depyf/index.html new file mode 100644 index 000000000000..40b1d6ddf050 --- /dev/null +++ b/ecosystem/depyf/index.html @@ -0,0 +1,718 @@ + + + + + + + + + + + + + depyf | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
        depyf

        depyf is a tool to help users understand and adapt to the PyTorch compiler, torch.compile.
        + + + + + + + + + + + + + + + + + + + + + + + diff --git a/ecosystem/detectron2/index.html b/ecosystem/detectron2/index.html new file mode 100644 index 000000000000..af2a3268a99a --- /dev/null +++ b/ecosystem/detectron2/index.html @@ -0,0 +1,718 @@ + + + + + + + + + + + + + Detectron2 | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
        Detectron2

        Detectron2 is FAIR's next-generation platform for object detection and segmentation.
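        As a quick orientation, a sketch along the lines of the detectron2 getting-started example; the config name is one of the stock COCO models and the image path is a placeholder.

```python
import cv2
from detectron2 import model_zoo
from detectron2.config import get_cfg
from detectron2.engine import DefaultPredictor

cfg = get_cfg()
cfg.merge_from_file(model_zoo.get_config_file("COCO-Detection/faster_rcnn_R_50_FPN_3x.yaml"))
cfg.MODEL.WEIGHTS = model_zoo.get_checkpoint_url("COCO-Detection/faster_rcnn_R_50_FPN_3x.yaml")
cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST = 0.5

predictor = DefaultPredictor(cfg)
image = cv2.imread("input.jpg")        # BGR image; placeholder path
outputs = predictor(image)
print(outputs["instances"].pred_classes)
```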
        + + + + + + + + + + + + + + + + + + + + + + + diff --git a/ecosystem/determined/index.html b/ecosystem/determined/index.html new file mode 100644 index 000000000000..6539032ea222 --- /dev/null +++ b/ecosystem/determined/index.html @@ -0,0 +1,718 @@ + + + + + + + + + + + + + Determined | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
        Determined

        Determined is a platform that helps deep learning teams train models more quickly, easily share GPU resources, and effectively collaborate.
        + + + + + + + + + + + + + + + + + + + + + + + diff --git a/ecosystem/dgl/index.html b/ecosystem/dgl/index.html new file mode 100644 index 000000000000..b616257327d0 --- /dev/null +++ b/ecosystem/dgl/index.html @@ -0,0 +1,718 @@ + + + + + + + + + + + + + DGL | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
        DGL

        Deep Graph Library (DGL) is a Python package built for easy implementation of the graph neural network model family, on top of PyTorch and other frameworks.
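        For example, a minimal sketch with the PyTorch backend (the graph and feature sizes are arbitrary):

```python
import torch
import dgl
from dgl.nn import GraphConv

g = dgl.graph((torch.tensor([0, 1]), torch.tensor([1, 2])), num_nodes=3)  # edges 0->1, 1->2
g = dgl.add_self_loop(g)          # avoid zero-in-degree nodes for GraphConv
feat = torch.randn(3, 4)          # 4-dimensional node features

conv = GraphConv(in_feats=4, out_feats=2)
out = conv(g, feat)
print(out.shape)                  # torch.Size([3, 2])
```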
        + + + + + + + + + + + + + + + + + + + + + + + diff --git a/ecosystem/diffusers/index.html b/ecosystem/diffusers/index.html new file mode 100644 index 000000000000..2b9da3e5cc20 --- /dev/null +++ b/ecosystem/diffusers/index.html @@ -0,0 +1,718 @@ + + + + + + + + + + + + + Diffusers | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
        Diffusers

        Diffusers provides pretrained diffusion models across multiple modalities, such as vision and audio, and serves as a modular toolbox for inference and training of diffusion models.
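        By way of example, a sketch of the standard pipeline API; the model id is just one public text-to-image checkpoint, and a CUDA GPU is assumed.

```python
import torch
from diffusers import DiffusionPipeline

pipe = DiffusionPipeline.from_pretrained(
    "stabilityai/stable-diffusion-2-1", torch_dtype=torch.float16
)
pipe = pipe.to("cuda")  # assumes a CUDA-capable GPU is available

image = pipe("a photograph of an astronaut riding a horse").images[0]
image.save("astronaut.png")
```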
        + + + + + + + + + + + + + + + + + + + + + + + diff --git a/ecosystem/doctr/index.html b/ecosystem/doctr/index.html new file mode 100644 index 000000000000..b3baf06e130c --- /dev/null +++ b/ecosystem/doctr/index.html @@ -0,0 +1,718 @@ + + + + + + + + + + + + + docTR | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
        docTR

        docTR (Document Text Recognition) - a seamless, high-performing & accessible library for OCR-related tasks powered by Deep Learning.
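        In practice that looks roughly like the following sketch of docTR's predictor API; the image path is a placeholder.

```python
from doctr.io import DocumentFile
from doctr.models import ocr_predictor

model = ocr_predictor(pretrained=True)             # default detection + recognition models
doc = DocumentFile.from_images("sample_page.jpg")  # placeholder path
result = model(doc)
print(result.render())                             # plain-text rendering of the OCR output
```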
        + + + + + + + + + + + + + + + + + + + + + + + diff --git a/ecosystem/ecosystem.html b/ecosystem/ecosystem.html deleted file mode 100644 index b60f4bc5efd2..000000000000 --- a/ecosystem/ecosystem.html +++ /dev/null @@ -1,143 +0,0 @@ ---- -layout: default -title: Ecosystem -permalink: ecosystem/ -background-class: ecosystem-background -body-class: ecosystem -redirect_to: https://landscape.pytorch.org/ ---- - -
        Ecosystem Tools

        Tap into a rich ecosystem of tools, libraries, and more to support, accelerate, and explore AI development.

        Join the Ecosystem

        {% include ecosystem_sort.html %}

        {% assign ecosystem = site.ecosystem | sample: site.ecosystem.size %}
        {% for item in ecosystem %}
        {% endfor %}

        Have a project you want featured?

        Join the PyTorch ecosystem
        - diff --git a/ecosystem/einops/index.html b/ecosystem/einops/index.html new file mode 100644 index 000000000000..1911f4d098d2 --- /dev/null +++ b/ecosystem/einops/index.html @@ -0,0 +1,718 @@ + + + + + + + + + + + + + einops | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
        einops

        Flexible and powerful tensor operations for readable and reliable code.
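        For instance, a minimal sketch with PyTorch tensors (einops works the same way with NumPy, TensorFlow, or JAX arrays):

```python
import torch
from einops import rearrange, reduce

x = torch.randn(2, 3, 32, 32)                                   # batch, channels, height, width
flat = rearrange(x, 'b c h w -> b (c h w)')                     # flatten everything but the batch
pooled = reduce(x, 'b c h w -> b c', 'mean')                    # global average pooling
patches = rearrange(x, 'b c (h p1) (w p2) -> b (h w) (p1 p2 c)', p1=8, p2=8)
print(flat.shape, pooled.shape, patches.shape)
```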
        + + + + + + + + + + + + + + + + + + + + + + + diff --git a/ecosystem/ensemble-pytorch/index.html b/ecosystem/ensemble-pytorch/index.html new file mode 100644 index 000000000000..367566175e1a --- /dev/null +++ b/ecosystem/ensemble-pytorch/index.html @@ -0,0 +1,718 @@ + + + + + + + + + + + + + Ensemble-Pytorch | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
        Ensemble-Pytorch

        A unified ensemble framework for PyTorch to improve the performance and robustness of your deep learning model.
        + + + + + + + + + + + + + + + + + + + + + + + diff --git a/ecosystem/fairscale/index.html b/ecosystem/fairscale/index.html new file mode 100644 index 000000000000..9e550a3832a7 --- /dev/null +++ b/ecosystem/fairscale/index.html @@ -0,0 +1,718 @@ + + + + + + + + + + + + + FairScale | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
        FairScale

        FairScale is a PyTorch extension library for high performance and large scale training on one or multiple machines/nodes.
        + + + + + + + + + + + + + + + + + + + + + + + diff --git a/ecosystem/fastai/index.html b/ecosystem/fastai/index.html new file mode 100644 index 000000000000..5bd21367f9ff --- /dev/null +++ b/ecosystem/fastai/index.html @@ -0,0 +1,11 @@ + + + + Redirecting… + + + + +

        Redirecting…

        + Click here if you are not redirected. + diff --git a/ecosystem/flower/index.html b/ecosystem/flower/index.html new file mode 100644 index 000000000000..cf65882f6e86 --- /dev/null +++ b/ecosystem/flower/index.html @@ -0,0 +1,718 @@ + + + + + + + + + + + + + Flower | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
        Flower

        Flower - A Friendly Federated Learning Framework
        + + + + + + + + + + + + + + + + + + + + + + + diff --git a/ecosystem/fusemedml/index.html b/ecosystem/fusemedml/index.html new file mode 100644 index 000000000000..c108c142fe27 --- /dev/null +++ b/ecosystem/fusemedml/index.html @@ -0,0 +1,718 @@ + + + + + + + + + + + + + FuseMedML | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
        FuseMedML

        FuseMedML is a Python framework accelerating ML-based discovery in the medical field by encouraging code reuse.
diff --git a/ecosystem/gandlf/index.html b/ecosystem/gandlf/index.html
new file mode 100644
index 000000000000..df64b37b437d
--- /dev/null
+++ b/ecosystem/gandlf/index.html
@@ -0,0 +1,718 @@
+ GaNDLF | PyTorch

GaNDLF

A generalizable application framework for segmentation, regression, and classification using PyTorch
diff --git a/ecosystem/glow/index.html b/ecosystem/glow/index.html
new file mode 100644
index 000000000000..0908b1aea9fe
--- /dev/null
+++ b/ecosystem/glow/index.html
@@ -0,0 +1,11 @@
+ Redirecting…
+ Click here if you are not redirected.
diff --git a/ecosystem/gpytorch/index.html b/ecosystem/gpytorch/index.html
new file mode 100644
index 000000000000..fdfc910bb3d4
--- /dev/null
+++ b/ecosystem/gpytorch/index.html
@@ -0,0 +1,11 @@
+ Redirecting…
+ Click here if you are not redirected.
diff --git a/ecosystem/higher/index.html b/ecosystem/higher/index.html
new file mode 100644
index 000000000000..085f98d8e1bd
--- /dev/null
+++ b/ecosystem/higher/index.html
@@ -0,0 +1,718 @@
+ higher | PyTorch

higher

higher is a library which facilitates the implementation of arbitrarily complex gradient-based meta-learning algorithms and nested optimisation loops with near-vanilla PyTorch.
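Editor's note: a minimal sketch of the nested-optimisation pattern higher enables, assuming a current release of the library; the model, data, and loop counts are placeholders.

```python
import torch
import torch.nn.functional as F
import higher

model = torch.nn.Linear(10, 1)
inner_opt = torch.optim.SGD(model.parameters(), lr=0.1)
meta_opt = torch.optim.Adam(model.parameters(), lr=1e-3)

x, y = torch.randn(32, 10), torch.randn(32, 1)

meta_opt.zero_grad()
# innerloop_ctx yields a functional copy of the model plus a differentiable
# optimizer, so the whole inner loop stays on the autograd graph.
with higher.innerloop_ctx(model, inner_opt, copy_initial_weights=False) as (fmodel, diffopt):
    for _ in range(5):
        diffopt.step(F.mse_loss(fmodel(x), y))
    outer_loss = F.mse_loss(fmodel(x), y)
    outer_loss.backward()   # gradients flow back through the inner updates
meta_opt.step()
```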
diff --git a/ecosystem/horovod/index.html b/ecosystem/horovod/index.html
new file mode 100644
index 000000000000..4e2bfca1d1fa
--- /dev/null
+++ b/ecosystem/horovod/index.html
@@ -0,0 +1,11 @@
+ Redirecting…
+ Click here if you are not redirected.
diff --git a/ecosystem/hummingbird/index.html b/ecosystem/hummingbird/index.html
new file mode 100644
index 000000000000..ff1aed72a7ad
--- /dev/null
+++ b/ecosystem/hummingbird/index.html
@@ -0,0 +1,718 @@
+ Hummingbird | PyTorch

Hummingbird

Hummingbird compiles trained ML models into tensor computation for faster inference.
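Editor's note: a small, hedged example of the compilation flow described above, assuming hummingbird-ml and scikit-learn are installed; the random data is a placeholder.

```python
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from hummingbird.ml import convert

X = np.random.rand(1000, 20).astype(np.float32)
y = np.random.randint(2, size=1000)

skl_model = RandomForestClassifier(n_estimators=50).fit(X, y)

# Compile the trained tree ensemble into PyTorch tensor operations.
torch_model = convert(skl_model, "pytorch")
preds = torch_model.predict(X)

# The compiled model can be moved to GPU for faster inference, e.g.:
# torch_model.to("cuda")
```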
diff --git a/ecosystem/hydra/index.html b/ecosystem/hydra/index.html
new file mode 100644
index 000000000000..931c2e873d5d
--- /dev/null
+++ b/ecosystem/hydra/index.html
@@ -0,0 +1,718 @@
+ Hydra | PyTorch

Hydra

A framework for elegantly configuring complex applications.
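Editor's note: a minimal sketch of Hydra's configuration pattern; the conf/config.yaml layout and the keys lr and batch_size are hypothetical.

```python
# conf/config.yaml (hypothetical):
#   lr: 0.001
#   batch_size: 32
import hydra
from omegaconf import DictConfig, OmegaConf

@hydra.main(config_path="conf", config_name="config")
def main(cfg: DictConfig) -> None:
    # Hydra composes the config and supports command-line overrides,
    # e.g. `python train.py lr=0.01 batch_size=64`.
    print(OmegaConf.to_yaml(cfg))

if __name__ == "__main__":
    main()
```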
diff --git a/ecosystem/inc/index.html b/ecosystem/inc/index.html
new file mode 100644
index 000000000000..5bc23bbc6235
--- /dev/null
+++ b/ecosystem/inc/index.html
@@ -0,0 +1,718 @@
+ neural-compressor | PyTorch

neural-compressor

Intel® Neural Compressor provides unified APIs for network compression technologies for faster inference
diff --git a/ecosystem/index.html b/ecosystem/index.html
new file mode 100644
index 000000000000..757af3e1be1b
--- /dev/null
+++ b/ecosystem/index.html
@@ -0,0 +1,11 @@
+ Redirecting…
+ Click here if you are not redirected.
diff --git a/ecosystem/ipex/index.html b/ecosystem/ipex/index.html
new file mode 100644
index 000000000000..83a7b4aff068
--- /dev/null
+++ b/ecosystem/ipex/index.html
@@ -0,0 +1,718 @@
+ intel-extension-for-pytorch | PyTorch

intel-extension-for-pytorch

A Python package for improving PyTorch performance on Intel platforms
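Editor's note: a hedged sketch of the drop-in optimization call this package provides, assuming intel_extension_for_pytorch is installed; the toy model and data are placeholders.

```python
import torch
import intel_extension_for_pytorch as ipex

model = torch.nn.Sequential(torch.nn.Linear(64, 64), torch.nn.ReLU(), torch.nn.Linear(64, 10))
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
criterion = torch.nn.CrossEntropyLoss()

model.train()
# ipex.optimize applies weight-layout, fusion, and dtype optimizations for Intel CPUs.
model, optimizer = ipex.optimize(model, optimizer=optimizer)

data, target = torch.randn(32, 64), torch.randint(10, (32,))
optimizer.zero_grad()
loss = criterion(model(data), target)
loss.backward()
optimizer.step()
```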
diff --git a/ecosystem/ivy/index.html b/ecosystem/ivy/index.html
new file mode 100644
index 000000000000..b3098ba878fe
--- /dev/null
+++ b/ecosystem/ivy/index.html
@@ -0,0 +1,718 @@
+ ivy | PyTorch

ivy

The Unified Machine Learning Framework
diff --git a/ecosystem/joeynmt/index.html b/ecosystem/joeynmt/index.html
new file mode 100644
index 000000000000..26b534079cb7
--- /dev/null
+++ b/ecosystem/joeynmt/index.html
@@ -0,0 +1,718 @@
+ joeynmt | PyTorch

joeynmt

Minimalist Neural Machine Translation toolkit for educational purposes
diff --git a/ecosystem/join.html b/ecosystem/join.html
index ce102e84e325..96432a4b1f3e 100644
--- a/ecosystem/join.html
+++ b/ecosystem/join.html
@@ -1,8 +1,11 @@
----
-layout: default
-title: Join
-permalink: ecosystem/join.html
-body-class: ecosystem
-background-class: ecosystem-join-background
-redirect_to: https://github.com/pytorch-fdn/ecosystem
----
+ Redirecting…
+ Click here if you are not redirected.
diff --git a/ecosystem/kornia/index.html b/ecosystem/kornia/index.html
new file mode 100644
index 000000000000..c25b1b364d4b
--- /dev/null
+++ b/ecosystem/kornia/index.html
@@ -0,0 +1,718 @@
+ Kornia | PyTorch

Kornia

Kornia is a differentiable computer vision library that consists of a set of routines and differentiable modules to solve generic CV problems.
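Editor's note: a short illustration of the differentiable routines mentioned above, assuming a recent Kornia release; the random image tensor is a placeholder.

```python
import torch
import kornia as K

img = torch.rand(1, 3, 224, 224, requires_grad=True)   # BxCxHxW, values in [0, 1]

blurred = K.filters.gaussian_blur2d(img, kernel_size=(5, 5), sigma=(1.5, 1.5))
edges = K.filters.sobel(K.color.rgb_to_grayscale(blurred))

# Every op is differentiable, so gradients flow back to the input image.
edges.mean().backward()
print(img.grad.shape)   # torch.Size([1, 3, 224, 224])
```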
diff --git a/ecosystem/l5kit/index.html b/ecosystem/l5kit/index.html
new file mode 100644
index 000000000000..0114b4289e79
--- /dev/null
+++ b/ecosystem/l5kit/index.html
@@ -0,0 +1,718 @@
+ L5Kit | PyTorch

L5Kit

ML Prediction, Planning and Simulation for Self-Driving built on PyTorch.
diff --git a/ecosystem/lightly/index.html b/ecosystem/lightly/index.html
new file mode 100644
index 000000000000..025310db0fc0
--- /dev/null
+++ b/ecosystem/lightly/index.html
@@ -0,0 +1,718 @@
+ Lightly | PyTorch

Lightly

Lightly is a computer vision framework for self-supervised learning.
diff --git a/ecosystem/ludwig/index.html b/ecosystem/ludwig/index.html
new file mode 100644
index 000000000000..dbda8d6a0747
--- /dev/null
+++ b/ecosystem/ludwig/index.html
@@ -0,0 +1,718 @@
+ ludwig | PyTorch

ludwig

Data-centric declarative deep learning framework
diff --git a/ecosystem/mmf/index.html b/ecosystem/mmf/index.html
new file mode 100644
index 000000000000..85f18bdd7489
--- /dev/null
+++ b/ecosystem/mmf/index.html
@@ -0,0 +1,718 @@
+ MMF | PyTorch

MMF

A modular framework for vision & language multimodal research from Facebook AI Research (FAIR).
diff --git a/ecosystem/monai/index.html b/ecosystem/monai/index.html
new file mode 100644
index 000000000000..d3a271419caa
--- /dev/null
+++ b/ecosystem/monai/index.html
@@ -0,0 +1,718 @@
+ MONAI | PyTorch

MONAI

MONAI provides domain-optimized foundational capabilities for developing healthcare imaging training workflows.
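Editor's note: a hedged sketch of a typical MONAI training step, assuming a recent MONAI release (the UNet constructor shown uses the spatial_dims argument; older versions used a different name); shapes and data are placeholders.

```python
import torch
from monai.networks.nets import UNet
from monai.losses import DiceLoss

net = UNet(spatial_dims=3, in_channels=1, out_channels=2,
           channels=(16, 32, 64, 128), strides=(2, 2, 2))
loss_fn = DiceLoss(to_onehot_y=True, softmax=True)
optimizer = torch.optim.Adam(net.parameters(), lr=1e-4)

image = torch.randn(2, 1, 64, 64, 64)                    # batch of 3D volumes
label = torch.randint(0, 2, (2, 1, 64, 64, 64)).float()  # binary segmentation masks

optimizer.zero_grad()
loss = loss_fn(net(image), label)
loss.backward()
optimizer.step()
```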
diff --git a/ecosystem/nemo/index.html b/ecosystem/nemo/index.html
new file mode 100644
index 000000000000..1ed5aa89097e
--- /dev/null
+++ b/ecosystem/nemo/index.html
@@ -0,0 +1,718 @@
+ NeMo | PyTorch

NeMo

NeMo: a toolkit for conversational AI.
diff --git a/ecosystem/octoml/index.html b/ecosystem/octoml/index.html
new file mode 100644
index 000000000000..b3c023c7d2e5
--- /dev/null
+++ b/ecosystem/octoml/index.html
@@ -0,0 +1,718 @@
+ OctoML Profile | PyTorch

OctoML Profile

octoml-profile is a Python library and cloud service designed to provide a simple experience for assessing and optimizing the performance of PyTorch models.
diff --git a/ecosystem/onnxrt/index.html b/ecosystem/onnxrt/index.html
new file mode 100644
index 000000000000..5ad6c0edce4d
--- /dev/null
+++ b/ecosystem/onnxrt/index.html
@@ -0,0 +1,718 @@
+ ONNX Runtime | PyTorch

ONNX Runtime

ONNX Runtime is a cross-platform inference and training accelerator.
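Editor's note: a minimal export-and-run sketch; the toy model and the file name model.onnx are placeholders, and onnxruntime is assumed to be installed.

```python
import torch
import onnxruntime as ort

model = torch.nn.Sequential(torch.nn.Linear(10, 10), torch.nn.ReLU(), torch.nn.Linear(10, 2)).eval()
dummy = torch.randn(1, 10)

# Export the PyTorch model to ONNX, then run it with ONNX Runtime.
torch.onnx.export(model, dummy, "model.onnx",
                  input_names=["input"], output_names=["logits"])

session = ort.InferenceSession("model.onnx", providers=["CPUExecutionProvider"])
logits = session.run(None, {"input": dummy.numpy()})[0]
print(logits.shape)   # (1, 2)
```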
diff --git a/ecosystem/opacus/index.html b/ecosystem/opacus/index.html
new file mode 100644
index 000000000000..fe2235f08c64
--- /dev/null
+++ b/ecosystem/opacus/index.html
@@ -0,0 +1,718 @@
+ Opacus | PyTorch

Opacus

Train PyTorch models with Differential Privacy
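Editor's note: a hedged sketch of differentially private training with the Opacus 1.x PrivacyEngine; the model, data, and privacy parameters are placeholders.

```python
import torch
from torch.utils.data import DataLoader, TensorDataset
from opacus import PrivacyEngine

model = torch.nn.Linear(16, 2)
optimizer = torch.optim.SGD(model.parameters(), lr=0.05)
loader = DataLoader(TensorDataset(torch.randn(256, 16), torch.randint(2, (256,))), batch_size=32)

privacy_engine = PrivacyEngine()
model, optimizer, loader = privacy_engine.make_private(
    module=model,
    optimizer=optimizer,
    data_loader=loader,
    noise_multiplier=1.1,   # noise added to the summed per-sample gradients
    max_grad_norm=1.0,      # per-sample gradient clipping bound
)

criterion = torch.nn.CrossEntropyLoss()
for x, y in loader:
    optimizer.zero_grad()
    criterion(model(x), y).backward()
    optimizer.step()
```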
diff --git a/ecosystem/opencompass/index.html b/ecosystem/opencompass/index.html
new file mode 100644
index 000000000000..a3c793d91c7c
--- /dev/null
+++ b/ecosystem/opencompass/index.html
@@ -0,0 +1,718 @@
+ OpenCompass | PyTorch

OpenCompass

OpenCompass is an LLM evaluation platform supporting a wide range of models (Llama3, Mistral, InternLM2, GPT-4, LLaMA2, Qwen, GLM, Claude, etc.) over 100+ datasets.
diff --git a/ecosystem/optuna/index.html b/ecosystem/optuna/index.html
new file mode 100644
index 000000000000..f3a1df7f28bd
--- /dev/null
+++ b/ecosystem/optuna/index.html
@@ -0,0 +1,718 @@
+ Optuna | PyTorch

Optuna

An open source hyperparameter optimization framework to automate hyperparameter search.
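Editor's note: a minimal sketch of Optuna's study/objective loop; the objective below is a stand-in for a real train-and-validate routine.

```python
import optuna

def objective(trial):
    lr = trial.suggest_float("lr", 1e-5, 1e-1, log=True)
    hidden = trial.suggest_int("hidden_units", 16, 256)
    # In practice: build a PyTorch model with these hyperparameters,
    # train it, and return the validation metric.
    return (lr - 1e-3) ** 2 + 1.0 / hidden   # placeholder metric

study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=50)
print(study.best_params)
```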
diff --git a/ecosystem/padl/index.html b/ecosystem/padl/index.html
new file mode 100644
index 000000000000..44f88d50c435
--- /dev/null
+++ b/ecosystem/padl/index.html
@@ -0,0 +1,718 @@
+ padl | PyTorch

padl

Pipeline Abstractions for Deep Learning in PyTorch
diff --git a/ecosystem/parlai/index.html b/ecosystem/parlai/index.html
new file mode 100644
index 000000000000..cb4951482d8f
--- /dev/null
+++ b/ecosystem/parlai/index.html
@@ -0,0 +1,11 @@
+ Redirecting…
+ Click here if you are not redirected.
diff --git a/ecosystem/pennylane/index.html b/ecosystem/pennylane/index.html
new file mode 100644
index 000000000000..ba362b35d754
--- /dev/null
+++ b/ecosystem/pennylane/index.html
@@ -0,0 +1,11 @@
+ Redirecting…
+ Click here if you are not redirected.
diff --git a/ecosystem/pfrl/index.html b/ecosystem/pfrl/index.html
new file mode 100644
index 000000000000..db4cd09b175e
--- /dev/null
+++ b/ecosystem/pfrl/index.html
@@ -0,0 +1,718 @@
+ PFRL | PyTorch

PFRL

PFRL is a deep reinforcement learning library that implements various state-of-the-art deep reinforcement learning algorithms in Python using PyTorch.
diff --git a/ecosystem/polyaxon/index.html b/ecosystem/polyaxon/index.html
new file mode 100644
index 000000000000..a6d85ca61f31
--- /dev/null
+++ b/ecosystem/polyaxon/index.html
@@ -0,0 +1,718 @@
+ Polyaxon | PyTorch

Polyaxon

Polyaxon is a platform for building, training, and monitoring large-scale deep learning applications.
diff --git a/ecosystem/pomegranate/index.html b/ecosystem/pomegranate/index.html
new file mode 100644
index 000000000000..09de76f3a91e
--- /dev/null
+++ b/ecosystem/pomegranate/index.html
@@ -0,0 +1,718 @@
+ pomegranate | PyTorch

pomegranate

pomegranate is a library of probabilistic models that is built in a modular manner and treats all models as the probability distributions that they are.
diff --git a/ecosystem/poptorch/index.html b/ecosystem/poptorch/index.html
new file mode 100644
index 000000000000..235e65f6384f
--- /dev/null
+++ b/ecosystem/poptorch/index.html
@@ -0,0 +1,718 @@
+ PopTorch | PyTorch

PopTorch

The PopTorch interface library is a simple wrapper for running PyTorch programs directly on Graphcore IPUs.
diff --git a/ecosystem/poutyne/index.html b/ecosystem/poutyne/index.html
new file mode 100644
index 000000000000..a8dc3468ddb7
--- /dev/null
+++ b/ecosystem/poutyne/index.html
@@ -0,0 +1,718 @@
+ Poutyne | PyTorch

Poutyne

Poutyne is a Keras-like framework for PyTorch that handles much of the boilerplate code needed to train neural networks.
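Editor's note: a hedged sketch of the Keras-like fit loop Poutyne provides; the network and random datasets are placeholders.

```python
import torch
from torch.utils.data import DataLoader, TensorDataset
from poutyne import Model

network = torch.nn.Sequential(torch.nn.Linear(20, 64), torch.nn.ReLU(), torch.nn.Linear(64, 2))

train_loader = DataLoader(TensorDataset(torch.randn(512, 20), torch.randint(2, (512,))), batch_size=32)
valid_loader = DataLoader(TensorDataset(torch.randn(128, 20), torch.randint(2, (128,))), batch_size=32)

# Poutyne wires up the optimizer, loss, metrics, and the training loop.
model = Model(network, "adam", "cross_entropy", batch_metrics=["accuracy"])
model.fit_generator(train_loader, valid_loader, epochs=5)
```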
diff --git a/ecosystem/ptc/2022.html b/ecosystem/ptc/2022.html
index d4bfc82efa4e..5a4078c48c53 100644
--- a/ecosystem/ptc/2022.html
+++ b/ecosystem/ptc/2022.html
@@ -1,12 +1,310 @@
----
-layout: default
-title: PyTorch Conference 2022
-permalink: ecosystem/ptc/2022
-background-class: features-background
-body-class: ecosystem
----
+ PyTorch Conference 2022 | PyTorch

PyTorch Conference

2022

@@ -27,32 +325,760 @@

2022

Posters

- {% for poster in site.data.ecosystem.ptc['2022'].posters %}
+ Enabling State-of-the-art Interpretability for Medical Imaging Using PyTorch
Dinkar Juyal, Syed Asher Javed, Harshith Padigela, Limin Yu, Aaditya Prakash, Logan Kilpatrick, Anand Sampat, PathAI

PathAI is a Boston-based company focused on improving patient care using AI-powered pathology. We heavily use PyTorch for building our ML systems, specifically for training and deploying models on large gigapixel pathology images. In this case study, we highlight our use of PyTorch to build, experiment with, and deploy Additive Multiple Instance Learning (MIL) models. Additive MIL is a novel MIL technique built using PyTorch Lightning which allows end-to-end learning from millions of pixels while providing granular interpretability of spatial heatmaps. These models allow for the exact computation of the extent to which each smaller region in the gigapixel-sized image contributes to the final model prediction. This enables class-wise excitatory and inhibitory contributions to be visualized on top of the pathology image, informing practitioners of model failures and guiding pathologists to areas of interest. All of this is made possible by PyTorch's rapid research-to-prototype-to-deployment iteration cycle.

+ COMPUTER VISION
+ TorchUnmix: Automatic Stain Unmixing and Augmentation for Histopathology Images in PyTorch
Erik Hagendorn
        TorchUnmix is a library which aims to provide automatic stain unmixing and augmentation for histopathology whole slide images. Separation of histochemical stains (unmixing) is performed by orthonormal transformation of the RGB pixel data from predefined light absorption coefficients called stain vectors [1]. Precomputed publicly available stain vector definitions are often used, but inter-laboratory variation due to the histology and/or image acquisition process is common, yielding suboptimal unmixing results. Classical stain vector estimation methods rely on abundant distribution of stains, making them less practical for sparser distributions as observed from immunohistochemical stains. Geis et al. proposed a method based on k-means clustering of pixel values in the hue-saturation-density color space to determine optimal stain vectors which has been used in this work [2]. While stain vectors may be used for quantification of individual stains, TorchUnmix also provides functionalities to perform stain augmentation. Stain augmentation is a method used during the training process of deep learning models to improve generalization by unmixing the image, stochastically modifying the individual stains, and then compositing the stains into the final augmented image [3]. To our knowledge, no other libraries fully implement the above methods in PyTorch, utilizing GPU-acceleration. Additionally, TorchUnmix has extended all calculations used to perform the automatic stain unmixing and augmentation to operate on batches of images, drastically accelerating execution performance speeds in comparison to other libraries.

+ LIBRARIES
+ Scalable Training and Inference With Ray AIR
Kai Fricke, Balaji Veeramani

Scaling machine learning is hard: cloud platform solutions like SageMaker can limit flexibility, but a custom distributed framework is often too hard to implement. In effect, ML engineers struggle to scale their workloads from local prototyping to the cloud. The Ray AI Runtime ('Ray AIR') is an integrated collection of machine learning libraries built around the distributed computing framework Ray. It provides an easy-to-use interface for scalable data processing, training, tuning, batch prediction, and online serving. Adapting existing PyTorch training loops to Ray AIR's PyTorch integration needs as little as 10 lines of code changes, and scaling from local development to the cloud needs no code changes at all.

+ LIBRARIES
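Editor's note: a hedged sketch of the TorchTrainer interface the poster describes, based on the Ray 2.x AIR API current at the time; module paths have since been reorganized, and the training loop below is a placeholder.

```python
import torch
import ray.train.torch
from ray.air.config import ScalingConfig
from ray.train.torch import TorchTrainer

def train_loop_per_worker(config):
    # prepare_model wraps the model for DDP and moves it to the right device.
    model = ray.train.torch.prepare_model(torch.nn.Linear(10, 1))
    optimizer = torch.optim.SGD(model.parameters(), lr=config["lr"])
    for _ in range(10):
        x, y = torch.randn(32, 10), torch.randn(32, 1)
        loss = torch.nn.functional.mse_loss(model(x), y)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

trainer = TorchTrainer(
    train_loop_per_worker,
    train_loop_config={"lr": 0.01},
    scaling_config=ScalingConfig(num_workers=2),
)
result = trainer.fit()
```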
+ AutoMAD: Mixed Mode Autodiff for PyTorch Models
Jan Hückelheim
        Mixed Mode autodiff combines back-propagation and forward differentiation. Both modes have pros and cons: Back-propagation is efficient for scalar functions with many trainable parameters. Back-propagation uses memory for intermediate results, requires data flow reversal, scales poorly for many output variables. Forward differentiation is straightforward to implement, memory-efficient, and easy to vectorize/parallelize or port to new hardware. Forward mode scales poorly with large number of trainable parameters. AutoMAD makes it possible to combine both modes. Use forward differentiation for some layers, while using back-prop for others.

+ LIBRARIES
+ xFormers: Building Blocks for Efficient Transformers
Daniel Haziza, Francisco Massa, Jeremy Reizenstein, Patrick Labatut, Diana Liskovich

We present xFormers, a toolbox to accelerate research on Transformers. It contains efficient components, like an exact memory-efficient multi-head attention that can accelerate training 2x while using a fraction of the memory. xFormers components are also customizable and can be combined together to build variations of Transformers. Our hope is to enable the next generation of research based on Transformers.

+ LIBRARIES
+ linear_operator - Structured Linear Algebra in PyTorch
Max Balandat
        linear_operator (https://github.com/cornellius-gp/linear_operator) is a library for structured linear algebra built on PyTorch. It provides a LinearOperator class that represents a tensor that is never instantiated but is instead accessed through operations like matrix multiplication, solves, decompositions, and indexing. These objects use custom linear algebra operations that can exploit particular matrix structure (e.g. diagonal, block-diagonal, triangular, Kronecker, etc.) in computations in order to achieve substantial (many orders of magnitude) improvements in time and memory complexity. Moreover, many efficient linear algebra operations (e.g. solves, decompositions, indexing, etc.) can be automatically generated from the LinearOperator's matmul function. This makes it extremely easy to compose or implement custom LinearOperators. + The key aspect that makes linear_operator easy to use in PyTorch code is its integration with the `__torch_function__` interface - Common linear algebra operations (such as matrix multiplication, solve, SVD) are mapped to the respective torch functions (`__matmul__`, `torch.linalg.solve`, `torch.linalg.svd`), so that LinearOperator objects can be used as drop-in replacements for dense tensors even in existing code. LinearOperator operations themselves may return LinearOperator objects, automatically keeping track of algebraic structure after each computation. As a result, users never need to reason about what efficient linear algebra routines to use (so long as the input elements defined by the user encode known input structure).

+ LIBRARIES
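Editor's note: a hedged sketch of the drop-in behaviour the abstract describes; DiagLinearOperator is assumed to be available under linear_operator.operators, and the torch.linalg.solve dispatch follows the abstract's own description of the __torch_function__ integration.

```python
import torch
from linear_operator.operators import DiagLinearOperator

diag = torch.tensor([1.0, 2.0, 3.0])
D = DiagLinearOperator(diag)        # the dense 3x3 matrix is never materialized

v = torch.randn(3, 1)
# Standard torch syntax dispatches to structure-aware O(n) routines.
matvec = D @ v
solve = torch.linalg.solve(D, v)
print(matvec.shape, solve.shape)
```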
+ Declarative Machine Learning with Ludwig: End-to-end Machine Learning Pipelines Using Simple and Flexible Data-driven Configurations
Justin Zhao
        Ludwig is a declarative machine learning framework that makes it easy to define and compare machine learning pipelines using a simple and flexible data-driven configuration system. The minimal configuration declares the input and output features with their respective data types. Users can specify additional parameters to preprocess, encode, and decode features, load from pre-trained models, compose the internal model architecture, set training parameters, or run hyperparameter optimization. Ludwig will build an end-to-end machine learning pipeline automatically, using whatever is explicitly specified in the configuration, while falling back to smart defaults for any parameters that are not. Scientists, engineers, and researchers use Ludwig to explore state-of-the-art model architectures, run hyperparameter search, and scale up to larger than available memory datasets and multi-node clusters, on a variety of problems using structured and unstructured features. Ludwig has 8.5K+ stars on Github and is built on top of PyTorch, Horovod, and Ray.

+ LIBRARIES
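Editor's note: a hedged sketch of the declarative configuration described above, using Ludwig's Python API; reviews.csv and its column names are hypothetical, and the trainer section key follows recent Ludwig releases.

```python
from ludwig.api import LudwigModel

# The minimal declarative config: input and output features with their types.
config = {
    "input_features": [{"name": "review_text", "type": "text"}],
    "output_features": [{"name": "sentiment", "type": "category"}],
    "trainer": {"epochs": 3},
}

model = LudwigModel(config)
train_stats, _, output_dir = model.train(dataset="reviews.csv")
predictions, _ = model.predict(dataset="reviews.csv")
```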
+ Generalized Shapes: Block Sparsity, MaskedTensor, NestedTensor
Christian Puhrsch
        This poster presents an overview of available and ongoing developments related to sparse memory formats, masked computation, and support for collections of variably shaped data. In particular it contains a case study of block sparse memory formats, MaskedTensor, and NestedTensor.

+ LIBRARIES
+ Betty: An Automatic Differentiation Library for Generalized Meta Learning
Sang Keun Choe
        Betty is a simple, scalable and modular library for generalized meta-learning (GML) and multilevel optimization (MLO), built upon PyTorch, that allows a unified programming interface for a number of GML/MLO applications including few-shot learning, hyperparameter optimization, neural architecture search, data reweighting, and many more. The internal autodiff mechanism and the software design of Betty are developed by the novel interpretation of GML/MLO as a dataflow graph.

+ LIBRARIES
+ Functorch: Composable Function Transforms in Pytorch
Samantha Andow, Richard Zhou, Horace He, Animesh Jain
        Inspired by Google JAX, functorch is a library in Pytorch that offers composable vmap (vectorization) and autodiff transforms (grad, vjp, jvp). Since its first release alongside Pytorch 1.11, combining these transforms has helped users develop and explore new techniques that were previously tricky to write in Pytorch, like Neural Tangent Kernels and non-linear optimizations (see Theseus, also from PyTorch). This will go through some basic usages and highlight some research that leverages functorch.

        + +

        + LIBRARIES +

        +
        +
        + +
        +
        + + + +
        + + Large-Scale Neural Solvers for Partial Differential Equations + +
        +
        Patrick Stiller, Jeyhun Rustamov, Friedrich Bethke, Maksim Zhdanov, Raj Sutarya, Mahnoor Tanveer, Karan Shah, Richard Pausch, Sunna Torge, Alexander Debus, Attila Cangi, Peter Steinbach, Michael Bussmann, Nico Hoffmann
        +

Our open-source Neural Solvers framework provides data-free ML-based solvers for the study and analysis of phenomena in the natural sciences, built on top of PyTorch. We were the first to show that certain quantum systems modeled by the 2d Schrödinger equation can be accurately solved while retaining strong scaling. We also developed a novel neural network architecture, GatedPINN [1], introducing adaptable domain decomposition into the training of Physics-informed Neural Networks based on the Mixture-of-Experts paradigm. Distributed large-scale training of our GatedPINN is facilitated by Horovod, resulting in excellent GPU utilization and making Neural Solvers ready for the upcoming exascale era. Upcoming projects involve higher-dimensional problems such as 3d laser systems and coupled models to study the Vlasov-Maxwell system. Further experiments on novel, highly scalable compute hardware pave the way for applying high-fidelity Neural Solvers to real-world problems such as Inverse Scattering Problems.

        + +

        + LIBRARIES +

        +
        +
        + +
        +
        + + + +
        + + PyTorch Video: A Deep Learning Library for Video Understanding + +
        +
        Haoqi Fan
        +

PyTorchVideo is the deep learning library for video understanding research in PyTorch.

        + +

        + LIBRARIES +

        +
        +
        + +
        +
        + + + +
        + + Model Preparation Federated Learning and Device Computation + +
        +
        Zhihan Fang
        +

Federated Learning with Differential Privacy has witnessed increased adoption as one of the most promising ways to train machine learning models while preserving user privacy. Existing models at Meta around people attributes are mostly built on traditional centralized machine learning methods. Recently, due to increasing concerns about user privacy internally and externally, machine learning teams at Meta are experiencing either signal loss or restrictions on applying new features in models to further improve model performance. In this paper, we introduce a generic framework we built for preparing and generating models for federated learning. The model preparation process uses traditional machine learning to understand the model structure and hyperparameters for the target problems, including training, inference, and evaluation. It also requires a simulation process to train the target model structure and understand the simulated environment on the server side in order to tune FL-specific hyperparameters. The model generation process produces device-compatible models, which can be used directly on users’ devices for federated learning. We applied the FL framework to our on-device models and integrated it with device signals to improve user experience and protect user privacy.

        + +

        + LIBRARIES +

        +
        +
        + +
        +
        + + + +
        + + Constrained Optimization in PyTorch With Cooper + +
        +
        Jose Gallego-Posada, Juan Camilo Ramirez
        +

Cooper (https://github.com/cooper-org/cooper) is a general-purpose, deep learning-first constrained optimization library in PyTorch. Cooper is (almost!) seamlessly integrated with PyTorch and preserves the usual loss-backward-step workflow. If you are already familiar with PyTorch, using Cooper will be a breeze! This library aims to encourage and facilitate the study of constrained optimization problems in deep learning. Cooper focuses on non-convex constrained optimization problems for which the loss or constraints are not necessarily “nicely behaved” or “theoretically tractable”. Moreover, Cooper has been designed to play nicely with mini-batched/stochastic estimates of the objective and constraint functions. Cooper implements several popular constrained optimization protocols so you can focus on your project while we handle the nitty-gritty behind the scenes.

        + +

        + https://github.com/cooper-org/cooper +

        + +

        + LIBRARIES +

        +
        +
        + +
        +
        + + + +
        + + Two Dimensional Parallelism Using Distributed Tensors + +
        +
        Wanchao Liang, Junjie Wang
        +

This talk will introduce 2-dimensional parallelism with PyTorch (Data Parallelism + Tensor Parallelism) using Distributed Tensor, a fundamental distributed primitive offered by PyTorch Distributed that powers Tensor Parallelism. We have shown that using FSDP and Tensor Parallelism together enables us to train large Transformer-style models and increase training performance. We offer end-to-end training techniques that let you train models in a 2-D parallel fashion and save/load checkpoints in a distributed manner.

        + +

        + LIBRARIES +

        +
        +
        + +
        +
        + + + +
        + + PyTorch Tabular: A Framework for Deep Learning with Tabular Data + +
        +
        Manu Joseph
        +

In spite of showing unreasonable effectiveness in modalities like text and images, deep learning has always lagged gradient boosting on tabular data, both in popularity and performance. Recently, however, newer models created specifically for tabular data have been pushing the performance bar. Popularity remains a challenge because there is no easy, ready-to-use library like scikit-learn for deep learning. PyTorch Tabular aims to change that by being an easy-to-use and flexible framework that makes using SOTA model architectures on tabular data as easy as scikit-learn.

        + +

        + LIBRARIES +

        +
        +
        +
        - {% if poster.poster_link %} - - {% endif %} +
        - {% if poster.poster_link %} - {{ poster.title }} - {% else %} {{ poster.title }} {% endif %} + + Better Transformer: Accelerating Transformer Inference in PyTorch +
        -
        {{ poster.authors | join: ", "}}
        -

        {{ poster.description }}

        - {% if poster.link %} +
        Michael Gschwind, Christian Puhrsch, Driss Guessous, Rui Zhu, Daniel Haziza, Francisco Massa
        +

We introduce Better Transformer, the PyTorch project to accelerate Transformers for inference and training with out-of-the-box enablement by implementing the Better Transformer ‘fastpath’. Fastpath accelerates many of the most commonly executed functions in Transformer models. Starting with PyTorch 1.13, the PyTorch Core API is implemented with accelerated operations to deliver up to 2x-4x speedups on many Transformer models, such as BERT and XLM-R. Accelerated operations are based on (1) operator and kernel fusion and (2) exploiting the sparsity created by variable-sequence-length NLP batches. In addition to improving MultiHeadAttention with fastpath, Better Transformer includes sparsity support for the MultiHeadAttention and TransformerEncoder modules to take advantage of variable-sequence-length information with Nested Tensors for NLP models. At present, we enable the torchtext and Hugging Face domain libraries with Better Transformer, delivering significant speedups for text, image, and audio models. Starting with the next release, PyTorch core will include even faster fused kernels and training support. You can preview these features today with PyTorch Nightlies, the nightly preview builds of the upcoming PyTorch release.
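A minimal sketch of opting into the fastpath with the core `nn.TransformerEncoder` module (hyperparameters and mask are illustrative; the fastpath generally engages in inference mode on PyTorch 1.12/1.13+ when the documented conditions are met):

```python
import torch

layer = torch.nn.TransformerEncoderLayer(d_model=256, nhead=8, batch_first=True)
encoder = torch.nn.TransformerEncoder(layer, num_layers=6, enable_nested_tensor=True)
encoder.eval()

src = torch.randn(4, 32, 256)                       # (batch, seq, features)
padding_mask = torch.zeros(4, 32, dtype=torch.bool)
padding_mask[:, 20:] = True                         # pretend the tail is padding

with torch.inference_mode():
    out = encoder(src, src_key_padding_mask=padding_mask)
```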

        +

        - {{ poster.link }} + LIBRARIES

        - {% endif %} +
        +
        + +
        +
        + + + +
        + + PiPPy: Automated Pipeline Parallelism for PyTorch + +
        +
        Ke Wen, Pavel Belevich, Anjali Sridhar
        +

        PiPPy is a library that provides automated pipeline parallelism for PyTorch models. With compiler techniques, PiPPy splits a model into pipeline stages without requiring model changes. PiPPy also provides a distributed runtime that distributes the split stages to multiple devices and hosts and orchestrates micro-batch execution in an overlapped fashion. We demonstrate application of PiPPy to Hugging Face models achieving 3x speedup on cloud platforms.

        +

        - {{ poster.categories }} + LIBRARIES

        - {% endfor %} + +
        +
        + + + +
        + + Practical Guide on PyTorch Inference Using AWS Inferentia + +
        +
        Keita Watanabe
        +

        In this session we will go through step-by-step how to conduct the inference process of machine learning models using Inferentia. In addition, we compare the inference performance with GPU and discuss the cost advantage. In the later part of the session, we will also cover model deployment on Kubernetes.

        + +

        + OPTIMIZATION +

        +
        +
        + +
        +
        + + + +
        + + PyG Performance Optimization for CPU + +
        +
        Mingfei Ma
        +

Accelerating PyG CPU performance with faster sparse aggregation. PyG is a library built upon PyTorch to easily write and train Graph Neural Networks, and it relies heavily on the mechanism of message passing for information aggregation. We have optimized critical bottlenecks of message passing from PyTorch, including: 1. Scatter Reduce: maps to the classic PyG use case when the EdgeIndex is stored in COO memory format. 2. SpMM Reduce: maps to the use case when the EdgeIndex is stored in CSR memory format.
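A rough sketch of the COO-style aggregation pattern being optimized, expressed with core PyTorch's `scatter_reduce_` (PyG's own kernels are not shown; the toy graph below is hypothetical):

```python
import torch

num_nodes, feat_dim = 5, 8
x = torch.randn(num_nodes, feat_dim)                 # node features
edge_index = torch.tensor([[0, 1, 1, 2, 3, 4],       # source nodes
                           [1, 0, 2, 1, 4, 3]])      # destination nodes (COO)

messages = x[edge_index[0]]                          # one message per edge
index = edge_index[1].unsqueeze(-1).expand_as(messages)

# Sum-aggregate incoming messages per destination node
out = torch.zeros(num_nodes, feat_dim)
out.scatter_reduce_(0, index, messages, reduce="sum", include_self=False)
```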

        + +

        + OPTIMIZATION +

        +
        +
        + +
        +
        + + + +
        + + Quantization in PyTorch 2.0 Export + +
        +
        Jerry Zhang
        +

Currently, PyTorch Architecture Optimization (torch.ao) offers two quantization flow tools: eager mode quantization (beta) and FX graph mode quantization (prototype). With PyTorch 2.0 coming up, we are going to redesign quantization on top of the PyTorch 2.0 export path. This talk will introduce our plans for supporting quantization in the PyTorch 2.0 export path, its main advantages over the previous tools, and how modeling developers and backend developers will interact with this flow.

        + +

        + OPTIMIZATION +

        +
        +
        + +
        +
        + + + +
        + + Torch-TensorRT: A Compiler for Accelerating PyTorch Inference Using TensorRT + +
        +
        Naren Dasan, Dheeraj Peri, Bo Wang, Apurba Bose, George Stefanakis, Nick Comly, Wei Wei, Shirong Wu, Yinghai Lu
        +

Torch-TensorRT is an open-source compiler targeting NVIDIA GPUs for high-performance deep-learning inference in PyTorch. It combines the usability of PyTorch with the performance of TensorRT, allowing for easy optimization of inference workloads on NVIDIA GPUs. Torch-TensorRT supports all classes of optimizations in TensorRT, including reduced mixed precision down to INT8, through simple Python & C++ APIs designed to work directly from PyTorch. Torch-TensorRT outputs standard PyTorch modules as well as the TorchScript format to allow for a completely self-contained, portable & static module with TensorRT engines embedded. We present recent improvements to Torch-TensorRT, including the new FX frontend, which allows developers to use a full Python workflow for optimizing models and to extend Torch-TensorRT in Python, and the unified Torch-TensorRT Runtime, which enables hybrid FX + TorchScript workflows, and we discuss future work for the project.
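A hedged sketch of the documented compile workflow (the model choice, input shape, and precision set are illustrative):

```python
import torch
import torch_tensorrt
import torchvision

model = torchvision.models.resnet18().eval().cuda()

trt_module = torch_tensorrt.compile(
    model,
    inputs=[torch_tensorrt.Input((1, 3, 224, 224))],  # static input spec
    enabled_precisions={torch.half},                   # allow FP16 TensorRT kernels
)

out = trt_module(torch.randn(1, 3, 224, 224, device="cuda"))
```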

        + +

        + OPTIMIZATION +

        +
        +
        + +
        +
        + + + +
        + + Accelerating Inference with PyTorch by Leveraging Graph Fusions With oneDNN Graph + +
        +
        Sanchit Jain
        +

        The open-source oneDNN Graph library extends oneDNN with a flexible graph API to maximize the optimization opportunities for generating efficient code on AI hardware (currently x86-64 CPUs, but GPU support is on the way). It automatically identifies the graph partitions to be accelerated via fusion. Its fusion patterns entail fusing compute-intensive operations such as convolution, matmul and their neighbor operations for both inference and training use cases. Since PyTorch 1.12, oneDNN Graph has been supported as an experimental feature to speed up inference with Float32 datatype on x86-64 CPUs. Support for inference with oneDNN Graph using BFloat16 datatype exists in the PyTorch master branch, and hence also in nightly PyTorch releases. Intel Extension for PyTorch is an open-source library that builds on top of PyTorch, and can be thought of as a 'staging-ground' for optimizations in PyTorch from Intel. It leverages oneDNN Graph for inference with int8 datatype. This poster presents reproducible results with PyTorch’s TorchBench benchmarking suite to demonstrate the inference speedup achieved with PyTorch & oneDNN Graph using Float32, BFloat16 & int8 datatypes.
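A sketch of how the experimental feature is enabled for TorchScript inference, assuming PyTorch 1.12+ on an x86-64 CPU (the model is a stand-in):

```python
import torch
import torchvision

# Opt in to oneDNN Graph fusion for TorchScript inference on x86-64 CPUs
torch.jit.enable_onednn_fusion(True)

model = torchvision.models.resnet50().eval()
example = torch.rand(1, 3, 224, 224)

with torch.no_grad():
    traced = torch.jit.trace(model, example)
    traced = torch.jit.freeze(traced)
    traced(example)   # first iterations trigger partitioning and fusion
    traced(example)
```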

        + +

        + OPTIMIZATION +

        +
        +
        + +
        +
        + + + +
        + + Back to Python: Extending PyTorch Without Touching C++ + +
        +
        Alban Desmaison
        +

This poster presents the new extension points that the PyTorch team has designed to allow users to extend PyTorch from Python. We will cover an introduction to Tensor subclassing, Modes, and the torch library. We will briefly describe each extension point and talk through examples such as memory profiling, logging used operators, quantization, and custom sparse kernels, all in less than 100 LOC. We will also introduce the new ways you can add new devices and author kernels without the need to modify PyTorch directly.
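As a taste of the Tensor subclassing extension point, here is a toy subclass that logs every operator it participates in (an illustration of the mechanism, not the poster's exact example):

```python
import torch

class LoggingTensor(torch.Tensor):
    """Tensor subclass that logs every torch operator it is involved in."""
    @classmethod
    def __torch_function__(cls, func, types, args=(), kwargs=None):
        print(f"called: {func.__name__}")
        return super().__torch_function__(func, types, args, kwargs or {})

x = torch.randn(3, 3).as_subclass(LoggingTensor)
y = (x @ x).relu().sum()   # logs __matmul__, relu, sum as they are dispatched
```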

        + +

        + OTHER +

        +
        +
        + +
        +
        + + + +
        + + Functionalization in PyTorch + +
        +
        Brian Hirsh
        +

        Functionalization is a way to remove mutations from arbitrary PyTorch programs sent to downstream compilers. The PyTorch 2.0 stack is all about capturing graphs of PyTorch operations and sending them off to a compiler to get better performance. PyTorch programs can mutate and alias state, making them unfriendly to compilers. Functionalization is a technique to take a program full of PyTorch operators, including mutable and aliasing operators, and remove all mutations from the program while preserving semantics.
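A hand-written before/after pair illustrating the idea; this is purely conceptual, since the real pass operates on captured graphs rather than Python source:

```python
import torch

# A program with mutation and aliasing: hard for graph compilers to reason about
def f(x):
    y = x.clone()
    y.add_(1)          # in-place mutation
    z = y.view(-1)     # alias of y
    z.mul_(2)          # mutating the alias also mutates y
    return y

# A functionalized equivalent: identical outputs, no mutation, no aliasing
def f_functional(x):
    y = x.clone()
    y = torch.add(y, 1)
    z = torch.reshape(y, (-1,))
    z = torch.mul(z, 2)
    return z.reshape(x.shape)

x = torch.randn(2, 3)
print(torch.allclose(f(x), f_functional(x)))   # True
```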

        + +

        + OTHER +

        +
        +
        + +
        +
        + + + +
        + + Walmart Search: Serving Models at a Scale on TorchServe + +
        +
        Pankaj Takawale, Dagshayani Kamalaharan, Zbigniew Gasiorek, Rahul Sharnagat
        +

Walmart Search has embarked on the journey of adopting deep learning in the search ecosystem to improve search relevance in various areas. As our pilot use case, we wanted to serve the computationally intensive BERT Base model at runtime, with the objective of achieving low latency and high throughput. We had JVM-hosted web applications loading and serving multiple models, and experimental models were being loaded onto the same applications. These models are large in size and computationally expensive. We were facing the following limitations with this approach: refreshing a model with the latest version or adding a new experimental model required an application deployment; memory pressure on a single application increased; startup time was slow due to loading multiple ML models during startup; and concurrency was not beneficial due to limited CPU (metrics on concurrent vs. sequential model prediction).

        + +

        + OTHER +

        +
        +
        + +
        +
        + + + +
        + + TorchX: From Local Development to Kubernetes and Back + +
        +
        Joe Doliner, Jimmy Whitaker
        +

TorchX is incredibly useful for developing PyTorch applications quickly, but when it comes to deployment, nothing is easy. With Docker development, Kubernetes, and custom schedulers, there’s a lot to learn. In this talk, we’ll discuss how organizations can deploy to production, why TorchX is a great system for this, and lessons we learned so you can avoid the same pitfalls.

        + +

        + PRODUCTION +

        +
        +
        + +
        +
        + + + +
        + + Training at Scale Using Fully Sharded Data Parallel (FSDP) with PyTorch/XLA + +
        +
        Shauheen Zahirazami, Jack Cao, Blake Hechtman, Alex Wertheim, Ronghang Hu
        +

PyTorch/XLA enables PyTorch users to run their models on XLA devices, including Google's Cloud TPUs. The latest improvements in PyTorch/XLA enable training very large PyTorch models using FSDP. In this work we present benchmarks and hardware FLOPs utilization for training HuggingFace GPT-2 on Cloud TPU v4.

        + +

        + PRODUCTION +

        +
        +
        + +
        +
        + + + +
        + + FSDP Production Readiness + +
        +
        Rohan Varma, Andrew Gu
        +

        This talk dives into recent advances in PyTorch Fully Sharded Data Parallel (FSDP) that have enabled better throughput, memory savings, and extensibility. These improvements have unblocked using FSDP for models of different modalities and for varying model and data sizes. We will share best practices to apply these features to specific use cases such as XLMR, FLAVA, ViT, DHEN, and GPT3-style models.
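A minimal sketch of wrapping a model in FSDP (typically launched with torchrun; real workloads would also configure auto-wrap policies, mixed precision, and sharding strategies, which are omitted here):

```python
import torch
import torch.distributed as dist
from torch.distributed.fsdp import FullyShardedDataParallel as FSDP

dist.init_process_group(backend="nccl")   # RANK/WORLD_SIZE set by torchrun
torch.cuda.set_device(dist.get_rank() % torch.cuda.device_count())

model = torch.nn.Sequential(              # stand-in for a large model
    torch.nn.Linear(1024, 4096), torch.nn.GELU(), torch.nn.Linear(4096, 1024)
).cuda()

model = FSDP(model)                       # parameters are sharded across ranks
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)

x = torch.randn(8, 1024, device="cuda")
loss = model(x).square().mean()
loss.backward()
optimizer.step()
```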

        + +

        + PRODUCTION +

        +
        +
        + +
        +
        + + + +
        + + Orchestrating Pytorch Workflows With Kubeflow Pipelines and TorchX + +
        +
        Erwin Huizenga, Nikita Namjoshi
        +

TorchX is a universal job launcher for PyTorch applications that helps ML practitioners speed up iteration time and supports end-to-end production. In this talk, we show you how to build and run TorchX components as a pipeline using the Kubeflow Pipelines (KFP) DSL. We go into detail on how to use KFP and TorchX to build components and how to use the KFP DSL to orchestrate and run ML workflows.

        + +

        + PRODUCTION +

        +
        +
        + +
        +
        + + + +
        + + A Community- led and OSS Ecosystem of ML Compiler and Infrastructure Projects + +
        +
        Shauheen Zahirazami, James Rubin, Mehdi Amini, Thea Lamkin, Eugene Burmako, Navid Khajouei
        +

        ML development is often stymied by incompatibilities between frameworks and hardware, forcing developers to compromise on technologies when building ML solutions. OpenXLA is a community-led and open-source ecosystem of ML compiler and infrastructure projects being co-developed by AI/ML leaders including Alibaba, Amazon Web Services, AMD, Arm, Apple, Google, Intel, Meta, NVIDIA, and more. It will address this challenge by letting ML developers build their models on leading frameworks and execute them with high performance across any hardware backend. This flexibility will let developers make the right choice for their project, rather than being locked into decisions by closed systems. Our community will start by collaboratively evolving the XLA compiler and StableHLO, a portable ML compute operation set that makes frameworks easier to deploy across different hardware options.

        + +

        + PRODUCTION +

        +
        +
        + +
        +
        + + + +
        + + Squeezing GPU Memory Usage in PyTorch + +
        +
        Mao Lin, Keren Zhou, Penfei Su
        +

        The limited GPU memory resources can often hinder the performance of GPU-accelerated applications. While PyTorch’s Caching Allocator aims to minimize the number of expensive memory allocations and deallocations and maximize the efficient utilization of GPU memory resources, our study of common deep learning models revealed significant memory fragmentation problems. In some cases, up to 50% of GPU memory is wasted. To better understand the root causes of memory fragmentation, we developed a tool that visualizes GPU memory usage in two ways: the allocator view and the block view. The allocator view presents memory usage with each allocation or deallocation event, and the block view shows the changes in specific memory blocks over time. Our analysis revealed the considerable potential to save GPU memory, which would relieve the bottleneck of limited resources. By employing strategies such as swapping, activation recomputation, and memory defragmentation, we were able to reduce GPU memory waste significantly.
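For reference, core PyTorch already exposes allocator counters and snapshots that this kind of analysis can build on; a small sketch (the poster's allocator/block visualizer itself is a separate tool):

```python
import torch

# Allocate and then free every other block to induce some fragmentation
x = [torch.randn(1024, 1024, device="cuda") for _ in range(8)]
del x[::2]

stats = torch.cuda.memory_stats()
print(stats["allocated_bytes.all.current"])  # bytes in live allocations
print(stats["reserved_bytes.all.current"])   # bytes held by the caching allocator
print(torch.cuda.memory_summary())           # human-readable allocator report

# Per-segment/block trace that a visualization tool can consume
snapshot = torch.cuda.memory_snapshot()
```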

        + +

        + TOOLS +

        +
        +
        + +
        +
        + + + +
        + + 'Brainchop': In Browser MRI Volumetric Segmentation and Rendering + +
        +
        Mohamed Masoud, Farfalla Hu, Sergey Plis
        +

In the brainchop project, we bring high-fidelity pre-trained deep learning models for volumetric analysis of structural magnetic resonance imaging (MRI) right to the browsers of scientists and clinicians, with no requirement on their technical skills in setting up AI solutions, all within an extensible open-source framework. Our tool is the first front-end MRI segmentation tool on the web that supports full-brain volumetric processing in a single pass inside a browser. This property is powered by our lightweight and reliable deep learning model Meshnet, which enables volumetric processing of the entire brain at once, leading to increased accuracy with modest computational requirements. High-quality client-side processing solves the privacy problem, as the data does not need to leave the client. Moreover, the browser-based implementation is able to take advantage of available hardware acceleration regardless of the brand or architecture. GitHub: https://github.com/neuroneural/brainchop

        + +

        + https://github.com/neuroneural/brainchop +

        + +

        + TOOLS +

        +
        +
        + +
        +
        + + + +
        + + TorchBench: Quantifying PyTorch Performance During the Development Loop + +
        +
        Xu Zhao, Will Constable, David Berard, Taylor Robie, Eric Han, Adnan Aziz
        +

Holding the line on performance is challenging for ML frameworks like PyTorch. Existing AI benchmarks like MLPerf are end-to-end and therefore require large volumes of data, at-scale GPU clusters, and long benchmarking time. We develop TorchBench, a novel AI benchmark suite characterized by minimal data inputs, a single GPU, and milliseconds-per-test latencies. TorchBench is now deployed as part of the PyTorch nightly release process, guarding against performance/correctness regressions and testing experimental PyTorch features on SOTA machine learning models.

        + +

        + TOOLS +

        +
        +
        + +
        +
        + + + +
        + + Democratizing AI for Biology With OpenFold + +
        +
        Gustaf Ahdritz, Sachin Kadyan, Will Gerecke, Luna Xia, Nazim Bouatta, Mohammed AlQuraishi
        +

        OpenFold, developed by Columbia University, is an open-source protein structure prediction model implemented with PyTorch. The goal of OpenFold is to verify that AlphaFold 2 — DeepMind's protein structure prediction model — can be reproduced from scratch and beyond that, make components of the system available to like-minded researchers and academics so they can build on top of it. During this research, Weights & Biases was used to accelerate OpenFold’s reproduction of AlphaFold 2. The collaborative nature of W&B allowed for insights to scale from a single researcher to the entire team and helped solve the reproducibility challenge in ML.

        + +

        + TOOLS +

        +
        +
        +
        @@ -72,4 +1098,306 @@
        {{ poster.authors | join: ", "}}
        $(this).toggle($(this).text().toLowerCase().indexOf(input) > -1); }); }); - \ No newline at end of file + + +
        +
        +
        +
        +

        Docs

        +

        Access comprehensive developer documentation for PyTorch

        + View Docs +
        + +
        +

        Tutorials

        +

        Get in-depth tutorials for beginners and advanced developers

        + View Tutorials +
        + +
        +

        Resources

        +

        Find development resources and get your questions answered

        + View Resources +
        +
        +
        +
        + +
        + +
        + +
        +
        +
        +
        + + +
        +
        +
        + + +
        + + + + + + + + + + + + + + + + + + + + + + + diff --git a/ecosystem/ptdd/2021.html b/ecosystem/ptdd/2021.html index e46be84cf225..171c1d12762b 100644 --- a/ecosystem/ptdd/2021.html +++ b/ecosystem/ptdd/2021.html @@ -1,12 +1,310 @@ ---- -layout: default -title: Developer's Day 2021 -permalink: ecosystem/ptdd/2021 -background-class: ecosystem-join-background -body-class: ecosystem ---- - -
        + + + + + + + + + + + + + Developer's Day 2021 | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
        +
        + Join us at PyTorch Conference in San Francisco, October 22-23. CFP open now! Learn more. +
        +
        +
        +
        + + +
        + + + + + + + + +
        + +
        +
        + + +
        + +

        PyTorch Developer Day

        2021

        @@ -40,43 +338,1325 @@

        2021


        Posters

        - {% for poster in site.data.ecosystem.ptdd['2021'].posters %} +
        - {% if poster.poster_link %} - - {% endif %} +
        - {% if poster.poster_link %} - {{ poster.title }} - {% else %} {{ poster.title }} {% endif %} + + xaitk-saliency: Saliency built for analytics and autonomy applications +
        -
        {{ poster.authors | join: ", "}}
        -

        {{ poster.description }}

        - {% if poster.link %} +
        Brian Hu, Paul Tunison, Elim Schenck, Roddy Collins, Anthony Hoogs
        +

        Despite significant progress in the past few years, machine learning-based systems are still often viewed as “black boxes,” which lack the ability to explain their output decisions to human users. Explainable artificial intelligence (XAI) attempts to help end-users understand and appropriately trust machine learning-based systems. One commonly used technique involves saliency maps, which are a form of visual explanation that reveals what an algorithm pays attention to during its decision process. We introduce the xaitk-saliency python package, an open-source, explainable AI framework and toolkit for visual saliency algorithm interfaces and implementations, built for analytics and autonomy applications. The framework is modular and easily extendable, with support for several image understanding tasks, including image classification, image similarity, and object detection. We have also recently added support for the autonomy domain, by creating saliency maps for pixel-based deep reinforcement-learning agents in environments such as ATARI. Several example notebooks are included that demo the current capabilities of the toolkit. xaitk-saliency will be of broad interest to anyone who wants to deploy AI capabilities in operational settings and needs to validate, characterize and trust AI performance across a wide range of real-world conditions and application areas using saliency maps. To learn more, please visit: https://github.com/XAITK/xaitk-saliency.

        +

        - {{ poster.link }} + https://github.com/XAITK/xaitk-saliency

        - {% endif %} +

        - {{ poster.categories }} + MEDICAL & HEALTHCARE, RESPONSIBLE AI

        - {% endfor %} -
        -
        -
        -
        - \ No newline at end of file + +
        +
        + + + +
        + + CovRNN—A collection of recurrent neural network models for predicting outcomes of COVID-19 patients using their EHR data + +
        +
        Laila Rasmy, Ziqian Xie, Bingyu Mao, Khush Patel, Wanheng Zhang, Degui Zhi
        +

        CovRNN is a collection of recurrent neural network (RNN)-based models to predict COVID-19 patients' outcomes, using their available electronic health record (EHR) data on admission, without the need for specific feature selection or missing data imputation. CovRNN is designed to predict three outcomes: in-hospital mortality, need for mechanical ventilation, and long length of stay (LOS >7 days). Predictions are made for time-to-event risk scores (survival prediction) and all-time risk scores (binary prediction). Our models were trained and validated using heterogeneous and de-identified data of 247,960 COVID-19 patients from 87 healthcare systems, derived from the Cerner® Real-World Dataset (CRWD) and 36,140 de-identified patients' data derived from the Optum® de-identified COVID-19 Electronic Health Record v. 1015 dataset (2007 - 2020). CovRNN shows higher performance than do traditional models. It achieved an area under the receiving operating characteristic (AUROC) of 93% for mortality and mechanical ventilation predictions on the CRWD test set (vs. 91·5% and 90% for light gradient boost machine (LGBM) and logistic regression (LR), respectively) and 86.5% for prediction of LOS > 7 days (vs. 81·7% and 80% for LGBM and LR, respectively). For survival prediction, CovRNN achieved a C-index of 86% for mortality and 92·6% for mechanical ventilation. External validation confirmed AUROCs in similar ranges. https://www.medrxiv.org/content/10.1101/2021.09.27.2126

        + +

        + https://github.com/ZhiGroup/CovRNN +

        + +

        + MEDICAL & HEALTHCARE, RESPONSIBLE AI +

        +
        +
        + +
        +
        + + + +
        + + Farabio - Deep learning for Biomedical Imaging + +
        +
        Sanzhar Askaruly, Nurbolat Aimakov, Alisher Iskakov, Hyewon Cho, Yujin Ahn, Myeong Hoon Choi, Hyunmo Yang, Woonggyu Jung
        +

Deep learning has transformed many aspects of industrial pipelines recently. Scientists involved in biomedical imaging research are also benefiting from the power of AI to tackle complex challenges. Although the academic community has widely accepted image processing tools such as scikit-image and ImageJ, there is still a need for a tool that integrates deep learning into biomedical image analysis. We propose a minimal but convenient Python package based on PyTorch with common deep learning models, extended with flexible trainers and medical datasets. In this work, we also share a theoretical dive in the form of a course, as well as minimal tutorials to run Android applications containing models trained with Farabio.

        + +

        + https://github.com/tuttelikz/farabio +

        + +

        + MEDICAL & HEALTHCARE, RESPONSIBLE AI +

        +
        +
        + +
        +
        + + + +
        + + TorchIO: Pre-processing & Augmentation of Medical Images for Deep Learning Applications + +
        +
        Fernando Pérez-García, Rachel Sparks, Sébastien Ourselin
        +

Processing of medical images such as MRI or CT presents different challenges compared to the RGB images typically used in computer vision: a lack of labels for large datasets, high computational costs, and the need for metadata to describe the physical properties of voxels. Data augmentation is used to artificially increase the size of the training datasets. Training with image patches decreases the need for computational power. Spatial metadata needs to be carefully taken into account in order to ensure a correct alignment and orientation of volumes. We present TorchIO, an open-source Python library to enable efficient loading, preprocessing, augmentation and patch-based sampling of medical images for deep learning. TorchIO follows the style of PyTorch and integrates standard medical image processing libraries to efficiently process images during training of neural networks. TorchIO transforms can be easily composed, reproduced, traced and extended. We provide multiple generic preprocessing and augmentation operations as well as simulation of MRI-specific artifacts. TorchIO was developed to help researchers standardize medical image processing pipelines and allow them to focus on the deep learning experiments. It encourages good open-science practices, as it supports experiment reproducibility and is version-controlled so that the software can be cited precisely. Due to its modularity, the library is compatible with other frameworks for deep learning with medical images.
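A minimal sketch of composing TorchIO transforms on a subject (the file paths are hypothetical and exact transform arguments may vary by version):

```python
import torchio as tio

# Hypothetical NIfTI paths; any image readable by TorchIO works
subject = tio.Subject(
    mri=tio.ScalarImage("t1.nii.gz"),
    seg=tio.LabelMap("labels.nii.gz"),
)

transform = tio.Compose([
    tio.ToCanonical(),              # reorient using the spatial metadata
    tio.RandomAffine(degrees=10),   # spatial augmentation
    tio.RandomNoise(),              # intensity augmentation
    tio.RandomGhosting(),           # MRI-specific artifact simulation
])

augmented = transform(subject)
```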

        + +

        + https://github.com/fepegar/torchio/ +

        + +

        + MEDICAL & HEALTHCARE, RESPONSIBLE AI +

        +
        +
        + +
        +
        + + + +
        + + MONAI: A Domain Specialized Library for Healthcare Imaging + +
        +
        Michael Zephyr, Prerna Dogra, Richard Brown, Wenqi Li, Eric Kerfoot
        +

        Healthcare image analysis for both radiology and pathology is increasingly being addressed with deep-learning-based solutions. These applications have specific requirements to support various imaging modalities like MR, CT, ultrasound, digital pathology, etc. It is a substantial effort for researchers in the field to develop custom functionalities to handle these requirements. Consequently, there has been duplication of effort, and as a result, researchers have incompatible tools, which makes it hard to collaborate. MONAI stands for Medical Open Network for AI. Its mission is to accelerate the development of healthcare imaging solutions by providing domain-specialized building blocks and a common foundation for the community to converge in a native PyTorch paradigm.

        + +

        + https://monai.io/ +

        + +

        + MEDICAL & HEALTHCARE, RESPONSIBLE AI +

        +
        +
        + +
        +
        + + + +
        + + A Framework for Bayesian Neural Networks + +
        +
        Sahar Karimi, Beliz Gokkaya, Audrey Flower, Ehsan Emamjomeh-Zadeh, Adly Templeton, Ilknur Kaynar Kabul, Erik Meijer
        +

        We are presenting a framework for building Bayesian Neural Networks (BNN). One of the critical use cases of BNNs is uncertainty quantification of ML predictions in deep learning models. Uncertainty quantification leads to more robust and reliable ML systems that are often employed to prevent catastrophic outcomes of overconfident predictions especially in sensitive applications such as integrity, medical imaging and treatments, self driving cars, etc.. Our framework provides tools to build BNN models, estimate the uncertainty of their predictions, and transform existing models into their BNN counterparts. We discuss the building blocks and API of our framework along with a few examples and future directions.

        + +

        + MEDICAL & HEALTHCARE, RESPONSIBLE AI +

        +
        +
        + +
        +
        + + + +
        + + Revamp of torchvision datasets and transforms + +
        +
        Philip Meier, torchvision team, torchdata team
        +

        torchvision provides a lot of image and video datasets as well as transformations for research and prototyping. In fact, the very first release of torchvision in 2016 was all about these two submodules. Since their inception their extent has grown organically and became hard to maintain and sometimes also hard to use. Over the years we have gathered a lot of user feedback and decided to revamp the datasets and transforms. This poster will showcase the current state of the rework and compare it to the hopefully soon to be legacy API.

        + +

        + https://pytorchvideo.org/ +

        + +

        + AUDIO, IMAGE & VIDEO, VISION +

        +
        +
        + +
        +
        + + + +
        + + OpenMMLab: Open-Source Toolboxes for Artificial Intelligence + +
        +
        Wenwei Zhang, Han Lyu, Kai Chen
        +

OpenMMLab builds open-source toolboxes for computer vision. It aims to 1) provide high-quality codebases to reduce the difficulty of algorithm reimplementation; 2) create efficient deployment toolchains targeting a variety of inference engines and devices; 3) build a solid foundation for the community to bridge the gap between academic research and industrial applications. Based on PyTorch, OpenMMLab develops MMCV to provide unified abstract interfaces and common utilities, which serve as the foundation of the whole system. Since the initial release in October 2018, OpenMMLab has released 15+ toolboxes covering different research areas, implemented 200+ algorithms, and released 1800+ pre-trained models. With tighter collaboration with the community, OpenMMLab will open source more toolboxes and full-stack toolchains in the future.

        + +

        + openmmlab.com +

        + +

        + AUDIO, IMAGE & VIDEO, VISION +

        +
        +
        + +
        +
        + + + +
        + + Flood Segmentation on Sentinel-1 SAR Imagery with Semi-Supervised Learning + +
        +
        Siddha Ganju, Sayak Paul
        +

Floods wreak havoc throughout the world, causing billions of dollars in damages and uprooting communities, ecosystems and economies. Aligning flood extent mapping with local topography can provide a plan of action that the disaster response team can consider. Thus, remote flood level estimation via satellites like Sentinel-1 can prove to be remedial. The Emerging Techniques in Computational Intelligence (ETCI) competition on Flood Detection tasked participants with predicting flooded pixels after training with synthetic aperture radar (SAR) images in a supervised setting. We use a cyclical approach involving three stages: (1) training an ensemble of multiple UNet architectures with the available high- and low-confidence labeled data and generating pseudo labels (low-confidence labels) on the entire unlabeled test dataset; (2) filtering out the quality generated labels; and (3) combining the generated labels with the previously available high-confidence labeled dataset. This assimilated dataset is used for the next round of training ensemble models, and the cyclical process is repeated until the performance improvement plateaus. Additionally, we post-process our results with Conditional Random Fields. Our approach sets the second-highest score on the public hold-out test leaderboard for the ETCI competition with 0.7654 IoU. To the best of our knowledge, this is one of the first works to try semi-supervised learning to improve flood segmentation models.

        + +

        + https://github.com/sidgan/ETCI-2021-Competition-on-FLood-Detection +

        + +

        + AUDIO, IMAGE & VIDEO, VISION +

        +
        +
        + +
        +
        + + + +
        + + Real time Speech Enhancement + +
        +
        Xiaoyu Liu, James Wagner, Roy Fejgin, Joan Serra, Santiago Pascual, Cong Zhou, Jordi Pons, Vivek Kumar
        +

        Speech enhancement is a fundamental audio processing task that has experienced a radical change with the advent of deep learning technologies. We will overview the main characteristics of the task and the key principles of existing deep learning solutions. We will be presenting the past and present work done by our group with the overall goal of delivering the best possible intelligibility and sound quality. Finally, we will provide our view on the future of speech enhancement and show how our current long-term research aligns with such a view.

        + +

        + AUDIO, IMAGE & VIDEO, VISION +

        +
        +
        + +
        +
        + + + +
        + + Kornia AI: Low Level Computer Vision for AI + +
        +
        Edgar Riba, Dmytro Mishkin, Jian Shi, Luis Ferraz
        +

        Kornia is a differentiable library that allows classical computer vision to be integrated into deep learning models. It consists of a set of routines and differentiable modules to solve generic computer vision problems. At its core, the package uses PyTorch as its main backend both for efficiency and to take advantage of the reverse-mode auto-differentiation to define and compute the gradient of complex functions.
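A small sketch of the differentiable classical-operator idea, assuming kornia's `filters.gaussian_blur2d` routine:

```python
import torch
import kornia

img = torch.rand(1, 3, 64, 64, requires_grad=True)   # (B, C, H, W)

# A classical operator implemented differentiably on top of PyTorch
blurred = kornia.filters.gaussian_blur2d(img, kernel_size=(5, 5), sigma=(1.5, 1.5))

# Gradients flow through the operator like any other PyTorch layer
blurred.mean().backward()
print(img.grad.shape)   # torch.Size([1, 3, 64, 64])
```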

        + +

        + https://kornia.github.io// +

        + +

        + AUDIO, IMAGE & VIDEO, VISION +

        +
        +
        + +
        +
        + + + +
        + + Video Transformer Network + +
        +
        Daniel Neimark, Omri Bar, Maya Zohar, Dotan Asselmann
        +

        This paper presents VTN, a transformer-based framework for video recognition. Inspired by recent developments in vision transformers, we ditch the standard approach in video action recognition that relies on 3D ConvNets and introduce a method that classifies actions by attending to the entire video sequence information. Our approach is generic and builds on top of any given 2D spatial network. In terms of wall runtime, it trains 16.1× faster and runs 5.1× faster during inference while maintaining competitive accuracy compared to other state-of-the-art methods. It enables whole video analysis, via a single end-to-end pass, while requiring 1.5× fewer GFLOPs. We report competitive results on Kinetics-400 and present an ablation study of VTN properties and the trade-off between accuracy and inference speed. We hope our approach will serve as a new baseline and start a fresh line of research in the video recognition domain. Code and models are available at: https://github.com/bomri/SlowFast/blob/master/projects/vtn/README.md . See paper: https://arxiv.org/abs/2102.00719

        + +

        + https://github.com/bomri/SlowFast/blob/master/projects/vtn/README.md +

        + +

        + AUDIO, IMAGE & VIDEO, VISION +

        +
        +
        + +
        +
        + + + +
        + + DLRT: Ultra Low-Bit Precision Inference Engine for PyTorch on CPU + +
        +
Dr. Ehsan Saboori, Dr. Sudhakar Sah, MohammadHossein AskariHemmat, Saad Ashfaq, Alex Hoffman, Olivier Mastropietro, Davis Sawyer
        +

        The emergence of Deep Neural Networks (DNNs) on embedded and low-end devices holds tremendous potential to expand the adoption of AI technologies to wider audiences. However, making DNNs applicable for inference on such devices using techniques such as quantization and model compression, while maintaining model accuracy, remains a challenge for production deployment. Furthermore, there is a lack of inference engines available in any AI framework to run such low precision networks. Our work presents a novel inference engine and model compression framework that automatically enables PyTorch developers to quantize and run their deep learning models at 2bit and 1bit precision, making them faster, smaller and more energy-efficient in production. DLRT empowers PyTorch developers to unlock advanced AI on low-power CPUs, starting with ARM CPUs and MCUs. This work allows AI researchers and practitioners to achieve 10x faster inference and near-GPU level performance on a fraction of the power and cost.

        + +

        + https://github.com/deeplite +

        + +

        + PERFORMANCE, PRODUCTION & DEPLOYMENT +

        +
        +
        + +
        +
        + + + +
        + + Serving PyTorch Models in Production at Walmart Search + +
        +
        Adway Dhillo, Nidhin Pattaniyil
        +

This poster is for data scientists or ML engineers looking to productionize their PyTorch models. It covers the post-training steps that should be taken to optimize the model, such as quantization and TorchScript, and walks the user through packaging and serving the model with Facebook’s TorchServe. It also covers the benefits of script mode and the PyTorch JIT. Benefits of TorchServe include high-performance serving, multi-model serving, model versioning for A/B testing, server-side batching, and support for pre- and post-processing.
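A hedged sketch of the post-training steps described (the model below is a stand-in; TorchServe packaging happens afterwards with the torch-model-archiver CLI):

```python
import torch

model = torch.nn.Sequential(            # stand-in for a trained model
    torch.nn.Linear(768, 768), torch.nn.ReLU(), torch.nn.Linear(768, 2)
).eval()

# Post-training dynamic quantization of the Linear layers (int8 weights)
quantized = torch.quantization.quantize_dynamic(
    model, {torch.nn.Linear}, dtype=torch.qint8
)

# Script mode produces a Python-free, JIT-optimized artifact
scripted = torch.jit.script(quantized)
scripted.save("model.pt")
# model.pt can then be packaged into a .mar archive for TorchServe.
```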

        + +

        + https://pytorch.org/serve/ +

        + +

        + PERFORMANCE, PRODUCTION & DEPLOYMENT +

        +
        +
        + +
        +
        + + + +
        + + CleanRL: high-quality single file implementation of Deep Reinforcement Learning algorithms with research-friendly features + +
        +
        Shengyi Huang, Rousslan Fernand Julien Dossa, Chang Ye, Jeff Braga
        +

CleanRL is an open-source library that provides high-quality single-file implementations of Deep Reinforcement Learning algorithms. It provides a simpler yet scalable development experience by having a straightforward codebase and integrating production tools to help interact with and scale experiments. In CleanRL, we put all details of an algorithm into a single file, making these performance-relevant details easier to recognize. Additionally, an experiment tracking feature is available to help log metrics, hyperparameters, videos of an agent's gameplay, dependencies, and more to the cloud. Despite the succinct implementations, we have also designed tools to help scale, at one point orchestrating experiments on more than 2000 machines simultaneously via Docker and cloud providers. The source code can be found at https://github.com/vwxyzjn/cleanrl.

        + +

        + https://github.com/vwxyzjn/cleanrl/ +

        + +

        + PERFORMANCE, PRODUCTION & DEPLOYMENT +

        +
        +
        + +
        +
        + + + +
        + + Deploying a Food Classifier on PyTorch Mobile + +
        +
        Nidhin Pattaniyil, Reshama Shaikh
        +

As technology improves, so does the adoption of deep learning models. Additionally, since time spent on mobile devices is greater than on desktop, the demand for applications running natively on mobile devices is also high. This demo walks through a complete example of training a deep learning vision classifier on the Food-101 dataset using PyTorch, and then deploying it on web and mobile using TorchServe and PyTorch Mobile.

        + +

        + https://github.com/npatta01/pytorch-food +

        + +

        + PERFORMANCE, PRODUCTION & DEPLOYMENT +

        +
        +
        + +
        +
        + + + +
        + + Torch-TensorRT: Accelerating Inference Performance Directly from PyTorch using TensorRT + +
        +
        Naren Dasan, Nick Comly, Dheeraj Peri, Anurag Dixit, Abhiram Iyer, Bo Wang, Arvind Sridhar, Boris Fomitchev, Josh Park
        +

Learn how to accelerate PyTorch inference directly from the framework for model deployment. The PyTorch integration for TensorRT makes the performance of TensorRT's GPU optimizations available in PyTorch for any model. We will walk you through how, with 3 lines of code, you can go from a trained model to optimized TensorRT-embedded TorchScript, ready to deploy to a production environment.

        + +

        + https://github.com/NVIDIA/Torch-TensorRT/ +

        + +

        + PERFORMANCE, PRODUCTION & DEPLOYMENT +

        +
        +
        + +
        +
        + + + +
        + + Tensorized Deep Learning with TensorLy-Torch + +
        +
        Jean Kossaifi
        +

        Most of the data in modern machine learning (e.g. fMRI, videos, etc) is inherently multi-dimensional and leveraging that structure is crucial for good performance. Tensor methods are the natural way to achieve this and can improve deep learning and enable i) large compression ratios through a reduction of the number of parameters, ii) computational speedups, iii) improved performance and iv) better robustness. The TensorLy project provides the tools to manipulate tensors, including tensor algebra, regression and decomposition. TensorLy-Torch builds on top of this and enables tensor-based deep learning by providing out-of-the-box tensor based PyTorch layers that can be readily combined with any deep neural network architecture and takes care of things such as initialization and tensor dropout.

        + +

        + http://tensorly.org/quantum +

        + +

        + PERFORMANCE, PRODUCTION & DEPLOYMENT +

        +
        +
        + +
        +
        + + + +
        + + Catalyst-Accelerated Deep Learning R&D + +
        +
        Sergey Kolesnikov
        +

        Catalyst is a PyTorch framework for Deep Learning Research and Development. It focuses on reproducibility, rapid experimentation, and codebase reuse so you can create something new rather than write yet another train loop.

        + +

        + https://catalyst-team.com/ +

        + +

        + EXTENDING PYTORCH, APIs, PARALLEL & DISTRIBUTED TRAINING +

        +
        +
        + +
        +
        + + + +
        + + Ray Lightning: Easy Multi-node PyTorch Lightning training + +
        +
        Amog Kamsetty, Richard Liaw, Will Drevo, Michael Galarnyk
        +

PyTorch Lightning is a library that provides a high-level interface for PyTorch which helps you organize your code and reduce boilerplate. By abstracting away engineering code, it makes deep learning experiments easier to reproduce and improves developer productivity. PyTorch Lightning also includes plugins to easily parallelize your training across multiple GPUs. This parallel training, however, depends on a critical assumption: that you already have your GPU(s) set up and networked together in an efficient way for training. While you may have a managed cluster like SLURM for multi-node training on the cloud, setting up the cluster and its configuration is no easy task. Ray Lightning was created with this problem in mind to make it easy to leverage multi-node training without needing extensive infrastructure expertise. It is a simple and free plugin for PyTorch Lightning with a number of benefits, such as simple setup, easy scale-up, seamless creation of multi-node clusters on AWS/Azure/GCP via the Ray Cluster Launcher, and an integration with Ray Tune for large-scale distributed hyperparameter search with state-of-the-art algorithms.

        + +

        + https://github.com/ray-project/ray_lightning +

        + +

        + EXTENDING PYTORCH, APIs, PARALLEL & DISTRIBUTED TRAINING +

        +
        +
        + +
        +
        + + + +
        + + Supercharge your Federated Learning with Synergos + +
        +
        Jin Howe Teo, Way Yen Chen, Najib Ninaba, Choo Heng Chong Mark
        +

Data sits as the centerpiece of any machine learning endeavour, yet in many real-world projects, a single party’s data is often insufficient and needs to be augmented with data from other sources. This is unfortunately easier said than done, as there are many innate concerns (be they regulatory, ethical, commercial, etc.) stopping parties from exchanging data. Fortunately, there exists an emerging privacy-preserving machine learning technology called Federated Learning. It enables multiple parties holding local data to collaboratively train machine learning models without actually exchanging their data with one another, hence preserving the confidentiality of different parties’ local data. Today, we will be showcasing Synergos, a distributed platform built here at AI Singapore to facilitate the adoption of Federated Learning. Specifically, it strives to make the complex mechanisms involved in any federated endeavour simple, accessible and sustainable.

        + +

        + https://github.com/aimakerspace/synergos_simulator +

        + +

        + EXTENDING PYTORCH, APIs, PARALLEL & DISTRIBUTED TRAINING +

        +
        +
        + +
        +
        + + + +
        + + AdaptDL: An Open-Source Resource-Adaptive Deep Learning Training and Scheduling Framework + +
        +
        Aurick Qiao, Omkar Pangarkar, Richard Fan
        +

        AdaptDL is an open source framework and scheduling algorithm that directly optimizes cluster-wide training performance and resource utilization. By elastically re-scaling jobs, co-adapting batch sizes and learning rates, and avoiding network interference, AdaptDL improves shared-cluster training compared with alternative schedulers. AdaptDL can automatically determine the optimal number of resources given a job’s need. It will efficiently add or remove resources dynamically to ensure the highest-level performance. The AdaptDL scheduler will automatically figure out the most efficient number of GPUs to allocate to your job, based on its scalability. When the cluster load is low, your job can dynamically expand to take advantage of more GPUs. AdaptDL offers an easy-to-use API to make existing PyTorch training code elastic with adaptive batch sizes and learning rates. We have also ported AdaptDL to Ray/Tune which can automatically scale trials of an Experiment and can be used to schedule stand-alone PyTorch training jobs on the cloud in a cost-effective way.

        + +

        + https://github.com/petuum/adaptdl +

        + +

        + EXTENDING PYTORCH, APIs, PARALLEL & DISTRIBUTED TRAINING +

        +
        +
        + +
        +
        + + + +
        + + Define-by-run quantization + +
        +
        Vasiliy Kuznetsov, James Reed, Jerry Zhang
        +

Describes a prototype PyTorch workflow that performs quantization syntax transforms define-by-run in Eager mode, with: no model changes needed (compared to existing Eager mode quantization, which requires manual quant/dequant insertion and fusion) and almost no model syntax restrictions (compared to FX graph mode, which requires symbolic traceability).

        + +

        + https://pytorch.org/docs/stable/quantization.html +

        + +

        + EXTENDING PYTORCH, APIs, PARALLEL & DISTRIBUTED TRAINING +

        +
        +
        + +
        +
        + + + +
        + + Fx Numeric Suite Core APIs + +
        +
        Charles Hernandez, Vasiliy Kuznetzov, Haixin Liu
        +

Quantization can go wrong when the quantized model doesn't satisfy the accuracy we expect, and debugging such accuracy issues is not easy and is time consuming. The FX Numeric Suite Core APIs allow users to better diagnose the source of their quantization error for both statically and dynamically quantized models. This poster gives an overview of the core APIs and techniques available to users through the FX Numeric Suite, and how they can use them to improve quantization performance.

        + +

        + EXTENDING PYTORCH, APIs, PARALLEL & DISTRIBUTED TRAINING +

        +
        +
        + +
        +
        + + + +
        + + snnTorch: Training spiking neural networks using gradient-based optimization + +
        +
        J.K. Eshraghian, M. Ward, E.O. Neftci, G. Lenz, X. Wang, G. Dwivedi, M. Bennamoun, D.S. Jeong, W.D. Lu
        +

        The brain is the perfect place to look for inspiration to develop more efficient neural networks. One of the main differences with modern deep learning is that the brain encodes and processes information as spikes rather than continuous activations. Combining the training methods intended for neural networks with the sparse, spiking activity inspired by biological neurons has shown the potential to improve the power efficiency of training and inference by several orders of magnitude. snnTorch is a Python package for performing gradient-based learning with spiking neural networks. It extends the capabilities of PyTorch, taking advantage of its GPU accelerated tensor computation and applying it to networks of event-driven spiking neurons. snnTorch is designed to be intuitively used with PyTorch, as though each spiking neuron were simply another activation in a sequence of layers. It is therefore agnostic to fully-connected layers, convolutional layers, residual connections, etc. The classical challenges that have faced the neuromorphic engineering community, such as the non-differentiability of spikes, the dead neuron problem, vanishing gradients in backpropagation-through-time, are effectively solved in snnTorch and enable the user to focus on building applications that leverage sparsity and event-driven data streams.
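A minimal sketch of treating a spiking neuron like any other layer, based on snnTorch's documented `Leaky` usage (the network and input are illustrative, and the exact API may differ across versions):

```python
import torch
import snntorch as snn

beta = 0.9                                  # membrane potential decay rate
lif = snn.Leaky(beta=beta)                  # leaky integrate-and-fire neuron
fc = torch.nn.Linear(784, 10)

mem = lif.init_leaky()                      # initialize membrane state
spk_rec = []
for step in range(25):                      # iterate over simulation time steps
    x = torch.rand(1, 784)                  # stand-in for encoded spiking input
    cur = fc(x)
    spk, mem = lif(cur, mem)                # spikes out, updated membrane state
    spk_rec.append(spk)
```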

        + +

        + https://snntorch.readthedocs.io/en/latest/ +

        + +

        + EXTENDING PYTORCH, APIs, PARALLEL & DISTRIBUTED TRAINING +

        +
        +
        + +
        +
        + + + +
        + + PyTorch for R + +
        +
        Daniel Falbel
        +

Last year the PyTorch for R project was released, allowing R users to benefit from PyTorch's speed and flexibility. Since then we have had a growing community of contributors who are improving the torch for R interface, building research and products on top of it, and using it to teach deep learning methods. In this poster we showcase the past and current developments in the PyTorch for R project, as well as our plans for the future.

        + +

        + https://torch.mlverse.org/ +

        + +

        + EXTENDING PYTORCH, APIs, PARALLEL & DISTRIBUTED TRAINING +

        +
        +
        + +
        +
        + + + +
        + + ocaml-torch and tch-rs: writing and using PyTorch models using OCaml or Rust + +
        +
        Laurent Mazare
        +

The main front-end for using PyTorch is its Python API; however, LibTorch provides a lower-level C++ API to manipulate tensors, perform automatic differentiation, etc. ocaml-torch and tch-rs are two open-source projects providing wrappers for this C++ API in OCaml and Rust, respectively. Users can then write OCaml and Rust code to create new models, perform inference and training, and benefit from the guarantees provided by strongly typed, functional programming languages. They can also use TorchScript to leverage existing Python models. The libraries provide various examples, ranging from the main computer vision models to a minimalist GPT implementation. The main challenges for these bindings are to provide idiomatic APIs adapted to each language's specificities; to automatically generate most of the bindings code, as there are thousands of C++ functions to expose; and to interact properly with the memory model of each language.
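        As a hedged, Python-side illustration of the TorchScript route mentioned above: a model can be traced and saved from Python, then loaded from ocaml-torch or tch-rs through the C++ API. The model choice and file name below are placeholders:

```python
import torch
import torchvision

model = torchvision.models.resnet18(pretrained=True).eval()
example = torch.rand(1, 3, 224, 224)
traced = torch.jit.trace(model, example)   # freeze the Python model into TorchScript
traced.save("resnet18.pt")                 # e.g. loaded in Rust via tch::CModule::load("resnet18.pt")
```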

        + +

        + https://github.com/laurentMazare/ocaml-torch +

        + +

        + EXTENDING PYTORCH, APIs, PARALLEL & DISTRIBUTED TRAINING +

        +
        +
        + +
        +
        + + + +
        + + PyTorch Lightning Flash - Your PyTorch AI Factory + +
        +
        Ari Bornstein
        +

Flash is a high-level deep learning framework for fast prototyping, baselining, finetuning and solving deep learning problems. It features a set of tasks for inference and finetuning out of the box, and an easy-to-implement API to customize every step of the process for full flexibility. Flash is built for beginners with a simple API that requires very little deep learning background, and for data scientists, Kagglers, applied ML practitioners and deep learning researchers who want a quick way to get a deep learning baseline with the advanced features PyTorch Lightning offers. Flash lets you easily configure and run complex AI recipes for over 15 tasks across 7 data domains.
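        As a rough sketch of what such a recipe can look like (folder paths and hyperparameters are placeholders, and the exact Flash API has evolved across releases):

```python
import flash
from flash.image import ImageClassificationData, ImageClassifier

datamodule = ImageClassificationData.from_folders(
    train_folder="data/train",   # placeholder paths
    val_folder="data/val",
    batch_size=32,
)
model = ImageClassifier(backbone="resnet18", num_classes=datamodule.num_classes)
trainer = flash.Trainer(max_epochs=3)
trainer.finetune(model, datamodule=datamodule, strategy="freeze")
trainer.save_checkpoint("image_classifier.pt")
```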

        + +

        + https://github.com/PyTorchLightning +

        + +

        + EXTENDING PYTORCH, APIs, PARALLEL & DISTRIBUTED TRAINING +

        +
        +
        + +
        +
        + + + +
        + + PyTorch-Ignite: Training and evaluating neural networks flexibly and transparently + +
        +
        Victor Fomin, Taras Savchyn, Priyansi
        +

        PyTorch-Ignite is a high-level library to help with training and evaluating neural networks in PyTorch flexibly and transparently. PyTorch-Ignite is designed to be at the crossroads of high-level Plug & Play features and under-the-hood expansion possibilities. The tool aims to improve the deep learning community's technical skills by promoting best practices where things are not hidden behind a divine tool that does everything, but remain within the reach of users. PyTorch-Ignite differs from other similar tools by allowing users to compose their applications without being focused on a super multi-purpose object, but rather on weakly coupled components allowing advanced customization.
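        A minimal sketch of the weakly coupled Engine/Events design (model, optimizer, criterion and train_loader are assumed to be defined elsewhere):

```python
from ignite.engine import Engine, Events

def train_step(engine, batch):
    model.train()                      # model/optimizer/criterion assumed defined
    x, y = batch
    optimizer.zero_grad()
    loss = criterion(model(x), y)
    loss.backward()
    optimizer.step()
    return loss.item()

trainer = Engine(train_step)

@trainer.on(Events.EPOCH_COMPLETED)    # handlers attach to events, not to a monolithic object
def log_epoch(engine):
    print(f"Epoch {engine.state.epoch}: last loss {engine.state.output:.4f}")

trainer.run(train_loader, max_epochs=5)   # train_loader assumed defined
```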

        + +

        + https://pytorch-ignite.ai/ecosystem/ +

        + +

        + EXTENDING PYTORCH, APIs, PARALLEL & DISTRIBUTED TRAINING +

        +
        +
        + +
        +
        + + + +
        + + Benchmarking the Accuracy and Robustness of Feedback Alignment Methods + +
        +
        Albert Jimenez, Mohamed Akrout
        +

Backpropagation is the default algorithm for training deep neural networks due to its simplicity, efficiency and high convergence rate. However, its requirements make it impossible to implement in a human brain. In recent years, more biologically plausible learning methods have been proposed. Some of these methods can match backpropagation's accuracy while providing extra benefits such as faster training on specialized hardware (e.g., ASICs) or higher robustness against adversarial attacks. While interest in the field is growing, there is a need for open-source libraries and toolkits to foster research and benchmark algorithms. In this poster, we present BioTorch, a software framework to create, train, and benchmark biologically motivated neural networks. In addition, we investigate the performance of several feedback alignment methods proposed in the literature, thereby unveiling the importance of forward and backward weight initialization and optimizer choice. Finally, we provide a novel robustness study of these methods against state-of-the-art white- and black-box adversarial attacks.

        + +

        + https://github.com/jsalbert/biotorch +

        + +

        + EXTENDING PYTORCH, APIs, PARALLEL & DISTRIBUTED TRAINING +

        +
        +
        + +
        +
        + + + +
        + + Salina: Easy programming of Sequential Decision Learning and Reinforcement Learning Models in pytorch + +
        +
        Ludovic Denoyer, Alfredo de la Fuente, Song Duong, Jean-Baptiste Gaya, Pierre-Alexandre Kamienny, Daniel H. Thompson
        +

salina is a lightweight library extending PyTorch modules for the development of sequential decision models. It can be used for Reinforcement Learning (including model-based RL with differentiable environments, multi-agent RL, etc.), but also in supervised/unsupervised learning settings (for instance for NLP, computer vision, etc.).

        + +

        + https://github.com/facebookresearch/salina +

        + +

        + ML Ops, MODELS, MODEL OPTIMIZATION & INTERPRETABILITY +

        +
        +
        + +
        +
        + + + +
        + + Structured and Unstructured Pruning Workflow in PyTorch + +
        +
        Zafar Takhirov, Karen Zhou, Raghuraman Krishnamoorthi
        +

Two new toolflows for model pruning are introduced: the Sparsifier and the Pruner, which enable unstructured and structured pruning of the model weights, respectively. The toolflows can be combined with other optimization techniques, such as quantization, to achieve even higher levels of model compression. In addition, the Pruner toolflow can also be used for "shape propagation", where the physical structure of the model is modified after structured pruning (in FX graph mode only). This poster gives a high-level overview of the prototype API and usage examples, lists the currently supported sparse quantized kernels, and briefly outlines future plans.
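        The Sparsifier/Pruner toolflows described here were still prototypes; as a related illustration using the pruning utilities that already ship with PyTorch (torch.nn.utils.prune, not the prototype API from the poster), unstructured and structured pruning of a layer's weights looks roughly like this:

```python
import torch
import torch.nn.utils.prune as prune

layer = torch.nn.Linear(64, 32)
prune.l1_unstructured(layer, name="weight", amount=0.5)              # zero 50% of weights by magnitude
prune.ln_structured(layer, name="weight", amount=0.25, n=2, dim=0)   # remove 25% of output rows
prune.remove(layer, "weight")                                        # make the pruning permanent
```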

        + +

        + https://github.com/pytorch/pytorch +

        + +

        + ML Ops, MODELS, MODEL OPTIMIZATION & INTERPRETABILITY +

        +
        +
        + +
        +
        + + + +
        + + Torch-CAM: class activation explorer + +
        +
        François-Guillaume Fernandez
        +

One of the core inconveniences of deep learning is interpretability, which remains obscure for most non-trivial convolutional models. Their performance is achieved by optimization processes that have high degrees of freedom and no constraints on explainability. Fortunately, modern frameworks provide mechanisms that grant access to the information flow in their components, which paved the way to building intuition around result interpretability in CNN models. The main contributions of the author are: building a flexible framework for class activation computation; providing high-quality implementations of the most popular methods; and making these methods usable by entry-level users as well as researchers. The open-source project is available here: https://github.com/frgfm/torch-cam

        + +

        + https://github.com/frgfm/torch-cam +

        + +

        + ML Ops, MODELS, MODEL OPTIMIZATION & INTERPRETABILITY +

        +
        +
        + +
        +
        + + + +
        + + moai: A Model Development Kit to Accelerate Data-driven Workflows + +
        +
        Nikolaos Zioulis
        +

moai is a PyTorch-based AI Model Development Kit (MDK) that seeks to improve data-driven model workflows, design and understanding. It relies on Hydra for handling configuration and Lightning for handling infrastructure. As a kit, it offers a set of actions, such as `train` and `evaluate`, that consume configuration files. Apart from defining the model, data, training scheme, optimizer, visualization and logging, these configuration files also use named tensors to define tensor-processing graphs. These graphs are created by chaining building blocks called monads, which are functional units, or in other words single-responsibility modules. Monad parameters and input/output tensors are defined in the configuration file, allowing the entire model to be summarized in a single file. This opens up novel functionality, such as querying for inter-model differences using the `diff` action, or aggregating the results of multiple models using the `plot` action, which uses HiPlot to compare models in various ways. moai facilitates high-quality reproduction (using the `reprod` action): apart from automatically handling all the related boilerplate, it standardizes the process of developing modules/monads and implicitly logs all hyperparameters. Even though no code is required, moai exploits Python's flexibility to let developers integrate their own code into its engine from external projects, vastly increasing their productivity.

        + +

        + https://github.com/ai-in-motion/moai +

        + +

        + ML Ops, MODELS, MODEL OPTIMIZATION & INTERPRETABILITY +

        +
        +
        + +
        +
        + + + +
        + + Building Production ML Pipelines for PyTorch Models + +
        +
        Vaibhav Singh, Rajesh Thallam, Jordan Totten, Karl Weinmeister
        +

        Machine Learning Operationalization has rapidly evolved in the last few years with a growing set of tools for each phase of development. From experimentation to automated model analysis and deployment, each of these tools offer some unique capabilities. In this work we survey a slice of these tools and demonstrate an opinionated example of an end to end CI/CD pipeline for PyTorch model development and deployment using Vertex AI SDK. The goal of this session is to aid an informed conversation on the choices available to PyTorch industry practitioners who are looking to operationalize their ML models, and to researchers who are simply trying to organize their experiments. Although our implementation example will make tool choices at various stages, we will be focused on ML design patterns that are applicable to a wide variety of commercial and open-source offerings.

        + +

        + https://github.com/GoogleCloudPlatform/vertex-ai-samples +

        + +

        + ML Ops, MODELS, MODEL OPTIMIZATION & INTERPRETABILITY +

        +
        +
        + +
        +
        + + + +
        + + Customizing MLOps pipelines with JSON-AI: a declarative syntax to streamline ML in the database + +
        +
        George Hosu, Particio Cerda-Mardini, Natasha Seelam, Jorge Torres
        +

Nearly 64% of companies take over a month, and up to a year, to deploy a single machine learning (ML) model into production [1]. Many of these companies cite key challenges integrating with complex ML frameworks as a root cause [1], as there is still a gap between where data lives, how models are trained, and how downstream applications access predictions from models [1, 2]. MindsDB is a PyTorch-based ML platform that aims to solve fundamental MLOps challenges by abstracting ML models as “virtual tables”, allowing models to be queried in the same natural way users work with data in databases. As data is diverse and varied, we recently developed an open-source declarative syntax, named “JSON-AI”, to allow others to customize ML model internals without changing source code. We believe that the key elements of the data science (DS)/ML pipeline, namely data pre-processing/cleaning, feature engineering, and model-building [2], should be automated in a robust, reliable, and reproducible manner with simplicity. JSON-AI gives you refined control over each of these steps, and enables users to bring custom routines into their ML pipeline. In our poster, we will show how a user interfaces with JSON-AI to bring original approaches to each of the aforementioned parts of the DS/ML pipeline, along with control over analysis and explainability tools. [1] Algorithmia (2021). 2021 state of enterprise machine learning [2] “How Much Automation Does a Data Scientist Want?” ArXiv (2021)

        + +

        + https://github.com/mindsdb/mindsdb/ +

        + +

        + ML Ops, MODELS, MODEL OPTIMIZATION & INTERPRETABILITY +

        +
        +
        + +
        +
        + + + +
        + + TorchStudio, a full featured IDE for PyTorch + +
        +
        Robin Lobel
        +

TorchStudio is an open-source, full-featured IDE for PyTorch. It aims to simplify the creation, training and iteration of AI models. It can load, analyze and explore datasets from the TorchVision or TorchAudio categories, or custom datasets with any format and number of inputs and outputs. TorchVision, TorchAudio or custom models can then be loaded or written from scratch, debugged, visualized as a graph, and trained using local hardware, a remote server or GPUs in the cloud. Trainings can then be compared in the dashboard with several analysis tools that help you identify the best-performing set of models and hyperparameters and export it as TorchScript or ONNX files. TorchStudio is also highly customizable, with 90% of its functionality accessible as open-source scripts and independent modules, to fit as many AI scenarios as possible.

        + +

        + https://torchstudio.ai/ +

        + +

        + ACCELERATORS, TOOLS, LIBRARY, DATA +

        +
        +
        + +
        +
        + + + +
        + + Accelerate TorchServe with Intel Extension for PyTorch + +
        +
        Mark Saroufim, Hamid Shojanazeri, Patrick Hu, Geeta Chauhan, Jing Xu, Jianan Gu, Jiong Gong, Ashok Emani, Eikan Wang, Min Jean Cho, Fan Zhao
        +

Accelerate TorchServe with Intel® Extension for PyTorch: Intel is collaborating with Meta to bring the performance boost of Intel® Extension for PyTorch* to TorchServe, so that users can easily deploy their PyTorch models with satisfying out-of-the-box performance. With these SW advancements, we demonstrate the easy-to-use IPEX user-facing API, and we showcase the speed-up of FP32 inference with Intel® Extension for PyTorch* over stock PyTorch, as well as the speed-up of INT8 inference with Intel® Extension for PyTorch* over stock PyTorch.

        + +

        + www.intel.com/Performanceindex +

        + +

        + ACCELERATORS, TOOLS, LIBRARY, DATA +

        +
        +
        + +
        +
        + + + +
        + + Kaolin Library + +
        +
        Clement Fuji Tsang, Jean-Francois Lafleche, Charles Loop, Masha Shugrina, Towaki Takikawa, Jiehan Wang
        +

        NVIDIA Kaolin is a suite of tools for accelerating 3D Deep Learning research. The Kaolin library provides a PyTorch API for working with a variety of 3D representations and includes a growing collection of GPU-optimized operations such as modular differentiable rendering, fast conversions between representations, loss functions, data loading, 3D checkpoints and more. The library also contains a lightweight 3D visualizer Dash3D and can work with an Omniverse companion app for dataset/checkpoint visualization and synthetic data generation.

        + +

        + ACCELERATORS, TOOLS, LIBRARY, DATA +

        +
        +
        + +
        +
        + + + +
        + + Accelerate PyTorch training with Cloud TPUs + +
        +
        Jack Cao, Milad Mohammadi, Zak Stone, Vaibhav Singh, Calvin Pelletier, Shauheen Zahirazami
        +

        PyTorch / XLA offers PyTorch users the ability to train their models on XLA devices including Cloud TPUs. This compiled path often makes it possible to utilize creative optimizations and achieve top performance on target XLA devices. With the introduction of Cloud TPU VMs, users have direct access to TPU host machines and therefore a great level of flexibility. In addition, TPU VMs make debugging easier and reduce data transfer overheads. Google has also recently announced the availability of Cloud TPU v4 Pods, which are exaflop-scale supercomputers for machine learning. Cloud TPU v4 Pods offer a whole new level of performance for large-scale PyTorch / XLA training of ML models.
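        For orientation, a hedged sketch of what a PyTorch / XLA training step looks like on a TPU core (the model, optimizer, loss function and data loader are assumed to be defined elsewhere):

```python
import torch_xla.core.xla_model as xm

device = xm.xla_device()                       # a Cloud TPU core
model = model.to(device)                       # model/optimizer/loss_fn/loader assumed defined
for data, target in loader:
    optimizer.zero_grad()
    loss = loss_fn(model(data.to(device)), target.to(device))
    loss.backward()
    xm.optimizer_step(optimizer)               # steps the optimizer and triggers XLA execution
```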

        + +

        + ACCELERATORS, TOOLS, LIBRARY, DATA +

        +
        +
        + +
        +
        + + + +
        + + Accelerating PyTorch on the largest chip ever built (WSE) + +
        +
        Antonio Kim, Behzad Abghari, Chris Oliver, Cynthia Liu, Mark Browning, Vishal Subbiah, Kamran Jafari, Emad Barsoum, Jessica Liu, Sean Lie
        +

The Cerebras Wafer Scale Engine (WSE) is the largest processor ever built, dedicated to accelerating deep learning models for training and inference. A single chip in a single CS-2 system provides the compute power of a cluster of GPUs but acts as a single processor, which also makes it much simpler to use. We present the current PyTorch backend architecture for the Cerebras CS-2 and how we go all the way from PyTorch to laying out the model graph on the wafer. Additionally, we discuss the advantages of training on Cerebras hardware and its unique capabilities.

        + +

        + https://cerebras.net +

        + +

        + ACCELERATORS, TOOLS, LIBRARY, DATA +

        +
        +
        + +
        + + + + + +
        +
        +
        +
        +

        Docs

        +

        Access comprehensive developer documentation for PyTorch

        + View Docs +
        + +
        +

        Tutorials

        +

        Get in-depth tutorials for beginners and advanced developers

        + View Tutorials +
        + +
        +

        Resources

        +

        Find development resources and get your questions answered

        + View Resources +
        +
        +
        +
        + +
        + +
        + +
        +
        +
        +
        + + +
        +
        +
        + + +
        + + + + + + + + + + + + + + + + + + + + + + + diff --git a/ecosystem/pted/2021.html b/ecosystem/pted/2021.html index 95453eba07eb..38421baed4e9 100644 --- a/ecosystem/pted/2021.html +++ b/ecosystem/pted/2021.html @@ -1,12 +1,310 @@ ---- -layout: default -title: Ecosystem Day 2021 -permalink: ecosystem/pted/2021 -background-class: features-background -body-class: ecosystem ---- - -
        + + + + + + + + + + + + + Ecosystem Day 2021 | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
        +
        + Join us at PyTorch Conference in San Francisco, October 22-23. CFP open now! Learn more. +
        +
        +
        +
        + + +
        + + + + + + + + +
        + +
        +
        + + +
        + +

        PyTorch Ecosystem Day

        2021

        @@ -55,43 +353,2204 @@

        2021


        Posters

        - {% for poster in site.data.ecosystem.pted['2021'].posters %} +
        - {% if poster.poster_link %} - - {% endif %} +
        - {% if poster.poster_link %} - {{ poster.title }} - {% else %} {{ poster.title }} {% endif %} + + Bring quantum machine learning to PyTorch with PennyLane +
        -
        {{ poster.authors | join: ", "}}
        -

        {{ poster.description }}

        - {% if poster.link %} +
        Josh Izaac, Thomas Bromley
        +

PennyLane allows you to train quantum circuits just like neural networks! This poster showcases how PennyLane can be interfaced with PyTorch to enable training of quantum and hybrid machine learning models. The outputs of a quantum circuit are provided as a Torch tensor with a defined gradient. We highlight how this functionality can be used to explore new paradigms in machine learning, including the use of hybrid models for transfer learning.
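        A minimal sketch of that interface, assuming a two-qubit toy circuit (the gates and parameter values are arbitrary illustrations):

```python
import torch
import pennylane as qml

dev = qml.device("default.qubit", wires=2)

@qml.qnode(dev, interface="torch")
def circuit(weights, x):
    qml.RX(x, wires=0)
    qml.RY(weights[0], wires=0)
    qml.CNOT(wires=[0, 1])
    qml.RY(weights[1], wires=1)
    return qml.expval(qml.PauliZ(1))        # returned as a differentiable Torch tensor

weights = torch.tensor([0.1, 0.2], requires_grad=True)
out = circuit(weights, torch.tensor(0.5))
out.backward()                               # gradients flow back into `weights`
print(weights.grad)
```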

        +

        - {{ poster.link }} + http://pennylane.ai

        - {% endif %} +

        - {{ poster.categories }} + Platform, Ops & Tools

        - {% endfor %} -
        -
        -
        -
        - +Thus, pre-trained models are often used as-is when a researcher wants to experiment only with a specific facet of a problem. See, as examples, FastAI's work into optimizers, schedulers, and gradual training through pre-trained residual models, or NLP projects with Hugging Face models as their backbone. + +We think that, for many of these problems, we can automatically generate a "good enough" model and data-processing pipeline from just the raw data and the endpoint. To address this situation, we are developing MindsDB, an open-source, PyTorch-based ML platform that works inside databases via SQL commands. It is built with a modular approach, and in this talk we are going to focus on Lightwood, the stand-alone core component that performs machine learning automation on top of the PyTorch framework. + +Lightwood automates model building into 5 stages: (1) classifying each feature into a "data type", (2) running statistical analyses on each column of a dataset, (3) fitting multiple models to normalize, tokenize, and generate embeddings for each feature, (4) deploying the embeddings to fit a final estimator, and (5) running an analysis on the final ensemble to evaluate it and generate a confidence model. It can generate quick "baseline" models to benchmark performance for any custom encoder representation of a data type and can also serve as scaffolding for investigating new hypotheses (architectures, optimizers, loss-functions, hyperparameters, etc). + +We aim to present our benchmarks covering wide swaths of problem types and illustrate how Lightwood can be useful for researchers and engineers through a hands-on demo.

        + +

        + https://mindsdb.com +

        + +

        + Database & AI Accelerators +

        + + + +
        +
        + + + +
        + + PyTorch on Supercomputers Simulations and AI at Scale with SmartSim + +
        +
        Sam Partee , Alessandro Rigazzi, Mathew Ellis, Benjamin Rob
        +

        SmartSim is an open source library dedicated to enabling online analysis and Machine Learning (ML) for traditional High Performance Computing (HPC) simulations. Clients are provided in common HPC simulation languages, C/C++/Fortran, that enable simulations to perform inference requests in parallel on large HPC systems. SmartSim utilizes the Redis ecosystem to host and serve PyTorch models alongside simulations. We present a use case of SmartSim where a global ocean simulation, used in climate modeling, is augmented with a PyTorch model to resolve quantities of eddy kinetic energy within the simulation.

        + +

        + https://github.com/CrayLabs/SmartSim +

        + +

        + Database & AI Accelerators +

        +
        +
        + +
        +
        + + + +
        + + Model agnostic confidence estimation with conformal predictors for AutoML + +
        +
        Patricio Cerda-Mardini, Natasha Seelam
        +

        Many domains leverage the extraordinary predictive performance of machine learning algorithms. However, there is an increasing need for transparency of these models in order to justify deploying them in applied settings. Developing trustworthy models is a great challenge, as they are usually optimized for accuracy, relegating the fit between the true and predicted distributions to the background [1]. This concept of obtaining predicted probability estimates that match the true likelihood is also known as calibration. + +Contemporary ML models generally exhibit poor calibration. There are several methods that aim at producing calibrated ML models [2, 3]. Inductive conformal prediction (ICP) is a simple yet powerful framework to achieve this, offering strong guarantees about the error rates of any machine learning model [4]. ICP provides confidence scores and turns any point prediction into a prediction region through nonconformity measures, which indicate the degree of inherent strangeness a data point presents when compared to a calibration data split. + +In this work, we discuss the integration of ICP with MindsDB --an open source AutoML framework-- successfully replacing its existing quantile loss approach for confidence estimation capabilities. +Our contribution is threefold. First, we present a study on the effect of a "self-aware" neural network normalizer in the width of predicted region sizes (also known as efficiency) when compared to an unnormalized baseline. Our benchmarks consider results for over 30 datasets of varied domains with both categorical and numerical targets. Second, we propose an algorithm to dynamically determine the confidence level based on a target size for the predicted region, effectively prioritizing efficiency over a minimum error rate. Finally, we showcase the results of a nonconformity measure specifically tailored for small datasets. + +References: +[1] Guo, C., Pleiss, G., Sun, Y., & Weinberger, K.Q. (2017). On Calibration of Modern Neural Networks. ArXiv, abs/1706.04599. +[2] Naeini, M., Cooper, G., & Hauskrecht, M. (2015). Obtaining Well Calibrated Probabilities Using Bayesian Binning. Proceedings of the AAAI Conference on Artificial Intelligence. AAAI Conference on Artificial Intelligence, 2015, 2901-2907 . +[3] Maddox, W., Garipov, T., Izmailov, P., Vetrov, D., & Wilson, A. (2019). A Simple Baseline for Bayesian Uncertainty in Deep Learning. NeurIPS. +[4] Papadopoulos, H., Vovk, V., & Gammerman, A. (2007). Conformal Prediction with Neural Networks. 19th IEEE International Conference on Tools with Artificial Intelligence (ICTAI 2007), 2, 388-395.

        + +

        + https://mindsdb.com +

        + +

        + Database & AI Accelerators +

        +
        +
        + +
        +
        + + + +
        + + Enabling PyTorch on AMD Instinct™ GPUs with the AMD ROCm™ Open Software Platform + +
        +
        Derek Bouius
        +

        AMD Instinct GPUs are enabled with the upstream PyTorch repository via the ROCm open software platform. Now users can also easily download the installable Python package, built from the upstream PyTorch repository and hosted on pytorch.org. Notably, it includes support for distributed training across multiple GPUs and supports accelerated mixed precision training. AMD also provides hardware support for the PyTorch community build to help develop and maintain new features. This poster will highlight some of the work that has gone into enabling PyTorch support.

        + +

        + https://www.amd.com/rocm +

        + +

        + Database & AI Accelerators +

        +
        +
        + +
        +
        + + + +
        + + DeepSpeed: Shattering barriers of deep learning speed & scale + +
        +
        DeepSpeed Team Microsoft Corporation
        +

In the poster (and a talk during the breakout session), we will present three aspects of DeepSpeed (https://github.com/microsoft/DeepSpeed), a deep learning optimization library based on the PyTorch framework: 1) how we overcome the GPU memory barrier with ZeRO-powered data parallelism; 2) how we overcome the network bandwidth barrier with the 1-bit Adam and 1-bit LAMB compressed optimization algorithms; 3) how we overcome the usability barrier through integration with Azure ML, HuggingFace, and PyTorch Lightning.
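        A hedged sketch of how a training loop is typically handed over to DeepSpeed (the model and data loader are assumed to be defined, the model is assumed to return a loss, and the config keys shown are only one possible ZeRO setup):

```python
import deepspeed

ds_config = {
    "train_batch_size": 8,
    "zero_optimization": {"stage": 2},                      # ZeRO-powered data parallelism
    "optimizer": {"type": "Adam", "params": {"lr": 1e-4}},
}
model_engine, optimizer, _, _ = deepspeed.initialize(
    model=model,                                            # model assumed defined
    model_parameters=model.parameters(),
    config=ds_config,
)
for batch in train_loader:                                  # train_loader assumed defined
    loss = model_engine(batch)                              # assumes the model returns a loss
    model_engine.backward(loss)
    model_engine.step()
```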

        + +

        + +

        + +

        + Distributed Training +

        +
        +
        + +
        +
        + +
        + Dask PyTorch DDP: A new library bringing Dask parallelization to PyTorch training +
        +
        Stephanie Kirmer, Hugo Shi
        +

        We have developed a library that helps simplify the task of multi-machine parallel training for PyTorch models, bringing together the power of PyTorch DDP with Dask for parallelism on GPUs. Our poster describes the library and its core function, and demonstrates how the multi-machine training process works in practice.

        + +

        + https://github.com/saturncloud/dask-pytorch-ddp +

        + +

        + Distributed Training +

        +
        +
        + +
        +
        + + + +
        + + Optimising Physics Informed Neural Networks. + +
        +
        Vignesh Gopakumar
        +

Solving PDEs using neural networks is often arduous and laborious, as it requires training towards a well-defined solution, i.e. the global minimum for a given network architecture - objective function combination. For a family of complex PDEs, physics-informed neural networks won't offer much in comparison to traditional numerical methods, as their global minima become more and more intractable. We propose a modified approach that hinges on continual and parametrised learning and can create more general PINNs that solve a variety of PDE scenarios rather than a single well-defined case. We believe that this brings neural-network-based PDE solvers closer to being comparable with numerical solvers.

        + +

        + +

        + +

        + Distributed Training +

        +
        +
        + +
        +
        + + + +
        + + FairScale-A general purpose modular PyTorch library for high performance and large scale training + +
        +
        Mandeep Baines, Shruti Bhosale, Vittorio Caggiano, Benjamin Lefaudeux, Vitaliy Liptchinsky, Naman Goyal, Siddhardth Goyal, Myle Ott, Sam Sheifer, Anjali Sridhar, Min Xu
        +

FairScale is a library that extends basic PyTorch capabilities while adding new SOTA techniques for high-performance and large-scale training on one or multiple machines. FairScale makes the latest distributed training techniques available in the form of composable modules and easy-to-use APIs. + +Machine learning (ML) training at scale has traditionally meant data parallelism, which reduces training time by using multiple devices to train on a larger batch size. Nevertheless, with the recent increase in ML model sizes, data parallelism is no longer enough to satisfy all "scaling" needs. FairScale provides several options to overcome some of the limitations to scale. + +For training that is bottlenecked by memory (optimizer state, intermediate activations, parameters), FairScale provides APIs that implement optimizer, gradient and parameter sharding. This allows users to train large models on their devices in a more memory-efficient manner. + +To overcome the memory requirements of large models, FairScale provides various flavors of pipeline and model parallelism, an MoE (Mixture of Experts) layer, and offload models. These methods perform computation on only shards of the model across multiple devices, with micro-batches of data, to maximize device efficiency. + +FairScale also provides modules that help users scale batch size effectively without changing their existing learning-rate hyperparameter (AdaScale) and save memory with activation checkpointing of intermediate layers. + +FairScale has also been integrated into PyTorch Lightning, HuggingFace, FairSeq, VISSL, and MMF to enable users of those frameworks to take advantage of its features.
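        As a small, hedged example of the optimizer-state and gradient sharding APIs (assumes torch.distributed is already initialized and a model is defined elsewhere; other FairScale features such as FSDP, pipeline parallelism and AdaScale have their own wrappers):

```python
import torch
from fairscale.optim.oss import OSS
from fairscale.nn.data_parallel import ShardedDataParallel as ShardedDDP

# model assumed defined; torch.distributed.init_process_group(...) assumed already called
optimizer = OSS(params=model.parameters(), optim=torch.optim.SGD, lr=0.01)  # shards optimizer state
model = ShardedDDP(model, optimizer)                                        # shards gradient reduction
```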

        + +

        + +

        + +

        + Distributed Training +

        +
        +
        + +
        +
        + + + +
        + + AdaptDL: An Open-Source Resource-Adaptive Deep Learning Training/Scheduling Framework + +
        +
        Aurick Qiao, Sang Keun Choe, Suhas Jayaram Subramanya, Willie Neiswanger, Qirong Ho, Hao Zhang, Gregory R. Ganger, Eric P. Xing
        +

        AdaptDL is an open source framework and scheduling algorithm that directly optimizes cluster-wide training performance and resource utilization. By elastically re-scaling jobs, co-adapting batch sizes and learning rates, and avoiding network interference, AdaptDL improves shared-cluster training compared with alternative schedulers. AdaptDL can automatically determine the optimal number of resources given a job's need. It will efficiently add or remove resources dynamically to ensure the highest-level performance. The AdaptDL scheduler will automatically figure out the most efficient number of GPUs to allocate to your job, based on its scalability. When the cluster load is low, your job can dynamically expand to take advantage of more GPUs. AdaptDL offers an easy-to-use API to make existing PyTorch training code elastic with adaptive batch sizes and learning rates. +Showcase: Distributed training and Data Loading

        + +

        + +

        + +

        + Distributed Training +

        +
        +
        + +
        +
        + +
        + Accelerate PyTorch large model training with ONNX Runtime: just add one line of code! +
        +
        Natalie Kershaw
        +

        As deep learning models, especially transformer models get bigger and bigger, reducing training time becomes both a financial and environmental imperative. ONNX Runtime can accelerate large-scale distributed training of PyTorch transformer models with a one-line code change (in addition to import statements ;-)) Adding in the DeepSpeed library improves training speed even more. + +With the new ORTModule API, you wrap an existing torch.nn.Module, and have us automatically: export the model as an ONNX computation graph; compile and optimize it with ONNX Runtime; and integrate it into your existing training script. + +In this poster, we demonstrate how to fine-tune a popular HuggingFace model and show the performance improvement, on a multi-GPU cluster in the Azure Machine Learning cloud service.
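        A hedged sketch of the one-line change using the torch_ort package (the HuggingFace model name is just an example):

```python
from torch_ort import ORTModule
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased")
model = ORTModule(model)    # the rest of the PyTorch training loop stays unchanged
```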

        + +

        + https://aka.ms/pytorchort +

        + +

        + Distributed Training +

        +
        +
        + +
        +
        + + + +
        + + PyTorch/XLA with new Cloud TPU VMs and Profiler + +
        +
        Jack Cao, Daniel Sohn, Zak Stone, Shauheen Zahirazami
        +

PyTorch / XLA enables users to train PyTorch models on XLA devices, including Cloud TPUs. Cloud TPU VMs now provide direct access to TPU host machines and hence offer much greater flexibility, in addition to making debugging easier and reducing data transfer overheads. PyTorch / XLA now has full support for this new architecture. A new profiling tool has also been developed to enable better profiling of PyTorch / XLA. These improvements not only make it much easier to develop models but also reduce the cost of large-scale PyTorch / XLA training runs on Cloud TPUs.

        + +

        + http://goo.gle/pt-xla-tpuvm-signup +

        + +

        + Distributed Training +

        +
        +
        + +
        +
        + + + +
        + + PyTorch Lightning: Deep Learning without the Boilerplate + +
        +
        Ari Bornstein
        +

        PyTorch Lightning reduces the engineering boilerplate and resources required to implement state-of-the-art AI. Organizing PyTorch code with Lightning enables seamless training on multiple-GPUs, TPUs, CPUs, and the use of difficult to implement best practices such as model sharding, 16-bit precision, and more, without any code changes. In this poster, we will use practical Lightning examples to demonstrate how to train Deep Learning models with less boilerplate.
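        A compact, hedged example of the kind of boilerplate-free module the poster refers to (dataset and dataloader setup is omitted and assumed to exist; Trainer flags for GPUs/TPUs vary by Lightning version):

```python
import torch
import torch.nn.functional as F
import pytorch_lightning as pl

class LitClassifier(pl.LightningModule):
    def __init__(self):
        super().__init__()
        self.layer = torch.nn.Linear(28 * 28, 10)

    def training_step(self, batch, batch_idx):
        x, y = batch
        return F.cross_entropy(self.layer(x.view(x.size(0), -1)), y)

    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters(), lr=1e-3)

# the same module trains on CPUs, GPUs or TPUs by changing Trainer flags only
trainer = pl.Trainer(max_epochs=3)
trainer.fit(LitClassifier(), train_loader)   # train_loader assumed defined
```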

        + +

        + https://www.pytorchlightning.ai/ +

        + +

        + Frontend & Experiment Manager +

        +
        +
        + +
        +
        + + + +
        + + Accelerate PyTorch with IPEX and oneDNN using Intel BF16 Technology + +
        +
        Jiong Gong, Nikita Shustrov, Eikan Wang, Jianhui Li, Vitaly Fedyunin
        +

Intel and Facebook collaborated to enable BF16, a first-class data type in PyTorch that is natively accelerated by 3rd Gen Intel® Xeon® Scalable processors. This poster introduces the latest SW advancements added in Intel Extension for PyTorch (IPEX) on top of PyTorch and the oneAPI Deep Neural Network Library (oneDNN) for easy-to-use, high-performance BF16 DL compute on CPU. With these SW advancements, we demonstrate the easy-to-use IPEX user-facing API, and we also showcase 1.55X-2.42X speed-up with IPEX BF16 training over FP32 with stock PyTorch and 1.40X-4.26X speed-up with IPEX BF16 inference over FP32 with stock PyTorch.
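        A hedged sketch of the user-facing API on a toy model (the ipex.optimize signature follows recent IPEX releases and may differ slightly from the version shown at the event):

```python
import torch
import intel_extension_for_pytorch as ipex

model = torch.nn.Linear(64, 8).train()
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
model, optimizer = ipex.optimize(model, optimizer=optimizer, dtype=torch.bfloat16)

x = torch.randn(32, 64)
with torch.cpu.amp.autocast():          # BF16 autocast on CPU
    loss = model(x).sum()
loss.backward()
optimizer.step()
```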

        + +

        + https://github.com/intel/intel-extension-for-pytorch +

        + +

        + Frontend & Experiment Manager +

        +
        +
        + +
        +
        + + + +
        + + TorchStudio, a machine learning studio software based on PyTorch + +
        +
        Robin Lobel
        +

        TorchStudio is a standalone software based on PyTorch and LibTorch. It aims to simplify the creation, training and iterations of PyTorch models. It runs locally on Windows, Ubuntu and macOS. It can load, analyze and explore PyTorch datasets from the TorchVision or TorchAudio categories, or custom datasets with any number of inputs and outputs. PyTorch models can then be loaded and written from scratch, analyzed, and trained using local hardware. Trainings can be run simultaneously and compared to identify the best performing models, and export them as a trained TorchScript or ONNX model.

        + +

        + https://torchstudio.ai/ +

        + +

        + Frontend & Experiment Manager +

        +
        +
        + +
        +
        + + + +
        + + Hydra Framework + +
        +
        Jieru Hu, Omry Yadan
        +

Hydra is an open-source framework for configuring and launching research Python applications. Key features: - Compose and override your config dynamically to get the perfect config for each run - Run on remote clusters like SLURM and AWS without code changes - Perform basic grid search and hyperparameter optimization without code changes - Command-line tab completion for your dynamic config - And more.
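        A minimal sketch of a Hydra entry point (the conf/config.yaml it composes is assumed to exist):

```python
import hydra
from omegaconf import DictConfig, OmegaConf

@hydra.main(config_path="conf", config_name="config")
def main(cfg: DictConfig) -> None:
    # composed config; override values at launch, e.g. `python app.py optimizer.lr=0.01`
    print(OmegaConf.to_yaml(cfg))

if __name__ == "__main__":
    main()
```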

        + +

        + +

        + +

        + Frontend & Experiment Manager +

        +
        +
        + +
        +
        + + + +
        + + PyTorch-Ignite: training common things easy and the hard things possible + +
        +
        Victor Fomin, Sylvain Desroziers, Taras Savchyn
        +

        This poster intends to give a brief but illustrative overview of what PyTorch-Ignite can offer for Deep Learning enthusiasts, professionals and researchers. Following the same philosophy as PyTorch, PyTorch-Ignite aims to keep it simple, flexible and extensible but performant and scalable. Throughout this poster, we will introduce the basic concepts of PyTorch-Ignite, its API and features it offers. We also assume that the reader is familiar with PyTorch.

        + +

        + +

        + +

        + Frontend & Experiment Manager +

        +
        +
        + +
        +
        + + + +
        + + Farabio - Deep Learning Toolkit for Biomedical Imaging + +
        +
        Sanzhar Askaruly, Nurbolat Aimakov, Alisher Iskakov, Hyewon Cho
        +

        Deep learning has transformed many aspects of industrial pipelines recently. Scientists involved in biomedical imaging research are also benefiting from the power of AI to tackle complex challenges. Although the academic community has widely accepted image processing tools, such as scikit-image, ImageJ, there is still a need for a tool which integrates deep learning into biomedical image analysis. We propose a minimal, but convenient Python package based on PyTorch with common deep learning models, extended by flexible trainers and medical datasets.

        + +

        + https://github.com/tuttelikz/farabio +

        + +

        + Medical & Healthcare +

        +
        +
        + +
        +
        + + + +
        + + MONAI: A Domain Specialized Library for Healthcare Imaging + +
        +
Michael Zephyr, Prerna Dogra, Richard Brown, Wenqi Li, Eric Kerfoot
        +

        Healthcare image analysis for both radiology and pathology is increasingly being addressed with deep-learning-based solutions. These applications have specific requirements to support various imaging modalities like MR, CT, ultrasound, digital pathology, etc. It is a substantial effort for researchers in the field to develop custom functionalities to handle these requirements. Consequently, there has been duplication of effort, and as a result, researchers have incompatible tools, which makes it hard to collaborate. + +MONAI stands for Medical Open Network for AI. Its mission is to accelerate the development of healthcare imaging solutions by providing domain-specialized building blocks and a common foundation for the community to converge in a native PyTorch paradigm.

        + +

        + https://monai.io/ +

        + +

        + Medical & Healthcare +

        +
        +
        + +
        +
        + + + +
        + + How theator Built a Continuous Training Framework to Scale Up Its Surgical Intelligence Platform + +
        +
        Shai Brown, Daniel Neimark, Maya Zohar, Omri Bar, Dotan Asselmann
        +

Theator is re-imagining surgery with a Surgical Intelligence platform that leverages highly advanced AI, specifically machine learning and computer vision technology, to analyze every step, event, milestone, and critical junction of surgical procedures. + +Our platform analyzes lengthy surgical procedure videos and extracts meaningful information, providing surgeons with highlight reels of key moments in an operation, enhanced by annotations. + +As the team expanded, we realized that we were spending too much time manually running model training and focusing on DevOps tasks, and not enough time dedicated to core research. + +To address this, we built an automation framework composed of multiple training pipelines using PyTorch and ClearML. Our framework automates and manages our entire process, from model development to deployment to continuous training for model improvement. + +New data is now immediately processed and fed directly into training pipelines – speeding up workflow, minimizing human error, and freeing up our research team for more important tasks. This enables us to scale our ML operation and deliver better models for our end users.

        + +

        + +

        + +

        + Medical & Healthcare +

        +
        +
        + +
        +
        + + + +
        + + Q&Aid: A Conversation Agent Powered by PyTorch + +
        +
        Cebere Bogdan, Cebere Tudor, Manolache Andrei, Horia Paul-Ion
        +

We present Q&Aid, a conversation agent that relies on a series of machine learning models to filter, label, and answer medical questions based on provided image and text inputs. Q&Aid simplifies the hospital logic backend by standardizing it to a Health Intel Provider (HIP). A HIP is a collection of models trained on local data that receives text and visual input, then filters, labels, and feeds the data to the right models, and finally generates output for the aggregator. Each hospital is identified as a HIP holding custom models and labeling based on its knowledge. The hospitals train and fine-tune their models, such as a Visual Question Answering (VQA) model, on private data (e.g. brain anomaly segmentation). We aggregate all of the tasks that the hospitals can provide into a single chat app, offering the results to the user. When the chat ends, the transcript is forwarded to each hospital, with a doctor in charge of the final decision.

        + +

        + https://qrgo.page.link/d1fQk +

        + +

        + Medical & Healthcare +

        +
        +
        + +
        +
        + + + +
        + + Sleepbot: Multi-signal Sleep Stage Classifier AI for hospital and home + +
        +
        Jaden Hong, Kevin Tran, Tyler Lee, Paul Lee, Freddie Cha, Louis Jung, Dr. Jung Kyung Hong, Dr. In-Young Yoon, David Lee
        +

Sleep disorders and insomnia are now regarded as a worldwide problem. Roughly 62% of adults worldwide feel that they don't sleep well. However, sleep is difficult to track, so it's not easy to get suitable treatment to improve your sleep quality. Currently, PSG (polysomnography) is the only way to evaluate sleep quality accurately, but it's expensive and often inaccurate due to the first-night effect. + +We propose a multi-signal sleep stage classifier for contactless sleep tracking: Sleepbot. By automating manual PSG reading and providing explainable analysis, Sleepbot opens a new possibility for applying sleep-staging AI in both home and hospital. With sound recorded by a smartphone app and an RF-sensed signal measured by Asleep's non-contact sleep tracker, Sleepbot provides a clinical level of sleep stage classification. + +Sleepbot achieved 85.5% accuracy in 5-class (Wake, N1, N2, N3, REM) classification using PSG signals measured from 3,700 subjects and 77% accuracy in 3-class (Wake, Sleep, REM) classification using only sound data measured from 1,2000 subjects.

        + +

        + +

        + +

        + Medical & Healthcare +

        +
        +
        + +
        +
        + +
        + PyMDE: Minimum-Distortion Embedding +
        +
        Akshay Agrawal, Alnur Ali, Stephen Boyd
        +

        We present a unifying framework for the vector embedding problem: given a set of items and some known relationships between them, we seek a representation of the items by vectors, possibly subject to some constraints (e.g., requiring the vectors to have zero mean and identity covariance). We want the vectors associated with similar items to be near each other, and vectors associated with dissimilar items to not be near, measured in Euclidean distance. We formalize this by introducing distortion functions, defined for some pairs of the items. Our goal is to choose an embedding that minimizes the total distortion, subject to the constraints. We call this the minimum-distortion embedding (MDE) problem. The MDE framework generalizes many well-known embedding methods, such as PCA, the Laplacian eigenmap, multidimensional scaling, UMAP, and others, and also includes new types of embeddings. + +Our accompanying software library, PyMDE, makes it easy for users to specify and approximately solve MDE problems, enabling experimentation with well-known and custom embeddings alike. By making use of automatic differentiation and hardware acceleration via PyTorch, we are able to scale to very large embedding problems. We will showcase examples of embedding real datasets, including an academic co-authorship network, single-cell mRNA transcriptomes, US census data, and population genetics.
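        A small, hedged example of the PyMDE workflow on synthetic data (real use would pass item features or a graph instead of random numbers):

```python
import torch
import pymde

data = torch.randn(1000, 50)                            # placeholder: items x features
mde = pymde.preserve_neighbors(data, embedding_dim=2)   # set up an MDE problem
embedding = mde.embed()                                 # solve it (uses a GPU if available)
pymde.plot(embedding)
```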

        + +

        + +

        + +

        + Medical & Healthcare +

        +
        +
        + +
        +
        + + + +
        + + TorchIO: Pre-Processing & Augmentation of Medical Images for Deep Learning Applications + +
        +
        Fernando Pérez-García, Rachel Sparks, Sébastien Ourselin
        +

        Processing of medical images such as MRI or CT presents unique challenges compared to RGB images typically used in computer vision. These include a lack of labels for large datasets, high computational costs, and metadata to describe the physical properties of voxels. Data augmentation is used to artificially increase the size of the training datasets. Training with image patches decreases the need for computational power. Spatial metadata needs to be carefully taken into account in order to ensure a correct alignment of volumes. + +We present TorchIO, an open-source Python library to enable efficient loading, preprocessing, augmentation and patch-based sampling of medical images for deep learning. TorchIO follows the style of PyTorch and integrates standard medical image processing libraries to efficiently process images during training of neural networks. TorchIO transforms can be composed, reproduced, traced and extended. We provide multiple generic preprocessing and augmentation operations as well as simulation of MRI-specific artifacts. + +TorchIO was developed to help researchers standardize medical image processing pipelines and allow them to focus on the deep learning experiments. It encourages open science, as it supports reproducibility and is version controlled so that the software can be cited precisely. Due to its modularity, the library is compatible with other frameworks for deep learning with medical images.
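        A hedged sketch of a typical TorchIO pipeline (the image paths and sampling parameters are placeholders):

```python
import torchio as tio

subject = tio.Subject(
    t1=tio.ScalarImage("t1.nii.gz"),                     # placeholder paths
    seg=tio.LabelMap("seg.nii.gz"),
)
transform = tio.Compose([
    tio.ToCanonical(),
    tio.RandomAffine(),
    tio.RandomMotion(p=0.2),                             # MRI-specific artifact simulation
])
dataset = tio.SubjectsDataset([subject], transform=transform)
sampler = tio.UniformSampler(patch_size=64)
patches = tio.Queue(dataset, max_length=16, samples_per_volume=4, sampler=sampler)
```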

        + +

        + +

        + +

        + Medical & Healthcare +

        +
        +
        + +
        +
        + + + +
        + + Deep Learning Based Model to Predict Covid19 Patients' Outcomes on Admission + +
        +
        Laila Rasmy, Ziqian Xie, Degui Zhi
        +

With the extensive use of electronic records and the availability of historical patient information, predictive models that can help identify patients at risk based on their history at an early stage can be a valuable adjunct to clinician judgment. Deep learning models can better predict patients' outcomes by consuming their medical history regardless of the length and the complexity of such data. We used our Pytorch_EHR framework to train a model that can predict COVID-19 patients' health outcomes on admission. We used the Cerner Real-world COVID-19 (Q2) cohort, which included information for 117,496 COVID patients from 62 health systems. We used a cohort of 55,068 patients and defined our outcomes, including mortality, intubation, and hospitalization longer than 3 days, as binary outcomes. We fed the model all diagnoses, medication, laboratory results, and other clinical event information available before or on their first COVID-19 encounter admission date. We kept the data preprocessing at a minimum for convenience and practicality, relying on the embedding layer to learn feature representations from the large training set. Our model showed improved performance compared to baseline machine learning models like logistic regression (LR): it achieved AUROCs of 89.5%, 90.6%, and 84.3% for in-hospital mortality, intubation, and hospitalization for more than 3 days, respectively, versus 82.8%, 83.2%, and 76.8% for LR.

        + +

        + https://github.com/ZhiGroup/pytorch_ehr +

        + +

        + Medical & Healthcare +

        +
        +
        + +
        +
        + + + +
        + + Rolling out Transformers with TorchScript and Inferentia + +
        +
        Binghui Ouyang, Alexander O’Connor
        +

While Transformers have brought unprecedented improvements in the accuracy and ease of developing NLP applications, their deployment remains challenging due to the large size of the models and their computational complexity. Indeed, until recently it was a widespread misconception that hosting high-performance transformer-based models was prohibitively expensive and technically challenging. Fortunately, recent advances in both the PyTorch ecosystem and in custom hardware for inference have created a world where models can be deployed in a cost-effective, scalable way, without the need for complex engineering. + +In this presentation, we will discuss the use of PyTorch and AWS Inferentia to deploy production-scale models for chatbot intent classification - a particularly relevant and demanding scenario. + +Autodesk deploys a number of transformer-based models to solve customer support issues across our channels, and our ability to provide a flexible, high-quality machine learning solution is supported by leveraging cutting-edge technology such as transformer-based classification. Our chatbot, AVA, responds to tens of thousands of customer interactions monthly, and we are evolving our architecture to be supported by customer inference. + +We will discuss our experience of piloting transformer-based intent models, and present a workflow for going from data to deployment for similar projects.

        + +

        + +

        + +

        + NLP & Multimodal, RL & Time Series +

        +
        +
        + +
        +
        + + + +
        + + PyTorchTS: PyTorch Probabilistic Time Series Forecasting Framework + +
        +
        Kashif Rasul
        +

        PyTorchTS is a PyTorch based Probabilistic Time Series forecasting framework that comes with state of the art univariate and multivariate models.

        + +

        + https://github.com/zalandoresearch/pytorch-ts +

        + +

        + NLP & Multimodal, RL & Time Series +

        +
        +
        + +
        +
        + + + +
        + + MMF: A modular framework for multimodal research + +
        +
        Sasha Sheng, Amanpreet Singh
        +

MMF is designed from the ground up to let you focus on what matters -- your model -- by providing boilerplate code for distributed training, common datasets and state-of-the-art pretrained baselines out of the box. MMF is built on top of PyTorch, bringing all of its power into your hands. MMF is not strongly opinionated, so you can use all of your PyTorch knowledge here. MMF is created to be easily extensible and composable. Through our modular design, you can use the specific components from MMF that you care about. Our configuration system allows MMF to easily adapt to your needs.

        + +

        + +

        + +

        + NLP & Multimodal, RL & Time Series +

        +
        +
        + +
        +
        + +
        + AllenNLP: An NLP research library for developing state-of-the-art models +
        +
        Dirk Groeneveld, Akshita Bhagia, Pete Walsh, Michael Schmitz
        +

        An Apache 2.0 NLP research library, built on PyTorch, for developing state-of-the-art deep learning models on a wide variety of linguistic tasks.

        + +

        + https://github.com/allenai/allennlp +

        + +

        + NLP & Multimodal, RL & Time Series +

        +
        +
        + +
        +
        + + + +
        + + Project Spock at Tubi: Understanding Content using Deep Learning for NLP + +
        +
        John Trenkle, Jaya Kawale & Tubi ML team
        +

Tubi is one of the leading platforms providing free, high-quality streaming movies and TV shows to a worldwide audience. We embrace a data-driven approach and leverage advanced machine learning techniques using PyTorch to enhance our platform and business in any way we can. The Three Pillars of AVOD are the guiding principles for our work. The Pillars are +Content: all the titles we maintain in our library +Audience: everyone who watches titles on Tubi +Advertising: ads shown to viewers on behalf of brands + +In this poster, we focus on the Content pillar, with more details on the various use cases, especially Content Understanding. Content is an important pillar of Tubi: to be successful, we need to look at existing titles and beyond what we already have, and attempt to understand all of the titles out in the wild and how they could benefit our platform in some fashion. Content Understanding revolves around digesting a rich collection of 1st- and 3rd-party data in structured (metadata) and unstructured (text) forms and developing representations that capture the essence of those titles. With the analogy of linear algebra, we can say we are attempting to project title vectors from the universe to our tubiverse with as much fidelity as possible, in order to ascertain the potential value for each target use case. We will describe several techniques to better understand content using PyTorch.

        + +

        + +

        + +

        + NLP & Multimodal, RL & Time Series +

        +
        +
        + +
        +
        + + + +
        + + RL Based Performance Optimization of Deep Neural Networks + +
        +
        Benoit Steiner, Chris Cummins, Horace He, Hugh Leather
        +

As the usage of machine learning techniques becomes ubiquitous, the efficient execution of neural networks is crucial to many applications. Frameworks such as Halide and TVM separate the algorithmic representation of the deep learning model from the schedule that determines its implementation. Finding good schedules, however, remains extremely challenging. Auto-tuning methods, which search the space of valid schedules and execute each candidate on the hardware, identify some of the best-performing schedules, but the search can take hours, hampering the productivity of deep learning practitioners. What is needed is a method that achieves similar performance without extensive search, delivering the needed efficiency quickly. + +Using PyTorch, we model the scheduling process as a sequence of optimization choices, and implement a new technique to accurately predict the expected performance of a partial schedule using an LSTM over carefully engineered features that describe each DNN operator and its current scheduling choices. Leveraging these predictions, we are able to make these optimization decisions greedily and, without any executions on the target hardware, rapidly identify an efficient schedule. This technique finds schedules that improve the execution performance of deep neural networks by 2.6× over Halide and 1.5× over TVM. Moreover, it completes in seconds instead of hours, making it possible to include it as a new backend for PyTorch itself.

        + +

        + http://facebook.ai +

        + +

        + NLP & Multimodal, RL & Time Series +

        +
        +
        + +
        +
        + + + +
        + + A Data-Centric Framework for Composable NLP + +
        +
        Zhenghong Liu
        +

Forte is an open-source toolkit for building Natural Language Processing workflows by assembling state-of-the-art NLP and ML technologies. The toolkit features a composable pipeline, cross-task interaction, and adaptable data-model interfaces. The highly composable design allows users to build complex NLP pipelines for a wide range of tasks, including document retrieval, information extraction, and text generation, by combining existing toolkits or customized PyTorch models. The cross-task interaction ability allows developers to utilize the results from individual tasks to make informed decisions. The data-model interface helps developers focus on building reusable PyTorch models by abstracting out domain and preprocessing details. We show that Forte can be used to build complex pipelines, and the resulting pipeline can be easily adapted to different domains and tasks with small changes in the code.

        + +

        + https://github.com/asyml/forte +

        + +

        + NLP & Multimodal, RL & Time Series +

        +
        +
        + +
        +
        + + + +
        + + Environments and Baselines for Multitask Reinforcement Learning + +
        +
        Shagun Sodhani, Amy Zhang, Ludovic Denoyer, Pierre-Alexandre Kamienny, Olivier Delalleau
        +

The two key components of a multi-task RL codebase are (i) multi-task RL algorithms and (ii) multi-task RL environments. We develop open-source libraries for both components. [MTRL](https://github.com/facebookresearch/mtrl) provides components to implement multi-task RL algorithms, and [MTEnv](https://github.com/facebookresearch/mtenv) is a library to interface with existing multi-task RL environments and create new ones. + +MTRL has two building blocks: (i) a single-task policy and (ii) components to augment the single-task policy for the multi-task setup. The ideal workflow is to start with a base policy and add multi-task components as they see fit. MTRL enables algorithms like GradNorm, Distral, HiPBMDP, PCGrad, Soft Modularization, etc. + +MTEnv is an effort to standardize multi-task RL environments and provide better benchmarks. We extend the Gym API to support multiple tasks, with two guiding principles: (i) make minimal changes to the Gym interface (which the community is very familiar with) and (ii) make it easy to port existing environments to MTEnv. Additionally, we provide a collection of commonly used multi-task RL environments (Acrobot, Cartpole, a multi-task variant of the DeepMind Control Suite, Meta-World, Multi-armed Bandit, etc.). RL practitioners can combine their own environments with the MTEnv wrappers to add multi-task support with a small code change. + +MTRL and MTEnv are used in several ongoing/published works at FAIR.

        + +

        + http://qr.w69b.com/g/tGZSFw33G +

        + +

        + NLP & Multimodal, RL & Time Series +

        +
        +
        + +
        +
        + + + +
        + + The Hugging Face Ecosystem + +
        +
        Lysandre Debut, Sylvain Gugger, Quentin Lhoest 
        +

Transfer learning has become the norm for getting state-of-the-art results in NLP. Hugging Face provides tools to help you at every step along the way: + +- A free git-based shared hub with more than 7,500 PyTorch checkpoints and more than 800 NLP datasets. +- The 🤗 Datasets library, to easily download, manipulate, and prepare the dataset. +- The 🤗 Tokenizers library, which provides ultra-fast tokenizers backed by Rust and converts text into PyTorch tensors. +- The 🤗 Transformers library, providing more than 45 PyTorch implementations of Transformer architectures as simple nn.Modules as well as a training API. +- The 🤗 Accelerate library, a non-intrusive API that allows you to run your raw training loop on any distributed setup. + +The pipeline is then simply a six-step process: select a pretrained model from the hub, handle the data with Datasets, tokenize the text with Tokenizers, load the model with Transformers, train it with the Trainer or your own loop powered by Accelerate, and share your results with the community on the hub.
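As a rough illustration of that six-step flow, a minimal fine-tuning sketch with Datasets and Transformers might look like the following. The checkpoint and dataset names are only examples; any hub model/dataset pair could be substituted.

```python
from datasets import load_dataset
from transformers import (AutoModelForSequenceClassification, AutoTokenizer,
                          Trainer, TrainingArguments)

# 1-2. Pick a checkpoint from the hub and load the data with Datasets.
checkpoint = "distilbert-base-uncased"   # illustrative model id
dataset = load_dataset("imdb")           # illustrative dataset

# 3. Tokenize the text (fast Rust-backed tokenizer under the hood).
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
tokenized = dataset.map(lambda b: tokenizer(b["text"], truncation=True), batched=True)

# 4. Load the model as a plain nn.Module.
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

# 5. Train with the Trainer API (or write your own loop powered by Accelerate).
args = TrainingArguments(output_dir="out", num_train_epochs=1, per_device_train_batch_size=8)
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized["train"].shuffle(seed=42).select(range(2000)),
    tokenizer=tokenizer,  # enables dynamic padding via the default data collator
)
trainer.train()

# 6. Optionally share the result on the hub, e.g. model.push_to_hub(...) after logging in.
```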

        + +

        + https://huggingface.co/models +

        + +

        + NLP & Multimodal, RL & Time Series +

        +
        +
        + +
        +
        + + + +
        + +  Asteroid: the Pytorch-based Audio Source Separation Toolkit for Researchers + +
        +
        Manuel Pariente, Samuele Cornell, Jonas Haag, Joris Cosentino, Michel Olvera, Fabian-Robert Stöter, Efthymios Tzinis
        +

Asteroid is an audio source separation toolkit built with PyTorch and PyTorch-Lightning. Inspired by the most successful neural source separation systems, it provides all the neural building blocks required to build such a system. To improve reproducibility, recipes on common audio source separation datasets are provided, including all the steps from data download/preparation through training to evaluation, as well as many current state-of-the-art DNN models. Asteroid exposes all levels of granularity to the user, from simple layers to complete ready-to-use models. Our pretrained models are hosted on the asteroid-models community on Zenodo and on the Hugging Face model hub. Loading and using pretrained models is trivial, and sharing them is also made easy with Asteroid's CLI.
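A minimal sketch of what loading a pretrained separation model could look like, assuming a ConvTasNet checkpoint published on the asteroid-models hub (the checkpoint id below is illustrative, not a guaranteed model name):

```python
import torch
from asteroid.models import ConvTasNet

# Checkpoint id is illustrative; any asteroid-models checkpoint shared on Zenodo or
# the Hugging Face hub should be loadable the same way.
model = ConvTasNet.from_pretrained("mpariente/ConvTasNet_WHAM_sepclean")

mixture = torch.randn(1, 8000)   # stand-in for 1 s of single-channel audio at 8 kHz
est_sources = model(mixture)     # (batch, n_sources, time) separated estimates
print(est_sources.shape)
```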

        + +

        + https://asteroid-team.github.io/ +

        + +

        + NLP & Multimodal, RL & Time Series +

        +
        +
        + +
        +
        + + + +
        + + rlstructures: A Lightweight Python Library for Reinforcement Learning Research + +
        +
        Ludovic Denoyer, Danielle Rothermel, Xavier Martinet
        +

        RLStructures is a lightweight Python library that provides simple APIs as well as data structures that make as few assumptions as possible about the structure of your agent or your task, while allowing for transparently executing multiple policies on multiple environments in parallel (incl. multiple GPUs). It thus facilitates the implementation of RL algorithms while avoiding complex abstractions.

        + +

        + +

        + +

        + NLP & Multimodal, RL & Time Series +

        +
        +
        + +
        +
        + + + +
        + + MBRL-Lib: a PyTorch toolbox for model-based reinforcement learning research + +
        +
        Luis Pineda, Brandon Amos, Amy Zhang, Nathan O. Lambert, Roberto Calandra
        +

Model-based reinforcement learning (MBRL) is an active area of research with enormous potential. In contrast to model-free RL, MBRL algorithms solve tasks by learning a predictive model of the task dynamics, and use this model to predict the future and facilitate decision making. Many researchers have argued that MBRL can result in lower sample complexity, better generalization, as well as safer and more interpretable decisions. However, despite the surge in popularity and great potential of MBRL, there is currently no widely accepted library for facilitating research in this area. Since MBRL methods often involve the interplay of complex components such as probabilistic ensembles, latent variable models, planning algorithms, and even model-free methods, the lack of such a library raises the entry bar to the field and slows down research efforts. In this work we aim to solve this problem by introducing MBRL-Lib, a modular PyTorch toolbox specifically designed for facilitating research on model-based reinforcement learning. MBRL-Lib provides interchangeable options for training dynamics models and running planning algorithms, which can then be used in a mix-and-match fashion to create novel MBRL methods. The library also provides a set of utility functions to run common MBRL tasks, as well as a set of diagnostics tools to identify potential issues while training dynamics models and control algorithms.

        + +

        + https://github.com/facebookresearch/mbrl-lib +

        + +

        + NLP & Multimodal, RL & Time Series +

        +
        +
        + +
        +
        + + + +
        + + Introducing New PyTorch Profiler + +
        +
        Geeta Chauhan, Gisle Dankel, Elena Neroslavaskaya
        +

Analyzing and improving large-scale deep learning model performance is an ongoing challenge that continues to grow in importance as model sizes increase. Microsoft and Facebook collaborated to create a native PyTorch performance debugging tool called PyTorch Profiler. The profiler builds on the PyTorch autograd profiler foundation and adds a new high-fidelity GPU profiling engine and an out-of-the-box bottleneck analysis tool in TensorBoard. The new profiler delivers the simplest experience available to date: users can profile their models without installing any additional packages and see results immediately in TensorBoard. Until now, beginner users of PyTorch may not have attempted to profile their models due to the task complexity. With the new bottleneck analysis tool, they will find profiling easy and accessible. Experienced users will be delighted by the detailed trace views, which illustrate GPU kernel execution events and their relationship to the PyTorch operations. Come learn how to profile your PyTorch models using this new, delightfully simple tool.
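A minimal sketch of the torch.profiler workflow described above, profiling a few inference steps of a torchvision ResNet and writing a TensorBoard trace (model choice and log directory are illustrative):

```python
import torch
import torchvision.models as models
from torch.profiler import ProfilerActivity, profile, schedule, tensorboard_trace_handler

model = models.resnet18()
inputs = torch.randn(8, 3, 224, 224)

# CPU-only here so it runs anywhere; add ProfilerActivity.CUDA on a GPU machine
# to also capture kernel-level GPU events.
with profile(
    activities=[ProfilerActivity.CPU],
    schedule=schedule(wait=1, warmup=1, active=3),
    on_trace_ready=tensorboard_trace_handler("./log/resnet18"),
    record_shapes=True,
) as prof:
    for _ in range(5):
        model(inputs)
        prof.step()  # tell the profiler a step has finished

# Quick textual summary; the full trace lands in ./log/resnet18 for TensorBoard.
print(prof.key_averages().table(sort_by="cpu_time_total", row_limit=10))
```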

        + +

        + https://pytorch.org/blog/introducing-pytorch-profiler-the-new-and-improved-performance-tool +

        + +

        + Performance & Profiler +

        +
        +
        + +
        +
        + + + +
        + + TRTorch: A Compiler for TorchScript Targeting NVIDIA GPUs with TensorRT + +
        +
        Naren Dasan
        +

        For experimentation and the development of machine learning models, few tools are as approachable as PyTorch. However, when moving from research to production, some of the features that make PyTorch great for development make it hard to deploy. With the introduction of TorchScript, PyTorch has solid tooling for addressing some of the problems of deploying PyTorch models. TorchScript removes the dependency on Python and produces portable, self contained, static representations of code and weights. But in addition to portability, users also look to optimize performance in deployment. When deploying on NVIDIA GPUs, TensorRT, NVIDIA's deep learning optimizer, provides the capability to maximize performance of workloads by tuning the execution of models for specific target hardware. TensorRT also provides tooling for conducting further optimization through mixed and reduced precision execution and post training quantization (PTQ). We present TRTorch, a compiler for PyTorch and TorchScript targeting NVIDIA GPUs, which combines the usability of PyTorch with the performance of TensorRT and allows users to fully optimize their inference workloads without leaving the PyTorch ecosystem. It also simplifies conducting complex optimizations like PTQ by leveraging common PyTorch tooling. TRTorch can be used directly from PyTorch as a TorchScript Backend, embedded in an application or used from the command line to easily increase the performance of inference applications.

        + +

        + https://nvidia.github.io/TRTorch/ +

        + +

        + Performance & Profiler +

        +
        +
        + +
        +
        + + + +
        + + WeightWatcher: A Diagnostic Tool for DNNs + +
        +
        Charles H. Martin
        +

WeightWatcher (WW) is an open-source diagnostic tool for analyzing Deep Neural Networks (DNNs) without needing access to training or even test data. It can be used to: analyze pre-trained PyTorch models; inspect models that are difficult to train; gauge improvements in model performance; predict test accuracies across different models; and detect potential problems when compressing or fine-tuning pretrained models. + +WeightWatcher is based on theoretical research (done jointly with UC Berkeley) into "Why Deep Learning Works", using ideas from Random Matrix Theory (RMT), Statistical Mechanics, and Strongly Correlated Systems.
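A short sketch of how such a data-free analysis might be run with the weightwatcher package, assuming its WeightWatcher/analyze interface (the torchvision model is just an example subject):

```python
import torchvision.models as models
import weightwatcher as ww

# No data needed: the analysis only inspects the trained weight matrices.
model = models.resnet18(pretrained=True)

watcher = ww.WeightWatcher(model=model)
details = watcher.analyze()             # per-layer spectral / power-law metrics (DataFrame)
summary = watcher.get_summary(details)  # aggregate quality indicators (e.g. alpha)
print(summary)
```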

        + +

        + +

        + +

        + Performance & Profiler +

        +
        +
        + +
        +
        + +
        + Constrained Optimization in PyTorch 1.9 Through Parametrizations +
        +
        Mario Lezcano-Casado
        +

        "This poster presents the ""parametrizations"" feature that will be added to PyTorch in 1.9.0. +This feature allows for a simple implementation of methods like pruning, weight_normalization or spectral_normalization. +More generally, it implements a way to have ""computed parameters"". This means that we replace a parameter `weight` in a layer with `f(weight)`, where `f` is an arbitrary module. In other words, after putting a parametrization `f` on `layer.weight`, `layer.weight` will return `f(weight)`. +They implement a caching system, so that the value `f(weight)` is computed just once during the forward pass. +A module that implements a parametrisation may also have a `right_inverse` method. If this method is present, it is possible to assign to a parametrised tensor. This is useful when initialising a parametrised tensor. +This feature can be seen as a first step towards invertible modules. In particular, it may also help making distributions first-class citizens in PyTorch. +Parametrisations also allows for a simple implementation of constrained optimisation. From this perspective, parametrisation maps an unconstrained tensor to a constrained space such as the space of orthogonal matrices, SPD matrices, low-rank matrices... This approach is implemented in the library GeoTorch (https://github.com/Lezcano/geotorch/)."

        + +

        + +

        + +

        + Performance & Profiler +

        +
        +
        + +
        +
        + + + +
        + + Distributed Pytorch with Ray + +
        +
        Richard Liaw, Kai Fricke, Amog Kamsetty, Michael Galarnyk
        +

        Ray is a popular framework for distributed Python that can be paired with PyTorch to rapidly scale machine learning applications. Ray contains a large ecosystem of applications and libraries that leverage and integrate with Pytorch. This includes Ray Tune, a Python library for experiment execution and hyperparameter tuning at any scale; RLlib, a state-of-the-art library for reinforcement learning; and Ray Serve, a library for scalable model serving. Together, Ray and Pytorch are becoming the core foundation for the next generation of production machine learning platforms.
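As a small hedged sketch of the Ray + PyTorch pairing, the Ray Tune snippet below sweeps a learning rate over several trials; the toy training function and search space are illustrative only:

```python
import torch
import torch.nn as nn
from ray import tune

def trainable(config):
    # Deliberately tiny "model" so the example stays self-contained.
    model = nn.Linear(10, 1)
    opt = torch.optim.SGD(model.parameters(), lr=config["lr"])
    for _ in range(20):
        loss = model(torch.randn(64, 10)).pow(2).mean()
        opt.zero_grad(); loss.backward(); opt.step()
        tune.report(loss=loss.item())   # stream metrics back to Tune

analysis = tune.run(
    trainable,
    config={"lr": tune.loguniform(1e-4, 1e-1)},
    num_samples=4,   # four trials, scheduled in parallel if resources allow
)
print(analysis.get_best_config(metric="loss", mode="min"))
```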

        + +

        + https://ray.io/ +

        + +

        + Platforms & Ops & Tools +

        +
        +
        + +
        +
        + + + +
        + + Avalanche: an End-to-End Library for Continual Learning based on PyTorch + +
        +
        Vincenzo Lomonaco, Lorenzo Pellegrini Andrea Cossu, Antonio Carta, Gabriele Graffieti
        +

Learning continually from non-stationary data streams is a long-sought goal of machine learning research. Recently, we have witnessed a renewed and fast-growing interest in Continual Learning, especially within the deep learning community. However, algorithmic solutions are often difficult to re-implement, evaluate, and port across different settings, where even results on standard benchmarks are hard to reproduce. In this work, we propose Avalanche, an open-source, end-to-end library for continual learning based on PyTorch that may provide a shared and collaborative code base for fast prototyping, training, and reproducible evaluation of continual learning algorithms.

        + +

        + https://avalanche.continualai.org +

        + +

        + Platforms & Ops & Tools +

        +
        +
        + +
        +
        + + + +
        + + PyTorch on IBM Z and LinuxONE (s390x) + +
        +
        Hong Xu
        +

        IBM Z is a hardware product line for mission-critical applications, such as finance and health applications. It employs its own CPU architecture, which PyTorch does not officially support. In this poster, we discuss why it is important to support PyTorch on Z. Then, we show our prebuilt minimal PyTorch package for IBM Z. Finally, we demonstrate our continuing commitment to make more PyTorch features available on IBM Z.

        + +

        + https://codait.github.io/pytorch-on-z +

        + +

        + Platforms & Ops & Tools +

        +
        +
        + +
        +
        + + + +
        + + The Fundamentals of MLOps for R&D: Orchestration, Automation, Reproducibility + +
        +
        Dr. Ariel Biller
        +

        Both from sanity considerations and the productivity perspective, Data Scientists, ML engineers, Graduate students, and other research-facing roles are all starting to adopt best-practices from production-grade MLOps. + +However, most toolchains come with a hefty price of extra code and maintenance, which reduces the actual time available for R&D. We will show an alternative approach using ClearML, the open-source MLOps solution. + +In this "best-practices" poster, we will overview the "must-haves" of R&D-MLOPs: +Orchestration, Automation, and Reproducibility. These enable easy remote execution through magically reproducible setups and even custom, reusable, bottom-up pipelines. + +We will take a single example and schematically transform it from the "as downloaded from GitHub" stage to a fully-fledged, scalable, version-controlled, parameterizable R&D pipeline. We will measure the number of changes needed to the codebase and provide evidence of real low-cost integration. All code, logs, and metrics will be available as supporting information.
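To make the "few extra lines" claim concrete, here is a hedged sketch of a minimal ClearML integration around an ordinary PyTorch loop; the project/task names and hyperparameters are illustrative:

```python
import torch
import torch.nn as nn
from clearml import Task

# Two lines are enough to get tracking: ClearML then records the environment,
# console output, and framework-level events for later reproduction of the run.
task = Task.init(project_name="examples", task_name="toy pytorch run")  # names are illustrative
config = task.connect({"lr": 1e-3, "epochs": 2, "batch_size": 32})      # tracked hyperparameters

model = nn.Linear(10, 2)
opt = torch.optim.Adam(model.parameters(), lr=config["lr"])
for epoch in range(config["epochs"]):
    x = torch.randn(config["batch_size"], 10)
    y = torch.randint(0, 2, (config["batch_size"],))
    loss = nn.functional.cross_entropy(model(x), y)
    opt.zero_grad(); loss.backward(); opt.step()
```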

        + +

        + +

        + +

        + Platforms & Ops & Tools +

        +
        +
        + +
        +
        + + + +
        + + FairTorch: Aspiring to Mitigate the Unfairness of Machine Learning Models + +
        +
        Masashi Sode, Akihiko Fukuchi, Yoki Yabe, Yasufumi Nakata
        +

        Is your machine learning model fair enough to be used in your system? What if a recruiting AI discriminates on gender and race? What if the accuracy of medical AI depends on a person's annual income or on the GDP of the country where it is used? Today's AI has the potential to cause such problems. In recent years, fairness in machine learning has received increasing attention. If current machine learning models used for decision making may cause unfair discrimination, developing a fair machine learning model is an important goal in many areas, such as medicine, employment, and politics. Despite the importance of this goal to society, as of 2020, there was no PyTorch¹ project incorporating fairness into a machine learning model. To solve this problem, we created FairTorch at the PyTorch Summer Hackathon 2020. + +FairTorch provides a tool to mitigate the unfairness of machine learning models. A unique feature of our tool is that it allows you to add a fairness constraint to your model by adding only a few lines of code, using the fairness criteria provided in the library.

        + +

        + https://github.com/wbawakate/fairtorch +

        + +

        + Platforms & Ops & Tools +

        +
        +
        + +
        +
        + + + +
        + + TorchDrift: Drift Detection for PyTorch + +
        +
        Thomas Viehmann, Luca Antiga
        +

When machine learning models are deployed to solve a given task, a crucial question is whether they are actually able to perform as expected. TorchDrift addresses one aspect of the answer, namely drift detection: whether the information flowing through our models - probed at the input, the output, or somewhere in between - is still consistent with the data they were trained and evaluated on. In a nutshell, TorchDrift is designed to be plugged into PyTorch models and check whether they are operating within spec. +TorchDrift's principles apply PyTorch's motto _from research to production_ to drift detection: we provide a library of methods that can be used as baselines or building blocks for drift detection research, and we provide practitioners deploying PyTorch models in production with up-to-date methods and educational material for building the necessary statistical background. Here we introduce TorchDrift with an example illustrating the underlying two-sample tests. We show how TorchDrift can be integrated into high-performance runtimes such as TorchServe or RedisAI, enabling drift detection in real-world applications thanks to the PyTorch JIT.

        + +

        + https://torchdrift.org/ +

        + +

        + Platforms & Ops & Tools +

        +
        +
        + +
        +
        + + + +
        + + Ouroboros: MLOps for Automated Driving + +
        +
        Quincy Chen, Arjun Bhargava, Sudeep Pillai, Marcus Pan, Chao Fang, Chris Ochoa, Adrien Gaidon, Kuan-Hui Lee, Wolfram Burgard
        +

Modern machine learning for autonomous vehicles requires a fundamentally different infrastructure and production lifecycle from its standard software continuous-integration/continuous-deployment counterparts. At Toyota Research Institute (TRI), we have developed Ouroboros - a modern ML platform that supports the end-to-end lifecycle of all ML models delivered to TRI's autonomous vehicle fleets. We envision that all ML models delivered to our fleet undergo a systematic and rigorous treatment. Ouroboros delivers several essential features, including: +a. ML dataset governance and infrastructure-as-code that ensure traceability, reproducibility, standardization, and fairness for all ML datasets and models procedurally generated and delivered to the TRI fleet. +b. Unified ML dataset and model management: a unified and streamlined workflow for ML dataset curation, label management, and model development that supports several key ML models delivered to the TRI fleet today. +c. A large-scale multi-task, multi-modal dataset for automated driving that supports the development of various models today, including 3D object detection, 2D object detection, 2D BeVFlow, and panoptic segmentation. +d. Orchestrated ML workflows to stand up scalable ML applications such as push-button re-training solutions, ML CI/CD pipelines, dataset curation workflows, and auto-labelling pipelines, leveraging the most up-to-date cloud tools available along their lifecycles and ensuring strong governance on building reusable, reproducible, robust, traceable, and fair ML models for the production driving setting. By following the best MLOps practices, we expect our platform to lay the foundation for continuous life-long learning in our autonomous vehicle fleets and accelerate the transition from research to production.

        + +

        + https://github.com/TRI-ML +

        + +

        + Platforms & Ops & Tools +

        +
        +
        + +
        +
        + + + +
        + + carefree-learn: Tabular Datasets ❤️ PyTorch + +
        +
        Yujian He
        +

carefree-learn makes PyTorch accessible to people who are familiar with machine learning but not necessarily with PyTorch. Because all the pre-processing and post-processing is already implemented under the hood, users can focus on implementing the core machine learning algorithms / models with PyTorch and test them on various datasets. Because the whole structure is designed carefully, users can easily customize every block in the pipeline, and can also 'combine' the implemented blocks to 'construct' new models without effort. Thanks to carefully made abstractions, users can adapt it to their specific down-stream tasks, such as quantitative trading (in fact I've already implemented one for my company and it works pretty well XD). carefree-learn handles distributed training carefully, so users can either run multiple tasks at the same time, or run a huge model with DDP in one line of code. carefree-learn also integrates with mlflow and supports exporting to ONNX, which means it is ready for production to some extent.

        + +

        + +

        + +

        + Platforms & Ops & Tools +

        +
        +
        + +
        +
        + + + +
        + + OpenMMLab: An Open-Source Algorithm Platform for Computer Vision + +
        +
        Wenwei Zhang
        +

The OpenMMLab project builds open-source toolboxes for Artificial Intelligence (AI). It aims to 1) provide high-quality codebases to reduce the difficulty of algorithm reimplementation; 2) provide a complete research platform to accelerate research; and 3) shorten the gap between research and industrial applications. Based on PyTorch, OpenMMLab develops MMCV to provide unified, abstract training APIs and common utilities, which serve as a foundation for 15+ toolboxes and 40+ datasets. + +Since the initial release in October 2018, OpenMMLab has released 15+ toolboxes that cover 10+ directions, implement 100+ algorithms, and contain 1000+ pre-trained models. With tighter collaboration with the community, OpenMMLab will release more toolboxes with more flexible and easy-to-use training frameworks in the future.

        + +

        + https://openmmlab.com/ +

        + +

        + Platforms & Ops & Tools +

        +
        +
        + +
        +
        + + + +
        + + Catalyst – Accelerated deep learning R&D + +
        +
        Sergey Kolesnikov
        +

For the last three years, Catalyst-Team and collaborators have been working on Catalyst - a high-level PyTorch framework for Deep Learning Research and Development. It focuses on reproducibility, rapid experimentation, and codebase reuse so you can create something new rather than write yet another train loop. You get metrics, model checkpointing, advanced logging, and distributed training support without the boilerplate and low-level bugs.
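A hedged sketch of the boilerplate-free loop the framework advertises, using Catalyst's SupervisedRunner on a toy dataset (the data and model are placeholders):

```python
import torch
from torch.utils.data import DataLoader, TensorDataset
from catalyst import dl

model = torch.nn.Linear(10, 2)
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

# Toy loaders standing in for a real dataset.
loaders = {
    "train": DataLoader(TensorDataset(torch.randn(256, 10), torch.randint(0, 2, (256,))), batch_size=32),
    "valid": DataLoader(TensorDataset(torch.randn(64, 10), torch.randint(0, 2, (64,))), batch_size=32),
}

# The runner wires up the training loop, checkpointing, and logging.
runner = dl.SupervisedRunner()
runner.train(
    model=model, criterion=criterion, optimizer=optimizer,
    loaders=loaders, num_epochs=2, verbose=True,
)
```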

        + +

        + https://catalyst-team.com +

        + +

        + Platforms & Ops & Tools +

        +
        +
        + +
        +
        + + + +
        + + High-fidelity performance metrics for generative models in PyTorch + +
        +
        Anton Obukhov
        +

Evaluation of generative models such as GANs is an important part of deep learning research. In 2D image generation, three approaches have become widespread: Inception Score, Fréchet Inception Distance, and Kernel Inception Distance. Despite having a clear mathematical and algorithmic description, these metrics were initially implemented in TensorFlow and inherited a few properties of the framework itself, such as a specific implementation of the interpolation function. These design decisions were effectively baked into the evaluation protocol and became an inherent part of the specification of the metrics. As a result, researchers wishing to compare against the state of the art in generative modeling are forced to perform the evaluation using the original metric authors' codebases. Reimplementations of the metrics in PyTorch and other frameworks exist, but they do not provide a proper level of fidelity, thus making them unsuitable for reporting results and comparing them to other methods. This software aims to provide epsilon-exact implementations of the said metrics in PyTorch and remove the inconveniences associated with generative model evaluation and development. All the evaluation pipeline steps are correctly tested, with relative errors and sources of remaining non-determinism summarized in the sections below. +TL;DR: fast and reliable GAN evaluation in PyTorch.
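A short hedged sketch of how the package's high-level entry point can be used to compute all three metrics at once; the directory paths are placeholders for generated and reference image sets:

```python
import torch_fidelity

# Paths are illustrative; each directory holds one image set.
metrics = torch_fidelity.calculate_metrics(
    input1="path/to/generated_images",
    input2="path/to/reference_images",
    isc=True,   # Inception Score
    fid=True,   # Fréchet Inception Distance
    kid=True,   # Kernel Inception Distance
)
print(metrics)  # dict of metric names to values
```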

        + +

        + https://github.com/toshas/torch-fidelity +

        + +

        + Platforms & Ops & Tools +

        +
        +
        + +
        +
        + + + +
        + + Using Satellite Imagery to Identify Oceanic Oil Pollution + +
        +
        Jona Raphael (jona@skytruth.org), Ben Eggleston, Ryan Covington, Tatianna Evanisko, John Amos
        +

        Operational oil discharges from ships, also known as "bilge dumping," have been identified as a major source of petroleum products entering our oceans, cumulatively exceeding the largest oil spills, such as the Exxon Valdez and Deepwater Horizon spills, even when considered over short time spans. However, we still don't have a good estimate of +● How much oil is being discharged; +● Where the discharge is happening; +● Who the responsible vessels are. +This makes it difficult to prevent and effectively respond to oil pollution that can damage our marine and coastal environments and economies that depend on them. + +In this poster we will share SkyTruth's recent work to address these gaps using machine learning tools to detect oil pollution events and identify the responsible vessels when possible. We use a convolutional neural network (CNN) in a ResNet-34 architecture to perform pixel segmentation on all incoming Sentinel-1 synthetic aperture radar (SAR) imagery to classify slicks. Despite the satellites' incomplete oceanic coverage, we have been detecting an average of 135 vessel slicks per month, and have identified several geographic hotspots where oily discharges are occurring regularly. For the images that capture a vessel in the act of discharging oil, we rely on an Automatic Identification System (AIS) database to extract details about the ships, including vessel type and flag state. We will share our experience +● Making sufficient training data from inherently sparse satellite image datasets; +● Building a computer vision model using PyTorch and fastai; +● Fully automating the process in the Amazon Web Services (AWS) cloud. +The application has been running continuously since August 2020, has processed over 380,000 Sentinel-1 images, and has populated a database with more than 1100 high-confidence slicks from vessels. We will be discussing preliminary results from this dataset and remaining challenges to be overcome. +Learn more at https://skytruth.org/bilge-dumping/

        + +

        + +

        + +

        + Vision +

        +
        +
        + +
        +
        + + + +
        + + UPIT: A fastai Package for Unpaired Image-to-Image Translation + +
        +
        Tanishq Abraham
        +

Unpaired image-to-image translation algorithms have been used for various computer vision tasks like style transfer and domain adaptation. Such algorithms are highly attractive because they alleviate the need to collect paired datasets. In this poster, we demonstrate UPIT, a novel fastai/PyTorch package (built with nbdev) for unpaired image-to-image translation. It implements various state-of-the-art unpaired image-to-image translation algorithms such as CycleGAN, DualGAN, UNIT, and more, and enables simple training and inference on unpaired datasets. It also includes implementations of commonly used metrics like FID, KID, and LPIPS, as well as Weights-and-Biases integration for easy experiment tracking. Since it is built on top of fastai and PyTorch, it comes with support for mixed-precision and multi-GPU training. It is highly flexible, and custom dataset types, models, and metrics can be used as well. With UPIT, training and applying unpaired image-to-image translation only takes a few lines of code.

        + +

        + https://github.com/tmabraham/UPIT +

        + +

        + Vision +

        +
        +
        + +
        +
        + + + +
        + + PyTorchVideo: A Deep Learning Library for Video Understanding + +
        +
        Aaron Adcock, Bo Xiong, Christoph Feichtenhofer, Haoqi Fan, Heng Wang, Kalyan Vasudev Alwala, Matt Feiszli, Tullie Murrell, Wan-Yen Lo, Yanghao Li, Yilei Li, Zhicheng Yan
        +

PyTorchVideo is the new Facebook AI deep learning library for video understanding research. It contains a variety of state-of-the-art pretrained video models, datasets, augmentations, and tools for video understanding. PyTorchVideo also provides efficient video components and accelerated inference on mobile devices.
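A hedged sketch of pulling one of the pretrained video models through torch.hub; the hub entry and model name follow the PyTorchVideo model zoo as I understand it, and the clip tensor is only a stand-in for properly preprocessed video:

```python
import torch

# Model name and expected clip shape are assumptions based on the slow_r50 recipe.
model = torch.hub.load("facebookresearch/pytorchvideo", "slow_r50", pretrained=True).eval()

clip = torch.randn(1, 3, 8, 256, 256)   # (batch, channels, frames, height, width)
with torch.no_grad():
    logits = model(clip)                 # Kinetics-400 class scores
print(logits.shape)
```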

        + +

        + https://pytorchvideo.org/ +

        + +

        + Vision +

        +
        +
        + +
        +
        + +
        + Deep Learning Enables Fast and Dense Single-Molecule Localization with High Accuracy +
        +
        A. Speiser, L-R. Müller, P. Hoess, U. Matti, C. J. Obara, J. H. Macke, J. Ries, S. C. Turaga
        +

        Single-molecule localization microscopy (SMLM) has had remarkable success in imaging cellular structures with nanometer resolution, but the need for activating only single isolated emitters limits imaging speed and labeling density. Here, we overcome this major limitation using deep learning. We developed DECODE, a computational tool that can localize single emitters at high density in 3D with the highest accuracy for a large range of imaging modalities and conditions. In a public software benchmark competition, it outperformed all other fitters on 12 out of 12 data-sets when comparing both detection accuracy and localization error, often by a substantial margin. DECODE allowed us to take live-cell SMLM data with reduced light exposure in just 3 seconds and to image microtubules at ultra-high labeling density. Packaged for simple installation and use, DECODE will enable many labs to reduce imaging times and increase localization density in SMLM.

        + +

        + http://github.com/turagalab/decode +

        + +

        + Vision +

        +
        +
        + +
        +
        + + + +
        + + A Robust PyTorch Trainable Entry Convnet Layer in Fourier Domain + +
        +
        Abraham Sánchez, Guillermo Mendoza, E. Ulises Moya-Sánchez
        +

We draw inspiration from the cortical area V1 and try to mimic its main processing properties by means of quaternion local phase/orientation to compute line and edge detection in a specific direction. We analyze how this layer is robust, by its geometry, to large illumination and brightness changes.

        + +

        + https://gitlab.com/ab.sanchezperez/pytorch-monogenic +

        + +

        + Vision +

        +
        +
        + +
        +
        + + + +
        + + PyroNear: Embedded Deep Learning for Early Wildfire Detection + +
        +
        François-Guillaume Fernandez, Mateo Lostanlen, Sebastien Elmaleh, Bruno Lenzi, Felix Veith, and more than 15+ contributors
        +

        "PyroNear is non-profit organization composed solely of volunteers which was created in late 2019. Our core belief is that recent technological developments can support the cohabitation between mankind & its natural habitat. We strive towards high-performing, accessible & affordable tech-solutions for protection against natural hazards. More specifically, our first efforts are focused on wildfire protection by increasing the coverage of automatic detection systems. + +Our ongoing initiative has now gathered dozens of volunteers to put up the following main contributions: +- Computer Vision: compiling open-source models and datasets (soon to be published) for vision tasks related to wildfire detection +- Edge Computing: developing an affordable physical prototype running our PyTorch model on a Raspberry Pi +- End-to-end detection workflow: building a responsible end-to-end system for large scale detection and alert management (API, front-end monitoring platform) +- Deployment: working with French firefighter departments to gather field knowledge and conduct a test phase over the incoming European summer." +PyTorch3D is a modular and optimized library for 3D Deep Learning with PyTorch. It includes support for: data structures for heterogeneous batching of 3D data (Meshes, Point clouds and Volumes), optimized 3D operators and loss functions (with custom CUDA kernels), a modular differentiable rendering API for Meshes, Point clouds and Implicit functions, as well as several other tools for 3D Deep Learning.

        + +

        + https://github.com/pyronear +

        + +

        + Vision +

        +
        +
        + +
        +
        + + + +
        + + PyTorch3D: Fast, Flexible, 3D Deep Learning + +
        +
        Nikhila Ravi, Jeremy Reizenstein, David Novotny, Justin Johnson, Georgia Gkioxari, Roman Shapovalov, Patrick Labatut, Wan-Yen Lo
        +

        PyTorch3D is a modular and optimized library for 3D Deep Learning with PyTorch. It includes support for: data structures for heterogeneous batching of 3D data (Meshes, Point clouds and Volumes), optimized 3D operators and loss functions (with custom CUDA kernels), a modular differentiable rendering API for Meshes, Point clouds and Implicit functions, as well as several other tools for 3D Deep Learning.
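To illustrate the differentiable-operator idea, here is a hedged sketch following the library's mesh-fitting pattern: sample points from a sphere mesh with learnable vertex offsets and backpropagate a Chamfer loss (the target point cloud is random, purely for demonstration):

```python
import torch
from pytorch3d.utils import ico_sphere
from pytorch3d.ops import sample_points_from_meshes
from pytorch3d.loss import chamfer_distance

src_mesh = ico_sphere(level=2)                        # Meshes object for a unit sphere
deform = torch.zeros(src_mesh.verts_packed().shape, requires_grad=True)

new_mesh = src_mesh.offset_verts(deform)              # differentiable vertex offsets
points = sample_points_from_meshes(new_mesh, 1000)    # (1, 1000, 3) sampled points
target = torch.rand(1, 1000, 3)                       # stand-in target point cloud

loss, _ = chamfer_distance(points, target)            # loss that can drive mesh deformation
loss.backward()                                       # gradients w.r.t. the offsets
print(deform.grad.abs().mean())
```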

        + +

        + https://arxiv.org/abs/2007.08501 +

        + +

        + Vision +

        +
        +
        + +
        +
        + + + +
        + + Kornia: an Open Source Differentiable Computer Vision Library for PyTorch + +
        +
        E. Riba, J. Shi, D. Mishkin, L. Ferraz, A. Nicolao
        +

This work presents Kornia, an open source computer vision library built upon a set of differentiable routines and modules that aims to solve generic computer vision problems. The package uses PyTorch as its main backend, not only for efficiency but also to take advantage of the reverse auto-differentiation engine to define and compute the gradient of complex functions. Inspired by OpenCV, Kornia is composed of a set of modules containing operators that can be integrated into neural networks to train models to perform a wide range of operations including image transformations, camera calibration, epipolar geometry, and low-level image processing techniques, such as filtering and edge detection, that operate directly on high-dimensional tensor representations on graphical processing units, generating faster systems. Examples of classical vision problems implemented using our framework are provided, including a benchmark comparing to existing vision libraries.
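A minimal sketch of the differentiability point: classical image operations run on batched tensors and gradients flow back to the input, so they can sit inside a training graph (the random image is a placeholder):

```python
import torch
import kornia

img = torch.rand(1, 3, 64, 64, requires_grad=True)  # (B, C, H, W) image batch

gray = kornia.color.rgb_to_grayscale(img)            # differentiable color conversion
edges = kornia.filters.sobel(gray)                   # differentiable edge detection

edges.mean().backward()                              # gradients flow back to the input image
print(img.grad.shape)
```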

        + +

        + http://www.kornia.org +

        + +

        + Vision +

        +
        +
        + +
        +
        + + + +
        + + NNGeometry: Easy and Fast Fisher Information Matrices and Neural Tangent Kernels in PyTorch + +
        +
        Thomas George
        +

Fisher Information Matrices (FIM) and Neural Tangent Kernels (NTK) are useful tools in a number of diverse applications related to neural networks. Yet these theoretical tools are often difficult to implement using current libraries for practical-size networks, given that they require per-example gradients and a large amount of memory, since they scale as the number of parameters (for the FIM) or the number of examples × the cardinality of the output space (for the NTK). NNGeometry is a PyTorch library that offers a high-level API for computing various linear algebra operations such as matrix-vector products, trace, Frobenius norm, and so on, where the matrix is either the FIM or the NTK, leveraging recent advances in approximating these matrices.

        + +

        + https://github.com/tfjgeorge/nngeometry/ +

        + +

        + Vision +

        +
        +
        + +
        +
        + + + +
        + + CompressAI: a research library and evaluation platform for end-to-end compression + +
        +
        Bégaint J., Racapé F., Feltman S., Pushparaja A.
        +

        CompressAI is a PyTorch library that provides custom operations, layers, modules and tools to research, develop and evaluate end-to-end image and video compression codecs. In particular, CompressAI includes pre-trained models and evaluation tools to compare learned methods with traditional codecs. State-of-the-art end-to-end compression models have been reimplemented in PyTorch and trained from scratch, reproducing published results and allowing further research in the domain.

        + +

        + +

        + +

        + Vision +

        +
        +
        + +
        +
        + + + +
        + + pystiche: A Framework for Neural Style Transfer + +
        +
        Philip Meier, Volker Lohweg
        +

The seminal work of Gatys, Ecker, and Bethge gave birth to the field of _Neural Style Transfer_ (NST) in 2016. An NST describes the merger of the content and artistic style of two arbitrary images. This idea is nothing new in the field of Non-photorealistic rendering (NPR). What distinguishes NST from traditional NPR approaches is its generality: an NST only needs a single arbitrary content and style image as input and thus "makes -- for the first time -- a generalized style transfer practicable". Besides peripheral tasks, an NST at its core is the definition of an optimization criterion called the _perceptual loss_, which estimates the perceptual quality of the stylized image. Usually the perceptual loss comprises a deep neural network that needs to supply encodings of images from various depths. + +`pystiche` is a library for NST written in Python and built upon PyTorch. It provides modular and efficient implementations of commonly used perceptual losses as well as neural net architectures. This enables users to mix current state-of-the-art techniques with new ideas with ease. This poster will showcase the core concepts of `pystiche` that will enable other researchers as well as lay persons to get an NST running in minutes.

        + +

        + https://github.com/pmeier/pystiche +

        + +

        + Vision +

        +
        +
        + +
        +
        + + + +
        + + GaNDLF – A Generally Nuanced Deep Learning Framework for Clinical Imaging Workflows + +
        +
        Siddhish Thakur
        +

Deep Learning (DL) has greatly highlighted the potential impact of optimized machine learning in both the scientific and clinical communities. The advent of open-source DL libraries from major industrial entities, such as TensorFlow (Google) and PyTorch (Facebook), further contributes to DL's promise of democratizing computational analytics. However, an increased technical and specialized background is required to develop DL algorithms, and the variability of implementation details hinders their reproducibility. Towards lowering the barrier and making the mechanism of DL development, training, and inference more stable, reproducible, and scalable, without requiring an extensive technical background, this work proposes the Generally Nuanced Deep Learning Framework (GaNDLF). With built-in support for k-fold cross-validation, data augmentation, multiple modalities and output classes, and multi-GPU training, as well as the ability to work with both radiographic and histologic imaging, GaNDLF aims to provide an end-to-end solution for all DL-related tasks, to tackle problems in medical imaging and provide a robust application framework for deployment in clinical workflows. + +Keywords: Deep Learning, Framework, Segmentation, Regression, Classification, Cross-validation, Data augmentation, Deployment, Clinical, Workflows

        + +

        + +

        + +

        + Vision +

        +
        +
        + + + + + + + + +
        +
        +
        +
        +

        Docs

        +

        Access comprehensive developer documentation for PyTorch

        + View Docs +
        + +
        +

        Tutorials

        +

        Get in-depth tutorials for beginners and advanced developers

        + View Tutorials +
        + +
        +

        Resources

        +

        Find development resources and get your questions answered

        + View Resources +
        +
        +
        +
        + +
        + +
        + +
        +
        +
        +
        + + +
        +
        +
        + + +
        + + + + + + + + + + + + + + + + + + + + + + + diff --git a/ecosystem/pykale/index.html b/ecosystem/pykale/index.html new file mode 100644 index 000000000000..c76344b674f1 --- /dev/null +++ b/ecosystem/pykale/index.html @@ -0,0 +1,718 @@ + + + + + + + + + + + + + PyKale | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
        +
        + Join us at PyTorch Conference in San Francisco, October 22-23. CFP open now! Learn more. +
        +
        +
        +
        + + +
        + + + + + + + + +
        + +
        +
        + + +
        + +
        +
        +

        PyKale

        + + + + + Group 5 + Created with Sketch. + + + + + + + + + + + + + + + + + + + + + + + + + + + +

        PyKale is a PyTorch library for multimodal learning and transfer learning with deep learning and dimensionality reduction on graphs, images, texts, and videos.

        + + + + +
        +
        + +
        +
        +
        +
        + +
        +
        + +
        + +
        +
        +
        + +
        +
        +
        +
        +

        Docs

        +

        Access comprehensive developer documentation for PyTorch

        + View Docs +
        + +
        +

        Tutorials

        +

        Get in-depth tutorials for beginners and advanced developers

        + View Tutorials +
        + +
        +

        Resources

        +

        Find development resources and get your questions answered

        + View Resources +
        +
        +
        +
        + +
        + +
        + +
        +
        +
        +
        + + +
        +
        +
        + + +
        + + + + + + + + + + + + + + + + + + + + + + + diff --git a/ecosystem/pypose/index.html b/ecosystem/pypose/index.html new file mode 100644 index 000000000000..d3813ee2ad30 --- /dev/null +++ b/ecosystem/pypose/index.html @@ -0,0 +1,718 @@ + + + + + + + + + + + + + PyPose | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
        +
        + Join us at PyTorch Conference in San Francisco, October 22-23. CFP open now! Learn more. +
        +
        +
        +
        + + +
        + + + + + + + + +
        + +
        +
        + + +
        + +
        +
        +

        PyPose

        + + + + + Group 5 + Created with Sketch. + + + + + + + + + + + + + + + + + + + + + + + + + + + +

        PyPose is a robotics-oriented, PyTorch-based library that combines deep perceptual models with physics-based optimization techniques, so that users can focus on their novel applications.

        + + + + +
        +
        + +
        +
        +
        +
        + +
        +
        + +
        + +
        +
        +
        + +
        +
        +
        +
        +

        Docs

        +

        Access comprehensive developer documentation for PyTorch

        + View Docs +
        + +
        +

        Tutorials

        +

        Get in-depth tutorials for beginners and advanced developers

        + View Tutorials +
        + +
        +

        Resources

        +

        Find development resources and get your questions answered

        + View Resources +
        +
        +
        +
        + +
        + +
        + +
        +
        +
        +
        + + +
        +
        +
        + + +
        + + + + + + + + + + + + + + + + + + + + + + + diff --git a/ecosystem/pypots/index.html b/ecosystem/pypots/index.html new file mode 100644 index 000000000000..c7fcf745174e --- /dev/null +++ b/ecosystem/pypots/index.html @@ -0,0 +1,718 @@ + + + + + + + + + + + + + PyPOTS | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
        +
        + Join us at PyTorch Conference in San Francisco, October 22-23. CFP open now! Learn more. +
        +
        +
        +
        + + +
        + + + + + + + + +
        + +
        +
        + + +
        + +
        +
        +

        PyPOTS

        + + + + + Group 5 + Created with Sketch. + + + + + + + + + + + + + + + + + + + + + + + + + + + +

A Python toolbox for data mining on Partially-Observed Time Series (POTS) that helps engineers focus more on the core problems rather than on the missing parts in their data.

        + + + + +
        +
        + +
        +
        +
        +
        + +
        +
        + +
        + +
        +
        +
        + +
        +
        +
        +
        +

        Docs

        +

        Access comprehensive developer documentation for PyTorch

        + View Docs +
        + +
        +

        Tutorials

        +

        Get in-depth tutorials for beginners and advanced developers

        + View Tutorials +
        + +
        +

        Resources

        +

        Find development resources and get your questions answered

        + View Resources +
        +
        +
        +
        + +
        + +
        + +
        +
        +
        +
        + + +
        +
        +
        + + +
        + + + + + + + + + + + + + + + + + + + + + + + diff --git a/ecosystem/pyro/index.html b/ecosystem/pyro/index.html new file mode 100644 index 000000000000..9809a3f87567 --- /dev/null +++ b/ecosystem/pyro/index.html @@ -0,0 +1,11 @@ + + + + Redirecting… + + + + +

        Redirecting…

        + Click here if you are not redirected. + diff --git a/ecosystem/pystiche/index.html b/ecosystem/pystiche/index.html new file mode 100644 index 000000000000..7ffa4ae99aee --- /dev/null +++ b/ecosystem/pystiche/index.html @@ -0,0 +1,718 @@ + + + + + + + + + + + + + pystiche | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
        +
        + Join us at PyTorch Conference in San Francisco, October 22-23. CFP open now! Learn more. +
        +
        +
        +
        + + +
        + + + + + + + + +
        + +
        +
        + + +
        + +
        +
        +

        pystiche

        + + + + + Group 5 + Created with Sketch. + + + + + + + + + + + + + + + + + + + + + + + + + + + +

        pystiche is a framework for Neural Style Transfer (NST) built upon PyTorch.

        + + + + +
        +
        + +
        +
        +
        +
        + +
        +
        + +
        + +
        +
        +
        + +
        +
        +
        +
        +

        Docs

        +

        Access comprehensive developer documentation for PyTorch

        + View Docs +
        + +
        +

        Tutorials

        +

        Get in-depth tutorials for beginners and advanced developers

        + View Tutorials +
        + +
        +

        Resources

        +

        Find development resources and get your questions answered

        + View Resources +
        +
        +
        +
        + +
        + +
        + +
        +
        +
        +
        + + +
        +
        +
        + + +
        + + + + + + + + + + + + + + + + + + + + + + + diff --git a/ecosystem/pysyft/index.html b/ecosystem/pysyft/index.html new file mode 100644 index 000000000000..692c2a0668b5 --- /dev/null +++ b/ecosystem/pysyft/index.html @@ -0,0 +1,11 @@ + + + + Redirecting… + + + + +

        Redirecting…

        + Click here if you are not redirected. + diff --git a/ecosystem/pytorch-geometric/index.html b/ecosystem/pytorch-geometric/index.html new file mode 100644 index 000000000000..ae6384b33560 --- /dev/null +++ b/ecosystem/pytorch-geometric/index.html @@ -0,0 +1,11 @@ + + + + Redirecting… + + + + +

        Redirecting…

        + Click here if you are not redirected. + diff --git a/ecosystem/pytorch-lightning/index.html b/ecosystem/pytorch-lightning/index.html new file mode 100644 index 000000000000..81a3ef46c041 --- /dev/null +++ b/ecosystem/pytorch-lightning/index.html @@ -0,0 +1,11 @@ + + + + Redirecting… + + + + +

        Redirecting…

        + Click here if you are not redirected. + diff --git a/ecosystem/pytorch-metric-learning/index.html b/ecosystem/pytorch-metric-learning/index.html new file mode 100644 index 000000000000..50fabb6b979d --- /dev/null +++ b/ecosystem/pytorch-metric-learning/index.html @@ -0,0 +1,718 @@ + + + + + + + + + + + + + PyTorch Metric Learning | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
        +
        + Join us at PyTorch Conference in San Francisco, October 22-23. CFP open now! Learn more. +
        +
        +
        +
        + + +
        + + + + + + + + +
        + +
        +
        + + +
        + +
        +
        +

        PyTorch Metric Learning

        + + + + + Group 5 + Created with Sketch. + + + + + + + + + + + + + + + + + + + + + + + + + + + +

        The easiest way to use deep metric learning in your application. Modular, flexible, and extensible.

        + + + + +
        +
        + +
        +
        +
        +
        + +
        +
        + +
        + +
        +
        +
        + +
        +
        +
        +
        +

        Docs

        +

        Access comprehensive developer documentation for PyTorch

        + View Docs +
        + +
        +

        Tutorials

        +

        Get in-depth tutorials for beginners and advanced developers

        + View Tutorials +
        + +
        +

        Resources

        +

        Find development resources and get your questions answered

        + View Resources +
        +
        +
        +
        + +
        + +
        + +
        +
        +
        +
        + + +
        +
        +
        + + +
        + + + + + + + + + + + + + + + + + + + + + + + diff --git a/ecosystem/pytorch-nlp/index.html b/ecosystem/pytorch-nlp/index.html new file mode 100644 index 000000000000..5645c3c62628 --- /dev/null +++ b/ecosystem/pytorch-nlp/index.html @@ -0,0 +1,718 @@ + + + + + + + + + + + + + PyTorch-NLP | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
        +
        + Join us at PyTorch Conference in San Francisco, October 22-23. CFP open now! Learn more. +
        +
        +
        +
        + + +
        + + + + + + + + +
        + +
        +
        + + +
        + +
        +
        +

        PyTorch-NLP

        + + + + + Group 5 + Created with Sketch. + + + + + + + + + + + + + + + + + + + + + + + + + + + +

        Basic Utilities for PyTorch Natural Language Processing (NLP).

        + + + + +
        +
        + +
        +
        +
        +
        + +
        +
        + +
        + +
        +
        +
        + +
        +
        +
        +
        +

        Docs

        +

        Access comprehensive developer documentation for PyTorch

        + View Docs +
        + +
        +

        Tutorials

        +

        Get in-depth tutorials for beginners and advanced developers

        + View Tutorials +
        + +
        +

        Resources

        +

        Find development resources and get your questions answered

        + View Resources +
        +
        +
        +
        + +
        + +
        + +
        +
        +
        +
        + + +
        +
        +
        + + +
        + + + + + + + + + + + + + + + + + + + + + + + diff --git a/ecosystem/pytorch3d/index.html b/ecosystem/pytorch3d/index.html new file mode 100644 index 000000000000..efec966c8473 --- /dev/null +++ b/ecosystem/pytorch3d/index.html @@ -0,0 +1,718 @@ + + + + + + + + + + + + + PyTorch3D | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
        +
        + Join us at PyTorch Conference in San Francisco, October 22-23. CFP open now! Learn more. +
        +
        +
        +
        + + +
        + + + + + + + + +
        + +
        +
        + + +
        + +
        +
        +

        PyTorch3D

        + + + + + Group 5 + Created with Sketch. + + + + + + + + + + + + + + + + + + + + + + + + + + + +

        PyTorch3D provides efficient, reusable components for 3D Computer Vision research with PyTorch.

        + + + + +
        +
        + +
        +
        +
        +
        + +
        +
        + +
        + +
        +
        +
        + +
        +
        +
        +
        +

        Docs

        +

        Access comprehensive developer documentation for PyTorch

        + View Docs +
        + +
        +

        Tutorials

        +

        Get in-depth tutorials for beginners and advanced developers

        + View Tutorials +
        + +
        +

        Resources

        +

        Find development resources and get your questions answered

        + View Resources +
        +
        +
        +
        + +
        + +
        + +
        +
        +
        +
        + + +
        +
        +
        + + +
diff --git a/ecosystem/pytorch_geometric_temporal/index.html b/ecosystem/pytorch_geometric_temporal/index.html
new file mode 100644
index 000000000000..ff3a7d42179a
--- /dev/null
+++ b/ecosystem/pytorch_geometric_temporal/index.html
@@ -0,0 +1,718 @@
+ PyTorch Geometric Temporal
+ PyTorch Geometric Temporal is a temporal (dynamic) extension library for PyTorch Geometric.
+ [rest of file: standard ecosystem page template (head, conference banner, navigation, Docs/Tutorials/Resources cards, footer); markup omitted]
diff --git a/ecosystem/pytorchfi/index.html b/ecosystem/pytorchfi/index.html
new file mode 100644
index 000000000000..2b1cd7623361
--- /dev/null
+++ b/ecosystem/pytorchfi/index.html
@@ -0,0 +1,718 @@
+ pytorchfi
+ A runtime fault injection tool for PyTorch.
+ [rest of file: standard ecosystem page template (head, conference banner, navigation, Docs/Tutorials/Resources cards, footer); markup omitted]
diff --git a/ecosystem/pytorchvideo/index.html b/ecosystem/pytorchvideo/index.html
new file mode 100644
index 000000000000..5cea9dbe8f50
--- /dev/null
+++ b/ecosystem/pytorchvideo/index.html
@@ -0,0 +1,718 @@
+ PyTorchVideo
+ A deep learning library for video understanding research. Hosts various video-focused models, datasets, training pipelines and more.
+ [rest of file: standard ecosystem page template (head, conference banner, navigation, Docs/Tutorials/Resources cards, footer); markup omitted]
diff --git a/ecosystem/rastervision/index.html b/ecosystem/rastervision/index.html
new file mode 100644
index 000000000000..a0d89bb659d6
--- /dev/null
+++ b/ecosystem/rastervision/index.html
@@ -0,0 +1,718 @@
+ raster-vision
+ An open source framework for deep learning on satellite and aerial imagery.
+ [rest of file: standard ecosystem page template (head, conference banner, navigation, Docs/Tutorials/Resources cards, footer); markup omitted]
diff --git a/ecosystem/ray/index.html b/ecosystem/ray/index.html
new file mode 100644
index 000000000000..45052e92beb7
--- /dev/null
+++ b/ecosystem/ray/index.html
@@ -0,0 +1,718 @@
+ Ray
+ Ray is a fast and simple framework for building and running distributed applications.
+ [rest of file: standard ecosystem page template (head, conference banner, navigation, Docs/Tutorials/Resources cards, footer); markup omitted]
diff --git a/ecosystem/renate/index.html b/ecosystem/renate/index.html
new file mode 100644
index 000000000000..a45920a37077
--- /dev/null
+++ b/ecosystem/renate/index.html
@@ -0,0 +1,718 @@
+ Renate
+ Renate is a library providing tools for re-training PyTorch models over time as new data becomes available.
+ [rest of file: standard ecosystem page template (head, conference banner, navigation, Docs/Tutorials/Resources cards, footer); markup omitted]
diff --git a/ecosystem/roma/index.html b/ecosystem/roma/index.html
new file mode 100644
index 000000000000..1378cd840643
--- /dev/null
+++ b/ecosystem/roma/index.html
@@ -0,0 +1,11 @@
+ Redirecting…
+ Click here if you are not redirected.
+ [redirect stub; meta-refresh markup omitted]
diff --git a/ecosystem/simulai/index.html b/ecosystem/simulai/index.html
new file mode 100644
index 000000000000..5007a5271983
--- /dev/null
+++ b/ecosystem/simulai/index.html
@@ -0,0 +1,718 @@
+ SimulAI
+ SimulAI is a toolkit with pipelines for physics-informed machine learning.
+ [rest of file: standard ecosystem page template (head, conference banner, navigation, Docs/Tutorials/Resources cards, footer); markup omitted]
diff --git a/ecosystem/skorch/index.html b/ecosystem/skorch/index.html
new file mode 100644
index 000000000000..1ff6b699f373
--- /dev/null
+++ b/ecosystem/skorch/index.html
@@ -0,0 +1,11 @@
+ Redirecting…
+ Click here if you are not redirected.
+ [redirect stub; meta-refresh markup omitted]
diff --git a/ecosystem/stable-baselines3/index.html b/ecosystem/stable-baselines3/index.html
new file mode 100644
index 000000000000..86ed8b5a8c33
--- /dev/null
+++ b/ecosystem/stable-baselines3/index.html
@@ -0,0 +1,718 @@
+ Stable Baselines3
+ Stable Baselines3 (SB3) is a set of reliable implementations of reinforcement learning algorithms in PyTorch.
+ [rest of file: standard ecosystem page template (head, conference banner, navigation, Docs/Tutorials/Resources cards, footer); markup omitted]
diff --git a/ecosystem/stoke/index.html b/ecosystem/stoke/index.html
new file mode 100644
index 000000000000..6c30bfd6c840
--- /dev/null
+++ b/ecosystem/stoke/index.html
@@ -0,0 +1,718 @@
+ stoke
+ A lightweight declarative PyTorch wrapper for context switching between devices, distributed modes, mixed-precision, and PyTorch extensions.
+ [rest of file: standard ecosystem page template (head, conference banner, navigation, Docs/Tutorials/Resources cards, footer); markup omitted]
diff --git a/ecosystem/substra/index.html b/ecosystem/substra/index.html
new file mode 100644
index 000000000000..5d1685a61e2c
--- /dev/null
+++ b/ecosystem/substra/index.html
@@ -0,0 +1,718 @@
+ Substra
+ Substra is a federated learning Python library to run federated learning experiments at scale on real distributed data.
+ [rest of file: standard ecosystem page template (head, conference banner, navigation, Docs/Tutorials/Resources cards, footer); markup omitted]
diff --git a/ecosystem/tensorly/index.html b/ecosystem/tensorly/index.html
new file mode 100644
index 000000000000..4dc941d07329
--- /dev/null
+++ b/ecosystem/tensorly/index.html
@@ -0,0 +1,11 @@
+ Redirecting…
+ Click here if you are not redirected.
+ [redirect stub; meta-refresh markup omitted]
diff --git a/ecosystem/textbrewer/index.html b/ecosystem/textbrewer/index.html
new file mode 100644
index 000000000000..2c18eccbd647
--- /dev/null
+++ b/ecosystem/textbrewer/index.html
@@ -0,0 +1,718 @@
+ TextBrewer
+ A PyTorch-based knowledge distillation toolkit for natural language processing.
+ [rest of file: standard ecosystem page template (head, conference banner, navigation, Docs/Tutorials/Resources cards, footer); markup omitted]
diff --git a/ecosystem/tiatoolbox/index.html b/ecosystem/tiatoolbox/index.html
new file mode 100644
index 000000000000..f602e2684d7d
--- /dev/null
+++ b/ecosystem/tiatoolbox/index.html
@@ -0,0 +1,718 @@
+ TIAToolbox
+ TIAToolbox provides an easy-to-use API where researchers can use, adapt and create models for CPath.
+ [rest of file: standard ecosystem page template (head, conference banner, navigation, Docs/Tutorials/Resources cards, footer); markup omitted]
diff --git a/ecosystem/torchdistill/index.html b/ecosystem/torchdistill/index.html
new file mode 100644
index 000000000000..6b2f66d586cc
--- /dev/null
+++ b/ecosystem/torchdistill/index.html
@@ -0,0 +1,718 @@
+ torchdistill
+ torchdistill is a coding-free framework built on PyTorch for reproducible deep learning and knowledge distillation studies.
+ [rest of file: standard ecosystem page template (head, conference banner, navigation, Docs/Tutorials/Resources cards, footer); markup omitted]
diff --git a/ecosystem/torchdrift/index.html b/ecosystem/torchdrift/index.html
new file mode 100644
index 000000000000..6107f84e7740
--- /dev/null
+++ b/ecosystem/torchdrift/index.html
@@ -0,0 +1,718 @@
+ TorchDrift
+ TorchDrift is a data and concept drift library for PyTorch. It lets you monitor your PyTorch models to see if they operate within spec.
+ [rest of file: standard ecosystem page template (head, conference banner, navigation, Docs/Tutorials/Resources cards, footer); markup omitted]
diff --git a/ecosystem/torchdrug/index.html b/ecosystem/torchdrug/index.html
new file mode 100644
index 000000000000..827ad92aee97
--- /dev/null
+++ b/ecosystem/torchdrug/index.html
@@ -0,0 +1,718 @@
+ torchdrug
+ A powerful and flexible machine learning platform for drug discovery.
+ [rest of file: standard ecosystem page template (head, conference banner, navigation, Docs/Tutorials/Resources cards, footer); markup omitted]
diff --git a/ecosystem/torchgeo/index.html b/ecosystem/torchgeo/index.html
new file mode 100644
index 000000000000..dc0ce342bf5f
--- /dev/null
+++ b/ecosystem/torchgeo/index.html
@@ -0,0 +1,718 @@
+ torchgeo
+ Datasets, transforms, and models for geospatial data.
+ [rest of file: standard ecosystem page template (head, conference banner, navigation, Docs/Tutorials/Resources cards, footer); markup omitted]
diff --git a/ecosystem/torchio/index.html b/ecosystem/torchio/index.html
new file mode 100644
index 000000000000..0fb566b7de28
--- /dev/null
+++ b/ecosystem/torchio/index.html
@@ -0,0 +1,718 @@
+ TorchIO
+ TorchIO is a set of tools to efficiently read, preprocess, sample, augment, and write 3D medical images in deep learning applications written in PyTorch.
+ [rest of file: standard ecosystem page template (head, conference banner, navigation, Docs/Tutorials/Resources cards, footer); markup omitted]
diff --git a/ecosystem/torchmetrics/index.html b/ecosystem/torchmetrics/index.html
new file mode 100644
index 000000000000..d4df40ef3eae
--- /dev/null
+++ b/ecosystem/torchmetrics/index.html
@@ -0,0 +1,718 @@
+ TorchMetrics
+ Machine learning metrics for distributed, scalable PyTorch applications.
+ [rest of file: standard ecosystem page template (head, conference banner, navigation, Docs/Tutorials/Resources cards, footer); markup omitted]
diff --git a/ecosystem/torchopt/index.html b/ecosystem/torchopt/index.html
new file mode 100644
index 000000000000..073c8871fd3b
--- /dev/null
+++ b/ecosystem/torchopt/index.html
@@ -0,0 +1,718 @@
+ TorchOpt
+ TorchOpt is a PyTorch-based library for efficient differentiable optimization.
+ [rest of file: standard ecosystem page template (head, conference banner, navigation, Docs/Tutorials/Resources cards, footer); markup omitted]
diff --git a/ecosystem/torchpoints3d/index.html b/ecosystem/torchpoints3d/index.html
new file mode 100644
index 000000000000..ecdd7f8cfebf
--- /dev/null
+++ b/ecosystem/torchpoints3d/index.html
@@ -0,0 +1,718 @@
+ PyTorch-Points3d
+ A PyTorch framework for deep learning on point clouds.
+ [rest of file: standard ecosystem page template (head, conference banner, navigation, Docs/Tutorials/Resources cards, footer); markup omitted]
diff --git a/ecosystem/torchquantum/index.html b/ecosystem/torchquantum/index.html
new file mode 100644
index 000000000000..ac5c97323e4e
--- /dev/null
+++ b/ecosystem/torchquantum/index.html
@@ -0,0 +1,718 @@
+ TorchQuantum
+ TorchQuantum is a quantum-classical simulation framework based on PyTorch. It supports statevector, density matrix simulation and pulse simulation on different hardware platforms such as CPUs and GPUs.
+ [rest of file: standard ecosystem page template (head, conference banner, navigation, Docs/Tutorials/Resources cards, footer); markup omitted]
diff --git a/ecosystem/trains/index.html b/ecosystem/trains/index.html
new file mode 100644
index 000000000000..50a5a5365dd3
--- /dev/null
+++ b/ecosystem/trains/index.html
@@ -0,0 +1,718 @@
+ Clear ML
+ ClearML is a full system ML / DL experiment manager, versioning and ML-Ops solution.
+ [rest of file: standard ecosystem page template (head, conference banner, navigation, Docs/Tutorials/Resources cards, footer); markup omitted]
diff --git a/ecosystem/transformers/index.html b/ecosystem/transformers/index.html
new file mode 100644
index 000000000000..f149985d3420
--- /dev/null
+++ b/ecosystem/transformers/index.html
@@ -0,0 +1,718 @@
+ Transformers
+ State-of-the-art Natural Language Processing for PyTorch.
+ [rest of file: standard ecosystem page template (head, conference banner, navigation, Docs/Tutorials/Resources cards, footer); markup omitted]
diff --git a/ecosystem/trtorch/index.html b/ecosystem/trtorch/index.html
new file mode 100644
index 000000000000..b677bf6f3d14
--- /dev/null
+++ b/ecosystem/trtorch/index.html
@@ -0,0 +1,718 @@
+ Torch-TensorRT
+ PyTorch/TorchScript compiler for NVIDIA GPUs using TensorRT.
+ [rest of file: standard ecosystem page template (head, conference banner, navigation, Docs/Tutorials/Resources cards, footer); markup omitted]
diff --git a/ecosystem/usb/index.html b/ecosystem/usb/index.html
new file mode 100644
index 000000000000..1fc1cb8c7cf3
--- /dev/null
+++ b/ecosystem/usb/index.html
@@ -0,0 +1,718 @@
+ USB
+ USB is a PyTorch-based Python package for Semi-Supervised Learning (SSL). It is easy to use and extend, affordable to small groups, and comprehensive for developing and evaluating SSL algorithms.
+ [rest of file: standard ecosystem page template (head, conference banner, navigation, Docs/Tutorials/Resources cards, footer); markup omitted]
diff --git a/ecosystem/vissl/index.html b/ecosystem/vissl/index.html
new file mode 100644
index 000000000000..da8115edf718
--- /dev/null
+++ b/ecosystem/vissl/index.html
@@ -0,0 +1,718 @@
+ VISSL
+ A library for state-of-the-art self-supervised learning.
+ [rest of file: standard ecosystem page template (head, conference banner, navigation, Docs/Tutorials/Resources cards, footer); markup omitted]
diff --git a/ecosystem/vllm/index.html b/ecosystem/vllm/index.html
new file mode 100644
index 000000000000..b130425d21c2
--- /dev/null
+++ b/ecosystem/vllm/index.html
@@ -0,0 +1,718 @@
+ vllm
+ vllm is a high-throughput and memory-efficient inference and serving engine for LLMs.
+ [rest of file: standard ecosystem page template (head, conference banner, navigation, Docs/Tutorials/Resources cards, footer); markup omitted]
diff --git a/edge.html b/edge.html
index a94862e306ab..76f7990b0c40 100644
--- a/edge.html
+++ b/edge.html
@@ -1,12 +1,310 @@
----
-layout: default
-title: PyTorch Edge
-body-class: announcement
-background-class: announcement-background
-permalink: /edge
----
+ [Jekyll front matter replaced by the rendered static page: head, conference banner, and navigation markup omitted]
+ PyTorch Edge
@@ -55,3 +353,306 @@
  Learn more about PyTorch Edge
+ [Docs/Tutorials/Resources cards and footer markup appended; omitted]
diff --git a/events.html b/events.html
index 1f90419bdebe..d3a7cf5b6b84 100644
--- a/events.html
+++ b/events.html
@@ -1,39 +1,2193 @@
----
-layout: default
-title: Events
-permalink: /events
-body-class: ecosystem
-background-class: events-background
----
+ [Jekyll front matter and include tags replaced by the rendered static page: head, conference banner, and navigation markup omitted]
+ Events
+ Don’t miss out on our upcoming PyTorch live events. Find a complete list below.
+ [event listing markup, Docs/Tutorials/Resources cards, and footer appended; omitted]
diff --git a/past_issues/2021-03-11-issue-1.html b/past_issues/2021-03-11-issue-1.html
new file mode 100644
index 000000000000..1f31867d20a4
--- /dev/null
+++ b/past_issues/2021-03-11-issue-1.html
@@ -0,0 +1,40 @@

        Issue #1

        + +

        Welcome to the first issue of the PyTorch Contributors newsletter! Keeping track of everything that’s happening in the PyTorch developer world is a big task; here you will find curated news including RFCs, feature roadmaps, notable PRs, editorials from developers, and more. If you have questions or suggestions for the newsletter, just reply back to this email.

        + +

        PyTorch 1.8.0

        + +

        PyTorch 1.8 was released on March 4th with support for functional transformations using torch.fx, stabilized frontend APIs for scientific computing (torch.fft, torch.linalg, Autograd for complex tensors) and significant improvements to distributed training. Read the full Release Notes.
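For readers who want to try the stabilized APIs mentioned above, here is a minimal sketch using torch.fft and complex autograd; the calls follow the 1.8 documentation, and the tensors are made-up examples rather than anything from the release notes:

```python
import torch

# torch.fft: spectral ops live in a dedicated module as of 1.8
signal = torch.randn(64, dtype=torch.complex64)
spectrum = torch.fft.fft(signal)

# Autograd for complex tensors: backward() works through complex ops
# as long as the final loss is real-valued
z = torch.randn(8, dtype=torch.complex64, requires_grad=True)
loss = (z * z.conj()).sum().real
loss.backward()
print(spectrum.shape, z.grad.shape)
```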

        + +

        PyTorch Ecosystem Day

        + +

        On April 21, we’re hosting a virtual event for our ecosystem and industry communities to showcase their work and discover new opportunities to collaborate. The day will be filled with discussion on new developments, trends, challenges and best practices through posters, breakout sessions and networking.

        + +

        The PyTorch open source process

        + +

        @ezyang describes the challenges of maintaining a PyTorch-scale project, and the current open source processes (triaging and CI oncalls, RFC discussions) to help PyTorch operate effectively.

        + +

        Developers forum

        + +

We launched https://dev-discuss.pytorch.org/, a low-traffic, high-signal forum for long-form discussions about PyTorch internals.

        + +

        [RFC] Dataloader v2

        + +

        @VitalyFedyunin proposes redesigning the DataLoader to support lazy loading, sharding, pipelining data operations (including async) and shuffling & sampling in a more modular way. Join the discussion here.

        + +

        [RFC] Improving TorchScript Usability

        + +

In a series of three blog posts (1, 2, 3), @t-vi explores ideas to improve the user and developer experience of TorchScript.

        + +

        [RFC] CSR and DM storage formats for sparse tensors

        + +

        @pearu proposes an RFC to make linear algebra operations more performant by

        + +
          +
• implementing the CSR storage format, where a 2D array is defined by its shape plus 1D PyTorch tensors for the compressed row indices, column indices, and values (a brief sketch follows this list)
• introducing the Dimension Mapping storage format, which generalizes 2D CSR to multidimensional arrays using a bijective mapping between the storage and wrapper elements.
        + +
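A small sketch of the CSR layout described in the first bullet above. Note that torch.sparse_csr_tensor is the constructor that shipped in later releases, so treat this as an illustration of the proposed three-tensor layout rather than the API under discussion in the RFC:

```python
import torch

# Dense 2x3 matrix with two non-zeros:
# [[0., 1., 0.],
#  [2., 0., 0.]]
crow_indices = torch.tensor([0, 1, 2])  # row i's values live in values[crow[i]:crow[i+1]]
col_indices = torch.tensor([1, 0])      # column index of each stored value
values = torch.tensor([1.0, 2.0])       # the non-zero entries themselves

# Constructor from later PyTorch releases, used here only for illustration.
csr = torch.sparse_csr_tensor(crow_indices, col_indices, values, size=(2, 3))
print(csr.to_dense())
```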

        [RFC] Forward Mode AD

        + +

        @albanD proposes an RFC to implement forward mode autodiff using Tensor-based dual numbers, where the real part represents the tensor and the dual part stores the forward gradient of the tensor. The core of the feature has landed (PR), with more formulas in WIP. Complete forward mode AD is expected to land by July 2021.

diff --git a/past_issues/2021-05-11-issue-2.html b/past_issues/2021-05-11-issue-2.html
new file mode 100644
index 000000000000..433d70e028a6
--- /dev/null
+++ b/past_issues/2021-05-11-issue-2.html
@@ -0,0 +1,38 @@

        Issue #2

        + +

        Welcome to the second edition of the PyTorch newsletter! In this issue, read about how we celebrated the PyTorch community at the first-ever PyTorch Ecosystem Day (PTED), discover a new podcast for PyTorch developers, and learn about important updates to the PyTorch frontend.

        + +

        PyTorch Ecosystem Day

        + +

        Piotr Bialecki (Sr. Software Engineer, NVIDIA) spoke about his journey of using PyTorch and what he sees in the future for PyTorch. Miquel Farré (Sr. Technology Manager, Disney) spoke about the Creative Genome project that uses the PyTorch ecosystem to annotate all Disney content. Ritchie Ng (CEO, Hessian Matrix) spoke about the growth of AI in the Asia Pacific region, and how to get started with PyTorch for production AI use cases. Members of the community showcased how they were using PyTorch via 71 posters and pop-up breakout sessions. See all of the posters and listen to the opening keynote talks here!

        + +

        PyTorch Developer Podcast

        + +

        Edward Yang (Research Engineer, Facebook AI) talks about internal development concepts like binding C++ in Python, the dispatcher, PyTorch’s library structure and more. Check out this new series; each episode is around 15 minutes long. Listen to it wherever you get your podcasts.

        + +

        Forward Mode AD

        +

The core logic for Forward Mode AD (based on "dual tensors") is now in PyTorch. All the APIs to manipulate such Tensors, along with the codegen and view handling, are already in master (1.9.0a0). Gradcheck and a first set of formulas will be added in the following month; full support for all PyTorch functions, custom Autograd functions and higher order gradients will happen later this year. Read more about this or share your feedback with @albanD on the corresponding RFC.
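A short sketch of the dual-tensor workflow under torch.autograd.forward_ad. At the time of writing the surface was still being finalized, so the names below are the ones that ultimately stabilized and may differ slightly from the 1.9.0a0 nightlies:

```python
import torch
import torch.autograd.forward_ad as fwAD

primal = torch.randn(3)
tangent = torch.randn(3)  # direction for the Jacobian-vector product

with fwAD.dual_level():
    dual = fwAD.make_dual(primal, tangent)  # "dual tensor": primal + forward gradient
    out = torch.sin(dual) * 2
    jvp = fwAD.unpack_dual(out).tangent     # forward-mode gradient along `tangent`

print(jvp)  # equals 2 * cos(primal) * tangent
```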

        + +

        Make complex conjugation lazy

        + +

        PR #54987 makes the conjugate operation on complex tensors return a view that has a special is_conj() bit flipped. Aside from saving memory by not creating a full tensor, this grants a potential speedup if the following operation can handle conjugated inputs directly. For such operations (like gemm), a flag is passed to the low-level API; for others the conjugate is materialized before passing to the operation.
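A minimal sketch of the behavior described above; is_conj() is part of this work, while resolve_conj() (explicit materialization) landed alongside it in later nightlies:

```python
import torch

x = torch.randn(3, dtype=torch.complex64)
y = x.conj()          # lazy: a view with the conjugate bit flipped, no copy
print(y.is_conj())    # True

# Ops that understand the bit (e.g. matmul/gemm) can fuse the conjugation;
# for everything else the conjugate is materialized first.
z = y.resolve_conj()  # force materialization explicitly
print(z.is_conj())    # False
```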

        + +

        torch.use_deterministic_algorithms is stable

        + +

        torch.use_deterministic_algorithms() (docs) is stable in master (1.9.0a0). If True, the flag switches non-deterministic operations to their deterministic implementation if available, and throws a RuntimeError if not.
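A quick sketch of the flag in use; which specific ops raise depends on the build and device, so the comment below stays generic:

```python
import torch

torch.use_deterministic_algorithms(True)
print(torch.are_deterministic_algorithms_enabled())  # True

# With the flag on, ops silently switch to their deterministic kernels where
# one exists; an op with no deterministic implementation (mostly certain CUDA
# kernels) raises RuntimeError instead of running non-deterministically.
torch.use_deterministic_algorithms(False)
```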

        + +

        torch.linalg and torch.special

        + +

torch.linalg is now stable; the module maintains fidelity with NumPy’s np.linalg linear algebra functions. torch.special (beta) contains functions in scipy.special. Here’s the tracking issue if you’d like to contribute functions to torch.special. If you want a function not already on the list, let us know on the tracking issue about your use case and why it should be added.
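A few representative calls from the two modules, assuming nothing beyond what the module docs list:

```python
import torch

A = torch.randn(4, 4)
b = torch.randn(4)

# torch.linalg mirrors np.linalg
x = torch.linalg.solve(A, b)           # solve Ax = b
fro = torch.linalg.norm(A, ord="fro")  # Frobenius norm

# torch.special mirrors scipy.special (beta)
erf = torch.special.erf(torch.linspace(-2, 2, 5))
lgamma = torch.special.gammaln(torch.tensor([1.0, 2.0, 5.0]))
```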

        + +

        Generalizing AMP to work on CPU

        + +
        +

@ezyang: Intel is interested in bringing automatic mixed precision to CPU in [RFC] Extend Autocast to CPU/CUDA with BF16 data type (Issue #55374, pytorch/pytorch). One big question is what the API for autocasting should be for CPU: should we provide a single, generalized API torch.autocast (keep in mind that CPU autocasting would be through bfloat16, while the existing GPU autocasting is via float16), or provide separate APIs for CPU/CUDA? If you have any thoughts or opinions on the subject, please chime in on the issue.
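For context, the single generalized torch.autocast entry point is the direction that was eventually taken; at the time of this issue it was still an open question, so the snippet below is a sketch of that option using the names that later shipped, not the API under discussion:

```python
import torch

model = torch.nn.Linear(16, 4)
x = torch.randn(8, 16)

# One generalized entry point: bfloat16 autocasting on CPU,
# float16 on CUDA via device_type="cuda".
with torch.autocast(device_type="cpu", dtype=torch.bfloat16):
    y = model(x)

print(y.dtype)  # torch.bfloat16 for autocast-eligible ops such as linear
```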

        +
        + +


        +

        + +

        Are you enjoying reading this newsletter? What would you like to know more about? All feedback is welcome and appreciated! To share your suggestions, use this form or simply reply to this email.

diff --git a/preview_setup.sh b/preview_setup.sh
new file mode 100755
index 000000000000..14052d9e5d47
--- /dev/null
+++ b/preview_setup.sh
@@ -0,0 +1,6 @@
+#!/bin/bash
+rm -rf pytorch.github.io
+git clone --recursive https://github.com/pytorch/pytorch.github.io.git -b site --depth 1
+cp *.md pytorch.github.io/_hub
+cp images/* pytorch.github.io/assets/images/
+
diff --git a/previous-versions.html b/previous-versions.html
new file mode 100644
index 000000000000..4aab09c5eec4
--- /dev/null
+++ b/previous-versions.html
@@ -0,0 +1,11 @@
+ Redirecting…
+ Click here if you are not redirected.
+ [redirect stub; meta-refresh markup omitted]
diff --git a/pt-26-live-q-a.html b/pt-26-live-q-a.html
index 154fe993fca7..dd646ad878c9 100644
--- a/pt-26-live-q-a.html
+++ b/pt-26-live-q-a.html
@@ -1,12 +1,310 @@
----
-layout: default
-title: "PyTorch 2.6 Live Q&A"
-body-class: announcement
-background-class: announcement-background
-permalink: /pt-26-live-q-a
----
+ [Jekyll front matter replaced by the rendered static page: head, conference banner, and navigation markup omitted]
+ PyTorch Webinars
@@ -40,4 +338,306 @@
  PyTorch 2.6 Live Q&A
+ [Docs/Tutorials/Resources cards and footer markup appended; omitted]
diff --git a/pt-27-release-qa.html b/pt-27-release-qa.html
index 12a713ab06b2..5ed6835d8ec5 100644
--- a/pt-27-release-qa.html
+++ b/pt-27-release-qa.html
@@ -1,12 +1,310 @@
----
-layout: default
-title: "PyTorch 2.7 Release Live Q&A"
-body-class: announcement
-background-class: announcement-background
-permalink: /pt-27-release-qa
----
+ [Jekyll front matter replaced by the rendered static page: head, conference banner, and navigation markup omitted]
+ PyTorch Webinars
@@ -43,4 +341,306 @@
  PyTorch 2.7 Release Live Q&A
+ [Docs/Tutorials/Resources cards and footer markup appended; omitted]
        + + + + + + + + + + + + + + + + + + + + + + + diff --git a/pt-dinov2-multi-label-plant-species-classification.html b/pt-dinov2-multi-label-plant-species-classification.html index 10b99f213935..cdb21ecd2038 100644 --- a/pt-dinov2-multi-label-plant-species-classification.html +++ b/pt-dinov2-multi-label-plant-species-classification.html @@ -1,12 +1,310 @@ ---- -layout: default -title: "Using PyTorch and DINOv2 for Multi-label Plant Species Classification" -body-class: announcement -background-class: announcement-background -permalink: /pt-dinov2-multi-label-plant-species-classification ---- - -
        + + + + + + + + + + + + + Using PyTorch and DINOv2 for Multi-label Plant Species Classification | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
        +
        + Join us at PyTorch Conference in San Francisco, October 22-23. CFP open now! Learn more. +
        +
        +
        +
        + + +
        + + + + + + + + +
        + +
        +
        + + +
        + +

        PyTorch Webinars

        @@ -36,4 +334,306 @@

        Using PyTorch and DINOv2 for Multi-label Plant Species Classification

        -
        \ No newline at end of file +
        + +
        +
        +
        +
        +

        Docs

        +

        Access comprehensive developer documentation for PyTorch

        + View Docs +
        + +
        +

        Tutorials

        +

        Get in-depth tutorials for beginners and advanced developers

        + View Tutorials +
        + +
        +

        Resources

        +

        Find development resources and get your questions answered

        + View Resources +
        +
        +
        +
        + +
        + +
        + +
        +
        +
        +
        + + +
        +
        +
        + + +
        + + + + + + + + + + + + + + + + + + + + + + + diff --git a/pytorch-domains.html b/pytorch-domains.html index 740c2a49fbd1..12e6620217fb 100644 --- a/pytorch-domains.html +++ b/pytorch-domains.html @@ -1,12 +1,310 @@ ---- -layout: default -title: PyTorch Domains -permalink: /pytorch-domains -body-class: blog -background-class: features-background ---- - -
        + + + + + + + + + + + + + PyTorch Domains | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
        +
        + Join us at PyTorch Conference in San Francisco, October 22-23. CFP open now! Learn more. +
        +
        +
        +
        + + +
        + + + + + + + + +
        + +
        +
        + + +
        + +

        PyTorch Domains

        Learn about the various parts of the PyTorch ecosystem

        @@ -113,3 +411,306 @@

        torchao

        + + +
        +
        +
        +
        +

        Docs

        +

        Access comprehensive developer documentation for PyTorch

        + View Docs +
        + +
        +

        Tutorials

        +

        Get in-depth tutorials for beginners and advanced developers

        + View Tutorials +
        + +
        +

        Resources

        +

        Find development resources and get your questions answered

        + View Resources +
        +
        +
        +
        + +
        + +
        + +
        +
        +
        +
        + + +
        +
        +
        + + +
        + + + + + + + + + + + + + + + + + + + + + + + diff --git a/redirects.json b/redirects.json new file mode 100644 index 000000000000..1a3520c54077 --- /dev/null +++ b/redirects.json @@ -0,0 +1 @@ +{"/2017/05/11/Internals.html":"https://pytorch.org/blog/a-tour-of-pytorch-internals-1/","/2017/06/27/Internals2.html":"https://pytorch.org/blog/a-tour-of-pytorch-internals-2/","/2018/01/19/a-year-in.html":"https://pytorch.org/blog/a-year-in/","/2018/03/05/tensor-comprehensions.html":"https://pytorch.org/blog/tensor-comprehensions/","/2018/04/22/0_4_0-migration-guide.html":"https://pytorch.org/blog/pytorch-0_4_0-migration-guide/","/2018/05/02/road-to-1.0.html":"https://pytorch.org/blog/the-road-to-1_0/","/2019/04/29/road-to-1.0.html":"https://pytorch.org/blog/stochastic-weight-averaging-in-pytorch/","/2019/05/08/model-serving-in-pyorch.html":"https://pytorch.org/blog/model-serving-in-pyorch/","/2019/05/23/torchvision03.html":"https://pytorch.org/blog/torchvision03/","/2019/06/10/pytorch_hub.html":"https://pytorch.org/blog/towards-reproducible-research-with-pytorch-hub/","/2019/07/23/mapillary-research.html":"https://pytorch.org/blog/mapillary-research/","/2019/08/06/pytorch_aug2019_releases.html":"https://pytorch.org/blog/pytorch-1.2-and-domain-api-release/","/get-started/":"https://pytorch.org/get-started/locally/","/previous-versions.html":"https://pytorch.org/get-started/previous-versions/","/ecosystem/Captum/":"https://captum.ai/","/ecosystem/Flair/":"https://github.com/flairNLP/flair","/ecosystem/Ignite/":"https://github.com/pytorch/ignite","/ecosystem/advertorch/":"https://github.com/BorealisAI/advertorch","/ecosystem/allennlp/":"https://allennlp.org/","/ecosystem/botorch/":"https://botorch.org/","/ecosystem/crypten/":"https://github.com/facebookresearch/CrypTen","/ecosystem/fastai/":"https://docs.fast.ai","/ecosystem/glow/":"https://github.com/pytorch/glow","/ecosystem/gpytorch/":"https://cornellius-gp.github.io/","/ecosystem/horovod/":"http://horovod.ai","/ecosystem/parlai/":"http://parl.ai/","/ecosystem/pennylane/":"https://pennylane.ai/","/ecosystem/pyro/":"http://pyro.ai/","/ecosystem/pysyft/":"https://github.com/OpenMined/PySyft","/ecosystem/pytorch-geometric/":"https://github.com/pyg-team/pytorch_geometric/","/ecosystem/pytorch-lightning/":"https://github.com/williamFalcon/pytorch-lightning","/ecosystem/roma/":"https://github.com/naver/roma","/ecosystem/skorch/":"https://github.com/skorch-dev/skorch","/ecosystem/tensorly/":"http://tensorly.org/stable/home.html","/mobile/":"https://pytorch.org/mobile/home/","/blog/categories/":"https://pytorch.org/blog/","/resources/contributors/":"https://pytorch.org/newsletter","/ecosystem/":"https://landscape.pytorch.org/","/ecosystem/join.html":"https://github.com/pytorch-fdn/ecosystem","/hub/developer-models/compact":"https://pytorch.org/hub/research-models/compact","/hub/developer-models":"https://pytorch.org/hub/research-models","/support/":"https://pytorch.org/resources/"} \ No newline at end of file diff --git a/resources.html b/resources.html deleted file mode 100644 index 7e0931b98cc6..000000000000 --- a/resources.html +++ /dev/null @@ -1,57 +0,0 @@ ---- -layout: default -title: Resources -permalink: resources/ -body-class: resources -background-class: resources-background -redirect_from: /support/ ---- -{% assign resources = site.resources | sort: 'order' %} - -
        -
        -

        Resources

        - -

        Explore educational courses, get your questions answered, and join the discussion with other PyTorch developers.

        -
        -
        - -
        -
        -
        -
        - - {% for resource in resources %} - {% assign card_title = resource.title | split: ' ' %} - - {% if resource.show-pytorch-logo == true %} - {% assign resource_class = "pytorch-resource" %} - {% else %} - {% assign resource_class = resource.class %} - {% endif %} - - -
        -
        -
        -
        \ No newline at end of file diff --git a/resources/contributors/index.html b/resources/contributors/index.html new file mode 100644 index 000000000000..cc6c293ce476 --- /dev/null +++ b/resources/contributors/index.html @@ -0,0 +1,11 @@ + + + + Redirecting… + + + + +

        Redirecting…

        + Click here if you are not redirected. + diff --git a/resources/index.html b/resources/index.html new file mode 100644 index 000000000000..b1b9cd6b710f --- /dev/null +++ b/resources/index.html @@ -0,0 +1,989 @@ + + + + + + + + + + + + + Resources | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
        +
        + Join us at PyTorch Conference in San Francisco, October 22-23. CFP open now! Learn more. +
        +
        +
        +
        + + +
        + + + + + + + + +
        + +
        +
        + + +
        + + + +
        +
        +

        Resources

        + +

        Explore educational courses, get your questions answered, and join the discussion with other PyTorch developers.

        +
        +
        + +
        +
        +
        +
        + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
        +
        +
        +
        +
        + +
        +
        +
        +
        +

        Docs

        +

        Access comprehensive developer documentation for PyTorch

        + View Docs +
        + +
        +

        Tutorials

        +

        Get in-depth tutorials for beginners and advanced developers

        + View Tutorials +
        + +
        +

        Resources

        +

        Find development resources and get your questions answered

        + View Resources +
        +
        +
        +
        + +
        + +
        + +
        +
        +
        +
        + + +
        +
        +
        + + +
        + + + + + + + + + + + + + + + + + + + + + + + diff --git a/scripts/add_noindex_tags.sh b/scripts/add_noindex_tags.sh deleted file mode 100755 index 73b5a6e85636..000000000000 --- a/scripts/add_noindex_tags.sh +++ /dev/null @@ -1,15 +0,0 @@ -#!/bin/bash - -# Adds tags to all html files in a -# directory (recursively) -# -# Usage: -# ./add_noindex_tags.sh directory -# -# Example (from the root directory) -# ./scripts/add_no_index_tags.sh docs/1.6.0 -if [ "$1" == "" ]; then - echo "Incorrect usage. Correct Usage: add_no_index_tags.sh " - exit 1 -fi -find $1 -name "*.html" -print0 | xargs -0 sed -i '//a \ \ ' diff --git a/scripts/deploy-site.sh b/scripts/deploy-site.sh deleted file mode 100755 index 43258031d5af..000000000000 --- a/scripts/deploy-site.sh +++ /dev/null @@ -1,66 +0,0 @@ -#!/bin/bash -# ideas used from https://gist.github.com/motemen/8595451 - -# Based on https://github.com/eldarlabs/ghpages-deploy-script/blob/master/scripts/deploy-ghpages.sh -# Used with their MIT license https://github.com/eldarlabs/ghpages-deploy-script/blob/master/LICENSE - -# abort the script if there is a non-zero error -set -ex - -# initialize hub submodule -git submodule deinit -f . && git submodule update --init --recursive - -# use latest hub -./_devel/update_hub_submodule.sh - -# Files not related to build should be deleted. -pushd _hub -rm -R `ls -1 -d */` -rm -f README.md -popd - -# show where we are on the machine -pwd -remote=$(git config remote.origin.url) - -# make a directory to put the master branch -mkdir master-branch -cd master-branch -# now lets setup a new repo so we can update the master branch -git init -git remote add origin "$remote" -git fetch --depth 1 - -# switch into the the master branch -if git rev-parse --verify origin/master > /dev/null 2>&1 -then - git checkout master - # delete any old site as we are going to replace it - # Note: this explodes if there aren't any, so moving it here for now - git rm -rf . -else - git checkout --orphan master -fi - -cd "../" -make build_deploy -cd master-branch - -# copy over or recompile the new site -cp -a "../_site/." . - -# have small jekyll config to allow underscores -echo "include: [_static, _images, _modules, _sources, _asserts.html, _creation.html, _comparison.html, _lowrank.html, _script.html, _diagnostic.html, _dynamo.html, _serialization.html, _type_utils, _tensor_str.html, _trace.html, _utils.html, _internal, _C, _distributed_autograd.html, _distributed_c10d.html, _distributed_rpc.html, _fft.html, _linalg.html, _monitor.html, _nested.html, _nn.html, _profiler.html, _sparse.html, _special.html, __config__.html, _dynamo, _lobpcg.html, _jit_internal.html, _numeric_suite.html, _numeric_suite_fx.html, _sanitizer.html, _symbolic_trace.html, _async.html, _freeze.html, _fuser.html, _type_utils.html, _utils ]" > _config.yml - -# stage any changes and new files -git add -A -# now commit, ignoring branch master doesn't seem to work, so trying skip -git commit --allow-empty -m "Deploy to GitHub Pages on master [ci skip]" -# and push, but send any output to /dev/null to hide anything sensitive -git push --force --quiet https://pytorchbot:$SECRET_PYTORCHBOT_TOKEN@github.com/pytorch/pytorch.github.io.git master -# go back to where we started and remove the master git repo we made and used -# for deployment -cd .. -rm -rf master-branch - -echo "Finished Deployment!" 
diff --git a/scripts/gen_quick_start_module.py b/scripts/gen_quick_start_module.py deleted file mode 100755 index 5fd20d79949f..000000000000 --- a/scripts/gen_quick_start_module.py +++ /dev/null @@ -1,271 +0,0 @@ -#!/usr/bin/env python3 -""" -Generates quick start module for https://pytorch.org/get-started/locally/ page -If called from update-quick-start-module.yml workflow (--autogenerate parameter set) -Will output new quick-start-module.js, and new published_version.json file -based on the current release matrix. -If called standalone will generate quick-start-module.js from existing -published_version.json file -""" - -import argparse -import copy -import json -from enum import Enum -from pathlib import Path -from typing import Dict - -BASE_DIR = Path(__file__).parent.parent - - -class OperatingSystem(Enum): - LINUX: str = "linux" - WINDOWS: str = "windows" - MACOS: str = "macos" - - -PRE_CXX11_ABI = "pre-cxx11" -CXX11_ABI = "cxx11-abi" -DEBUG = "debug" -RELEASE = "release" -DEFAULT = "default" -ENABLE = "enable" -DISABLE = "disable" -MACOS = "macos" - -# Mapping json to release matrix default values -acc_arch_ver_default = { - "nightly": { - "accnone": ("cpu", ""), - "cuda.x": ("cuda", "11.8"), - "cuda.y": ("cuda", "12.1"), - "cuda.z": ("cuda", "12.4"), - "rocm5.x": ("rocm", "6.0"), - }, - "release": { - "accnone": ("cpu", ""), - "cuda.x": ("cuda", "11.8"), - "cuda.y": ("cuda", "12.1"), - "cuda.z": ("cuda", "12.4"), - "rocm5.x": ("rocm", "6.0"), - }, -} - -# Initialize arch version to default values -# these default values will be overwritten by -# extracted values from the release marix -acc_arch_ver_map = acc_arch_ver_default - -LIBTORCH_DWNL_INSTR = { - PRE_CXX11_ABI: "Download here (Pre-cxx11 ABI):", - CXX11_ABI: "Download here (cxx11 ABI):", - RELEASE: "Download here (Release version):", - DEBUG: "Download here (Debug version):", - MACOS: "Download arm64 libtorch here (ROCm and CUDA are not supported):", -} - - -def load_json_from_basedir(filename: str): - try: - with open(BASE_DIR / filename) as fptr: - return json.load(fptr) - except FileNotFoundError as exc: - raise ImportError(f"File {filename} not found error: {exc.strerror}") from exc - except json.JSONDecodeError as exc: - raise ImportError(f"Invalid JSON {filename}") from exc - - -def read_published_versions(): - return load_json_from_basedir("published_versions.json") - - -def write_published_versions(versions): - with open(BASE_DIR / "published_versions.json", "w") as outfile: - json.dump(versions, outfile, indent=2) - -# Create releases JSON for PyTorch website. -# Matrix is being used to populate config data for -# the "Start Locally" installation options table. 
-def write_releases_file(matrix): - with open(BASE_DIR / "releases.json", "w") as outfile: - json.dump(matrix, outfile, indent=2) - -def read_matrix_for_os(osys: OperatingSystem, channel: str): - jsonfile = load_json_from_basedir(f"{osys.value}_{channel}_matrix.json") - return jsonfile["include"] - - -def read_quick_start_module_template(): - with open(BASE_DIR / "_includes" / "quick-start-module.js") as fptr: - return fptr.read() - - -def get_package_type(pkg_key: str, os_key: OperatingSystem) -> str: - if pkg_key != "pip": - return pkg_key - return "manywheel" if os_key == OperatingSystem.LINUX.value else "wheel" - - -def get_gpu_info(acc_key, instr, acc_arch_map): - gpu_arch_type, gpu_arch_version = acc_arch_map[acc_key] - if DEFAULT in instr: - gpu_arch_type, gpu_arch_version = acc_arch_map["accnone"] - return (gpu_arch_type, gpu_arch_version) - - -# This method is used for generating new published_versions.json file -# It will modify versions json object with installation instructions -# Provided by generate install matrix Github Workflow, stored in release_matrix -# json object. -def update_versions(versions, release_matrix, release_version): - version = "preview" - template = "preview" - acc_arch_map = acc_arch_ver_map[release_version] - - if release_version != "nightly": - version = release_matrix[OperatingSystem.LINUX.value][0]["stable_version"] - if version not in versions["versions"]: - versions["versions"][version] = copy.deepcopy( - versions["versions"][template] - ) - versions["latest_stable"] = version - - # Perform update of the json file from release matrix - for os_key, os_vers in versions["versions"][version].items(): - for pkg_key, pkg_vers in os_vers.items(): - for acc_key, instr in pkg_vers.items(): - package_type = get_package_type(pkg_key, os_key) - gpu_arch_type, gpu_arch_version = get_gpu_info( - acc_key, instr, acc_arch_map - ) - - pkg_arch_matrix = [ - x - for x in release_matrix[os_key] - if (x["package_type"], x["gpu_arch_type"], x["gpu_arch_version"]) - == (package_type, gpu_arch_type, gpu_arch_version) - ] - - if pkg_arch_matrix: - if package_type != "libtorch": - instr["command"] = pkg_arch_matrix[0]["installation"] - else: - if os_key == OperatingSystem.LINUX.value: - rel_entry_dict = { - x["devtoolset"]: x["installation"] - for x in pkg_arch_matrix - if x["libtorch_variant"] == "shared-with-deps" - } - if instr["versions"] is not None: - for ver in [CXX11_ABI, PRE_CXX11_ABI]: - # temporarily remove setting pre-cxx11-abi. For Release 2.7 we - # should remove pre-cxx11-abi completely. 
- if ver == PRE_CXX11_ABI: - continue - else: - instr["versions"][LIBTORCH_DWNL_INSTR[ver]] = ( - rel_entry_dict[ver] - ) - - elif os_key == OperatingSystem.WINDOWS.value: - rel_entry_dict = { - x["libtorch_config"]: x["installation"] - for x in pkg_arch_matrix - } - if instr["versions"] is not None: - for ver in [RELEASE, DEBUG]: - instr["versions"][LIBTORCH_DWNL_INSTR[ver]] = ( - rel_entry_dict[ver] - ) - elif os_key == OperatingSystem.MACOS.value: - if instr["versions"] is not None: - instr["versions"][LIBTORCH_DWNL_INSTR[MACOS]] = ( - pkg_arch_matrix[0]["installation"] - ) - - -# This method is used for generating new quick-start-module.js -# from the versions json object -def gen_install_matrix(versions) -> Dict[str, str]: - result = {} - version_map = { - "preview": "preview", - "stable": versions["latest_stable"], - } - for ver, ver_key in version_map.items(): - for os_key, os_vers in versions["versions"][ver_key].items(): - for pkg_key, pkg_vers in os_vers.items(): - for acc_key, instr in pkg_vers.items(): - extra_key = "python" if pkg_key != "libtorch" else "cplusplus" - key = f"{ver},{pkg_key},{os_key},{acc_key},{extra_key}" - note = instr["note"] - lines = [note] if note is not None else [] - if pkg_key == "libtorch": - ivers = instr["versions"] - if ivers is not None: - lines += [ - f"{lab}
        {val}" - for (lab, val) in ivers.items() - ] - else: - command = instr["command"] - if command is not None: - lines.append(command) - result[key] = "
        ".join(lines) - return result - - -# This method is used for extracting two latest verisons of cuda and -# last verion of rocm. It will modify the acc_arch_ver_map object used -# to update getting started page. -def extract_arch_ver_map(release_matrix): - def gen_ver_list(chan, gpu_arch_type): - return { - x["desired_cuda"]: x["gpu_arch_version"] - for x in release_matrix[chan]["linux"] - if x["gpu_arch_type"] == gpu_arch_type - } - - for chan in ("nightly", "release"): - cuda_ver_list = gen_ver_list(chan, "cuda") - rocm_ver_list = gen_ver_list(chan, "rocm") - cuda_list = sorted(cuda_ver_list.values()) - acc_arch_ver_map[chan]["rocm5.x"] = ("rocm", max(rocm_ver_list.values())) - for cuda_ver, label in zip(cuda_list, ["cuda.x", "cuda.y", "cuda.z"]): - acc_arch_ver_map[chan][label] = ("cuda", cuda_ver) - - -def main(): - parser = argparse.ArgumentParser() - parser.add_argument("--autogenerate", dest="autogenerate", action="store_true") - parser.set_defaults(autogenerate=True) - - options = parser.parse_args() - versions = read_published_versions() - - if options.autogenerate: - release_matrix = {} - for val in ("nightly", "release"): - release_matrix[val] = {} - for osys in OperatingSystem: - release_matrix[val][osys.value] = read_matrix_for_os(osys, val) - - write_releases_file(release_matrix) - - extract_arch_ver_map(release_matrix) - for val in ("nightly", "release"): - update_versions(versions, release_matrix[val], val) - - write_published_versions(versions) - - template = read_quick_start_module_template() - versions_str = json.dumps(gen_install_matrix(versions)) - template = template.replace("{{ installMatrix }}", versions_str) - template = template.replace( - "{{ VERSION }}", f"\"Stable ({versions['latest_stable']})\"" - ) - print(template.replace("{{ ACC ARCH MAP }}", json.dumps(acc_arch_ver_map))) - - -if __name__ == "__main__": - main() diff --git a/staff.html b/staff.html index 1fbdc00d951f..f4ead944c61d 100644 --- a/staff.html +++ b/staff.html @@ -1,12 +1,310 @@ ---- -layout: default -title: PyTorch Staff -body-class: announcement -background-class: announcement-background -permalink: /staff ---- - -
        + + + + + + + + + + + + + PyTorch Staff | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
        +
        + Join us at PyTorch Conference in San Francisco, October 22-23. CFP open now! Learn more. +
        +
        +
        +
        + + +
        + + + + + + + + +
        + +
        +
        + + +
        + +

        PyTorch Staff

        @@ -169,3 +467,306 @@

        Bazil Sterling, Communications and Marketing Associate

        + + +
        +
        +
        +
        +

        Docs

        +

        Access comprehensive developer documentation for PyTorch

        + View Docs +
        + +
        +

        Tutorials

        +

        Get in-depth tutorials for beginners and advanced developers

        + View Tutorials +
        + +
        +

        Resources

        +

        Find development resources and get your questions answered

        + View Resources +
        +
        +
        +
        + +
        + +
        + +
        +
        +
        +
        + + +
        +
        +
        + + +
        + + + + + + + + + + + + + + + + + + + + + + + diff --git a/style-guide.html b/style-guide.html index b0fe6c737a69..6f3e35fe7dc3 100644 --- a/style-guide.html +++ b/style-guide.html @@ -1,10 +1,310 @@ ---- -layout: default -body-class: style-guide -background-class: style-guide ---- + + + + + + + + + + + + + PyTorch + + + -
        + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
        +
        + Join us at PyTorch Conference in San Francisco, October 22-23. CFP open now! Learn more. +
        +
        +
        +
        + + +
        + + + + + + + + +
        + +
        +
        + + +
        + +

        Header 1
              Article Page

        Jumbotron text. Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua

        @@ -16,10 +316,475 @@

        Header 1
              Article Page

        - {% for post in site.style_guide %} - {{ post.content }} - {% endfor %} + +

        Header 2

        +

        This is body copy. Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea.

        + +

        Header 3

        + +

        This is body copy. Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea.

        + +

        Header 4

        + +

        This is body copy. Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea.

        + +
        Header 5
        + +

        This is body copy. Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea.

        + +
        + +

        This is more body copy with code snippets. Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Here is an inline link. Ut enim ad minim veniam, quis nostrud torch.*.FloatTensor ullamco laboris nisi ut aliquip ex ea commodo consequat.

        + +

        This is italicized body copy. Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat

        + +

        This is bolded body copy. Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat.

        + +
        + +

        This is body copy before an unordered list. Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea.

        + +
          +
        • Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.
        • +
        • Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur.
        • +
        • Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat.
        • +
        + +

        This is body copy after an unordered list. Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea.

        + +
          +
        1. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.
        2. +
        3. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur.
        4. +
        5. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat.
        6. +
        + +

        This is body copy after an ordered list. Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea.

        + +
        +
        Definition list
        +
        Lorem ipsum dolor sit amet, consectetur adipiscing elit
        + +
        Definition list
        +
        Lorem ipsum dolor sit amet, consectetur adipiscing elit
        + +
        Definition list
        +
        Lorem ipsum dolor sit amet, consectetur adipiscing elit
        +
        + +
        + +

        Here's an image

        + +
        + +
        +

        “This is a blockquote. Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat”

        +
        + +
          brew install pytorch # Here is a small code block
        +  brew install pytorch # Here is a small code block
        +
        + +
        # Here is a large code block with syntax highlighting
        +
        +# !/usr/bin/python3
        +
        +# Dictionaries map keys to values.
        +
        +fred = { 'mike': 456, 'bill': 399, 'sarah': 521 }
        +
        +# Subscripts.
        +try:
        +    print(fred)
        +    print(fred['bill'])
        +    print(fred['nora'])
        +    print("Won't see this!")
        +except KeyError as rest:
        +    print("Lookup failed:", rest)
        +print()
        +
+# Entries can be added, updated, or deleted.
        +fred['bill'] = 'Sopwith Camel'
        +fred['wilma'] = 2233
        +del fred['mike']
        +print(fred)
        +print()
        +
        +# Get all the keys.
        +print(fred.keys())
        +for k in fred.keys():
        +    print(k, "=>", fred[k])
        +print()
        +
        +# Test for presence of a key.
        +for t in [ 'zingo', 'sarah', 'bill', 'wilma' ]:
        +    print(t,end=' ')
        +    if t in fred:
        +        print('=>', fred[t])
        +    else:
        +        print('is not present.')
        +
        + +

        Here is a table:

        + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
        Data type | torch.dtype | Tensor types
        32-bit floating point | torch.float32 or torch.float | torch.*.FloatTensor
        64-bit floating point | torch.float64 or torch.double | torch.*.DoubleTensor
        16-bit floating point | torch.float16 or torch.half | torch.*.HalfTensor
        8-bit integer (unsigned) | torch.uint8 | torch.*.ByteTensor
        8-bit integer (signed) | torch.int8 | torch.*.CharTensor
        16-bit integer (signed) | torch.int16 or torch.short | torch.*.ShortTensor
        32-bit integer (signed) | torch.int32 or torch.int | torch.*.IntTensor
        64-bit integer (signed) | torch.int64 or torch.long | torch.*.LongTensor
        + + +
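        As a quick illustration of the dtype table above (our addition, not part of the original style-guide copy), the aliases in the second column can be passed directly when constructing tensors:

            import torch

            a = torch.zeros(3, dtype=torch.float32)   # 32-bit floating point
            b = torch.zeros(3, dtype=torch.int64)     # 64-bit signed integer (torch.long)
            print(a.dtype, b.dtype)                   # torch.float32 torch.int64
            print(a.type(), b.type())                 # torch.FloatTensor torch.LongTensor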
        + + +
        +
        +
        +
        +

        Docs

        +

        Access comprehensive developer documentation for PyTorch

        + View Docs +
        + +
        +

        Tutorials

        +

        Get in-depth tutorials for beginners and advanced developers

        + View Tutorials +
        + +
        +

        Resources

        +

        Find development resources and get your questions answered

        + View Resources +
        +
        +
        +
        + +
        + +
        + +
        +
        +
        +
        + + +
        +
        +
        + + +
        + + + + + + + + + + + + + + + + + + + + + + + diff --git a/support/index.html b/support/index.html new file mode 100644 index 000000000000..da4fed0f3278 --- /dev/null +++ b/support/index.html @@ -0,0 +1,11 @@ + + + + Redirecting… + + + + +

        Redirecting…

        + Click here if you are not redirected. + diff --git a/tac.html b/tac.html index 171e68422fb2..5136d13699e3 100644 --- a/tac.html +++ b/tac.html @@ -1,12 +1,310 @@ ---- -layout: default -title: PyTorch Foundation Technical Advisory Council -body-class: announcement -background-class: announcement-background -permalink: /tac ---- - -
        + + + + + + + + + + + + + PyTorch Foundation Technical Advisory Council | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
        +
        + Join us at PyTorch Conference in San Francisco, October 22-23. CFP open now! Learn more. +
        +
        +
        +
        + + +
        + + + + + + + + +
        + +
        +
        + + +
        + +

        PyTorch Foundation Technical Advisory Council

        @@ -247,3 +545,306 @@

        Piotr Bialecki, NVIDIA

        + + +
        +
        +
        +
        +

        Docs

        +

        Access comprehensive developer documentation for PyTorch

        + View Docs +
        + +
        +

        Tutorials

        +

        Get in-depth tutorials for beginners and advanced developers

        + View Tutorials +
        + +
        +

        Resources

        +

        Find development resources and get your questions answered

        + View Resources +
        +
        +
        +
        + +
        + +
        + +
        +
        +
        +
        + + +
        +
        +
        + + +
        + + + + + + + + + + + + + + + + + + + + + + + diff --git a/test_run_python_code.py b/test_run_python_code.py new file mode 100644 index 000000000000..f44a28569633 --- /dev/null +++ b/test_run_python_code.py @@ -0,0 +1,41 @@ +from subprocess import check_output, STDOUT, CalledProcessError +import sys +import pytest +import glob + + +PYTHON_CODE_DIR = "python_code" +ALL_FILES = glob.glob(PYTHON_CODE_DIR + "/*.py") + + +@pytest.mark.parametrize('file_path', ALL_FILES) +def test_run_file(file_path): + if 'nvidia' in file_path: + # FIXME: NVIDIA models checkoints are on cuda + pytest.skip("temporarily disabled") + if 'pytorch_fairseq_translation' in file_path: + pytest.skip("temporarily disabled") + if 'ultralytics_yolov5' in file_path: + # FIXME torch.nn.modules.module.ModuleAttributeError: 'autoShape' object has no attribute 'fuse + pytest.skip("temporarily disabled") + if 'huggingface_pytorch-transformers' in file_path: + # FIXME torch.nn.modules.module.ModuleAttributeError: 'autoShape' object has no attribute 'fuse + pytest.skip("temporarily disabled") + if 'pytorch_fairseq_roberta' in file_path: + pytest.skip("temporarily disabled") + + # We just run the python files in a separate sub-process. We really want a + # subprocess here because otherwise we might run into package versions + # issues: imagine script A that needs torchvivion 0.9 and script B that + # needs torchvision 0.10. If script A is run prior to script B in the same + # process, script B will still be run with torchvision 0.9 because the only + # "import torchvision" statement that counts is the first one, and even + # torchub sys.path shenanigans can do nothing about this. By creating + # subprocesses we're sure that all file executions are fully independent. + try: + # This is inspired (and heavily simplified) from + # https://github.com/cloudpipe/cloudpickle/blob/343da119685f622da2d1658ef7b3e2516a01817f/tests/testutils.py#L177 + out = check_output([sys.executable, file_path], stderr=STDOUT) + print(out.decode()) + except CalledProcessError as e: + raise RuntimeError(f"Script {file_path} errored with output:\n{e.output.decode()}") diff --git a/videos.html b/videos.html index e7e80960167e..bf00bc2bbe46 100644 --- a/videos.html +++ b/videos.html @@ -1,12 +1,310 @@ ---- -layout: default -title: Videos -permalink: /videos -body-class: blog -background-class: features-background ---- - -
        + + + + + + + + + + + + + Videos | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
        +
        + Join us at PyTorch Conference in San Francisco, October 22-23. CFP open now! Learn more. +
        +
        +
        +
        + + +
        + + + + + + + + +
        + +
        +
        + + +
        + +

        Videos

        Learn about the latest PyTorch tutorials and more

        @@ -18,19 +316,692 @@

        Learn about the latest PyTorch tutorials and more

        - {% assign videos = site.videos | sort_natural: "date" | reverse %} - {% for post in videos %} + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + - {% endfor %} + + + +
        +
        +
        +
        + + +
        +
        +
        +
        +

        Docs

        +

        Access comprehensive developer documentation for PyTorch

        + View Docs +
        + +
        +

        Tutorials

        +

        Get in-depth tutorials for beginners and advanced developers

        + View Tutorials +
        + +
        +

        Resources

        +

        Find development resources and get your questions answered

        + View Resources +
        +
        +
        +
        + +
        + +
        + +
        +
        +
        +
        + + +
        +
        +
        + +
        + + + + + + + + + + + + + + + + + + + + + + + diff --git a/videos/pt20qa1.html b/videos/pt20qa1.html new file mode 100644 index 000000000000..8b137891791f --- /dev/null +++ b/videos/pt20qa1.html @@ -0,0 +1 @@ + diff --git a/videos/pt20qa10.html b/videos/pt20qa10.html new file mode 100644 index 000000000000..8b137891791f --- /dev/null +++ b/videos/pt20qa10.html @@ -0,0 +1 @@ + diff --git a/videos/pt20qa11.html b/videos/pt20qa11.html new file mode 100644 index 000000000000..8b137891791f --- /dev/null +++ b/videos/pt20qa11.html @@ -0,0 +1 @@ + diff --git a/videos/pt20qa12.html b/videos/pt20qa12.html new file mode 100644 index 000000000000..8b137891791f --- /dev/null +++ b/videos/pt20qa12.html @@ -0,0 +1 @@ + diff --git a/videos/pt20qa2.html b/videos/pt20qa2.html new file mode 100644 index 000000000000..8b137891791f --- /dev/null +++ b/videos/pt20qa2.html @@ -0,0 +1 @@ + diff --git a/videos/pt20qa3.html b/videos/pt20qa3.html new file mode 100644 index 000000000000..8b137891791f --- /dev/null +++ b/videos/pt20qa3.html @@ -0,0 +1 @@ + diff --git a/videos/pt20qa4.html b/videos/pt20qa4.html new file mode 100644 index 000000000000..8b137891791f --- /dev/null +++ b/videos/pt20qa4.html @@ -0,0 +1 @@ + diff --git a/videos/pt20qa5.html b/videos/pt20qa5.html new file mode 100644 index 000000000000..8b137891791f --- /dev/null +++ b/videos/pt20qa5.html @@ -0,0 +1 @@ + diff --git a/videos/pt20qa6.html b/videos/pt20qa6.html new file mode 100644 index 000000000000..8b137891791f --- /dev/null +++ b/videos/pt20qa6.html @@ -0,0 +1 @@ + diff --git a/videos/pt20qa7.html b/videos/pt20qa7.html new file mode 100644 index 000000000000..8b137891791f --- /dev/null +++ b/videos/pt20qa7.html @@ -0,0 +1 @@ + diff --git a/videos/pt20qa8.html b/videos/pt20qa8.html new file mode 100644 index 000000000000..8b137891791f --- /dev/null +++ b/videos/pt20qa8.html @@ -0,0 +1 @@ + diff --git a/videos/pt20qa9.html b/videos/pt20qa9.html new file mode 100644 index 000000000000..8b137891791f --- /dev/null +++ b/videos/pt20qa9.html @@ -0,0 +1 @@ + diff --git a/videos/ptconf1.html b/videos/ptconf1.html new file mode 100644 index 000000000000..8b137891791f --- /dev/null +++ b/videos/ptconf1.html @@ -0,0 +1 @@ + diff --git a/videos/ptconf11.html b/videos/ptconf11.html new file mode 100644 index 000000000000..8b137891791f --- /dev/null +++ b/videos/ptconf11.html @@ -0,0 +1 @@ + diff --git a/videos/ptconf12.html b/videos/ptconf12.html new file mode 100644 index 000000000000..8b137891791f --- /dev/null +++ b/videos/ptconf12.html @@ -0,0 +1 @@ + diff --git a/videos/ptconf13.html b/videos/ptconf13.html new file mode 100644 index 000000000000..8b137891791f --- /dev/null +++ b/videos/ptconf13.html @@ -0,0 +1 @@ + diff --git a/videos/ptconf15.html b/videos/ptconf15.html new file mode 100644 index 000000000000..8b137891791f --- /dev/null +++ b/videos/ptconf15.html @@ -0,0 +1 @@ + diff --git a/videos/ptconf16.html b/videos/ptconf16.html new file mode 100644 index 000000000000..8b137891791f --- /dev/null +++ b/videos/ptconf16.html @@ -0,0 +1 @@ + diff --git a/videos/ptconf2.html b/videos/ptconf2.html new file mode 100644 index 000000000000..8b137891791f --- /dev/null +++ b/videos/ptconf2.html @@ -0,0 +1 @@ + diff --git a/videos/ptconf3.html b/videos/ptconf3.html new file mode 100644 index 000000000000..8b137891791f --- /dev/null +++ b/videos/ptconf3.html @@ -0,0 +1 @@ + diff --git a/videos/ptconf4.html b/videos/ptconf4.html new file mode 100644 index 000000000000..8b137891791f --- /dev/null +++ b/videos/ptconf4.html @@ -0,0 +1 @@ + diff --git 
a/videos/ptconf5.html b/videos/ptconf5.html new file mode 100644 index 000000000000..8b137891791f --- /dev/null +++ b/videos/ptconf5.html @@ -0,0 +1 @@ + diff --git a/videos/ptconf6.html b/videos/ptconf6.html new file mode 100644 index 000000000000..8b137891791f --- /dev/null +++ b/videos/ptconf6.html @@ -0,0 +1 @@ + diff --git a/videos/ptconf7.html b/videos/ptconf7.html new file mode 100644 index 000000000000..8b137891791f --- /dev/null +++ b/videos/ptconf7.html @@ -0,0 +1 @@ + diff --git a/videos/ptconf8.html b/videos/ptconf8.html new file mode 100644 index 000000000000..8b137891791f --- /dev/null +++ b/videos/ptconf8.html @@ -0,0 +1 @@ + diff --git a/videos/vid1.html b/videos/vid1.html new file mode 100644 index 000000000000..8b137891791f --- /dev/null +++ b/videos/vid1.html @@ -0,0 +1 @@ + diff --git a/videos/vid10.html b/videos/vid10.html new file mode 100644 index 000000000000..8b137891791f --- /dev/null +++ b/videos/vid10.html @@ -0,0 +1 @@ + diff --git a/videos/vid11.html b/videos/vid11.html new file mode 100644 index 000000000000..8b137891791f --- /dev/null +++ b/videos/vid11.html @@ -0,0 +1 @@ + diff --git a/videos/vid12.html b/videos/vid12.html new file mode 100644 index 000000000000..8b137891791f --- /dev/null +++ b/videos/vid12.html @@ -0,0 +1 @@ + diff --git a/videos/vid13.html b/videos/vid13.html new file mode 100644 index 000000000000..8b137891791f --- /dev/null +++ b/videos/vid13.html @@ -0,0 +1 @@ + diff --git a/videos/vid2.html b/videos/vid2.html new file mode 100644 index 000000000000..8b137891791f --- /dev/null +++ b/videos/vid2.html @@ -0,0 +1 @@ + diff --git a/videos/vid3.html b/videos/vid3.html new file mode 100644 index 000000000000..8b137891791f --- /dev/null +++ b/videos/vid3.html @@ -0,0 +1 @@ + diff --git a/videos/vid4.html b/videos/vid4.html new file mode 100644 index 000000000000..8b137891791f --- /dev/null +++ b/videos/vid4.html @@ -0,0 +1 @@ + diff --git a/videos/vid5.html b/videos/vid5.html new file mode 100644 index 000000000000..8b137891791f --- /dev/null +++ b/videos/vid5.html @@ -0,0 +1 @@ + diff --git a/videos/vid6.html b/videos/vid6.html new file mode 100644 index 000000000000..8b137891791f --- /dev/null +++ b/videos/vid6.html @@ -0,0 +1 @@ + diff --git a/videos/vid7.html b/videos/vid7.html new file mode 100644 index 000000000000..8b137891791f --- /dev/null +++ b/videos/vid7.html @@ -0,0 +1 @@ + diff --git a/videos/vid8.html b/videos/vid8.html new file mode 100644 index 000000000000..8b137891791f --- /dev/null +++ b/videos/vid8.html @@ -0,0 +1 @@ + diff --git a/videos/vid9.html b/videos/vid9.html new file mode 100644 index 000000000000..8b137891791f --- /dev/null +++ b/videos/vid9.html @@ -0,0 +1 @@ + diff --git a/wechat.html b/wechat.html index 3cddd1e48c26..d22c979887bf 100644 --- a/wechat.html +++ b/wechat.html @@ -1,12 +1,310 @@ ---- -layout: default -title: PyTorch on WeChat -body-class: announcement -background-class: announcement-background -permalink: /wechat ---- - -
        + + + + + + + + + + + + + PyTorch on WeChat | PyTorch + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
        +
        + Join us at PyTorch Conference in San Francisco, October 22-23. CFP open now! Learn more. +
        +
        +
        +
        + + +
        + + + + + + + + +
        + +
        +
        + + +
        + +

        PyTorch on WeChat

        @@ -28,3 +326,306 @@

        Connect with us

        + + +
        +
        +
        +
        +

        Docs

        +

        Access comprehensive developer documentation for PyTorch

        + View Docs +
        + +
        +

        Tutorials

        +

        Get in-depth tutorials for beginners and advanced developers

        + View Tutorials +
        + +
        +

        Resources

        +

        Find development resources and get your questions answered

        + View Resources +
        +
        +
        +
        + +
        + +
        + +
        +
        +
        +
        + + +
        +
        +
        + + +
        + + + + + + + + + + + + + + + + + + + + + + + diff --git a/yarn.lock b/yarn.lock index 5af6829962ee..04d705fb6637 100644 --- a/yarn.lock +++ b/yarn.lock @@ -9,10 +9,12 @@ anchor-js@^4.1.1: bootstrap@4.3.1: version "4.3.1" resolved "https://registry.yarnpkg.com/bootstrap/-/bootstrap-4.3.1.tgz#280ca8f610504d99d7b6b4bfc4b68cec601704ac" + integrity sha512-rXqOmH1VilAt2DyPzluTi2blhk17bO7ef+zLLPlWvG494pDxcM234pJ8wTc/6R40UWizAIIMgxjvxZg5kmsbag== jquery@^3.5.0: version "3.5.0" resolved "https://registry.yarnpkg.com/jquery/-/jquery-3.5.0.tgz#9980b97d9e4194611c36530e7dc46a58d7340fc9" + integrity sha512-Xb7SVYMvygPxbFMpTFQiHh1J7HClEaThguL15N/Gg37Lri/qKyhRGZYzHRyLH8Stq3Aow0LsHO2O2ci86fCrNQ== popper.js@^1.14.3: version "1.14.3"